# Chicago Crimes
This example shows an exploratory data analysis (EDA) of crimes in Chicago.

The original examples can be found [here](https://medium.com/@ahsanzafar222/chicago-crime-data-cleaning-and-eda-a744c687a291) and [here](https://www.kaggle.com/fahd09/eda-of-crime-in-chicago-2005-2016).


### Notes on running these queries:

Bodo is used by default; it distributes data chunks across cores automatically.

The output cells are from execution on one **m5.12xlarge** instance (24 cores, 192GiB memory) using the dataset found [here](https://www.kaggle.com/currie32/crimes-in-chicago), which is ~1.5GB.


To run the code:
1. Make sure you add your AWS account credentials to access the data. 
2. If you want to run a query in regular pandas:
    1. Comment out the lines with the Jupyter parallel magic (`%%px`) and the Bodo decorator (`@bodo.jit`) in all the code cells.
    2. Then, re-run cells from the beginning.


In [1]:
%%px
import os

# Placeholder AWS credentials and region used for the S3 reads in this
# notebook; replace the two key values with real ones before running.
os.environ.update(
    {
        "AWS_ACCESS_KEY_ID": "your_aws_access_key_id",
        "AWS_SECRET_ACCESS_KEY": "your_aws_secret_access_key",
        "AWS_DEFAULT_REGION": "us-east-2",
    }
)

In [2]:
%%px
import numpy as np
import pandas as pd
import time
import bodo

## Load Crimes Data in Chicago 2005 - 2017

In [3]:
%%px
@bodo.jit(distributed=["crimes"], cache=True)
def load_chicago_crimes():
    """Read the three Chicago crime CSV files from S3 into one frame.

    The per-period files are concatenated row-wise (each file's own index
    labels are kept, since ``ignore_index=False``) and the result is sorted
    by the ``ID`` column. The elapsed time in milliseconds and the first rows
    are printed.

    Returns:
        The combined DataFrame sorted by ``ID``.
    """
    started = time.time()

    # File paths are kept as literals, one read per period file.
    period_2005_2007 = pd.read_csv('s3://bodo-examples-data/chicago-crimes/Chicago_Crimes_2005_to_2007.csv')
    period_2008_2011 = pd.read_csv('s3://bodo-examples-data/chicago-crimes/Chicago_Crimes_2008_to_2011.csv')
    period_2012_2017 = pd.read_csv('s3://bodo-examples-data/chicago-crimes/Chicago_Crimes_2012_to_2017.csv')

    # `crimes` must keep this name: the decorator's `distributed` list refers to it.
    crimes = pd.concat(
        [period_2005_2007, period_2008_2011, period_2012_2017],
        axis=0,
        ignore_index=False,
    ).sort_values(by="ID")

    elapsed_ms = (time.time() - started) * 1000
    print("Reading time: ", elapsed_ms, " (ms)")
    print(crimes.head())
    return crimes

crimes = load_chicago_crimes()

[stdout:0] 
'coroutine' object is not subscriptable.
Will use the value defined in the AWS_DEFAULT_REGION environment variable (or us-east-1 if that is not provided either).
'coroutine' object is not subscriptable.
Will use the value defined in the AWS_DEFAULT_REGION environment variable (or us-east-1 if that is not provided either).
'coroutine' object is not subscriptable.
Will use the value defined in the AWS_DEFAULT_REGION environment variable (or us-east-1 if that is not provided either).
Reading time:  7073.769092559814  (ms)
         Unnamed: 0    ID Case Number                    Date  \
1324003     4897380  3012    HL101040  01/01/2005 01:15:00 PM   
1324004     4898204  3013    HK826899  01/02/2005 09:45:00 PM   
1324005     4898986  3014    HL106602  01/04/2005 04:39:00 PM   
1324006     4899770  3015    HL107444  01/05/2005 04:07:00 AM   
1324007     4900593  3016    HL112637  01/08/2005 03:15:00 AM   

                         Block  IUCR Primary Type          Description  

## Preprocessing and Cleaning
 1. Drop duplicated cases, filter unused columns, and add day of week and date of the crime.
 2. Keep only the most frequent crime type categories.


In [4]:
%%px
@bodo.jit(distributed=["crimes"], cache=True)
def data_cleanup(crimes):
    """Deduplicate, trim and enrich the raw crimes frame.

    Steps, in order: drop fully duplicated rows, remove the columns not used
    by the later analysis, parse ``Date`` into datetimes, add a ``dow``
    (day of week) column and a ``date only`` column (``Date`` floored to the
    day), then sort by ``ID``. The elapsed time in milliseconds and the first
    rows are printed.

    Args:
        crimes: DataFrame as returned by ``load_chicago_crimes``.

    Returns:
        The cleaned DataFrame sorted by ``ID``.
    """
    started = time.time()

    # drop_duplicates() returns a new frame, so the later edits do not touch
    # the frame that was passed in.
    crimes = crimes.drop_duplicates()
    crimes = crimes.drop(
        ['Unnamed: 0', 'Case Number', 'IUCR','Updated On','Year', 'FBI Code', 'Beat','Ward','Community Area', 'Location'],
        axis=1,
    )

    # Source timestamps look like "01/01/2005 01:15:00 PM" (12-hour clock).
    crimes["Date"] = pd.to_datetime(crimes["Date"], format='%m/%d/%Y %I:%M:%S %p')
    parsed_dates = crimes["Date"]
    crimes["dow"] = parsed_dates.dt.dayofweek
    crimes["date only"] = parsed_dates.dt.floor('D')

    crimes = crimes.sort_values(by="ID")

    elapsed_ms = (time.time() - started) * 1000
    print("Data cleanup time: ", elapsed_ms, " (ms)")
    print(crimes.head())
    return crimes

crimes = data_cleanup(crimes)

[stdout:0] 
Data cleanup time:  4358.8738441467285  (ms)
           ID                Date                  Block Primary Type  \
1324003  3012 2005-01-01 13:15:00  076XX S GREENWOOD AVE     HOMICIDE   
1324004  3013 2005-01-02 21:45:00        029XX E 82ND ST     HOMICIDE   
1324005  3014 2005-01-04 16:39:00  070XX S CONSTANCE AVE     HOMICIDE   
1324006  3015 2005-01-05 04:07:00     095XX S COLFAX AVE     HOMICIDE   
1324007  3016 2005-01-08 03:15:00      015XX N DAYTON ST     HOMICIDE   

                 Description Location Description  Arrest  Domestic  District  \
1324003  FIRST DEGREE MURDER           VACANT LOT    True     False       6.0   
1324004  FIRST DEGREE MURDER               STREET    True     False       4.0   
1324005  FIRST DEGREE MURDER               STREET   False     False       3.0   
1324006  FIRST DEGREE MURDER                 AUTO   False     False       4.0   
1324007  FIRST DEGREE MURDER                 CLUB    True     False      18.0   

         X Coordi

In [5]:
%%px
@bodo.jit(distributed=["crimes"], cache=True)
def get_top_crime_types(crimes):
    """Return the labels of the most frequent ``Primary Type`` values.

    Takes the first ten entries of the ``value_counts()`` index (fewer when
    there are fewer distinct types). The elapsed time in milliseconds and the
    selected labels are printed.

    Args:
        crimes: DataFrame with a ``Primary Type`` column.

    Returns:
        An Index holding up to ten crime-type labels.
    """
    started = time.time()
    type_counts = crimes['Primary Type'].value_counts()
    top_crime_types = type_counts.index[:10]
    elapsed_ms = (time.time() - started) * 1000
    print("Getting top crimes Time: ", elapsed_ms, " (ms)")
    print(top_crime_types)
    return top_crime_types

top_crime_types = get_top_crime_types(crimes)
top_crime_types = top_crime_types.tolist()

[stdout:0] 
Getting top crimes Time:  167.5889492034912  (ms)
Index(['BATTERY', 'THEFT', 'CRIMINAL DAMAGE', 'NARCOTICS', 'OTHER OFFENSE',
       'ASSAULT', 'BURGLARY', 'MOTOR VEHICLE THEFT', 'CRIMINAL TRESPASS',
       'HOMICIDE'],
      dtype='object')


In [6]:
%%px

@bodo.jit(distributed=["crimes", "top_crimes"], cache=True)
def filter_crimes(crimes, top_crime_types):
    """Keep only the rows whose ``Primary Type`` is in ``top_crime_types``.

    The elapsed time in milliseconds and the first rows of the filtered
    frame are printed.

    Args:
        crimes: DataFrame with a ``Primary Type`` column.
        top_crime_types: Collection of crime-type labels to keep.

    Returns:
        The filtered DataFrame.
    """
    started = time.time()
    is_top_type = crimes['Primary Type'].isin(top_crime_types)
    # `top_crimes` must keep this name: the decorator's `distributed` list refers to it.
    top_crimes = crimes[is_top_type]
    elapsed_ms = (time.time() - started) * 1000
    print("Filtering crimes Time: ", elapsed_ms, " (ms)")
    print(top_crimes.head())
    return top_crimes

crimes = filter_crimes(crimes, top_crime_types)

[stdout:0] 
Filtering crimes Time:  675.4751205444336  (ms)
           ID                Date                  Block Primary Type  \
1324003  3012 2005-01-01 13:15:00  076XX S GREENWOOD AVE     HOMICIDE   
1324004  3013 2005-01-02 21:45:00        029XX E 82ND ST     HOMICIDE   
1324005  3014 2005-01-04 16:39:00  070XX S CONSTANCE AVE     HOMICIDE   
1324006  3015 2005-01-05 04:07:00     095XX S COLFAX AVE     HOMICIDE   
1324007  3016 2005-01-08 03:15:00      015XX N DAYTON ST     HOMICIDE   

                 Description Location Description  Arrest  Domestic  District  \
1324003  FIRST DEGREE MURDER           VACANT LOT    True     False       6.0   
1324004  FIRST DEGREE MURDER               STREET    True     False       4.0   
1324005  FIRST DEGREE MURDER               STREET   False     False       3.0   
1324006  FIRST DEGREE MURDER                 AUTO   False     False       4.0   
1324007  FIRST DEGREE MURDER                 CLUB    True     False      18.0   

         X Coo

## Crime Analysis

### Find the pattern of each crime over the years



In [22]:
%%px
def get_crimes_type_date(crimes):
    """Print rolling crime counts per type, newest date first.

    Builds a table with one row per ``date only`` value and one column per
    ``Primary Type`` (cells are counts of ``ID``), fills missing cells with 0,
    sums over a rolling window of 365 rows and sorts the index in descending
    order. The elapsed time in milliseconds and the first rows are printed;
    nothing is returned.

    Args:
        crimes: DataFrame with ``date only``, ``Primary Type`` and ``ID`` columns.
    """
    started = time.time()

    # `crimes_count_date` must keep this name: the `pivots` option passed to
    # bodo.jit below is keyed on it.
    crimes_count_date = crimes.pivot_table(
        index='date only',
        columns='Primary Type',
        values='ID',
        aggfunc="count",
    )
    crimes_count_date.index = pd.DatetimeIndex(crimes_count_date.index)

    zero_filled = crimes_count_date.fillna(0)
    rolling_totals = zero_filled.rolling(365).sum()
    newest_first = rolling_totals.sort_index(ascending=False)

    elapsed_ms = (time.time() - started) * 1000
    print("Computing Crime Pattern Time: ", elapsed_ms, " (ms)")
    print(newest_first.head())

pivot_values = {"crimes_count_date" : top_crime_types}
bodo_func = bodo.jit(distributed=["crimes"], pivots=pivot_values)(get_crimes_type_date)(crimes)

[stdout:0] 
Computing Crime Pattern Time:  2584.177017211914  (ms)
            BATTERY    THEFT  CRIMINAL DAMAGE  NARCOTICS  OTHER OFFENSE  \
2017-01-18  75408.0  63503.0          42674.0    37965.0        23870.0   
2017-01-17  76609.0  64153.0          43461.0    37918.0        24156.0   
2017-01-16  75685.0  64250.0          44021.0    36394.0        24157.0   
2017-01-15  74789.0  63724.0          42392.0    37402.0        23616.0   
2017-01-14  75516.0  63824.0          43487.0    37746.0        23592.0   

            ASSAULT  BURGLARY  MOTOR VEHICLE THEFT  CRIMINAL TRESPASS  \
2017-01-18  21641.0   20366.0              17302.0            13907.0   
2017-01-17  21552.0   20532.0              17239.0            14013.0   
2017-01-16  21751.0   20099.0              17236.0            13902.0   
2017-01-15  21565.0   20231.0              17091.0            13644.0   
2017-01-14  21724.0   20359.0              17263.0            13746.0   

            HOMICIDE  
2017-01-18   12708.0

## A general view of crime records by time, type and location

### Determining the pattern on a daily basis

In [34]:
%%px
@bodo.jit(distributed=['crimes', 'crimes_days'], cache=True)
def get_crimes_by_days(crimes):
    """Count crimes per day of week.

    Groups by the ``dow`` column, counts ``ID`` values in each group and
    sorts the result by ``dow``. The elapsed time in milliseconds and the
    first rows are printed.

    Args:
        crimes: DataFrame with ``dow`` and ``ID`` columns.

    Returns:
        DataFrame with columns ``dow`` and ``ID`` (the count), sorted by ``dow``.
    """
    started = time.time()
    counts_per_day = crimes.groupby('dow', as_index=False)['ID'].count()
    # `crimes_days` must keep this name: the decorator's `distributed` list refers to it.
    crimes_days = counts_per_day.sort_values(by='dow')
    elapsed_ms = (time.time() - started) * 1000
    print("Group by days Time: ", elapsed_ms, " (ms)")
    print(crimes_days.head())
    return crimes_days

crimes_days = get_crimes_by_days(crimes)

[stdout:0] 
Group by days Time:  11.953115463256836  (ms)
   dow      ID
0    0  563268
1    1  569252
5    2  573212
6    3  567163
3    4  599854


### Determining the pattern on a monthly basis

In [35]:
%%px
@bodo.jit(distributed=['crimes', 'crimes_months'], cache=True)
def get_crimes_by_months(crimes):
    """Count crimes per calendar month, pooling all years together.

    Adds a ``month`` column taken from ``Date``, groups by it, counts ``ID``
    values in each group and sorts the result by ``month``. The elapsed time
    in milliseconds and the first rows are printed.

    Args:
        crimes: DataFrame with a datetime ``Date`` column and an ``ID`` column.

    Returns:
        DataFrame with columns ``month`` and ``ID`` (the count), sorted by
        ``month``.
    """
    t1 = time.time()
    # NOTE: this assigns a column on the frame that was passed in; when run as
    # plain pandas (without bodo.jit) the caller's frame gains `month` too.
    crimes['month'] = crimes["Date"].dt.month
    crimes_months = crimes.groupby('month', as_index=False)['ID'].count().sort_values(by='month')
    # The timing label names this function's grouping key (month).
    print("Group by months Time: ", ((time.time() - t1) * 1000), " (ms)")
    print(crimes_months.head())
    return crimes_months

crimes_months = get_crimes_by_months(crimes)

[stdout:0] 
Group by days Time:  32.850027084350586  (ms)
    month      ID
0       1  318177
9       2  268271
10      3  327407
1       4  328076
7       5  355451


### Determining the pattern by crime type

In [37]:
%%px
@bodo.jit(distributed=['crimes', 'crimes_type'], cache=True)
def get_crimes_by_type(crimes):
    """Count crimes per ``Primary Type``, most frequent first.

    Groups by ``Primary Type``, counts ``ID`` values in each group and sorts
    the result by that count in descending order. The elapsed time in
    milliseconds and the first rows are printed.

    Args:
        crimes: DataFrame with ``Primary Type`` and ``ID`` columns.

    Returns:
        DataFrame with columns ``Primary Type`` and ``ID`` (the count),
        sorted by ``ID`` descending.
    """
    t1 = time.time()
    crimes_type = crimes.groupby('Primary Type', as_index=False)['ID'].count().sort_values(by='ID', ascending=False)
    # The timing label names this function's grouping key (crime type).
    print("Group by type Time: ", ((time.time() - t1) * 1000), " (ms)")
    print(crimes_type.head())
    return crimes_type

crimes_type = get_crimes_by_type(crimes)

[stdout:0] 
Group by days Time:  43.81203651428223  (ms)
       Primary Type      ID
0             THEFT  907831
9           BATTERY  778164
2   CRIMINAL DAMAGE  499426
11        NARCOTICS  473790
3     OTHER OFFENSE  264200


### Determining the pattern by location

In [40]:
%%px
@bodo.jit(distributed=['crimes', 'crimes_location'], cache=True)
def get_crimes_by_location(crimes):
    """Count crimes per ``Location Description``, most frequent first.

    Groups by ``Location Description``, counts ``ID`` values in each group
    and sorts the result by that count in descending order. The elapsed time
    in milliseconds and the first rows are printed.

    Args:
        crimes: DataFrame with ``Location Description`` and ``ID`` columns.

    Returns:
        DataFrame with columns ``Location Description`` and ``ID`` (the
        count), sorted by ``ID`` descending.
    """
    t1 = time.time()
    crimes_location = crimes.groupby('Location Description', as_index=False)['ID'].count().sort_values(by='ID', ascending=False)
    # The timing label names this function's grouping key (location).
    print("Group by location Time: ", ((time.time() - t1) * 1000), " (ms)")
    print(crimes_location.head())
    return crimes_location

crimes_location = get_crimes_by_location(crimes)

[stdout:0] 
Group by days Time:  48.18105697631836  (ms)
    Location Description      ID
83                STREET  999487
34             RESIDENCE  660937
121            APARTMENT  459253
60              SIDEWALK  440330
84                 OTHER  145580
