# Chicago Crimes
This example shows an exploratory data analysis (EDA) of crimes in Chicago.

The original examples can be found [here](https://medium.com/@ahsanzafar222/chicago-crime-data-cleaning-and-eda-a744c687a291) and [here](https://www.kaggle.com/fahd09/eda-of-crime-in-chicago-2005-2016).


### Start an IPyParallel cluster 
Run the following code in a cell to start an IPyParallel cluster. This example uses up to 8 cores (fewer if the machine has fewer physical cores).

In [1]:
import os

# Start a local MPI-backed IPyParallel cluster unless the
# BODO_PLATFORM_WORKSPACE_UUID environment variable is set.
# NOTE(review): presumably that variable marks the Bodo platform, where a
# cluster is already provided -- confirm.
if os.environ.get("BODO_PLATFORM_WORKSPACE_UUID", "NA") == "NA":
    import ipyparallel as ipp
    import psutil

    # psutil.cpu_count(logical=False) returns None when the physical core
    # count cannot be determined; min(None, 8) would raise TypeError, so fall
    # back to a single engine in that case.
    physical_cores = psutil.cpu_count(logical=False) or 1
    n = min(physical_cores, 8)
    rc = ipp.Cluster(engines="mpi", n=n).start_and_connect_sync(activate=True)

Starting 8 engines with <class 'ipyparallel.cluster.launcher.MPIEngineSetLauncher'>
100%|██████████| 8/8 [00:07<00:00,  1.13engine/s]


In [2]:
%%px
import pandas as pd
import time
import bodo

## Load Crimes Data in Chicago 2005 - 2017

In [3]:
%%px
@bodo.jit(cache=True)
def load_chicago_crimes():
    """Read the three Chicago crime CSV files from S3 into one DataFrame.

    The three per-period frames are stacked row-wise with their original
    index labels kept (``ignore_index=False``), then ordered by ``ID``.
    The elapsed wall-clock time is printed in milliseconds.

    Returns:
        The combined DataFrame sorted by the ``ID`` column.
    """
    started = time.time()
    period_2005_2007 = pd.read_csv('s3://bodo-example-data/chicago-crimes/Chicago_Crimes_2005_to_2007.csv')
    period_2008_2011 = pd.read_csv('s3://bodo-example-data/chicago-crimes/Chicago_Crimes_2008_to_2011.csv')
    period_2012_2017 = pd.read_csv('s3://bodo-example-data/chicago-crimes/Chicago_Crimes_2012_to_2017.csv')
    combined = pd.concat(
        [period_2005_2007, period_2008_2011, period_2012_2017],
        ignore_index=False,
        axis=0,
    )
    ordered = combined.sort_values(by="ID")
    print("Reading time: ", ((time.time() - started) * 1000), " (ms)")
    return ordered


# Load the data, then print a preview from rank 0 only.
crimes1 = load_chicago_crimes()
if bodo.get_rank() == 0:
    print(crimes1.head())

%px:   0%|          | 0/16 [00:00<?, ?tasks/s]

[stdout:0] Reading time:  9662.638648330358  (ms)
         Unnamed: 0    ID Case Number                    Date  \
1324003     4897380  3012    HL101040  01/01/2005 01:15:00 PM   
1324004     4898204  3013    HK826899  01/02/2005 09:45:00 PM   
1324005     4898986  3014    HL106602  01/04/2005 04:39:00 PM   
1324006     4899770  3015    HL107444  01/05/2005 04:07:00 AM   
1324007     4900593  3016    HL112637  01/08/2005 03:15:00 AM   

                         Block  IUCR Primary Type          Description  \
1324003  076XX S GREENWOOD AVE  0110     HOMICIDE  FIRST DEGREE MURDER   
1324004        029XX E 82ND ST  0110     HOMICIDE  FIRST DEGREE MURDER   
1324005  070XX S CONSTANCE AVE  0110     HOMICIDE  FIRST DEGREE MURDER   
1324006     095XX S COLFAX AVE  0110     HOMICIDE  FIRST DEGREE MURDER   
1324007      015XX N DAYTON ST  0110     HOMICIDE  FIRST DEGREE MURDER   

        Location Description  Arrest  ...  Ward  Community Area  FBI Code  \
1324003           VACANT LOT    True 

## Preprocessing and Cleaning
 1. Drop duplicated cases, filter unused columns, and add day of week and date of the crime.
 2. Keep only the most frequent crime type categories.


In [None]:
%%px
@bodo.jit(cache=True)
def data_cleanup(crimes):
    """Deduplicate the raw crime records and derive the date columns.

    Steps, in order: drop fully duplicated rows, remove the columns that are
    not used later, parse ``Date`` with the ``%m/%d/%Y %I:%M:%S %p`` format,
    add ``dow`` (day of week) and ``date only`` (``Date`` floored to the day),
    and sort by ``ID``. The elapsed time is printed in milliseconds.

    Args:
        crimes: DataFrame of raw crime records; must contain the dropped
            columns as well as ``Date`` and ``ID``.

    Returns:
        The cleaned DataFrame sorted by ``ID``.
    """
    started = time.time()
    deduped = crimes.drop_duplicates()
    cleaned = deduped.drop(
        ['Unnamed: 0', 'Case Number', 'IUCR','Updated On','Year', 'FBI Code', 'Beat','Ward','Community Area', 'Location'],
        axis=1,
    )
    cleaned["Date"] = pd.to_datetime(cleaned["Date"], format='%m/%d/%Y %I:%M:%S %p')
    cleaned["dow"] = cleaned["Date"].dt.dayofweek
    cleaned["date only"] = cleaned["Date"].dt.floor('D')
    ordered = cleaned.sort_values(by="ID")
    print("Data cleanup time: ", ((time.time() - started) * 1000), " (ms)")
    return ordered


# Clean the loaded records, then print a preview from rank 0 only.
crimes = data_cleanup(crimes1)
if bodo.get_rank() == 0:
    print(crimes.head())

%px:   0%|          | 0/16 [00:00<?, ?tasks/s]

[stdout:0] Data cleanup time:  2002.4346562838673  (ms)
           ID                Date                  Block Primary Type  \
1324003  3012 2005-01-01 13:15:00  076XX S GREENWOOD AVE     HOMICIDE   
1324004  3013 2005-01-02 21:45:00        029XX E 82ND ST     HOMICIDE   
1324005  3014 2005-01-04 16:39:00  070XX S CONSTANCE AVE     HOMICIDE   
1324006  3015 2005-01-05 04:07:00     095XX S COLFAX AVE     HOMICIDE   
1324007  3016 2005-01-08 03:15:00      015XX N DAYTON ST     HOMICIDE   

                 Description Location Description  Arrest  Domestic  District  \
1324003  FIRST DEGREE MURDER           VACANT LOT    True     False       6.0   
1324004  FIRST DEGREE MURDER               STREET    True     False       4.0   
1324005  FIRST DEGREE MURDER               STREET   False     False       3.0   
1324006  FIRST DEGREE MURDER                 AUTO   False     False       4.0   
1324007  FIRST DEGREE MURDER                 CLUB    True     False      18.0   

         X Coordin

In [None]:
%%px
@bodo.jit(cache=True)
def get_top_crime_types(crimes):
    """Return the first ten labels of the ``Primary Type`` value counts.

    The elapsed time is printed in milliseconds.

    Args:
        crimes: DataFrame with a ``Primary Type`` column.

    Returns:
        The index slice ``[0:10]`` of ``crimes['Primary Type'].value_counts()``.
    """
    started = time.time()
    type_counts = crimes['Primary Type'].value_counts()
    leading_types = type_counts.index[0:10]
    print("Getting top crimes Time: ", ((time.time() - started) * 1000), " (ms)")
    return leading_types


top_crime_types = get_top_crime_types(crimes)
# NOTE(review): allgatherv presumably gives every rank the complete set of
# labels -- confirm against the Bodo documentation.
top_crime_types = bodo.allgatherv(top_crime_types)
if bodo.get_rank() == 0:
    print(top_crime_types)

[stdout:0] Getting top crimes Time:  114.37122076108608  (ms)
Index(['THEFT', 'BATTERY', 'CRIMINAL DAMAGE', 'NARCOTICS', 'OTHER OFFENSE',
       'BURGLARY', 'ASSAULT', 'MOTOR VEHICLE THEFT', 'DECEPTIVE PRACTICE',
       'ROBBERY'],
      dtype='object')


%px:   0%|          | 0/16 [00:00<?, ?tasks/s]

In [None]:
%%px

@bodo.jit(cache=True)
def filter_crimes(crimes, top_crime_types):
    """Keep only the rows whose ``Primary Type`` is in ``top_crime_types``.

    The elapsed time is printed in milliseconds.

    Args:
        crimes: DataFrame with a ``Primary Type`` column.
        top_crime_types: Collection of crime type labels to keep.

    Returns:
        The subset of ``crimes`` matching one of the given types.
    """
    started = time.time()
    is_top_type = crimes['Primary Type'].isin(top_crime_types)
    selected = crimes[is_top_type]
    print("Filtering crimes Time: ", ((time.time() - started) * 1000), " (ms)")
    return selected


# Restrict the working set to the top crime types; preview from rank 0 only.
crimes = filter_crimes(crimes, top_crime_types)
if bodo.get_rank() == 0:
    print(crimes.head())

[stdout:0] Filtering crimes Time:  79.46234036899114  (ms)
              ID                Date                          Block  \
1325007  3730318 2005-01-01 00:04:00            031XX W HARRISON ST   
1325010  3730326 2005-01-01 00:05:00  012XX N LUIS MUNOZ MARIN DR W   
1325012  3730338 2005-01-01 01:13:00              019XX N DRAKE AVE   
1325013  3730341 2005-01-01 01:30:00               002XX N CANAL ST   
1325015  3730348 2005-01-01 00:00:00              006XX E GRAND AVE   

            Primary Type                  Description  \
1325007  CRIMINAL DAMAGE  TO CITY OF CHICAGO PROPERTY   
1325010  CRIMINAL DAMAGE  TO CITY OF CHICAGO PROPERTY   
1325012  CRIMINAL DAMAGE                   TO VEHICLE   
1325013            THEFT               POCKET-PICKING   
1325015            THEFT               POCKET-PICKING   

                    Location Description  Arrest  Domestic  District  \
1325007  POLICE FACILITY/VEH PARKING LOT   False     False      11.0   
1325010                    

## Crime Analysis

### Find the pattern of each crime type over the years



In [None]:
%%px
@bodo.jit(cache=True)
def get_crimes_count_date(crimes):
    """Count crimes per day and per crime type.

    Pivots ``crimes`` so that each row is a ``date only`` value, each column
    is a ``Primary Type``, and each cell is the count of ``ID`` values.
    The elapsed time is printed in milliseconds.

    Args:
        crimes: DataFrame with ``date only``, ``Primary Type`` and ``ID``
            columns.

    Returns:
        The pivoted count table.
    """
    started = time.time()
    counts_by_date = crimes.pivot_table(
        values='ID',
        index='date only',
        columns='Primary Type',
        aggfunc="count",
    )
    print("Computing Crime Pattern Time: ", ((time.time() - started) * 1000), " (ms)")
    return counts_by_date


crimes_count_date = get_crimes_count_date(crimes)

[stdout:0] Computing Crime Pattern Time:  38.28605442777189  (ms)


In [None]:
%%px

@bodo.jit
def get_crimes_type_date(crimes_count_date):
    """Compute a 365-row rolling sum of the per-type daily counts.

    Missing counts are treated as 0. The rows are put in ascending date
    order before the rolling sum is taken, and the result is returned with
    the newest date first. The elapsed time is printed in milliseconds.

    Args:
        crimes_count_date: Table of counts indexed by date, one column per
            crime type. Its index is replaced by a ``DatetimeIndex`` inside
            this function.

    Returns:
        The rolling sums, sorted by date in descending order. The first 364
        rows in chronological order are NaN because their window is
        incomplete.
    """
    t1 = time.time()
    crimes_count_date.index = pd.DatetimeIndex(crimes_count_date.index)
    # rolling(365) is a fixed window of 365 *rows*, not a time span, so the
    # rows must be in chronological order for the window to cover
    # consecutive dates. NOTE(review): this equals one year only if every
    # date has a row -- confirm for the data set.
    daily_counts = crimes_count_date.fillna(0).sort_index()
    result = daily_counts.rolling(365).sum()
    result = result.sort_index(ascending=False)
    print("Computing Crime Pattern Time: ", ((time.time() - t1) * 1000), " (ms)")
    return result


# Store the result under its own name so the function is not overwritten
# and can still be called again.
crimes_type_date = get_crimes_type_date(crimes_count_date)
if bodo.get_rank() == 0:
    print(crimes_type_date.head())

%px:   0%|          | 0/16 [00:00<?, ?tasks/s]

[stdout:0] Computing Crime Pattern Time:  140.00831296061733  (ms)
            ROBBERY  OTHER OFFENSE  ASSAULT  BATTERY  NARCOTICS  BURGLARY  \
2017-01-18  13713.0        21310.0  21533.0  62066.0    37875.0   21522.0   
2017-01-17  13423.0        21662.0  21658.0  64943.0    37909.0   21059.0   
2017-01-16  13620.0        21607.0  21620.0  62946.0    38749.0   21605.0   
2017-01-15  13926.0        21329.0  21314.0  63364.0    37972.0   21965.0   
2017-01-14  13696.0        21404.0  20877.0  62356.0    38740.0   21912.0   

            MOTOR VEHICLE THEFT    THEFT  DECEPTIVE PRACTICE  CRIMINAL DAMAGE  
2017-01-18              15867.0  73785.0             14240.0          39976.0  
2017-01-17              15964.0  73476.0             13985.0          41336.0  
2017-01-16              16332.0  74904.0             14467.0          40764.0  
2017-01-15              16384.0  74765.0             13936.0          40584.0  
2017-01-14              16119.0  75275.0             14118.0          

## A general view of crime records by time, type and location

### Determining the pattern on a daily basis

In [None]:
%%px
@bodo.jit(cache=True)
def get_crimes_by_days(crimes):
    """Count crimes per day of week.

    Groups by ``dow``, counts the ``ID`` values of each group and sorts the
    result by ``dow``. The elapsed time is printed in milliseconds.

    Args:
        crimes: DataFrame with ``dow`` and ``ID`` columns.

    Returns:
        A DataFrame with ``dow`` and the ``ID`` count, ordered by ``dow``.
    """
    started = time.time()
    per_day = crimes.groupby('dow', as_index=False)['ID'].count()
    per_day_sorted = per_day.sort_values(by='dow')
    print("Group by days Time: ", ((time.time() - started) * 1000), " (ms)")
    return per_day_sorted


# Aggregate by day of week; preview from rank 0 only.
crimes_days = get_crimes_by_days(crimes)
if bodo.get_rank() == 0:
    print(crimes_days.head())

[stdout:0] Group by days Time:  15.454323005087645  (ms)
   dow      ID
6    0  562811
4    1  568747
0    2  572240
5    3  566206
3    4  599764


### Determining the pattern on a monthly basis

In [None]:
%%px
@bodo.jit(cache=True)
def get_crimes_by_months(crimes):
    """Count crimes per calendar month.

    Adds a ``month`` column (taken from ``Date``) to ``crimes``, groups by
    it, counts the ``ID`` values of each group and sorts the result by
    ``month``. The elapsed time is printed in milliseconds.

    Args:
        crimes: DataFrame with a datetime ``Date`` column and an ``ID``
            column.

    Returns:
        A DataFrame with ``month`` and the ``ID`` count, ordered by
        ``month``.
    """
    t1 = time.time()
    crimes['month'] = crimes["Date"].dt.month
    crimes_months = crimes.groupby('month', as_index=False)['ID'].count().sort_values(by='month')
    # Label the timing with the grouping actually performed here.
    print("Group by months Time: ", ((time.time() - t1) * 1000), " (ms)")
    return crimes_months


# Aggregate by month; preview from rank 0 only.
crimes_months = get_crimes_by_months(crimes)
if bodo.get_rank() == 0:
    print(crimes_months.head())

%px:   0%|          | 0/16 [00:00<?, ?tasks/s]

[stdout:0] Group by days Time:  267.65507394225097  (ms)
    month      ID
9       1  317796
3       2  267986
10      3  327381
7       4  328439
4       5  355734


### Determining the pattern by crime type

In [None]:
%%px
@bodo.jit(cache=True)
def get_crimes_by_type(crimes):
    """Count crimes per crime type, most frequent first.

    Groups by ``Primary Type``, counts the ``ID`` values of each group and
    sorts the result by that count in descending order. The elapsed time is
    printed in milliseconds.

    Args:
        crimes: DataFrame with ``Primary Type`` and ``ID`` columns.

    Returns:
        A DataFrame with ``Primary Type`` and the ``ID`` count, ordered by
        count descending.
    """
    t1 = time.time()
    crimes_type = crimes.groupby('Primary Type', as_index=False)['ID'].count().sort_values(by='ID', ascending=False)
    # Label the timing with the grouping actually performed here.
    print("Group by type Time: ", ((time.time() - t1) * 1000), " (ms)")
    return crimes_type


# Aggregate by crime type; preview from rank 0 only.
crimes_type = get_crimes_by_type(crimes)
if bodo.get_rank() == 0:
    print(crimes_type.head())

[stdout:0] Group by days Time:  35.67897950688348  (ms)
      Primary Type      ID
7            THEFT  907831
3          BATTERY  778164
9  CRIMINAL DAMAGE  499426
4        NARCOTICS  473790
1    OTHER OFFENSE  264200


### Determining the pattern by location

In [None]:
%%px
@bodo.jit(cache=True)
def get_crimes_by_location(crimes):
    """Count crimes per location description, most frequent first.

    Groups by ``Location Description``, counts the ``ID`` values of each
    group and sorts the result by that count in descending order. The
    elapsed time is printed in milliseconds.

    Args:
        crimes: DataFrame with ``Location Description`` and ``ID`` columns.

    Returns:
        A DataFrame with ``Location Description`` and the ``ID`` count,
        ordered by count descending.
    """
    t1 = time.time()
    crimes_location = crimes.groupby('Location Description', as_index=False)['ID'].count().sort_values(by='ID', ascending=False)
    # Label the timing with the grouping actually performed here.
    print("Group by location Time: ", ((time.time() - t1) * 1000), " (ms)")
    return crimes_location


# Aggregate by location; preview from rank 0 only.
crimes_location = get_crimes_by_location(crimes)
if bodo.get_rank() == 0:
    print(crimes_location.head())

[stdout:0] Group by days Time:  122.20459688614937  (ms)
   Location Description       ID
16               STREET  1001415
22            RESIDENCE   662907
17            APARTMENT   458007
86             SIDEWALK   443551
44                OTHER   145402
