# Chicago Crimes
This example shows an exploratory data analysis (EDA) of crimes in Chicago.

The original examples can be found [here](https://medium.com/@ahsanzafar222/chicago-crime-data-cleaning-and-eda-a744c687a291) and [here](https://www.kaggle.com/fahd09/eda-of-crime-in-chicago-2005-2016).


### Start an IPyParallel cluster (skip if running on Bodo Platform)
Run the following code in a cell to start an IPyParallel cluster. This example uses up to 8 cores.

In [1]:
import ipyparallel as ipp
import psutil

# psutil.cpu_count(logical=False) returns None when the number of physical
# cores cannot be determined; fall back to the logical count and finally to a
# single engine so that min() never receives None.
physical_cores = psutil.cpu_count(logical=False) or psutil.cpu_count() or 1
n = min(physical_cores, 8)  # never start more than 8 engines
rc = ipp.Cluster(engines='mpi', n=n).start_and_connect_sync(activate=True)

Starting 8 engines with <class 'ipyparallel.cluster.launcher.MPIEngineSetLauncher'>
100%|██████████| 8/8 [00:07<00:00,  1.13engine/s]


In [2]:
%%px
import pandas as pd
import time
import bodo

## Load Crimes Data in Chicago 2005 - 2017

In [3]:
%%px
@bodo.jit(cache=True)
def load_chicago_crimes():
    """Read the three Chicago crime CSV files from S3 into one DataFrame.

    The per-period files are concatenated row-wise (the original row labels
    are kept) and the result is sorted by the "ID" column. The elapsed time
    is printed in milliseconds.

    Returns:
        The combined crimes DataFrame, sorted by "ID".
    """
    start = time.time()
    part_2005_2007 = pd.read_csv('s3://bodo-example-data/chicago-crimes/Chicago_Crimes_2005_to_2007.csv')
    part_2008_2011 = pd.read_csv('s3://bodo-example-data/chicago-crimes/Chicago_Crimes_2008_to_2011.csv')
    part_2012_2017 = pd.read_csv('s3://bodo-example-data/chicago-crimes/Chicago_Crimes_2012_to_2017.csv')
    combined = pd.concat(
        [part_2005_2007, part_2008_2011, part_2012_2017],
        ignore_index=False,
        axis=0,
    )
    combined = combined.sort_values(by="ID")
    elapsed_ms = (time.time() - start) * 1000
    print("Reading time: ", elapsed_ms, " (ms)")
    return combined


crimes1 = load_chicago_crimes()
# Print from rank 0 only so the preview is not repeated by every engine.
if bodo.get_rank() == 0:
    print(crimes1.head())

%px:   0%|          | 0/8 [02:05<?, ?tasks/s]


Received Keyboard Interrupt. Sending signal SIGINT to engines...


## Preprocessing and Cleaning
 1. Drop duplicate records, remove unused columns, and add the day of the week and the date of each crime.
 2. Keep only the most frequent crime type categories.


In [None]:
%%px
@bodo.jit(cache=True)
def data_cleanup(crimes):
    """Deduplicate the crimes data, drop unused columns and add date fields.

    Steps, in order:
      1. remove fully duplicated rows,
      2. drop the columns that the analysis does not use,
      3. parse "Date" into datetimes,
      4. add "dow" (day of week) and "date only" (timestamp floored to the
         day),
      5. sort by "ID".

    The elapsed time is printed in milliseconds.

    Args:
        crimes: DataFrame holding the raw crime records.

    Returns:
        The cleaned DataFrame, sorted by "ID".
    """
    start = time.time()
    deduped = crimes.drop_duplicates()
    cleaned = deduped.drop(
        ['Unnamed: 0', 'Case Number', 'IUCR', 'Updated On', 'Year', 'FBI Code', 'Beat', 'Ward', 'Community Area', 'Location'],
        axis=1,
    )
    cleaned["Date"] = pd.to_datetime(cleaned["Date"], format='%m/%d/%Y %I:%M:%S %p')
    cleaned["dow"] = cleaned["Date"].dt.dayofweek
    cleaned["date only"] = cleaned["Date"].dt.floor('D')
    cleaned = cleaned.sort_values(by="ID")
    elapsed_ms = (time.time() - start) * 1000
    print("Data cleanup time: ", elapsed_ms, " (ms)")
    return cleaned


crimes = data_cleanup(crimes1)
# Print from rank 0 only so the preview is not repeated by every engine.
if bodo.get_rank() == 0:
    print(crimes.head())

%px:   0%|          | 0/8 [00:41<?, ?tasks/s]

[stdout:0] Data cleanup time:  22520.818512000005  (ms)
           ID                Date                  Block Primary Type  \
1324003  3012 2005-01-01 13:15:00  076XX S GREENWOOD AVE     HOMICIDE   
1324004  3013 2005-01-02 21:45:00        029XX E 82ND ST     HOMICIDE   
1324005  3014 2005-01-04 16:39:00  070XX S CONSTANCE AVE     HOMICIDE   
1324006  3015 2005-01-05 04:07:00     095XX S COLFAX AVE     HOMICIDE   
1324007  3016 2005-01-08 03:15:00      015XX N DAYTON ST     HOMICIDE   

                 Description Location Description  Arrest  Domestic  District  \
1324003  FIRST DEGREE MURDER           VACANT LOT    True     False       6.0   
1324004  FIRST DEGREE MURDER               STREET    True     False       4.0   
1324005  FIRST DEGREE MURDER               STREET   False     False       3.0   
1324006  FIRST DEGREE MURDER                 AUTO   False     False       4.0   
1324007  FIRST DEGREE MURDER                 CLUB    True     False      18.0   

         X Coordin

%px: 100%|██████████| 8/8 [00:42<00:00,  5.31s/tasks]


In [None]:
%%px
@bodo.jit(cache=True)
def get_top_crime_types(crimes):
    """Return the labels of the first ten entries of the crime-type counts.

    The "Primary Type" column is tallied with ``value_counts`` and the first
    ten index labels of that tally are returned. The elapsed time is printed
    in milliseconds.

    Args:
        crimes: DataFrame with a "Primary Type" column.

    Returns:
        Index holding up to ten crime type names.
    """
    start = time.time()
    type_counts = crimes['Primary Type'].value_counts()
    leading_types = type_counts.index[:10]
    elapsed_ms = (time.time() - start) * 1000
    print("Getting top crimes Time: ", elapsed_ms, " (ms)")
    return leading_types


top_crime_types = get_top_crime_types(crimes)
# NOTE(review): allgatherv presumably gives every rank the complete list of
# types before it is used for filtering -- confirm against the Bodo docs.
top_crime_types = bodo.allgatherv(top_crime_types)
if bodo.get_rank() == 0:
    print(top_crime_types)

%px:   0%|          | 0/8 [00:06<?, ?tasks/s]

[stdout:0] Getting top crimes Time:  284.19172400003845  (ms)
Index(['THEFT', 'BATTERY', 'CRIMINAL DAMAGE', 'NARCOTICS', 'OTHER OFFENSE',
       'BURGLARY', 'ASSAULT', 'MOTOR VEHICLE THEFT', 'DECEPTIVE PRACTICE',
       'ROBBERY'],
      dtype='object')


%px: 100%|██████████| 8/8 [00:11<00:00,  1.44s/tasks]


In [None]:
%%px

@bodo.jit(cache=True)
def filter_crimes(crimes, top_crime_types):
    """Keep only the rows whose "Primary Type" is one of ``top_crime_types``.

    The elapsed time is printed in milliseconds.

    Args:
        crimes: DataFrame with a "Primary Type" column.
        top_crime_types: Collection of crime type names to keep.

    Returns:
        DataFrame containing only the matching rows.
    """
    start = time.time()
    is_top_type = crimes['Primary Type'].isin(top_crime_types)
    selected = crimes[is_top_type]
    elapsed_ms = (time.time() - start) * 1000
    print("Filtering crimes Time: ", elapsed_ms, " (ms)")
    return selected


crimes = filter_crimes(crimes, top_crime_types)
# Print from rank 0 only so the preview is not repeated by every engine.
if bodo.get_rank() == 0:
    print(crimes.head())

%px:   0%|          | 0/8 [00:06<?, ?tasks/s]

[stdout:0] Filtering crimes Time:  439.49683599998934  (ms)
              ID                Date                          Block  \
1325007  3730318 2005-01-01 00:04:00            031XX W HARRISON ST   
1325010  3730326 2005-01-01 00:05:00  012XX N LUIS MUNOZ MARIN DR W   
1325012  3730338 2005-01-01 01:13:00              019XX N DRAKE AVE   
1325013  3730341 2005-01-01 01:30:00               002XX N CANAL ST   
1325015  3730348 2005-01-01 00:00:00              006XX E GRAND AVE   

            Primary Type                  Description  \
1325007  CRIMINAL DAMAGE  TO CITY OF CHICAGO PROPERTY   
1325010  CRIMINAL DAMAGE  TO CITY OF CHICAGO PROPERTY   
1325012  CRIMINAL DAMAGE                   TO VEHICLE   
1325013            THEFT               POCKET-PICKING   
1325015            THEFT               POCKET-PICKING   

                    Location Description  Arrest  Domestic  District  \
1325007  POLICE FACILITY/VEH PARKING LOT   False     False      11.0   
1325010                   

%px: 100%|██████████| 8/8 [00:08<00:00,  1.00s/tasks]


## Crime Analysis

### Find the pattern of each crime type over the years



In [None]:
%%px
@bodo.jit(cache=True)
def get_crimes_count_date(crimes):
    """Build a table of crime counts per day and per crime type.

    Pivots ``crimes`` so that each row is a "date only" value, each column is
    a "Primary Type" and each cell is the count of "ID" values for that
    combination. The elapsed time is printed in milliseconds.

    Args:
        crimes: DataFrame with "date only", "Primary Type" and "ID" columns.

    Returns:
        The pivoted count table.
    """
    start = time.time()
    counts_by_date = crimes.pivot_table(
        index='date only',
        columns='Primary Type',
        values='ID',
        aggfunc="count",
    )
    elapsed_ms = (time.time() - start) * 1000
    print("Computing Crime Pattern Time: ", elapsed_ms, " (ms)")
    return counts_by_date


crimes_count_date = get_crimes_count_date(crimes)

%px:   0%|          | 0/8 [00:12<?, ?tasks/s]

[stdout:0] Computing Crime Pattern Time:  127.84549599996353  (ms)


%px: 100%|██████████| 8/8 [00:12<00:00,  1.58s/tasks]


In [None]:
%%px

@bodo.jit
def get_crimes_type_date(crimes_count_date):
    """Compute a 365-row rolling sum of the per-day crime counts.

    The index of ``crimes_count_date`` is converted to a ``DatetimeIndex``,
    missing counts are replaced by 0, a rolling sum over a window of 365 rows
    is taken for every column, and the result is sorted by index in
    descending order. The elapsed time is printed in milliseconds.

    Args:
        crimes_count_date: Table of counts indexed by date, one column per
            crime type (as produced by ``pivot_table`` on "date only").

    Returns:
        DataFrame of rolling sums, newest date first.
    """
    t1 = time.time()
    # NOTE(review): this assigns the index on the argument itself; whether the
    # caller's object is affected under Bodo JIT is not visible here.
    crimes_count_date.index = pd.DatetimeIndex(crimes_count_date.index)
    result = crimes_count_date.fillna(0).rolling(365).sum()
    result = result.sort_index(ascending=False)
    print("Computing Crime Pattern Time: ", ((time.time() - t1) * 1000), " (ms)")
    return result


# Store the result under its own name. Rebinding `get_crimes_type_date` to the
# returned DataFrame would shadow the function and make this cell fail when it
# is run a second time.
crimes_type_date = get_crimes_type_date(crimes_count_date)
if bodo.get_rank() == 0:
    print(crimes_type_date.head())

%px:  88%|████████▊ | 7/8 [00:32<00:00, 66.07tasks/s]

[stdout:0] Computing Crime Pattern Time:  590.0528099999747  (ms)
            ROBBERY    THEFT  OTHER OFFENSE  ASSAULT  BATTERY  NARCOTICS  \
2017-01-18  13787.0  74505.0        21276.0  21185.0  61742.0    37644.0   
2017-01-17  12720.0  69979.0        19365.0  19565.0  57182.0    32817.0   
2017-01-16  12393.0  69756.0        19199.0  19751.0  57872.0    32641.0   
2017-01-15  12712.0  69991.0        19391.0  19562.0  57254.0    32878.0   
2017-01-14  12951.0  72357.0        20128.0  20311.0  59756.0    34909.0   

            DECEPTIVE PRACTICE  BURGLARY  CRIMINAL DAMAGE  MOTOR VEHICLE THEFT  
2017-01-18             14244.0   21654.0          39808.0              15989.0  
2017-01-17             14354.0   19672.0          35676.0              14813.0  
2017-01-16             14387.0   19517.0          35538.0              13958.0  
2017-01-15             14341.0   19687.0          35673.0              14832.0  
2017-01-14             14333.0   20644.0          38017.0              1

%px: 100%|██████████| 8/8 [00:32<00:00,  4.09s/tasks]


## A general view of crime records by time, type and location

### Determining the pattern on a daily basis

In [None]:
%%px
@bodo.jit(cache=True)
def get_crimes_by_days(crimes):
    """Count crimes per day of the week.

    Groups ``crimes`` by the "dow" column, counts the "ID" values in each
    group and sorts the result by "dow". The elapsed time is printed in
    milliseconds.

    Args:
        crimes: DataFrame with "dow" and "ID" columns.

    Returns:
        DataFrame with one row per "dow" value and its "ID" count.
    """
    start = time.time()
    per_day = crimes.groupby('dow', as_index=False)['ID'].count()
    per_day_sorted = per_day.sort_values(by='dow')
    elapsed_ms = (time.time() - start) * 1000
    print("Group by days Time: ", elapsed_ms, " (ms)")
    return per_day_sorted


crimes_days = get_crimes_by_days(crimes)
# Print from rank 0 only so the preview is not repeated by every engine.
if bodo.get_rank() == 0:
    print(crimes_days.head())

[stdout:0] Group by days Time:  68.35632700006045  (ms)
   dow      ID
4    0  562811
1    1  568747
2    2  572240
3    3  566206
0    4  599764


### Determining the pattern on a monthly basis

In [None]:
%%px
@bodo.jit(cache=True)
def get_crimes_by_months(crimes):
    """Count crimes per calendar month.

    Adds a "month" column taken from the month of the "Date" column, groups
    by it, counts the "ID" values in each group and sorts the result by
    "month". The elapsed time is printed in milliseconds.

    Args:
        crimes: DataFrame with a datetime "Date" column and an "ID" column.

    Returns:
        DataFrame with one row per "month" value and its "ID" count.
    """
    t1 = time.time()
    # NOTE(review): this adds a column to the argument itself; whether the
    # caller's DataFrame is affected under Bodo JIT is not visible here.
    crimes['month'] = crimes["Date"].dt.month
    crimes_months = crimes.groupby('month', as_index=False)['ID'].count().sort_values(by='month')
    # The label names the month grouping; it previously said "days", copied
    # from the day-of-week cell.
    print("Group by months Time: ", ((time.time() - t1) * 1000), " (ms)")
    return crimes_months


crimes_months = get_crimes_by_months(crimes)
# Print from rank 0 only so the preview is not repeated by every engine.
if bodo.get_rank() == 0:
    print(crimes_months.head())

%px:   0%|          | 0/8 [00:01<?, ?tasks/s]

[stdout:0] Group by days Time:  170.03005099991242  (ms)
   month      ID
6      1  317796
7      2  267986
8      3  327381
3      4  328439


%px: 100%|██████████| 8/8 [00:01<00:00,  4.07tasks/s]


### Determining the pattern by crime type

In [None]:
%%px
@bodo.jit(cache=True)
def get_crimes_by_type(crimes):
    """Count crimes per crime type, most frequent first.

    Groups ``crimes`` by "Primary Type", counts the "ID" values in each group
    and sorts the result by that count in descending order. The elapsed time
    is printed in milliseconds.

    Args:
        crimes: DataFrame with "Primary Type" and "ID" columns.

    Returns:
        DataFrame with one row per crime type and its "ID" count.
    """
    t1 = time.time()
    crimes_type = crimes.groupby('Primary Type', as_index=False)['ID'].count().sort_values(by='ID', ascending=False)
    # The label names the type grouping; it previously said "days", copied
    # from the day-of-week cell.
    print("Group by type Time: ", ((time.time() - t1) * 1000), " (ms)")
    return crimes_type


crimes_type = get_crimes_by_type(crimes)
# Print from rank 0 only so the preview is not repeated by every engine.
if bodo.get_rank() == 0:
    print(crimes_type.head())

[stdout:0] Group by days Time:  165.71523000004618  (ms)
  Primary Type      ID
1        THEFT  907831
4      BATTERY  778164


### Determining the pattern by location

In [None]:
%%px
@bodo.jit(cache=True)
def get_crimes_by_location(crimes):
    """Count crimes per location description, most frequent first.

    Groups ``crimes`` by "Location Description", counts the "ID" values in
    each group and sorts the result by that count in descending order. The
    elapsed time is printed in milliseconds.

    Args:
        crimes: DataFrame with "Location Description" and "ID" columns.

    Returns:
        DataFrame with one row per location description and its "ID" count.
    """
    t1 = time.time()
    crimes_location = crimes.groupby('Location Description', as_index=False)['ID'].count().sort_values(by='ID', ascending=False)
    # The label names the location grouping; it previously said "days",
    # copied from the day-of-week cell.
    print("Group by location Time: ", ((time.time() - t1) * 1000), " (ms)")
    return crimes_location


crimes_location = get_crimes_by_location(crimes)
# Print from rank 0 only so the preview is not repeated by every engine.
if bodo.get_rank() == 0:
    print(crimes_location.head())

%px:   0%|          | 0/8 [00:00<?, ?tasks/s]

[stdout:0] Group by days Time:  177.27735400001166  (ms)
   Location Description       ID
31               STREET  1001415
40            RESIDENCE   662907
32            APARTMENT   458007
74             SIDEWALK   443551
95                OTHER   145402


%px: 100%|██████████| 8/8 [00:00<00:00, 388.67tasks/s]
