In [1]:
import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 15, 6

import numpy as np
import pandas as pd
import datetime
import seaborn as sns
from statsmodels.formula.api import ols
import statsmodels.api as sm

pd.set_option('display.max_columns', 500)
local_path = r"C:\Users\Chris\OneDrive\Documents\MIDS\WS231\final"

# Retrieve Historic Crime Data

In [2]:
# read in Chicago crime incident data
chgo_crimes = pd.read_csv(r"{0}\chicago_crimes_2001_2017.csv".format(local_path))
chgo_crimes.drop(chgo_crimes[['X Coordinate','Y Coordinate','Location','Updated On']],inplace=True,axis=1)
chgo_crimes.columns = ['id','case_num','date','block','iucr','primary_type','crime_short_desc',
                      'location_desc','is_arrested','is_domestic','beat','district','ward',
                      'community_area','fbi_code','year','latitude','longitude']

chgo_crimes

Unnamed: 0,id,case_num,date,block,iucr,primary_type,crime_short_desc,location_desc,is_arrested,is_domestic,beat,district,ward,community_area,fbi_code,year,latitude,longitude
0,9716392,HX366301,07/28/2014 07:00:00 PM,050XX N WINTHROP AVE,1320,CRIMINAL DAMAGE,TO VEHICLE,PARKING LOT/GARAGE(NON.RESID.),False,False,2024,20.0,48.0,3.0,14,2014,41.973924,-87.657752
1,9716393,HX366390,07/30/2014 09:00:00 AM,068XX S KILBOURN AVE,0460,BATTERY,SIMPLE,RESIDENTIAL YARD (FRONT/BACK),False,False,833,8.0,13.0,65.0,08B,2014,41.769122,-87.734705
2,9716395,HX366298,07/29/2014 08:00:00 PM,003XX E 46TH ST,0910,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,False,215,2.0,3.0,38.0,07,2014,41.811165,-87.618745
3,9716396,HX366485,07/30/2014 09:00:00 AM,115XX S MARSHFIELD AVE,0820,THEFT,$500 AND UNDER,ATHLETIC CLUB,False,False,2234,22.0,34.0,75.0,06,2014,41.684160,-87.662692
4,9716402,HX366296,07/29/2014 03:00:00 PM,001XX E 74TH ST,0820,THEFT,$500 AND UNDER,STREET,False,False,323,3.0,6.0,69.0,06,2014,41.760126,-87.620811
5,9716403,HX366400,07/30/2014 09:41:00 AM,0000X E GARFIELD BLVD,0560,ASSAULT,SIMPLE,GAS STATION,True,False,225,2.0,3.0,40.0,08A,2014,41.794672,-87.623571
6,9716404,HX366306,07/29/2014 10:00:00 AM,008XX W DIVERSEY PKWY,1310,CRIMINAL DAMAGE,TO PROPERTY,RESIDENCE,False,False,1933,19.0,44.0,6.0,14,2014,41.932713,-87.649312
7,9716405,HX366271,07/29/2014 11:00:00 PM,054XX N LOVEJOY AVE,0820,THEFT,$500 AND UNDER,STREET,False,False,1622,16.0,45.0,11.0,06,2014,41.980300,-87.770488
8,9716407,HX366370,07/30/2014 08:36:00 AM,029XX N AUSTIN AVE,0530,ASSAULT,AGGRAVATED: OTHER DANG WEAPON,STREET,False,False,2511,25.0,29.0,19.0,04A,2014,41.933890,-87.776060
9,9716408,HX365283,07/29/2014 12:30:00 PM,030XX W 63RD ST,0320,ROBBERY,STRONGARM - NO WEAPON,GAS STATION,False,False,823,8.0,15.0,66.0,03,2014,41.779040,-87.698573


# Feature Engineering

In [3]:
# Datetime features
chgo_crimes['date'] = pd.to_datetime(chgo_crimes['date'])
chgo_crimes['date_time'] = chgo_crimes['date'].apply(lambda dt: datetime.datetime(dt.year, dt.month, dt.day, dt.hour))
chgo_crimes['hour'] = chgo_crimes['date'].dt.hour
chgo_crimes['month'] = chgo_crimes['date'].dt.month

# Crime description feature
chgo_crimes['crime_desc'] = chgo_crimes[['primary_type', 'crime_short_desc']].apply(lambda x: ' - '.join(x), axis=1)

# Geo features
chgo_crimes['short_lat'], chgo_crimes['short_long'] = chgo_crimes['latitude'], chgo_crimes['longitude']
chgo_crimes.round({'short_lat':2, 'short_long':2})
#chgo_crimes['short_geo'] = chgo_crimes[['short_lat', 'short_long']].apply(lambda x: ''.join(str(x)), axis=1)

chgo_crimes

Unnamed: 0,id,case_num,date,block,iucr,primary_type,crime_short_desc,location_desc,is_arrested,is_domestic,beat,district,ward,community_area,fbi_code,year,latitude,longitude,date_time,hour,month,crime_desc,short_lat,short_long
0,9716392,HX366301,2014-07-28 19:00:00,050XX N WINTHROP AVE,1320,CRIMINAL DAMAGE,TO VEHICLE,PARKING LOT/GARAGE(NON.RESID.),False,False,2024,20.0,48.0,3.0,14,2014,41.973924,-87.657752,2014-07-28 19:00:00,19,7,CRIMINAL DAMAGE - TO VEHICLE,41.973924,-87.657752
1,9716393,HX366390,2014-07-30 09:00:00,068XX S KILBOURN AVE,0460,BATTERY,SIMPLE,RESIDENTIAL YARD (FRONT/BACK),False,False,833,8.0,13.0,65.0,08B,2014,41.769122,-87.734705,2014-07-30 09:00:00,9,7,BATTERY - SIMPLE,41.769122,-87.734705
2,9716395,HX366298,2014-07-29 20:00:00,003XX E 46TH ST,0910,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,False,215,2.0,3.0,38.0,07,2014,41.811165,-87.618745,2014-07-29 20:00:00,20,7,MOTOR VEHICLE THEFT - AUTOMOBILE,41.811165,-87.618745
3,9716396,HX366485,2014-07-30 09:00:00,115XX S MARSHFIELD AVE,0820,THEFT,$500 AND UNDER,ATHLETIC CLUB,False,False,2234,22.0,34.0,75.0,06,2014,41.684160,-87.662692,2014-07-30 09:00:00,9,7,THEFT - $500 AND UNDER,41.684160,-87.662692
4,9716402,HX366296,2014-07-29 15:00:00,001XX E 74TH ST,0820,THEFT,$500 AND UNDER,STREET,False,False,323,3.0,6.0,69.0,06,2014,41.760126,-87.620811,2014-07-29 15:00:00,15,7,THEFT - $500 AND UNDER,41.760126,-87.620811
5,9716403,HX366400,2014-07-30 09:41:00,0000X E GARFIELD BLVD,0560,ASSAULT,SIMPLE,GAS STATION,True,False,225,2.0,3.0,40.0,08A,2014,41.794672,-87.623571,2014-07-30 09:00:00,9,7,ASSAULT - SIMPLE,41.794672,-87.623571
6,9716404,HX366306,2014-07-29 10:00:00,008XX W DIVERSEY PKWY,1310,CRIMINAL DAMAGE,TO PROPERTY,RESIDENCE,False,False,1933,19.0,44.0,6.0,14,2014,41.932713,-87.649312,2014-07-29 10:00:00,10,7,CRIMINAL DAMAGE - TO PROPERTY,41.932713,-87.649312
7,9716405,HX366271,2014-07-29 23:00:00,054XX N LOVEJOY AVE,0820,THEFT,$500 AND UNDER,STREET,False,False,1622,16.0,45.0,11.0,06,2014,41.980300,-87.770488,2014-07-29 23:00:00,23,7,THEFT - $500 AND UNDER,41.980300,-87.770488
8,9716407,HX366370,2014-07-30 08:36:00,029XX N AUSTIN AVE,0530,ASSAULT,AGGRAVATED: OTHER DANG WEAPON,STREET,False,False,2511,25.0,29.0,19.0,04A,2014,41.933890,-87.776060,2014-07-30 08:00:00,8,7,ASSAULT - AGGRAVATED: OTHER DANG WEAPON,41.933890,-87.776060
9,9716408,HX365283,2014-07-29 12:30:00,030XX W 63RD ST,0320,ROBBERY,STRONGARM - NO WEAPON,GAS STATION,False,False,823,8.0,15.0,66.0,03,2014,41.779040,-87.698573,2014-07-29 12:00:00,12,7,ROBBERY - STRONGARM - NO WEAPON,41.779040,-87.698573


# District Crime Score

In [8]:
# Total number of arrests per crime description
arrested_desc = chgo_crimes[chgo_crimes['is_arrested']][['id','crime_desc','district','month','hour']]
arrested_desc = arrested_desc.groupby(['district','month','hour','crime_desc']).id.nunique().reset_index()
arrested_desc.columns = ['district','month','hour','crime_desc','arrested_total']

# Total number of incidents (arrest or non-arrest) per crime description
crime_desc = chgo_crimes[['id','crime_desc','district','month','hour']]
crime_desc = crime_desc.groupby(['district','month','hour','crime_desc']).id.nunique().reset_index()
crime_desc.columns = ['district','month','hour','crime_desc','total']

# Calculate score
chgo_district_scores = pd.merge(crime_desc, arrested_desc, how = 'left')
chgo_district_scores.fillna(0, inplace = True)
chgo_district_scores['raw_intensity'] = chgo_district_scores['arrested_total'] / chgo_district_scores['total']
chgo_district_scores['final_intensity'] = chgo_district_scores['arrested_total'] * chgo_district_scores['raw_intensity']

# Summarize score
chgo_district_scores = chgo_district_scores.groupby(['district','month','hour'])[['final_intensity','total','arrested_total']].sum().reset_index()
chgo_district_scores.sort_values('final_intensity', inplace = True)

# Save pickle file 
chgo_district_scores.to_pickle('district_crime_scores.pkl')

chgo_district_scores

Unnamed: 0,district,month,hour,final_intensity,total,arrested_total
6361,31.0,3,15,0.000000,1,0.0
6399,31.0,10,1,0.000000,1,0.0
6362,31.0,3,17,0.000000,1,0.0
5473,21.0,2,18,0.000000,1,0.0
6368,31.0,5,19,0.000000,1,0.0
6370,31.0,5,22,0.000000,1,0.0
6356,31.0,3,10,0.000000,1,0.0
3456,13.0,7,9,0.000000,1,0.0
5474,21.0,7,0,0.000000,1,0.0
6388,31.0,8,4,0.000000,1,0.0


# Training Set Auditing

In [10]:
# Load pickle file
# chgo_district_scores = pd.read_pickle('district_crime_scores.pkl')
