## Crime and Demographic Hotspot Clustering

In [55]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import os

from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn import metrics
from scipy.spatial.distance import cdist

%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
#set working directory
os.chdir(os.getcwd()+'/Data')

In [3]:
#import data
demo = pd.read_csv('demographics.csv')
crime = pd.read_csv('crimes_2001_pres_geo.csv')

In [4]:
#check the years and align the data sets
print(demo.year.unique())
print(crime.Year.unique())

[2017 2016 2015 2014 2013 2012 2011 2010 2009]
[2003 2002 2006 2008 2005 2009 2017 2012 2004 2011 2018 2016 2015 2010
 2013 2014 2007 2001 2019]


In [5]:
#trim crime df to align years
crime = crime[(crime.Year > 2010)&(crime.Year < 2019)]
demo = demo[(demo.year > 2010)&(crime.Year < 2019)]

  This is separate from the ipykernel package so we can avoid doing imports until


In [6]:
demo_2018 = demo[demo['year']==2017]
demo_2018['year'].replace({2017:2018},inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [7]:
demo = pd.concat([demo,demo_2018],ignore_index=True) #fill in 2018 with 2017 values

In [8]:
#map file to consolidate categories
crime_mapper = pd.read_csv('crime_mapper.csv')

#merge to get new mapping in crime data set
crime = pd.merge(left=crime,right=crime_mapper,how='left',
                 left_on='Primary Type',right_on='OLD_TYPE')

In [9]:
#aggregate to the year and crime type level
crime_agg = pd.DataFrame(crime.groupby(['census_tract','Year','NEW_TYPE'])['IUCR'].count())

In [10]:
#reset index
crime_agg.reset_index(inplace=True)

In [11]:
#aggregate and pivot data
crime_agg = crime_agg.groupby(['census_tract', 'Year', 'NEW_TYPE'])['IUCR'].sum() \
                    .unstack('NEW_TYPE')

In [12]:
crime_agg.shape

(6408, 23)

In [13]:
#fill na
crime_agg.fillna(0,inplace=True)

In [14]:
#reset the index to join in demographic data
crime_agg.reset_index(inplace=True)

In [15]:
#read in census tract shape file
census_shapes = gpd.read_file('geo_export_ce0fbd70-d876-49ca-9432-ae1454719ab0.shp')

In [16]:
#convert column for joining
census_shapes['name_float'] = census_shapes.name10.astype(float)

In [17]:
#join to get geoid from tracts table
crime_agg = pd.merge(left=crime_agg,right=census_shapes,
                     how='left',left_on='census_tract',right_on='name_float')

In [18]:
#set column type for join
crime_agg['geoid10_int'] = crime_agg['geoid10'].astype(int)

In [19]:
#join demographic table
crime_agg = pd.merge(left=crime_agg,right=demo,
                     left_on=['geoid10_int','Year'],right_on=['geoid','year'])

In [20]:
#set max columns
pd.set_option('display.max_columns', 500)

In [21]:
#roll data up to the community area level
len(crime_agg.commarea.unique())
sum_list = ['ARSON', 'ASSAULT AND BATTERY', 'BURGLARY',
       'CRIMINAL DAMAGE', 'DECEPTIVE PRACTICE', 'GAMBLING', 'HOMICIDE',
       'INTERFERENCE WITH PUBLIC OFFICER', 'INTIMIDATION', 'KIDNAPPING',
       'LIQUOR LAW VIOLATION', 'NARCOTICS', 'OFFENSE INVOLVING CHILDREN',
       'OTHER OFFENSE', 'PROSTITUTION', 'PUBLIC PEACE VIOLATION', 'ROBBERY',
       'SEXUAL ASSAULT', 'STALKING', 'THEFT', 'TRESPASSING', 'VEHICLE THEFT',
       'WEAPONS VIOLATION','households', 'own', 'rent','16+_tot',
       'med_income', 'total', 'white', 'black', 'amer_indian',
       'asian', 'hawaiian', 'other','all_families', 'married_fam','single_mom']

crime_agg_sums = pd.DataFrame(crime_agg.groupby(['year','commarea'])[sum_list].sum())

crime_agg_sums.reset_index(inplace=True)

In [23]:
crime_agg_sums.head()

Unnamed: 0,year,commarea,ARSON,ASSAULT AND BATTERY,BURGLARY,CRIMINAL DAMAGE,DECEPTIVE PRACTICE,GAMBLING,HOMICIDE,INTERFERENCE WITH PUBLIC OFFICER,INTIMIDATION,KIDNAPPING,LIQUOR LAW VIOLATION,NARCOTICS,OFFENSE INVOLVING CHILDREN,OTHER OFFENSE,PROSTITUTION,PUBLIC PEACE VIOLATION,ROBBERY,SEXUAL ASSAULT,STALKING,THEFT,TRESPASSING,VEHICLE THEFT,WEAPONS VIOLATION,households,own,rent,16+_tot,total,white,black,amer_indian,asian,hawaiian,other,all_families,married_fam,single_mom
0,2011,1,5.0,631.0,123.0,272.0,58.0,6.0,3.0,9.0,0.0,0.0,7.0,226.0,12.0,164.0,2.0,24.0,113.0,13.0,2.0,404.0,69.0,67.0,19.0,9442,2806,6636,18563,23793,9936,8282,7,1327,0,3450,4511,2743,1359
1,2011,10,2.0,273.0,138.0,259.0,84.0,0.0,0.0,1.0,0.0,3.0,3.0,32.0,8.0,132.0,0.0,12.0,14.0,11.0,0.0,384.0,36.0,44.0,1.0,14441,11238,3203,30122,37276,32431,295,75,1749,0,1812,9590,7360,1601
2,2011,11,1.0,249.0,132.0,205.0,92.0,0.0,2.0,6.0,1.0,0.0,1.0,53.0,19.0,136.0,0.0,6.0,16.0,6.0,4.0,286.0,30.0,65.0,3.0,9882,6739,3143,21357,26099,21514,247,74,2240,0,1551,6307,4709,1003
3,2011,12,0.0,64.0,59.0,66.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,2.0,27.0,0.0,1.0,3.0,3.0,0.0,73.0,8.0,11.0,1.0,3667,3080,587,7935,10325,7067,205,0,1801,0,896,2700,2245,321
4,2011,13,0.0,174.0,144.0,189.0,30.0,2.0,1.0,0.0,1.0,0.0,2.0,46.0,7.0,79.0,2.0,2.0,32.0,8.0,2.0,293.0,25.0,64.0,5.0,6448,3350,3098,14382,17954,9949,576,7,4745,0,1744,4071,3107,580


In [24]:
cols = ['ARSON', 'ASSAULT AND BATTERY', 'BURGLARY',
       'CRIMINAL DAMAGE', 'DECEPTIVE PRACTICE', 'GAMBLING', 'HOMICIDE',
       'INTERFERENCE WITH PUBLIC OFFICER', 'INTIMIDATION', 'KIDNAPPING',
       'LIQUOR LAW VIOLATION', 'NARCOTICS', 'OFFENSE INVOLVING CHILDREN',
       'OTHER OFFENSE', 'PROSTITUTION', 'PUBLIC PEACE VIOLATION', 'ROBBERY',
       'SEXUAL ASSAULT', 'STALKING', 'THEFT', 'TRESPASSING', 'VEHICLE THEFT',
       'WEAPONS VIOLATION']

#adding one to avoid div by zero error, only affects one row
for col in cols:
    crime_agg_sums[col]= crime_agg_sums.apply(lambda row: row[col] / (row['households']+1), 
                                  axis=1)

In [34]:
cols = ['commarea','ARSON', 'ASSAULT AND BATTERY', 'BURGLARY',
       'CRIMINAL DAMAGE', 'DECEPTIVE PRACTICE', 'GAMBLING', 'HOMICIDE',
       'INTERFERENCE WITH PUBLIC OFFICER', 'INTIMIDATION', 'KIDNAPPING',
       'LIQUOR LAW VIOLATION', 'NARCOTICS', 'OFFENSE INVOLVING CHILDREN',
       'OTHER OFFENSE', 'PROSTITUTION', 'PUBLIC PEACE VIOLATION', 'ROBBERY',
       'SEXUAL ASSAULT', 'STALKING', 'THEFT', 'TRESPASSING', 'VEHICLE THEFT',
       'WEAPONS VIOLATION']

crime_agg = crime_agg_sums[cols]

In [35]:
#take average for each community 
crime_agg = crime_agg.groupby('commarea')[cols].mean()

In [38]:
scaler = StandardScaler()
crime_agg_scaled = scaler.fit_transform(crime_agg)

In [65]:
#test kmeans model across k range
cluster_test = [i for i in range(2,15)] #cluster range to iterate over
scores = []
elbows = []
for i in cluster_test:
    cluster_mod = KMeans(n_clusters=i,random_state=13).fit(crime_agg_scaled)
    silhouette_scores = silhouette_score(crime_agg_scaled,cluster_mod.labels_)
    print("For n_clusters = {},".format(i)+" the average silhouette_score is : {}" \
          .format(silhouette_scores))
    scores.append(silhouette_scores)
    
    elbow = sum(np.min(cdist(crime_agg_scaled, cluster_mod.cluster_centers_, 
                        'euclidean'), axis=1)) / crime_agg_scaled.shape[0]
    elbows.append(elbow)

For n_clusters = 2, the average silhouette_score is : 0.859765867883428
For n_clusters = 3, the average silhouette_score is : 0.4640443808901255
For n_clusters = 4, the average silhouette_score is : 0.38341284153920735
For n_clusters = 5, the average silhouette_score is : 0.38257940394641965
For n_clusters = 6, the average silhouette_score is : 0.3803680960801778
For n_clusters = 7, the average silhouette_score is : 0.3811542555567015
For n_clusters = 8, the average silhouette_score is : 0.3865618823450311
For n_clusters = 9, the average silhouette_score is : 0.26861881283107947
For n_clusters = 10, the average silhouette_score is : 0.22280473902224562
For n_clusters = 11, the average silhouette_score is : 0.21379524832428412
For n_clusters = 12, the average silhouette_score is : 0.22655224939875215
For n_clusters = 13, the average silhouette_score is : 0.19294121147175144
For n_clusters = 14, the average silhouette_score is : 0.24078966969620128


In [44]:
cluster_test = [i for i in range(2,15)] #cluster range to iterate over
scores = []
elbows = []
for i in cluster_test:
    cluster_mod = GaussianMixture(n_components=i,random_state=13).fit(crime_agg_scaled)
    silhouette_scores = silhouette_score(crime_agg_scaled,cluster_mod \
                                         .predict(crime_agg_scaled))
    print("For n_clusters = {},".format(i)+" the average silhouette_score is : {}" \
                                                      .format(silhouette_scores))
    scores.append(silhouette_scores)

For n_clusters = 2, the average silhouette_score is : 0.859765867883428
For n_clusters = 3, the average silhouette_score is : 0.4640443808901255
For n_clusters = 4, the average silhouette_score is : 0.37665278021336196
For n_clusters = 5, the average silhouette_score is : 0.37807265584020183
For n_clusters = 6, the average silhouette_score is : 0.36299279609405377
For n_clusters = 7, the average silhouette_score is : 0.36999060303209796
For n_clusters = 8, the average silhouette_score is : 0.21587442829812337
For n_clusters = 9, the average silhouette_score is : 0.13988046221325
For n_clusters = 10, the average silhouette_score is : 0.1496592487439359
For n_clusters = 11, the average silhouette_score is : 0.12943233530460188
For n_clusters = 12, the average silhouette_score is : 0.1328038235000869
For n_clusters = 13, the average silhouette_score is : 0.12747550547076453
For n_clusters = 14, the average silhouette_score is : 0.18577616506877528


In [41]:
cluster_test = [i for i in range(2,15)] #cluster range to iterate over

for i in cluster_test:
    cluster_mod = AgglomerativeClustering(n_clusters=i,linkage='single').fit(crime_agg_scaled)
    silhouette_scores = silhouette_score(crime_agg_scaled,cluster_mod.labels_)
    print("For n_clusters = {},".format(i)+" the average silhouette_score is : {}".format(silhouette_scores))

For n_clusters = 2, the average silhouette_score is : 0.859765867883428
For n_clusters = 3, the average silhouette_score is : 0.6333181323398419
For n_clusters = 4, the average silhouette_score is : 0.618202749197191
For n_clusters = 5, the average silhouette_score is : 0.44518588082288807
For n_clusters = 6, the average silhouette_score is : 0.36017937663497346
For n_clusters = 7, the average silhouette_score is : 0.28294061131326975
For n_clusters = 8, the average silhouette_score is : 0.3404098590731111
For n_clusters = 9, the average silhouette_score is : 0.3285891596309882
For n_clusters = 10, the average silhouette_score is : 0.32002738850592233
For n_clusters = 11, the average silhouette_score is : 0.3148918623812519
For n_clusters = 12, the average silhouette_score is : 0.3303960629675287
For n_clusters = 13, the average silhouette_score is : 0.21319662943604825
For n_clusters = 14, the average silhouette_score is : 0.21184042939408887


### Use kmeans w/ K=4 

In [67]:
cluster_mod = KMeans(n_clusters=4,random_state=13).fit(crime_agg_scaled)
cluster_labels = pd.DataFrame(cluster_mod.labels_)

In [73]:
crime_agg.reset_index(inplace=True)
#crime_agg_sums.head()

In [74]:
#add cluster labels to crime data 
crime_cluster = pd.merge(left=crime_agg,right=cluster_labels,
                         how='left',left_index=True,right_index=True)

In [83]:
#join to all years table
crime_agg_sums = pd.merge(left=crime_agg_sums,right=crime_cluster,how='left',
                               left_on='commarea',right_on='commarea')

In [84]:
crime_agg_sums = crime_agg_sums[['year','commarea','ARSON_x','ASSAULT AND BATTERY_x','BURGLARY_x',
'CRIMINAL DAMAGE_x','DECEPTIVE PRACTICE_x','GAMBLING_x','HOMICIDE_x',
'INTERFERENCE WITH PUBLIC OFFICER_x','INTIMIDATION_x','KIDNAPPING_x',
'LIQUOR LAW VIOLATION_x','NARCOTICS_x','OFFENSE INVOLVING CHILDREN_x',
'OTHER OFFENSE_x','PROSTITUTION_x','PUBLIC PEACE VIOLATION_x',
'ROBBERY_x','SEXUAL ASSAULT_x','STALKING_x','THEFT_x','TRESPASSING_x',
'VEHICLE THEFT_x','WEAPONS VIOLATION_x','households',0]]

In [85]:
crime_agg_sums.shape

(491, 27)

### Export results tagged with geometry

In [86]:
#import community shape files
community_geo = gpd.read_file('Boundaries - Community Areas (current).geojson')

In [87]:
#merge 
crime_cluster = pd.merge(left=crime_agg_sums,right=community_geo,how='left',
                         left_on='commarea',right_on='area_numbe')

In [88]:
crime_cluster = crime_cluster[['year','commarea','ARSON_x','ASSAULT AND BATTERY_x','BURGLARY_x',
'CRIMINAL DAMAGE_x','DECEPTIVE PRACTICE_x','GAMBLING_x','HOMICIDE_x',
'INTERFERENCE WITH PUBLIC OFFICER_x','INTIMIDATION_x','KIDNAPPING_x',
'LIQUOR LAW VIOLATION_x','NARCOTICS_x','OFFENSE INVOLVING CHILDREN_x',
'OTHER OFFENSE_x','PROSTITUTION_x','PUBLIC PEACE VIOLATION_x',
'ROBBERY_x','SEXUAL ASSAULT_x','STALKING_x','THEFT_x','TRESPASSING_x',
'VEHICLE THEFT_x','WEAPONS VIOLATION_x','households',0,'geometry']]

In [93]:
#output data
crime_cluster.to_csv('crime_clusters.csv')

In [94]:
crime_cluster.year.unique()

array([2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018])

### Anomaly Detection

In [95]:
score = cluster_mod.fit_predict(crime_agg_scaled)
kmeans_df = pd.DataFrame(cluster_mod.transform(crime_agg_scaled))
kmeans_df = pd.concat((kmeans_df,pd.DataFrame(score)),axis=1)

In [96]:
kmeans_df.columns =[0,1,2,3,'cluster']
kmeans_df['dist_to_clus'] = kmeans_df.apply(lambda x: x[int(x["cluster"])],axis=1)

In [97]:
#rename columns
kmeans_df.rename({0:'dist_0',1:'dist_1',2:'dist_2',3:"dist_3"},axis=1,inplace=True)

In [98]:
kmeans_df.shape

(77, 6)

In [105]:
kmeans_df.head(20)

Unnamed: 0,dist_0,dist_1,dist_2,dist_3,cluster,dist_to_clus
0,7.902589,1.021124,30.584168,3.134119,1,1.021124
1,8.823239,0.987671,30.68223,3.960664,1,0.987671
2,8.251783,0.963548,30.613844,3.297643,1,0.963548
3,9.172202,1.422045,30.650851,4.436173,1,1.422045
4,8.219043,1.130647,30.565467,3.560019,1,1.130647
5,7.9998,0.757942,30.597896,3.121927,1,0.757942
6,8.360731,1.437486,30.606423,3.567463,1,1.437486
7,8.427723,0.488126,30.585842,3.588905,1,0.488126
8,8.803542,0.893884,30.590752,3.972398,1,0.893884
9,8.301144,0.944961,30.521787,3.547116,1,0.944961


In [107]:
kmeans_df = pd.merge(left=kmeans_df,right=crime_agg,left_index=True,right_index=True)

In [109]:
kmeans_df = kmeans_df[['commarea','dist_0', 'dist_1',
                       'dist_2', 'dist_3', 'cluster', 'dist_to_clus',]]

In [111]:
crime_cluster_w_anoms = pd.merge(left=crime_cluster,right=kmeans_df,
                                 left_on='commarea',right_on='commarea')

In [112]:
crime_cluster_w_anoms.to_csv('crime_clus_anomalies.csv')