## Crime and Demographic Hotspot Clustering

In [1]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import os

from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn import metrics
from scipy.spatial.distance import cdist

%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
#set working directory
os.chdir(os.getcwd()+'/Data')

In [3]:
#import data
demo = pd.read_csv('demographics.csv')
crime = pd.read_csv('crimes_2001_pres_geo.csv')

In [4]:
crime.shape

(6146248, 26)

In [5]:
crime.head()

Unnamed: 0,index_right,census_tract,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,...,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location,geo_tag_loc,geometry
0,6739173,8424.0,2630263,HJ231054,03/11/2003 08:00:00 AM,089XX S STATE ST,810,THEFT,OVER $500,STREET,...,6,1177854.0,1845897.0,2003,02/28/2018 03:56:25 PM,41.732459,-87.624029,"(41.732458945, -87.624029041)",POINT (-87.624029041 41.732458945),POINT (-87.624029041 41.732458945)
1,6431753,8424.0,2142068,HH386962,05/21/2002 11:00:00 AM,089XX S STATE ST,910,MOTOR VEHICLE THEFT,AUTOMOBILE,PARKING LOT/GARAGE(NON.RESID.),...,7,1177854.0,1845897.0,2002,02/28/2018 03:56:25 PM,41.732459,-87.624029,"(41.732458945, -87.624029041)",POINT (-87.624029041 41.732458945),POINT (-87.624029041 41.732458945)
2,6584752,8424.0,2392787,HH482587,07/02/2002 02:20:48 PM,089XX S STATE ST,820,THEFT,$500 AND UNDER,OTHER,...,6,1177854.0,1845897.0,2002,02/28/2018 03:56:25 PM,41.732459,-87.624029,"(41.732458945, -87.624029041)",POINT (-87.624029041 41.732458945),POINT (-87.624029041 41.732458945)
3,1255946,8424.0,5087829,HM690486,10/30/2006 10:01:57 AM,089XX S STATE ST,610,BURGLARY,FORCIBLE ENTRY,COMMERCIAL / BUSINESS OFFICE,...,5,1177862.0,1845535.0,2006,02/28/2018 03:56:25 PM,41.731465,-87.624011,"(41.73146539, -87.624010662)",POINT (-87.624010662 41.73146539),POINT (-87.624010662 41.73146539)
4,1839891,8424.0,6387215,HP472412,07/24/2008 07:00:00 AM,089XX S STATE ST,1320,CRIMINAL DAMAGE,TO VEHICLE,PARKING LOT/GARAGE(NON.RESID.),...,14,1177854.0,1845885.0,2008,02/28/2018 03:56:25 PM,41.732426,-87.624029,"(41.732426015, -87.624029403)",POINT (-87.62402940299999 41.732426015),POINT (-87.62402940299999 41.732426015)


In [6]:
#check the years and align the data sets
print(demo.year.unique())
print(crime.Year.unique())

[2017 2016 2015 2014 2013 2012 2011 2010 2009]
[2003 2002 2006 2008 2005 2009 2017 2012 2004 2011 2018 2016 2015 2010
 2013 2014 2007 2001 2019]


In [7]:
#trim crime df to align years
crime = crime[(crime.Year > 2010)&(crime.Year < 2019)]
demo = demo[(demo.year > 2010)&(crime.Year < 2019)]

  This is separate from the ipykernel package so we can avoid doing imports until


In [8]:
demo_2018 = demo[demo['year']==2017]
demo_2018['year'].replace({2017:2018},inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [9]:
demo = pd.concat([demo,demo_2018],ignore_index=True) #fill in 2018 with 2017 values

In [10]:
#map file to consolidate categories
crime_mapper = pd.read_csv('crime_mapper.csv')

#merge to get new mapping in crime data set
crime = pd.merge(left=crime,right=crime_mapper,how='left',
                 left_on='Primary Type',right_on='OLD_TYPE')

In [11]:
#aggregate to the year and crime type level
crime_agg = pd.DataFrame(crime.groupby(['census_tract','Year','NEW_TYPE'])['IUCR'].count())

In [12]:
#reset index
crime_agg.reset_index(inplace=True)

In [13]:
#aggregate and pivot data
crime_agg = crime_agg.groupby(['census_tract', 'Year', 'NEW_TYPE'])['IUCR'].sum() \
                    .unstack('NEW_TYPE')

In [14]:
crime_agg.shape

(6408, 23)

In [15]:
#fill na
crime_agg.fillna(0,inplace=True)

In [16]:
#reset the index to join in demographic data
crime_agg.reset_index(inplace=True)

In [17]:
#read in census tract shape file
census_shapes = gpd.read_file('geo_export_ce0fbd70-d876-49ca-9432-ae1454719ab0.shp')

In [18]:
#convert column for joining
census_shapes['name_float'] = census_shapes.name10.astype(float)

In [19]:
#join to get geoid from tracts table
crime_agg = pd.merge(left=crime_agg,right=census_shapes,
                     how='left',left_on='census_tract',right_on='name_float')

In [20]:
#set column type for join
crime_agg['geoid10_int'] = crime_agg['geoid10'].astype(int)

In [21]:
#join demographic table
crime_agg = pd.merge(left=crime_agg,right=demo,
                     left_on=['geoid10_int','Year'],right_on=['geoid','year'])

In [22]:
#set max columns
pd.set_option('display.max_columns', 500)

In [23]:
#roll data up to the community area level
len(crime_agg.commarea.unique())
sum_list = ['ARSON', 'ASSAULT AND BATTERY', 'BURGLARY',
       'CRIMINAL DAMAGE', 'DECEPTIVE PRACTICE', 'GAMBLING', 'HOMICIDE',
       'INTERFERENCE WITH PUBLIC OFFICER', 'INTIMIDATION', 'KIDNAPPING',
       'LIQUOR LAW VIOLATION', 'NARCOTICS', 'OFFENSE INVOLVING CHILDREN',
       'OTHER OFFENSE', 'PROSTITUTION', 'PUBLIC PEACE VIOLATION', 'ROBBERY',
       'SEXUAL ASSAULT', 'STALKING', 'THEFT', 'TRESPASSING', 'VEHICLE THEFT',
       'WEAPONS VIOLATION','households', 'own', 'rent','16+_tot',
       'med_income', 'total', 'white', 'black', 'amer_indian',
       'asian', 'hawaiian', 'other','all_families', 'married_fam','single_mom']

crime_agg_sums = pd.DataFrame(crime_agg.groupby(['year','commarea'])[sum_list].sum())

crime_agg_sums.reset_index(inplace=True)

In [24]:
crime_agg_sums.head()

Unnamed: 0,year,commarea,ARSON,ASSAULT AND BATTERY,BURGLARY,CRIMINAL DAMAGE,DECEPTIVE PRACTICE,GAMBLING,HOMICIDE,INTERFERENCE WITH PUBLIC OFFICER,INTIMIDATION,KIDNAPPING,LIQUOR LAW VIOLATION,NARCOTICS,OFFENSE INVOLVING CHILDREN,OTHER OFFENSE,PROSTITUTION,PUBLIC PEACE VIOLATION,ROBBERY,SEXUAL ASSAULT,STALKING,THEFT,TRESPASSING,VEHICLE THEFT,WEAPONS VIOLATION,households,own,rent,16+_tot,total,white,black,amer_indian,asian,hawaiian,other,all_families,married_fam,single_mom
0,2011,1,5.0,631.0,123.0,272.0,58.0,6.0,3.0,9.0,0.0,0.0,7.0,226.0,12.0,164.0,2.0,24.0,113.0,13.0,2.0,404.0,69.0,67.0,19.0,9442,2806,6636,18563,23793,9936,8282,7,1327,0,3450,4511,2743,1359
1,2011,10,2.0,273.0,138.0,259.0,84.0,0.0,0.0,1.0,0.0,3.0,3.0,32.0,8.0,132.0,0.0,12.0,14.0,11.0,0.0,384.0,36.0,44.0,1.0,14441,11238,3203,30122,37276,32431,295,75,1749,0,1812,9590,7360,1601
2,2011,11,1.0,249.0,132.0,205.0,92.0,0.0,2.0,6.0,1.0,0.0,1.0,53.0,19.0,136.0,0.0,6.0,16.0,6.0,4.0,286.0,30.0,65.0,3.0,9882,6739,3143,21357,26099,21514,247,74,2240,0,1551,6307,4709,1003
3,2011,12,0.0,64.0,59.0,66.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,2.0,27.0,0.0,1.0,3.0,3.0,0.0,73.0,8.0,11.0,1.0,3667,3080,587,7935,10325,7067,205,0,1801,0,896,2700,2245,321
4,2011,13,0.0,174.0,144.0,189.0,30.0,2.0,1.0,0.0,1.0,0.0,2.0,46.0,7.0,79.0,2.0,2.0,32.0,8.0,2.0,293.0,25.0,64.0,5.0,6448,3350,3098,14382,17954,9949,576,7,4745,0,1744,4071,3107,580


In [25]:
cols = ['ARSON', 'ASSAULT AND BATTERY', 'BURGLARY',
       'CRIMINAL DAMAGE', 'DECEPTIVE PRACTICE', 'GAMBLING', 'HOMICIDE',
       'INTERFERENCE WITH PUBLIC OFFICER', 'INTIMIDATION', 'KIDNAPPING',
       'LIQUOR LAW VIOLATION', 'NARCOTICS', 'OFFENSE INVOLVING CHILDREN',
       'OTHER OFFENSE', 'PROSTITUTION', 'PUBLIC PEACE VIOLATION', 'ROBBERY',
       'SEXUAL ASSAULT', 'STALKING', 'THEFT', 'TRESPASSING', 'VEHICLE THEFT',
       'WEAPONS VIOLATION']

#adding one to avoid div by zero error, only affects one row
for col in cols:
    crime_agg_sums[col]= crime_agg_sums.apply(lambda row: row[col] / (row['households']+1), 
                                  axis=1)

In [26]:
cols = ['commarea','ARSON', 'ASSAULT AND BATTERY', 'BURGLARY',
       'CRIMINAL DAMAGE', 'DECEPTIVE PRACTICE', 'GAMBLING', 'HOMICIDE',
       'INTERFERENCE WITH PUBLIC OFFICER', 'INTIMIDATION', 'KIDNAPPING',
       'LIQUOR LAW VIOLATION', 'NARCOTICS', 'OFFENSE INVOLVING CHILDREN',
       'OTHER OFFENSE', 'PROSTITUTION', 'PUBLIC PEACE VIOLATION', 'ROBBERY',
       'SEXUAL ASSAULT', 'STALKING', 'THEFT', 'TRESPASSING', 'VEHICLE THEFT',
       'WEAPONS VIOLATION']

crime_agg = crime_agg_sums[cols]

In [29]:
#take average for each community 
crime_agg = crime_agg.groupby('commarea')[cols].mean()

In [31]:
crime_agg.reset_index(inplace=True)

In [34]:
crime_agg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 24 columns):
commarea                            77 non-null object
ARSON                               77 non-null float64
ASSAULT AND BATTERY                 77 non-null float64
BURGLARY                            77 non-null float64
CRIMINAL DAMAGE                     77 non-null float64
DECEPTIVE PRACTICE                  77 non-null float64
GAMBLING                            77 non-null float64
HOMICIDE                            77 non-null float64
INTERFERENCE WITH PUBLIC OFFICER    77 non-null float64
INTIMIDATION                        77 non-null float64
KIDNAPPING                          77 non-null float64
LIQUOR LAW VIOLATION                77 non-null float64
NARCOTICS                           77 non-null float64
OFFENSE INVOLVING CHILDREN          77 non-null float64
OTHER OFFENSE                       77 non-null float64
PROSTITUTION                        77 non-null float64
PU

In [35]:
crime_agg['commarea'] = crime_agg['commarea'].astype(float)

In [40]:
crime_agg = crime_agg[crime_agg['commarea']!=56]

In [43]:
crime_agg.set_index('commarea',inplace=True)

In [44]:
scaler = StandardScaler()
crime_agg_scaled = scaler.fit_transform(crime_agg)

In [45]:
#test kmeans model across k range
cluster_test = [i for i in range(2,15)] #cluster range to iterate over
scores = []
elbows = []
for i in cluster_test:
    cluster_mod = KMeans(n_clusters=i,random_state=13).fit(crime_agg_scaled)
    silhouette_scores = silhouette_score(crime_agg_scaled,cluster_mod.labels_)
    print("For n_clusters = {},".format(i)+" the average silhouette_score is : {}" \
          .format(silhouette_scores))
    scores.append(silhouette_scores)
    
    elbow = sum(np.min(cdist(crime_agg_scaled, cluster_mod.cluster_centers_, 
                        'euclidean'), axis=1)) / crime_agg_scaled.shape[0]
    elbows.append(elbow)

For n_clusters = 2, the average silhouette_score is : 0.47917286020426514
For n_clusters = 3, the average silhouette_score is : 0.41883599955752304
For n_clusters = 4, the average silhouette_score is : 0.414608147003916
For n_clusters = 5, the average silhouette_score is : 0.42388517047415203
For n_clusters = 6, the average silhouette_score is : 0.4176832616076343
For n_clusters = 7, the average silhouette_score is : 0.41709298428441893
For n_clusters = 8, the average silhouette_score is : 0.24335967218382146
For n_clusters = 9, the average silhouette_score is : 0.1699546939232783
For n_clusters = 10, the average silhouette_score is : 0.20848795060878567
For n_clusters = 11, the average silhouette_score is : 0.16153619859533488
For n_clusters = 12, the average silhouette_score is : 0.20910546058122356
For n_clusters = 13, the average silhouette_score is : 0.19752571975424024
For n_clusters = 14, the average silhouette_score is : 0.19011342981646565


In [47]:
elbows

[2.9203498768583827,
 2.57894962435732,
 2.3985377268371284,
 2.2725366760724497,
 2.149758972891493,
 2.048562457553399,
 1.9220495073378574,
 1.8229741996323139,
 1.7656636724813553,
 1.6841106493170444,
 1.6125566610062427,
 1.5710956120511363,
 1.5052391937052687]

In [None]:
cluster_test = [i for i in range(2,15)] #cluster range to iterate over
scores = []
for i in cluster_test:
    cluster_mod = GaussianMixture(n_components=i,random_state=13).fit(crime_agg_scaled)
    silhouette_scores = silhouette_score(crime_agg_scaled,cluster_mod \
                                         .predict(crime_agg_scaled))
    print("For n_clusters = {},".format(i)+" the average silhouette_score is : {}" \
                                                      .format(silhouette_scores))
    scores.append(silhouette_scores)

In [None]:
cluster_test = [i for i in range(2,15)] #cluster range to iterate over

for i in cluster_test:
    cluster_mod = AgglomerativeClustering(n_clusters=i,linkage='single').fit(crime_agg_scaled)
    silhouette_scores = silhouette_score(crime_agg_scaled,cluster_mod.labels_)
    print("For n_clusters = {},".format(i)+" the average silhouette_score is : {}".format(silhouette_scores))

### Use kmeans w/ K=4 

In [48]:
cluster_mod = KMeans(n_clusters=4,random_state=13).fit(crime_agg_scaled)
cluster_labels = pd.DataFrame(cluster_mod.labels_)

In [49]:
crime_agg.reset_index(inplace=True)
#crime_agg_sums.head()

In [50]:
#add cluster labels to crime data 
crime_cluster = pd.merge(left=crime_agg,right=cluster_labels,
                         how='left',left_index=True,right_index=True)

In [56]:
#output data
crime_cluster.to_csv('crime_clusters.csv')

In [None]:
crime_cluster.year.unique()

### Anomaly Detection

In [57]:
score = cluster_mod.fit_predict(crime_agg_scaled)
kmeans_df = pd.DataFrame(cluster_mod.transform(crime_agg_scaled))
kmeans_df = pd.concat((kmeans_df,pd.DataFrame(score)),axis=1)

In [58]:
kmeans_df.columns =[0,1,2,3,'cluster']
kmeans_df['dist_to_clus'] = kmeans_df.apply(lambda x: x[int(x["cluster"])],axis=1)

In [59]:
#rename columns
kmeans_df.rename({0:'dist_0',1:'dist_1',2:'dist_2',3:"dist_3"},axis=1,inplace=True)

In [60]:
kmeans_df.shape

(76, 6)

In [64]:
kmeans_df = pd.merge(left=kmeans_df,right=crime_agg,left_index=True,right_index=True)

In [66]:
kmeans_df.columns

Index(['commarea_x', 'dist_0', 'dist_1', 'dist_2', 'dist_3', 'cluster',
       'dist_to_clus', 'commarea_y', 'ARSON', 'ASSAULT AND BATTERY',
       'BURGLARY', 'CRIMINAL DAMAGE', 'DECEPTIVE PRACTICE', 'GAMBLING',
       'HOMICIDE', 'INTERFERENCE WITH PUBLIC OFFICER', 'INTIMIDATION',
       'KIDNAPPING', 'LIQUOR LAW VIOLATION', 'NARCOTICS',
       'OFFENSE INVOLVING CHILDREN', 'OTHER OFFENSE', 'PROSTITUTION',
       'PUBLIC PEACE VIOLATION', 'ROBBERY', 'SEXUAL ASSAULT', 'STALKING',
       'THEFT', 'TRESPASSING', 'VEHICLE THEFT', 'WEAPONS VIOLATION'],
      dtype='object')

In [67]:
kmeans_df = kmeans_df[['commarea_x','dist_0', 'dist_1',
                       'dist_2', 'dist_3', 'cluster', 'dist_to_clus']]

In [69]:
crime_cluster_w_anoms = pd.merge(left=crime_cluster,right=kmeans_df,
                                 left_on='commarea',right_on='commarea_x')

In [70]:
crime_cluster_w_anoms.to_csv('crime_clus_anomalies.csv')