In [1]:
import pandas as pd
import geopandas as gpd
import os
from sklearn import metrics
from scipy.spatial.distance import cdist
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

In [2]:
# Work from the Data/ sub-folder; every file path below is relative to it.
# Guarded so that re-running this cell does not try to descend into Data/Data
# (the unguarded chdir raises FileNotFoundError on a second execution).
if os.path.basename(os.getcwd()) != "Data":
    os.chdir(os.path.join(os.getcwd(), "Data"))

In [3]:
# Raise the display limits so wide / long frames are rendered in full.
# The full option names are spelled out: an abbreviated key such as
# 'display.max_row' only resolves through pandas' pattern matching and stops
# working as soon as the pattern matches more than one option.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

In [4]:
# Load the two source tables; both paths are relative to the current working directory
SOCIO_CSV = 'Census_Data_-_Selected_socioeconomic_indicators_in_Chicago__2008___2012.csv'
HEALTH_CSV = 'Public_Health_Statistics-_Selected_public_health_indicators_by_Chicago_community_area.csv'

socio = pd.read_csv(SOCIO_CSV)    # census socioeconomic indicators
health = pd.read_csv(HEALTH_CSV)  # public health indicators

In [5]:
# Keep only rows that belong to a numbered community area.
# This replaces the positional slice `.loc[0:76]`, which silently depended on
# the file having exactly 77 area rows followed by one trailing NaN row.
# NOTE(review): presumably the only row lacking a 'Community Area Number' is
# that trailing row — verify against the CSV.
socio = socio.dropna(subset=['Community Area Number'])

In [6]:
# Merge the health and census tables on the community-area number.
# Joining on the key (instead of on the positional row index) keeps each
# area's rows paired correctly even if the two files are ordered differently.
# Both key columns are kept in the result, which gets a fresh 0..n-1 index.
sociohealth = pd.merge(left=health, right=socio, how='inner',
                       left_on='Community Area', right_on='Community Area Number')

In [7]:
# Preview the first five rows of the merged frame
sociohealth.head()

Unnamed: 0,Community Area,Community Area Name,Birth Rate,General Fertility Rate,Low Birth Weight,Prenatal Care Beginning in First Trimester,Preterm Births,Teen Birth Rate,Assault (Homicide),Breast cancer in females,Cancer (All Sites),Colorectal Cancer,Diabetes-related,Firearm-related,Infant Mortality Rate,Lung Cancer,Prostate Cancer in Males,Stroke (Cerebrovascular Disease),Childhood Blood Lead Level Screening,Childhood Lead Poisoning,Gonorrhea in Females,Gonorrhea in Males,Tuberculosis,Below Poverty Level,Crowded Housing,Dependency,No High School Diploma,Per Capita Income,Unemployment,Community Area Number,COMMUNITY AREA NAME,PERCENT OF HOUSING CROWDED,PERCENT HOUSEHOLDS BELOW POVERTY,PERCENT AGED 16+ UNEMPLOYED,PERCENT AGED 25+ WITHOUT HIGH SCHOOL DIPLOMA,PERCENT AGED UNDER 18 OR OVER 64,PER CAPITA INCOME,HARDSHIP INDEX
0,1,Rogers Park,16.4,62.0,11.0,73.0,11.2,40.8,7.7,23.3,176.9,25.3,77.1,5.2,6.4,36.7,21.7,33.7,364.7,0.5,322.5,423.3,11.4,22.7,7.9,28.8,18.1,23714,7.5,1.0,Rogers Park,7.7,23.6,8.7,18.2,27.5,23939,39.0
1,2,West Ridge,17.3,83.3,8.1,71.1,8.3,29.9,5.8,20.2,155.9,17.3,60.5,3.7,5.1,36.0,14.2,34.7,331.4,1.0,141.0,205.7,8.9,15.1,7.0,38.3,19.6,21375,7.9,2.0,West Ridge,7.8,17.2,8.8,20.8,38.5,23040,46.0
2,3,Uptown,13.1,50.5,8.3,77.7,10.3,35.1,5.4,21.3,183.3,20.5,80.0,4.6,6.5,50.5,25.2,41.7,353.7,0.5,170.8,468.7,13.6,22.7,4.6,22.2,13.6,32355,7.7,3.0,Uptown,3.8,24.0,8.9,11.8,22.2,35787,20.0
3,4,Lincoln Square,17.1,61.0,8.1,80.5,9.7,38.4,5.0,21.7,153.2,8.6,55.4,6.1,3.8,43.1,27.6,36.9,273.3,0.4,98.8,195.5,8.5,9.5,3.1,25.6,12.5,35503,6.8,4.0,Lincoln Square,3.4,10.9,8.2,13.4,25.5,37524,17.0
4,5,North Center,22.4,76.2,9.1,80.4,9.8,8.4,1.0,16.6,152.1,26.1,49.8,1.0,2.7,42.4,15.1,41.6,178.1,0.9,85.4,188.6,1.9,7.1,0.2,25.5,5.4,51615,4.5,5.0,North Center,0.3,7.5,5.2,4.5,26.2,57123,6.0


In [8]:
# Prepare the merged frame for clustering: discard the census copy of the
# area number/name and move the remaining number/name pair into the index,
# so that only indicator columns are left as columns.
census_id_cols = ['Community Area Number', 'COMMUNITY AREA NAME']
area_index_cols = ['Community Area', 'Community Area Name']
sociohealth = sociohealth.drop(census_id_cols, axis=1).set_index(area_index_cols)

In [9]:
# Clean up the gonorrhea columns: missing female rates and the "." placeholder
# in the male column are both replaced by 0.
# The results are assigned back explicitly: calling fillna/replace with
# inplace=True on a column selection may operate on a temporary copy and leave
# the frame unchanged (deprecated chained in-place modification).
sociohealth['Gonorrhea in Females'] = sociohealth['Gonorrhea in Females'].fillna(0)
sociohealth['Gonorrhea in Males'] = sociohealth['Gonorrhea in Males'].replace({".": 0})

In [10]:
# The male gonorrhea column is cast to float so the whole frame is numeric
# before preprocessing
males_col = 'Gonorrhea in Males'
males_as_float = sociohealth[males_col].astype(float)
sociohealth[males_col] = males_as_float

In [11]:
# Any NaN still left anywhere in the frame is replaced by 0
sociohealth = sociohealth.fillna(value=0)

In [12]:
# Standardise every indicator before clustering.
# The frame is cast to float first so the scaler receives a single float dtype
# instead of converting mixed int/float columns itself.
# NOTE(review): the truncated warning text below this cell looks like sklearn's
# dtype-conversion warning — confirm it disappears after this change.
scaler = StandardScaler()
sociohealth_scaled = scaler.fit_transform(sociohealth.astype(float))

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


### Cluster Testing

In [14]:
# Test the KMeans model across a range of k.
# For each k this records the mean silhouette score (printed and appended to
# `scores`) and the mean distance of each row to its nearest centroid
# (appended to `elbows`, for an elbow plot).
cluster_test = list(range(2, 15))  # candidate cluster counts: 2..14
scores = []
elbows = []
for i in cluster_test:
    # n_init is pinned explicitly: its default differs between scikit-learn
    # releases, and 10 restarts is what the older default used.
    cluster_mod = KMeans(n_clusters=i, n_init=10, random_state=13).fit(sociohealth_scaled)
    silhouette_scores = silhouette_score(sociohealth_scaled, cluster_mod.labels_)
    print("For n_clusters = {},".format(i)
          + " the average silhouette_score is : {}".format(silhouette_scores))

    scores.append(silhouette_scores)

    # distance from every row to its closest centroid, averaged over all rows
    nearest_centroid_dist = np.min(
        cdist(sociohealth_scaled, cluster_mod.cluster_centers_, 'euclidean'), axis=1)
    elbow = sum(nearest_centroid_dist) / sociohealth_scaled.shape[0]
    elbows.append(elbow)

For n_clusters = 2, the average silhouette_score is : 0.3423426675859131
For n_clusters = 3, the average silhouette_score is : 0.30001775456499347
For n_clusters = 4, the average silhouette_score is : 0.25738426424833316
For n_clusters = 5, the average silhouette_score is : 0.18869454425413498
For n_clusters = 6, the average silhouette_score is : 0.1691462954114385
For n_clusters = 7, the average silhouette_score is : 0.17733050162794609
For n_clusters = 8, the average silhouette_score is : 0.15274412208363117
For n_clusters = 9, the average silhouette_score is : 0.15125150953806152
For n_clusters = 10, the average silhouette_score is : 0.18646196106878132
For n_clusters = 11, the average silhouette_score is : 0.1660680413855916
For n_clusters = 12, the average silhouette_score is : 0.16626658702230337
For n_clusters = 13, the average silhouette_score is : 0.15070415180143923
For n_clusters = 14, the average silhouette_score is : 0.14279914283363682


In [34]:
# Same sweep with a Gaussian mixture: fit one model per component count and
# print the mean silhouette score of its hard assignments
cluster_test = list(range(2, 15))  # component counts to iterate over

for i in cluster_test:
    cluster_mod = GaussianMixture(n_components=i, random_state=13)
    cluster_mod = cluster_mod.fit(sociohealth_scaled)
    gmm_labels = cluster_mod.predict(sociohealth_scaled)
    silhouette_scores = silhouette_score(sociohealth_scaled, gmm_labels)
    msg_head = "For n_clusters = {},".format(i)
    msg_tail = " the average silhouette_score is : {}".format(silhouette_scores)
    print(msg_head + msg_tail)

For n_clusters = 2, the average silhouette_score is : 0.3423426675859131
For n_clusters = 3, the average silhouette_score is : 0.1730497013159215
For n_clusters = 4, the average silhouette_score is : 0.23252797761785282
For n_clusters = 5, the average silhouette_score is : 0.20215898879828037
For n_clusters = 6, the average silhouette_score is : 0.16235202493280068
For n_clusters = 7, the average silhouette_score is : 0.1403408602384486
For n_clusters = 8, the average silhouette_score is : 0.19245818719182162
For n_clusters = 9, the average silhouette_score is : 0.18273268228460193
For n_clusters = 10, the average silhouette_score is : 0.18646196106878132
For n_clusters = 11, the average silhouette_score is : 0.14891106279035085
For n_clusters = 12, the average silhouette_score is : 0.1482273891720097
For n_clusters = 13, the average silhouette_score is : 0.13357318302896723
For n_clusters = 14, the average silhouette_score is : 0.1286930487853086


In [35]:
# Same sweep with single-linkage agglomerative clustering: fit one model per
# cluster count and print the mean silhouette score of its labels
cluster_test = list(range(2, 15))  # cluster counts to iterate over

for i in cluster_test:
    cluster_mod = AgglomerativeClustering(n_clusters=i, linkage='single')
    cluster_mod = cluster_mod.fit(sociohealth_scaled)
    silhouette_scores = silhouette_score(sociohealth_scaled, cluster_mod.labels_)
    msg_head = "For n_clusters = {},".format(i)
    msg_tail = " the average silhouette_score is : {}".format(silhouette_scores)
    print(msg_head + msg_tail)

For n_clusters = 2, the average silhouette_score is : 0.2646583542194551
For n_clusters = 3, the average silhouette_score is : 0.2584107004411183
For n_clusters = 4, the average silhouette_score is : 0.23130301102368683
For n_clusters = 5, the average silhouette_score is : 0.020518975356178124
For n_clusters = 6, the average silhouette_score is : -0.043730953876489446
For n_clusters = 7, the average silhouette_score is : -0.05555902042428653
For n_clusters = 8, the average silhouette_score is : -0.06101447975239741
For n_clusters = 9, the average silhouette_score is : -0.0843913899743957
For n_clusters = 10, the average silhouette_score is : -0.09321769309941778
For n_clusters = 11, the average silhouette_score is : -0.11890609222707499
For n_clusters = 12, the average silhouette_score is : -0.15635285012160738
For n_clusters = 13, the average silhouette_score is : -0.16569252974731763
For n_clusters = 14, the average silhouette_score is : -0.1920367918524907


### Use KMeans with k = 4

In [36]:
# Fit the final 4-cluster KMeans model.
# n_init is pinned explicitly: its default differs between scikit-learn
# releases, and 10 restarts is what the older default used.
cluster_mod = KMeans(n_clusters=4, n_init=10, random_state=13).fit(sociohealth_scaled)
# One label per row of sociohealth_scaled; the single column is named 0
cluster_labels = pd.DataFrame(cluster_mod.labels_)

In [38]:
# Turn the (Community Area, Community Area Name) index back into ordinary
# columns, leaving a default integer index
sociohealth = sociohealth.reset_index()

In [39]:
# Attach the cluster labels to the socio/health frame, row by row via the index
sociohealth_cluster = sociohealth.merge(
    cluster_labels,
    how='left',
    left_index=True,
    right_index=True,
)

In [41]:
# Load the community-area boundary shapes (path relative to the working directory)
COMMUNITY_GEOJSON = 'Boundaries - Community Areas (current).geojson'
community_geo = gpd.read_file(COMMUNITY_GEOJSON)

In [45]:
# The merge key must be an integer to line up with 'Community Area'
area_numbers = community_geo['area_numbe']
community_geo['area_numbe'] = area_numbers.astype(int)

In [46]:
# Bring in the boundary columns (including geometry) for every community area
sociohealth_cluster = sociohealth_cluster.merge(
    community_geo,
    how='left',
    left_on='Community Area',
    right_on='area_numbe',
)

In [54]:
# Remove the boundary-file attribute columns that are not needed downstream
unneeded_geo_cols = [
    'community', 'area', 'shape_area', 'perimeter', 'area_num_1',
    'area_numbe', 'comarea_id', 'comarea', 'shape_len',
]
sociohealth_cluster = sociohealth_cluster.drop(unneeded_geo_cols, axis=1)

In [56]:
# Write the clustered frame to CSV in the current working directory
CLUSTERS_CSV = 'sociohealth_clusters.csv'
sociohealth_cluster.to_csv(CLUSTERS_CSV)

### Anomaly Detection

In [57]:
# Build one row per area: the model's transform() output (one column per
# cluster) followed by the assigned cluster label.
# NOTE(review): this refits whatever model `cluster_mod` currently refers to —
# it is expected to be the 4-cluster KMeans; confirm the cell execution order.
score = cluster_mod.fit_predict(sociohealth_scaled)
centroid_dist = pd.DataFrame(cluster_mod.transform(sociohealth_scaled))
assigned_cluster = pd.DataFrame(score)
kmeans_df = pd.concat([centroid_dist, assigned_cluster], axis=1)

In [58]:
# Label the per-cluster columns 0..k-1 and the trailing label column 'cluster'.
# k is read from the fitted model rather than hard-coded, so this keeps working
# if the number of clusters is changed.
n_dist_cols = cluster_mod.n_clusters
dist_cols = list(range(n_dist_cols))
kmeans_df.columns = dist_cols + ['cluster']

# For every row pick the value in the column of its own cluster.
# Vectorised fancy indexing replaces the original row-wise apply.
assigned = kmeans_df['cluster'].values.astype(int)
row_positions = np.arange(len(kmeans_df))
kmeans_df['dist_to_clus'] = kmeans_df[dist_cols].values[row_positions, assigned]

In [59]:
# Rename the integer column labels 0..k-1 to 'dist_0'..'dist_{k-1}'.
# The mapping is derived from the fitted model instead of listing four keys,
# so it stays correct if the number of clusters changes.
dist_col_names = {c: 'dist_{}'.format(c) for c in range(cluster_mod.n_clusters)}
kmeans_df.rename(dist_col_names, axis=1, inplace=True)

In [60]:
# Combine the clustered frame with the per-row distance columns, matching rows by index
health_cluster_w_anoms = sociohealth_cluster.merge(
    kmeans_df,
    left_index=True,
    right_index=True,
)

In [61]:
# Write the frame with the distance columns to CSV in the current working directory
ANOMALIES_CSV = 'sociohealth_clus_anomalies.csv'
health_cluster_w_anoms.to_csv(ANOMALIES_CSV)