# Loading data

In [27]:
# Load packages
import pandas as pd

import matplotlib.cm
from matplotlib import colors
import matplotlib.pyplot as plt 
import numpy as np              
import sklearn.cluster as sklc  # For clustering
import sklearn.metrics as sklm  # For the silhouette score

from sklearn.preprocessing import RobustScaler
import sklearn

In [28]:
# Read the London IMD 2019 CSV from the working directory, then show its
# schema (column names, non-null counts, dtypes) followed by the first rows.
imd_csv_path = 'London_imd_2019_cleaning.csv'
df = pd.read_csv(imd_csv_path)
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4835 entries, 0 to 4834
Data columns (total 12 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   ls11cd                                     4835 non-null   object 
 1   la11nm                                     4835 non-null   object 
 2   london_imd_rank                            4835 non-null   int64  
 3   london_imd_decile                          4835 non-null   int64  
 4   Index of Multiple Deprivation (IMD) Score  4835 non-null   float64
 5   Income Score (rate)                        4835 non-null   float64
 6   Employment Score (rate)                    4835 non-null   float64
 7   Education, Skills and Training Score       4835 non-null   float64
 8   Health Deprivation and Disability Score    4835 non-null   float64
 9   Crime Score                                4835 non-null   float64
 10  Barriers to Housing and 

Unnamed: 0,ls11cd,la11nm,london_imd_rank,london_imd_decile,Index of Multiple Deprivation (IMD) Score,Income Score (rate),Employment Score (rate),"Education, Skills and Training Score",Health Deprivation and Disability Score,Crime Score,Barriers to Housing and Services Score,Living Environment Score
0,E01000001,City of London,4576,10,6.208,0.007,0.01,0.024,-1.654,-2.012,29.472,31.873
1,E01000002,City of London,4700,10,5.143,0.034,0.027,0.063,-1.115,-2.343,24.412,23.084
2,E01000003,City of London,2562,6,19.402,0.086,0.086,5.804,-0.102,-1.032,40.103,40.535
3,E01000005,City of London,1325,3,28.652,0.211,0.136,22.26,-0.121,-1.317,39.9,28.979
4,E01000006,Barking and Dagenham,2485,6,19.837,0.117,0.059,14.798,-0.359,-0.147,45.171,26.888


In [29]:
# Replace the verbose score column headers with the short snake_case names
# that the clustering cells select by.
short_column_names = {
    'Index of Multiple Deprivation (IMD) Score': 'imd',
    'Income Score (rate)': 'income',
    'Employment Score (rate)': 'employment',
    'Education, Skills and Training Score': 'edu_ski_tra',
    'Health Deprivation and Disability Score': 'health_depri',
    'Crime Score': 'crime',
    'Barriers to Housing and Services Score': 'hous_serv',
    'Living Environment Score': 'living_env',
}
# In-place so that the existing `df` object itself carries the new headers.
df.rename(columns=short_column_names, inplace=True)
df.head()

Unnamed: 0,ls11cd,la11nm,london_imd_rank,london_imd_decile,imd,income,employment,edu_ski_tra,health_depri,crime,hous_serv,living_env
0,E01000001,City of London,4576,10,6.208,0.007,0.01,0.024,-1.654,-2.012,29.472,31.873
1,E01000002,City of London,4700,10,5.143,0.034,0.027,0.063,-1.115,-2.343,24.412,23.084
2,E01000003,City of London,2562,6,19.402,0.086,0.086,5.804,-0.102,-1.032,40.103,40.535
3,E01000005,City of London,1325,3,28.652,0.211,0.136,22.26,-0.121,-1.317,39.9,28.979
4,E01000006,Barking and Dagenham,2485,6,19.837,0.117,0.059,14.798,-0.359,-0.147,45.171,26.888


# Clustering

In [30]:
# Keep only the seven score columns that feed the clustering; identifier and
# rank/decile columns are left behind in `df`.
mos = df
score_columns = [
    "income",
    "employment",
    "edu_ski_tra",
    "health_depri",
    "crime",
    "hous_serv",
    "living_env",
]
data = mos[score_columns]
print(data)

      income  employment  edu_ski_tra  health_depri  crime  hous_serv  \
0      0.007       0.010        0.024        -1.654 -2.012     29.472   
1      0.034       0.027        0.063        -1.115 -2.343     24.412   
2      0.086       0.086        5.804        -0.102 -1.032     40.103   
3      0.211       0.136       22.260        -0.121 -1.317     39.900   
4      0.117       0.059       14.798        -0.359 -0.147     45.171   
...      ...         ...          ...           ...    ...        ...   
4830   0.078       0.040        7.808        -0.394 -0.190     40.546   
4831   0.133       0.061        9.232        -0.196  0.214     50.293   
4832   0.184       0.119       17.868         0.879  0.365     37.315   
4833   0.154       0.095        9.758        -0.201 -0.276     45.060   
4834   0.145       0.085        6.379        -0.406  0.679     23.996   

      living_env  
0         31.873  
1         23.084  
2         40.535  
3         28.979  
4         26.888  
...      

In [31]:
# Min-max rescaling (often loosely called "standardisation"): every feature is
# mapped linearly onto [0, 1] so that no single score dominates the Euclidean
# distances used by k-means.
from sklearn.preprocessing import MinMaxScaler

# MinMaxScaler already scales each column independently, so a single fit on
# the whole frame yields the same values as fitting column by column. It also
# leaves `mms` fitted on *all* features, so it can later be reused for
# `transform` / `inverse_transform` on frames with the same columns.
mms = MinMaxScaler()
data_standardised = pd.DataFrame(
    mms.fit_transform(data),
    columns=data.columns,
    index=data.index,
)
data_standardised.head()

Unnamed: 0,income,employment,edu_ski_tra,health_depri,crime,hous_serv,living_env
0,0.00232,0.022293,0.000187,0.326228,0.072289,0.35505,0.306702
1,0.064965,0.076433,0.000848,0.438871,0.002325,0.275423,0.204685
2,0.185615,0.264331,0.098214,0.650575,0.279434,0.522346,0.407245
3,0.475638,0.423567,0.377304,0.646604,0.219193,0.519151,0.27311
4,0.257541,0.178344,0.25075,0.596865,0.466498,0.602099,0.248839


In [32]:
# K-means clustering and silhouette score
#
# Fit k-means for k = 2..10 on the rescaled features and record the average
# silhouette score of each solution in `df_silhouette_score`
# (columns: 'n_cluster', 'silhouette_score').
X = data_standardised[["income","employment","edu_ski_tra","health_depri","crime","hous_serv","living_env"]]
random_state_seed = 100

candidate_n_clusters = list(range(2, 11))
silhouette_scores = []
for n_clusters in candidate_n_clusters:
    clusterer = sklc.KMeans(n_clusters=n_clusters, random_state=random_state_seed).fit(X)
    cluster_labels = clusterer.labels_

    silhouette_avg = sklm.silhouette_score(X, cluster_labels)
    print(
        "For n_clusters =",
        n_clusters,
        "The average silhouette_score is :",
        silhouette_avg,
    )
    silhouette_scores.append(silhouette_avg)

# Build the result table once, after the loop. Assigning into the row yielded
# by DataFrame.iterrows() does not write back to the frame, which previously
# left the score column at its all-zero initial value.
df_silhouette_score = pd.DataFrame(
    {'n_cluster': candidate_n_clusters, 'silhouette_score': silhouette_scores}
)

For n_clusters = 2 The average silhouette_score is : 0.3564570594511862
For n_clusters = 3 The average silhouette_score is : 0.2496001412442607
For n_clusters = 4 The average silhouette_score is : 0.19178924727491767
For n_clusters = 5 The average silhouette_score is : 0.17933966466565676
For n_clusters = 6 The average silhouette_score is : 0.1782950637611331
For n_clusters = 7 The average silhouette_score is : 0.1672370708790995
For n_clusters = 8 The average silhouette_score is : 0.16332670459043389
For n_clusters = 9 The average silhouette_score is : 0.15859507590102145
For n_clusters = 10 The average silhouette_score is : 0.15612465777173593


In [33]:
# K-means with four clusters, fitted on the rescaled feature table.
# The fixed seed makes the centroid initialisation repeatable.
random_state_seed = 100
num_clusters = 4

kmeans_output = sklc.KMeans(
    n_clusters=num_clusters,
    random_state=random_state_seed,
)
kmeans_output.fit(data_standardised)  # fits in place; estimator now holds labels_

print(kmeans_output)


KMeans(n_clusters=4, random_state=100)


In [34]:
# One cluster label per input row, taken from the fitted k-means model
clustering_ids_kmeans = kmeans_output.labels_
print(clustering_ids_kmeans)

# Attach the labels to a copy of the unscaled feature table as `cluster_id`,
# leaving `data` itself untouched
data4 = data.copy()
data4["cluster_id"] = clustering_ids_kmeans

# Inspect the labelled table
print(data4)

[3 3 1 ... 2 2 1]
      income  employment  edu_ski_tra  health_depri  crime  hous_serv  \
0      0.007       0.010        0.024        -1.654 -2.012     29.472   
1      0.034       0.027        0.063        -1.115 -2.343     24.412   
2      0.086       0.086        5.804        -0.102 -1.032     40.103   
3      0.211       0.136       22.260        -0.121 -1.317     39.900   
4      0.117       0.059       14.798        -0.359 -0.147     45.171   
...      ...         ...          ...           ...    ...        ...   
4830   0.078       0.040        7.808        -0.394 -0.190     40.546   
4831   0.133       0.061        9.232        -0.196  0.214     50.293   
4832   0.184       0.119       17.868         0.879  0.365     37.315   
4833   0.154       0.095        9.758        -0.201 -0.276     45.060   
4834   0.145       0.085        6.379        -0.406  0.679     23.996   

      living_env  cluster_id  
0         31.873           3  
1         23.084           3  
2         40

In [35]:
# K-means with two clusters, fitted on the rescaled feature table
random_state_seed = 100
num_clusters = 2

kmeans_output = sklc.KMeans(
    n_clusters=num_clusters,
    random_state=random_state_seed,
)
kmeans_output.fit(data_standardised)  # fits in place; estimator now holds labels_
print(kmeans_output)

# One cluster label per input row
clustering_ids_kmeans = kmeans_output.labels_
print(clustering_ids_kmeans)

# Attach the labels to a copy of the unscaled feature table as `cluster_id`
data2 = data.copy()
data2["cluster_id"] = clustering_ids_kmeans

# Inspect the labelled table
print(data2)

KMeans(n_clusters=2, random_state=100)
[1 1 1 ... 0 0 1]
      income  employment  edu_ski_tra  health_depri  crime  hous_serv  \
0      0.007       0.010        0.024        -1.654 -2.012     29.472   
1      0.034       0.027        0.063        -1.115 -2.343     24.412   
2      0.086       0.086        5.804        -0.102 -1.032     40.103   
3      0.211       0.136       22.260        -0.121 -1.317     39.900   
4      0.117       0.059       14.798        -0.359 -0.147     45.171   
...      ...         ...          ...           ...    ...        ...   
4830   0.078       0.040        7.808        -0.394 -0.190     40.546   
4831   0.133       0.061        9.232        -0.196  0.214     50.293   
4832   0.184       0.119       17.868         0.879  0.365     37.315   
4833   0.154       0.095        9.758        -0.201 -0.276     45.060   
4834   0.145       0.085        6.379        -0.406  0.679     23.996   

      living_env  cluster_id  
0         31.873           1  
1   

In [36]:
# K-means with three clusters, fitted on the rescaled feature table
random_state_seed = 100
num_clusters = 3

kmeans_output = sklc.KMeans(
    n_clusters=num_clusters,
    random_state=random_state_seed,
)
kmeans_output.fit(data_standardised)  # fits in place; estimator now holds labels_
print(kmeans_output)

# One cluster label per input row
clustering_ids_kmeans = kmeans_output.labels_
print(clustering_ids_kmeans)

# Attach the labels to a copy of the unscaled feature table as `cluster_id`
data3 = data.copy()
data3["cluster_id"] = clustering_ids_kmeans

# Inspect the labelled table
print(data3)

KMeans(n_clusters=3, random_state=100)
[2 2 0 ... 1 0 0]
      income  employment  edu_ski_tra  health_depri  crime  hous_serv  \
0      0.007       0.010        0.024        -1.654 -2.012     29.472   
1      0.034       0.027        0.063        -1.115 -2.343     24.412   
2      0.086       0.086        5.804        -0.102 -1.032     40.103   
3      0.211       0.136       22.260        -0.121 -1.317     39.900   
4      0.117       0.059       14.798        -0.359 -0.147     45.171   
...      ...         ...          ...           ...    ...        ...   
4830   0.078       0.040        7.808        -0.394 -0.190     40.546   
4831   0.133       0.061        9.232        -0.196  0.214     50.293   
4832   0.184       0.119       17.868         0.879  0.365     37.315   
4833   0.154       0.095        9.758        -0.201 -0.276     45.060   
4834   0.145       0.085        6.379        -0.406  0.679     23.996   

      living_env  cluster_id  
0         31.873           2  
1   