In [1]:
# !pip install descartes
# !pip install geopandas
# !pip install pandas_bokeh

In [2]:
#!unzip FinalOutputs.zip

In [3]:
import geopandas as gpd
from shapely.geometry import Point
import random # library for random number generation
import numpy as np # library for vectorized computation
import pandas as pd # library to process data as dataframes

import pandas_bokeh # library for visualization dashboard
import matplotlib.pyplot as plt # plotting library
# backend for rendering plots within the browser
%matplotlib inline 

from sklearn.cluster import KMeans 
from sklearn.datasets.samples_generator import make_blobs
from bokeh.models.widgets import Panel, Tabs
from bokeh.io import output_file, show
from bokeh.plotting import figure
from bokeh.layouts import layout

print('Libraries imported.')


Libraries imported.


In [4]:
# Direct the Bokeh output of this notebook to a standalone HTML file with this name.
pandas_bokeh.output_file("Classification Visualization.html")

In [5]:
# Read the three statistics shapefiles (census tract, dissemination block, grid).
# The folder is defined once and can be overridden with the REGINA_OUTPUTS_DIR
# environment variable, so the notebook is not tied to a single user's desktop;
# when the variable is unset the previous location is used.
import os
from pathlib import Path

DATA_DIR = Path(os.environ.get('REGINA_OUTPUTS_DIR', '/Users/lirui/Desktop/ReginaOutputs'))

ctstats = gpd.read_file(str(DATA_DIR / 'CT_Stats.shp'))
dbstats = gpd.read_file(str(DATA_DIR / 'DB_Stats.shp'))
gridstats = gpd.read_file(str(DATA_DIR / 'GRID_Stats.shp'))

# 1. Census Tract

In [6]:
# Preview the first five census-tract records.
ctstats.head(n=5)

Unnamed: 0,CTUID,PRNAME,CMANAME,CMATYPE,area,AvgSize,BldgCount,BD,BldgArea,BCR,ProxMean,ContCount,ContRatio,geometry
0,7050100.12,Saskatchewan,Regina,B,9886176.0,207.743248,3421,0.00034604,710689.652329,0.071887,2.26936,1146,0.33499,"POLYGON ((5313856.75143 1677717.71143, 5313403..."
1,7050100.13,Saskatchewan,Regina,B,8509538.0,262.318343,2054,0.00024138,538801.877355,0.063317,2.242403,849,0.41334,"POLYGON ((5313190.09143 1678349.31143, 5313158..."
2,7050100.14,Saskatchewan,Regina,B,664985900.0,269.965845,13,2e-08,3509.555987,5e-06,1.862394,6,0.461538,"POLYGON ((5317260.705715 1691091.03143, 531730..."
3,7050005.0,Saskatchewan,Regina,B,2138134.0,102.769961,3141,0.00146904,322800.44703,0.150973,2.637148,990,0.315186,"POLYGON ((5303740.34 1679038.68, 5303777.06285..."
4,7050006.0,Saskatchewan,Regina,B,12731070.0,277.171637,2442,0.00019181,676853.138049,0.053165,3.868232,762,0.312039,"POLYGON ((5305754.04857 1677692.21143, 5305760..."


### We need only the 5 statistics, the Census Tract ID and the geometry of the CT polygons

In [7]:
# Keep the tract ID, the five statistics and the geometry, then discard every
# row that still contains a missing value.
df_ct = (
    ctstats
    .drop(columns=['PRNAME', 'CMANAME', 'CMATYPE', 'area', 'BldgArea', 'BldgCount', 'ContCount'])
    .dropna(axis=0)
)
df_ct.head()

Unnamed: 0,CTUID,AvgSize,BD,BCR,ProxMean,ContRatio,geometry
0,7050100.12,207.743248,0.00034604,0.071887,2.26936,0.33499,"POLYGON ((5313856.75143 1677717.71143, 5313403..."
1,7050100.13,262.318343,0.00024138,0.063317,2.242403,0.41334,"POLYGON ((5313190.09143 1678349.31143, 5313158..."
2,7050100.14,269.965845,2e-08,5e-06,1.862394,0.461538,"POLYGON ((5317260.705715 1691091.03143, 531730..."
3,7050005.0,102.769961,0.00146904,0.150973,2.637148,0.315186,"POLYGON ((5303740.34 1679038.68, 5303777.06285..."
4,7050006.0,277.171637,0.00019181,0.053165,3.868232,0.312039,"POLYGON ((5305754.04857 1677692.21143, 5305760..."


## 1.1 Classification Task on the Census Tract data

### Standardizing the statistics so that the clustering is scale-invariant:

### For every given variable x we standardize it as: $\frac{(x-\mu)}{\sigma}$ , where $\mu$ is the mean and $\sigma$ is standard deviation

In [8]:
from sklearn.preprocessing import StandardScaler #for standardization

# The five building statistics are picked by name, so the feature matrix does
# not depend on the ID being the first column and the geometry the last one.
# Casting to float gives the scaler a purely numeric array.
X = df_ct[['AvgSize', 'BD', 'BCR', 'ProxMean', 'ContRatio']].astype(float).values
# Standardize every statistic to (x - mean) / std before clustering.
cluster_dataset = StandardScaler().fit_transform(X)



### k-means clustering on Census Tract

In [9]:
# Split the standardized census-tract statistics into 3 clusters.
# random_state fixes the centroid initialisation: without it the cluster
# numbering (and so every "Class N" label derived from it) can change between runs.
k_means = KMeans(init="k-means++", n_clusters=3, n_init=12, random_state=0)
k_means.fit(cluster_dataset)
# One integer cluster index per row of cluster_dataset.
labels = k_means.labels_

In [10]:
# Text form of every cluster index: 0 -> "Class 0", 1 -> "Class 1", ...
new_lab = ["Class " + str(cluster_id) for cluster_id in labels]

In [11]:
# Turn the list of label strings into a NumPy array.
new_lab = np.array(new_lab)

In [12]:
# Add both forms of the cluster label to the tract table:
# 'Labels' holds the "Class N" text, 'LabelNum' the raw integer index.
df_ct = df_ct.assign(Labels=new_lab, LabelNum=labels)
df_ct.head()

Unnamed: 0,CTUID,AvgSize,BD,BCR,ProxMean,ContRatio,geometry,Labels,LabelNum
0,7050100.12,207.743248,0.00034604,0.071887,2.26936,0.33499,"POLYGON ((5313856.75143 1677717.71143, 5313403...",Class 1,1
1,7050100.13,262.318343,0.00024138,0.063317,2.242403,0.41334,"POLYGON ((5313190.09143 1678349.31143, 5313158...",Class 1,1
2,7050100.14,269.965845,2e-08,5e-06,1.862394,0.461538,"POLYGON ((5317260.705715 1691091.03143, 531730...",Class 1,1
3,7050005.0,102.769961,0.00146904,0.150973,2.637148,0.315186,"POLYGON ((5303740.34 1679038.68, 5303777.06285...",Class 0,0
4,7050006.0,277.171637,0.00019181,0.053165,3.868232,0.312039,"POLYGON ((5305754.04857 1677692.21143, 5305760...",Class 1,1


In [13]:
# Store the text labels with the generic object dtype.
df_ct['Labels'] = df_ct.Labels.astype('object')

### Interpretation of Classes

In [14]:
# Census tracts assigned to cluster 0.
CT_cluster0 = df_ct.loc[df_ct['Labels'] == "Class 0"]

In [15]:
# Census tracts assigned to cluster 1.
CT_cluster1 = df_ct.loc[df_ct['Labels'] == "Class 1"]

In [16]:
# Census tracts assigned to cluster 2.
CT_cluster2 = df_ct.loc[df_ct['Labels'] == "Class 2"]

## 1.2 Visualization for classifications

### BCR visualization

In [17]:
# Median building coverage ratio of each census-tract cluster.
# The median is taken on the BCR column alone: DataFrame.median() over the whole
# cluster frame also visits the geometry and text columns, which pandas >= 2.0
# rejects with a TypeError (numeric_only now defaults to False).
# Bars are named "Class 0/1/2" so they match df_ct['Labels'] (the map's hover text).
ct_bcr = {
    'Class': ['Class 0', 'Class 1', 'Class 2'],
    'Building Coverage Ratio': [
        CT_cluster0["BCR"].median(),
        CT_cluster1["BCR"].median(),
        CT_cluster2["BCR"].median(),
    ],
}
df_ct_bcr = pd.DataFrame(ct_bcr).set_index("Class")

# Small bar chart; show_figure=False returns the figure for the dashboard layout.
ct_bcr_bar = df_ct_bcr.plot_bokeh.bar(
    show_figure=False,
    figsize=(300, 200),
    ylabel="value",
    title="Building Coverage Ratio",
    color="blue",
    alpha=0.6)

### BD visualization

In [18]:
# Median building density of each census-tract cluster.
# The median is taken on the BD column alone: DataFrame.median() over the whole
# cluster frame also visits the geometry and text columns, which pandas >= 2.0
# rejects with a TypeError (numeric_only now defaults to False).
# Bars are named "Class 0/1/2" so they match df_ct['Labels'] (the map's hover text).
ct_bd = {
    'Class': ['Class 0', 'Class 1', 'Class 2'],
    'Building Density': [
        CT_cluster0["BD"].median(),
        CT_cluster1["BD"].median(),
        CT_cluster2["BD"].median(),
    ],
}
df_ct_bd = pd.DataFrame(ct_bd).set_index("Class")

# Small bar chart; show_figure=False returns the figure for the dashboard layout.
ct_bd_bar = df_ct_bd.plot_bokeh.bar(
    show_figure=False,
    figsize=(300, 200),
    ylabel="value",
    title="Building Density",
    color="purple",
    alpha=0.6)

### Proximity visualization

In [19]:
# Median mean-proximity value of each census-tract cluster.
# The median is taken on the ProxMean column alone: DataFrame.median() over the
# whole cluster frame also visits the geometry and text columns, which
# pandas >= 2.0 rejects with a TypeError (numeric_only now defaults to False).
# Bars are named "Class 0/1/2" so they match df_ct['Labels'] (the map's hover text).
ct_p = {
    'Class': ['Class 0', 'Class 1', 'Class 2'],
    'Proximity': [
        CT_cluster0["ProxMean"].median(),
        CT_cluster1["ProxMean"].median(),
        CT_cluster2["ProxMean"].median(),
    ],
}
df_ct_p = pd.DataFrame(ct_p).set_index("Class")

# Small bar chart; show_figure=False returns the figure for the dashboard layout.
ct_p_bar = df_ct_p.plot_bokeh.bar(
    figsize=(300, 200),
    show_figure=False,
    ylabel="value",
    title="Proximity",
    color="Green",
    alpha=0.6)

### Size visualization

In [20]:
# Median average building size of each census-tract cluster.
# The median is taken on the AvgSize column alone: DataFrame.median() over the
# whole cluster frame also visits the geometry and text columns, which
# pandas >= 2.0 rejects with a TypeError (numeric_only now defaults to False).
# Bars are named "Class 0/1/2" so they match df_ct['Labels'] (the map's hover text).
ct_s = {
    'Class': ['Class 0', 'Class 1', 'Class 2'],
    'Average Size': [
        CT_cluster0["AvgSize"].median(),
        CT_cluster1["AvgSize"].median(),
        CT_cluster2["AvgSize"].median(),
    ],
}
df_ct_s = pd.DataFrame(ct_s).set_index("Class")

# Small bar chart; show_figure=False returns the figure for the dashboard layout.
ct_s_bar = df_ct_s.plot_bokeh.bar(
    show_figure=False,
    figsize=(300, 200),
    ylabel="value",
    title="Average Size",
    color="Red",
    alpha=0.6)

### Contiguity Ratio visualization

In [21]:
# Median contiguity ratio of each census-tract cluster.
# The median is taken on the ContRatio column alone: DataFrame.median() over the
# whole cluster frame also visits the geometry and text columns, which
# pandas >= 2.0 rejects with a TypeError (numeric_only now defaults to False).
# Bars are named "Class 0/1/2" so they match df_ct['Labels'] (the map's hover text).
ct_c = {
    'Class': ['Class 0', 'Class 1', 'Class 2'],
    "Contiguity Ratio": [
        CT_cluster0["ContRatio"].median(),
        CT_cluster1["ContRatio"].median(),
        CT_cluster2["ContRatio"].median(),
    ],
}
df_ct_c = pd.DataFrame(ct_c).set_index("Class")

# Small bar chart; show_figure=False returns the figure for the dashboard layout.
ct_c_bar = df_ct_c.plot_bokeh.bar(
    figsize=(300, 200),
    show_figure=False,
    ylabel="value",
    title="Contiguity Ratio",
    color="Grey",
    alpha=0.6)

### Integrated visualization for census tract

In [22]:
# Map of the census tracts coloured by their numeric cluster index.
# The figure is returned (not shown) so it can be placed in the dashboard layout.
Rct_class = df_ct.plot_bokeh(
    # what is drawn and how it is coloured
    category="LabelNum",
    colormap="Viridis",
    show_colorbar=True,
    # information shown when hovering a tract
    hovertool_columns=['CTUID', 'Labels'],
    # captions and chrome
    title="Classification for Census Tract",
    legend="Census Tract",
    toolbar_location="above",
    figsize=(900, 600),
    show_figure=False,
)

In [23]:
# Census-tract dashboard page: the map on top, the five bar charts below it.
ct = layout(
    [
        [Rct_class],
        [ct_bcr_bar, ct_bd_bar],
        [ct_p_bar, ct_s_bar, ct_c_bar],
    ],
    sizing_mode='fixed',
)

# 2. Dissemination Block

In [24]:
# Keep only the block ID, the five statistics and the geometry.
df_db = dbstats.drop(
    columns=['PRNAME', 'CMANAME', 'CMATYPE', 'area', 'BldgArea', 'BldgCount', 'ContCount']
)

In [25]:
# Discard every dissemination block that has a missing value in any column.
df_db = df_db.dropna(axis=0)

In [26]:
# Preview the first five dissemination-block records.
df_db.head(n=5)

Unnamed: 0,DBUID,AvgSize,BD,BCR,ProxMean,ContRatio,geometry
0,47060240002,159.403587,0.001367,0.217917,1.761887,0.30303,"POLYGON ((5310555.5 1677478.5, 5310619.5 16774..."
1,47060240003,193.716986,0.000754,0.145967,1.194216,0.142857,"POLYGON ((5310597.5 1677715.5, 5310833 1677671..."
2,47060241002,183.151856,0.001349,0.247159,1.18073,0.235294,"POLYGON ((5309899 1677747.625715, 5309992 1677..."
3,47060241003,204.489614,0.000772,0.157864,1.062728,0.4,"POLYGON ((5309923 1677656.75143, 5310007 16776..."
5,47060241007,422.638431,0.00063,0.266473,1.717315,0.666667,"POLYGON ((5310145.3 1677681.74, 5310247.442855..."


## 2.1 Classification Task on the Dissemination Block data

In [27]:
# The five building statistics are picked by name, so the feature matrix does
# not depend on the ID being the first column and the geometry the last one.
# Casting to float gives the scaler a purely numeric array.
X = df_db[['AvgSize', 'BD', 'BCR', 'ProxMean', 'ContRatio']].astype(float).values
# Standardize every statistic to (x - mean) / std before clustering.
cluster_dataset = StandardScaler().fit_transform(X)
cluster_dataset



array([[-0.09806439, -0.37797652,  0.24305496, -0.34465445, -0.36405927],
       [ 0.02671966, -1.10138149, -0.99954492, -0.54620972, -1.29294363],
       [-0.01170148, -0.39873888,  0.74805804, -0.55099802, -0.75687838],
       ...,
       [ 0.60972664, -0.83037432,  2.49216273, -0.3753711 ,  0.58490912],
       [-0.06927343, -0.89740102, -0.84310687,  0.86848266, -1.43914321],
       [-0.33616412, -0.32303466, -1.22716565, -0.1468848 , -1.0167888 ]])

In [28]:
# Split the standardized dissemination-block statistics into 3 clusters.
# random_state fixes the centroid initialisation: without it the cluster
# numbering (and so every "Class N" label derived from it) can change between runs.
k_means = KMeans(init="k-means++", n_clusters=3, n_init=12, random_state=0)
k_means.fit(cluster_dataset)
# One integer cluster index per row of cluster_dataset.
labels = k_means.labels_

In [29]:
# Text form of every cluster index: 0 -> "Class 0", 1 -> "Class 1", ...
new_lab = ["Class " + str(cluster_id) for cluster_id in labels]

In [30]:
# Turn the list of label strings into a NumPy array.
new_lab = np.array(new_lab)

In [31]:
# Add both forms of the cluster label to the block table:
# 'Labels' holds the "Class N" text, 'LabelNum' the raw integer index.
df_db = df_db.assign(Labels=new_lab, LabelNum=labels)
df_db.head()

Unnamed: 0,DBUID,AvgSize,BD,BCR,ProxMean,ContRatio,geometry,Labels,LabelNum
0,47060240002,159.403587,0.001367,0.217917,1.761887,0.30303,"POLYGON ((5310555.5 1677478.5, 5310619.5 16774...",Class 1,1
1,47060240003,193.716986,0.000754,0.145967,1.194216,0.142857,"POLYGON ((5310597.5 1677715.5, 5310833 1677671...",Class 1,1
2,47060241002,183.151856,0.001349,0.247159,1.18073,0.235294,"POLYGON ((5309899 1677747.625715, 5309992 1677...",Class 1,1
3,47060241003,204.489614,0.000772,0.157864,1.062728,0.4,"POLYGON ((5309923 1677656.75143, 5310007 16776...",Class 1,1
5,47060241007,422.638431,0.00063,0.266473,1.717315,0.666667,"POLYGON ((5310145.3 1677681.74, 5310247.442855...",Class 0,0


In [32]:
# Store the text labels with the generic object dtype.
df_db['Labels'] = df_db.Labels.astype('object')

### Interpretation of Classes

In [33]:
# One sub-table of dissemination blocks per cluster label.
DB_cluster0 = df_db.loc[df_db['Labels'] == "Class 0"]
DB_cluster1 = df_db.loc[df_db['Labels'] == "Class 1"]
DB_cluster2 = df_db.loc[df_db['Labels'] == "Class 2"]

## 2.2 Visualization for classification

### BCR visualization

In [34]:
# Median building coverage ratio of each dissemination-block cluster.
# The median is taken on the BCR column alone: DataFrame.median() over the whole
# cluster frame also visits the geometry and text columns, which pandas >= 2.0
# rejects with a TypeError (numeric_only now defaults to False).
# Bars are named "Class 0/1/2" so they match df_db['Labels'] (the map's hover text).
db_bcr = {
    'Class': ['Class 0', 'Class 1', 'Class 2'],
    'Building Coverage Ratio': [
        DB_cluster0["BCR"].median(),
        DB_cluster1["BCR"].median(),
        DB_cluster2["BCR"].median(),
    ],
}
df_db_bcr = pd.DataFrame(db_bcr).set_index("Class")

# Small bar chart; show_figure=False returns the figure for the dashboard layout.
db_bcr_bar = df_db_bcr.plot_bokeh.bar(
    show_figure=False,
    figsize=(300,200),
    ylabel="value",
    title="Building Coverage Ratio",
    color="blue",
    alpha=0.6)

### BD visualization

In [35]:
# Median building density of each dissemination-block cluster.
# The median is taken on the BD column alone: DataFrame.median() over the whole
# cluster frame also visits the geometry and text columns, which pandas >= 2.0
# rejects with a TypeError (numeric_only now defaults to False).
# Bars are named "Class 0/1/2" so they match df_db['Labels'] (the map's hover text).
db_bd = {
    'Class': ['Class 0', 'Class 1', 'Class 2'],
    'Building Density': [
        DB_cluster0["BD"].median(),
        DB_cluster1["BD"].median(),
        DB_cluster2["BD"].median(),
    ],
}
df_db_bd = pd.DataFrame(db_bd).set_index("Class")

# Small bar chart; show_figure=False returns the figure for the dashboard layout.
db_bd_bar = df_db_bd.plot_bokeh.bar(
    show_figure=False,
    figsize=(300,200),
    ylabel="value",
    title="Building Density",
    color="purple",
    alpha=0.6)

### Proximity visualization

In [36]:
# Median mean-proximity value of each dissemination-block cluster.
# The median is taken on the ProxMean column alone: DataFrame.median() over the
# whole cluster frame also visits the geometry and text columns, which
# pandas >= 2.0 rejects with a TypeError (numeric_only now defaults to False).
# Bars are named "Class 0/1/2" so they match df_db['Labels'] (the map's hover text).
db_p = {
    'Class': ['Class 0', 'Class 1', 'Class 2'],
    'Proximity': [
        DB_cluster0["ProxMean"].median(),
        DB_cluster1["ProxMean"].median(),
        DB_cluster2["ProxMean"].median(),
    ],
}
df_db_p = pd.DataFrame(db_p).set_index("Class")

# Small bar chart; show_figure=False returns the figure for the dashboard layout.
db_p_bar = df_db_p.plot_bokeh.bar(
    show_figure=False,
    figsize=(300,200),
    ylabel="value",
    title="Proximity",
    color="Green",
    alpha=0.6)

### Size visualization

In [37]:
# Median average building size of each dissemination-block cluster.
# The median is taken on the AvgSize column alone: DataFrame.median() over the
# whole cluster frame also visits the geometry and text columns, which
# pandas >= 2.0 rejects with a TypeError (numeric_only now defaults to False).
# Bars are named "Class 0/1/2" so they match df_db['Labels'] (the map's hover text).
db_s = {
    'Class': ['Class 0', 'Class 1', 'Class 2'],
    'Average Size': [
        DB_cluster0["AvgSize"].median(),
        DB_cluster1["AvgSize"].median(),
        DB_cluster2["AvgSize"].median(),
    ],
}
df_db_s = pd.DataFrame(db_s).set_index("Class")

# Small bar chart; show_figure=False returns the figure for the dashboard layout.
db_s_bar = df_db_s.plot_bokeh.bar(
    show_figure=False,
    figsize=(300,200),
    ylabel="value",
    title="Average Size",
    color="Red",
    alpha=0.6)

### Contiguity Ratio visualization

In [38]:
# Median contiguity ratio of each dissemination-block cluster.
# All three values come from the dissemination-block clusters; the third bar
# previously read CT_cluster2 (census tracts), mixing two geographic units in
# one chart.
# The median is taken on the ContRatio column alone: DataFrame.median() over the
# whole cluster frame also visits the geometry and text columns, which
# pandas >= 2.0 rejects with a TypeError (numeric_only now defaults to False).
# Bars are named "Class 0/1/2" so they match df_db['Labels'] (the map's hover text).
db_c = {
    'Class': ['Class 0', 'Class 1', 'Class 2'],
    "Contiguity Ratio": [
        DB_cluster0["ContRatio"].median(),
        DB_cluster1["ContRatio"].median(),
        DB_cluster2["ContRatio"].median(),
    ],
}
df_db_c = pd.DataFrame(db_c).set_index("Class")

# Small bar chart; show_figure=False returns the figure for the dashboard layout.
db_c_bar = df_db_c.plot_bokeh.bar(
    show_figure=False,
    figsize=(300,200),
    ylabel="value",
    title="Contiguity Ratio",
    color="Grey",
    alpha=0.6)

### Integrated visualization for Dissemination Blocks

In [39]:
# Map of the dissemination blocks coloured by their numeric cluster index.
# The figure is returned (not shown) so it can be placed in the dashboard layout.
# The legend and title use the correct spelling "Dissemination".
Rdb_class = df_db.plot_bokeh(
    show_figure=False,
    figsize=(900, 600),
    category="LabelNum",
    show_colorbar=True,
    colormap="Viridis",
    hovertool_columns=['DBUID', 'Labels'],
    legend="Dissemination Block",
    toolbar_location="above",
    title="Classification for Dissemination Block")

In [40]:
# Dissemination-block dashboard page: the map on top, the five bar charts below it.
db = layout(
    [
        [Rdb_class],
        [db_bcr_bar, db_bd_bar],
        [db_p_bar, db_s_bar, db_c_bar],
    ],
    sizing_mode='fixed',
)

# 3. 1 km<sup>2</sup> Grid

In [41]:
# Keep only the cell id, the five statistics and the geometry.
df_grid = gridstats.drop(columns=['area', 'BldgArea', 'BldgCount', 'ContCount'])

In [42]:
# Discard every grid cell that has a missing value in any column.
df_grid = df_grid.dropna(axis=0)

In [43]:
# Preview the first five grid-cell records.
df_grid.head(n=5)

Unnamed: 0,id,AvgSize,BD,BCR,ProxMean,ContRatio,geometry
0,3922,40.357167,1.2e-05,0.000484,2.761683,0.5,"POLYGON ((5294439.034285 1683576.202855, 52954..."
1,3923,2570.1148,1.3e-05,0.033411,32.097878,0.153846,"POLYGON ((5294439.034285 1682576.202855, 52954..."
3,4020,148.553032,1.1e-05,0.001634,51.955178,0.181818,"POLYGON ((5295439.034285 1682576.202855, 52964..."
4,4021,9993.058286,1e-05,0.099931,20.197038,0.3,"POLYGON ((5295439.034285 1681576.202855, 52964..."
7,4115,149.531578,2.7e-05,0.004037,33.231388,0.111111,"POLYGON ((5296439.034285 1684576.202855, 52974..."


## 3.1 Classification Task on the Grid 1km^2 data

In [44]:
# The five building statistics are picked by name, so the feature matrix does
# not depend on the id being the first column and the geometry the last one.
# Casting to float gives the scaler a purely numeric array.
X = df_grid[['AvgSize', 'BD', 'BCR', 'ProxMean', 'ContRatio']].astype(float).values
# Standardize every statistic to (x - mean) / std before clustering.
cluster_dataset = StandardScaler().fit_transform(X)



In [45]:
# Split the standardized grid-cell statistics into 3 clusters.
# random_state fixes the centroid initialisation: without it the cluster
# numbering (and so every "Class N" label derived from it) can change between runs.
k_means = KMeans(init="k-means++", n_clusters=3, n_init=12, random_state=0)
k_means.fit(cluster_dataset)
# One integer cluster index per row of cluster_dataset.
labels = k_means.labels_

In [46]:
# Text form of every cluster index: 0 -> "Class 0", 1 -> "Class 1", ...
new_lab = ["Class " + str(cluster_id) for cluster_id in labels]

In [47]:
# Turn the list of label strings into a NumPy array.
new_lab = np.array(new_lab)

In [48]:
# Add both forms of the cluster label to the grid table:
# 'Labels' holds the "Class N" text, 'LabelNum' the raw integer index.
df_grid = df_grid.assign(Labels=new_lab, LabelNum=labels)
df_grid.head()

Unnamed: 0,id,AvgSize,BD,BCR,ProxMean,ContRatio,geometry,Labels,LabelNum
0,3922,40.357167,1.2e-05,0.000484,2.761683,0.5,"POLYGON ((5294439.034285 1683576.202855, 52954...",Class 0,0
1,3923,2570.1148,1.3e-05,0.033411,32.097878,0.153846,"POLYGON ((5294439.034285 1682576.202855, 52954...",Class 0,0
3,4020,148.553032,1.1e-05,0.001634,51.955178,0.181818,"POLYGON ((5295439.034285 1682576.202855, 52964...",Class 0,0
4,4021,9993.058286,1e-05,0.099931,20.197038,0.3,"POLYGON ((5295439.034285 1681576.202855, 52964...",Class 2,2
7,4115,149.531578,2.7e-05,0.004037,33.231388,0.111111,"POLYGON ((5296439.034285 1684576.202855, 52974...",Class 0,0


In [49]:
# Store the text labels with the generic object dtype.
df_grid['Labels'] = df_grid.Labels.astype('object')

### Interpretation of Classes

In [50]:
# One sub-table of grid cells per cluster label.
GRID_cluster0 = df_grid.loc[df_grid['Labels'] == "Class 0"]
GRID_cluster1 = df_grid.loc[df_grid['Labels'] == "Class 1"]
GRID_cluster2 = df_grid.loc[df_grid['Labels'] == "Class 2"]

## 3.2 Visualization for classification

### BCR visualization

In [51]:
# Median building coverage ratio of each grid-cell cluster.
# The median is taken on the BCR column alone: DataFrame.median() over the whole
# cluster frame also visits the geometry and text columns, which pandas >= 2.0
# rejects with a TypeError (numeric_only now defaults to False).
# Bars are named "Class 0/1/2" so they match df_grid['Labels'] (the map's hover text).
gd_bcr = {
    'Class': ['Class 0', 'Class 1', 'Class 2'],
    'Building Coverage Ratio': [
        GRID_cluster0["BCR"].median(),
        GRID_cluster1["BCR"].median(),
        GRID_cluster2["BCR"].median(),
    ],
}
df_gd_bcr = pd.DataFrame(gd_bcr).set_index("Class")

# Small bar chart; show_figure=False returns the figure for the dashboard layout.
gd_bcr_bar = df_gd_bcr.plot_bokeh.bar(
    show_figure=False,
    figsize=(300,200),
    ylabel="value",
    title="Building Coverage Ratio",
    color="blue",
    alpha=0.6)

### BD visualization

In [52]:
# Median building density of each grid-cell cluster.
# The median is taken on the BD column alone: DataFrame.median() over the whole
# cluster frame also visits the geometry and text columns, which pandas >= 2.0
# rejects with a TypeError (numeric_only now defaults to False).
# Bars are named "Class 0/1/2" so they match df_grid['Labels'] (the map's hover text).
gd_bd = {
    'Class': ['Class 0', 'Class 1', 'Class 2'],
    'Building Density': [
        GRID_cluster0["BD"].median(),
        GRID_cluster1["BD"].median(),
        GRID_cluster2["BD"].median(),
    ],
}
df_gd_bd = pd.DataFrame(gd_bd).set_index("Class")

# Small bar chart; show_figure=False returns the figure for the dashboard layout.
gd_bd_bar = df_gd_bd.plot_bokeh.bar(
    figsize=(300,200),
    show_figure=False,
    ylabel="value",
    title="Building Density",
    color="purple",
    alpha=0.6)

### Proximity visualization

In [53]:
# Median mean-proximity value of each grid-cell cluster.
# The median is taken on the ProxMean column alone: DataFrame.median() over the
# whole cluster frame also visits the geometry and text columns, which
# pandas >= 2.0 rejects with a TypeError (numeric_only now defaults to False).
# Bars are named "Class 0/1/2" so they match df_grid['Labels'] (the map's hover text).
gd_p = {
    'Class': ['Class 0', 'Class 1', 'Class 2'],
    'Proximity': [
        GRID_cluster0["ProxMean"].median(),
        GRID_cluster1["ProxMean"].median(),
        GRID_cluster2["ProxMean"].median(),
    ],
}
df_gd_p = pd.DataFrame(gd_p).set_index("Class")

# Small bar chart; show_figure=False returns the figure for the dashboard layout.
gd_p_bar = df_gd_p.plot_bokeh.bar(
    figsize=(300,200),
    show_figure=False,
    ylabel="value",
    title="Proximity",
    color="Green",
    alpha=0.6)

### Size visualization

In [54]:
# Median average building size of each grid-cell cluster.
# The median is taken on the AvgSize column alone: DataFrame.median() over the
# whole cluster frame also visits the geometry and text columns, which
# pandas >= 2.0 rejects with a TypeError (numeric_only now defaults to False).
# Bars are named "Class 0/1/2" so they match df_grid['Labels'] (the map's hover text).
gd_s = {
    'Class': ['Class 0', 'Class 1', 'Class 2'],
    'Average Size': [
        GRID_cluster0["AvgSize"].median(),
        GRID_cluster1["AvgSize"].median(),
        GRID_cluster2["AvgSize"].median(),
    ],
}
df_gd_s = pd.DataFrame(gd_s).set_index("Class")

# Small bar chart; show_figure=False returns the figure for the dashboard layout.
gd_s_bar = df_gd_s.plot_bokeh.bar(
    show_figure=False,
    figsize=(300,200),
    ylabel="value",
    title="Average Size",
    color="Red",
    alpha=0.6)

### Contiguity Ratio

In [55]:
# Median contiguity ratio of each grid-cell cluster.
# The median is taken on the ContRatio column alone: DataFrame.median() over the
# whole cluster frame also visits the geometry and text columns, which
# pandas >= 2.0 rejects with a TypeError (numeric_only now defaults to False).
# Bars are named "Class 0/1/2" so they match df_grid['Labels'] (the map's hover text).
gd_c = {
    'Class': ['Class 0', 'Class 1', 'Class 2'],
    "Contiguity Ratio": [
        GRID_cluster0["ContRatio"].median(),
        GRID_cluster1["ContRatio"].median(),
        GRID_cluster2["ContRatio"].median(),
    ],
}
df_gd_c = pd.DataFrame(gd_c).set_index("Class")

# Small bar chart; show_figure=False returns the figure for the dashboard layout.
gd_c_bar = df_gd_c.plot_bokeh.bar(
    figsize=(300,200),
    show_figure=False,
    ylabel="value",
    title="Contiguity Ratio",
    color="Grey",
    alpha=0.6)



### Integrated visualization for Grid 1km^2

In [56]:
# Map of the grid cells coloured by their numeric cluster index.
# The figure is returned (not shown) so it can be placed in the dashboard layout.
Rgd_class = df_grid.plot_bokeh(
    # what is drawn and how it is coloured
    category="LabelNum",
    colormap="Viridis",
    show_colorbar=True,
    # information shown when hovering a cell
    hovertool_columns=['id', 'Labels'],
    # captions and chrome
    title="Classification for 1km^2 Grid",
    legend="1km^2 Grid",
    toolbar_location="above",
    figsize=(900, 600),
    show_figure=False,
)

In [57]:
# Grid dashboard page: the map on top, the five bar charts below it.
gd = layout(
    [
        [Rgd_class],
        [gd_bcr_bar, gd_bd_bar],
        [gd_p_bar, gd_s_bar, gd_c_bar],
    ],
    sizing_mode='fixed',
)

# 4. Integrated visualization for 3 Geographic Units

In [58]:
# One dashboard tab per geographic unit, in the order they were analysed.
tab1 = Panel(child=ct, title="Census Tract")
# Tab title uses the correct spelling "Dissemination".
tab2 = Panel(child=db, title="Dissemination Classification")
tab3 = Panel(child=gd, title="Grid 1km^2 Classification")
tabs = Tabs(tabs=[tab1, tab2, tab3])

In [59]:
# Stand-alone layout of the dissemination-block page (same figures as `db`).
# NOTE(review): p1 is not referenced by any later cell visible here — confirm it is still needed.
p1 = layout(
    [
        [Rdb_class],
        [db_bcr_bar, db_bd_bar],
        [db_p_bar, db_s_bar, db_c_bar],
    ],
    sizing_mode='fixed',
)



In [60]:
# Render the three-tab dashboard.
show(tabs)