# Canada Neighbourhood Assignment
## Clustering Toronto neighbourhoods listed on Wikipedia's Canadian postal codes page

In [28]:
#Imports
import pandas as pd
from bs4 import BeautifulSoup
import requests
import json
import numpy as np
import geocoder
import folium
from sklearn.cluster import DBSCAN
import sklearn.utils
from sklearn.preprocessing import StandardScaler

In [6]:
# Wikipedia page holding the "M" list of Canadian postal codes.
url = (
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
)

In [7]:
#Get page request
# requests applies no timeout by default, so an unresponsive server would block
# this cell indefinitely; bound the wait explicitly.
request = requests.get(url, timeout=30)
# Fail here on an HTTP error (4xx/5xx) instead of parsing an error page as if it
# were the postal-code table.
request.raise_for_status()

In [8]:
# Build the parse tree from the raw response bytes using the 'html.parser' backend.
soup = BeautifulSoup(request.content, features='html.parser')

In [9]:
#Building dataframe
# Locate the postal-code table and fail with a clear message if the page layout
# has changed, instead of an opaque IndexError on header[0].
header=soup.find_all(class_="wikitable sortable")
if not header:
    raise ValueError('No "wikitable sortable" table found at {}'.format(url))
splitted=header[0].find_all("tr")

# The first row holds the column headers; every header becomes a column list.
# strip() removes all surrounding whitespace (not only a trailing newline) so the
# later comparison with 'Not assigned' and the merges on 'Postal Code' match exactly.
columns=splitted[0].find_all("th")
data_dict = {col.text.strip(): [] for col in columns}

for rows in splitted[1:]:
    cells = rows.find_all('td')
    # A row with fewer cells than headers would leave the column lists with
    # different lengths and make pd.DataFrame raise; skip such rows.
    if len(cells) < len(data_dict):
        continue
    # zip() stops at the shorter sequence, so surplus cells are ignored.
    for element,key in zip(cells,data_dict):
        data_dict[key].append(element.text.strip())
df=pd.DataFrame(data_dict)

In [10]:
# Keep only rows whose borough is assigned, then renumber the index from 0.
has_borough = df['Borough'] != 'Not assigned'
df = df.loc[has_borough].reset_index(drop=True)

In [11]:
# Preview the first 12 rows of the filtered table.
df.head(n=12)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [12]:
# (rows, columns) of the table after the 'Not assigned' boroughs were dropped.
df.shape

(103, 3)

#Creating the two columns
# NOTE: this is an alternative to loading Geospatial_Coordinates.csv. Run either
# this cell or the CSV merge, not both: each adds Latitude/Longitude columns.
MAX_GEOCODE_ATTEMPTS = 5  # bounded retries so a failing service cannot hang the notebook

latitudes = []
longitudes = []
for postal_code in df['Postal Code']:
    lat_lng_coords = None
    for _ in range(MAX_GEOCODE_ATTEMPTS):
        g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = g.latlng
        # Treat both None and an empty result as "no answer yet" and retry.
        if lat_lng_coords:
            break
    if not lat_lng_coords:
        # Give up on this postal code and record missing values rather than loop forever.
        lat_lng_coords = (float('nan'), float('nan'))
    latitudes.append(lat_lng_coords[0])
    longitudes.append(lat_lng_coords[1])

# Assign whole columns at once: the columns did not exist beforehand, and
# per-row chained indexing (df['Latitude'][k] = ...) is not a reliable way to write.
df['Latitude'] = latitudes
df['Longitude'] = longitudes
This should be the correct code, but it got stuck calling the geocoding service for hours, so I reverted to using the CSV file instead.

In [13]:
# Load the per-postal-code coordinates from the local CSV and preview the first rows.
geo_csv_path = 'Geospatial_Coordinates.csv'
df_geo = pd.read_csv(geo_csv_path)
df_geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [14]:
# Attach latitude/longitude to each postal code (inner join on 'Postal Code').
df_merge = df.merge(df_geo, how='inner', on='Postal Code')
df_merge.head(n=12)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


In [19]:
# Base map centred on the given Toronto coordinates.
toronto_center = ['43.6532', '-79.3832']
toronto_map = folium.Map(location=toronto_center, zoom_start=12)
toronto_map

In [21]:
# Add one circle marker per postal-code row, with the neighbourhood name(s) as popup.
marker_rows = zip(df_merge['Latitude'], df_merge['Longitude'], df_merge['Neighbourhood'])
for point_lat, point_lng, hood_name in marker_rows:
    # parse_html=True lets folium escape the label text for the popup.
    hood_popup = folium.Popup(hood_name, parse_html=True)
    marker = folium.CircleMarker(
        location=[point_lat, point_lng],
        radius=5,
        popup=hood_popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(toronto_map)
toronto_map

In [25]:
# Load the 2016 population file, keep the two columns of interest and rename
# them so the key column matches the 'Postal Code' column used elsewhere.
population_columns = {'Geographic code': 'Postal Code', 'Population, 2016': 'Population'}
df_pop = (
    pd.read_csv('Canada_population_postal_code_2016.csv')
    .loc[:, list(population_columns)]
    .rename(columns=population_columns)
)
df_pop.head()

Unnamed: 0,Postal Code,Population
0,A0A,46587
1,A0B,19792
2,A0C,12587
3,A0E,22294
4,A0G,35266


In [26]:
# Add the population of each postal code (inner join on 'Postal Code').
df_toronto = df_merge.merge(df_pop, how='inner', on='Postal Code')
df_toronto.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Population
0,M3A,North York,Parkwoods,43.753259,-79.329656,34615
1,M4A,North York,Victoria Village,43.725882,-79.315572,14443
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,41078
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,21048
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,10


In [34]:
#Clustering by location and population
# The earlier sklearn.utils.check_random_state(1000) call was dropped: it only
# returned a RandomState object that was never used, and DBSCAN takes no seed.
DBSCAN_EPS = 0.5         # neighbourhood radius, measured on the standardised features
DBSCAN_MIN_SAMPLES = 5   # minimum neighbours for a point to count as a core sample

# Features: coordinates plus population; NaNs become 0 before standardising.
Clus_dataSet = df_toronto[['Latitude','Longitude','Population']]
Clus_dataSet = np.nan_to_num(Clus_dataSet)
Clus_dataSet = StandardScaler().fit_transform(Clus_dataSet)
db = DBSCAN(eps=DBSCAN_EPS, min_samples=DBSCAN_MIN_SAMPLES).fit(Clus_dataSet)

# Boolean mask of the core samples (not used further in this cell).
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

labels = db.labels_
df_toronto['Clus_db']=labels
# Label -1 marks noise, so it is excluded from the "real" cluster count.
realClusterNum=len(set(labels)) - (1 if -1 in labels else 0)
clusterNum = len(set(labels))

#Assigning colors to clusters:
# Vectorised lookup instead of a per-row chained assignment
# (df['Colors'].loc[k] = ...), which triggers SettingWithCopyWarning, has no
# effect under pandas copy-on-write, and assumed a 0..n-1 index.
cluster_colors = {-1: '#818281', 0: '#094BF3', 1: '#F32509'}
# Any label not listed above (cluster 2 and beyond) gets the green fallback.
df_toronto['Colors'] = df_toronto['Clus_db'].map(cluster_colors).fillna('#18E002')
df_toronto.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Population,Clus_db,Colors
0,M3A,North York,Parkwoods,43.753259,-79.329656,34615,-1,#818281
1,M4A,North York,Victoria Village,43.725882,-79.315572,14443,-1,#818281
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,41078,-1,#818281
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,21048,2,#18E002
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,10,0,#094BF3


In [35]:
# Start from a fresh map so markers from earlier cells are not carried over.
toronto_map = folium.Map(location=['43.6532', '-79.3832'], zoom_start=12)

# One marker per postal code, coloured by its cluster.
popup_template = 'Neighbourhoods: {}\nPopulation: {}'
cluster_rows = zip(
    df_toronto['Latitude'],
    df_toronto['Longitude'],
    df_toronto['Neighbourhood'],
    df_toronto['Colors'],
    df_toronto['Population'],
)
for point_lat, point_lng, hood_name, cluster_color, population in cluster_rows:
    # NOTE(review): the popup text is passed as a plain string here, unlike the
    # earlier map which wraps it in folium.Popup(..., parse_html=True) - confirm
    # that labels containing apostrophes render as intended.
    marker = folium.CircleMarker(
        location=[point_lat, point_lng],
        radius=5,
        popup=popup_template.format(hood_name, population),
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(toronto_map)
toronto_map

As shown above, the postal code regions can be clustered according to their location and population. DBSCAN finds 3 clusters; the remaining regions are not similar enough to any cluster and are treated as outliers.