In [95]:
from bs4 import BeautifulSoup
import os,io
import parser,urllib.request,requests
from lxml import html,etree
import re
import pandas as pd

import numpy as np
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
from geopy.geocoders import Nominatim
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium

Download the HTML table of postal codes from Wikipedia.

In [96]:
# Wikipedia page listing the Canadian postal codes that start with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# read_html fetches the page itself and returns one DataFrame per parsed table,
# so no separate URL opener is needed (the deprecated FancyURLopener was unused).
f = pd.read_html(url, header=0)
# The first table on the page is the postal-code table.
df = f[0]
df.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood

Rename the columns to match this format.

In [97]:
# Map the Wikipedia column headers onto the names used in the rest of the notebook.
column_renames = {
    'Postcode': 'PostalCode',
    'Neighbourhood': 'Neighborhood',
}
df.rename(columns=column_renames, inplace=True)

Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.

In [98]:
# Keep only rows whose borough has been assigned, then renumber the rows from 0.
has_borough = df['Borough'].ne('Not assigned')
df = df.loc[has_borough].reset_index(drop=True)

In [99]:
# Show (rows, columns) after removing the rows with an unassigned borough.
df.shape

(211, 3)

These two rows will be combined into one row with the neighborhoods separated with a comma as shown in row 11 in the above table.

The neighborhoods will be combined according to postal code and borough (grouped).

In [100]:
# One row per (PostalCode, Borough); the neighbourhood names of each group are
# concatenated into a single comma-separated string.
neighborhoods_joined = df.groupby(['PostalCode', 'Borough'])['Neighborhood'].agg(', '.join)
df = neighborhoods_joined.reset_index()

If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough. 

This basically pinpoints where the neighborhood content is 'Not assigned' and copies its borough content into it.

In [101]:
# Every neighbourhood still marked 'Not assigned' takes the name of its borough.
# A single boolean-mask .loc assignment writes into df itself (the previous
# chained df.loc[i]['Neighborhood'] = ... wrote into a row object that may be a
# temporary copy), covers every matching row, and is a no-op when none match.
unassigned = df['Neighborhood'] == 'Not assigned'
df.loc[unassigned, 'Neighborhood'] = df.loc[unassigned, 'Borough']

In [102]:
# Show (rows, columns) after combining rows that share a postal code and borough.
df.shape

(103, 3)

Now import the csv file containing the coordinates

In [103]:
# Download the CSV of postal-code coordinates.
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() fails loudly on an HTTP error instead of letting an error
# page be parsed as CSV.
response = requests.get("http://cocl.us/Geospatial_data", timeout=30)
response.raise_for_status()
coordinates = response.content
coor = pd.read_csv(io.StringIO(coordinates.decode('utf-8')))

In [104]:
# Give the coordinate table the same key column name as df so the two can be merged.
coor = coor.rename(columns={'Postal Code': 'PostalCode'})

Now merge the two tables on the postal code.

In [105]:
# Inner-join the neighbourhood table with the coordinates on the shared
# PostalCode key, then discard the key since only borough, neighbourhood and
# coordinates are used from here on.
merged_with_coordinates = df.merge(coor, on='PostalCode')
df_2 = merged_with_coordinates.drop(columns='PostalCode')

In [106]:
# Preview the merged frame: borough, neighbourhood, latitude and longitude.
df_2.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,Scarborough,Woburn,43.770992,-79.216917
4,Scarborough,Cedarbrae,43.773136,-79.239476


In [107]:
address = 'Toronto, Ontario'

# Geocode the city so the map can be centred on it.
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when the service finds no match; fail with a clear
    # message instead of an AttributeError on the next line.
    raise ValueError('Could not geocode address: {}'.format(address))
Latitude = location.latitude
Longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(Latitude, Longitude))

# create map of Toronto using latitude and longitude values
map_Toronto = folium.Map(location=[Latitude, Longitude], zoom_start=10)

# add one circle marker per row of df_2, labelled "<neighborhood>, <borough>"
for lat, lng, borough, neighborhood in zip(df_2['Latitude'], df_2['Longitude'], df_2['Borough'], df_2['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    # parse_html=True belongs to the Popup (it escapes the label text); it is
    # not a CircleMarker option, so it is no longer passed to the marker.
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='red',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_Toronto)

map_Toronto

The geograpical coordinate of Toronto are 43.653963, -79.387207.


# Explore and cluster the neighborhoods in Toronto

In [108]:
# Count the non-null PostalCode and Neighborhood entries in each borough.
df.groupby('Borough').count()

Unnamed: 0_level_0,PostalCode,Neighborhood
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1
Central Toronto,9,9
Downtown Toronto,18,18
East Toronto,5,5
East York,5,5
Etobicoke,12,12
Mississauga,1,1
North York,24,24
Queen's Park,1,1
Scarborough,17,17
West Toronto,6,6


In [109]:
# one hot encoding
toronto_onehot = pd.get_dummies(df[['Neighborhood']], prefix="", prefix_sep="")

#add encoded neighborhood to df
toronto_onehot['Borough'] = df['Borough'] 
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]
toronto_onehot.head()

Unnamed: 0,Borough,"Adelaide, King, Richmond",Agincourt,"Agincourt North, L'Amoreaux East, Milliken, Steeles East","Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown","Alderwood, Long Branch","Bathurst Manor, Downsview North, Wilson Heights",Bayview Village,"Bedford Park, Lawrence Manor East",Berczy Park,"Birch Cliff, Cliffside West","Bloordale Gardens, Eringate, Markland Wood, Old Burnhamthorpe","Brockton, Exhibition Place, Parkdale Village",Business Reply Mail Processing Centre 969 Eastern,"CFB Toronto, Downsview East","CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara","Cabbagetown, St. James Town",Caledonia-Fairbanks,Canada Post Gateway Processing Centre,Cedarbrae,Central Bay Street,"Chinatown, Grange Park, Kensington Market",Christie,Church and Wellesley,"Clairlea, Golden Mile, Oakridge","Clarks Corners, Sullivan, Tam O'Shanter","Cliffcrest, Cliffside, Scarborough Village West","Cloverdale, Islington, Martin Grove, Princess Gardens, West Deane Park","Commerce Court, Victoria Hotel",Davisville,Davisville North,"Deer Park, Forest Hill SE, Rathnelly, South Hill, Summerhill West","Del Ray, Keelesdale, Mount Dennis, Silverthorn","Design Exchange, Toronto Dominion Centre",Don Mills North,"Dorset Park, Scarborough Town Centre, Wexford Heights","Dovercourt Village, Dufferin",Downsview Central,Downsview Northwest,Downsview West,"Downsview, North Park, Upwood Park","East Birchmount Park, Ionview, Kennedy Park",East Toronto,"Emery, Humberlea","Fairview, Henry Farm, Oriole","First Canadian Place, Underground city","Flemingdon Park, Don Mills South","Forest Hill North, Forest Hill West",Glencairn,"Guildwood, Morningside, West Hill","Harbord, University of Toronto","Harbourfront East, Toronto Islands, Union Station","Harbourfront, Regent Park","High Park, The Junction South","Highland Creek, Rouge Hill, Port Union",Hillcrest Village,"Humber Bay Shores, Mimico 
South, New Toronto","Humber Bay, King's Mill Park, Kingsway Park South East, Mimico NE, Old Mill South, The Queensway East, Royal York South East, Sunnylea",Humber Summit,Humewood-Cedarvale,Islington Avenue,"Kingsview Village, Martin Grove Gardens, Richview Gardens, St. Phillips","Kingsway Park South West, Mimico NW, The Queensway West, Royal York South West, South of Bloor",L'Amoreaux West,"Lawrence Heights, Lawrence Manor",Lawrence Park,Leaside,"Little Portugal, Trinity","Maryvale, Wexford","Moore Park, Summerhill East","Newtonbrook, Willowdale",North Toronto West,Northwest,"Northwood Park, York University","Parkdale, Roncesvalles",Parkwoods,Queen's Park,Rosedale,Roselawn,"Rouge, Malvern","Runnymede, Swansea","Ryerson, Garden District",Scarborough Village,"Silver Hills, York Mills",St. James Town,Stn A PO Boxes 25 The Esplanade,Studio District,"The Annex, North Midtown, Yorkville",The Beaches,"The Beaches West, India Bazaar","The Danforth West, Riverdale","The Junction North, Runnymede","The Kingsway, Montgomery Road, Old Mill North",Thorncliffe Park,Upper Rouge,Victoria Village,Westmount,Weston,Willowdale South,Willowdale West,Woburn,"Woodbine Gardens, Parkview Hill",Woodbine Heights,York Mills West
0,Scarborough,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Scarborough,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Scarborough,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Scarborough,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4,Scarborough,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [119]:
# Average each indicator column within a borough, keeping 'Borough' as a regular
# column rather than as the index.
toronto_grouped = toronto_onehot.groupby('Borough', as_index=False).mean()
toronto_grouped.head()

Unnamed: 0,Borough,"Adelaide, King, Richmond",Agincourt,"Agincourt North, L'Amoreaux East, Milliken, Steeles East","Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown","Alderwood, Long Branch","Bathurst Manor, Downsview North, Wilson Heights",Bayview Village,"Bedford Park, Lawrence Manor East",Berczy Park,"Birch Cliff, Cliffside West","Bloordale Gardens, Eringate, Markland Wood, Old Burnhamthorpe","Brockton, Exhibition Place, Parkdale Village",Business Reply Mail Processing Centre 969 Eastern,"CFB Toronto, Downsview East","CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara","Cabbagetown, St. James Town",Caledonia-Fairbanks,Canada Post Gateway Processing Centre,Cedarbrae,Central Bay Street,"Chinatown, Grange Park, Kensington Market",Christie,Church and Wellesley,"Clairlea, Golden Mile, Oakridge","Clarks Corners, Sullivan, Tam O'Shanter","Cliffcrest, Cliffside, Scarborough Village West","Cloverdale, Islington, Martin Grove, Princess Gardens, West Deane Park","Commerce Court, Victoria Hotel",Davisville,Davisville North,"Deer Park, Forest Hill SE, Rathnelly, South Hill, Summerhill West","Del Ray, Keelesdale, Mount Dennis, Silverthorn","Design Exchange, Toronto Dominion Centre",Don Mills North,"Dorset Park, Scarborough Town Centre, Wexford Heights","Dovercourt Village, Dufferin",Downsview Central,Downsview Northwest,Downsview West,"Downsview, North Park, Upwood Park","East Birchmount Park, Ionview, Kennedy Park",East Toronto,"Emery, Humberlea","Fairview, Henry Farm, Oriole","First Canadian Place, Underground city","Flemingdon Park, Don Mills South","Forest Hill North, Forest Hill West",Glencairn,"Guildwood, Morningside, West Hill","Harbord, University of Toronto","Harbourfront East, Toronto Islands, Union Station","Harbourfront, Regent Park","High Park, The Junction South","Highland Creek, Rouge Hill, Port Union",Hillcrest Village,"Humber Bay Shores, Mimico 
South, New Toronto","Humber Bay, King's Mill Park, Kingsway Park South East, Mimico NE, Old Mill South, The Queensway East, Royal York South East, Sunnylea",Humber Summit,Humewood-Cedarvale,Islington Avenue,"Kingsview Village, Martin Grove Gardens, Richview Gardens, St. Phillips","Kingsway Park South West, Mimico NW, The Queensway West, Royal York South West, South of Bloor",L'Amoreaux West,"Lawrence Heights, Lawrence Manor",Lawrence Park,Leaside,"Little Portugal, Trinity","Maryvale, Wexford","Moore Park, Summerhill East","Newtonbrook, Willowdale",North Toronto West,Northwest,"Northwood Park, York University","Parkdale, Roncesvalles",Parkwoods,Queen's Park,Rosedale,Roselawn,"Rouge, Malvern","Runnymede, Swansea","Ryerson, Garden District",Scarborough Village,"Silver Hills, York Mills",St. James Town,Stn A PO Boxes 25 The Esplanade,Studio District,"The Annex, North Midtown, Yorkville",The Beaches,"The Beaches West, India Bazaar","The Danforth West, Riverdale","The Junction North, Runnymede","The Kingsway, Montgomery Road, Old Mill North",Thorncliffe Park,Upper Rouge,Victoria Village,Westmount,Weston,Willowdale South,Willowdale West,Woburn,"Woodbine Gardens, Parkview Hill",Woodbine Heights,York Mills West
0,Central Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.111111,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.111111,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Downtown Toronto,0.055556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055556,0.0,0.0,0.0,0.0,0.0,0.055556,0.055556,0.0,0.0,0.0,0.055556,0.055556,0.055556,0.055556,0.0,0.0,0.0,0.0,0.055556,0.0,0.0,0.0,0.0,0.055556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055556,0.0,0.0,0.0,0.0,0.055556,0.055556,0.055556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055556,0.0,0.0,0.0,0.055556,0.0,0.0,0.055556,0.055556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,East Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.2,0.2,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,East York,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.2,0.0
4,Etobicoke,0.0,0.0,0.0,0.083333,0.083333,0.0,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083333,0.083333,0.0,0.0,0.083333,0.083333,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [120]:
# Number of clusters to fit.
# NOTE(review): toronto_grouped has one row per borough, so a k this large may
# put nearly every borough in its own cluster. Confirm this is intended.
k = 11

# Feature matrix: every column except the 'Borough' label.
# `columns=` replaces the positional axis argument (`drop('Borough', 1)`), which
# pandas 2.0 no longer accepts. 'Cluster Labels' is also excluded when present,
# because a later cell inserts that column into toronto_grouped and re-running
# this cell would otherwise cluster on the previous run's labels.
toronto_grouped_clustering = (
    toronto_grouped
    .drop(columns='Borough')
    .drop(columns='Cluster Labels', errors='ignore')
)

# random_state fixes the centroid initialisation so the labels are reproducible.
kmeans = KMeans(n_clusters=k, random_state=42).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([ 7, 10,  6,  5,  8,  2,  0,  1,  9,  3])

In [121]:
# add clustering labels
toronto_grouped.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_grouped[["Borough","Cluster Labels"]]

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = df_2.join(toronto_merged.set_index('Borough'), on='Borough')

toronto_merged.head() # check the last columns!

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Cluster Labels
0,Scarborough,"Rouge, Malvern",43.806686,-79.194353,9
1,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,9
2,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,9
3,Scarborough,Woburn,43.770992,-79.216917,9
4,Scarborough,Cedarbrae,43.773136,-79.239476,9


In [122]:
# create map
map_clusters = folium.Map(location=[Latitude, Longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(k)
ys = [i + x + (i*x)**2 for i in range(k)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]
toronto_merged['Cluster Labels'].fillna(0, inplace=True)
# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
#     print(cluster)
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters