#  Segmenting and clustering neighbourhoods in Toronto
##   Scrape a table from Wikipedia: List of postal codes of Canada: M
Let's import some Libraries

In [2]:
from  bs4  import  BeautifulSoup
import  requests
import pandas as pd

Get the Html link

In [3]:
source=requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

Use 'LXML' parser in 'Beautiful Soup'

In [4]:
soup=BeautifulSoup(source,'lxml')

In [5]:
table=soup.findAll('table',{'class':'wikitable sortable'})

In [6]:
df=pd.read_html(str(table), header=0)[0]
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


After Importing the table, let's start Pre-Processing 

In [7]:
df=df[df.Borough!='Not assigned']

In [8]:
df.reset_index(drop=True,inplace=True)

In [9]:
df.head(3)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront


In [10]:
df=df.groupby(['Postcode','Borough'],as_index=False).agg(lambda x:','.join(set(x.dropna())))

In [11]:
df.loc[df.Neighbourhood=='Not assigned','Neighbourhood']=df.Borough

In [12]:
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern,Rouge"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Morningside,Guildwood,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Ionview,Kennedy Park,East Birchmount Park"
7,M1L,Scarborough,"Golden Mile,Clairlea,Oakridge"
8,M1M,Scarborough,"Cliffside,Scarborough Village West,Cliffcrest"
9,M1N,Scarborough,"Cliffside West,Birch Cliff"


Table after Pre-Processing

In [13]:
df.shape

(103, 3)

# Add coordinate to Neighbourhoods
## Load geospatial data from a csv file.

In [14]:
df_geo=pd.read_csv("https://cocl.us/Geospatial_data")

In [15]:
df_geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476



The column Postal Code in the csv file is the same as the column Postcode in the dataframe. So we change the name Postal Code into Postcode so that we can merge later.

In [16]:
df_geo.rename(columns={'Postal Code':'Postcode'},inplace=True)

In [17]:
df_geo.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [18]:
df_merge=pd.merge(df,df_geo)

In [19]:
df_merge.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern,Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Morningside,Guildwood,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [20]:
df_toronto=df_merge

#  Clustering Toronto by k-Mean
Let's import some libraries

In [21]:
import numpy as np # library to handle data in a vectorized manner
import json # library to handle JSON files

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

In [22]:
geolocator = Nominatim(user_agent="foursquare_agent")
location = geolocator.geocode('Toronto')
latitude = location.latitude
longitude = location.longitude
print('The  Geographical Coordinates of Toronto are:',  latitude, longitude)

The  Geographical Coordinates of Toronto are: 43.653963 -79.387207


In [23]:
map_toronto=folium.Map(location=[latitude,longitude],zoom_start=11)

for lat,lon,label  in  zip(df_toronto['Latitude'],df_toronto['Longitude'],df_toronto['Neighbourhood']):
    label=folium.Popup(label,parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)
    
map_toronto    

## Now, we use Foursquare to explore Toronto venues.

In [24]:
CLIENT_ID = 'R4YPEVJDPTPFAF2GSSU2WEZXDKY2CQYX5VAYYVQ12UVEJLVM' # your Foursquare ID
CLIENT_SECRET = 'YKP5NYWRG1RIYUUCLB3AWOG2XLC4DOJC3AMI220E33XYGHSM' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: R4YPEVJDPTPFAF2GSSU2WEZXDKY2CQYX5VAYYVQ12UVEJLVM
CLIENT_SECRET:YKP5NYWRG1RIYUUCLB3AWOG2XLC4DOJC3AMI220E33XYGHSM


In [25]:
neighborhood_latitude = df_merge.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = df_merge.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = df_merge.loc[0, 'Neighbourhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Malvern,Rouge are 43.806686299999996, -79.19435340000001.


In [26]:
def getNearbyVenues(names, latitudes, longitudes, radius=1500,LIMIT=100):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [27]:

toronto_venues =pd.DataFrame( getNearbyVenues(names=df_merge['Neighbourhood'],
                                   latitudes=df_merge['Latitude'],
                                   longitudes=df_merge['Longitude']
                                  ))

toronto_venues.head()

Malvern,Rouge
Highland Creek,Rouge Hill,Port Union
Morningside,Guildwood,West Hill
Woburn
Cedarbrae
Scarborough Village
Ionview,Kennedy Park,East Birchmount Park
Golden Mile,Clairlea,Oakridge
Cliffside,Scarborough Village West,Cliffcrest
Cliffside West,Birch Cliff
Scarborough Town Centre,Wexford Heights,Dorset Park
Wexford,Maryvale
Agincourt
Sullivan,Clarks Corners,Tam O'Shanter
Agincourt North,Steeles East,L'Amoreaux East,Milliken
L'Amoreaux West,Steeles West
Upper Rouge
Hillcrest Village
Fairview,Oriole,Henry Farm
Bayview Village
Silver Hills,York Mills
Willowdale,Newtonbrook
Willowdale South
York Mills West
Willowdale West
Parkwoods
Don Mills North
Don Mills South,Flemingdon Park
Downsview North,Wilson Heights,Bathurst Manor
Northwood Park,York University
CFB Toronto,Downsview East
Downsview West
Downsview Central
Downsview Northwest
Victoria Village
Woodbine Gardens,Parkview Hill
Woodbine Heights
The Beaches
Leaside
Thorncliffe Park
East Toronto
Riverdale,The Danforth West
The Beac

Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Malvern,Rouge",43.806686,-79.194353,Images Salon & Spa,43.802283,-79.198565,Spa
1,"Malvern,Rouge",43.806686,-79.194353,Canadiana exhibit,43.817962,-79.193374,Zoo Exhibit
2,"Malvern,Rouge",43.806686,-79.194353,Caribbean Wave,43.798558,-79.195777,Caribbean Restaurant
3,"Malvern,Rouge",43.806686,-79.194353,Wendy's,43.802008,-79.19808,Fast Food Restaurant
4,"Malvern,Rouge",43.806686,-79.194353,LCBO,43.796671,-79.204586,Liquor Store


In [28]:
toronto_venues.shape

(6780, 7)

In [29]:
toronto_venues.groupby('Neighbourhood').count()

Unnamed: 0_level_0,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide,Richmond,King",100,100,100,100,100,100
Agincourt,63,63,63,63,63,63
"Agincourt North,Steeles East,L'Amoreaux East,Milliken",72,72,72,72,72,72
"Alderwood,Long Branch",46,46,46,46,46,46
Bayview Village,13,13,13,13,13,13
"Bedford Park,Lawrence Manor East",74,74,74,74,74,74
Berczy Park,100,100,100,100,100,100
Business Reply Mail Processing Centre 969 Eastern,100,100,100,100,100,100
"CFB Toronto,Downsview East",27,27,27,27,27,27
Caledonia-Fairbanks,72,72,72,72,72,72


In [30]:
len(toronto_venues['Venue Category'].unique())

342

In [31]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighbourhood'] = toronto_venues['Neighbourhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Neighbourhood,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport,Airport Lounge,American Restaurant,Amphitheater,Animal Shelter,...,Vietnamese Restaurant,Volleyball Court,Warehouse Store,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
0,"Malvern,Rouge",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Malvern,Rouge",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,"Malvern,Rouge",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Malvern,Rouge",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Malvern,Rouge",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
toronto_onehot.shape

(6780, 343)

In [33]:
toronto_grouped = toronto_onehot.groupby('Neighbourhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighbourhood,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport,Airport Lounge,American Restaurant,Amphitheater,Animal Shelter,...,Vietnamese Restaurant,Volleyball Court,Warehouse Store,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
0,"Adelaide,Richmond,King",0.0,0.00,0.000000,0.000000,0.000000,0.0,0.030000,0.0,0.00,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0
1,Agincourt,0.0,0.00,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.00,...,0.015873,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0
2,"Agincourt North,Steeles East,L'Amoreaux East,M...",0.0,0.00,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.00,...,0.027778,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0
3,"Alderwood,Long Branch",0.0,0.00,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.00,...,0.000000,0.000000,0.000000,0.0,0.000000,0.021739,0.000000,0.000000,0.0,0.0
4,Bayview Village,0.0,0.00,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.00,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0
5,"Bedford Park,Lawrence Manor East",0.0,0.00,0.000000,0.000000,0.000000,0.0,0.013514,0.0,0.00,...,0.000000,0.000000,0.000000,0.0,0.000000,0.013514,0.000000,0.000000,0.0,0.0
6,Berczy Park,0.0,0.00,0.000000,0.000000,0.000000,0.0,0.020000,0.0,0.00,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0
7,Business Reply Mail Processing Centre 969 Eastern,0.0,0.00,0.000000,0.000000,0.000000,0.0,0.020000,0.0,0.00,...,0.020000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0
8,"CFB Toronto,Downsview East",0.0,0.00,0.000000,0.000000,0.037037,0.0,0.000000,0.0,0.00,...,0.037037,0.000000,0.037037,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0
9,Caledonia-Fairbanks,0.0,0.00,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.00,...,0.013889,0.000000,0.000000,0.0,0.000000,0.013889,0.000000,0.000000,0.0,0.0


In [34]:
toronto_grouped_clustering = toronto_grouped.drop('Neighbourhood', 1)

In [35]:
kclusters = 8

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([1, 6, 6, 0, 7, 4, 1, 4, 0, 4, 0, 0, 1, 1, 1, 2, 7, 1, 4, 4, 1, 0,
       0, 4, 7, 2, 7, 2, 1, 5, 1, 0, 4, 7, 5, 1, 4, 4, 4, 6, 4, 4, 7, 0,
       1, 0, 6, 0, 4, 0, 1, 7, 0, 2, 7, 5, 4, 5, 5, 4, 5, 2, 1, 5, 1, 4,
       4, 1, 7, 7, 5, 5, 4, 1, 1, 1, 4, 7, 4, 4, 5, 4, 4, 4, 5, 4, 0, 1,
       1, 3, 5, 0, 5, 7, 0, 1, 5, 0, 5, 7, 5, 5, 1], dtype=int32)

In [36]:

df_toronto.insert(0, 'Cluster Labels', kmeans.labels_)
df_toronto.head()

Unnamed: 0,Cluster Labels,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,1,M1B,Scarborough,"Malvern,Rouge",43.806686,-79.194353
1,6,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,6,M1E,Scarborough,"Morningside,Guildwood,West Hill",43.763573,-79.188711
3,0,M1G,Scarborough,Woburn,43.770992,-79.216917
4,7,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [37]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Neighbourhood'], df_toronto['Cluster Labels']):
    label = folium.Popup(str(poi) + ': Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters