Segmenting and Clustering Neighbourhoods in Toronto: Part 1

In [113]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import os
from sklearn.cluster import KMeans
import folium 
from geopy.geocoders import Nominatim 
import matplotlib.cm as cm
import matplotlib.colors as colors
print('Imported required Libs')

Imported required Libs


In [6]:
# Scrape the Wikipedia page and locate the postal-code table using BeautifulSoup
Table_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
response = requests.get(Table_url, timeout=30)
response.raise_for_status()  # fail loudly instead of parsing an HTTP error page
Data_Source = response.text
# The page is HTML, so parse it with an HTML parser rather than the 'xml' one
soup = BeautifulSoup(Data_Source, 'html.parser')
table = soup.find('table')
if table is None:
    raise ValueError('No <table> element found at {}'.format(Table_url))

In [9]:
# Start from an empty frame that carries the three target columns:
# PostalCode, Borough and Neighborhood
Column_names = [
    'PostalCode',
    'Borough',
    'Neighborhood',
]
df1 = pd.DataFrame(columns=Column_names)
df1.head()

Unnamed: 0,PostalCode,Borough,Neighborhood


In [14]:
# Collect every table row that has exactly three <td> cells, then build the
# frame in one step. Rebuilding df1 from scratch (instead of appending with
# df1.loc[len(df1)]) keeps this cell idempotent: re-running it no longer
# duplicates rows, and it avoids growing the frame one row at a time.
scraped_rows = []
for tr_cell in table.find_all('tr'):
    row_data = [td_cell.text.strip() for td_cell in tr_cell.find_all('td')]
    if len(row_data) == 3:
        scraped_rows.append(row_data)

df1 = pd.DataFrame(scraped_rows, columns=df1.columns)

In [15]:
# Preview the first rows of the scraped table
df1.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


In [18]:
# Keep only the rows whose Borough is something other than 'Not assigned'
has_borough = df1['Borough'] != 'Not assigned'
df2 = df1[has_borough]
df2.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


In [24]:
# Collapse the neighbourhoods that share a postal code into a single
# comma-separated string, held in a temporary frame keyed by PostalCode
temp_df2 = (
    df2.groupby('PostalCode')['Neighborhood']
    .apply(lambda names: ','.join(names))
    .reset_index(drop=False)
    .rename(columns={'Neighborhood': 'Joined_Neighborhood'})
)
temp_df2.head()

Unnamed: 0,PostalCode,Joined_Neighborhood
0,M1B,Malvern / Rouge
1,M1C,Rouge Hill / Port Union / Highland Creek
2,M1E,Guildwood / Morningside / West Hill
3,M1G,Woburn
4,M1H,Cedarbrae


In [27]:
# Attach the joined neighbourhood strings to the borough rows, then discard
# the original per-row Neighborhood column
df_merged = (
    df2.merge(temp_df2, on='PostalCode')
    .drop(columns=['Neighborhood'])
)
df_merged.head()

Unnamed: 0,PostalCode,Borough,Joined_Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


In [33]:
# Restore the Neighborhood column name and remove fully duplicated rows
df_merged = (
    df_merged
    .rename(columns={'Joined_Neighborhood': 'Neighborhood'})
    .drop_duplicates()
)
df_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


In [35]:
# Show (rows, columns) of the cleaned postal-code table
df_merged.shape

(103, 3)

# Part 2:

In [36]:
# Import latitude and longitude data for each location

In [44]:
# The geocoder lookup was not working, therefore we will take the data from the .csv file instead

In [45]:
def get_geocode(postal_code, max_attempts=10):
    """Look up the coordinates of a Toronto postal code with geocoder.

    Parameters
    ----------
    postal_code : str
        Postal code to look up; it is queried as
        '<postal_code>, Toronto, Ontario'.
    max_attempts : int, optional
        Maximum number of lookups before giving up. The original
        implementation retried forever, which hangs the kernel when the
        service keeps returning no result.

    Returns
    -------
    tuple
        (latitude, longitude) taken from the first two entries of the
        result's ``latlng`` attribute.

    Raises
    ------
    RuntimeError
        If no coordinates were obtained after ``max_attempts`` lookups.
    """
    # Imported locally: the notebook's import cell never imports geocoder, so
    # the bare name raised NameError on the first call.
    import geocoder

    query = '{}, Toronto, Ontario'.format(postal_code)
    for _ in range(max_attempts):
        lat_lng_coords = geocoder.google(query).latlng
        if lat_lng_coords is not None:
            latitude = lat_lng_coords[0]
            longitude = lat_lng_coords[1]
            return latitude, longitude

    raise RuntimeError(
        'No coordinates returned for {!r} after {} attempts'.format(query, max_attempts)
    )

In [50]:
# Read the postal-code coordinates from the hosted CSV file
geo_csv_url = 'https://cocl.us/Geospatial_data'
geo_df = pd.read_csv(geo_csv_url)
geo_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [53]:
# Prepare for the merge: the key column must carry the same name in both
# frames, so 'Postal Code' in geo_df becomes 'PostalCode'
geo_df = geo_df.rename(columns={'Postal Code': 'PostalCode'})
geo_df.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [54]:
# Combine coordinates with borough/neighbourhood data on the shared PostalCode key
geo_df2_merg = geo_df.merge(df_merged, on='PostalCode')

In [55]:
# Preview the first rows of the merged coordinates/neighbourhood frame
geo_df2_merg.head()

Unnamed: 0,PostalCode,Latitude,Longitude,Borough,Neighborhood
0,M1B,43.806686,-79.194353,Scarborough,Malvern / Rouge
1,M1C,43.784535,-79.160497,Scarborough,Rouge Hill / Port Union / Highland Creek
2,M1E,43.763573,-79.188711,Scarborough,Guildwood / Morningside / West Hill
3,M1G,43.770992,-79.216917,Scarborough,Woburn
4,M1H,43.773136,-79.239476,Scarborough,Cedarbrae


In [60]:
# Put the columns in the order the target dataframe requires
required_order = ['PostalCode', 'Borough', 'Neighborhood', 'Latitude', 'Longitude']
geo_df = geo_df2_merg[required_order]
geo_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,Malvern / Rouge,43.806686,-79.194353
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,43.784535,-79.160497
2,M1E,Scarborough,Guildwood / Morningside / West Hill,43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [61]:
# Neighborhoods in Toronto

In [65]:
# Keep the rows whose Borough name contains "Toronto"
in_toronto = geo_df['Borough'].str.contains("Toronto")
toronto_df = geo_df[in_toronto]
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,The Danforth West / Riverdale,43.679557,-79.352188
42,M4L,East Toronto,India Bazaar / The Beaches West,43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [None]:
#Create a dataframe of the Neighborhoods
# The original loop iterated over toronto_df directly (which yields column
# labels, not rows) and referenced an undefined name, so it could never run.
# NOTE(review): the intended result is inferred from col_names - confirm.
col_names = ['Borough', 'Neighborhoods', 'Latitude', 'Longitude']
neighborhoods = (
    toronto_df
    .rename(columns={'Neighborhood': 'Neighborhoods'})[col_names]
    .reset_index(drop=True)
)
neighborhoods.head()


Let's look at our neighborhoods on a map

In [79]:
# Look up the coordinates of Toronto, Canada through Nominatim
address = "Toronto, Canada"

geolocator = Nominatim(user_agent="Tor_explor")
location = geolocator.geocode(address)
latitude, longitude = location.latitude, location.longitude
print('The Coordinates of Toronto are {}, {}'.format(latitude, longitude))

The Coordinates of Toronto are 43.6534817, -79.3839347


In [89]:
#Create the Folium map and add one marker per Toronto neighborhood
Canada_map1 = folium.Map(location=[latitude, longitude], zoom_start=10)

# The loop uses its own names (lat, lng) so that the module-level
# latitude/longitude of the city centre are not overwritten by the last
# marker's position.
for lat, lng, borough, neighborhood in zip(
        toronto_df['Latitude'],
        toronto_df['Longitude'],
        toronto_df['Borough'],
        toronto_df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    # parse_html belongs to Popup only; it is no longer passed to CircleMarker.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(Canada_map1)

Canada_map1

In [None]:
# Time to analyse the venues around each neighborhood

In [66]:
# Initiate API
# Credentials are read from the environment so that secrets are never stored
# in (or leaked through) the notebook. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
VERSION = '20200421'  # API version date sent as the `v` query parameter

if not (CLIENT_ID and CLIENT_SECRET):
    print('WARNING: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
          'Foursquare requests will be rejected.')

In [68]:
#Generate a list of all Venues
def getNearbyVenues(names, latitudes, longitudes, radius=500, limit=100, timeout=30):
    """Fetch the venues around each location from the Foursquare explore API.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Label and coordinates of each location to explore; they are consumed
        in parallel with ``zip``.
    radius : int, optional
        Value sent as the ``radius`` query parameter (previously hard-coded
        to 500).
    limit : int, optional
        Value sent as the ``limit`` query parameter (previously hard-coded
        to 100).
    timeout : float, optional
        Seconds to wait for each HTTP response before ``requests`` gives up.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame is
        empty, but keeps these columns, when no venue is found.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status (for example when the
        credentials are rejected).
    """
    venue_columns = ['Neighborhood',
                     'Neighborhood Latitude',
                     'Neighborhood Longitude',
                     'Venue',
                     'Venue Latitude',
                     'Venue Longitude',
                     'Venue Category']
    explore_url = 'https://api.foursquare.com/v2/venues/explore'

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)  # progress indicator: one line per location

        # Let requests build and escape the query string
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': limit,
        }
        response = requests.get(explore_url, params=params, timeout=timeout)
        response.raise_for_status()

        # A response without groups is treated as "no venues" instead of crashing
        groups = response.json().get('response', {}).get('groups', [])
        items = groups[0].get('items', []) if groups else []

        for item in items:
            venue = item['venue']
            # Some venues carry no category; record None rather than raising IndexError
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # Passing columns= keeps the schema even when venue_rows is empty
    return pd.DataFrame(venue_rows, columns=venue_columns)

In [70]:
# Query the venues around every Toronto neighborhood
toronto_venues = getNearbyVenues(
    names=toronto_df['Neighborhood'],
    latitudes=toronto_df['Latitude'],
    longitudes=toronto_df['Longitude'],
)

The Beaches
The Danforth West / Riverdale
India Bazaar / The Beaches West
Studio District
Lawrence Park
Davisville North
North Toronto West
Davisville
Moore Park / Summerhill East
Summerhill West / Rathnelly / South Hill / Forest Hill SE / Deer Park
Rosedale
St. James Town / Cabbagetown
Church and Wellesley
Regent Park / Harbourfront
Garden District, Ryerson
St. James Town
Berczy Park
Central Bay Street
Richmond / Adelaide / King
Harbourfront East / Union Station / Toronto Islands
Toronto Dominion Centre / Design Exchange
Commerce Court / Victoria Hotel
Roselawn
Forest Hill North & West
The Annex / North Midtown / Yorkville
University of Toronto / Harbord
Kensington Market / Chinatown / Grange Park
CN Tower / King and Spadina / Railway Lands / Harbourfront West / Bathurst Quay / South Niagara / Island airport
Stn A PO Boxes
First Canadian Place / Underground city
Christie
Dufferin / Dovercourt Village
Little Portugal / Trinity
Brockton / Parkdale Village / Exhibition Place
High Park / 

In [71]:
# Preview the first rows of the venue table
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
4,The Danforth West / Riverdale,43.679557,-79.352188,Pantheon,43.677621,-79.351434,Greek Restaurant


In [91]:
# Count the non-null entries per column for each neighborhood. This cell only
# inspects the data; the conversion for clustering (one-hot encoding and
# k-means) happens in later cells.
toronto_venues.groupby('Neighborhood').count()


Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,57,57,57,57,57,57
Brockton / Parkdale Village / Exhibition Place,23,23,23,23,23,23
Business reply mail Processing CentrE,17,17,17,17,17,17
CN Tower / King and Spadina / Railway Lands / Harbourfront West / Bathurst Quay / South Niagara / Island airport,16,16,16,16,16,16
Central Bay Street,65,65,65,65,65,65
Christie,18,18,18,18,18,18
Church and Wellesley,72,72,72,72,72,72
Commerce Court / Victoria Hotel,100,100,100,100,100,100
Davisville,34,34,34,34,34,34
Davisville North,7,7,7,7,7,7


In [92]:

# One-hot encode the venue categories (one indicator column per category)
tor_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
# A venue category that is itself called 'Neighborhood' would clash with the
# column inserted below. Drop it when present; errors='ignore' avoids the
# KeyError the unconditional drop raised when no such category exists.
tor_onehot = tor_onehot.drop(columns=['Neighborhood'], errors='ignore')
tor_onehot.insert(loc=0, column='Neighborhood', value=toronto_venues['Neighborhood'])
tor_onehot.shape

(1620, 230)

In [95]:
# Average the category indicators within each neighborhood, keeping
# Neighborhood as a regular column
tor_grouped = tor_onehot.groupby('Neighborhood', as_index=False).mean()
tor_grouped.head()

Unnamed: 0,Neighborhood,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Women's Store,Yoga Studio
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.017544,0.0,0.0,0.0,0.0,0.0
1,Brockton / Parkdale Village / Exhibition Place,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Business reply mail Processing CentrE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,CN Tower / King and Spadina / Railway Lands / ...,0.0625,0.0625,0.0625,0.125,0.125,0.125,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.015385,0.0,0.0,0.0,0.0,0.015385


In [96]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of ``row``.

    The first entry of ``row`` is skipped, the remaining entries are sorted in
    descending order, and the index labels of the first ``num_top_venues`` of
    them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    labels = ranked.index.values
    return labels[:num_top_venues]

In [116]:
#Take 25 top venues to analyse
num_top_venues = 25


def _ordinal(position):
    """Return ``position`` with its English ordinal suffix (1st, 2nd, 3rd, 4th, 11th, 21st, ...)."""
    if 10 <= position % 100 <= 20:
        suffix = 'th'  # 11th, 12th, 13th are irregular
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(position % 10, 'th')
    return '{}{}'.format(position, suffix)


# Column headers: 'Neighborhood' followed by one column per rank. The previous
# index-lookup-with-bare-except produced '21th', '22th' and '23th'.
columns = ['Neighborhood'] + [
    '{} Most Common Venue'.format(_ordinal(rank))
    for rank in range(1, num_top_venues + 1)
]

# Build every row first and create the frame once, instead of assigning into
# an empty frame row by row.
top_venue_rows = [
    [row['Neighborhood']] + list(return_most_common_venues(row, num_top_venues))
    for _, row in tor_grouped.iterrows()
]
neighborhoods_venues_sorted = pd.DataFrame(top_venue_rows, columns=columns)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,...,16th Most Common Venue,17th Most Common Venue,18th Most Common Venue,19th Most Common Venue,20th Most Common Venue,21th Most Common Venue,22th Most Common Venue,23th Most Common Venue,24th Most Common Venue,25th Most Common Venue
0,Berczy Park,Coffee Shop,Cocktail Bar,Seafood Restaurant,Farmers Market,Bakery,Italian Restaurant,Cheese Shop,Café,Restaurant,...,Museum,Comfort Food Restaurant,Greek Restaurant,Mexican Restaurant,Shopping Mall,Liquor Store,Diner,Juice Bar,Jazz Club,Japanese Restaurant
1,Brockton / Parkdale Village / Exhibition Place,Café,Nightclub,Breakfast Spot,Coffee Shop,Stadium,Bar,Bakery,Intersection,Italian Restaurant,...,Grocery Store,Burrito Place,Convenience Store,Distribution Center,Dog Run,Discount Store,Doner Restaurant,Diner,Dance Studio,Dessert Shop
2,Business reply mail Processing CentrE,Park,Auto Workshop,Comic Shop,Pizza Place,Butcher,Recording Studio,Restaurant,Burrito Place,Light Rail Station,...,Gym / Fitness Center,Garden Center,Distribution Center,Gaming Cafe,Garden,Diner,Dessert Shop,Gas Station,Department Store,Deli / Bodega
3,CN Tower / King and Spadina / Railway Lands / ...,Airport Lounge,Airport Service,Airport Terminal,Airport,Boat or Ferry,Rental Car Location,Sculpture Garden,Plane,Coffee Shop,...,Dumpling Restaurant,Donut Shop,Electronics Store,Doner Restaurant,Dog Run,Distribution Center,Discount Store,Cupcake Shop,Dessert Shop,Department Store
4,Central Bay Street,Coffee Shop,Café,Italian Restaurant,Sandwich Place,Japanese Restaurant,Burger Joint,Middle Eastern Restaurant,Ice Cream Shop,Fried Chicken Joint,...,Discount Store,Ramen Restaurant,Falafel Restaurant,Diner,Business Service,Korean Restaurant,Department Store,Park,Office,Modern European Restaurant


In [117]:
# Cluster the Neighborhood for analysis

In [118]:
kclusters = 6  # number of clusters requested from k-means

# Only the numeric category frequencies are clustered. `columns=` replaces the
# positional axis argument (`drop('Neighborhood', 1)`), which pandas 2.x rejects.
tor_grouped_clustering = tor_grouped.drop(columns='Neighborhood')

# run k-means clustering; n_init is pinned so the result does not depend on
# the scikit-learn version's default, and random_state keeps it reproducible
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(tor_grouped_clustering)

kmeans.labels_[0:10]

In [119]:
# add clustering labels and merge with toronto_df to add latitude/longitude for each neighborhood
# Work on a copy without any earlier 'Cluster Labels' column so this cell can
# be re-run; a bare insert() raises ValueError once the column exists.
venues_with_clusters = neighborhoods_venues_sorted.drop(columns='Cluster Labels', errors='ignore')
venues_with_clusters.insert(0, 'Cluster Labels', kmeans.labels_)
neighborhoods_venues_sorted = venues_with_clusters

# join() leaves toronto_df untouched; neighborhoods with no venue rows end up
# with NaN in the cluster and venue columns.
tor_merged = toronto_df.join(
    neighborhoods_venues_sorted.set_index('Neighborhood'),
    on='Neighborhood',
)

tor_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,...,16th Most Common Venue,17th Most Common Venue,18th Most Common Venue,19th Most Common Venue,20th Most Common Venue,21th Most Common Venue,22th Most Common Venue,23th Most Common Venue,24th Most Common Venue,25th Most Common Venue
37,M4E,East Toronto,The Beaches,43.676357,-79.293031,5,Trail,Health Food Store,Pub,Yoga Studio,...,Deli / Bodega,Cuban Restaurant,Cupcake Shop,Ethiopian Restaurant,Creperie,Coworking Space,Costume Shop,Cosmetics Shop,Convenience Store,Concert Hall
41,M4K,East Toronto,The Danforth West / Riverdale,43.679557,-79.352188,0,Greek Restaurant,Coffee Shop,Italian Restaurant,Furniture / Home Store,...,Fruit & Vegetable Store,Spa,Diner,Dessert Shop,Cosmetics Shop,Caribbean Restaurant,Café,Bubble Tea Shop,Brewery,Bakery
42,M4L,East Toronto,India Bazaar / The Beaches West,43.668999,-79.315572,0,Sandwich Place,Park,Fast Food Restaurant,Food & Drink Shop,...,Pet Store,Gym,Movie Theater,Coworking Space,Department Store,Discount Store,Diner,Colombian Restaurant,Comfort Food Restaurant,Comic Shop
43,M4M,East Toronto,Studio District,43.659526,-79.340923,0,Café,Coffee Shop,Gastropub,Bakery,...,Coworking Space,Middle Eastern Restaurant,Diner,Latin American Restaurant,Italian Restaurant,Ice Cream Shop,Fish Market,Gym / Fitness Center,Seafood Restaurant,Yoga Studio
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,3,Park,Bus Line,Swim School,Ethiopian Restaurant,...,Yoga Studio,Dance Studio,Cupcake Shop,Cuban Restaurant,Creperie,Coworking Space,Costume Shop,Cosmetics Shop,Convenience Store,Concert Hall


In [120]:
# Visualise the clusters on a Folium map

In [121]:
# Look up the coordinates used to centre the cluster map
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="Tor_explorer")
location = geolocator.geocode(address)
if location is None:
    # Stop here with a clear message instead of an AttributeError below
    raise ValueError('Nominatim returned no result for {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
# The message previously named Manhattan although the address is Toronto
print('The geographical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Manhattan are 43.6534817, -79.3839347.


In [141]:
# Build the base Folium map centred on the coordinates looked up above
map_center = [latitude, longitude]
Canada_map = folium.Map(location=map_center, zoom_start=11)
Canada_map

In [142]:
# Add the clusters to the map
# One distinct colour per cluster label (0 .. kclusters-1)
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# Loop names are distinct from the module-level latitude/longitude so the map
# centre is not overwritten while iterating.
for lat, lng, poi, cluster in zip(
        tor_merged['Latitude'],
        tor_merged['Longitude'],
        tor_merged['Neighborhood'],
        tor_merged['Cluster Labels']):
    if pd.isna(cluster):
        # No cluster label for this row (no match in the join); skip it
        continue
    cluster = int(cluster)  # the column holds floats when NaN values are present
    # The popup shows the neighborhood and its own cluster, and the marker is
    # coloured by that cluster. Previously every marker used the same
    # colour and a label built from str() and kclusters.
    label = folium.Popup('{} Cluster {}'.format(poi, cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(Canada_map)
Canada_map