## Toronto Neighborhoods Project

The aim of this project is to study the neighborhoods of Toronto: based on the venues they contain, we wish classify them in clusters containing similar neighborhoods.

The list of the neighborhoods will be obtain from a wikipedia web page.

The coordinates of the neighborhoods will be assigned with the help of geopy geocoders. 

Then the data regarding venues will be obtained through Foursquare. 

Finally, we will use a machine learning method to cluster the neighborhoods.

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

In [2]:
!pip install geopy
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values
!pip install pgeocode
import pgeocode



In [3]:
!pip install folium 
import folium # map rendering library



### 1 - Scraping content from Wikipedia

In [4]:
html = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(html, 'html5lib')
table = soup.find('table')

In [5]:
data = []
rows = table.find_all('td')
for t in rows: 
        data.append(t.text.strip())

In [6]:
df = pd.DataFrame(columns = ['postalcode', 'borough', 'neighborhood'])

In [7]:
for i in data:
    if (i.find('Not assigned') == -1):
        p_code = i[0:3]
        if (i.find('(') == -1):
            borough = i[3:]
            neighborhood = borough
        else:
            borough = i[3:i.find('(')]
            neighborhood = i [i.find('(')+1:i.find(')')]
        df.loc[len(df.index)] = [p_code, borough, neighborhood]

In [8]:
#cleaning the borough values
for i, b in enumerate(df['borough']): 
    if (b.find('East York') != -1):
        df.at[i, 'borough'] = 'East York'
    elif (b.find('Mississauga') != -1): 
        df.at[i, 'borough'] = 'Mississauga'
    elif (b.find('East Toronto') != -1): 
        df.at[i, 'borough'] = 'East Toronto'
    elif (b.find('Downtown Toronto') != -1): 
        df.at[i, 'borough'] = 'Downtown Toronto'
    elif (b.find('Etobicoke') != -1): 
        df.at[i, 'borough'] = 'Etobicoke'

In [9]:
df.borough.unique()

array(['North York', 'Downtown Toronto', "Queen's Park", 'Etobicoke',
       'Scarborough', 'East York', 'York', 'East Toronto', 'West Toronto',
       'Central Toronto', 'Mississauga'], dtype=object)

In [10]:
df.shape

(103, 3)

In [11]:
df.tail()

Unnamed: 0,postalcode,borough,neighborhood
98,M8X,Etobicoke,The Kingsway / Montgomery Road / Old Mill North
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Enclave of M4L
101,M8Y,Etobicoke,Old Mill South / King's Mill Park / Sunnylea /...
102,M8Z,Etobicoke,Mimico NW / The Queensway West / South of Bloo...


### 2 - Getting the location of each neighborhood

In [12]:
#I didn't manage to retrieve the addresses (same method as the new york assignment)
#This code was found in the Coursera Forum
pgeocode.Nominatim('ca')
geolocator = pgeocode.Nominatim('ca')
pcodes = df['postalcode'].tolist()
latitudes = []
longitudes = []
for i, pcode in enumerate(pcodes):
    # initialize your variable to None
    #print(f'--Getting Postal Code: {postal_code}')
    g = geolocator.query_postal_code(pcode)
    
    if not g.empty:
        #print(f'Postal Code {pcode} has been retrieved. {len(pcodes) - (i + 1)} codes left')
        latitudes.append(g.latitude)
        longitudes.append(g.longitude)

In [13]:
df['latitude'] = latitudes
df['longitude'] = longitudes

In [14]:
df.head()

Unnamed: 0,postalcode,borough,neighborhood,latitude,longitude
0,M3A,North York,Parkwoods,43.7545,-79.33
1,M4A,North York,Victoria Village,43.7276,-79.3148
2,M5A,Downtown Toronto,Regent Park / Harbourfront,43.6555,-79.3626
3,M6A,North York,Lawrence Manor / Lawrence Heights,43.7223,-79.4504
4,M7A,Queen's Park,Ontario Provincial Government,43.6641,-79.3889


In [15]:
df = df.drop(df.index[df.latitude.isnull() == True].tolist(), axis=0)
df = df.drop(df.index[df.longitude.isnull() == True].tolist(), axis=0)
df = df.reset_index(drop=True)
df.tail()

Unnamed: 0,postalcode,borough,neighborhood,latitude,longitude
97,M8X,Etobicoke,The Kingsway / Montgomery Road / Old Mill North,43.6518,-79.5076
98,M4Y,Downtown Toronto,Church and Wellesley,43.6656,-79.383
99,M7Y,East Toronto,Enclave of M4L,43.7804,-79.2505
100,M8Y,Etobicoke,Old Mill South / King's Mill Park / Sunnylea /...,43.6325,-79.4939
101,M8Z,Etobicoke,Mimico NW / The Queensway West / South of Bloo...,43.6256,-79.5231


Display the map of Toronto and its Neighborhoods:

In [16]:
#Coordinates of the city of Toronto
toronto_lat = 43.6529
toronto_long = -79.3849
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[toronto_lat, toronto_long], zoom_start=12)
#map_toronto

In [17]:
# add markers to map
for lat, lng, borough, neighborhood in zip(df['latitude'], df['longitude'], df['borough'], df['neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
#map_toronto

### 3 - Studying the venues in each Neighborhoods

In [18]:
CLIENT_ID = 'EAHIB1LNGH2NX0MP2T1MFOC0M3BAVOSMUZO3K5NGPXICK0RD' # your Foursquare ID
CLIENT_SECRET = 'LASH0OOTLYYAYVUCC5Q4LWARC4PVDWHUL5KBI0I5JKSBNPUW' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: EAHIB1LNGH2NX0MP2T1MFOC0M3BAVOSMUZO3K5NGPXICK0RD
CLIENT_SECRET:LASH0OOTLYYAYVUCC5Q4LWARC4PVDWHUL5KBI0I5JKSBNPUW


In [19]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        #print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    print('retrieval completed')
    
    return(nearby_venues)

In [20]:
toronto_venues = getNearbyVenues(names=df['postalcode'],
                                   latitudes=df['latitude'],
                                   longitudes=df['longitude']
                                  )

retrieval completed


In [21]:
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M3A,43.7545,-79.33,Brookbanks Park,43.751976,-79.33214,Park
1,M3A,43.7545,-79.33,KFC,43.754387,-79.333021,Fast Food Restaurant
2,M3A,43.7545,-79.33,Variety Store,43.751974,-79.333114,Food & Drink Shop
3,M4A,43.7276,-79.3148,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
4,M4A,43.7276,-79.3148,Tim Hortons,43.725517,-79.313103,Coffee Shop


In [22]:
count = toronto_venues.groupby('Neighborhood').count()
count = count[['Venue Category']]
count.rename(columns = {'Venue Category' : 'Number of Venues'}, inplace=True)
print(count.shape)
count.head()

(98, 1)


Unnamed: 0_level_0,Number of Venues
Neighborhood,Unnamed: 1_level_1
M1B,1
M1E,31
M1G,1
M1H,1
M1J,3


In [23]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 260 uniques categories.


In [24]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
first_column = toronto_onehot.pop('Neighborhood')
toronto_onehot.insert(0, 'Neighborhood', first_column)
toronto_onehot.head()

toronto_onehot.head()

Unnamed: 0,Neighborhood,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,...,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M4A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M4A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
toronto_onehot.shape

(2172, 260)

In [26]:
#Average the venues for each neighborhood
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Neighborhood,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,...,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,M1B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M1E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M1G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M1H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M1J,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
toronto_grouped.shape

(98, 260)

### 4 - Clustering the Neighborhoods

In [28]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([2, 0, 4, 0, 1, 0, 0, 0, 0, 0])

In [29]:
# add clustering labels
count.insert(0, 'Cluster Labels', kmeans.labels_)
neigh_labeled = count
neigh_labeled.reset_index(inplace = True)
print(neigh_labeled.shape)
neigh_labeled.head()

(98, 3)


Unnamed: 0,Neighborhood,Cluster Labels,Number of Venues
0,M1B,2,1
1,M1E,0,31
2,M1G,4,1
3,M1H,0,1
4,M1J,1,3


In [30]:
# Merge the labels with the locations dataframe

toronto_merged = df.set_index('postalcode').join(neigh_labeled.set_index('Neighborhood'), how = 'inner')

toronto_merged.reset_index(inplace = True)


#Change cluster labels to integers
toronto_merged['Cluster Labels'] = toronto_merged['Cluster Labels'].astype(int)

print(toronto_merged.shape)
toronto_merged.head() 

(98, 7)


Unnamed: 0,index,borough,neighborhood,latitude,longitude,Cluster Labels,Number of Venues
0,M3A,North York,Parkwoods,43.7545,-79.33,1,3
1,M4A,North York,Victoria Village,43.7276,-79.3148,0,7
2,M5A,Downtown Toronto,Regent Park / Harbourfront,43.6555,-79.3626,0,23
3,M6A,North York,Lawrence Manor / Lawrence Heights,43.7223,-79.4504,0,71
4,M7A,Queen's Park,Ontario Provincial Government,43.6641,-79.3889,0,28


In [31]:
# create map
map_clusters = folium.Map(location=[toronto_lat, toronto_long], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['latitude'], toronto_merged['longitude'], toronto_merged['neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters