# Explore the city of Toronto and its neighborhoods and cluster the neighborhoods in groups based on similarity

For this project we use venue data from the Foursquare platform.


In [755]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import geocoder
from sklearn.cluster import KMeans
import folium 
import matplotlib.cm as cm
import matplotlib.colors as colors
import FSquare

## 1. Web scraping to retrieve the Toronto neighborhoods table from Wikipedia
###### _Link to data: https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M_



In [148]:
# Download the Wikipedia page that lists the "M" postal codes and parse its HTML.
url = r"https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
r = requests.get(url, timeout=30)  # requests waits indefinitely unless a timeout is given
r.raise_for_status()  # fail here on an HTTP error instead of parsing an error page
soup = BeautifulSoup(r.text, 'html.parser')
# Alternative: pd.read_html(r.text)[0] finds and parses the table in one call.

#### 1.1 Find the table headers

In [149]:
# Locate the postal-code table once and read its column titles from the <th> cells.
table = soup.find('table', attrs={'class':'wikitable sortable'})
if table is None:
    # Without this guard the next line fails with an unhelpful AttributeError on None.
    raise ValueError("No 'wikitable sortable' table found on the page; its layout may have changed.")
headers = table.find_all('th')

# strip() removes surrounding whitespace such as a trailing newline and,
# unlike slicing off the last character, never removes a real character.
Headers = np.array([header.text.strip() for header in headers])


#### 1.2 Find the Neighborhoods

In [2]:
# Every <td> of the table in document order.  The slices used on `rows` treat
# the cells as repeating triples; the neighborhood is the third cell of each.
rows = table.find_all('td')
Neighborhoods = np.array([cell.text[:-1] for cell in rows[2::3]])


#### 1.3 Find the Boroughs

In [151]:
# The borough is the second cell of each triple (positions 1, 4, 7, ...).
Boroughs = np.array([cell.text[:-1] for cell in rows[1::3]])

#### 1.4 Find the post codes

In [152]:
# The postal code is the first cell of each triple (positions 0, 3, 6, ...).
Post_codes = np.array([cell.text[:-1] for cell in rows[::3]])

## 2. Create dataframe from retrieved data

In [206]:
# Combine the three scraped arrays into one table, one row per postal code.
Toronto_init = pd.DataFrame(
    dict(PostalCode=Post_codes, Borough=Boroughs, Neighborhood=Neighborhoods)
)

In [207]:
# Preview the first five rows (head() defaults to 5).
Toronto_init.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


#### 2.1 Delete the rows with not assigned boroughs

In [215]:
# Flag the postal codes whose borough is the placeholder text, remove those
# rows in place, then renumber the index from 0.
unassigned_mask = Toronto_init['Borough'].eq('Not assigned')
Toronto_init.drop(index=Toronto_init.index[unassigned_mask], inplace=True)
Toronto_init.reset_index(drop=True, inplace=True)

Toronto_init.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


#### 2.2 Check for neighborhoods with 'not assigned' inputs
###### Apparently there are none after we cleaned the data in 2.1

In [209]:
# Rows whose neighborhood is still the placeholder text.
Toronto_init.loc[Toronto_init['Neighborhood'].eq('Not assigned')]

Unnamed: 0,PostalCode,Borough,Neighborhood


In [239]:
# (rows, columns) of the table after the unassigned boroughs were removed.
Toronto_init.shape

(103, 3)

## 3. Find geo co-ordinates for each post code

###### We will do this with the provided CSV file https://cocl.us/Geospatial_data

In [265]:
# Postal code -> latitude/longitude table.  Read the local download when it is
# present; otherwise fetch the published copy so the cell also runs on other machines.
GEO_CSV_LOCAL = "C:\\Users\\con74781\\Downloads\\Geospatial_Coordinates.csv"
GEO_CSV_URL = "https://cocl.us/Geospatial_data"
try:
    Cods = pd.read_csv(GEO_CSV_LOCAL)
except FileNotFoundError:
    Cods = pd.read_csv(GEO_CSV_URL)

# Give the first column the same name as Toronto_init's key column so the two
# tables can be merged on it.  rename() is used because assigning into
# `columns.values` edits the Index's underlying buffer in place, which pandas
# does not support.
Cods = Cods.rename(columns={Cods.columns[0]: Toronto_init.columns[0]})
Cods.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### 3.1 Update the DataFrame with co-ordinates

In [271]:
# Attach coordinates to each postal code; the inner join keeps only codes found in both tables.
Toronto_cods = Toronto_init.merge(Cods, how='inner', on='PostalCode')
Toronto_cods.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


## 4. Cluster the neighborhoods in Toronto

##### _Clustering of the neighborhoods will be based on the categories of venues in the neighborhood._
##### _For this purpose we will use the Foursquare API._

In [756]:
# Settings for the Foursquare API requests.
# The credentials are read from the imported FSquare module rather than written inline.
CLIENT_ID, CLIENT_SECRET = FSquare.CLIENT_ID, FSquare.CLIENT_SECRET
VERSION = "20201128"  # sent as the API "v" parameter
LIMIT = 100           # sent as the API "limit" parameter
radius = 500          # search radius, 500 m

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare "explore" endpoint around each location and tabulate the venues.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Parallel sequences walked together with zip(); surplus entries in a
        longer sequence are ignored.
    radius : int, default 500
        Sent as the ``radius`` query parameter (500 m by default).

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.  The frame
        is empty, but still has these columns, when no venue was found.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.

    Notes
    -----
    CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT are read from module scope.
    Venues returned without any category are skipped, since the category is
    the only venue attribute used downstream.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    records = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Let requests build and escape the query string.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        # A timeout keeps one stalled request from hanging the whole loop;
        # raise_for_status() reports API errors at their source.
        response = requests.get(endpoint, params=params, timeout=30)
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for item in results:
            venue = item['venue']
            categories = venue.get('categories') or []
            if not categories:
                continue  # indexing [0] on an empty list would raise IndexError
            records.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name']))

    # Passing the column names here also works when `records` is empty.
    return pd.DataFrame(records, columns=columns)

#### 4.1 Create dataframe with the venues in each neighborhood

In [400]:
# Fetch the venues around every neighborhood, using the function's default radius.
Venues_by_neighborhood = getNearbyVenues(
    names=Toronto_cods["Neighborhood"],
    latitudes=Toronto_cods["Latitude"],
    longitudes=Toronto_cods["Longitude"],
)

In [638]:
# Preview the first five venue rows.
Venues_by_neighborhood.head()


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
3,Victoria Village,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop
4,Victoria Village,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant


#### 4.2 Summarise the number of each category for each Neighborhood

In [489]:
# One indicator column per venue category, one row per venue.
Venues_onehot = pd.get_dummies(Venues_by_neighborhood['Venue Category'], prefix="", prefix_sep="")
# A venue category may itself be called "Neighborhood" and would clash with the
# real neighborhood column inserted below.  errors="ignore" keeps the cell
# working when the API returned no such category (a plain drop raises KeyError).
Venues_onehot = Venues_onehot.drop(columns="Neighborhood", errors="ignore")
Venues_onehot.insert(0, 'Neighborhood', Venues_by_neighborhood["Neighborhood"])
Venues_onehot.head(5)

Unnamed: 0,Neighborhood,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Train Station,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### 4.3 Create a look up table

In [639]:
# Averaging the indicator columns per neighborhood gives the share of that
# neighborhood's venues falling in each category; shares are rounded to 2 decimals.
Venues_lookup = Venues_onehot.groupby("Neighborhood").mean().round(2)

## 5. Cluster the neighborhoods

In [737]:
kclusters = 10

# run k-means clustering on the category shares only
# - "Label" is excluded in case an earlier run already added it to
#   Venues_lookup; otherwise a re-run would cluster on the old labels as well.
# - n_init is set explicitly because its default differs between scikit-learn
#   releases, which would change the labels for the same random_state.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=42).fit(
    Venues_lookup.drop(columns="Label", errors="ignore").values
)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 2,
       2, 2, 6, 2, 0, 2, 2, 2, 6, 2, 6, 6, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2,
       6, 2, 6, 2, 2, 1, 5, 2, 5, 2, 5, 2, 2, 2, 3, 2, 2, 0, 2, 2, 2, 5,
       2, 7, 2, 6, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 2, 8, 2, 2, 2, 2, 9, 2,
       0, 2, 2, 2, 2, 6, 0, 4])

In [739]:
# Store the cluster label as the first column.  A "Label" column left by an
# earlier run is dropped first, because insert() raises ValueError when the
# column already exists.
if "Label" in Venues_lookup.columns:
    Venues_lookup = Venues_lookup.drop(columns="Label")
Venues_lookup.insert(0, column = "Label", value = kmeans.labels_)

In [740]:
# Bring back postal code, borough and coordinates for every clustered neighborhood.
Clustered_neighborhoods = Venues_lookup.reset_index().merge(
    Toronto_cods, how='inner', on='Neighborhood'
)

In [741]:
# New order: the first two columns (Neighborhood, Label), then the last four
# (the ones merged in from Toronto_cods), then the category columns in between.
columns = Clustered_neighborhoods.columns.tolist()
columns = columns[:2] + columns[-4:] + columns[2:-4]

In [742]:
# Apply the column order built in `columns`.
Clustered_neighborhoods = Clustered_neighborhoods.loc[:, columns]

In [743]:
# Preview the first five clustered neighborhoods.
Clustered_neighborhoods.head()

Unnamed: 0,Neighborhood,Label,PostalCode,Borough,Latitude,Longitude,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,...,Train Station,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Agincourt,2,M1S,Scarborough,43.7942,-79.262029,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Alderwood, Long Branch",2,M8W,Etobicoke,43.602414,-79.543484,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bathurst Manor, Wilson Heights, Downsview North",2,M3H,North York,43.754328,-79.442259,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Bayview Village,2,M2K,North York,43.786947,-79.385975,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Bedford Park, Lawrence Manor East",2,M5M,North York,43.733283,-79.41975,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 6. Visualise the clustered neighborhoods on the map

In [745]:
# Base map centred on the fixed coordinates below.
Toronto_map = folium.Map(location=[43.6532, -79.3832], zoom_start=11)

# One colour per cluster.  The palette is sized from kclusters so that every
# label still has a colour when the number of clusters is changed.
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Draw one circle per neighborhood, coloured by cluster, with a popup naming both.
for lat, lon, neigh, cluster in zip(Clustered_neighborhoods['Latitude'], Clustered_neighborhoods['Longitude'], Clustered_neighborhoods['Neighborhood'], Clustered_neighborhoods['Label']):
    label = folium.Popup(str(neigh) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(Toronto_map)

Toronto_map
