# Segmenting and Clustering Neighborhoods in Toronto

### import necessary module

In [1]:
import pandas as pd
import geocoder
import folium
from geopy.geocoders import Nominatim
import requests
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans

### scrape chart from website

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
tb = pd.read_html(url)[0]
tb = tb[tb.Borough != 'Not assigned']
tb['Neighborhood'] = list(map(', '.join,[l.split('/') for l in tb.Neighborhood])) # change ‘/’ to ‘,’
tb.head()

Unnamed: 0,Postal code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park , Harbourfront"
5,M6A,North York,"Lawrence Manor , Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"


**How many boroughs and neighboehoods in Toronto?**

In [3]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(tb['Borough'].unique()),
        tb.shape[0]
    )
)

The dataframe has 10 boroughs and 103 neighborhoods.


**Since geocoder is unstable on my computer in China, let's get the coordinates from the csv file**

In [4]:
cor = pd.read_csv('Geospatial_Coordinates.csv')
tb['Latitude'] = list([float(cor[cor.iloc[:,0] == i]['Latitude']) for i in tb.iloc[:,0]])
tb['Longitude'] = list([float(cor[cor.iloc[:,0] == i]['Longitude']) for i in tb.iloc[:,0]])
tb.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
2,M3A,North York,Parkwoods,43.753259,-79.329656
3,M4A,North York,Victoria Village,43.725882,-79.315572
4,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636
5,M6A,North York,"Lawrence Manor , Lawrence Heights",43.718518,-79.464763
6,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.662301,-79.389494


**get the coordinates of Toronto**

In [4]:
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

In [5]:
address = 'Toronto'

geolocator = Nominatim(user_agent="foursquare_agent")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


**Let's let's visualizat Toronto neighborhoods in it..**

In [6]:
map_toronto = folium.Map(location = [latitude,longitude],zoom_start = 10)

for bor, neigh,la,lo in zip(tb['Borough'],tb['Neighborhood'],tb['Latitude'],tb['Longitude']):
    label = '{}, {}'.format(la,lo)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [la, lo],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto) 

map_toronto

**to simpify the question and save the work, we will only analyse those boroughs whose name contain 'Toronto'**

In [13]:
index = (tb['Borough'] == 'Downtown Toronto')| (tb['Borough'] =='East Toronto')|\
    (tb['Borough'] =='West Toronto')|(tb['Borough'] =='Central Toronto')
tb = tb[index]
tb.reset_index(drop = True)

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636
1,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.662301,-79.389494
2,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,Downtown Toronto,St. James Town,43.651494,-79.375418
4,East Toronto,The Beaches,43.676357,-79.293031
5,Downtown Toronto,Berczy Park,43.644771,-79.373306
6,Downtown Toronto,Central Bay Street,43.657952,-79.387383
7,Downtown Toronto,Christie,43.669542,-79.422564
8,Downtown Toronto,"Richmond , Adelaide , King",43.650571,-79.384568
9,West Toronto,"Dufferin , Dovercourt Village",43.669005,-79.442259


### Foursquare

In [14]:
CLIENT_ID = 'PSUI5HTST3BUOACVFELLPJVQFNXDKGLITSISKQ4BEQIQXASS' # your Foursquare ID
CLIENT_SECRET = 'P2HZLUJ4TXDDIGFNRCPXOS35DJAG4ZEVFNQUFV2Z2A3A0JE1' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

**define the fonction to explore the venues and transform it to a dataframe**

In [15]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [16]:
LIMIT = 100
toronto_venues = getNearbyVenues(names=tb['Neighborhood'],
                                   latitudes=tb['Latitude'],
                                   longitudes=tb['Longitude']
                                  )

Regent Park ,  Harbourfront
Queen's Park ,  Ontario Provincial Government
Garden District, Ryerson
St. James Town
The Beaches
Berczy Park
Central Bay Street
Christie
Richmond ,  Adelaide ,  King
Dufferin ,  Dovercourt Village
Harbourfront East ,  Union Station ,  Toronto Islands
Little Portugal ,  Trinity
The Danforth West ,  Riverdale
Toronto Dominion Centre ,  Design Exchange
Brockton ,  Parkdale Village ,  Exhibition Place
India Bazaar ,  The Beaches West
Commerce Court ,  Victoria Hotel
Studio District
Lawrence Park
Roselawn
Davisville North
Forest Hill North & West
High Park ,  The Junction South
North Toronto West
The Annex ,  North Midtown ,  Yorkville
Parkdale ,  Roncesvalles
Davisville
University of Toronto ,  Harbord
Runnymede ,  Swansea
Moore Park ,  Summerhill East
Kensington Market ,  Chinatown ,  Grange Park
Summerhill West ,  Rathnelly ,  South Hill ,  Forest Hill SE ,  Deer Park
CN Tower ,  King and Spadina ,  Railway Lands ,  Harbourfront West ,  Bathurst  Quay ,  Sout

**Let's see the shape and the five rows of the dataframe**

In [17]:
print(toronto_venues.shape)
toronto_venues.head()

(1635, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park , Harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Regent Park , Harbourfront",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Regent Park , Harbourfront",43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
3,"Regent Park , Harbourfront",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,"Regent Park , Harbourfront",43.65426,-79.360636,Morning Glory Cafe,43.653947,-79.361149,Breakfast Spot


**How may categories of those venues in the dataframe?**

In [18]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 231 uniques categories.


## Analyze Each Neighborhood

In [19]:
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
# add neighborhood column back to dataframe
toronto_onehot['neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]
toronto_onehot.head()

Unnamed: 0,neighborhood,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,"Regent Park , Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Regent Park , Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Regent Park , Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Regent Park , Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Regent Park , Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


**And then let's group rows by neighborhoods and take the means of the frequency of occurrence of exch category**

In [20]:
toronto_grouped = toronto_onehot.groupby('neighborhood').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,neighborhood,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.017857,0.0,0.0,0.0,0.0,0.0,0.0
1,"Brockton , Parkdale Village , Exhibition Place",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Business reply mail Processing CentrE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0625
3,"CN Tower , King and Spadina , Railway Lands ...",0.071429,0.071429,0.071429,0.071429,0.142857,0.071429,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.015625,0.0,0.0,0.015625,0.0,0.0,0.015625


## Cluster Neighborhoods

**Run *k*-means to cluster the neighborhood into 5 clusters.**

In [21]:
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([2, 2, 0, 0, 2, 0, 2, 2, 2, 0], dtype=int32)

**Insert the result to the dataframe**

In [22]:
tb.insert(4, 'Cluster Labels', kmeans.labels_)
tb.head() # check the last columns!

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Cluster Labels
2,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636,2
4,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.662301,-79.389494,2
9,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,0
15,Downtown Toronto,St. James Town,43.651494,-79.375418,0
19,East Toronto,The Beaches,43.676357,-79.293031,2


### Let's visualize the neighborhoods based on the clustering result

In [23]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(tb['Latitude'], tb['Longitude'], tb['Neighborhood'], tb['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=10,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters