# Segmenting and clustering
## Neighborhoods of Toronto

This Notebook will go through the gathering of location information for Toronto Neighborhoods.

We will then explore each neighborhood's venues and attempt to classify/cluster the neighborhoods based on the venue composition.

### Setting up the environment

In [1]:


import requests  # this module helps us to download a web page
import pandas as pd # primary data structure library
import numpy as np  # useful for many scientific computing in Python

from bs4 import BeautifulSoup # this module helps in web scrapping.

from sklearn.cluster import KMeans # import k-means from clustering stage

from pandas.io.json import json_normalize # tranforming json file into a pandas dataframe library

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
import folium # map rendering library



# Section 1
### Building a dataframe with all boroughs and neighborhoods

In [2]:
# url that contains all required information
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# download the web html and store it as a soup object
data = requests.get(url).text
soup = BeautifulSoup(data,"html5lib")  # create a soup object using the variable 'data'

# create an empty list
table_contents=[]

# extract all tables
table=soup.find('table')

# parse all table cells and extract relevant data via text manipulation
for row in table.findAll('td'):
    cell = {}
    if row.span.text=='Not assigned':
        pass
    else:
        cell['PostalCode'] = row.p.text[:3] # As postal code contains upto 3 characters extract that using tablerow.p.text
        # Next use split ,strip and replace functions for getting Borough and Neighborhood information.
        cell['Borough'] = (row.span.text).split('(')[0]
        cell['Neighborhood'] = (((((row.span.text).split('(')[1]).strip(')')).replace(' /',',')).replace(')',' ')).strip(' ')

        # Append data to the list created
        table_contents.append(cell)

# Create a dataframe from the list
df=pd.DataFrame(table_contents)

# clean up the list of borough's
df['Borough']=df['Borough'].replace({'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto Stn A',
                                             'East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto Business',
                                             'EtobicokeNorthwest':'Etobicoke Northwest','East YorkEast Toronto':'East York/East Toronto',
                                             'MississaugaCanada Post Gateway Processing Centre':'Mississauga'})

# Display the shape of the new dataframe
df.shape

(103, 3)

# Section 2
### Adding coordinates for the locations

In [3]:
# url to CSV file with all longitudes and latitudes
geospatial_url = r'https://cocl.us/Geospatial_data'

# retrieving data from the URL using get method
r = requests.get(geospatial_url)

# giving a name and saving it in any required format
with open("Data\\Toronto_Geospatial_Coordinates.csv", 'wb') as f: # opening the file in write mode
    f.write(r.content) # writes the URL contents from the server

# Import CSV into a Panda Dataframe
geo_df = pd.read_csv ("Data\\Toronto_Geospatial_Coordinates.csv")

# Rename postol code column of CSV, this is so that PostalCode can be used as a key
geo_df.rename(columns = {"Postal Code": "PostalCode"}, inplace=True)

# Merge the two dataframes
ll_df = pd.merge(df, geo_df, on = "PostalCode", how = "inner")

# Filter on boroughs containing "Toronto"
toronto_ll = ll_df[ll_df['Borough'].str.contains("Toronto")]

# Reset the index
toronto_ll.reset_index(inplace=True,drop=True)

# Display Longitude/Latitude Dataframe
toronto_ll.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
3,M4E,East Toronto,The Beaches,43.676357,-79.293031
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306


# Section 3
### Exploring and Clustering the neighborhoods

In [4]:
# Variables required for leveraging FourSqaure API
CLIENT_ID = '0CQLTF4AAQCEZHB1NIFIX4H4PGNUXTFU3OCAHIL0JPA2UOU2' # Foursquare ID
CLIENT_SECRET = '5JUH5EMSQ3ANXM5JDEDV5PPNAM0TYXLIE5D5LW33QLXNTWUM' # Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

In [5]:
# Define function for exploring a location for all nearby venues
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        # print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [6]:
# Gather all toronto venues into a dataframe
toronto_venues = getNearbyVenues(names=toronto_ll['Neighborhood'],
                                   latitudes=toronto_ll['Latitude'],
                                   longitudes=toronto_ll['Longitude']
                                  )

In [7]:
# Let's check how many venues were returned for each neighborhood
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,58,58,58,58,58,58
"Brockton, Parkdale Village, Exhibition Place",22,22,22,22,22,22
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",16,16,16,16,16,16
Central Bay Street,62,62,62,62,62,62
Christie,16,16,16,16,16,16
Church and Wellesley,72,72,72,72,72,72
"Commerce Court, Victoria Hotel",100,100,100,100,100,100
Davisville,36,36,36,36,36,36
Davisville North,9,9,9,9,9,9
"Dufferin, Dovercourt Village",14,14,14,14,14,14


In [8]:
# Let's find out how many unique categories can be curated from all the returned venues
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 231 uniques categories.


## Analyzing each neighborhood
### Preparing the data for analyses

In [9]:
# set up one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# Display onehot encoding
toronto_onehot.head()

Unnamed: 0,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,Art Gallery,Art Museum,...,Tibetan Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# confirm neighborhood info is in dataframe
toronto_onehot['Neighborhood'].head()

0    Regent Park, Harbourfront
1    Regent Park, Harbourfront
2    Regent Park, Harbourfront
3    Regent Park, Harbourfront
4    Regent Park, Harbourfront
Name: Neighborhood, dtype: object

In [11]:
# let's examine the new dataframe size
toronto_onehot.shape

(1580, 231)

In [12]:
# let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,Art Gallery,...,Tibetan Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Yoga Studio
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.017241,...,0.0,0.0,0.0,0.0,0.017241,0.0,0.0,0.0,0.0,0.0
1,"Brockton, Parkdale Village, Exhibition Place",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"CN Tower, King and Spadina, Railway Lands, Har...",0.0625,0.0625,0.125,0.125,0.125,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.016129,0.0,0.0,0.016129,0.0,0.016129
4,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Church and Wellesley,0.0,0.0,0.0,0.0,0.0,0.013889,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.013889,0.027778
6,"Commerce Court, Victoria Hotel",0.0,0.0,0.0,0.0,0.0,0.03,0.0,0.0,0.01,...,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.0,0.0
7,Davisville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.027778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Davisville North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,"Dufferin, Dovercourt Village",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Finding the most prominent venues per neighborhood

In [13]:
# let's write a function to sort the venues in descending order.
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [14]:
# Now let's create the new dataframe and display the top 10 venues for each neighborhood.

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe with neighborhoods
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

# add most common venues
for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

# Display the dataframe
neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Bakery,Cocktail Bar,Pharmacy,Cheese Shop,Restaurant,Farmers Market,Beer Bar,Seafood Restaurant,Irish Pub
1,"Brockton, Parkdale Village, Exhibition Place",Café,Coffee Shop,Breakfast Spot,Grocery Store,Bakery,Performing Arts Venue,Pet Store,Nightclub,Climbing Gym,Restaurant
2,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Lounge,Airport Service,Airport Terminal,Harbor / Marina,Sculpture Garden,Airport Food Court,Bar,Boat or Ferry,Boutique,Coffee Shop
3,Central Bay Street,Coffee Shop,Sandwich Place,Café,Italian Restaurant,Bubble Tea Shop,Thai Restaurant,Burger Joint,Salad Place,New American Restaurant,Comic Shop
4,Christie,Grocery Store,Café,Park,Coffee Shop,Italian Restaurant,Baby Store,Candy Store,Athletics & Sports,Nightclub,Restaurant


### Clustering neighborhoods by venue composition

In [15]:
# set number of clusters
kclusters = 4

# Drop neighborhood column for clustering to be done
toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

### Adding the cluster labels back into the venue analysis

In [16]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_ll

# merge toronto_ll with toronto_merged to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,2,Coffee Shop,Park,Bakery,Pub,Café,Breakfast Spot,Theater,Yoga Studio,Spa,Shoe Store
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,2,Clothing Store,Coffee Shop,Bubble Tea Shop,Cosmetics Shop,Middle Eastern Restaurant,Café,Lingerie Store,Fast Food Restaurant,Diner,Hotel
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,2,Coffee Shop,Café,Cosmetics Shop,Cocktail Bar,Beer Bar,Gym,Hotel,Farmers Market,Italian Restaurant,Lingerie Store
3,M4E,East Toronto,The Beaches,43.676357,-79.293031,2,Health Food Store,Asian Restaurant,Trail,Pub,Yoga Studio,Electronics Store,Doner Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,2,Coffee Shop,Bakery,Cocktail Bar,Pharmacy,Cheese Shop,Restaurant,Farmers Market,Beer Bar,Seafood Restaurant,Irish Pub


### Reviewing the results of the clustering

### Attempt to describe the cluster classifications

Cluster 0 - Park area with a mix of recreational venues

Cluster 1 - Park area with a mix of retail venues

Cluster 2 - Retail area (high concentration of coffee shops)

Cluster 3 - Garden Area

In [17]:
# Cluster 0 - Park area with a mix of recreational venues
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
9,East York/East Toronto,0,Park,Intersection,Convenience Store,Yoga Studio,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Escape Room,Electronics Store
33,Downtown Toronto,0,Park,Playground,Trail,Yoga Studio,Discount Store,Falafel Restaurant,Event Space,Ethiopian Restaurant,Escape Room,Electronics Store


In [18]:
# Cluster 1 - Park area with a mix of retail venues
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
18,Central Toronto,1,Park,Bus Line,Swim School,Yoga Studio,Dog Run,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Escape Room
21,Central Toronto,1,Park,Trail,Jewelry Store,Bus Line,Sushi Restaurant,Yoga Studio,Dog Run,Falafel Restaurant,Event Space,Ethiopian Restaurant
29,Central Toronto,1,Park,Restaurant,Tennis Court,Yoga Studio,Discount Store,Falafel Restaurant,Event Space,Ethiopian Restaurant,Escape Room,Electronics Store


In [19]:
# Cluster 2 - Retail area (high concentration of coffee shops)
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Downtown Toronto,2,Coffee Shop,Park,Bakery,Pub,Café,Breakfast Spot,Theater,Yoga Studio,Spa,Shoe Store
1,Downtown Toronto,2,Clothing Store,Coffee Shop,Bubble Tea Shop,Cosmetics Shop,Middle Eastern Restaurant,Café,Lingerie Store,Fast Food Restaurant,Diner,Hotel
2,Downtown Toronto,2,Coffee Shop,Café,Cosmetics Shop,Cocktail Bar,Beer Bar,Gym,Hotel,Farmers Market,Italian Restaurant,Lingerie Store
3,East Toronto,2,Health Food Store,Asian Restaurant,Trail,Pub,Yoga Studio,Electronics Store,Doner Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant
4,Downtown Toronto,2,Coffee Shop,Bakery,Cocktail Bar,Pharmacy,Cheese Shop,Restaurant,Farmers Market,Beer Bar,Seafood Restaurant,Irish Pub
5,Downtown Toronto,2,Coffee Shop,Sandwich Place,Café,Italian Restaurant,Bubble Tea Shop,Thai Restaurant,Burger Joint,Salad Place,New American Restaurant,Comic Shop
6,Downtown Toronto,2,Grocery Store,Café,Park,Coffee Shop,Italian Restaurant,Baby Store,Candy Store,Athletics & Sports,Nightclub,Restaurant
7,Downtown Toronto,2,Coffee Shop,Café,Restaurant,Thai Restaurant,Gym,Clothing Store,Deli / Bodega,Salad Place,Hotel,Burrito Place
8,West Toronto,2,Pharmacy,Bakery,Park,Music Venue,Café,Middle Eastern Restaurant,Supermarket,Bar,Bank,Brewery
10,Downtown Toronto,2,Coffee Shop,Aquarium,Hotel,Café,Fried Chicken Joint,Restaurant,Brewery,Sporting Goods Shop,Italian Restaurant,Scenic Lookout


In [20]:
# Cluster 3 - Garden Area
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
19,Central Toronto,3,Garden,Home Service,Ice Cream Shop,Yoga Studio,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Escape Room,Electronics Store


## Visualizing the neighborhood clusters on a map

In [21]:
# Coordiates to be used as the centerpoint of the map
toronto_lat = 43.651070
toronto_long = -79.347015

# create map
map_clusters = folium.Map(location=[toronto_lat, toronto_long], zoom_start=12)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=7,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

# Display the map
map_clusters