# Segmenting and Clustering Neighborhoods in Toronto
# Using webscraping , converting the table in the wiki page(Canada postal codes : M) into a        # pandas dataframe.
# Using "GeoSpatial Dataset" to get the geographical coordinates of each postal code as            # geocoder is not working

In [1]:
#importing required modules 
import re
import requests
import sys
import json
# Import the Beautiful Soup Library
from bs4 import BeautifulSoup,NavigableString
# Import the plotting library
import matplotlib.pyplot as plt
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
!conda install -c conda-forge folium=0.5.0 --yes
import folium
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

Collecting package metadata (current_repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Collecting package metadata (repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python-3.7-OpenCE

  added / updated specs:
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    altair-4.1.0               |             py_1         614 KB  conda-forge
    branca-0.4.2               |     pyhd8ed1ab_0          26 KB  conda-forge
    certifi-2021.5.30          |   py37h89c1867_0         141 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    python_abi-3.7             |          2_cp37m           4 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    widgetsnbextension-3.5.1   |   py37h89c

In [2]:
#webscrapping the table using beautiful soup and getting all the table rows 
try:
	URL="https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
	req = requests.get(URL)
	html_data = req.text
	beautiful_soup=BeautifulSoup(html_data,"html5lib")
	t_data = beautiful_soup.find("table").find("tbody").find_all("tr")
except:
	print("Error occured")

#dataframe will consist of three columns: PostalCode, Borough, and Neighborhood
t_frame = pd.DataFrame(columns=["PostalCode", "Borough", "Neighborhood"])


In [3]:
# traversing each table data in each row
for data in t_data:
	
	td = data.findAll('td')
	for d in td:
		ele = d.find('p') # identifying all the <p> elements in the html text
		s = ele.get_text(separator=" ").strip() #Extract only the text from the html text and use a separator(" ") for readibility
		if "Not assigned" not in s: #Ignore cells with a borough that is Not assigned. Not assigned neibourhoods will be equal to borough name
			#print(s)
			p_code = s[0:3] # get the postal code(the first three characters) from html text
			nei = re.findall(r"\((.*?)\)",s)
			nei_hud = nei[0] # find only the neighbourhood name. The neighbourhood is in brackets in html text
			nei_hud = nei_hud.replace('/',',') # the '/' in neighbourhood in html text replaced by ',' combining multiple neighbourhoods into a borough
			bor = re.search(r"(.*?)\s*\(",s)
			bor_ug = (bor[1])[4:] # find the borough name which is excluding p_code and nei_hud.
			#print(p_code)
			#print(bor_ug)
			#print(nei_hud)
			t_frame = t_frame.append({
						"PostalCode" : p_code,
						"Borough" : bor_ug,
						"Neighborhood" : nei_hud
			},ignore_index=True) # create a dataframe containing all three

In [4]:
print(t_frame.shape) #use the .shape method to print the number of rows of your dataframe.

(103, 3)


In [5]:
t_frame.head() #print the dataframe first 5 rows

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


# Now lets use "GeoSpatial Dataset" to get the geographical coordinates of each postal code  # as geocoder is not working. I will use IBM developer docs to upload the csv file provided in    # question and then convert that into a dataframe. Link (https://developer.ibm.com/technologies/object-storage/tutorials/ibm-cloud-object-storage-usage-ibm-datascience-experience/)

In [6]:
# Its very simple

import os, types
import pandas as pd
from botocore.client import Config
import ibm_boto3

def __iter__(self): return 0

# @hidden_cell
# The following code accesses a file in your IBM Cloud Object Storage. It includes your credentials.
# You might want to remove those credentials before you share the notebook.
# I  have removed my credentials as I am  sharing file


if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )

# access the pandas dataframe
l_frame  = pd.read_csv(body)
l_frame.head() # print first five rows of the new dataframe


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [7]:
l_frame.rename(columns={"Postal Code":"PostalCode"},inplace=True) # lets rename the columns before we merge dataframes
df_merge = pd.merge(t_frame,l_frame, on="PostalCode")
df_merge.head() # print the merged dataframe (5 rows) df_merge which contains the coordinates also

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor , Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494


In [8]:
df_merge.shape

(103, 5)

# Now lets cluster the neighbourhoods, shall we !!
# Lets begin by working with only boroughs that contain the word Toronto 

In [9]:
toronto_data = pd.DataFrame(columns=["PostalCode", "Borough", "Neighborhood", "Latitude", "Longitude"])

#only choose the boroughs with the word Toronto. Eg Downtown Toronto
for pcode,boro,neigh,lat,lng in zip(df_merge['PostalCode'],df_merge['Borough'],df_merge['Neighborhood'],df_merge['Latitude'], df_merge['Longitude']):
	if "Toronto" in boro:
		toronto_data=toronto_data.append({
						"PostalCode" : pcode,
						"Borough" : boro,
						"Neighborhood" : neigh,
						"Latitude" : lat,
						"Longitude" : lng
			},ignore_index=True)

In [10]:
toronto_data.shape #Lets get the size of the resulting dataframe

(39, 5)

In [11]:
#the geographical coordinates of Toronto
latitude=43.651070
longitude=-79.347015

In [27]:
# create map of toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)
    
#Lets view the map of boroughs with Toronto in their name
map_toronto

# Lets create a function to get all the nearby venues of each neighbourhood
# Will remove all the client_id etc before sharing notebook

In [13]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):

    
    LIMIT = 100 # A default Foursquare API limit value
    radius = 500 # define radius
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        #print(name)
        
      for n in name.split(","):		
        # create the API request URL
        print(n)
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            n, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [14]:
toronto_venues = getNearbyVenues(names=toronto_data['Neighborhood'],
                                   latitudes=toronto_data['Latitude'],
                                   longitudes=toronto_data['Longitude']
                                  )

 Regent Park  
  Harbourfront 
 Garden District 
  Ryerson 
 St. James Town 
 The Beaches 
Berczy Park
Central  Bay Street 
Christie
 Richmond  
 Adelaide 
  King 
 Dufferin  
  Dovercourt Village 
 The Danforth    East
 Harbourfront  East 
  Union Station  
  Toronto Islands 
 Little Portugal  
  Trinity 
 The Danforth  West 
  Riverdale 
 Toronto Dominion Centre  
  Design Exchange 
 Brockton  
  Parkdale Village  
  Exhibition Place 
 India Bazaar  
  The Beaches  West
 Commerce Court  
  Victoria Hotel 
Studio District
 Lawrence Park 
Roselawn
 Davisville  North
 Forest Hill  North & West
 High Park  
  The Junction  South
 North Toronto  West
 The Annex  
  North Midtown  
  Yorkville 
 Parkdale  
  Roncesvalles 
 Davisville 
 University of Toronto  
 Harbord
 Runnymede  
  Swansea 
 Moore Park  
  Summerhill  East
 Kensington Market  
  Chinatown  
  Grange Park 
 Summerhill  West 
  Rathnelly  
  South Hill  
  Forest Hill  SE 
  Deer Park 
 CN Tower  
  King and Spadina  
  Rai

In [15]:
# Lets analyze the dataframes
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

In [17]:
# Lets check the onehot dataframe
toronto_onehot.tail()

Unnamed: 0,Yoga Studio,Adult Boutique,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Theater,Theme Restaurant,Tibetan Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar
3071,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3072,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3073,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3074,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3075,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
#let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()


In [19]:
#print each neighborhood along with the top 5 most common venues
num_top_venues = 5
for hood in toronto_grouped['Neighborhood']:
    #print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

                venue  freq
0         Coffee Shop  0.09
1          Restaurant  0.06
2                Café  0.06
3  Italian Restaurant  0.04
4         Pizza Place  0.04


                           venue  freq
0                           Café  0.08
1                            Bar  0.06
2                    Coffee Shop  0.05
3          Vietnamese Restaurant  0.05
4  Vegetarian / Vegan Restaurant  0.05


                venue  freq
0         Coffee Shop  0.14
1          Bagel Shop  0.07
2  Light Rail Station  0.07
3                Bank  0.07
4          Restaurant  0.07


                venue  freq
0         Coffee Shop  0.11
1               Hotel  0.07
2                Café  0.05
3          Restaurant  0.04
4  Seafood Restaurant  0.03


                  venue  freq
0                Bakery  0.12
1              Pharmacy  0.12
2  Gym / Fitness Center  0.06
3                   Bar  0.06
4               Brewery  0.06


            venue  freq
0            Café  0.13
1     Coffee Shop  0.09


In [20]:
#lets sort the venues in descending order and display the top 10 venues for each neighborhood.
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]
	
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))


In [21]:
# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)


In [22]:
#Lets preview our sorted venues dataframe
neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Cabbagetown,Coffee Shop,Café,Restaurant,Italian Restaurant,Bakery,Chinese Restaurant,Pizza Place,Pub,Beer Store,Indian Restaurant
1,Chinatown,Café,Bar,Vietnamese Restaurant,Vegetarian / Vegan Restaurant,Coffee Shop,Mexican Restaurant,Bakery,Burger Joint,Caribbean Restaurant,Dumpling Restaurant
2,Deer Park,Coffee Shop,Pizza Place,Liquor Store,Restaurant,Bank,Pub,Bagel Shop,Supermarket,Fried Chicken Joint,Sushi Restaurant
3,Design Exchange,Coffee Shop,Hotel,Café,Restaurant,Seafood Restaurant,Bakery,Salad Place,Italian Restaurant,Japanese Restaurant,Steakhouse
4,Dovercourt Village,Pharmacy,Bakery,Music Venue,Middle Eastern Restaurant,Supermarket,Brazilian Restaurant,Café,Furniture / Home Store,Bar,Bank


In [23]:
#Run k-means to cluster the neighborhood into 7 clusters.

# set number of clusters
kclusters = 7

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
print(kmeans.labels_[0:10]) 

[0 0 3 0 0 0 3 0 0 1]


In [24]:
#Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')
toronto_merged = toronto_merged.dropna() #dropping the null values in columns as they have little/no effect on our cluster

toronto_merged.head() 

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,0.0,Coffee Shop,Café,Restaurant,Bakery,Cosmetics Shop,Cocktail Bar,Clothing Store,Gym,Park,Moroccan Restaurant
3,M4E,East Toronto,The Beaches,43.676357,-79.293031,0.0,Pub,Trail,Health Food Store,Wine Bar,Doner Restaurant,Diner,Discount Store,Distribution Center,Dog Run,Dumpling Restaurant
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,0.0,Bakery,Coffee Shop,Cocktail Bar,Seafood Restaurant,Beer Bar,Pharmacy,Restaurant,Farmers Market,Cheese Shop,Clothing Store
5,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383,0.0,Coffee Shop,Sandwich Place,Café,Italian Restaurant,Salad Place,Deli / Bodega,Restaurant,Bubble Tea Shop,Japanese Restaurant,Burger Joint
6,M6G,Downtown Toronto,Christie,43.669542,-79.422564,0.0,Grocery Store,Café,Park,Baby Store,Coffee Shop,Restaurant,Italian Restaurant,Athletics & Sports,Candy Store,Nightclub


In [26]:
#Now lets visualize the clusters

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    #print(cluster)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster)],
        fill=True,
        fill_color=rainbow[int(cluster)],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters