# Segmenting and Clustering Neighborhoods in Toronto #

## Part 1 - Web Scraping ##

### Firstly, I'll be extracting the text of the Wikipedia page in order to push it into a Pandas dataframe ###

In [1]:
import pandas as pd
import requests

source_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M" # set wiki page as a variable

# Fetch the page. The timeout stops the cell from hanging forever on a stalled
# connection, and raise_for_status() turns an HTTP error into an exception
# instead of letting an error page be sliced up as if it were the table.
response = requests.get(source_url, timeout = 30)
response.raise_for_status()

raw_page = response.text # the HTML of the wiki page as a text string

# The table body is taken to run from just after the first "</tr>" (the end of
# the header row) up to the first "</table>".
header_end = raw_page.find("</tr>")
table_end = raw_page.find("</table>")

if header_end == -1 or table_end == -1:
    # str.find returns -1 when the marker is missing; slicing with that would
    # silently produce a meaningless string, so fail loudly instead.
    raise ValueError("Could not locate the postal code table in " + source_url)

# +6 skips the five characters of "</tr>" plus the one character after it
# (presumably a newline in the page markup)
page = raw_page[header_end+6:table_end] # cut out the table section only

can_df = pd.DataFrame(columns = ["PostalCode", "Borough", "Neighborhood"]) # create an empty dataframe to add rows to

### Now I intend to cut the table apart row by row, extracting the PostalCode, Borough and Neighborhood values, then adding them to our dataframe created above ###

In [2]:
def _cell_text(cell_html):
    """Return the visible text of one table cell.

    Surrounding whitespace is removed. If the cell contains markup (for
    example a hyperlink), the text between the first tag and the next tag is
    returned instead of the raw cell content.
    """
    text = cell_html.strip()

    if "<" in text:
        inner_start = text.find(">") + 1
        inner_end = text.find("<", inner_start)
        # with no following tag, keep everything after the opening tag
        text = text[inner_start:] if inner_end == -1 else text[inner_start:inner_end]

    return text.strip()


def _parse_postcode_rows(table_html):
    """Turn the table HTML into a list of row dicts.

    Each dict has the keys "PostalCode", "Borough" and "Neighborhood". Rows
    whose borough is "Not assigned" are skipped, and a neighborhood of
    "Not assigned" is replaced by the borough name. Rows with fewer than
    three <td> cells are ignored.
    """
    records = []

    for row_html in table_html.split("</tr>"):

        # text between each "<td>" and the "</td>" that follows it
        cells = [chunk.split("</td>")[0] for chunk in row_html.split("<td>")[1:]]

        if len(cells) < 3:
            continue

        postal_code = cells[0].strip()
        borough = _cell_text(cells[1])

        if borough == "Not assigned": # no borough means the row carries no usable data
            continue

        neighborhood = _cell_text(cells[2])

        if neighborhood == "Not assigned": # fall back to the borough name
            neighborhood = borough

        records.append({"PostalCode":postal_code, "Borough":borough, "Neighborhood":neighborhood})

    return records


# Build the dataframe in one go from the collected rows. Appending to a
# dataframe inside a loop copies it on every iteration, and DataFrame.append
# no longer exists in pandas 2.0+.
can_df = pd.DataFrame(_parse_postcode_rows(page), columns = ["PostalCode", "Borough", "Neighborhood"])

print("Dataframe 'can_df' has been populated.")

Dataframe 'can_df' has been populated.


### Now the table has been recreated as a dataframe, but we only want one row per PostalCode, so the rows must be grouped and the Neighborhood values comma-separated. ###

In [3]:
# Collapse to one row per (PostalCode, Borough) pair, joining the values of
# the remaining column(s) for that pair into a single comma-separated string.
postcode_groups = can_df.groupby(["PostalCode", "Borough"], as_index = False)

can_df = postcode_groups.agg(lambda names: ", ".join(names))

can_df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


### Now to check the shape of the new dataframe ###

In [4]:
can_df.shape # (rows, columns) of the grouped dataframe

(103, 3)

## Part 2 - Looking up location data ##

### As geocoder requests are timing out, we need to use the CSV provided ###

In [5]:
location_filepath = "https://cocl.us/Geospatial_data" # URL of the provided CSV of coordinates

location_data = pd.read_csv(location_filepath) # read the CSV at that URL into a dataframe

location_data.shape # check the number of rows are the same

(103, 3)

### Before adding the location data, it's worth sorting both dataframes to ensure the rows match up ###

In [6]:
# Order the rows by postcode (rebinding the name rather than sorting in place).
can_df = can_df.sort_values(by = "PostalCode")

can_df.tail(10) # the bottom rows are the most likely to show sync problems

Unnamed: 0,PostalCode,Borough,Neighborhood
93,M9A,Etobicoke,Islington Avenue
94,M9B,Etobicoke,"Cloverdale, Islington, Martin Grove, Princess ..."
95,M9C,Etobicoke,"Bloordale Gardens, Eringate, Markland Wood, Ol..."
96,M9L,North York,Humber Summit
97,M9M,North York,"Emery, Humberlea"
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv..."
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."
102,M9W,Etobicoke,Northwest


In [7]:
# Order the rows by postcode (rebinding the name rather than sorting in place).
location_data = location_data.sort_values(by = "Postal Code")

location_data.tail(10) # the bottom rows are the most likely to show sync problems

Unnamed: 0,Postal Code,Latitude,Longitude
93,M9A,43.667856,-79.532242
94,M9B,43.650943,-79.554724
95,M9C,43.643515,-79.577201
96,M9L,43.756303,-79.565963
97,M9M,43.724766,-79.532242
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437
102,M9W,43.706748,-79.594054


### Now to add the Lat/Long columns to the existing dataframe ###

In [8]:
# Attach the coordinates by looking each postal code up in location_data.
# Assigning the columns directly pairs rows by index label, which is only
# correct while both frames happen to list the postal codes in the same
# order; a keyed lookup stays correct whatever the row order is.
coordinates_by_postcode = location_data.set_index("Postal Code")[["Latitude", "Longitude"]]

# Postal codes with no entry in location_data get NaN coordinates.
can_df["Latitude"] = can_df["PostalCode"].map(coordinates_by_postcode["Latitude"])
can_df["Longitude"] = can_df["PostalCode"].map(coordinates_by_postcode["Longitude"])

can_df.head() # check the merge

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## Part 3 - Clustering and analysis ##

### The first step will be to visualise can_df using Folium ###

In [9]:
import folium

### To centre our map, we must first have latitude and longitude values for Toronto ###

In [10]:
Toronto_lat = can_df["Latitude"].mean() # take the mean latitude of our dataframe

Toronto_long = can_df["Longitude"].mean() # take the mean longitude of our dataframe

# show the computed map centre as "latitude , longitude"
print(Toronto_lat, ", ", Toronto_long)

43.70460773398059 ,  -79.39715291165048


In [11]:
# Base map centred on the mean coordinates computed earlier.
Toronto_map = folium.Map(location = [Toronto_lat, Toronto_long], zoom_start = 11)

# One circle marker per row of can_df; its popup reads "<neighborhood> - <borough>".
marker_columns = can_df[["Latitude", "Longitude", "Borough", "Neighborhood"]]

for lat, long, borough, neighborhood in marker_columns.itertuples(index = False, name = None):

    popup_text = '{} - {}'.format(neighborhood, borough)

    marker = folium.CircleMarker(
        [lat, long],
        radius = 5,
        popup = folium.Popup(popup_text, parse_html = True),
        color = "blue",
        fill = True,
        fill_color = "#3186cc",
        fill_opacity = 0.7,
        parse_html = False)

    marker.add_to(Toronto_map)

Toronto_map

### There are a lot of boroughs that are spread apart, so let's focus on the central areas only ###

In [12]:
# Keep only the rows for Downtown, East and West Toronto, renumbering the index from 0.
central_boroughs = ["Downtown Toronto", "East Toronto", "West Toronto"]

in_central_borough = can_df["Borough"].isin(central_boroughs)

central_toronto_df = can_df[in_central_borough].reset_index(drop = True)

central_toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529


In [13]:
# Centre of the central-Toronto subset: the mean of its latitudes and longitudes.
Central_Toronto_lat = central_toronto_df["Latitude"].mean()

Central_Toronto_long = central_toronto_df["Longitude"].mean()

# Base map centred on that point, zoomed in further than the city-wide map.
Central_Toronto_map = folium.Map(location = [Central_Toronto_lat, Central_Toronto_long], zoom_start = 13)

# One circle marker per row; its popup reads "<neighborhood> - <borough>".
marker_columns = central_toronto_df[["Latitude", "Longitude", "Borough", "Neighborhood"]]

for lat, long, borough, neighborhood in marker_columns.itertuples(index = False, name = None):

    popup_text = '{} - {}'.format(neighborhood, borough)

    marker = folium.CircleMarker(
        [lat, long],
        radius = 5,
        popup = folium.Popup(popup_text, parse_html = True),
        color = "blue",
        fill = True,
        fill_color = "#3186cc",
        fill_opacity = 0.7,
        parse_html = False)

    marker.add_to(Central_Toronto_map)

Central_Toronto_map

### Using Foursquare, we'll now explore the venues nearest to each PostalCode area's centre ###

In [14]:
import os

# Foursquare API credentials.
# Real keys should never be committed with the notebook: set the environment
# variables FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before starting
# the kernel. When they are not set, the placeholder 'REDACTED' is used, which
# the API will not accept.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', 'REDACTED')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', 'REDACTED')

# API version date sent as the "v" parameter of every request
VERSION = '20180605'

In [15]:
def getNearbyVenues(PostalCodes, names, latitudes, longitudes, radius = 500, LIMIT = 100): # define a function to explore the nearest venues to a coordinate
    """Query the Foursquare "explore" endpoint around each coordinate.

    Parameters
    ----------
    PostalCodes, names, latitudes, longitudes : iterables
        Parallel sequences, iterated together with zip; one API request is
        made per element.
    radius : number, optional
        Sent as the "radius" parameter of each request (default 500).
    LIMIT : int, optional
        Sent as the "limit" parameter of each request (default 100).

    Returns
    -------
    pandas.DataFrame
        One row per venue returned, with the columns "Postal Code",
        "Neighborhoods", "Neighborhood Latitude", "Neighborhood Longitude",
        "Venue", "Venue Latitude", "Venue Longitude" and "Venue Category".
        The frame is empty, but still has these columns, when no venues
        were found or the inputs are empty.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    venue_columns = ["Postal Code", "Neighborhoods", "Neighborhood Latitude", "Neighborhood Longitude", "Venue", "Venue Latitude", "Venue Longitude", "Venue Category"]

    venue_rows = []

    for PostalCode, name, lat, long in zip(PostalCodes, names, latitudes, longitudes):

        # Let requests build and escape the query string; the timeout keeps a
        # stalled connection from hanging the whole loop.
        response = requests.get(
            "https://api.foursquare.com/v2/venues/explore",
            params = {
                "client_id": CLIENT_ID,
                "client_secret": CLIENT_SECRET,
                "v": VERSION,
                "ll": "{},{}".format(lat, long),
                "radius": radius,
                "limit": LIMIT},
            timeout = 30)

        response.raise_for_status() # fail on HTTP errors rather than on a confusing KeyError below

        results = response.json()["response"]["groups"][0]["items"]

        for v in results: # keep only the relevant information for each nearby venue

            categories = v["venue"]["categories"]

            venue_rows.append((
                PostalCode,
                name,
                lat,
                long,
                v["venue"]["name"],
                v["venue"]["location"]["lat"],
                v["venue"]["location"]["lng"],
                categories[0]["name"] if categories else None)) # a venue may come back with no category

    # Passing the column names to the constructor also works when no rows
    # were collected, unlike assigning .columns on an empty frame afterwards.
    nearby_venues = pd.DataFrame(venue_rows, columns = venue_columns)

    return(nearby_venues)

In [16]:
# radius and LIMIT are left at the function's defaults (500 and 100)
Central_Toronto_venues = getNearbyVenues( # create a new dataframe of venues near to the relevant postcodes
    PostalCodes = central_toronto_df["PostalCode"],
    names = central_toronto_df["Neighborhood"],
    latitudes = central_toronto_df["Latitude"],
    longitudes = central_toronto_df["Longitude"])

Central_Toronto_venues.head() # preview the first venues returned

Unnamed: 0,Postal Code,Neighborhoods,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M4E,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
1,M4E,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
2,M4E,The Beaches,43.676357,-79.293031,Starbucks,43.678798,-79.298045,Coffee Shop
3,M4E,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
4,M4E,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood


### With a LIMIT of 100 venues per postcode and a city centre as our target, we probably have a lot of venue data to deal with ###

In [17]:
Central_Toronto_venues.shape # (number of venue rows, number of columns)

(1586, 8)

### One-hot encoding and grouping will help collapse this back down to a per-postcode level ###

In [18]:
id_columns = ["Postal Code", "Neighborhoods"] # identifying columns carried alongside the encoded categories

# one indicator column per venue category, named after the category itself
onehot = pd.get_dummies(Central_Toronto_venues[["Venue Category"]], prefix = "", prefix_sep = "")

# bring the identifying columns across; they are added at the right-hand end
onehot[id_columns] = Central_Toronto_venues[id_columns]

# reorder so the identifying columns come first, followed by every category column
fixed_columns = id_columns + [column for column in onehot.columns if column not in id_columns]

onehot = onehot[fixed_columns]

onehot.head() # check it worked!

Unnamed: 0,Postal Code,Neighborhoods,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,...,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Yoga Studio
0,M4E,The Beaches,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M4E,The Beaches,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M4E,The Beaches,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M4E,The Beaches,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,M4E,The Beaches,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# Average the 1/0 indicators within each (Postal Code, Neighborhoods) group,
# giving the share of that area's venues that falls in each category.
category_means = onehot.groupby(["Postal Code", "Neighborhoods"]).mean()

# turn the group keys back into ordinary columns
Toronto_grouped = category_means.reset_index()

Toronto_grouped.head()

Unnamed: 0,Postal Code,Neighborhoods,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,...,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Yoga Studio
0,M4E,The Beaches,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M4K,"The Danforth West, Riverdale",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.022727,0.0,0.0,0.0,0.0,0.0,0.0,0.022727
2,M4L,"The Beaches West, India Bazaar",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M4M,Studio District,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.027027
4,M4W,Rosedale,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### With so many types of venue, we should focus on the most common in each area ###

In [20]:
def return_most_common_venues(row, X_venues):
    """Return the labels of the X_venues largest values in a row.

    The first two entries of `row` are skipped; the remaining entries are
    ranked from largest to smallest value and the index labels of the first
    X_venues of them are returned as an array.
    """
    category_scores = row.iloc[2:]

    ranked_labels = category_scores.sort_values(ascending = False).index.values

    return ranked_labels[:X_venues]

In [21]:
import numpy as np

X_venues = 10 # set that we want the top 10

indicators = ["st", "nd", "rd"] # set up exceptions for English-language ordinals


def _ordinal(n):
    """Return n followed by its English ordinal suffix, e.g. 1 -> "1st", 11 -> "11th", 22 -> "22nd"."""
    if 10 <= n % 100 <= 20: # 11th, 12th and 13th do not follow the st/nd/rd pattern
        return "{}th".format(n)

    last_digit = n % 10

    suffix = indicators[last_digit - 1] if 1 <= last_digit <= 3 else "th"

    return "{}{}".format(n, suffix)


# Column headings: the two identifying columns, then "1st Most Common Type",
# "2nd Most Common Type", ... up to X_venues. The suffix is chosen explicitly
# rather than by catching an exception, so unrelated errors are not hidden.
columns = ["Postal Code", "Neighborhoods"] + ["{} Most Common Type".format(_ordinal(rank)) for rank in range(1, X_venues + 1)]

neighborhoods_venues_sorted = pd.DataFrame(columns = columns) # create a blank dataframe with our new columns

neighborhoods_venues_sorted[["Postal Code", "Neighborhoods"]] = Toronto_grouped[["Postal Code", "Neighborhoods"]] # add our postcode/neighborhoods to the blank dataframe

for ind in range(Toronto_grouped.shape[0]): # loop through each row index

    neighborhoods_venues_sorted.iloc[ind, 2:] = return_most_common_venues(Toronto_grouped.iloc[ind, :], X_venues) # run our function and set the values for columns 2 onwards

neighborhoods_venues_sorted.head() # check the results

Unnamed: 0,Postal Code,Neighborhoods,1st Most Common Type,2nd Most Common Type,3rd Most Common Type,4th Most Common Type,5th Most Common Type,6th Most Common Type,7th Most Common Type,8th Most Common Type,9th Most Common Type,10th Most Common Type
0,M4E,The Beaches,Neighborhood,Coffee Shop,Health Food Store,Trail,Pub,Doner Restaurant,Dim Sum Restaurant,Diner,Discount Store,Dog Run
1,M4K,"The Danforth West, Riverdale",Greek Restaurant,Coffee Shop,Ice Cream Shop,Italian Restaurant,Bookstore,Furniture / Home Store,Caribbean Restaurant,Bakery,Sports Bar,Spa
2,M4L,"The Beaches West, India Bazaar",Sandwich Place,Pet Store,Steakhouse,Food & Drink Shop,Sushi Restaurant,Ice Cream Shop,Pub,Fish & Chips Shop,Movie Theater,Burrito Place
3,M4M,Studio District,Café,Coffee Shop,Bakery,Italian Restaurant,Gastropub,American Restaurant,Yoga Studio,Fish Market,Latin American Restaurant,Coworking Space
4,M4W,Rosedale,Park,Playground,Trail,Yoga Studio,Department Store,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop


### These can now be clustered using KMeans ###

In [22]:
from sklearn.cluster import KMeans

# Drop the non-data columns for clustering. The columns are named with the
# `columns` keyword: passing the axis positionally (drop(labels, 1)) is no
# longer accepted by pandas 2.0+.
Toronto_grouped_clustering = Toronto_grouped.drop(columns = ["Postal Code", "Neighborhoods"])

# n_init is given explicitly because its default differs between scikit-learn
# releases; 10 is the value older releases used when it was left out.
kmeans = KMeans(n_clusters = 4, n_init = 10, random_state = 0).fit(Toronto_grouped_clustering)

kmeans.labels_ # cluster number assigned to each row of Toronto_grouped

array([2, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0], dtype=int32)

### These labels can now be added back into our data ###

In [23]:
# Add the cluster labels as the first column. Any label column left over from
# an earlier run of this cell is removed first, because insert() raises when
# the column already exists.
if "Cluster Label" in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns = "Cluster Label")

neighborhoods_venues_sorted.insert(0, "Cluster Label", kmeans.labels_) # add the label

# Join on the postal code rather than on the neighborhood text, so each area
# is matched by its key. The "Neighborhoods" column is dropped beforehand as
# central_toronto_df already carries that information.
venue_profiles = neighborhoods_venues_sorted.drop(columns = "Neighborhoods").set_index("Postal Code")

# Postal codes with no row in venue_profiles end up with NaN in the joined columns.
central_toronto_categorised = central_toronto_df.join(venue_profiles, on = "PostalCode") # join two of the dataframes so all the information is available

central_toronto_categorised.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Label,1st Most Common Type,2nd Most Common Type,3rd Most Common Type,4th Most Common Type,5th Most Common Type,6th Most Common Type,7th Most Common Type,8th Most Common Type,9th Most Common Type,10th Most Common Type
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,2,Neighborhood,Coffee Shop,Health Food Store,Trail,Pub,Doner Restaurant,Dim Sum Restaurant,Diner,Discount Store,Dog Run
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,0,Greek Restaurant,Coffee Shop,Ice Cream Shop,Italian Restaurant,Bookstore,Furniture / Home Store,Caribbean Restaurant,Bakery,Sports Bar,Spa
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572,0,Sandwich Place,Pet Store,Steakhouse,Food & Drink Shop,Sushi Restaurant,Ice Cream Shop,Pub,Fish & Chips Shop,Movie Theater,Burrito Place
3,M4M,East Toronto,Studio District,43.659526,-79.340923,0,Café,Coffee Shop,Bakery,Italian Restaurant,Gastropub,American Restaurant,Yoga Studio,Fish Market,Latin American Restaurant,Coworking Space
4,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529,3,Park,Playground,Trail,Yoga Studio,Department Store,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop


### The results should be visualised to make better sense of them ###

In [24]:
import matplotlib.cm as cm
import matplotlib.colors as colours

map_clusters = folium.Map(location = [Central_Toronto_lat, Central_Toronto_long], zoom_start = 13)

# Only areas that received a cluster label can be coloured; rows with a
# missing label are left off the map instead of breaking the colour lookup.
clustered_areas = central_toronto_categorised.dropna(subset = ["Cluster Label"])

cluster_labels = clustered_areas["Cluster Label"].astype(int)

# One colour per cluster, spread evenly across the rainbow colour map. The
# number of colours follows the labels present instead of being hard-coded.
n_clusters = int(cluster_labels.max()) + 1 if len(cluster_labels) else 0

rainbow = [colours.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, n_clusters))]

for lat, lon, poi, cluster in zip(clustered_areas["Latitude"],
                                  clustered_areas["Longitude"],
                                  clustered_areas["Neighborhood"],
                                  cluster_labels):

    label = folium.Popup(str(poi) + " Cluster " + str(cluster), parse_html = True)

    folium.CircleMarker(
        [lat, lon],
        radius = 5,
        popup = label,
        color = rainbow[cluster], # cluster k always uses the k-th colour
        fill = True,
        fill_color = rainbow[cluster],
        fill_opacity = 0.7).add_to(map_clusters)

map_clusters

### It turns out central Toronto is fairly similar all over! ###
#### Using the top 10 most common venue types, all but three areas have been categorised together. ####
#### From a glance at the map, it looks like Cluster 2 might be more residential, even though it's still in the East Toronto borough ####
#### Cluster 1 is right by an airport and CN tower - not features you'll commonly see in other neighborhoods! ####
#### Cluster 3 is less obvious, so let's take a look at it ####

In [25]:
# Select Cluster 3 by its label, so every area in that cluster is shown and
# the result does not depend on 3 happening to be the highest label.
central_toronto_categorised[central_toronto_categorised["Cluster Label"] == 3]

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Label,1st Most Common Type,2nd Most Common Type,3rd Most Common Type,4th Most Common Type,5th Most Common Type,6th Most Common Type,7th Most Common Type,8th Most Common Type,9th Most Common Type,10th Most Common Type
4,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529,3,Park,Playground,Trail,Yoga Studio,Department Store,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop


### The three most common venue types are parks, playgrounds and trails, so it must be a greener area of the city, with less development! ###