# Segmenting and Clustering Neighborhoods in Toronto

In [25]:
import json
import os
from io import StringIO

import pandas as pd
import requests as r
from bs4 import BeautifulSoup

# Submission part 1

## Scrape wikipedia for table of neighbourhoods

In [26]:
wiki_page = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Pull the content of the wiki page. The timeout stops the cell from hanging
# forever, and raise_for_status() reports an HTTP error here rather than as a
# confusing parsing failure further down.
wiki_scrape = r.get(wiki_page, timeout=30)
wiki_scrape.raise_for_status()

# parse the response using beautiful soup
soup = BeautifulSoup(wiki_scrape.content, 'lxml')

# take the first <table> element on the page (assumed to be the postal code table)
table = soup.find_all('table')[0]

# Convert the table back to markup so pandas can parse it into a list of
# dataframes. The markup is wrapped in StringIO because passing a literal HTML
# string to read_html is deprecated.
raw_df = pd.read_html(StringIO(str(table)))

# take the first (and only) dataframe from the list
raw_df = raw_df[0]

# check the shape of the table
raw_df.shape

(288, 3)

## Clean the dataframe 

In [27]:
# Start from an empty frame with no columns; it is populated in the grouping
# step that follows.
df = pd.DataFrame()

# Keep only the rows whose Borough is assigned. Lower-casing first means that
# inconsistent capitalisation cannot let an unassigned row slip through.
has_borough = raw_df['Borough'].str.lower().ne('not assigned')
raw_df = raw_df.loc[has_borough]

# the shape after filtering shows how many rows were dropped
raw_df.shape

(211, 3)

We have dropped 77 rows where Borough was not assigned.

#### Group the dataframe by Postcode

In [28]:
# The Borough and Neighbourhood columns need different aggregations, so each is
# computed separately from the same Postcode grouping.
by_postcode = raw_df.groupby('Postcode')

# Build df from scratch (rather than adding columns to an existing frame) so
# that re-running this cell always produces the same result.
df = pd.DataFrame({
    # one Borough string per Postcode
    'Borough': by_postcode['Borough'].max(),
    # all neighbourhood names of the Postcode, concatenated with ', '
    'Neighborhood': by_postcode['Neighbourhood'].apply(lambda names: ', '.join(names)),
})

# Name the postcode index 'PostalCode' and push it out of the index into a column
df = df.rename_axis('PostalCode').reset_index()

# Check a row that should contain multiple neighborhoods to see that it's formatted correctly
df[df['PostalCode'] == "M5A"]

Unnamed: 0,PostalCode,Borough,Neighborhood
53,M5A,Downtown Toronto,"Harbourfront, Regent Park"


#### Replace 'not assigned' Neighborhoods with Borough names

In [29]:
# At this point there is in fact only one row where the neighborhood is not
# assigned, but the mask below handles any number of such rows.
# A vectorised comparison is used instead of a row-by-row loop; rows with a
# missing Neighborhood simply compare as False instead of raising.
unassigned = df['Neighborhood'].str.lower() == 'not assigned'
df.loc[unassigned, 'Neighborhood'] = df.loc[unassigned, 'Borough']

# Check for any rows matching the intended output of the code above
df[df['Borough'] == df['Neighborhood']]

Unnamed: 0,PostalCode,Borough,Neighborhood
85,M7A,Queen's Park,Queen's Park


### "Use the .shape method to print the number of rows of your dataframe"

In [30]:
# df.shape is (rows, columns); report the row count
print("There are {} rows in the dataframe".format(df.shape[0]))

There are 103 rows in the dataframe


# Submission part 2

## Retrieve lat/long coordinates for each PostalCode

The module recommended in the instructions (geocoder.google) was suggested to be unreliable, and in my testing it failed to retrieve any results even when looping for several minutes. 

I also tried the Nominatim module from the geopy.geocoders library, which was shown in an earlier week of the course; however, that timed out consistently after 2 loop iterations. I have imported the CSV from the instructions page instead. The CSV is saved into my Git repo.

In [31]:
# Load the coordinates CSV and give its key column the same name as in df
lat_longs = pd.read_csv('Geospatial_Coordinates.csv').rename(columns={'Postal Code': 'PostalCode'})

# Right join: every postal code listed in the coordinates file is kept and the
# Borough / Neighborhood columns are attached where the code also exists in df.
# NOTE: df is overwritten here, so this cell should only be run once per kernel.
df = pd.merge(df, lat_longs, how='right', on='PostalCode')

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


# Submission part 3

In [32]:
import requests as r
from pandas.io.json import json_normalize

In [33]:
# Mean coordinates per Borough. The numeric columns are selected explicitly
# because df also holds text columns (PostalCode, Neighborhood) that cannot be
# averaged and make mean() raise on current pandas versions.
df.groupby('Borough')[['Latitude', 'Longitude']].mean()

Unnamed: 0_level_0,Latitude,Longitude
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1
Central Toronto,43.70198,-79.398954
Downtown Toronto,43.654169,-79.383665
East Toronto,43.669436,-79.324654
East York,43.700303,-79.335851
Etobicoke,43.660043,-79.542074
Mississauga,43.636966,-79.615819
North York,43.750727,-79.429338
Queen's Park,43.662301,-79.389494
Scarborough,43.766229,-79.249085
West Toronto,43.652653,-79.44929


## Goal: 
- Represent all PostalCodes grouped by their existing Borough labels
- Cluster the PostalCodes using k-means and present that new grouping on a map
- Retrieve the most common venue category at the centre of each new group and present on a map

In [34]:
import folium

In [35]:
# Mean latitude/longitude of the PostalCodes in each Borough, with Borough as a
# regular column. Only the coordinate columns are averaged: the text columns in
# df cannot be, and mean() raises on them with current pandas versions.
means = df.groupby('Borough')[['Latitude', 'Longitude']].mean().reset_index()
means

Unnamed: 0,Borough,Latitude,Longitude
0,Central Toronto,43.70198,-79.398954
1,Downtown Toronto,43.654169,-79.383665
2,East Toronto,43.669436,-79.324654
3,East York,43.700303,-79.335851
4,Etobicoke,43.660043,-79.542074
5,Mississauga,43.636966,-79.615819
6,North York,43.750727,-79.429338
7,Queen's Park,43.662301,-79.389494
8,Scarborough,43.766229,-79.249085
9,West Toronto,43.652653,-79.44929


In [36]:
# Palette used to colour the circle markers, one entry per group.
# Circle markers take CSS colours, so every entry must be a valid CSS colour
# name. 'lightred' and 'darkpurple' are folium *Icon* colour names and are not
# CSS colours; they are replaced by the closest CSS names
# ('lightcoral' and 'indigo') so that those groups are drawn as intended.
folium_colours = [
    'red', 'blue', 'green', 'purple', 'orange', 'darkred',
    'lightcoral', 'darkblue', 'cadetblue',
    'indigo', 'pink', 'lightblue', 'lightgreen',
    'gray', 'black', 'lightgray',
]

## Postal Code centres coloured by existing Borough labels
Below are the central points of each PostalCode. Colour represents the Borough that it belongs to. 

In [37]:
# Centre the map on the mean of the Borough centroids. The coordinate columns
# are selected explicitly because `means` also holds the text 'Borough' column,
# which cannot be averaged.
lat_centre, long_centre = means[['Latitude', 'Longitude']].mean()
postcodes_map = folium.Map(location=[lat_centre, long_centre], zoom_start=11)

# One colour per Borough, in order of how many PostalCodes each Borough has.
# The modulo wraps around the palette instead of raising an IndexError if there
# are ever more Boroughs than colours.
for colour_counter, borough in enumerate(df['Borough'].value_counts().index):
    filtered_df = df[df['Borough'] == borough]
    loop_colour = folium_colours[colour_counter % len(folium_colours)]
    for lat, lng, label in zip(filtered_df['Latitude'], filtered_df['Longitude'], filtered_df['PostalCode']):
        folium.features.CircleMarker(
            [lat, lng],
            radius=15,
            color=loop_colour,
            popup=label,
            fill=True,
            fill_color=loop_colour,
            fill_opacity=0.6,
        ).add_to(postcodes_map)

# display map
postcodes_map

# Cluster PostalCodes with DBSCAN and K-means
The existing Boroughs are not very uniform. Perhaps they were formed by a long history of politics. Next I will cluster the PostalCodes, first using k-means and then using DBSCAN, to create more uniform groupings.

## K-means clustering
I will cluster into 11 groups, which is close to the number of existing Boroughs (the grouped table above lists 10).

In [38]:
from sklearn.cluster import KMeans 
import numpy as np

# Feature matrix: one row per PostalCode with two attributes (Latitude, Longitude)
X = np.asarray(df[['Latitude', 'Longitude']])

# random_state pins the centroid initialisation so that the clusters, and every
# map and venue lookup derived from them, are the same on each run.
k_means = KMeans(init="k-means++", n_clusters=11, n_init=12, random_state=0)
k_means.fit(X)

# Integer cluster label per row; the labels line up with the rows of X, so the
# first label belongs to the first PostalCode.
df['k-means_labels'] = k_means.labels_
# Array of (latitude, longitude) coordinates of the final centroids.
k_means_centroids = k_means.cluster_centers_

In [40]:
# generate map centred on the mean lat/long of our selection
kmeans_map = folium.Map(location=[lat_centre, long_centre], zoom_start=11)

# One colour per k-means cluster, largest cluster first. The modulo wraps
# around the palette instead of raising an IndexError if the number of
# clusters ever exceeds the number of colours.
for colour_counter, cluster in enumerate(df['k-means_labels'].value_counts().index):
    filtered_df = df[df['k-means_labels'] == cluster]
    loop_colour = folium_colours[colour_counter % len(folium_colours)]
    for lat, lng, label in zip(filtered_df['Latitude'], filtered_df['Longitude'], filtered_df['PostalCode']):
        folium.features.CircleMarker(
            [lat, lng],
            radius=15,
            color=loop_colour,
            popup=label,
            fill=True,
            fill_color=loop_colour,
            fill_opacity=0.6,
        ).add_to(kmeans_map)

# display map
kmeans_map

The k-means clustering is more uniform than the existing Boroughs

## DBSCAN clustering
I will cluster again using DBSCAN to see if it produces a useful result

In [49]:
from sklearn.cluster import DBSCAN 

# eps is measured in the same units as X (degrees of latitude/longitude);
# min_samples is the neighbourhood size needed for a point to be a core point.
epsilon = 0.02
minimumSamples = 3

db = DBSCAN(eps=epsilon, min_samples=minimumSamples)
db.fit(X)

# one label per row of X; -1 marks points that DBSCAN treats as noise
df['DBSCAN_labels'] = db.labels_

In [43]:
# generate map centred on the mean lat/long of our selection
DBSCAN_map = folium.Map(location=[lat_centre, long_centre], zoom_start=11)

colour_counter = 0
for cluster in df['DBSCAN_labels'].value_counts().index:
    if cluster == -1:
        # noise points: drawn faint and grey, and they do not use up a palette colour
        loop_colour = 'grey'
        opac = 0.1
    else:
        # wrap around the palette rather than raising if there are many clusters
        loop_colour = folium_colours[colour_counter % len(folium_colours)]
        opac = 0.6
        colour_counter += 1
    filtered_df = df[df['DBSCAN_labels'] == cluster]

    for lat, lng, label in zip(filtered_df['Latitude'], filtered_df['Longitude'], filtered_df['PostalCode']):
        folium.features.CircleMarker(
            [lat, lng],
            radius=15,
            color=loop_colour,
            popup=label,
            fill=True,
            fill_color=loop_colour,
            fill_opacity=opac,
        ).add_to(DBSCAN_map)

# display map
DBSCAN_map

DBSCAN was not as effective in this instance. Most of the PostalCodes are classed as outliers.

# Finally, find the most common venue at the centre of each new cluster
- Query the Foursquare API to retrieve venues nearby the centroid of my new clusters
- Filter out uncategorised venues
- Find the most common category of venue at each centroid
- Display the new 'neighborhoods' (created by k-means clustering) with labels showing the most common venue at that location

- Limitation: Foursquare limits the returned results to 30 entries, so I will have at most 30 venues at each centroid

In [44]:
# Foursquare credentials are read from the environment so that they are never
# stored in (or published with) the notebook. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel; a missing variable
# raises a KeyError naming it.
CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID']
CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET']
VERSION = '20180604'
radius = 2000  # passed to the API as the search radius around each centroid

# Query the venue search endpoint once per k-means centroid. The replies are
# collected in a list and concatenated once at the end, instead of growing a
# DataFrame inside the loop.
venue_frames = []
for cluster, (lat, long) in enumerate(k_means_centroids):
    call = r.get(
        'https://api.foursquare.com/v2/venues/search',
        params={
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, long),
            'radius': radius,
        },
        timeout=30,
    )
    # fail loudly on an HTTP error rather than on a missing key in the reply
    call.raise_for_status()
    replydf = json_normalize(call.json()['response']['venues'])
    # remember which centroid (cluster index) this batch of venues belongs to
    replydf['k-means_label'] = cluster
    venue_frames.append(replydf)

# sort=True keeps the columns in sorted order when replies have differing columns
df_venues = pd.concat(venue_frames, sort=True) if venue_frames else pd.DataFrame()

In [45]:
# categories are in a dictionary. This function will extract them into a simple format
def extract_category(cell):
    """Return the name of the first category in ``cell``.

    ``cell`` is expected to be a list of category dictionaries, each with a
    ``'name'`` key. The string ``"None"`` is returned when no name can be
    read: the list is empty, the entry has no ``'name'`` key, or the cell is
    not a list at all (for example a missing value).
    """
    try:
        return cell[0]['name']
    except (IndexError, KeyError, TypeError):
        # Only the expected lookup failures are treated as "no category";
        # anything else is a genuine error and is allowed to propagate.
        return "None"
# Reduce each venue's list of category dictionaries to a single category name
df_venues['categories_simplified'] = df_venues['categories'].map(extract_category)

## Visualise all venues coloured by their cluster

In [46]:
# generate map centred on the mean lat/long of our selection
venues_map = folium.Map(location=[lat_centre, long_centre], zoom_start=11)

# One colour per cluster; the modulo wraps around the palette instead of
# raising an IndexError if there are more clusters than colours.
# The markers deliberately have no popup.
for colour_counter, cluster in enumerate(df_venues['k-means_label'].value_counts().index):
    filtered_df = df_venues[df_venues['k-means_label'] == cluster]
    loop_colour = folium_colours[colour_counter % len(folium_colours)]
    for lat, lng in zip(filtered_df['location.lat'], filtered_df['location.lng']):
        folium.features.CircleMarker(
            [lat, lng],
            radius=15,
            color=loop_colour,
            fill=True,
            fill_color=loop_colour,
            fill_opacity=0.6,
        ).add_to(venues_map)

# display map
venues_map

### Find the most common venue type in each cluster while ignoring 'None' categories

In [47]:
# Number of venues per (cluster, category), ignoring uncategorised venues
categorised = df_venues[df_venues['categories_simplified'] != 'None']
sorting = (
    categorised
    .groupby(['k-means_label', 'categories_simplified'])['id']
    .count()
    .reset_index(name='count')
)

# For each cluster keep the row with the highest count. idxmax returns the
# first row holding the maximum, so ties resolve to the first category in
# sorted order.
top_rows = sorting.loc[sorting.groupby('k-means_label')['count'].idxmax()]
common_venues = dict(zip(top_rows['k-means_label'], top_rows['categories_simplified']))

# New dataframe with the centroid coordinates and most common venue type,
# built in one step. A cluster without any categorised venue is labelled
# 'None' instead of raising an error.
common_venues_df = pd.DataFrame({
    'Type': [common_venues.get(x, 'None') for x in range(len(k_means_centroids))],
    'lat': [centroid[0] for centroid in k_means_centroids],
    'long': [centroid[1] for centroid in k_means_centroids],
})

# Present the new neighbourhoods with popup labels showing the most common venue type

In [48]:
# generate map centred on the mean lat/long of our selection
common_venues_map = folium.Map(location=[lat_centre, long_centre], zoom_start=11)

# One large marker per k-means centroid; the popup shows the most common venue
# type. The outline colour is set explicitly here rather than taken from a
# variable left over from an earlier cell's loop.
for lat, lng, label in zip(common_venues_df['lat'], common_venues_df['long'], common_venues_df['Type']):
    folium.features.CircleMarker(
        [lat, lng],
        radius=40,
        color='purple',
        popup=label,
        fill=True,
        fill_color='purple',
        fill_opacity=0.6,
    ).add_to(common_venues_map)

# display map
common_venues_map