# 3rd week assignment: Segmenting and Clustering Neighborhoods in Toronto

#### Importing all the libraries we need

In [1]:
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
import requests

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

import json
!pip install folium
import folium

print('Libraries imported.')

Libraries imported.


# 1. Transform the data in the table on Wikipedia page into pandas dataframe

### Web Scraping with the BeautifulSoup package

In [2]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
canada_url = requests.get(url).text

soup = BeautifulSoup(canada_url, "lxml")
# print(soup.prettify()) # prettify method displays the indented .html

my_table = soup.find("table", {"class":"wikitable sortable"}) # this will find the table of postal codes in the soup
# my_table

### Transforming the data into a pandas dataframe
First we initialize empty arrays for each category of data.

Then we iterate through the data in the table and append each piece of information to those arrays.

Finally we create the dataframe with the corresponding array for each column.

In [3]:
trs = my_table.findAll("tr")
postal_codes = []
boroughs = []
neighborhoods = []
# Every row, including the headings row, start with <tr> and end with </tr>. In each row we have 3 <td> elements: PostalCode, Borough and Neighborhood 
for k in range(1, len(trs)):
    postal_codes.append(trs[k].findAll("td")[0].text)
    boroughs.append(trs[k].findAll("td")[1].text)
    neighborhoods.append(trs[k].findAll("td")[2].text[:-1])

# print("Postal codes: ", postal_codes, "\n")
# print("Boroughs: ", boroughs, "\n")
# print("Neighborhoods: ", neighborhoods)

toronto = pd.DataFrame()
toronto["PostalCode"] = postal_codes
toronto["Borough"] = boroughs
toronto["Neighborhood"] = neighborhoods
toronto.head(15)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
8,M8A,Not assigned,Not assigned
9,M9A,Queen's Park,Not assigned


### Preprocessing the data

In [4]:
# Ignore cells with a borough that is Not assigned.
toronto = toronto[toronto["Borough"] != "Not assigned"].reset_index(drop=True)
toronto

# Not assigned neighborhood: the neighborhood will be the same as the borough
for k in range(toronto.shape[0]):
    if toronto.loc[k, "Neighborhood"] == "Not assigned":
        toronto.loc[k, "Neighborhood"] = toronto.loc[k, "Borough"]

toronto.head(15)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
5,M7A,Downtown Toronto,Queen's Park
6,M9A,Queen's Park,Queen's Park
7,M1B,Scarborough,Rouge
8,M1B,Scarborough,Malvern
9,M3B,North York,Don Mills North


In [None]:
# Combine neighborhoods in the same postal code area into one row with the neighborhoods separated with a comma (103 unique PC)
toronto = toronto.groupby(["PostalCode", "Borough"])["Neighborhood"].apply(", ".join).reset_index()
toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


# 2. Completing the data with latitude and longitude of each neighborhood

In [None]:
# Import the .csv file with the coordinates asociated to each postal code
path = "https://public.boxcloud.com/d/1/b1!__MrLzy-oVsPVOxN-FD6qjj9WfmP8yKHi2dn1jpnJEnbGLFq-vs3LsHCNqtUbtQ6sM6t4QVNjszD-_ZYtUyp08r_A_tyYXEM--ya3VDj71x39aS70emgKf-MqnO12gIV2PsjOBMAYNJ61XKOS3B1ZVMTEzAkh8KjPhi49W3cea1Ftn9nX_1NkWM8DYV0X2eg-2y31N0pcPwH3vKzbB4d_CWCZuxW2JBZwYquJKuUKMV5lZ_FOV3_H75jT7XdvUzbccZnF39SnBdCjscm6CETqJy3YMY7MnoI1FASoKUVygaJwSXWMBEMu8SOJly3ugm4bDgi3k5lsfAuW9cF18TnKi8kFuhpFtc4AJC3JIOry1ijJUMnbEdXif_el1cJElz-_XEuJwWJlxdr8v-UpcOaimtM_Lyy3JsUa-68mCnNcMEawKnS4qyYKZE8w9BJ84qs7V3k3s6lbIw3y2mYJ_1f5AsqAG0-xs_c6UJp18FgyVTZ5TN2qPFa65A3GeBpOu3LO-oqGY97NJ4nASx-RyzXG_Qx8J2Gyakxciay0OF6K46no4Wwb6vMUOdESqnMMtFW0snl7Dg_2WrmmljEBCJTC1qeW2Yo70lr7a6kkX41i8I_0LWIJBB_briH6DMG-gG4zWCkJmmfNgWaNIlIe5lSfgx1y6cRHuuy5ewtIOq7LeXNfbkL4uck3GVlL1EcYm71IAebQTpLceNVP_Fqp3FENs-6oWC4nGvwkQGEH6whFhbJKVm5hW7IztWpDnuuQ2G6OkYBgke82IJgWBfPiYbaXXSgu1IfZY08CtQ76JZaZpgLjT9YsjEZysGpV5zY1XSGDXu2Y6PnAN6NUIzicT3D5db8uP4eMvjQP6oVJgHWOWuA1MNXd3reBxUNgr-USZo0vxeiwsZwu8RTh2C_yJOCnilEowCc15WyXMFF9hS1F5z1c3GYLs2VpY_ykBjPVuz_dqXQ7dXjz0C5xDlHMSl07IqwSfxKIiT8G71l_I_RedwJV29vN0DiXDeCq4q_h3EFNyljvQb9JZGMEFy0a6wXrQplSqNZChdQvp7XMSrxDEywlGnkoF0pXUck-HD5a-VhLZXr3NJVyog2__WlKhuNgrVYdgjl7QVOmbUx-e3ZkmFL5Mzbwmx_gKNafH7XZBxSKGdrBiwyQJo3tpSaAQlQzf-twBgZ00HmABtaCOgvmbZ3rbDR2YwWM0EqdpZ3RhLVziVgW2xNvJFNSLqvPyyHRvpHPtbmmfDuLytDa_wK0m-rbOBUVZL0glMmHAR7V0Jof7dxsvS3iCO6J910CIa5kFwCCs5LZ7uSQmLaYb3DM9tVSESehAscl-cLYRcotCPOjCQ9_mmWrd4zRhznJZuOutlAcVJKFKmLgti4iuALug0imunoanfKWWDgTUf-/download"
coordinates = pd.read_csv(path)
coordinates = coordinates[["Latitude", "Longitude"]]

# Merge both dataframes into one unique dataframe with all the information
toronto = pd.concat([toronto, coordinates], axis=1, join='inner')
toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


# 3. Explore and cluster the neighborhoods in Toronto

We're going to cluster all neighborhoods (all the 103 different postal codes), because from a total of 11 boroughs only 4 of them contain the word "Toronto" (we checked it with ```toronto["Borough"].value_counts()```).

#### Creating a map of Toronto with neighborhoods superimposed on top

In [None]:
latitude = 43.651070
longitude = -79.347015
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto are 43.65107, -79.347015.


In [None]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10.5)

# Now let's add markers to the map, one for each postal code
for lat, lng, borough, neighborhood in zip(toronto['Latitude'], toronto['Longitude'], toronto['Borough'], toronto['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  

map_toronto

Note that when we select a blue point in the map it displays the name of all the neighborhoods within that Postal code and then the borough name.

Next, we are going to start utilizing the Foursquare API to explore the neighborhoods and segment them, so we need to

#### Define Foursquare credentials, version and other parameters of search

In [None]:
CLIENT_ID = 'A2VXGUOFKKDNAWAARS0SUYMLADJXSQTLMXX2X3KMGTBHEFNY'
CLIENT_SECRET = '14OHU5XP0RM5TQA3CVHWMX32CBMD2K2C23TDHFCQCMJSQH1P'
VERSION = '20180605'

print('My credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

LIMIT = 100
RADIUS = 500

My credentails:
CLIENT_ID: A2VXGUOFKKDNAWAARS0SUYMLADJXSQTLMXX2X3KMGTBHEFNY
CLIENT_SECRET:14OHU5XP0RM5TQA3CVHWMX32CBMD2K2C23TDHFCQCMJSQH1P


#### Create the dataframe ```toronto_venues``` with all the venues near to our postal codes

We use the function ```getNearbyVenues``` that we defined in the "Neighborhoods of NY" Lab to extract the venues near a location and its categories.

In [None]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, CLIENT_SECRET, VERSION, lat, lng, RADIUS, LIMIT)
        
        # make the GET request
        results = requests.get(url).json()['response']['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(name, lat, lng, v['venue']['name'], v['venue']['location']['lat'], v['venue']['location']['lng'], 
                             v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['PostalCode', 'PC Latitude', 'PC Longitude', 'Venue', 'Venue Latitude',
                             'Venue Longitude', 'Venue Category']
    
    return(nearby_venues)

Now we run the above function on each postal code and create a new dataframe called toronto_venues.

In [None]:
toronto_venues = getNearbyVenues(names = toronto['PostalCode'],
                                   latitudes = toronto['Latitude'],
                                   longitudes = toronto['Longitude']
                                  )
toronto_venues.head()

Unnamed: 0,PostalCode,PC Latitude,PC Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M1B,43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,M1C,43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
2,M1E,43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place
3,M1E,43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store
4,M1E,43.763573,-79.188711,Marina Spa,43.766,-79.191,Spa


Let's check the size of the resulting dataframe, how many venues were returned for each postal code and find out how many unique categories can be curated from all the returned venues:

In [None]:
toronto_venues.shape

(2228, 7)

In [None]:
toronto_venues.groupby('PostalCode').count().head()

Unnamed: 0_level_0,PC Latitude,PC Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
M1B,1,1,1,1,1,1
M1C,1,1,1,1,1,1
M1E,8,8,8,8,8,8
M1G,4,4,4,4,4,4
M1H,8,8,8,8,8,8


In [None]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 272 uniques categories.


#### The definitive dataframe: ```PC_venues_sorted```
We will create a new dataframe (```toronto_grouped```) where: in each row we'll have a postal code, in each column one of the 272 different categories of venues and, in each cell the mean of the frequency of occurrence of each category.

In [None]:
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add postal code column back to dataframe
toronto_onehot['PostalCode'] = toronto_venues['PostalCode'] 

# move postal code column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,PostalCode,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,M1B,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M1C,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M1E,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M1E,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M1E,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
toronto_grouped = toronto_onehot.groupby('PostalCode').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,PostalCode,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,M1B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M1C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M1E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M1G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M1H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**Be careful!** Running the next code we check that we don't have enough data to cluster this postal codes properly. For example, in the first postal code "M1B" we only have data of fast food restaurants, due to this its frequency is ```1.0``` and ```0.0``` for the other 271 categories of venues. These zeros will produce kind of randomness when we sort the most common venues in each postal code. Again thinking of "M1B", obviously the most common venue will be "Fast Food Restaurant" but then, pandas will sort the other 271 categories in some way that may not have anything in common with reality. We admite this since this assignment is for learning proposals.

In [None]:
num_top_venues = 5

for hood in toronto_grouped['PostalCode']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['PostalCode'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----M1B----
                             venue  freq
0             Fast Food Restaurant   1.0
1        Middle Eastern Restaurant   0.0
2                            Motel   0.0
3              Monument / Landmark   0.0
4  Molecular Gastronomy Restaurant   0.0


----M1C----
                 venue  freq
0                  Bar   1.0
1    Accessories Store   0.0
2   Miscellaneous Shop   0.0
3                Motel   0.0
4  Monument / Landmark   0.0


----M1E----
                 venue  freq
0          Pizza Place  0.12
1  Rental Car Location  0.12
2                  Spa  0.12
3       Medical Center  0.12
4   Mexican Restaurant  0.12


----M1G----
                venue  freq
0         Coffee Shop  0.50
1   Korean Restaurant  0.25
2    Insurance Office  0.25
3  Miscellaneous Shop  0.00
4               Motel  0.00


----M1H----
                  venue  freq
0      Hakka Restaurant  0.12
1    Athletics & Sports  0.12
2   Fried Chicken Joint  0.12
3  Caribbean Restaurant  0.12
4           Gas Stat

Next function will sort the venues in descending order of frequency and then we can create the new dataframe ```PC_venues_sorted``` to display the top 10 venues for each postal code.

In [None]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [None]:
num_top_venues = 10
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['PostalCode']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
PC_venues_sorted = pd.DataFrame(columns=columns)
PC_venues_sorted['PostalCode'] = toronto_grouped['PostalCode']

for ind in np.arange(toronto_grouped.shape[0]):
    PC_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

PC_venues_sorted

### Cluster Postal Codes
We run k-means to cluster the postal codes into 5 clusters, create a new dataframe that includes the cluster as well as the top 10 venues for each postal code and finally visualize the resulting clusters.

In [None]:
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('PostalCode', 1)

# run k-means clustering
kmeans = KMeans(n_clusters = kclusters, random_state = 0).fit(toronto_grouped_clustering)
kmeans.labels_

In [None]:
# add clustering labels
PC_venues_sorted.insert(0, 'Cluster Label', kmeans.labels_)
PC_venues_sorted

toronto_merged = toronto

# merge toronto_grouped with toronto to add latitude/longitude for each postal code
toronto_merged = toronto_merged.join(PC_venues_sorted.set_index('PostalCode'), on='PostalCode')

toronto_merged.head() # check the last columns!

Note that we have some postal codes in which cluster label and other columns are equal to NaN, this is because we did not have data from those areas in terms of venues, so next step is ignoring those postal codes in order to draw a map with the valid clusters for the rest of the areas. We also need to convert the "Cluster Label" column into integers.

In [None]:
# Ignore rows with a cluster label that is NaN.
toronto_merged.dropna(axis=0, inplace = True)

toronto_merged["Cluster Label"] = toronto_merged["Cluster Label"].astype("int")

toronto_merged.head()

In [None]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=10.5)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, hood, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['PostalCode'], toronto_merged['Cluster Label']):
    label = folium.Popup(str(hood) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters