# Segmenting and Clustering Neighborhoods in Toronto

Let's import all the libraries that will be used for the clustering:

In [2]:
# data scraping and handling JSON files
from bs4 import BeautifulSoup
import requests
import json
from pandas.io.json import json_normalize

# handling data
import numpy as np
import pandas as pd

# visualization
import matplotlib.pyplot as plt
%matplotlib inline
import folium

# clustering
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

## __PART 1__: Scraping Wikipedia page to produce the neighborhoods table.

We proceed with scraping the Wikipedia page in order to obtain the information about neighborhoods in Toronto:
- first, we get the data from the web page and extract the part that contains the table,
- second, we clean the data and store it in three lists, corresponding to postcode, borough and neighborhood,
- lastly, we create a pandas dataframe to hold the table and perform the modifications suggested in the assignment (drop the rows with a 'Not assigned' borough, combine the rows that share the same postcode, and replace 'Not assigned' neighborhoods with the borough name).

In [229]:
# Download the Wikipedia page that lists the Toronto ("M") postal codes.
# A timeout keeps the cell from hanging forever on a stalled connection.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
page = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page
page.raise_for_status()

# Parse the content with BeautifulSoup and take the first <table> on the page
soup = BeautifulSoup(page.content, 'html.parser')
values = soup.find("table")
if values is None:
    raise ValueError("No <table> element found at {}".format(url))
# Flatten the table into the list of its <td> cells, in document order
# (find_all is the current name of the deprecated findAll)
table = values.find_all('td')
table[0:15]

[<td>M1A</td>, <td>Not assigned</td>, <td>Not assigned
 </td>, <td>M2A</td>, <td>Not assigned</td>, <td>Not assigned
 </td>, <td>M3A</td>, <td><a href="/wiki/North_York" title="North York">North York</a></td>, <td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
 </td>, <td>M4A</td>, <td><a href="/wiki/North_York" title="North York">North York</a></td>, <td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
 </td>, <td>M5A</td>, <td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>, <td><a href="/wiki/Harbourfront_(Toronto)" title="Harbourfront (Toronto)">Harbourfront</a>
 </td>]

In [230]:
# Clean the data and store the columns in three lists.
# `table` is the flat list of <td> cells; the table is assumed to have exactly
# three cells per row (postcode, borough, neighborhood), so the position of a
# cell modulo 3 tells us which column it belongs to.
postcode = []
borough = []
neighborhood = []
columns = (postcode, borough, neighborhood)
for i, cell in enumerate(table):
    # Read the rendered text of the cell rather than slicing its HTML source:
    # str.strip('<td>') removes any of the characters '<', 't', 'd', '>' from
    # both ends (not the literal tag), so it only worked by accident.
    # get_text(strip=True) also drops the trailing newline inside the cell.
    value = cell.get_text(strip=True)
    columns[i % 3].append(value)

print("Postcode : ", postcode[0:5])
print("Borough : ", borough[0:5])
print("Neighborhood : ", neighborhood[0:5])

Postcode :  ['M1A', 'M2A', 'M3A', 'M4A', 'M5A']
Borough :  ['Not assigned', 'Not assigned', 'North York', 'North York', 'Downtown Toronto']
Neighborhood :  ['Not assigned', 'Not assigned', 'Parkwoods', 'Victoria Village', 'Harbourfront']


In [231]:
# Assemble the three parallel lists into one table, one column per list
df = pd.DataFrame(dict(Postcode=postcode,
                       Borough=borough,
                       Neighborhood=neighborhood))
df.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


In [232]:
# Keep only the rows whose borough is known (anything but 'Not assigned')
df = df.loc[df['Borough'].ne('Not assigned')]
df.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


In [233]:
# Combine rows with same postcode  
def f(x):
    """Collapse one group of rows into a single Borough/Neighborhood record.

    x is a DataFrame with 'Borough' and 'Neighborhood' columns. The result is
    a Series holding the first distinct borough of the group and all of its
    neighborhoods joined with ', ' in row order.
    """
    first_borough = x['Borough'].unique()[0]
    joined_neighborhoods = ', '.join(x['Neighborhood'])
    return pd.Series({'Borough': first_borough,
                      'Neighborhood': joined_neighborhoods})
# Collapse every postcode to a single row using f.
# The Borough/Neighborhood columns are selected explicitly so that apply() is
# not run over the grouping column (pandas >= 2.2 warns that this is
# deprecated), and reset_index() is chained instead of mutating in place so
# the cell reads as a single expression.
df = (
    df.groupby('Postcode')[['Borough', 'Neighborhood']]
      .apply(f)
      .reset_index()
)
df.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [234]:
# Where the neighborhood is 'Not assigned', fall back to the borough name
df['Neighborhood'] = df['Neighborhood'].mask(
    df['Neighborhood'] == 'Not assigned',
    df['Borough'],
)


We print the shape of the resulting dataframe and also double-check the following:
- number of unique postcodes coincides with the number of rows in the dataframe
- there are no 'Not assigned' values in either 'Borough' or 'Neighborhood' columns

In [235]:
# Check that all postcodes are unique
# and there are no 'Not assigned' values

print("Shape of the dataframe is ", df.shape)
# nunique() counts the distinct postcodes; count() only counts non-null rows,
# so it would always match the row count and could never reveal a duplicate.
print("There are {} unique postcodes".format(df['Postcode'].nunique()))
print("Are there any 'Not assigned' boroughs? : ", 
      df['Borough'].isin(['Not assigned']).any())
print("Are there any 'Not assigned' neighborhoods? : ", 
      df['Neighborhood'].isin(['Not assigned']).any())

Shape of the dataframe is  (103, 3)
There are 103 unique postcodes
Are there any 'Not assigned' boroughs? :  False
Are there any 'Not assigned' neighborhoods? :  False
