# Segmenting and Clustering Neighborhoods in Toronto

Import necessary libraries

In [1]:
import numpy as np
import pandas as pd

from urllib.request import urlopen
from bs4 import BeautifulSoup

!conda install -c conda-forge folium=0.5.0 --yes
import folium # map rendering library

import matplotlib.cm as cm
import matplotlib.colors as colors

import requests

from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    folium-0.5.0               |             py_0          45 KB  conda-forge
    altair-3.2.0               |           py36_0         770 KB  conda-forge
    certifi-2019.9.11          |           py36_0         147 KB  conda-forge
    branca-0.3.1               |             py_0          25 KB  conda-forge
    ca-certificates-2019.9.11  |       hecc5488_0         144 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    openssl-1.1.1c             |       h516909a_0         2.1 MB  conda-forge
    ------------------------------------------------------------
                                           Total:         3.3 MB

The following NEW packages will be 

## Section 1: Scrape Canadian postal codes from Wikipedia

#### Get the HTML content of the given URL

In [2]:
def getHTMLContent(link):
    """Download a web page and parse it with BeautifulSoup.

    Parameters
    ----------
    link : str
        URL of the page to fetch (passed straight to ``urlopen``).

    Returns
    -------
    bs4.BeautifulSoup
        The document parsed with the built-in ``'html.parser'`` backend.

    Raises
    ------
    urllib.error.URLError
        Propagated from ``urlopen`` when the request fails.
    """
    # Use the response as a context manager so the underlying connection is
    # closed even if parsing raises; the original left it open.
    with urlopen(link) as html:
        soup = BeautifulSoup(html, 'html.parser')
    return soup

#### Get the raw table and transform it into a _Pandas_ DataFrame:
1. First select the HTML _table_ having the class _wikitable sortable_
1. Transform the HTML into a list of _rows_, based on the _tr_ tag
1. For each _row_, get the content of the markup _td_

In [3]:
# Source page listing the Canadian postal codes that start with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
content = getHTMLContent(url)

# find() returns None when nothing matches; fail with a clear message here
# rather than with an opaque AttributeError on the next line.
table_html = content.find('table', {'class': 'wikitable sortable'})
if table_html is None:
    raise ValueError("No 'wikitable sortable' table found at {}".format(url))
rows = table_html.find_all('tr')

# First entry holds the column names; the scraped records follow.
table = [['PostalCode', 'Borough', 'Neighborhood']]

# rows[0] is skipped (the page's own header row).
for row in rows[1:]:
    cells = [cell.get_text().strip() for cell in row.find_all('td')]
    # A <tr> without any <td> would otherwise become an all-empty record.
    if cells:
        table.append(cells)

df = pd.DataFrame(columns=table[0], data=table[1:])
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


#### Replace "Not assigned" values with `np.nan`, then drop the rows that have no Borough

In [4]:
# Turn the "Not assigned" placeholder into a real missing value.
df = df.replace("Not assigned", np.nan)
print("Shape before dropping NaN: {}".format(df.shape))

# Keep only rows that have a Borough, then renumber the index from 0.
df = df.dropna(subset=["Borough"]).reset_index(drop=True)
print("Shape after dropping NaN: {}".format(df.shape))

Shape before dropping NaN: (288, 3)
Shape after dropping NaN: (211, 3)


In [5]:
# Preview the first five rows of df.
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


#### When the Neighborhood is NaN, default it to the Borough's value

In [6]:
# Row 6, column 2 is a Neighborhood cell used as a before/after spot check.
print("Value before filling NA: {}".format(df.iat[6, 2]))
# Keep existing neighborhood names; where one is missing, take the Borough of the same row.
df["Neighborhood"] = df["Neighborhood"].where(df["Neighborhood"].notna(), df["Borough"])
print("Value after filling NA: {}".format(df.iat[6, 2]))

Value before filling NA: nan
Value after filling NA: Queen's Park


#### Group the table by Postal Code (to have only one entry per Postal Code)
First, check that some Postal Codes appear on several rows

In [7]:
# Select every row whose PostalCode is M5A.
df[df["PostalCode"]=="M5A"]

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park


Group by _PostalCode_ and _Borough_, and join each group's neighborhood names into one comma-separated string.  
Then remove the resulting duplicate rows.  
Check the result.

In [8]:
# Give every row the comma-separated list of all neighborhoods that share its
# (PostalCode, Borough) pair; transform keeps the original row count and index.
neighborhoods_by_code = df.groupby(["PostalCode", "Borough"])["Neighborhood"]
df["Neighborhood"] = neighborhoods_by_code.transform(", ".join)

# Rows of the same group are now identical, so keep only the first of each.
df = df.drop_duplicates()

# Spot check on a postal code that had several rows.
df.loc[df["PostalCode"] == "M5A"]

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"


In [9]:
# Preview the first five rows of df.
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
4,M6A,North York,"Lawrence Heights, Lawrence Manor"
6,M7A,Queen's Park,Queen's Park


In [10]:
# (number of rows, number of columns) of df.
df.shape

(103, 3)

This is the end of the scraping section