# Segmenting and Clustering Neighborhoods in Toronto

In [180]:
import json
from io import StringIO

import pandas as pd
import requests as r
from bs4 import BeautifulSoup

## Scrape Wikipedia for the table of neighbourhoods

In [181]:
# URL of the Wikipedia article to scrape (the list of Canadian postal codes beginning with "M")
wiki_page = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [182]:
# Download the page. The timeout keeps the cell from hanging forever on a stalled
# connection (requests waits indefinitely when none is given), and raise_for_status()
# fails fast on an HTTP error so that an error page is never handed to the parser.
wiki_scrape = r.get(wiki_page, timeout=30)
wiki_scrape.raise_for_status()

In [183]:
# Parse the raw response bytes into a navigable tree using the lxml parser
soup = BeautifulSoup(wiki_scrape.content, features='lxml')

In [184]:
# take the first <table> element on the page
table = soup.find_all('table')[0]

# Serialise the table back to HTML so pandas can parse it into a list of dataframes.
# The markup is wrapped in StringIO because passing a literal HTML string to
# pd.read_html is deprecated in recent pandas releases; a file-like object is
# accepted by old and new versions alike.
raw_df = pd.read_html(StringIO(str(table)))

In [185]:
# take the first (and only) dataframe from the list
# NOTE: this rebinds raw_df from the list to a DataFrame, so the cell is not safe to
# re-run on its own: a second run would look up a column labelled 0 instead of a list
# item. Re-run the cell that calls pd.read_html first.
raw_df = raw_df[0] 

In [186]:
# (rows, columns) of the scraped table before any cleaning
raw_df.shape

(288, 3)

## Clean the dataframe 

In [187]:
# Start from an empty DataFrame (no rows, no columns) to hold the cleaned table.
# Its columns are filled in later under the required names.
df = pd.DataFrame()

In [188]:
# Keep only the rows whose Borough is something other than 'not assigned'
# (compared case-insensitively)
has_borough = raw_df['Borough'].str.lower() != 'not assigned'
raw_df = raw_df.loc[has_borough]

# display the shape to see how many rows survived the filter
raw_df.shape

(211, 3)

#### Group the dataframe by Postcode

In [189]:
# Collapse raw_df to one row per postcode with a single groupby. `df` is built
# directly from raw_df (instead of adding columns to an existing frame), so this
# cell gives the same result however many times it is run.
postcode_agg = raw_df.groupby('Postcode').agg({
    # 'max' picks one borough per postcode; presumably every row of a postcode
    # carries the same borough, in which case any reducer returns it -- TODO confirm
    'Borough': 'max',
    # concatenate the neighbourhood names of the postcode into one
    # comma-separated string
    'Neighbourhood': ', '.join,
})

# Push the postcodes out of the index into a column, switch to the final column
# names, and fix the column order explicitly
df = (
    postcode_agg
    .reset_index()
    .rename(columns={'Postcode': 'PostalCode', 'Neighbourhood': 'Neighborhood'})
    [['PostalCode', 'Borough', 'Neighborhood']]
)

In [190]:
# Spot-check a postcode that should contain several neighborhoods: they are expected
# to appear as one comma-separated string in a single row
is_m5a = df['PostalCode'] == "M5A"
df.loc[is_m5a]

Unnamed: 0,PostalCode,Borough,Neighborhood
53,M5A,Downtown Toronto,"Harbourfront, Regent Park"


#### Replace 'not assigned' Neighborhoods with Borough names

In [191]:
# Wherever the Neighborhood is 'not assigned' (compared case-insensitively), fall
# back to the Borough name. At this point only one row is affected, but the mask
# handles any number of rows.
# Done with one boolean mask and one .loc assignment instead of a row-by-row loop.
unassigned = df['Neighborhood'].str.lower() == 'not assigned'
df.loc[unassigned, 'Neighborhood'] = df.loc[unassigned, 'Borough']

In [192]:
# Show the rows whose Neighborhood now equals the Borough, i.e. the rows that the
# previous cell was meant to rewrite
same_as_borough = df['Borough'] == df['Neighborhood']
df.loc[same_as_borough]

Unnamed: 0,PostalCode,Borough,Neighborhood
85,M7A,Queen's Park,Queen's Park


### Use the `.shape` attribute to print the number of rows in the dataframe

In [193]:
# .shape is (rows, columns); only the row count is reported
row_count, _ = df.shape
print("There are", row_count, "rows in the dataframe")

There are 103 rows in the dataframe
