# Segmenting and Clustering Neighborhoods in Toronto

In [48]:
import json
from io import StringIO

import pandas as pd
import requests as r
from bs4 import BeautifulSoup

## Scrape Wikipedia for the table of neighbourhoods

In [49]:
wiki_page = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Use requests to pull the content of the wiki page.
# The timeout stops the cell from hanging indefinitely on a stalled connection.
wiki_scrape = r.get(wiki_page, timeout=30)

# Fail loudly on an HTTP error (4xx/5xx) rather than silently parsing an error page
wiki_scrape.raise_for_status()

# parse the response using Beautiful Soup
soup = BeautifulSoup(wiki_scrape.content, 'lxml')

# take the first <table> element on the page
table = soup.find_all('table')[0]

# Convert the table back to a string so pandas can parse it into a list of dataframes.
# The markup is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated in newer pandas releases.
# Keep the first (and only) dataframe from that list.
raw_df = pd.read_html(StringIO(str(table)))[0]

# check the shape of the table
raw_df.shape

(288, 3)

## Clean the dataframe 

In [50]:
# Keep only the rows whose Borough is assigned.
# The comparison is done on lower-cased text so that inconsistent
# capitalisation of 'not assigned' cannot let a row slip through.
raw_df = raw_df.loc[raw_df['Borough'].str.lower().ne('not assigned')]

# Start from an empty dataframe with no columns;
# the columns are attached afterwards under their specified names.
df = pd.DataFrame()

# the shape shows how many rows survived the filter
raw_df.shape

(211, 3)

We have dropped 77 rows where Borough was not assigned.

#### Group the dataframe by Postcode

In [51]:
# Build one row per Postcode in a single pass.
# The Borough and Neighbourhood columns need different aggregations:
#   - Borough: a single Borough string per postcode (max of the group's values)
#   - Neighbourhood: all neighbourhood names concatenated with ', '
#
# `df` is rebuilt from `raw_df` on every run, so this cell can be re-executed safely.
# (Assigning the grouped Series onto an already-reset `df` would align on the
# wrong index and fill the columns with NaN.)
df = (
    raw_df
    .groupby('Postcode', as_index=False)  # keep Postcode as a regular column
    .agg({'Borough': 'max', 'Neighbourhood': ', '.join})
    .rename(columns={'Postcode': 'PostalCode', 'Neighbourhood': 'Neighborhood'})
)

# Check a row that should contain multiple neighborhoods to see that it's formatted correctly
df[df['PostalCode'] == "M5A"]

Unnamed: 0,PostalCode,Borough,Neighborhood
53,M5A,Downtown Toronto,"Harbourfront, Regent Park"


#### Replace 'not assigned' Neighborhoods with Borough names

In [52]:
# At this point there is in fact only one row where the neighborhood is not assigned,
# but the mask below handles any number of such rows.

# Case-insensitive match on 'not assigned'; a missing value compares as False
# and is therefore left untouched.
unassigned = df['Neighborhood'].str.lower() == 'not assigned'

# Copy the Borough name into every unassigned Neighborhood in one vectorized step
df.loc[unassigned, 'Neighborhood'] = df.loc[unassigned, 'Borough']

# Check for any rows matching the intended output of the code above
df[df['Borough'] == df['Neighborhood']]

Unnamed: 0,PostalCode,Borough,Neighborhood
85,M7A,Queen's Park,Queen's Park


### "Use the .shape method to print the number of rows of your dataframe"

In [53]:
# .shape is (rows, columns); the first entry is the row count
n_rows = df.shape[0]
print("There are", n_rows, "rows in the dataframe")

There are 103 rows in the dataframe
