# Segmenting and Clustering Neighborhoods in Toronto

In [226]:
import pandas as pd
import numpy as np

# Lift pandas' display limits so rendered frames are never truncated
# (None means "no limit" for both options).
pd.options.display.max_columns = None
pd.options.display.max_rows = None


## Extracting Data from URL

In [227]:
# Wikipedia page listing the Canadian postal codes that start with "M"
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# read_html returns one DataFrame per HTML table on the page; keep the first one
postal_codes_df = pd.read_html(io=url)[0]
postal_codes_df.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


## Cleaning up Dataset

#### Rename column headers

In [228]:
# Headers used from here on. They are assigned by position, so the list order
# must match the column order of the loaded table.
column_names = [
    'PostalCode',
    'Borough',
    'Neighborhood',
]
postal_codes_df.columns = column_names
postal_codes_df.head(n=10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


#### Filter out "Not assigned" Boroughs

In [229]:
# Keep only rows whose Borough is not the 'Not assigned' placeholder,
# then renumber the index from 0.
postal_codes_df = (
    postal_codes_df
    .loc[postal_codes_df['Borough'].ne('Not assigned')]
    .reset_index(drop = True)
)
postal_codes_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


#### Split postal codes with multiple neighborhoods into one row per neighborhood

In [230]:
# Rows whose Neighborhood cell lists several comma-separated names.
# NOTE: this predicate must stay complementary to the later single-neighborhood
# filter (find(",") <= 0) so that every row lands in exactly one of the two sets.
temp_postal_df = postal_codes_df[postal_codes_df['Neighborhood'].str.find(",") > 0]

# Collect one record per (postal code, borough, neighborhood) and build the
# frame once. DataFrame.append was deprecated in pandas 1.4 and removed in 2.0,
# and appending inside a loop re-copies the whole frame on every iteration.
split_records = [
    {"PostalCode": postal_code, "Borough": borough, "Neighborhood": neighborhood.strip()}
    for postal_code, borough, joined_names in zip(
        temp_postal_df["PostalCode"],
        temp_postal_df["Borough"],
        temp_postal_df["Neighborhood"],
    )
    for neighborhood in joined_names.split(",")
]
# columns= keeps the three headers (and their order) even if no row needed splitting.
neighborhood_split_df = pd.DataFrame(split_records, columns = column_names)

neighborhood_split_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M5A,Downtown Toronto,Regent Park
1,M5A,Downtown Toronto,Harbourfront
2,M6A,North York,Lawrence Manor
3,M6A,North York,Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park


In [231]:
# Keep the rows that were not split above: find(",") is -1 when there is no
# comma (or 0 when the comma is the very first character).
postal_codes_df = postal_codes_df.loc[
    postal_codes_df['Neighborhood'].str.find(",").le(0)
]
postal_codes_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
7,M3B,North York,Don Mills
10,M6B,North York,Glencairn
13,M3C,North York,Don Mills


In [232]:
# Stack the single-neighborhood rows on top of the split rows;
# ignore_index renumbers the combined frame 0..n-1.
postal_codes_df = pd.concat(
    [postal_codes_df, neighborhood_split_df],
    ignore_index = True,
)
postal_codes_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M3B,North York,Don Mills
3,M6B,North York,Glencairn
4,M3C,North York,Don Mills


#### Replace "Not assigned" Neighborhoods with their respective Borough

In [233]:
# Rows that have a real borough but only the placeholder as neighborhood.
borough_only_rows = (
    postal_codes_df['Neighborhood'].eq("Not assigned")
    & postal_codes_df['Borough'].ne("Not assigned")
)
# For those rows, use the borough name as the neighborhood name.
postal_codes_df.loc[borough_only_rows, "Neighborhood"] = postal_codes_df.loc[borough_only_rows, "Borough"]

#### Shape of Postal code dataframe

In [234]:
# (number of rows, number of columns) of the cleaned postal code frame
postal_codes_df.shape

(217, 3)