# Segmenting and Clustering Neighborhoods in Toronto

In [1]:
import pandas as pd
import numpy as np

## Scrape the Wikipedia page

In [3]:
# Wikipedia page listing the Canadian postal codes that start with "M".
link = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# Show full cell contents instead of truncating long strings. `None` is the
# supported spelling for "no limit"; the old `-1` sentinel was deprecated in
# pandas 1.0 and is rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)

In [5]:
# Parse the HTML tables found at `link`, treating row 0 of each as the header,
# and keep only the first table returned.
tables = pd.read_html(io=link, header=0)[0]
tables.head(5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


## Extracting the table rows that have an assigned borough

In [7]:
# Drop every row whose borough is the placeholder 'Not assigned'.
tables = tables.loc[tables['Borough'].ne('Not assigned')]
tables.head(5)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [8]:
# Relabel the three columns by position (the frame displayed above has them as
# Postcode, Borough, Neighbourhood, in that order).
tables.columns = ['PostalCode', 'Borough', 'Neighborhood']

## Group by postal code

In [10]:
# Bucket the rows by postal code; later cells look groups up by key.
table_grouped = tables.groupby(by='PostalCode')
# GroupBy.head returns up to the first five rows of *each* group,
# not five rows overall.
table_grouped.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


### Join the neighborhoods in each group with commas

In [12]:
def add_comma_neighbor(grp, grouped=None):
    '''
    Return the borough and the comma-joined neighborhoods of one group.

    Parameters
    ----------
    grp : hashable
        Key of the group to look up (a postal code when the frame is
        grouped by 'PostalCode').
    grouped : pandas GroupBy, optional
        Grouped frame to read from. When omitted, the notebook-level
        ``table_grouped`` is used, so existing one-argument calls keep
        working unchanged.

    Returns
    -------
    tuple
        ``(borough, neighborhoods)``: the 'Borough' value of the group's
        first row, and all of the group's 'Neighborhood' values joined
        with ", ".

    Raises
    ------
    KeyError
        Propagated from ``get_group`` when ``grp`` is not a group key.
    '''
    if grouped is None:
        grouped = table_grouped
    # Fetch the group once and reuse it for both columns.
    members = grouped.get_group(grp)
    return members['Borough'].iloc[0], ", ".join(members['Neighborhood'])

In [17]:
# Schema of the one-row-per-postal-code frame.
column_names = [
    'PostalCode',
    'Borough',
    'Neighborhood',
]

# Start from an empty frame that carries only the header.
neighborhoods = pd.DataFrame(data=None, columns=column_names)

In [18]:
# Collect one record per postal code, then build the frame in a single step.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and growing
# a frame row by row copies it on every iteration. Rebuilding from the records
# also makes this cell idempotent: re-running it no longer stacks duplicate rows
# onto the previous result.
neighborhood_records = []
for grp in table_grouped.groups.keys():
    borough, neighbor = add_comma_neighbor(grp)
    neighborhood_records.append({'PostalCode': grp,
                                 'Borough': borough,
                                 'Neighborhood': neighbor})
# Passing `columns` keeps the header and column order even if there are no groups.
neighborhoods = pd.DataFrame(neighborhood_records, columns=column_names)
neighborhoods.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M4X,Downtown Toronto,"Cabbagetown, St. James Town"
1,M4Y,Downtown Toronto,Church and Wellesley
2,M4R,Central Toronto,North Toronto West
3,M4S,Central Toronto,Davisville
4,M4P,Central Toronto,Davisville North


In [19]:
# (rows, columns) of the per-postal-code frame built above.
neighborhoods.shape

(103, 3)

### Assign the borough as the neighborhood when a row has a borough but a "Not assigned" neighborhood

In [20]:
# List the rows whose neighborhood is still the placeholder 'Not assigned'.
neighborhoods.loc[neighborhoods['Neighborhood'].eq('Not assigned')]

Unnamed: 0,PostalCode,Borough,Neighborhood
102,M7A,Queen's Park,Not assigned


In [21]:
# Wherever the neighborhood is the 'Not assigned' placeholder, substitute the
# borough from the same row; all other neighborhoods are kept as they are.
neighborhoods['Neighborhood'] = neighborhoods['Neighborhood'].mask(
    neighborhoods['Neighborhood'].eq('Not assigned'),
    neighborhoods['Borough'],
)
# Re-run the lookup to show which placeholder rows, if any, remain.
neighborhoods.loc[neighborhoods['Neighborhood'].eq('Not assigned')]


Unnamed: 0,PostalCode,Borough,Neighborhood


In [22]:
# Shape check after the fill: values were replaced in place, so the row and
# column counts should match the earlier shape.
neighborhoods.shape

(103, 3)