# Segmenting and Clustering Neighborhoods in Toronto

### Import the necessary librarys

In [1]:
from bs4 import BeautifulSoup
import requests
import csv

import pandas as pd
import numpy as np

## Get the html from the wikipedia page with the postal codes of Canada

In [2]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

soup = BeautifulSoup(source, 'lxml')

Now that we have the html, we need to find the table in it.

In [3]:
table = soup.find('table')

Also, we need to create the csv file in which we will save the necessary data

In [4]:
csv_file = open('csv_toronto_neighbourhood.csv', 'w')
csv_writer = csv.writer(csv_file)

## Write the header in the csv file

In [5]:
row = []

for th in table.find_all('th'):
    row_value = th.text.rstrip()
    
    row.append(row_value)
print(row)
csv_writer.writerow(row)

['Postcode', 'Borough', 'Neighbourhood']


32

## Write all the data on the csv file

It's important to remove the data with no Borough assigned and write the Borough name if no neighbourhood is assigned

In [6]:
for tr in table.find_all('tr'):
    i=0
    row=[]
    discard = False
    postalCode = None
    borough = None
    neighbourhood = None
    for td in tr.find_all('td'):
        if i==0:
            postalCode = td.text.rstrip()
        elif i==1:
            if (td.text.rstrip() == 'Not assigned'):
                discard = True
            else:
                borough = td.text.rstrip()
        else:
            i=0
            if (td.text.rstrip() == 'Not assigned'):
                neighbourhood = borough
            else:
                neighbourhood = td.text.rstrip()
        i=i+1
    if (discard == False):
        row.append(postalCode)
        row.append(borough)
        row.append(neighbourhood)
        csv_writer.writerow(row)
        #print(row)
        
        

In [7]:
csv_file.close()

## Open the new csv file

In [8]:
df = pd.read_csv('csv_toronto_neighbourhood.csv')
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,,,
1,M3A,North York,Parkwoods
2,M4A,North York,Victoria Village
3,M5A,Downtown Toronto,Harbourfront
4,M5A,Downtown Toronto,Regent Park


In [9]:
df.shape

(213, 3)

## Group Neighbourhoods with the same Postcode

In [10]:
df_mod = df.groupby(['Postcode', 'Borough'])['Neighbourhood'].apply(lambda x: "%s" % ', '.join(x))
df_mod = df_mod.reset_index()
df_mod.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [11]:
df_mod.shape

(103, 3)