# 1. Scrape the Wikipedia page

In [1]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Download the Wikipedia page and parse its HTML with the lxml parser.
# The timeout keeps this cell from hanging indefinitely on a stalled
# connection, and raise_for_status() stops on an HTTP error response instead
# of silently parsing an error page.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source,'lxml')

### Find the Table

In [3]:
# Locate the first <table> whose class attribute is 'wikitable sortable'.
table = soup.find('table',{'class':'wikitable sortable'})
# find() returns None when nothing matches; fail here with a clear message
# instead of with an AttributeError when the rows are iterated later.
if table is None:
    raise ValueError("No table with class 'wikitable sortable' found on the page")

### Iteration: loop through the rows to get the data

In [4]:
# One accumulator list per output column; they are filled row by row while
# walking the table.
PostalCode, Borough, Neighbourhood = [], [], []

In [5]:
# Empty the accumulators first so that re-running this cell does not append
# the same rows a second time.
PostalCode.clear()
Borough.clear()
Neighbourhood.clear()

# Only rows with exactly three <td> cells are treated as data; any other row
# (presumably the header row) is skipped.
for row in table.find_all("tr"):
    cells = row.find_all("td")
    if len(cells) == 3:
        # find(string=True) returns the first text node inside the cell.
        # find_all / string= are the current spellings of the deprecated
        # findAll / text= used previously.
        PostalCode.append(cells[0].find(string=True))
        Borough.append(cells[1].find(string=True))
        Neighbourhood.append(cells[2].find(string=True))

In [6]:
# Assemble the three scraped lists into one frame, one column per list,
# in the order PostalCode, Borough, Neighbourhood.
df = pd.DataFrame({
    'PostalCode': PostalCode,
    'Borough': Borough,
    'Neighbourhood': Neighbourhood,
})

# 2. Clean the table

In [7]:
# Preview the first five rows of the scraped table before any cleaning.
df.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### (1). Remove rows whose borough is "Not assigned"

In [8]:
# Flag the rows whose borough is the placeholder 'Not assigned', then drop
# them by index label. The surviving rows keep their original index values.
condition = df['Borough'].eq('Not assigned')
df = df.drop(index=df[condition].index)

In [9]:
# Preview the first five rows now that 'Not assigned' boroughs are removed.
df.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


### (2). For rows with a borough but a "Not assigned" neighbourhood, replace the neighbourhood with the borough.

In [10]:
# Trim leading/trailing whitespace from every neighbourhood name so the
# exact-match comparison against 'Not assigned' in the next step works.
df = df.assign(Neighbourhood=df['Neighbourhood'].str.strip())

In [11]:
# Where the neighbourhood is the placeholder 'Not assigned', fall back to the
# borough name; every other neighbourhood is kept unchanged.
# numpy (np) comes from the notebook's import cell rather than being imported
# mid-notebook.
df['Neighbourhood'] = np.where(df['Neighbourhood'] =='Not assigned', df['Borough'], df['Neighbourhood'])

In [12]:
# Preview the first five rows after filling in placeholder neighbourhoods.
df.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


### (3). Combine neighbourhoods that share the same postal code

In [13]:
# Collapse rows that share a (PostalCode, Borough) pair into a single row
# whose Neighbourhood is the comma-separated list of the grouped names.
# as_index=False keeps the grouping keys as ordinary columns.
by_code_and_borough = df.groupby(['PostalCode', 'Borough'], as_index=False)
df2 = pd.DataFrame(by_code_and_borough.agg(', '.join))
df2.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [14]:
# Dimensions of the combined table as (rows, columns).
df2.shape

(103, 3)