# Import libraries

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Scraping data from Wikipedia

In [2]:
# Get HTML source code of the Wikipedia list of Canadian postal codes
# starting with "M".
# The timeout keeps the cell from hanging indefinitely on a stalled
# connection, and raise_for_status() stops the notebook on an HTTP error
# instead of silently parsing an error page.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
source = response.text
# Parse the markup with the lxml parser so the table can be searched.
soup = BeautifulSoup(source, 'lxml')

In [3]:
# Locate the first <table> element carrying the 'wikitable' CSS class.
match = soup.find('table', class_='wikitable')
# find() returns None when nothing matches; fail here with a clear message
# rather than with an AttributeError when the rows are read from `match`.
if match is None:
    raise ValueError("No table with class 'wikitable' was found in the downloaded page")

In [4]:
# Collect one [postal code, borough, neighborhood] list per table row.
data = []
table_rows = match.find_all('tr')
# Skip header
for row in table_rows[1:]:
    # Look the cells up once per row instead of once per column.
    cells = row.find_all('td')
    # Rows without the three expected <td> cells (e.g. extra header rows)
    # would raise IndexError, so they are skipped.
    if len(cells) < 3:
        continue
    # strip() removes the trailing newline as well as stray surrounding
    # spaces, so later comparisons against 'Not assigned' match exactly.
    data.append([cell.text.strip() for cell in cells[:3]])

# Create dataframe

In [5]:
# Turn the scraped rows into a DataFrame; the column names are given in the
# same order in which the three cells of each row were collected.
column_names = ['PostalCode', 'Borough', 'Neighborhood']
df = pd.DataFrame(data=data, columns=column_names)

In [6]:
# Preview the first rows of the freshly scraped table.
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


# Ignore rows whose borough is Not assigned

In [7]:
# Keep only the rows whose borough is assigned. A boolean mask replaces the
# index lookup followed by drop(inplace=True), so the cell yields a new frame
# instead of mutating the existing one. Row labels are left untouched.
df = df[df['Borough'] != 'Not assigned']

In [8]:
# Preview the remaining rows; the row labels keep their original values
# because the index has not been reset.
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


# Group by PostalCode and Borough, joining the neighborhoods with commas

In [9]:
# Collapse rows that share a postal code and borough into a single row whose
# Neighborhood is the comma-separated list of that group's neighborhoods.
neighborhoods_by_area = df.groupby(['PostalCode', 'Borough'])['Neighborhood']
df = neighborhoods_by_area.agg(', '.join).reset_index()

In [10]:
# Spot-check a single postal code to confirm its neighborhoods were joined.
is_m5v = df['PostalCode'] == 'M5V'
print(df[is_m5v].values)

[['M5V' 'Downtown Toronto'
  'CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara']]


# If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough

In [11]:
# Where the neighborhood is still 'Not assigned', copy the row's borough.
# The assignment goes through df.loc so that df itself is updated: calling
# replace(..., inplace=True) on df['Neighborhood'] acts on an intermediate
# object, which pandas 2.x warns about and which leaves df unchanged once
# Copy-on-Write is enabled.
unassigned = df['Neighborhood'] == 'Not assigned'
df.loc[unassigned, 'Neighborhood'] = df.loc[unassigned, 'Borough']
# Show the Queen's Park rows to check the result.
df[ df['Borough'] == "Queen's Park"].head()

Unnamed: 0,PostalCode,Borough,Neighborhood
93,M9A,Queen's Park,Queen's Park


In [12]:
# Report the size of the cleaned table as (rows, columns).
df.shape

(103, 3)

In [16]:
# Write the cleaned table to disk without the index column.
# to_csv() does not create missing folders, so make sure the output directory
# exists first; otherwise the cell fails when 'datasets/' is absent.
output_path = Path('datasets/toronto_part1.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)