### Notebook for scrapping Postal_Code , Borough and Neighborhood from given Wikipedia page.

#### Import Libraries

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# get response from the given wikipedia page

website_url = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

In [3]:
# convert above response in HTML format using BeautifulSoup.

soup = BeautifulSoup(website_url,'lxml')

In [4]:
# In the above HTML page table of interest is "wikitable sortable".
# We can access the table using below code.
my_table = soup.find('table',{'class':'wikitable sortable'})
my_table

<table class="wikitable sortable">
<tbody><tr>
<th>Postcode</th>
<th>Borough</th>
<th>Neighbourhood
</th></tr>
<tr>
<td>M1A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M2A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M3A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
</td></tr>
<tr>
<td>M4A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Harbourfront_(Toronto)" title="Harbourfront (Toronto)">Harbourfront</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Regent_Park" title="Regent Park">Regent Park</a>
</td></tr>
<tr>
<td>M6A</td>

In [5]:
columns = []           # list for header / columns of the table.
all_data = []          # list for all <td> values. 
postal_codes = []      # list for all postal codes i.e. every third item of all_data[] list starting from 0 index.
borough = []           # list for all borough i.e. every third item of all_data[] list starting from 1 index.
neighborhood = []      # list for all neighnorhood i.e. every third item of all_data[] list starting from 2 index.

In [6]:
# get all <th> values from the table i.e. column names for our dataframe.
# removing 'newline' from last column.

cols = my_table.find_all('th')
for c in cols:
    columns.append(c.text)
columns[2] = columns[2].replace('\n','')
columns

['Postcode', 'Borough', 'Neighbourhood']

In [7]:
#populate all_data[] with all <td> values and removing 'newline' from last column.

all_d = my_table.find_all('td')
for a in all_d:
    all_data.append(a.text)
    
for i in range(len(all_data)):
    all_data[i] = all_data[i].replace('\n','')

all_data

['M1A',
 'Not assigned',
 'Not assigned',
 'M2A',
 'Not assigned',
 'Not assigned',
 'M3A',
 'North York',
 'Parkwoods',
 'M4A',
 'North York',
 'Victoria Village',
 'M5A',
 'Downtown Toronto',
 'Harbourfront',
 'M5A',
 'Downtown Toronto',
 'Regent Park',
 'M6A',
 'North York',
 'Lawrence Heights',
 'M6A',
 'North York',
 'Lawrence Manor',
 'M7A',
 "Queen's Park",
 'Not assigned',
 'M8A',
 'Not assigned',
 'Not assigned',
 'M9A',
 'Etobicoke',
 'Islington Avenue',
 'M1B',
 'Scarborough',
 'Rouge',
 'M1B',
 'Scarborough',
 'Malvern',
 'M2B',
 'Not assigned',
 'Not assigned',
 'M3B',
 'North York',
 'Don Mills North',
 'M4B',
 'East York',
 'Woodbine Gardens',
 'M4B',
 'East York',
 'Parkview Hill',
 'M5B',
 'Downtown Toronto',
 'Ryerson',
 'M5B',
 'Downtown Toronto',
 'Garden District',
 'M6B',
 'North York',
 'Glencairn',
 'M7B',
 'Not assigned',
 'Not assigned',
 'M8B',
 'Not assigned',
 'Not assigned',
 'M9B',
 'Etobicoke',
 'Cloverdale',
 'M9B',
 'Etobicoke',
 'Islington',
 'M9B',
 

In [8]:
#polulate Postal_Codes[] , Borough[] and Neighborhood[].

postal_codes = all_data[::3]
borough = all_data[1::3]
neighborhood = all_data[2::3]


In [9]:
# Creating Dataframe from above data.
df = pd.DataFrame() #creating empty dataframe.

#Populating columns
df[columns[0]] = postal_codes
df[columns[1]] = borough
df[columns[2]] = neighborhood
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


### Now we need to clean the above dataframe using below conditions:
1.Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.

2.If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough. So for the 9th cell in the table on the Wikipedia page, the value of the Borough and the Neighborhood columns will be Queen's Park

In [10]:
df = df[df.Borough != 'Not assigned'] #satisfies first condition.
df.Neighbourhood.replace('Not assigned' , df.Borough , inplace = True)  #satisfies second condition.
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Queen's Park
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


In [11]:
# shape of the resulting dataframe
df.shape

(211, 3)

## End of Notebook