# Notebook for the Applied Data Science Capstone Project - week 3

In [1]:
#start by importing some libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd

Initially, I obtained the source Wikipedia article and parsed it using BeautifulSoup:

In [2]:
# Download the source Wikipedia article and preview the start of its HTML.
# A timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops the notebook on an HTTP error instead of
# letting an error page flow into the parsing steps that follow.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
article = response.text
article[:1000]

'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>List of postal codes of Canada: M - Wikipedia</title>\n<script>document.documentElement.className = document.documentElement.className.replace( /(^|\\s)client-nojs(\\s|$)/, "$1client-js$2" );</script>\n<script>(window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":862527922,"wgRevisionId":862527922,"wgArticleId":539066,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Communications in Ontario","Postal codes in Canada","Toronto","Ontario-related lists"],"wgBreakFrames":false,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat"

In [3]:
# Build a BeautifulSoup tree from the downloaded HTML using the lxml parser,
# then preview the first 1000 characters of the pretty-printed markup.
soup = BeautifulSoup(markup=article, features='lxml')
soup.prettify()[:1000]

'<!DOCTYPE html>\n<html class="client-nojs" dir="ltr" lang="en">\n <head>\n  <meta charset="utf-8"/>\n  <title>\n   List of postal codes of Canada: M - Wikipedia\n  </title>\n  <script>\n   document.documentElement.className = document.documentElement.className.replace( /(^|\\s)client-nojs(\\s|$)/, "$1client-js$2" );\n  </script>\n  <script>\n   (window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":862527922,"wgRevisionId":862527922,"wgArticleId":539066,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Communications in Ontario","Postal codes in Canada","Toronto","Ontario-related lists"],"wgBreakFrames":false,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSeparatorTransformTable":["",""],"wgDigitTransformTabl

My next step was to take the parsed BeautifulSoup object and find the table of postal codes:

In [4]:
# Locate the table of postal codes. Selecting on the "wikitable" class
# (rather than taking whatever <table> happens to come first) keeps the
# lookup pointed at the data table, and the explicit check turns a missing
# table into a clear error instead of an AttributeError on None.
table = soup.find('table', class_='wikitable')
if table is None:
    raise ValueError('No wikitable found in the article; the page layout may have changed.')
table.prettify()[:1000]

'<table class="wikitable sortable">\n <tbody>\n  <tr>\n   <th>\n    Postcode\n   </th>\n   <th>\n    Borough\n   </th>\n   <th>\n    Neighbourhood\n   </th>\n  </tr>\n  <tr>\n   <td>\n    M1A\n   </td>\n   <td>\n    Not assigned\n   </td>\n   <td>\n    Not assigned\n   </td>\n  </tr>\n  <tr>\n   <td>\n    M2A\n   </td>\n   <td>\n    Not assigned\n   </td>\n   <td>\n    Not assigned\n   </td>\n  </tr>\n  <tr>\n   <td>\n    M3A\n   </td>\n   <td>\n    <a href="/wiki/North_York" title="North York">\n     North York\n    </a>\n   </td>\n   <td>\n    <a href="/wiki/Parkwoods" title="Parkwoods">\n     Parkwoods\n    </a>\n   </td>\n  </tr>\n  <tr>\n   <td>\n    M4A\n   </td>\n   <td>\n    <a href="/wiki/North_York" title="North York">\n     North York\n    </a>\n   </td>\n   <td>\n    <a href="/wiki/Victoria_Village" title="Victoria Village">\n     Victoria Village\n    </a>\n   </td>\n  </tr>\n  <tr>\n   <td>\n    M5A\n   </td>\n   <td>\n    <a href="/wiki/Downtown_Toronto" title="Downtown 

Now that I have a table of postal codes, I need to get rid of the HTML tags and clean it up so I can make it into a Pandas dataframe for further processing. I start by making the rows into a list, ...

In [5]:
# Collect every <tr> of the table except the first one (the header row)
rows = table.find_all('tr')[1:]
print(rows[:5])

[<tr>
<td>M1A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>, <tr>
<td>M2A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>, <tr>
<td>M3A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
</td></tr>, <tr>
<td>M4A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
</td></tr>, <tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Harbourfront_(Toronto)" title="Harbourfront (Toronto)">Harbourfront</a>
</td></tr>]


... then I get the text from each cell into a list, ...

In [6]:
# Reduce each row to the stripped text of its <td> cells: taking .text drops
# the link markup, and strip() removes the surrounding whitespace/newlines.
cells = [
    [cell.text.strip() for cell in tr.find_all('td')]
    for tr in rows
]
print(cells[:5])

[['M1A', 'Not assigned', 'Not assigned'], ['M2A', 'Not assigned', 'Not assigned'], ['M3A', 'North York', 'Parkwoods'], ['M4A', 'North York', 'Victoria Village'], ['M5A', 'Downtown Toronto', 'Harbourfront']]


... and finally make that list of lists into a dataframe!

In [7]:
# Build a dataframe from the list of row lists, naming the three columns
df = pd.DataFrame(
    data=cells,
    columns=['PostalCode', 'Borough', 'Neighborhood'],
)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


With a dataframe, things become a lot easier. The first step is to clean it up by eliminating the postal codes that are not assigned to a borough:

In [8]:
# Drop the rows whose borough is "Not assigned".
# .copy() makes the result an independent frame, so the later in-place
# assignment to the Neighborhood column does not operate on a slice of the
# original dataframe (which can raise SettingWithCopyWarning).
df = df[df.Borough != 'Not assigned'].copy()
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


Then I check to see if there are any boroughs without a neighborhood name (i.e. 'Not assigned') and make the neighborhood name the same as the borough's, per the assignment instructions:

In [9]:
# Show the remaining rows whose neighborhood is still "Not assigned"
df.loc[df['Neighborhood'].eq('Not assigned')]

Unnamed: 0,PostalCode,Borough,Neighborhood
8,M7A,Queen's Park,Not assigned


In [10]:
# For rows with no neighborhood name, reuse the borough name instead
unassigned = df['Neighborhood'] == 'Not assigned'
df.loc[unassigned, 'Neighborhood'] = df.loc[unassigned, 'Borough']
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Queen's Park
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


The final step of the assignment is to make sure we only have one row per postal code, combining neighborhoods using commas (again, per assignment instructions), and show the shape of the final dataframe:

In [11]:
# Collapse to one row per (PostalCode, Borough) pair, joining the
# neighborhood names of each group with ", "
result = (
    df.groupby(['PostalCode', 'Borough'])['Neighborhood']
    .apply(', '.join)
    .reset_index()
)
result.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [12]:
# Validate the stated goal of the grouping step -- exactly one row per
# postal code -- before reporting the final (rows, columns) shape.
assert result['PostalCode'].is_unique, 'expected exactly one row per postal code'
result.shape

(103, 3)