In [1]:
import pandas as pd
import urllib.request # Library for opening URLs
from bs4 import BeautifulSoup

In [2]:
# Wikipedia page listing Canadian postal codes that start with "M".
# This is the page the notebook downloads and scrapes below.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [3]:
# Download the page and keep the raw HTML bytes in `page`.
# The context manager closes the HTTP response as soon as the body has been
# read, so no connection is left open. Holding bytes (rather than the one-shot
# response stream) also means the parsing cell can be re-run without having to
# re-download: a consumed response would parse as an empty document.
with urllib.request.urlopen(url) as response:
    page = response.read()

In [4]:
# Build the BeautifulSoup parse tree from the downloaded HTML,
# using the lxml parser.
soup = BeautifulSoup(markup=page, features="lxml")

In [5]:
# View the parsed HTML, pretty-printed (uncomment to display)
# print(soup.prettify())

In [6]:
# Collect every <table> element of the parsed page into `all_tables`.
# The list is only useful for manual inspection: none of the cells shown
# here read it again (the target table is looked up separately).
all_tables=soup.find_all("table")
# all_tables  # uncomment to display every table found

In [7]:
# Select the postal-code table from the parsed page.
# A multi-word string passed to class_ only matches when the class attribute
# is exactly "wikitable sortable" (same classes, same order). If that exact
# match fails, fall back to a CSS selector that accepts both classes in any
# order or alongside additional classes.
right_table = soup.find('table', class_='wikitable sortable')
if right_table is None:
    right_table = soup.select_one('table.wikitable.sortable')
if right_table is None:
    # Fail here with a clear message rather than with an AttributeError on
    # None when the next cell tries to iterate the table rows.
    raise ValueError("No table with classes 'wikitable sortable' was found on the page")

In [8]:
# Extract postal code, borough and neighbourhood text from every data row.
# The three lists are filled in parallel: index i of each list belongs to the
# same table row.
postal=[]
borough=[]
hood=[]

for row in right_table.find_all('tr'):
    cells = row.find_all('td')
    # Only rows with exactly three <td> cells carry data; anything else
    # (presumably the header row) is skipped.
    if len(cells) != 3:
        continue
    # get_text() joins *all* text inside the cell, so values wrapped in links
    # or other markup are not truncated to their first text node, and an empty
    # cell yields "" instead of None. Newlines are removed as before and
    # surrounding whitespace is trimmed so later string comparisons
    # (e.g. against 'Not assigned') are reliable.
    code, boro, neighbourhood = (
        cell.get_text().replace("\n", "").strip() for cell in cells
    )
    postal.append(code)
    borough.append(boro)
    hood.append(neighbourhood)

In [9]:
# Assemble the three scraped lists into a single DataFrame,
# one column per list, in the order PostalCode / Borough / Neighborhood.
df = pd.DataFrame({
    'PostalCode': postal,
    'Borough': borough,
    'Neighborhood': hood,
})

In [10]:
# Preview the first five rows of the scraped table.
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [11]:
# Keep only the rows whose Borough is assigned, then renumber the rows
# from 0 (the old row labels are discarded rather than kept as a column).
df = df.loc[df['Borough'] != 'Not assigned'].reset_index(drop=True)

In [12]:
# Show the first ten rows after filtering, to check that the
# 'Not assigned' boroughs are gone and the index restarts at 0.
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [13]:
# (rows, columns) of the filtered DataFrame.
df.shape

(103, 3)