# Scraping Canada's Postal Codes from Wikipedia

This notebook scrapes the Canadian postal codes that start with the letter M from a Wikipedia page ("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M") and loads them into a pandas DataFrame.

### Import libraries

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

print("The libraries were successfully imported.")

The libraries were successfully imported.


### Import the Wikipedia html file

In [2]:
# URL of the Wikipedia page listing the Canadian postal codes that start with "M";
# the download step below fetches this address
wikipediaLink = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [3]:
# Download the page with requests and parse it with BeautifulSoup.
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(wikipediaLink, timeout=30)
# Fail loudly on an HTTP error (4xx/5xx) instead of parsing an error page
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source, 'lxml')

print("The Wikipedia html file was successfully imported.")

The Wikipedia html file was successfully imported.


In [4]:
# Print the complete html file
# print(soup.prettify())

### Extract the information of the postal codes from the table in the html file

In [5]:
# First we extract the part of the html file referring to the postal codes information (the first table)
table = soup.find("table")

# print(table.prettify())

In [6]:
# Then we narrow the table down to its <tbody> element; the rows (<tr>) are read from it in the next step
tableBody = table.find('tbody')

# print(tableBody.prettify())

In [7]:
# Walk through the table rows and collect the text of the first three cells of each one
# Each entry of the list is [postal code, borough, neighbourhood]
postalCodes = []
# The first <tr> contains the headers, so the data starts at the second one
dataRows = tableBody.find_all("tr")[1:]
for row in dataRows:
    cells = row.find_all("td")
    code, borough, neighbourhood = cells[0].text, cells[1].text, cells[2].text
    # Drop the new line indicator from the last cell of the row
    postalCodes.append([code, borough, neighbourhood.replace("\n", "")])

print(postalCodes[:10])

[['M1A', 'Not assigned', 'Not assigned'], ['M2A', 'Not assigned', 'Not assigned'], ['M3A', 'North York', 'Parkwoods'], ['M4A', 'North York', 'Victoria Village'], ['M5A', 'Downtown Toronto', 'Harbourfront'], ['M5A', 'Downtown Toronto', 'Regent Park'], ['M6A', 'North York', 'Lawrence Heights'], ['M6A', 'North York', 'Lawrence Manor'], ['M7A', "Queen's Park", 'Not assigned'], ['M8A', 'Not assigned', 'Not assigned']]


### Create the dataframe with the postal codes information

In [8]:
# Create the names of the columns (same order as the values stored in each row of postalCodes)
columns = ["PostalCode", "Borough", "Neighbourhood"]
# Create the pandas DataFrame from the list of scraped rows
df = pd.DataFrame(postalCodes, columns = columns) 

df.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


In [9]:
# Keep only the rows that have information about the borough
df = df.loc[df["Borough"].ne("Not assigned")]

df.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


In [10]:
# Assign the borough's name to the neighbourhoods without a name.
# `df` is the result of a boolean filter, so writing a column into it with
# df['Neighbourhood'] = ... raises SettingWithCopyWarning; assign() builds a new frame instead.
df = df.assign(
    Neighbourhood=df["Neighbourhood"].mask(df["Neighbourhood"] == "Not assigned", df["Borough"])
)

df.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Queen's Park
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


In [11]:
# Put all the neighbourhoods that share a postal code (and borough) in a single row,
# joined by ", ", and then order the rows by the postal code
final_df = (
    df.groupby(["PostalCode", "Borough"])["Neighbourhood"]
    .apply(lambda names: ", ".join(names.dropna().unique()))
    .reset_index()
    .sort_values(by=["PostalCode"])
)

final_df.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


### Save a csv file from the dataframe with the postal codes information

In [12]:
# Write the grouped postal codes to "postal_codes.csv" with the column names as header
# and without the DataFrame index column
final_df.to_csv("postal_codes.csv", index=False, header=True)

### Create a portable function for scraping postal codes from Wikipedia

In [13]:
# Create a complete function to scrape postal codes from the Wikipedia page
def scrapePostalCodesFromWikipedia(
    url="https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
):
    """Scrape the postal codes table of a Wikipedia page into a DataFrame.

    The first table of the page is read row by row (the first row is taken
    as the header and skipped). Rows whose borough is "Not assigned" are
    dropped, neighbourhoods that are "Not assigned" take the borough's name,
    and the neighbourhoods sharing a postal code and borough are joined
    with ", " into a single row.

    Args:
        url: Address of the page to scrape. Defaults to the list of
            Canadian postal codes starting with "M".
        timeout: Seconds to wait for the server before giving up.

    Returns:
        pandas.DataFrame with the columns "PostalCode", "Borough" and
        "Neighbourhood", ordered by postal code.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the page has no table or the table has no data rows.
    """
    # The imports are local so the function can be copied to another
    # notebook or script on its own
    from bs4 import BeautifulSoup
    import requests
    import pandas as pd

    # Import the Wikipedia html file
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error instead of parsing an error page
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    # Locate the first table of the page
    table = soup.find("table")
    if table is None:
        raise ValueError("No <table> element was found at " + str(url))
    # Fall back to the table itself when the markup has no <tbody>
    tableBody = table.find('tbody')
    if tableBody is None:
        tableBody = table

    # Create list for the information of postal codes
    postalCodes = []
    # Extract the information of postal codes from the table (first row = headers)
    for tr in tableBody.find_all("tr")[1:]:
        tds = tr.find_all("td")
        if len(tds) < 3:
            # Rows without the three expected cells cannot be indexed safely
            continue
        postalCodes.append([
            tds[0].text.strip(),
            tds[1].text.strip(),
            tds[2].text.replace("\n", "").strip(),
        ])
    if not postalCodes:
        raise ValueError("No postal code rows were found at " + str(url))

    # Create the pandas DataFrame
    df = pd.DataFrame(postalCodes, columns=["PostalCode", "Borough", "Neighbourhood"])
    df = df[df["Borough"] != "Not assigned"]
    # assign() builds a new frame, so nothing is written into the filtered
    # slice above (that pattern raises SettingWithCopyWarning)
    df = df.assign(
        Neighbourhood=df["Neighbourhood"].mask(df["Neighbourhood"] == "Not assigned", df["Borough"])
    )
    df = (df.groupby(["PostalCode", "Borough"])["Neighbourhood"]
          .apply(lambda x: ", ".join(x.dropna().unique()))
          .reset_index())

    return df.sort_values(by=["PostalCode"])

print("The scrapePostalCodesFromWikipedia function was successfully created.")

The scrapePostalCodesFromWikipedia function was successfully created.


In [14]:
# Run the whole scraping pipeline through the function defined above.
# Note that this overwrites the `df` built step by step in the earlier cells.
df = scrapePostalCodesFromWikipedia()

df.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"
