In [1]:

#Import the required Libraries
from io import StringIO

import numpy as np
import pandas as pd
import requests
# Visible confirmation that the import cell ran without raising.
print("Imported Libraries")

Imported Libraries


In [2]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
url  = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# requests applies no timeout by default, so without one this cell could block
# forever if the server never answers. 30 seconds is generous for a single page.
page = requests.get(url, timeout=30)

# Report the outcome; any status other than 200 is treated as a failed download.
if page.status_code == 200:
    print('URL downloaded successfully')
else:
    print('ERROR in Downloading. Error code: {}'.format(page.status_code))

URL downloaded successfully


In [4]:
# Parse the first HTML table from the page that was already downloaded into
# `page`, instead of letting pandas fetch the URL a second time. Reusing the
# response means the parsed table comes from the same request whose status
# code was checked in the download cell.
# Cells containing "Not assigned" are read as NaN so the affected rows can be
# dropped or filled with dropna/fillna in the following cells.
df_Canada = pd.read_html(StringIO(page.text), header=0, na_values=['Not assigned'])[0]
df_Canada.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,,
1,M2A,,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [5]:
# Remove every row that has no "Borough" value (NaN after parsing).
# The frame is modified in place, exactly like dropna(..., inplace=True).
borough_missing = df_Canada['Borough'].isna()
df_Canada.drop(index=df_Canada.index[borough_missing], inplace=True)

In [8]:
# Boolean mask of rows whose "Neighbourhood" is missing; rows without a
# "Borough" were dropped earlier, so these rows still have a borough.
neighbourhood_missing = df_Canada['Neighbourhood'].isna()

# Count the flagged rows and report the total.
n_empty_neighborhood = int(neighbourhood_missing.sum())
print('Number of rows in Neighborhood column which are empty: {}'.format(n_empty_neighborhood))

# Display the rows in which "Neighbourhood" is empty but "Borough" exists.
df_Canada.loc[neighbourhood_missing]

Number of rows in Neighborhood column which are empty: 1


Unnamed: 0,Postcode,Borough,Neighbourhood
8,M7A,Queen's Park,


In [9]:
# Replace missing "Neighbourhood" values with the "Borough" name of the same row.
# The filled column is assigned back explicitly instead of calling
# fillna(..., inplace=True) on df_Canada['Neighbourhood']: that form operates on
# an intermediate Series, which newer pandas versions warn about and which
# leaves df_Canada unchanged when Copy-on-Write is enabled.
df_Canada['Neighbourhood'] = df_Canada['Neighbourhood'].fillna(df_Canada['Borough'])

# Recount the missing values. The count is expected to be 0 because rows
# without a "Borough" were already dropped from df_Canada.
n_empty_neighborhood = df_Canada[df_Canada['Neighbourhood'].isna()].shape[0]
print('Number of rows in Neighborhood column which are empty: {}'.format(n_empty_neighborhood))

Number of rows in Neighborhood column which are empty: 0


In [10]:
# Show the "Queen's Park" row again to confirm its "Neighbourhood" was filled in.
is_queens_park = df_Canada['Borough'].eq("Queen's Park")
df_Canada.loc[is_queens_park]

Unnamed: 0,Postcode,Borough,Neighbourhood
8,M7A,Queen's Park,Queen's Park


In [11]:
# Collapse the table to one row per (Postcode, Borough) pair, joining all
# neighbourhoods of that pair into a single comma-separated string.
# reset_index() turns the two grouping keys back into ordinary columns.
df_postcodes = (
    df_Canada
    .groupby(['Postcode', 'Borough'])['Neighbourhood']
    .agg(', '.join)
    .reset_index()
)
df_postcodes.head(5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [12]:
# Show the "Downtown Toronto" rows so they can be compared with the dataframe
# shown in the assignment.
is_downtown = df_postcodes['Borough'].eq('Downtown Toronto')
df_postcodes.loc[is_downtown]

Unnamed: 0,Postcode,Borough,Neighbourhood
50,M4W,Downtown Toronto,Rosedale
51,M4X,Downtown Toronto,"Cabbagetown, St. James Town"
52,M4Y,Downtown Toronto,Church and Wellesley
53,M5A,Downtown Toronto,"Harbourfront, Regent Park"
54,M5B,Downtown Toronto,"Ryerson, Garden District"
55,M5C,Downtown Toronto,St. James Town
56,M5E,Downtown Toronto,Berczy Park
57,M5G,Downtown Toronto,Central Bay Street
58,M5H,Downtown Toronto,"Adelaide, King, Richmond"
59,M5J,Downtown Toronto,"Harbourfront East, Toronto Islands, Union Station"


In [13]:
# Report the (rows, columns) shape of the grouped dataset.
dataset_shape = df_postcodes.shape
print('The shape of dataset is:', dataset_shape)

The shape of dataset is: (103, 3)


In [15]:
# Save the grouped dataset as a .csv file so it can be reused in future projects.
# to_csv is called with its defaults, so the row index is written as the first,
# unnamed column of the file.
output_file = 'Canada_Postcodes.csv'
df_postcodes.to_csv(output_file)
print("File Saved")

File Saved
