In [1]:
import pandas as pd

Scrap the data frame from the Wikipedia page:

In [2]:
df = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')[0]

Drop those rows where __Borough__ is <span style="color:blue">Not assigned</span>:

In [3]:
df = df[df['Borough'] != 'Not assigned']

If __Neighborhood__ is <span style="color:blue">Not assigned</span>, then set it to __Borough__:

In [4]:
df['Neighborhood'][df['Neighborhood'] == 'Not assigned'] = df['Borough']

Where there are multiple __Neighborhood__ entries for the same __Postcode__ and __Borough__, combine them in a single record:

In [5]:
df = df.groupby(['Postcode', 'Borough'])['Neighborhood'].apply(lambda x: ', '.join(x)).reset_index()

In [6]:
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


So what is the number of rows and columns in the dataframe?

In [7]:
print(df.shape)

(103, 3)


Now we will try to determine the geographical coordinates of the post codes via __geocoder__...

In [11]:
#!conda install -c conda-forge geocoder --yes

In [10]:
import geocoder as gc

First, run a quick test for one post code:

In [34]:
postal_code = df['Postcode'][0]
lat_lng_coords = None
while(lat_lng_coords is None):
    g = gc.google('{}, Toronto, Ontario'.format(postal_code))
    lat_lng_coords = g.latlng
latitude = lat_lng_coords[0]
longitude = lat_lng_coords[1]

KeyboardInterrupt: 

The __geocoder__ method does not seem to work, therefore we will use the workaround and load the coordinated from an external file:

In [35]:
postal_codes = pd.read_csv('http://cocl.us/Geospatial_data')

In [36]:
postal_codes.rename(columns={'Postal Code': 'Postcode'}, inplace=True)

In [37]:
postal_codes.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [38]:
new_df = pd.merge(df, postal_codes, how='right', on='Postcode')

In [39]:
new_df.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [40]:
new_df.shape

(103, 5)