# Notebook to extend the pandas dataframe with coordinates

In [1]:
# --- Standard library ---
import json  # library to handle JSON files
import urllib.request  # download the web page that is scraped below

# --- Data handling ---
import numpy as np
import pandas as pd

# `json_normalize` (transform JSON into a pandas dataframe) is exposed as
# `pandas.json_normalize` since pandas 1.0; the `pandas.io.json` location is the
# deprecated one, so it is only used as a fallback for old pandas versions.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize

# Show every column and every row when a dataframe is displayed
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# --- Web access, scraping and geocoding ---
import requests  # library to handle requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim  # convert an address into latitude and longitude values

# --- Matplotlib and associated plotting modules ---
import matplotlib.cm as cm
import matplotlib.colors as colors

# --- Clustering and map rendering ---
from sklearn.cluster import KMeans
import folium  # map rendering library

print('Import done')

Import done


### Shortened code to create the dataframe as in part 1, but without outputs

In [2]:
# URL/web page to be scraped to get the postal codes of Canada (codes starting with "M", per the URL)
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Open the url using urllib.request and parse the HTML into the BeautifulSoup parse tree format.
# The context manager closes the HTTP response as soon as the document has been parsed.
with urllib.request.urlopen(url) as page:
    soup = BeautifulSoup(page, "lxml")

# The table of interest is located through its CSS classes
right_table = soup.find('table', class_='wikitable sortable')

# Fail here with a clear message instead of an AttributeError on None further down
if right_table is None:
    raise ValueError("No table with class 'wikitable sortable' found at {}".format(url))

# Gather the <td> cells of every table row, keeping only rows that have exactly three cells
# (postal code, borough, neighborhood); other rows are skipped.
cells_per_row = [tr.findAll('td') for tr in right_table.findAll('tr')]
three_cell_rows = [tds for tds in cells_per_row if len(tds) == 3]

# First text node of each cell, split out column by column
Postcode = [tds[0].find(text=True) for tds in three_cell_rows]
Borough = [tds[1].find(text=True) for tds in three_cell_rows]
Neighborhood = [tds[2].find(text=True) for tds in three_cell_rows]

# Assemble the desired pandas dataframe from the three columns
CA_postcodes = pd.DataFrame({'PostalCode': Postcode, 'Borough': Borough, 'Neighborhood': Neighborhood})

# Rows whose borough is 'Not assigned' carry no usable information: locate them by index label ...
has_unassigned_borough = CA_postcodes['Borough'] == 'Not assigned'
index_Notassigned = CA_postcodes[has_unassigned_borough].index

# ... remove them, and strip the '\n' characters from all remaining values.
# The original index labels are kept (no reset), exactly as before.
CA_postcodes = CA_postcodes.drop(index_Notassigned).replace('\n', '', regex=True)

# Identify not assigned neighborhoods and replace 'Not assigned' with the corresponding borough.
# A boolean mask with .loc updates exactly the matching rows. The earlier counter-based version
# addressed rows through a hand-maintained offset (counter + 1 used as index label), which only
# hits the right row while the index labels happen to line up with that offset, and it wrote
# through chained indexing, which pandas does not guarantee to modify the original frame.
neighborhood_unassigned = CA_postcodes['Neighborhood'] == 'Not assigned'
CA_postcodes.loc[neighborhood_unassigned, 'Neighborhood'] = CA_postcodes.loc[neighborhood_unassigned, 'Borough']
        
# Collapse all neighborhoods that share a postal code / borough pair into one comma-separated string.
# sort=False keeps the groups in order of first appearance.
neighborhoods_by_code = CA_postcodes.groupby(['PostalCode', 'Borough'], sort=False)['Neighborhood']
joined_neighborhoods = neighborhoods_by_code.apply(', '.join)
final_df = joined_neighborhoods.to_frame(name = 'Neighborhood').reset_index()

# Report the dimensions of the resulting dataframe and preview it
n_rows, n_cols = final_df.shape[0], final_df.shape[1]
print('The dataframe has', n_rows, 'rows and', n_cols, 'columns')
final_df.head(12)

The dataframe has 103 rows and 3 columns


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


In [3]:
pip install geocoder

Note: you may need to restart the kernel to use updated packages.


### Get coordinates using the ArcGIS service and extend the dataframe with the coordinates

In [4]:
import geocoder

# Upper bound on lookups per postal code, so a code the service cannot resolve
# stops the cell with a clear error instead of looping forever.
MAX_GEOCODE_ATTEMPTS = 10
# Print a progress line for every `cnt`-th entry
cnt = 10

# Coordinates are collected in plain lists and attached to the dataframe in one step after the
# loop; this avoids element-wise chained assignment (final_df[col][i] = ...), which pandas
# does not guarantee to write back to the dataframe.
latitudes = []
longitudes = []

for i, postal_code in enumerate(final_df['PostalCode']):
    lat_lng_coords = None

    # Query the ArcGIS geocoder until it yields coordinates; a None `latlng` is treated as
    # "no result yet" and triggers another attempt.
    for _ in range(MAX_GEOCODE_ATTEMPTS):
        g = geocoder.arcgis('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = g.latlng
        if lat_lng_coords is not None:
            break
    else:
        raise RuntimeError(
            'No coordinates returned for postal code {} after {} attempts'.format(
                postal_code, MAX_GEOCODE_ATTEMPTS
            )
        )

    # Only reached with valid coordinates: [latitude, longitude]
    latitudes.append(lat_lng_coords[0])
    longitudes.append(lat_lng_coords[1])

    # Progress checker for every ten updated entries (printed once per entry, not once per attempt)
    if i % cnt == 0:
        print('Coordinates appended for',i,'of',len(final_df),'entries')

# Add latitude and longitude to the pandas dataframe as numeric columns
final_df['Latitude'] = latitudes
final_df['Longitude'] = longitudes

final_df.head(12)

Coordinates appended for 0 of 103 entries
Coordinates appended for 10 of 103 entries
Coordinates appended for 20 of 103 entries
Coordinates appended for 30 of 103 entries
Coordinates appended for 40 of 103 entries
Coordinates appended for 50 of 103 entries
Coordinates appended for 60 of 103 entries
Coordinates appended for 70 of 103 entries
Coordinates appended for 80 of 103 entries
Coordinates appended for 90 of 103 entries
Coordinates appended for 100 of 103 entries


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7524,-79.3293
1,M4A,North York,Victoria Village,43.7304,-79.3133
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.6551,-79.3626
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.7231,-79.4516
4,M7A,Queen's Park,Queen's Park,43.6611,-79.391
5,M9A,Etobicoke,Islington Avenue,43.6622,-79.5284
6,M1B,Scarborough,"Rouge, Malvern",43.8115,-79.1955
7,M3B,North York,Don Mills North,43.7492,-79.3619
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.7075,-79.3118
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.6574,-79.3782
