# Using Geocoder to Add Latitude and Longitude

This notebook uses the Geocoder package to add the latitude and longitude of each postal code to the dataframe.

### Import libraries

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

print("The libraries were successfully imported.")

The libraries were successfully imported.


### Install and import Geocoder package

In [2]:
# !conda install -c conda-forge geocoder -y # uncomment this line if you haven't installed the package yet
import geocoder

print("The geocoder package was successfully imported.")

The geocoder package was successfully imported.


### Define function to scrape postal codes from the Wikipedia page

In [3]:
# Create a complete function to scrape postal codes from the Wikipedia page
def scrapePostalCodesFromWikipedia():
    """Scrape the Canadian postal codes starting with M from Wikipedia.

    Downloads the "List of postal codes of Canada: M" page, reads the first
    table on it, drops the rows whose borough is "Not assigned", replaces a
    "Not assigned" neighbourhood with the borough name, and joins the
    neighbourhoods that share a postal code and borough into one
    comma-separated string.

    Returns:
        pandas.DataFrame with the columns "PostalCode", "Borough" and
        "Neighbourhood", sorted by "PostalCode".

    Raises:
        requests.exceptions.RequestException: if the page cannot be downloaded
            (connection error, timeout or HTTP error status).
        ValueError: if the downloaded page contains no table.
    """
    # Imports are kept inside the function so it stays self-contained
    from bs4 import BeautifulSoup
    import requests
    import pandas as pd
    import numpy as np
    # Download the Wikipedia html file
    wikipediaLink = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
    # Without a timeout the request can wait forever on a stalled connection
    response = requests.get(wikipediaLink, timeout=30)
    # Fail loudly on an HTTP error instead of parsing an error page
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')
    # Locate the first table in the html file
    table = soup.find("table")
    if table is None:
        raise ValueError("No table was found in {}".format(wikipediaLink))
    # Fall back to the table itself when the markup has no explicit tbody element
    tableBody = table.find('tbody')
    if tableBody is None:
        tableBody = table
    # Create list for the information of postal codes
    postalCodes = []
    # Extract the information of postal codes, skipping the first (header) row
    for tr in tableBody.find_all("tr")[1:]:
        tds = tr.find_all("td")
        # Skip rows that do not hold the three expected cells
        if len(tds) < 3:
            continue
        rowList = [tds[0].text, tds[1].text, tds[2].text.replace("\n", "")]
        postalCodes.append(rowList)
    # Create the pandas DataFrame
    df = pd.DataFrame(postalCodes, columns=["PostalCode", "Borough", "Neighbourhood"])
    # .copy() makes the filtered frame independent, so the column assignment
    # below does not trigger a SettingWithCopyWarning
    df = df[df["Borough"] != "Not assigned"].copy()
    # A neighbourhood that is "Not assigned" takes the name of its borough
    df['Neighbourhood'] = np.where(df['Neighbourhood'] == "Not assigned", df['Borough'], df['Neighbourhood'])
    # One row per (postal code, borough) with its unique neighbourhoods joined by ", "
    df = (df.groupby(["PostalCode", "Borough"])["Neighbourhood"]
          .apply(lambda x: ", ".join(x.dropna().unique()))
          .reset_index())
    df = df.sort_values(by=["PostalCode"])

    return df

print("The scrapePostalCodesFromWikipedia function was successfully created.")

The scrapePostalCodesFromWikipedia function was successfully created.


### Create dataframe of postal codes and neighbourhoods

In [4]:
# Scrape the postal codes, boroughs and neighbourhoods into a dataframe
dfNeighbourhoods = scrapePostalCodesFromWikipedia()

# Show the size of the dataframe and its first ten rows
print(dfNeighbourhoods.shape)
dfNeighbourhoods.head(10)

(103, 3)


Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


### Find latitude and longitude for single postal code

In [5]:
# Select one postal code from the dataframe by its row label
indexPostalCode = 0
# Other rows that can be selected instead (row 10 and the last row):
# indexPostalCode = 10
# indexPostalCode = dfNeighbourhoods.shape[0] - 1
# .loc looks the row up by index label, not by position
postalCode = dfNeighbourhoods.loc[indexPostalCode, "PostalCode"]
postalCode

'M1B'

In [6]:
# Get latitude and longitude for selected postal code
# Build the query string that is sent to the geocoding provider
postalCodeString = "{}, Toronto, Ontario".format(postalCode)
g = geocoder.google(postalCodeString)
# latlng was None in the recorded run below, i.e. no coordinates were returned
# NOTE(review): presumably the Google provider needs an API key - confirm against the geocoder documentation
coordinates = g.latlng
print("Location: {}; Coordinates: {}".format(postalCodeString, coordinates))

Location: M1B, Toronto, Ontario; Coordinates: None


Since the package did not return any coordinates on the first attempts, we created a function that retries the request several times for each postal code.

In [7]:
# Function to get coordinates of postal code in Toronto, Ontario
def getCoordinates(postalCode, maximumTries=100, returnNumTries=False):
    """Look up the coordinates of a postal code in Toronto, Ontario.

    The geocoder request is repeated until it returns coordinates or until
    maximumTries requests have been made, whichever comes first.

    Args:
        postalCode: postal code to look up; it is inserted into the query
            "<postalCode>, Toronto, Ontario".
        maximumTries: maximum number of requests to make. With 0 or a
            negative value no request is made.
        returnNumTries: when True, also return the number of requests made.

    Returns:
        The coordinates as returned by geocoder (g.latlng), or (None, None)
        when no request returned coordinates. When returnNumTries is True,
        a tuple (coordinates, number of requests made) is returned instead.
    """
    # Create string
    postalCodeString = "{}, Toronto, Ontario".format(postalCode)
    # Initialize variables
    coordinates = None
    tryCount = 0
    # Loop until you get the coordinates; any empty result (None or an empty
    # list) counts as a failed attempt. The strict "<" keeps the number of
    # requests at exactly maximumTries.
    while not coordinates and tryCount < maximumTries:
        g = geocoder.google(postalCodeString)
        coordinates = g.latlng
        tryCount += 1
    # Correct the coordinates variable for the case where no coordinates were found
    if not coordinates:
        coordinates = (None, None)

    if returnNumTries:
        # tryCount is incremented once per request, so it is the true number of tries
        return coordinates, tryCount
    return coordinates

print("The getCoordinates function was successfully created.")

The getCoordinates function was successfully created.


In [8]:
# Retry the lookup for the postal code in the row labelled 10, with the retry limit set to 20
postalCode = dfNeighbourhoods.loc[10, "PostalCode"]
# With returnNumTries=True the result is a pair: (coordinates, try count reported by the function)
coordinates = getCoordinates(postalCode, maximumTries=20, returnNumTries=True)

print("The coordinates of the postal code {} were {} after {} tries".format(postalCode, coordinates[0], coordinates[1]))

The coordinates of the postal code M1P were (None, None) after 20 tries


### Find latitude and longitude for all postal codes in data frame

Since the Geocoder package did not return coordinates for any postal code, even after many tries, we use a csv file that already contains the latitude and longitude of each postal code instead.

### Import csv file with latitudes and longitudes

In [9]:
# Set link to csv file
fileLink = "http://cocl.us/Geospatial_data"
# Import csv file as dataframe
dfLatLong = pd.read_csv(fileLink)
# Change column names so the key column is called "PostalCode", as in dfNeighbourhoods
# (the assignment only works if the file has exactly three columns)
dfLatLong.columns = ["PostalCode", "Latitude", "Longitude"]
# Order dataframe by the postal code (sorted in place, the original row labels are kept)
dfLatLong.sort_values(by=["PostalCode"], inplace=True)

# Show the size of the dataframe and its first ten rows
print(dfLatLong.shape)
dfLatLong.head(10)

(103, 3)


Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [10]:
# Create function to import csv file with latitude and longitude of postal codes
def importLatLongCsvFile(fileLink="http://cocl.us/Geospatial_data"):
    """Import the csv file with the latitude and longitude of each postal code.

    Args:
        fileLink: URL or path of the csv file, passed to pandas.read_csv.
            The file must have exactly three columns, in the order postal
            code, latitude, longitude. Defaults to the link used in this
            notebook.

    Returns:
        pandas.DataFrame with the columns "PostalCode", "Latitude" and
        "Longitude", sorted by "PostalCode" (the original row labels are kept).

    Raises:
        ValueError: if the file does not have exactly three columns.
    """
    columnNames = ["PostalCode", "Latitude", "Longitude"]
    # Import csv file as dataframe
    df = pd.read_csv(fileLink)
    # The header is replaced by position, so check the column count first to
    # report an unexpected file layout with a clear message
    if df.shape[1] != len(columnNames):
        raise ValueError(
            "Expected {} columns in {} but found {}".format(len(columnNames), fileLink, df.shape[1])
        )
    # Change column names
    df.columns = columnNames
    # Order dataframe by the postal code
    return df.sort_values(by=["PostalCode"])

print("The importLatLongCsvFile function was successfully created.")

The importLatLongCsvFile function was successfully created.


In [11]:
# Load the latitude/longitude table again, this time through the function
dfLatLong = importLatLongCsvFile()

# Show the size of the dataframe and its first ten rows
print(dfLatLong.shape)
dfLatLong.head(10)

(103, 3)


Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


### Adding latitude and longitude to the dataframe of postal codes

In [12]:
# Merge the two dataframes on the column "PostalCode"
# how="inner" keeps only the postal codes that are present in both dataframes
df = pd.merge(dfNeighbourhoods, dfLatLong, on="PostalCode", how="inner")

# Show the size of the merged dataframe and its first ten rows
print(df.shape)
df.head(10)

(103, 5)


Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


### Create function to generate complete dataframe with postal codes information

In [13]:
def generateCompleteDataFrameOfPostalCodes():
    """Build the complete postal code dataframe, including coordinates.

    Joins the table returned by scrapePostalCodesFromWikipedia with the
    table returned by importLatLongCsvFile on the "PostalCode" column.
    The join is an inner join, so only postal codes found in both tables
    are kept.

    Returns:
        pandas.DataFrame holding the columns of both tables.
    """
    # Postal codes, boroughs and neighbourhoods from Wikipedia
    neighbourhoodTable = scrapePostalCodesFromWikipedia()
    # Latitude and longitude of each postal code from the csv file
    coordinateTable = importLatLongCsvFile()

    return neighbourhoodTable.merge(coordinateTable, how="inner", on="PostalCode")

print("The generateCompleteDataFrameOfPostalCodes function was successfully created.")

The generateCompleteDataFrameOfPostalCodes function was successfully created.


In [14]:
# Build the complete dataframe in a single call
df = generateCompleteDataFrameOfPostalCodes()

# Show the size of the dataframe and its first ten rows
print(df.shape)
df.head(10)

(103, 5)


Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848
