In [1]:
# -*- coding: utf-8 -*-
"""
This script is used to geocode business sources which do not have lat/lon coordinates.
It uses the OSM Nominatim and GC National Address Register APIs.

We use several API calls to try to obtain the most complete response. 
Our order of preference for the API calls is: facility name (osm), 
street address (osm), street address (GC), city (osm).

The script filters out results that have already been geocoded so that new data 
can be added to the pipeline without re-processing each record.

This script is adapted from Education Facilities for Healthcare
"""

import requests
import json
import pandas as pd
import numpy as np
import time
from os.path import exists
from datetime import datetime

# Libraries for GC API
import os
from dotenv import load_dotenv

# The GC API only answers requests that carry a valid key
# (docs: https://api.canada.ca/en/homepage).
# Put GC_API_KEY="YOUR_GC_API_KEY" in a .env file in this repository;
# load_dotenv() is called first so the lookup below can find it.
load_dotenv()
gc_key = os.getenv("GC_API_KEY")

# temporarily suppresses SettingWithCopyWarning
pd.set_option("mode.chained_assignment", None)

In [2]:
# Load the new input data and set up the names used to filter out
# records that were geocoded in an earlier run.

# NOTE the script below for detecting previously coded data is partially finished

# Planned layout of the frames built in this cell:
#   df_input            = output from parsing stage
#   df_previously_coded = previous output, minus all filenames for which every
#                         geo_source is 'Source', minus any with filenames on our re-code list
#   df (to be coded)    = just filenames in our re-code list, and those without any
#                         geo_source, minus any with filenames in our previously coded list
#   df_leftover         = df_input, minus df and minus filenames in df_previously_coded

# filenames whose records should be geocoded again even if a previous run covered them
recode_list = [
    'Quebec city',
    'AB_hospital_services',
    'BC_assisted_living_residences',
    'BC_residential_care',
    'BC_walk-in_clinics',
]

input_data = "combined.csv"
# path of the previous export; an empty string means "no previous run"
prev_geocoded_data = ""

# every column is read as text
df_input = pd.read_csv(
    input_data,
    low_memory=False,
    dtype="str",
)

# Look for an export from a previous run and pull out the rows it geocoded.
if not exists(prev_geocoded_data):
    df_previously_geocoded = None
    print('no previous csv detected')
else:
    df_previous_run = pd.read_csv(prev_geocoded_data, low_memory=False, dtype="str")

    # geo_source values that this script itself writes; rows carrying one of
    # them were processed by an earlier run
    remove_list = ['osm_facility_name', 'osm_address', 'gc_street_address', 'osm_city', 'no_osm']
    coded_by_script = df_previous_run["geo_source"].isin(remove_list)
    df_previously_geocoded = df_previous_run[coded_by_script]

    geo_list = df_previously_geocoded['filename'].tolist()

    # every row of a file that had at least one geocoded record,
    # except files that are on the re-code list
    from_coded_file = df_previous_run['filename'].isin(geo_list)
    to_recode = df_previous_run['filename'].isin(recode_list)
    df_store = df_previous_run[from_coded_file & ~to_recode]

#     df_leftover = df_previous_run[~df_previous_run["geo_source"].isin(remove_list)] 
#     df_previously_geocoded = pd.read_csv("geocoded_17-02-2022.csv", low_memory=False, dtype="str")
    print('taking {} records from previous run'.format(len(df_previously_geocoded)))
    # the value of this expression is discarded (it is not the last line of the cell)
    df_previously_geocoded['filename'].nunique()
    
    
# Set aside input rows whose file was already geocoded in a previous run.

# number_of_nulls = len(df_input[df_input.geo_source.isnull()])

if not isinstance(df_previously_geocoded, pd.DataFrame):
    print('no previously geocoded results found in dataframe')
    df_leftover1 = pd.DataFrame()
    df = df_input
else:
    # filenames to be ignored in this run
    geo_list = list(df_previously_geocoded['filename'])
    print("geo_list length: {}".format(len(geo_list)))

    # split the input by filename: rows from already-coded files are parked
    # in df_leftover1, everything else stays in df
    already_coded = df_input['filename'].isin(geo_list)
    df = df_input[~already_coded]
    df_leftover1 = df_input[already_coded]
#     print(str(len(df_input) - len(df)) + ' records excluded from geocoding based on filename')
#     print(str(len(df)) + ' of ' + str(number_to_geocode) + ' left to geocode')
    
# Rows that already carry a geo_source need no geocoding: move them to
# df_leftover and keep only the rows with an empty geo_source in df.
has_geo_source = df.geo_source.notnull()
df_leftover = pd.concat([df_leftover1, df[has_geo_source]])
df = df[~has_geo_source]
print('{} of {} records left to geocode'.format(len(df), df_input.geo_source.isnull().sum()))

# sanity check for length of dataframes: every input row must end up in
# exactly one of df_leftover (not geocoded now) or df (to be geocoded)
diff = len(df_input) - len(df_leftover) - len(df)
if diff != 0:
    # diff is an int, so it has to be converted before concatenating;
    # without str() this line raised TypeError instead of reporting the problem
    print('ERROR length of output is ' + str(diff) + '. review the scripts to correct')

no previous csv detected
no previously geocoded results found in dataframe
2255 of 2255 records left to geocode


In [3]:
# endpoints of the two geocoding services queried by this notebook
url = 'https://nominatim.openstreetmap.org/search?'
url_gc = 'https://national-address-register-statcan.api.canada.ca:443/v1/addresses/search'

# headers sent with every osm api call
headers = {'User-Agent': 'Sam Lumley Statistics Canada', 'From': 'sam.lumley@statcan.gc.ca'}

# accumulators for the raw api responses
JSONS, JSONS_CITIES = [], []

In [4]:
# Build the query strings for each of the api calls we may attempt.

# missing values become empty strings so the concatenations below never produce NaN
df.fillna('', inplace=True)

# shared pieces of the query strings
locality = df['city'] + ', ' + df['province'] + ', ' + 'Canada'
civic_address = df['street_no'] + ' ' + df['street_name']

df['nom_request_name'] = df['facility_name'] + ', ' + locality
df['nom_request_street'] = civic_address + ', ' + locality
df['nom_request_city'] = locality
# the GC query is space separated and does not include the country
df['gc_request_street'] = civic_address + ' ' + df['city'] + ' ' + df['province']

# plain lists, indexed by row position in the geocoding loop
reqs_name = df['nom_request_name'].tolist()
reqs_street = df['nom_request_street'].tolist()
reqs_city = df['nom_request_city'].tolist()
reqs_gc = df['gc_request_street'].tolist()

In [5]:
# attempt api requests for each record

def osm_query(query, query_type, timeout=30):
    """Send one free-text search to the osm endpoint and return the decoded JSON.

    Reads the module-level names ``url``, ``headers`` and ``request_timing``;
    it sleeps ``request_timing`` seconds before every request.

    Parameters
    ----------
    query : str
        The search string sent as the ``q`` parameter.
    query_type : str
        Label used only in the progress message that is printed.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests`` waits indefinitely and one stalled connection would
        freeze the whole geocoding loop.

    Returns
    -------
    The decoded JSON body of the response. The callers in this notebook
    treat it as a list of matches that is empty when nothing was found.

    Raises
    ------
    requests.exceptions.Timeout
        If the server does not answer within ``timeout`` seconds.
    requests.exceptions.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    params = {'q': query,
              'addressdetails': '1',
              'format': 'json',
              'email': 'sam.lumley@statcan.gc.ca'}
    time.sleep(request_timing)
    coords = requests.get(url, params=params, headers=headers, timeout=timeout)
    print("osm query " + query_type + ": " + query)
    # fail with a clear HTTP error instead of trying to decode an error page as JSON
    coords.raise_for_status()
    return coords.json()

# seconds to pause before every osm request; osm_query reads this module-level name
request_timing = 3

# Position of the geo_source column, used for the positional write in the loop.
# df.iloc[row, col] = value updates df itself, whereas the chained form
# df['geo_source'].iloc[i] = value assigns into a temporary Series and is
# lost when pandas Copy-on-Write is active.
geo_source_col = df.columns.get_loc('geo_source')


def _osm_city_fallback(query_city, city, resp, found_tag='', missing_tag=''):
    """Last-resort lookup by city; returns ``(response, geo_source label)``.

    ``city`` is the record's city value and ``resp`` the (empty) response of
    the previous attempt, which is passed through unchanged when there is no
    city to query. ``found_tag`` / ``missing_tag`` are prefixes for the
    progress messages.
    """
    if len(city) > 0:
        resp = osm_query(query_city, "city")
        if len(resp) > 0:
            print(found_tag + 'osm city address found')
            return resp[0], "osm_city"
        print(missing_tag + 'no address found')
        return resp, "no_osm"
    print(missing_tag + 'no addresses found')
    return resp, "no_osm"


for i in range(len(reqs_street)):
    query_name = reqs_name[i]
    query_street = reqs_street[i]
    query_city = reqs_city[i]
    query_gc = reqs_gc[i]

    # we first try the facility name; if that fails we try the street
    # address (osm, then GC) and finally the city.
    # which one succeeded is recorded in the geo_source column

    print(str(i + 1) + ': ' + query_name)

    # try with facility name
    resp = osm_query(query_name, "name")

    if len(resp) > 0:
        resp = resp[0]
        geo_source = "osm_facility_name"
        print('[1] osm facility found')

    # try street address with osm, if it's non-empty
    elif len(df['street_name'].iloc[i]) > 0:
        resp = osm_query(query_street, "street")
        if len(resp) > 0:
            resp = resp[0]
            geo_source = "osm_address"
            print('[2] osm street address found')
        else:
            # try GC API; the timeout keeps one stalled connection from
            # freezing the whole run
            params_gc = {'qstr': query_gc}
            headers_gc = {'user_key': gc_key}
            coords_gc = requests.get(url_gc, params=params_gc, headers=headers_gc, timeout=30)
            if coords_gc.status_code == 200:
                resp = coords_gc.json()
                print('[3] gc street address found')
                geo_source = "gc_street_address"
            else:
                # try osm city
                resp, geo_source = _osm_city_fallback(
                    query_city, df['city'].iloc[i], resp, '[5] ', '[6] ')

    else:
        print('no street address in dataframe')

        # try osm city
        resp, geo_source = _osm_city_fallback(query_city, df['city'].iloc[i], resp)

    # every branch above chose exactly one label for this record
    df.iloc[i, geo_source_col] = geo_source

    print("\n")
    JSONS.append(resp)

#     if resp!=[]:
#         print("\n")
# #         print(resp)

1: Labrador South Health Centre, Forteau, NL, Canada
osm query name: Labrador South Health Centre, Forteau, NL, Canada
no street address in dataframe
osm query city: Forteau, NL, Canada
osm city address found


2: Foothills Medical Centre, Calgary, AB, Canada
osm query name: Foothills Medical Centre, Calgary, AB, Canada
[1] osm facility found


3: Alberta Children's Hospital, Calgary, AB, Canada
osm query name: Alberta Children's Hospital, Calgary, AB, Canada
[1] osm facility found


4: Rockyview General Hospital, Calgary, AB, Canada
osm query name: Rockyview General Hospital, Calgary, AB, Canada
[1] osm facility found


5: Peter Lougheed Centre, Calgary, AB, Canada
osm query name: Peter Lougheed Centre, Calgary, AB, Canada
[1] osm facility found


6: South Health Campus, Calgary, AB, Canada
osm query name: South Health Campus, Calgary, AB, Canada
[1] osm facility found


7: Oilfields General Hospital, Black diamond, AB, Canada
osm query name: Oilfields General Hospital, Black diamond,

In [7]:
# Save the raw responses so the parsing step can be re-run without
# repeating the api calls.
with open('Nominatim.json', 'w', encoding='utf-8') as f:
    serialized = json.dumps(JSONS, ensure_ascii=False, indent=4)
    f.write(serialized)

In [12]:
# Read the saved responses back and show how many there are.
with open('Nominatim.json', 'r', encoding='utf-8') as f:
    JSONS = json.loads(f.read())
len(JSONS)

2255

In [26]:
# read json request results into our dataframe

def append_blank(index):
    """Record that the row at position ``index`` has no usable geocoding result.

    Sets that row's ``geo_source`` in the module-level ``df`` to ``"no_osm"``
    and appends an empty string to each of the module-level result lists so
    they stay aligned with the rows of ``df``.

    Parameters
    ----------
    index : int
        Zero-based row position in ``df``.
    """
    # Positional write through df.iloc so the value lands in df itself; the
    # chained form df['geo_source'].iloc[index] = ... assigns into a temporary
    # Series and is lost when pandas Copy-on-Write is active.
    df.iloc[index, df.columns.get_loc('geo_source')] = "no_osm"
    for column_values in (LATS, LONS, NAME, ST_NO, ST_NAME, CITY,
                          PROV, POST, COUNTRY, TYPE, CLASS):
        column_values.append('')

with open('Nominatim.json', 'r', encoding='utf-8') as f:
    JSONS = json.load(f)

# One list per output column. Every response must add exactly one entry to
# each list, otherwise the lists no longer line up with the rows of df and
# the column assignments at the end of this cell fail.
LATS = []
LONS = []
NAME = []
ST_NO = []
ST_NAME = []
CITY = []
PROV = []
POST = []
COUNTRY = []
TYPE = []
CLASS = []

# osm_healthcare_types = ['hospital', 'clinic', 'pharmacy']

for index, element in enumerate(JSONS):
    if element == []:
        append_blank(index)

    # format osm results
    elif 'address' in element:
        address = element['address']

        if address.get('country_code') == 'ca':
            LATS.append(element['lat'])
            LONS.append(element['lon'])

            # All address parts are looked up in the same dict they are read
            # from. 'amenity' and 'state' used to be tested against the
            # top-level element while being read from element['address'],
            # so name and province were never filled in.
            NAME.append(address.get('amenity', ''))
            ST_NO.append(address.get('house_number', ''))
            ST_NAME.append(address.get('road', ''))
            CITY.append(address.get('city', ''))
            PROV.append(address.get('state', ''))
            POST.append(address.get('postcode', ''))
            COUNTRY.append(address['country_code'])

            TYPE.append(element.get('type', ''))
            CLASS.append(element.get('class', ''))
        else:
            # result outside Canada (or without a country code): treat as not found
            append_blank(index)

    # format gc api results
    elif 'meta' in element:
        # only the first match is used; an empty match list counts as "not found"
        gc_matches = element.get('data') or []
        gc_record = gc_matches[0] if gc_matches else {}
        gc_country = gc_record.get('country') or {}

        if gc_country.get('code') == 'CA':
            # read the mandatory values first so that a malformed record
            # cannot leave the lists half-updated
            gc_coordinates = gc_record['location']['geoCoordinates']
            gc_lat = gc_coordinates['latitude']
            gc_lon = gc_coordinates['longitude']
            gc_mailing = gc_record.get('mailingAddress') or {}
            gc_province = gc_mailing.get('province')
            gc_city = gc_record.get('cityName')

            COUNTRY.append(gc_country['code'])
            LATS.append(gc_lat)
            LONS.append(gc_lon)

            # NOTE(review): street name and civic number are looked up at the
            # top level of the response, unlike the other fields which come
            # from data[0] - confirm against the GC API schema
            ST_NAME.append(element['streetName'] if 'streetName' in element else '')
            ST_NO.append(element['civicNumber']['number'] if 'civicNumber' in element else '')

            TYPE.append('')
            CLASS.append('')
            NAME.append('')

            # province and city are now tested in the dict they are read from
            # (the old checks looked at the top-level element and at the
            # 'data' list, so both columns were always blank)
            if isinstance(gc_province, dict) and 'code' in gc_province:
                PROV.append(gc_province['code'])
            else:
                PROV.append('')
            POST.append(gc_mailing.get('postalCode', ''))
            if isinstance(gc_city, dict) and 'en' in gc_city:
                CITY.append(gc_city['en'])
            else:
                CITY.append('')
        else:
            # previously nothing was appended here, which left the lists
            # shorter than df and broke the column assignments below
            append_blank(index)

    else:
        append_blank(index)

# append results to dataframe (COUNTRY is collected but not written out)
df['osm_name'] = NAME
df['osm_street_no'] = ST_NO
df['osm_street_name'] = ST_NAME
df['osm_city'] = CITY
df['osm_province'] = PROV
df['osm_postal_code'] = POST
df['osm_class'] = CLASS
df['osm_type'] = TYPE
df['osm_lat'] = LATS
df['osm_lon'] = LONS

2255

In [27]:
# recombine datasets: df_leftover holds the rows that were not sent to the
# geocoders in this run (rows from previously geocoded files and rows that
# already had a geo_source); df holds the rows processed in this run
# df_leftover = df_leftover[df_leftover.geo_source.notnull()]
df_everything = pd.concat([df_leftover, df])

In [28]:
# sanity check for length of dataframes: the recombined output must have
# exactly as many rows as the input
diff = len(df_input) - len(df_everything)
if diff != 0:
    # diff is an int, so it has to be converted before concatenating;
    # without str() this line raised TypeError instead of reporting the problem
    print('ERROR length of output is ' + str(diff) + '. review the scripts to correct')

In [29]:
# write the combined result to a csv named after today's date (YYYY-MM-DD)
date = datetime.today().strftime('%Y-%m-%d')
filename = "geocoded_{}.csv".format(date)
df_everything.to_csv(filename, index=False)

In [30]:
# number of records per geo_source value in the combined output
df_everything['geo_source'].value_counts()

Source               25068
osm_address            986
osm_facility_name      590
osm_city               468
no_osm                 208
gc_street_address        3
Name: geo_source, dtype: int64

In [10]:
# number of rows in df, the frame that was sent to the geocoders
len(df)

2255