In [53]:
# -*- coding: utf-8 -*-
"""
This script is used to geocode any health sources which do not have lat/lon coordinates 
using the OSM Nominatim and GC National Address Register APIs.

We use several API request calls to try to obtain the most complete response.
Our order of preference for the API calls is: facility name (osm), 
street address (osm), street address (GC), city (osm).

The script also filters out records that have already been geocoded,
so that new data can be added to the pipeline without re-geocoding each record.

It takes combined.csv as the primary input. It can also take the export of a
previous run (geocoded_OSM_14-03-2022.csv), if that file exists.

This script is adapted from Education Facilities for Healthcare
"""

import requests
import json
import pandas as pd
import numpy as np
import time
from os.path import exists

In [54]:
# Libraries for GC API
import os
from dotenv import load_dotenv

In [55]:
# The GC API calls need a valid API key (see https://api.canada.ca/en/homepage).
# Create a .env file in this repository containing GC_API_KEY="YOUR_GC_API_KEY";
# load_dotenv() is called first, then the key is read from the environment.
# gc_key is None when the variable is not set.
load_dotenv()
gc_key = os.getenv("GC_API_KEY")

In [127]:
# Load the new input data. When an export from a previous run exists it is
# loaded too, so that records geocoded earlier are not geocoded twice.
df_input = pd.read_csv("combined.csv", low_memory=False, dtype="str")

# export of a previous run (checked for existence before it is read)
previous_run_csv = 'geocoded_OSM_14-03-2022.csv'

# geo_source labels that mark a row as geocoded by this notebook
remove_list = ['osm_facility_name', 'osm_address', 'gc_street_address', 'osm_city']

if exists(previous_run_csv):
    df_previous_run = pd.read_csv(previous_run_csv, low_memory=False, dtype="str")
    # previously geocoded = rows of the previous export whose geo_source is one
    # of the labels above
    df_previously_geocoded = df_previous_run[df_previous_run["geo_source"].isin(remove_list)]
    print(str(len(df_previously_geocoded)) + ' records already geocoded')
else:
    # Keep the input's columns on the empty frame so that later column lookups
    # (for example 'idx') do not raise KeyError on a first run.
    df_previously_geocoded = pd.DataFrame(columns=df_input.columns)
    print('no previous csv detected')

146 records already geocoded


In [143]:
# Drop the input rows whose 'idx' was already geocoded by a previous run.

# number of input rows without a geo_source (printed as the total in the
# "records left to geocode" message)
number_of_nulls = len(df_input[df_input.geo_source.isnull()])

# df_previously_geocoded is a DataFrame in every case, so an isinstance check
# cannot tell "nothing geocoded yet" apart; test for actual rows and for the
# 'idx' column instead, which also avoids a KeyError on a column-less frame.
has_previous_results = (
    not df_previously_geocoded.empty
    and 'idx' in df_previously_geocoded.columns
)

if has_previous_results:
    geo_list = list(df_previously_geocoded['idx'])
    df = df_input[~df_input['idx'].isin(geo_list)]
    print(str(len(df_input) - len(df)) + ' records already geocoded in dataframe')
else:
    print('no previously geocoded results found in dataframe')
    df = df_input

146 records already geocoded in dataframe


In [144]:
# Split on geo_source: rows that already have one are set aside in df_leftover,
# rows without one stay in df and go on to be geocoded.
missing_source = df['geo_source'].isnull()
df_leftover = df[~missing_source]
df = df[missing_source]
print('{} of {} records left to geocode'.format(len(df), number_of_nulls))

4 of 150 records left to geocode


In [145]:
# Endpoints for the two geocoding services
url = 'https://nominatim.openstreetmap.org/search?'
url_gc = 'https://national-address-register-statcan.api.canada.ca:443/v1/addresses/search'

# Headers sent with the OSM requests
headers = dict([
    ('User-Agent', 'Sam Lumley, Statistics Canada'),
    ('From', 'sam.lumley@statcan.gc.ca'),
])

# Accumulators for the raw API responses (two independent empty lists)
JSONS, JSONS_CITIES = [], []

In [146]:
# Build the query string for each API call, one per row of df.

# Replace missing values with '' so the string concatenations below never
# produce NaN. fillna returns a new frame; rebinding df to it (instead of
# inplace=True on a filtered slice) keeps the column assignments below from
# writing to a slice of df_input and triggering SettingWithCopyWarning.
df = df.fillna('')

street_part = df['street_no'] + ' ' + df['street_name']
place_part = df['city'] + ', ' + df['province'] + ', ' + 'Canada'

df['nom_request_name'] = df['facility_name'] + ', ' + place_part
df['nom_request_street'] = street_part + ', ' + place_part
df['nom_request_city'] = place_part
# the GC query is space separated and carries no country
df['gc_request_street'] = street_part + ' ' + df['city'] + ' ' + df['province']

# plain lists, indexed by row position in the geocoding loop
reqs_name = list(df['nom_request_name'])
reqs_street = list(df['nom_request_street'])
reqs_city = list(df['nom_request_city'])
reqs_gc = list(df['gc_request_street'])

In [147]:
# make a nominatim request for each record 

def osm_query(query, query_type, timeout=30):
    """Send one search request to the OSM Nominatim API and return the decoded JSON.

    Before the request the function sleeps for ``request_timing`` seconds.
    ``request_timing``, ``url`` and ``headers`` are module-level names that must
    be defined before the first call.

    Parameters
    ----------
    query : str
        Free-form search string, sent as the ``q`` parameter.
    query_type : str
        Label for the kind of query ("name", "street", "city"). It is not used
        in the request; it is kept for existing callers and for debugging.
    timeout : float, optional
        Seconds to wait for the server. Without a timeout a stalled connection
        would block the whole run indefinitely.

    Returns
    -------
    The decoded JSON body. Callers in this file treat it as a list of matches
    (empty when nothing was found) and use element 0.

    Raises
    ------
    requests.exceptions.RequestException
        On a connection error, a timeout, or an HTTP error status.
    """
    params = {
        'q': query,
        'addressdetails': '1',
        'format': 'json',
        'email': 'sam.lumley@statcan.gc.ca',
    }
    # pause between requests
    time.sleep(request_timing)
    response = requests.get(url, params=params, headers=headers, timeout=timeout)
    # fail with the HTTP status instead of a JSON decoding error (or an error
    # payload being mistaken for a list of matches)
    response.raise_for_status()
    return response.json()

# Geocode every record, trying progressively coarser queries and stopping at
# the first one that gives a usable match:
#   1. facility name (OSM)
#   2. street address (OSM), then street address (GC API)
#   3. city (OSM)
# The query that produced the match is recorded in the geo_source column and
# the raw match (or [] when nothing matched) is appended to JSONS.

request_timing = 3  # seconds; pause before each OSM request, read by osm_query
gc_timeout = 30     # seconds to wait for the GC API before giving up

# Start from an empty list so that re-running this cell does not append a
# second copy of every response.
JSONS = []

# Column position of geo_source. Writing through df.iloc[row, col] updates df
# itself; the chained form df['geo_source'].iloc[i] = ... writes to an
# intermediate Series (SettingWithCopyWarning, and no effect at all when pandas
# Copy-on-Write is enabled).
geo_source_col = df.columns.get_loc('geo_source')


def _gc_street_lookup(query):
    """Query the GC address API; return the decoded response, or None if no match.

    A response only counts as a match when the status is 200 and its 'data'
    list is non-empty, because the parsing cell reads element['data'][0].
    """
    response = requests.get(
        url_gc,
        params={'qstr': query},
        headers={'user_key': gc_key},
        timeout=gc_timeout,
    )
    if response.status_code != 200:
        return None
    payload = response.json()
    if isinstance(payload, dict) and payload.get('data'):
        return payload
    return None


def _osm_city_lookup(row, query):
    """Last resort: geocode the city only. Returns the first match, or []."""
    if len(df.city.iloc[row]) == 0:
        print('no addresses found')
        return []
    matches = osm_query(query, "city")
    if len(matches) > 0:
        df.iloc[row, geo_source_col] = "osm_city"
        print('osm city address found')
        return matches[0]
    print('no address found')
    return []


for i in range(len(reqs_street)):
    query_name = reqs_name[i]
    query_street = reqs_street[i]
    query_city = reqs_city[i]
    query_gc = reqs_gc[i]

    print(str(i + 1) + ': ' + query_name)

    # try with facility name
    name_matches = osm_query(query_name, "name")

    if len(name_matches) > 0:
        resp = name_matches[0]
        df.iloc[i, geo_source_col] = "osm_facility_name"
        print('[1] osm facility found')

    elif len(df.street_name.iloc[i]) > 0:
        # try street address with osm
        street_matches = osm_query(query_street, "street")

        # NOTE(review): the OSM street match is only accepted when the query
        # starts with a space, i.e. when street_no is empty. With a street
        # number present the OSM result is discarded and the GC API is used.
        # Condition kept as found; confirm that this is the intended rule.
        if len(street_matches) > 0 and query_street[0] == ' ':
            resp = street_matches[0]
            df.iloc[i, geo_source_col] = "osm_address"
            print('[2] osm street address found')
        else:
            # try GC API
            gc_match = _gc_street_lookup(query_gc)
            if gc_match is not None:
                resp = gc_match
                print('gc street address found')
                df.iloc[i, geo_source_col] = "gc_street_address"
            else:
                # try osm city
                resp = _osm_city_lookup(i, query_city)

    else:
        print('no street address in dataframe')
        # try osm city
        resp = _osm_city_lookup(i, query_city)

    JSONS.append(resp)

1: 1315 Finch Avenue West - Med-Health, , ON, Canada
no street address in dataframe
no addresses found
2: 400 Queen Street West - Med-Health, , ON, Canada
no street address in dataframe
no addresses found
3: 3420 Hurontario Street - Med-Health, , ON, Canada
no street address in dataframe
no addresses found
4: 1216 Lawrence Avenue West - Med-Health, , ON, Canada
no street address in dataframe
no addresses found


In [148]:
# Save the collected API responses to Nominatim.json (UTF-8, non-ASCII
# characters written as-is, 4-space indentation).
with open('Nominatim.json', mode='w', encoding='utf-8') as out_file:
    json.dump(JSONS, fp=out_file, indent=4, ensure_ascii=False)

In [149]:
# Read the saved responses back from Nominatim.json and show how many there are.
with open('Nominatim.json', mode='r', encoding='utf-8') as in_file:
    JSONS = json.loads(in_file.read())
len(JSONS)

4

In [150]:
# Read the saved API responses back and flatten them into osm_* columns on df.
# Every response yields exactly one record (blank when there is no usable
# match), so the columns always line up with the rows of df.

with open('Nominatim.json', 'r', encoding='utf-8') as f:
    JSONS = json.load(f)

_RESULT_FIELDS = ('lat', 'lon', 'address', 'name', 'prov', 'type', 'class',
                  'country', 'post', 'city')


def _blank_record():
    """Return a record with every field set to ''."""
    return dict.fromkeys(_RESULT_FIELDS, '')


def _parse_osm_result(element):
    """Flatten one OSM match (a dict that has an 'address' key) into a record.

    For now everything OSM returns is used; filtering by class (for example
    only "amenity") may be added later.
    """
    record = _blank_record()
    address = element['address']
    # only matches with country_code 'ca' are kept; anything else stays blank
    if address.get('country_code') != 'ca':
        return record

    record['country'] = address['country_code']
    record['lat'] = element['lat']
    record['lon'] = element['lon']
    record['address'] = element['display_name']
    record['type'] = element.get('type', '')
    record['class'] = element.get('class', '')
    # amenity and state are read from the 'address' sub-dict, so their presence
    # is checked there as well (a check on the top-level keys never matches)
    record['name'] = address.get('amenity', element['display_name'])
    record['prov'] = address.get('state', '')
    record['post'] = address.get('postcode', '')
    record['city'] = address.get('city', '')
    return record


def _parse_gc_result(element):
    """Flatten one GC API response (a dict that has a 'meta' key) into a record.

    Only the first entry of element['data'] is used. Field paths follow the
    ones this notebook already read.
    NOTE(review): they have not been checked against the API schema - confirm.
    """
    record = _blank_record()
    data = element.get('data') or []
    if not data:
        return record
    first = data[0]

    country = first.get('country')
    code = country.get('code') if isinstance(country, dict) else None
    # a non-Canadian result still produces a (blank) record, which keeps the
    # output aligned with the rows of df
    if code != 'CA':
        return record

    record['country'] = code
    coordinates = first['location']['geoCoordinates']
    record['lat'] = coordinates['latitude']
    record['lon'] = coordinates['longitude']
    # NOTE(review): address stays '' for GC results. The earlier code only
    # filled it when the top level of the response had a 'streetName' key and
    # then read 'display_name', which is a key of OSM results.

    mailing = first.get('mailingAddress')
    if isinstance(mailing, dict):
        province = mailing.get('province')
        if isinstance(province, dict):
            record['prov'] = province.get('code', '')
        record['post'] = mailing.get('postalCode', '')

    city = first.get('cityName')
    if isinstance(city, dict):
        record['city'] = city.get('en', '')
    return record


def _parse_result(element):
    """Dispatch on the response shape: OSM match, GC response, or no match."""
    if isinstance(element, dict):
        if 'address' in element:
            return _parse_osm_result(element)
        if 'meta' in element:
            return _parse_gc_result(element)
    # [] (nothing found) or any other shape
    return _blank_record()


records = [_parse_result(element) for element in JSONS]

if len(records) != len(df):
    raise ValueError(
        'Nominatim.json holds ' + str(len(records)) + ' responses but df has '
        + str(len(df)) + ' rows; re-run the geocoding cell before parsing'
    )

# per-field lists, kept under their existing module-level names
LATS = [record['lat'] for record in records]
LONS = [record['lon'] for record in records]
ADDR = [record['address'] for record in records]
CITY = [record['city'] for record in records]
PROV = [record['prov'] for record in records]
NAME = [record['name'] for record in records]
TYPE = [record['type'] for record in records]
COUNTRY = [record['country'] for record in records]
POST = [record['post'] for record in records]
CLASS = [record['class'] for record in records]

# POST is collected but, as before, not written to df
df = df.assign(**{
    'osm_address': ADDR,
    'osm_name': NAME,
    'osm_lat': LATS,
    'osm_lon': LONS,
    'osm_prov': PROV,
    'osm_country': COUNTRY,
    'osm_class': CLASS,
    'osm_type': TYPE,
    'osm_city': CITY,
})

In [151]:
# number of rows in df_leftover
df_leftover.shape[0]

22469

In [152]:
# Stack the three subsets back into one table, in this order:
# df_leftover, df_previously_geocoded, then df (the rows handled in this run).
df_everything = pd.concat(objs=[df_leftover, df_previously_geocoded, df])
print(df_everything.shape[0])

22619


In [153]:
# Write the recombined table to CSV, without the index column.
df_everything.to_csv(path_or_buf="geocoded_OSM_14-03-2022.csv", index=False)

In [154]:
# how many rows carry each geo_source value
df_everything.geo_source.value_counts()

Source               22469
osm_city                63
gc_street_address       60
osm_facility_name       22
                         4
osm_address              1
Name: geo_source, dtype: int64