In [2]:
# -*- coding: utf-8 -*-
"""
This script is used to reverse geocode health sources which do not have street addresses
but do have lat/lon coordinates. It uses the OSM Nominatim API.

It filters out results results that have already been geocoded so that new data 
can be added to the pipeline without re-processing each record.

This script is adapted from Education Facilities to Healthcare.
"""

import requests
import json
import pandas as pd
import numpy as np
import time
from os.path import exists

In [112]:
# load new input data and (optionally) output data from previous runs 

input_data = "geocoded_OSM_14-03-2022.csv"
prev_reverse_geocoded_data = "reverse_geocoded_14-03-2022.csv"

df_input = pd.read_csv(input_data, low_memory=False, dtype="str")

# detect previously geocoded rows from previous output
if exists(prev_reverse_geocoded_data):
    remove_list = ['osm_reverse']
    df_previous_run = pd.read_csv(prev_reverse_geocoded_data, low_memory=False, dtype="str")
    df_previously_r_geocoded = df_previous_run[df_previous_run["geo_source"].isin(remove_list)]    
    print(str(len(df_previously_r_geocoded)) + ' records already reverse geocoded')
else:
    df_previously_r_geocoded = pd.DataFrame()
    print('no previous csv detected')

1717 records already reverse geocoded


In [120]:
# filter input data for rows that have already been reverse geocoded, based on 'idx'

# number_of_nulls = len(df_input[df_input.geo_source.isnull()])

if isinstance(df_previously_r_geocoded, pd.DataFrame):
    geo_list = list(df_previously_r_geocoded['idx'])
    df = df_input[~df_input['idx'].isin(geo_list)]
    print(str(len(df_input) - len(df)) + ' records already geocoded in the input data')
else:
    print('no previously geocoded results found in dataframe')
    df = df_input

1717 records already geocoded in the input data


In [121]:
# filter to those without street address but with street name and street number

df_geo_source = df[(df.street_addr.notnull())]

# create street address from street name and street number if present
df2 = df[(df.street_addr.isnull()) & (df.street_name.notnull()) & (df.street_no.notnull())]
df2['street_addr'] = df2['street_addr'].fillna(df2['street_no'].astype(str) + " " + df2['street_name'])
print("number of addresses found from other columns: " + str(len(df2)))

# save results that do not need geocoding
df_leftover = pd.concat([df_geo_source, df_previously_r_geocoded, df2])
print(len(df_leftover))
# create new street address from street address and number

# df_all.drop(df_all[(df_all.street_addr.isnull()) & (df_all.street_name.notnull()) & (df_all.street_no.notnull())].index, inplace=True)

# df_all = df_all.append(df2)

number of addresses found from other columns: 76
22610


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['street_addr'] = df2['street_addr'].fillna(df2['street_no'].astype(str) + " " + df2['street_name'])


In [117]:
print("missing street addresses: " + str(df_input['street_addr'].isna().sum()))

missing street addresses: 1802


In [118]:
# filter to just those without no street address
df = df_all[df_all.street_addr.isnull()]
len(df)

1726

In [8]:
# define parameters for osm api call
headers = {
    'User-Agent': 'Sam Lumley, Statistics Canada',
    'From': 'sam.lumley@statcan.gc.ca' 
    }

url = 'https://nominatim.openstreetmap.org/reverse?'

JSONS = []
JSONS_CITIES = []

In [9]:
# define our multiple api queries
# df.fillna('', inplace=True)

lats = list(df['latitude'])
lons = list(df['longitude'])

In [None]:
# make a nominatim reverse geocode request for each record 

# for i in range(5): # use this line for testing small batches
for i in range(len(lats)):
    
    # temp, get last five. so from len(lats) - 5 to len(lats)
#     i = i + len(lats) - 5
    
    lat = lats[i]
    lon = lons[i]
    print(str(i + 1) + ": lat: " + str(lat) + ", lon: " + str(lon))
    
    request_timing = 3 #seconds
            
    params = {'lat': lat,
            'lon': lon,
            'format':'json',
            'email':'sam.lumley@statcan.gc.ca'}
    time.sleep(request_timing) 
    coords = requests.get(url, params=params, headers=headers)
    resp = coords.json()
        
    if len(resp) > 0:
        print('osm address found')
        df['geo_source'].iloc[i] = "osm_reverse"
        
                
    JSONS.append(resp)

#     if resp!=[]:
#         print(resp)

In [None]:
with open('reverse_nominatim.json', 'w', encoding='utf-8') as f:
    json.dump(JSONS, f, ensure_ascii=False, indent=4) 

In [106]:
# read json request results into our dataframe

with open('reverse_nominatim.json', 'r', encoding='utf-8') as f:
    JSONS=json.load(f)    
        
LATS=[]
LONS=[]
ADDR=[]
CITY=[]
PROV=[]
NAME=[]
TYPE=[]
COUNTRY=[]
POST=[]
CITY=[]
CLASS=[]
for index, element in enumerate(JSONS):
    if element==[]:
        LATS.append('')
        LONS.append('')
        NAME.append('')
        ADDR.append('')
        PROV.append('')
        TYPE.append('')
        COUNTRY.append('')
        POST.append('')
        CITY.append('')
        CLASS.append('')
    else:
        # For now we will use everything osm gives us
        # later we might want to filter by class - eg just to "amenity" 
        
#         if (element['address']['country_code']=='ca') and ('amenity' in element['address'].keys()): 
#         if (element['address']['country_code']=='ca') and ((element['class'] in ['amenity', 'place', 'building']) ):
            
        if 'address' in element:
            if 'country_code' in element['address']:
                if (element['address']['country_code']=='ca'):
                    
                    df['geo_source'].iloc[index] = "osm_reverse"
                    COUNTRY.append(element['address']['country_code'])
                    LATS.append(element['lat'])
                    LONS.append(element['lon'])
                    ADDR.append(element['display_name'])
                    if 'type' in element.keys():
                        TYPE.append(element['type'])
                    else:
                        TYPE.append('')
                    if 'class' in element.keys():
                        CLASS.append(element['class'])
                    else:
                        CLASS.append('')

                    if 'amenity' in element['address']:
                        NAME.append(element['address']['amenity'])
                    else:
                        NAME.append(element['display_name'])
                    if 'state' in element.keys():
                        PROV.append(element['address']['state'])
                    else:
                        PROV.append('')
                    if 'postcode' in element['address']:
                        POST.append(element['address']['postcode'])
                    else:
                        POST.append('')
                    if 'city' in  element['address']:
                        CITY.append(element['address']['city'])
                    elif 'town' in element['address']:
                        CITY.append(element['address']['town'])
                    else:
                        CITY.append('')
                        
                else:
                    LATS.append('')
                    LONS.append('')
                    NAME.append('')
                    ADDR.append('')
                    PROV.append('')
                    CLASS.append('')
                    TYPE.append('')
                    COUNTRY.append('')
                    CITY.append('')
                        
            else:
                LATS.append('')
                LONS.append('')
                NAME.append('')
                ADDR.append('')
                PROV.append('')
                CLASS.append('')
                TYPE.append('')
                COUNTRY.append('')
                CITY.append('')         
                        
        else:
            LATS.append('')
            LONS.append('')
            NAME.append('')
            ADDR.append('')
            PROV.append('')
            CLASS.append('')
            TYPE.append('')
            COUNTRY.append('')
            CITY.append('')

df['osm_address']=ADDR
df['osm_name']=NAME
df['osm_lat']=LATS
df['osm_lon']=LONS
df['osm_prov']=PROV
df['osm_country']=COUNTRY
df['osm_class']=CLASS
df['osm_type']=TYPE
df['osm_city']=CITY         

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['osm_address']=ADDR
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['osm_name']=NAME
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['osm_lat']=LATS
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in 

In [107]:
df['geo_source'].value_counts()

osm_reverse    1717
Source            4
osm_city          1
Name: geo_source, dtype: int64

In [108]:
# recombine datasets: (1) no reverse geocode needed
# (2) previously reverse geocoded (3) newly reverse geocoded

# df_leftover = df_leftover[df_leftover.geo_source.notnull()]
df_everything = pd.concat([df_leftover, df_previously_r_geocoded, df])
print(len(df_everything))

22619


In [109]:
df_everything.to_csv("reverse_geocoded_14-03-2022.csv", index=False)

In [110]:
df_everything['geo_source'].value_counts()

Source               20752
osm_reverse           1717
osm_city                63
gc_street_address       60
osm_facility_name       22
osm_address              1
Name: geo_source, dtype: int64