# Geocoding site addresses using OpenStreetMap

In [10]:
import pandas as pd
import numpy as np
import geopandas as gpd
import geocoder

In [13]:
# CSV file that this notebook reads the site listing from.
input_data_path = '../../data/disposal_sites/PFAS_Sites_2021-11-07.csv'
# Destination for the geocoded table.
output_data_path = '../../data/disposal_sites/PFAS_Sites_2021-11-07_geocoded.csv'
# Destination for the GeoJSON export of the geocoded table.
output_geojson_path = '../../data/disposal_sites/PFAS_Sites_2021-11-07_geocoded.geojson'

In [19]:
# Load the site listing from the CSV at input_data_path.
df = pd.read_csv(input_data_path)

Put `df` into the correct format: drop rows without an RTN and keep only the needed columns.

In [20]:
# Keep only the rows that actually carry an RTN value.
df = df[df['RTN'].notna()]

In [21]:
# Keep a fixed subset of columns, in this order.
keep_cols = [
    'Extracted',
    'RTN',
    'Town',
    'Site_Name',
    'Address',
    'Notif_Date',
    'Disposition',
    'Chemical',
    'Source or receptor data',
    'Source Type',
]
df = df[keep_cols]

In [22]:
# Names of the columns in `df` that hold the street address and the town.
df_dict = dict(
    address_col='Address',
    town_col='Town',
)

In [23]:
# Unique (address, town) pairs, so each distinct location appears once.
key_cols = [df_dict['address_col'], df_dict['town_col']]
addresses = df[key_cols].drop_duplicates()

In [24]:
# Build the query string as "<street>, <town> MA".
street = addresses[df_dict['address_col']]
town = addresses[df_dict['town_col']]
addresses['full_address'] = street + ', ' + town + ' MA'

In [25]:
%%time
def geocode(addresses, address_col):
    """Look up coordinates for every address string via ``geocoder.osm``.

    Parameters
    ----------
    addresses : DataFrame-like
        Any container for which ``addresses[address_col]`` yields an
        iterable of address strings.
    address_col : str
        Name of the column holding the address strings.

    Returns
    -------
    (lats, lons) : tuple of two lists
        One entry per address, in input order. Addresses whose lookup did
        not yield both an ``'x'`` and a ``'y'`` value get ``np.nan`` in
        both lists, and a "No results found" line is printed for them.
    """
    lons = []
    lats = []
    for address in addresses[address_col]:

        g = geocoder.osm(address)

        try:
            # Read BOTH coordinates before appending anything, so a failure
            # on 'y' cannot leave an extra entry in `lons` and shift every
            # later longitude against its latitude.
            osm_fields = g.osm
            lon = osm_fields['x']
            lat = osm_fields['y']

        # `Exception` rather than a bare `except:` so Ctrl-C
        # (KeyboardInterrupt) still stops a long geocoding run.
        except Exception:
            print(f'{address} : No results found')
            lon = np.nan
            lat = np.nan

        lons.append(lon)
        lats.append(lat)

    return lats, lons

30 SPECTACLE POND ROAD, AYER MA : No results found
NO ADDRESS, WORCESTER MA : No results found
BENNETT BROOK, LITTLETON MA : No results found
Wall time: 26.1 s


In [None]:
# Geocode every unique full address; results come back as (lats, lons).
lats, lons = geocode(addresses, 'full_address')

In [26]:
# Attach the geocoding results as new columns, row-aligned with `addresses`.
addresses['lat'], addresses['lon'] = lats, lons

In [27]:
# Show the addresses that could not be geocoded; these get coordinates
# filled in by hand.
addresses.loc[addresses['lat'].isna()]

Unnamed: 0,Address,Town,full_address,lat,lon
33,30 SPECTACLE POND ROAD,AYER,"30 SPECTACLE POND ROAD, AYER MA",,
34,NO ADDRESS,WORCESTER,"NO ADDRESS, WORCESTER MA",,
42,BENNETT BROOK,LITTLETON,"BENNETT BROOK, LITTLETON MA",,


### Overwrite with manually looked-up latitudes and longitudes

In [28]:
# Manual coordinates for the Ayer address. Select the row by its address
# string rather than by a hard-coded index label, so the override still hits
# the right row (and cannot append a stray row) if the input file changes.
ayer_mask = addresses['full_address'] == '30 SPECTACLE POND ROAD, AYER MA'
addresses.loc[ayer_mask, 'lon'] = -71.5269418
addresses.loc[ayer_mask, 'lat'] = 42.5565774

In [29]:
# Manual coordinates for the Worcester entry without a street address.
# Select the row by its address string rather than by a hard-coded index
# label, so the override still hits the right row if the input file changes.
worcester_mask = addresses['full_address'] == 'NO ADDRESS, WORCESTER MA'
addresses.loc[worcester_mask, 'lon'] = -71.8428431
addresses.loc[worcester_mask, 'lat'] = 42.2753779

In [30]:
# Manual coordinates for the Littleton entry. Select the row by its address
# string rather than by a hard-coded index label, so the override still hits
# the right row if the input file changes.
littleton_mask = addresses['full_address'] == 'BENNETT BROOK, LITTLETON MA'
addresses.loc[littleton_mask, 'lon'] = -71.5500131
addresses.loc[littleton_mask, 'lat'] = 42.542443

### Attach to original dataset

In [31]:
# Bring full_address / lat / lon back onto every site row by joining on the
# (address, town) pair.
merge_keys = [df_dict['address_col'], df_dict['town_col']]
df = pd.merge(df, addresses, how='inner', on=merge_keys)

In [32]:
# Preview the first two rows of the merged frame.
df.head(2)

Unnamed: 0,Extracted,RTN,Town,Site_Name,Address,Notif_Date,Disposition,Chemical,Source or receptor data,Source Type,full_address,lat,lon
0,Extracted,1-0021230,LANESBOROUGH,LANESBOROUGH WATER SUPPLY WELL #1,20 BRIDGE STREET,12/9/2020,LESS,PFOS & PFAS,Source lab reports,Other,"20 BRIDGE STREET, LANESBOROUGH MA",42.521411,-73.23162
1,Extracted,1-0021289,NORTHFIELD,FOUR STAR FARMS,496 PINE MEADOW ROAD,4/5/2021,LESS,PFAS,Receptor lab reports,,"496 PINE MEADOW ROAD, NORTHFIELD MA",42.636395,-72.482138


### Output as geocoded dataset

In [33]:
# output_data_path ends in `.csv`, so write CSV (not Parquet) to keep the
# file contents consistent with its extension. The default integer index
# carries no information and is left out.
df.to_csv(output_data_path, index=False)

In [11]:
# Build point geometries from the lon (x) / lat (y) columns.
# The CRS is given as an authority string; the older {"init": "EPSG:4326"}
# dict form is deprecated and triggers a warning.
df_gdf = gpd.GeoDataFrame(
    df,
    geometry=gpd.points_from_xy(df.lon, df.lat),
    crs="EPSG:4326")

  in_crs_string = _prepare_from_proj_string(in_crs_string)


In [14]:
# Write the GeoDataFrame to output_geojson_path using the GeoJSON driver.
df_gdf.to_file(output_geojson_path, driver= 'GeoJSON')