Prepare inputs for model dataset creation: geocode the disposal site addresses and export the result as CSV and GeoJSON

In [None]:
import pandas as pd
import numpy as np
import geopandas as gpd
import imputation_utils

In [None]:
run constants.py

In [None]:
# Load the raw disposal-sites table from "<file_location>.csv".
# The dict key uses double quotes inside the single-quoted f-string: reusing the
# same quote character inside an f-string is a SyntaxError before Python 3.12.
df = pd.read_csv(f'{disposal_sites_dict["file_location"]}.csv')

Put the data into the format needed for geocoding: one row per unique address, with a combined full-address string

In [None]:
# Columns that together identify one address (street address, town, state).
address_key_cols = [
    disposal_sites_dict['address_col'],
    disposal_sites_dict['town_col'],
    disposal_sites_dict['state_col'],
]

# One row per unique address. The explicit .copy() makes `addresses` an
# independent frame, so adding columns to it cannot raise SettingWithCopyWarning.
# The original index labels of `df` are kept on purpose: later cells patch
# individual rows by label.
addresses = df[address_key_cols].drop_duplicates().copy()

# Single string of the form "<address>, <town> <state>" to hand to the geocoder.
addresses['full_address'] = (
    addresses[address_key_cols[0]]
    + ', '
    + addresses[address_key_cols[1]]
    + ' '
    + addresses[address_key_cols[2]]
)

Geocode locations

In [None]:
# Geocode every unique address string; the helper returns a pair
# (latitudes, longitudes), which is stored as two new columns.
coords = imputation_utils.geocode(addresses, address_col='full_address')
lats, lons = coords
addresses['lat'], addresses['lon'] = lats, lons

In [None]:
# List the addresses that could not be geocoded (latitude is still missing);
# these are filled in manually in the cells below.
missing_lat = addresses['lat'].isna()
addresses.loc[missing_lat]

Unnamed: 0,Address,Town,full_address,lat,lon
33,30 SPECTACLE POND ROAD,AYER,"30 SPECTACLE POND ROAD, AYER MA",,
34,NO ADDRESS,WORCESTER,"NO ADDRESS, WORCESTER MA",,
42,BENNETT BROOK,LITTLETON,"BENNETT BROOK, LITTLETON MA",,


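Overwrite the missing values with the correct latitudes and longitudes, which were looked up manually in Google Maps.
* Note that the row labels used below (33, 34, 42) are index values specific to this dataset; they must be updated if the input data changes.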

In [None]:
# Manual coordinates for row 33 ("30 SPECTACLE POND ROAD, AYER MA").
# The label 33 is tied to the current input file, so verify the row first:
# a .loc assignment to a missing label would silently append a new row, and a
# shifted index would overwrite the wrong site.
assert addresses.loc[33, 'full_address'] == '30 SPECTACLE POND ROAD, AYER MA', \
    'Row 33 is no longer the Ayer address; update the manual override'
addresses.loc[33, 'lon'] = -71.5269418
addresses.loc[33, 'lat'] = 42.5565774

In [None]:
# Manual coordinates for row 34 ("NO ADDRESS, WORCESTER MA").
# The label 34 is tied to the current input file, so verify the row first:
# a .loc assignment to a missing label would silently append a new row, and a
# shifted index would overwrite the wrong site.
assert addresses.loc[34, 'full_address'] == 'NO ADDRESS, WORCESTER MA', \
    'Row 34 is no longer the Worcester address; update the manual override'
addresses.loc[34, 'lon'] = -71.8428431
addresses.loc[34, 'lat'] = 42.2753779

In [None]:
# Manual coordinates for row 42 ("BENNETT BROOK, LITTLETON MA").
# The label 42 is tied to the current input file, so verify the row first:
# a .loc assignment to a missing label would silently append a new row, and a
# shifted index would overwrite the wrong site.
assert addresses.loc[42, 'full_address'] == 'BENNETT BROOK, LITTLETON MA', \
    'Row 42 is no longer the Littleton address; update the manual override'
addresses.loc[42, 'lon'] = -71.5500131
addresses.loc[42, 'lat'] = 42.542443

Attach to original dataset

In [None]:
# Join the geocoded columns back onto every record, matching on the
# address, town and state columns.
merge_keys = [
    disposal_sites_dict[key]
    for key in ('address_col', 'town_col', 'state_col')
]
df = df.merge(addresses, on=merge_keys)

In [None]:
# Preview the first two rows of the merged table.
df.iloc[:2]

Unnamed: 0,Extracted,RTN,Town,Site_Name,Address,Notif_Date,Disposition,Chemical,Source or receptor data,Source Type,full_address,lat,lon
0,Extracted,1-0021230,LANESBOROUGH,LANESBOROUGH WATER SUPPLY WELL #1,20 BRIDGE STREET,12/9/2020,LESS,PFOS & PFAS,Source lab reports,Other,"20 BRIDGE STREET, LANESBOROUGH MA",42.521411,-73.23162
1,Extracted,1-0021289,NORTHFIELD,FOUR STAR FARMS,496 PINE MEADOW ROAD,4/5/2021,LESS,PFAS,Receptor lab reports,,"496 PINE MEADOW ROAD, NORTHFIELD MA",42.636395,-72.482138


### Output as geocoded dataset

In [None]:
# Write the geocoded table to "<disposal_sites_output>.csv", omitting the index.
csv_path = f'{disposal_sites_output}.csv'
df.to_csv(csv_path, index=False)

In [None]:
# Build a GeoDataFrame with one point per record (x = longitude, y = latitude)
# in the EPSG:4326 coordinate reference system.
# The CRS is given as the plain authority string: the {"init": "EPSG:4326"}
# dict form is the deprecated "+init" syntax and makes pyproj emit a
# FutureWarning.
df_gdf = gpd.GeoDataFrame(
    df,
    geometry=gpd.points_from_xy(df.lon, df.lat),
    crs="EPSG:4326")

  in_crs_string = _prepare_from_proj_string(in_crs_string)


In [None]:
# Write the point layer to "<disposal_sites_output>.geojson" using the GeoJSON driver.
geojson_path = f'{disposal_sites_output}.geojson'
df_gdf.to_file(geojson_path, driver='GeoJSON')