## Create Modeling Datasets

* Well exposure model
    * Only use the largest sample for each location

In [2]:
import pandas as pd
import numpy as np
import os
import imputation_utils
import geopandas as gpd
import rpy2.robjects as robjects

In [4]:
run constants.py

Read in imputed data

In [5]:
# Read in all imputed data
path, dirs, files = next(os.walk(f"{ros_folder}/imputed"))

imputed_data = pd.DataFrame()
for file in files:
    df = pd.read_csv(f'{ros_folder}/imputed/{file}')
    imputed_data = pd.concat([imputed_data,df],axis=0)

In [6]:
imputed_data.shape

(14304, 26)

### Source Attribution Datasets

In [15]:
disposal_sites_info = pd.read_csv(f'{disposal_sites_output}.csv')
rl_mdl_lookup = pd.read_csv('../../data/Extracted lab report data/RL_MDL_lookup_table.csv')

In [None]:
df_pfas_vars = pd.read_csv(pfas_dict['file_location'])

# Get list of pfas compounds
pfas_vars = df_pfas_vars[df_pfas_vars[pfas_dict['pfas_filter_col']] == 1][pfas_dict['acronym_col']]

### Well Exposure Datasets

##### Disposal Source
* Taking max of each RTN
* Make sure 'source' is in folder column to identify source samples

In [7]:
# Create well exposure data outputs (only use biggest for source)
disposal_source_df = imputed_data[(imputed_data['folder'].str.lower().str.contains('source'))]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  disposal_source_df['date_sampled_ds'] =  np.where(disposal_source_df['date_sampled'].isna(), '01/01/2001' , disposal_source_df['date_sampled'])


In [None]:
# Replace unknown date sampled 
disposal_source_df['date_sampled_ds'] =  np.where(disposal_source_df['date_sampled'].isna(), '01/01/2001' , disposal_source_df['date_sampled'])

In [8]:
# Merge lat/lon information
disposal_source_df = disposal_source_df.merge(disposal_sites_info[['RTN', 'lat', 'lon']], on = 'RTN')

In [9]:
# Make sure it is in wide format
disposal_source_df_wide = disposal_source_df.pivot_table(index=['RTN', 'report', 'lab' ,'sample_id', 'Matrix' ,'date_sampled_ds', 'lat', 'lon'], columns='Acronym', values='Result_val').reset_index()

In [None]:
# Use the maximum sample for each source sample
max_disposal_source_df = imputation_utils.create_max_disposal_df(disposal_source_df_wide)

In [17]:
max_disposal_source_df_wide = imputation_utils.fill_na_with_mdl_rl(df = max_disposal_source_df.reset_index(),
                                                   pfas_vars = pfas_vars,
                                                   rl_mdl_lookup = rl_mdl_lookup) 

In [18]:
max_disposal_source_df_wide.shape

(18, 23)

In [19]:
max_disposal_source_df_wide.columns

Index(['level_0', 'RTN', 'index', 'report', 'lab', 'sample_id', 'Matrix',
       'date_sampled_ds', 'lat', 'lon', 'NEtFOSAA', 'PFBS', 'PFDA', 'PFDoA',
       'PFHpA', 'PFHxA', 'PFHxS', 'PFNA', 'PFOA', 'PFOS', 'PFTA', 'PFTrDA',
       'PFUnA'],
      dtype='object', name='Acronym')

In [None]:
# Fill in based on columns above with all PFAS
pfas_list = ['NEtFOSAA', 'PFBS', 'PFDA', 'PFDoA',
       'PFHpA', 'PFHxA', 'PFHxS', 'PFNA', 'PFOA', 'PFOS', 'PFTA', 'PFTrDA',
       'PFUnA']

In [20]:
for col in max_disposal_source_df_wide.columns:
    if col in pfas_list:
        max_disposal_source_df_wide.rename(columns = {col : f'{col}_DS'}, inplace = True)

Make into gdf

In [21]:
# turn into geodataframe
max_disposal_source_gdf = gpd.GeoDataFrame(
    max_disposal_source_df_wide, 
    geometry=gpd.points_from_xy(max_disposal_source_df_wide.lon, max_disposal_source_df_wide.lat),
    crs={"init":"EPSG:4326"})

  in_crs_string = _prepare_from_proj_string(in_crs_string)


In [22]:
max_disposal_source_gdf.shape

(18, 24)

Write out all files to baseline folder

In [23]:
max_disposal_source_gdf.to_file('../../data/modeling_data/well_exposure/base_samples/diposal_source_gdf.geojson', drive = 'GeoJSON')
max_disposal_source_df_wide.to_csv('../../data/modeling_data/well_exposure/base_samples/diposal_source_df.csv')

***

##### Private Well

In [24]:
private_well_df = imputed_data[(imputed_data['folder'].str.lower().str.contains('well')) | (imputed_data['folder'].str.lower().str.contains('receptor'))]

In [25]:
private_well_df.shape

(9202, 26)

In [26]:
private_well_df['RTN'].unique()

array(['2-0021075', '2-0021045', '3-0036774', '3-0036649', '1-0021289',
       '2-0020923', '4-0027571', '2-0020439', '2-0021072', '4-0028856',
       '4-0028855'], dtype=object)

In [27]:
# Replace unknown date sampled 
private_well_df['date_sampled_well'] =  np.where(private_well_df['date_sampled'].isna(), '01/01/2001' , private_well_df['date_sampled']) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  private_well_df['date_sampled_well'] =  np.where(private_well_df['date_sampled'].isna(), '01/01/2001' , private_well_df['date_sampled'])


In [28]:
### PLACE OF POSSIBLE IMPROVEMENT - MORE ACCURATE LAT/LON - On Natick! - https://drive.google.com/drive/search?q=3-0036774%20-%20Natick%20-%20Document_ID_603323.pdf (all the different locations)

# West Tisbury - attach using excel table. Use LOC_ID as lat/lon
west_tisbury = pd.read_csv('../../data/private_wells/4-0027571_WestTisbury_Key_Table - WestTisbury_Key_Table.csv')
locations_updated = pd.read_csv('../../data/private_wells/private_well_locations_filled_in.csv')

west_tisbury_wells = locations_updated[locations_updated['report'].str.lower().str.contains('west tisbury')]

# Remove the word property - then join to get LOC_ID, and convert to lat/lon
west_tisbury_wells['sample_id'] = west_tisbury_wells['sample_id'].str.replace('PROPERTY ', '')
west_tisbury_wells['sample_id'] = west_tisbury_wells['sample_id'].str.replace('PROPERTY-', '')

west_tisbury_wells_w_loc_id = west_tisbury_wells.merge(west_tisbury[['PLAN_ID', 'LOC_ID']], left_on = 'sample_id', right_on = 'PLAN_ID')

west_tisbury_wells_w_loc_id['lon'] = west_tisbury_wells_w_loc_id['LOC_ID'].str.split('_').str[1]
west_tisbury_wells_w_loc_id['lat'] = west_tisbury_wells_w_loc_id['LOC_ID'].str.split('_').str[2]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  west_tisbury_wells['sample_id'] = west_tisbury_wells['sample_id'].str.replace('PROPERTY ', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  west_tisbury_wells['sample_id'] = west_tisbury_wells['sample_id'].str.replace('PROPERTY-', '')


In [29]:
wt_wells_gdf = gpd.GeoDataFrame(
    west_tisbury_wells_w_loc_id,
    geometry=gpd.points_from_xy(west_tisbury_wells_w_loc_id.lon, west_tisbury_wells_w_loc_id.lat),
    crs={"init":"EPSG:26986"})

wt_wells_gdf = wt_wells_gdf.to_crs('EPSG:4326')

In [30]:
wt_wells_gdf['lon'] = wt_wells_gdf['geometry'].x
wt_wells_gdf['lat'] = wt_wells_gdf['geometry'].y

In [31]:
wt_wells_df = wt_wells_gdf[['report', 'RTN', 'address', 'sample_id', 'lon','lat']]

In [32]:
# Geocode the rest of the locations (non west-tisbury using the address) - drop if address doesn't exist
private_wells = locations_updated[~(locations_updated['report'].str.lower().str.contains('west tisbury'))]

private_addresses = private_wells[['address']].drop_duplicates()

In [34]:
private_addresses['lat'], private_addresses['lon'] = imputation_utils.geocode(private_addresses, 'address')

22 RUSSET LANE, Stow, MA : No results found
30 RUSSET LN, Stow MA : No results found
38 Russet Lane, Stow MA : No results found


In [36]:
# Manually fill-in the places that couldn't be reverse geocoded
private_addresses[private_addresses['lat'].isna()]

Unnamed: 0,address,lat,lon
47,"22 RUSSET LANE, Stow, MA",,
113,"30 RUSSET LN, Stow MA",,
206,"38 Russet Lane, Stow MA",,


Overwrite with correct lats and lons

In [37]:
private_addresses.loc[47,'lon'] = -71.5077957
private_addresses.loc[47,'lat'] = 42.439586

In [38]:
private_addresses.loc[113,'lon'] = -71.5075817
private_addresses.loc[113,'lat'] = 42.4402009

In [39]:
private_addresses.loc[206,'lon'] = -71.507332
private_addresses.loc[206,'lat'] = 42.4407338

Merge lat/lons to private well dataset

In [40]:
private_wells = private_wells.merge(private_addresses, on = 'address')

In [41]:
private_wells.shape

(388, 6)

Merge lat/lons to private well samples

In [42]:
comb_private_wells = pd.concat([private_wells, wt_wells_df], axis = 0)

In [43]:
comb_private_wells.shape

(469, 6)

In [44]:
private_well_df['sample_id'] = private_well_df['sample_id'].str.replace('PROPERTY ', '')
private_well_df['sample_id'] = private_well_df['sample_id'].str.replace('PROPERTY-', '')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  private_well_df['sample_id'] = private_well_df['sample_id'].str.replace('PROPERTY ', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  private_well_df['sample_id'] = private_well_df['sample_id'].str.replace('PROPERTY-', '')


In [45]:
merged_private_well_df = private_well_df.merge(comb_private_wells, on = ['report', 'RTN', 'sample_id'])

In [46]:
merged_private_well_df.shape

(5294, 30)

In [48]:
# Make sure it is in wide format
private_well_df_wide = merged_private_well_df.pivot_table(index=['RTN', 'date_sampled_well', 'sample_id', 'lab' ,'Matrix' ,'lat', 'lon'], columns='Acronym', values='Result_val').reset_index()

In [49]:
private_well_df_wide_imputed = imputation_utils.fill_na_with_mdl_rl(df = private_well_df_wide,
                                                   pfas_vars = pfas_vars,
                                                   rl_mdl_lookup = rl_mdl_lookup) 

In [50]:
for col in private_well_df_wide_imputed.columns:
    if col not in ['RTN', 'report', 'lab' ,'sample_id', 'Matrix' ,'date_sampled_ds', 'lat', 'lon']: # For each pfas
        private_well_df_wide_imputed.rename(columns = {col : f'{col}_well'}, inplace = True)

In [51]:
# turn into geodataframe
private_well_gdf = gpd.GeoDataFrame(
    private_well_df_wide_imputed,
    geometry=gpd.points_from_xy(private_well_df_wide_imputed.lon, private_well_df_wide_imputed.lat),
    crs={"init":"EPSG:4326"})

  in_crs_string = _prepare_from_proj_string(in_crs_string)


In [52]:
private_well_gdf.shape

(456, 21)

In [53]:
private_well_gdf.columns

Index(['RTN', 'date_sampled_well_well', 'sample_id', 'lab', 'Matrix', 'lat',
       'lon', 'NEtFOSAA_well', 'PFBS_well', 'PFDA_well', 'PFDoA_well',
       'PFHpA_well', 'PFHxA_well', 'PFHxS_well', 'PFNA_well', 'PFOA_well',
       'PFOS_well', 'PFTA_well', 'PFTrDA_well', 'PFUnA_well', 'geometry'],
      dtype='object', name='Acronym')

Write out all files to baseline folder

In [54]:
private_well_gdf.to_file('../../data/modeling_data/well_exposure/base_samples/private_well_gdf.geojson', drive = 'GeoJSON')
private_well_df_wide.to_csv('../../data/modeling_data/well_exposure/base_samples/private_well_df.csv')