## Create Source Attribution Modeling Datasets

In [5]:
import pandas as pd
import numpy as np
import os
import imputation_utils
import general_utils
import geopandas as gpd
import rpy2.robjects as robjects

In [None]:
# Source data output location
source_attribution_modeling_dataset_output_folder = '../../data/modeling_data/source_attribution'

# Lookup table to impute missing RL/MDL values
rl_mdl_file = ('../../data/Extracted lab report data/RL_MDL_lookup_table.csv')

In [None]:
run constants.py

Read in imputed data

In [2]:
# Read in all imputed data
path, dirs, files = next(os.walk(f"{ros_folder}/imputed"))

imputed_data = pd.DataFrame()
for file in files:
    df = pd.read_csv(f'{ros_folder}/imputed/{file}')
    imputed_data = pd.concat([imputed_data,df],axis=0)

In [3]:
imputed_data.shape

(14304, 26)

### Source Attribution Datasets

In [4]:
disposal_sites_info = pd.read_parquet(f'{disposal_sites_output}.csv')

# Lookup table to impute missing RL/MDL values
rl_mdl_lookup = pd.read_csv(rl_mdl_file)

In [None]:
df_pfas_vars = pd.read_csv(pfas_dict['file_location'])

# Get list of pfas compounds
pfas_vars = df_pfas_vars[df_pfas_vars[pfas_dict['pfas_filter_col']] == 1][pfas_dict['acronym_col']]

Receptor Well data (response)

* The following code separates receptor from source in imputed data. The 'folder' column has unique values for source v. receptor - therefore we can separate using that measure. 
* Change as necessary in order to isolate receptor data 

In [5]:
receptor_lab_reports = ['Raw Reports/Receptor Lab Reports/', 
                        'Disposal Site Private Well Lab Reports/2-0021075 - Stow - PFAS/',
                        'Disposal Site Private Well Lab Reports/2-0021045 - Stow - MA Fire Academy/',
                        'Disposal Site Private Well Lab Reports/2-0021072 - PRINCETON - PFAS/',
                        'Disposal Site Private Well Lab Reports/2-0020923 - Hudson - 308 Chestnut PFAS/',
                        'Disposal Site Private Well Lab Reports/2-0020439 - Hudson - Cranberry PWS/']

receptor_df = imputed_data[(imputed_data['folder'].isin(receptor_lab_reports))]

In [None]:
receptor_df['Units'] = receptor_df['Units'].str.lower()

In [7]:
unique_vars = ['report', 'date_sampled', 'sample_id', 'lab', 'Matrix', 'Units']

receptor_df_wide = receptor_df.pivot_table(index=unique_vars, columns='Acronym', values='Result').reset_index()

In [8]:
receptor_df_wide = imputation_utils.fill_na_with_mdl_rl(df = receptor_df_wide,
                                                   pfas_vars = pfas_vars,
                                                   rl_mdl_lookup = rl_mdl_lookup) 

In [11]:
receptor_df_wide.to_csv(f'{source_attribution_modeling_dataset_output_folder}/receptor_data.csv')

AFFF

In [12]:
# Attach AFFF disposal source reports (extracting 'AFFF' source type RTNs) to AFFF data in separate extraction
AFFF_RTNs = list(disposal_sites_info[disposal_sites_info['Source Type'].str.lower() == 'afff']['RTN'])

afff_df = imputed_data[((imputed_data['RTN'].isin(AFFF_RTNs)) & (imputed_data['folder'] == 'Raw Reports/Source Lab Reports/')) 
                                       | (imputed_data['folder'] == 'AFFF Lab Reports/')]

afff_df['Units'] = afff_df['Units'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  afff_df['Units'] = afff_df['Units'].str.lower()


In [13]:
afff_df['Units'].unique()

array(['ng/l', 'ng/g'], dtype=object)

In [14]:
unique_vars = ['report', 'date_sampled', 'sample_id', 'lab', 'Matrix', 'Units']

afff_df_wide = afff_df.pivot_table(index=unique_vars, columns='Acronym', values='Result').reset_index()

In [15]:
afff_df_wide = imputation_utils.fill_na_with_mdl_rl(df = afff_df_wide,
                                                   pfas_vars = pfas_vars,
                                                   rl_mdl_lookup = rl_mdl_lookup) 

In [18]:
afff_df_wide.to_csv(f'{source_attribution_modeling_dataset_output_folder}/afff_source.csv')

Airport

In [19]:
# Attach AFFF disposal source reports (extracting 'AFFF' source type RTNs) to AFFF data in separate extraction
airport_RTNs = list(disposal_sites_info[disposal_sites_info['Source Type'].str.lower() == 'airport']['RTN'])

airport_df = imputed_data[((imputed_data['RTN'].isin(airport_RTNs)) & (imputed_data['folder'] == 'Raw Reports/Source Lab Reports/'))]

airport_df['Units'] = airport_df['Units'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  airport_df['Units'] = airport_df['Units'].str.lower()


In [20]:
airport_df['Units'].unique()

array(['ng/l'], dtype=object)

In [21]:
unique_vars = ['report', 'date_sampled', 'sample_id', 'lab', 'Matrix', 'Units']

airport_df_wide = airport_df.pivot_table(index=unique_vars, columns='Acronym', values='Result').reset_index()

In [22]:
airport_df_wide = imputation_utils.fill_na_with_mdl_rl(df = airport_df_wide,
                                                   pfas_vars = pfas_vars,
                                                   rl_mdl_lookup = rl_mdl_lookup) 

In [23]:
airport_df_wide.isna().sum()

Acronym
report          0
date_sampled    0
sample_id       0
lab             0
Matrix          0
Units           0
PFBS            0
PFDA            0
PFHpA           0
PFHxA           0
PFHxS           0
PFNA            0
PFOA            0
PFOS            0
PFUnA           0
dtype: int64

In [24]:
airport_df_wide.shape

(65, 15)

In [25]:
airport_df_wide.to_csv(f'{source_attribution_modeling_dataset_output_folder}/airport_source.csv')

Other

In [26]:
# Attach AFFF disposal source reports (extracting 'AFFF' source type RTNs) to AFFF data in separate extraction
other_RTNs = list(disposal_sites_info[disposal_sites_info['Source Type'].str.lower() == 'other']['RTN'])

other_df = imputed_data[((imputed_data['RTN'].isin(other_RTNs)) & (imputed_data['folder'] == 'Raw Reports/Source Lab Reports/'))]

other_df['Units'] = other_df['Units'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  other_df['Units'] = other_df['Units'].str.lower()


In [27]:
unique_vars = ['report', 'date_sampled', 'sample_id', 'lab', 'Matrix', 'Units']

other_df_wide = other_df.pivot_table(index=unique_vars, columns='Acronym', values='Result').reset_index()

In [28]:
other_df['Units'].unique()

array(['ng/g', 'ng/l'], dtype=object)

In [29]:
other_df_wide = imputation_utils.fill_na_with_mdl_rl(df = other_df_wide,
                                                   pfas_vars = pfas_vars,
                                                   rl_mdl_lookup = rl_mdl_lookup) 

In [30]:
other_df_wide.isna().sum()

Acronym
report          0
date_sampled    0
sample_id       0
lab             0
Matrix          0
Units           0
NEtFOSAA        0
PFBS            0
PFDA            0
PFDoA           0
PFHpA           0
PFHxA           0
PFHxS           0
PFNA            0
PFOA            0
PFOS            0
PFTA            0
PFTrDA          0
PFUnA           0
dtype: int64

In [31]:
other_df_wide.shape

(190, 19)

In [32]:
other_df_wide.to_csv(f'{source_attribution_modeling_dataset_output_folder}/other_source.csv')