## Create Source Attribution Modeling Datasets

In [5]:
import pandas as pd
import numpy as np
import os
import imputation_utils
import general_utils
import geopandas as gpd
import rpy2.robjects as robjects

In [None]:
# Folder that receives the source-attribution modeling datasets
source_attribution_modeling_dataset_output_folder = (
    '../../data/modeling_data/source_attribution'
)

# CSV lookup table used to impute missing RL/MDL values
rl_mdl_file = '../../data/Extracted lab report data/RL_MDL_lookup_table.csv'

# CSV holding the wastewater treatment system effluent / sludge data
residential_file = (
    '../../data/residential/Wastewater Treatment Systems Effluent  -  Sludge 2021-08.csv'
)

In [None]:
run constants.py

### Source Attribution Datasets

In [4]:
# Disposal-site table. The path ends in ``.csv``, so it is parsed with
# read_csv; read_parquet cannot parse CSV text.
# NOTE(review): `disposal_sites_output` is not defined in this notebook -
# presumably a path stem supplied by constants.py; confirm the file is CSV.
disposal_sites_info = pd.read_csv(f'{disposal_sites_output}.csv')

# Lookup table to impute missing RL/MDL values
rl_mdl_lookup = pd.read_csv(rl_mdl_file)

In [None]:
# Load the PFAS variable table from the location recorded in pfas_dict
df_pfas_vars = pd.read_csv(pfas_dict['file_location'])

# Keep the acronyms of the rows whose filter column equals 1
is_selected_pfas = df_pfas_vars[pfas_dict['pfas_filter_col']] == 1
pfas_vars = df_pfas_vars.loc[is_selected_pfas, pfas_dict['acronym_col']]

****

##### Residential

In [40]:
# Working folders used by the residential ROS / imputation cells below
paths = [
    '../../data/residential/ros/input',
    '../../data/residential/ros/output',
    '../../data/residential/ros/analysis',
    '../../data/residential/imputed',
]

for path in paths:
    # exist_ok=True creates the folder (and any missing parents) only when it
    # is absent, without a separate exists() check that could go stale
    os.makedirs(path, exist_ok=True)

In [41]:
# Raw residential export, decoded as ISO-8859-1
orig_residential_df = pd.read_csv(residential_file, encoding='iso-8859-1')

# Work on a renamed copy so the raw frame stays untouched; the PFBS header
# in the file carries trailing spaces
residential_df = orig_residential_df.rename(columns={'PFBS    ': 'PFBS'})

unique_vars = [
    'Facility',
    'Sample Collection Date',
    'Sample Field ID',
    'Sample Laboratory ID #',
]

In [42]:
# Build the per-compound RL and MDL lookups with the project helper.
# The imputation loop below indexes mrl_dict[pfas] / mdl_dict[pfas] to get
# residential_df column names. Note that pfas_vars is rebound to the helper's
# third return value.
# NOTE(review): the helper's exact matching rules live in imputation_utils - confirm there.
mrl_dict, mdl_dict, pfas_vars = imputation_utils.create_mrl_mdl_dict(residential_df, pfas_vars)

In [43]:
# Compounds listed in pfas18 that also appear as columns of the residential data
pfas_avail = list(set(pfas18) & set(residential_df.columns))

# Turn the current row index into a regular column; the imputation loop uses
# it (as 'index') among its identifier columns
residential_df = residential_df.reset_index()

In [46]:
# Build one long-format table per PFAS compound, flag censored (non-detect)
# values and route each compound to ROS, half-limit imputation, or exclusion.

# Identifier columns carried through every long-format table. Defined once,
# outside the loop, so the name also exists when pfas_avail is empty (the
# wide-format pivot later in the notebook reuses it).
id_vars = ['index', 'Facility', 'Sample Collection Date', 'Sample Field ID']

df_imputation_dict = {}
for pfas in pfas_avail:
    # Non-numeric result entries become NaN
    residential_df[pfas] = pd.to_numeric(residential_df[pfas], errors='coerce')

    # Result, RL and MDL columns of this compound, each melted to long form
    pfas_df = pd.melt(residential_df, id_vars=id_vars, value_vars=[pfas]).rename(
        columns={'value': 'Result_val', 'variable': 'Acronym'})
    mrl_df = pd.melt(residential_df, id_vars=id_vars, value_vars=[mrl_dict[pfas]]).rename(
        columns={'value': 'RL'}).drop(columns='variable')
    mdl_df = pd.melt(residential_df, id_vars=id_vars, value_vars=[mdl_dict[pfas]]).rename(
        columns={'value': 'MDL'}).drop(columns='variable')

    long_df = pfas_df.merge(mrl_df, on=id_vars).merge(mdl_df, on=id_vars)

    for numeric_col in ('Result_val', 'RL', 'MDL'):
        long_df[numeric_col] = pd.to_numeric(long_df[numeric_col], errors='coerce')

    # Rows with no result, RL or MDL at all receive the most common RL.
    # mode() is empty when every RL is missing; fall back to NaN in that case
    # instead of raising on the [0] lookup.
    rl_modes = long_df['RL'].mode()
    fallback_rl = rl_modes.iloc[0] if not rl_modes.empty else np.nan
    nothing_reported = long_df['RL'].isna() & long_df['MDL'].isna() & long_df['Result_val'].isna()
    long_df['RL'] = np.where(nothing_reported, fallback_rl, long_df['RL'])

    # Censoring limit is the smaller of RL and MDL
    long_df['limit'] = long_df[['RL', 'MDL']].min(axis=1)

    # Missing results are set to the limit ...
    long_df['Result_val'] = long_df['Result_val'].fillna(long_df['limit'])

    # ... and every value equal to its limit is flagged as censored (1)
    long_df['Result_val_cen'] = np.where(long_df['Result_val'] == long_df['limit'], 1, 0)

    long_df = long_df.sort_values('Result_val')

    # Share of censored values for this compound
    perc_non_detects = long_df['Result_val_cen'].sum() / long_df.shape[0]

    if perc_non_detects <= 0.8:
        # At most 80% censored: write the table to the ROS input folder
        long_df.to_csv(f'../../data/residential/ros/input/residential_{pfas}.csv')
    elif (perc_non_detects < non_detect_threshold) or (long_df['Result_val'].max() > 20):
        # Censored share below non_detect_threshold OR maximum value above 20:
        # keep the compound and impute censored values as half of the limit
        long_df['Result_val'] = np.where(
            long_df['Result_val_cen'] == 1, long_df['Result_val'] / 2, long_df['Result_val'])
        long_df.to_csv(f'../../data/residential/imputed/residential_{pfas}.csv')
    else:
        # Too heavily censored: report and leave the compound out of the outputs
        print(pfas, ':')
        print(round(perc_non_detects * 100, 2), '% ND - Not included in analysis')
        print(long_df['Result_val'].describe())

    df_imputation_dict[pfas] = long_df

NEtFOSAA :
93.87 % ND - Not included in analysis
count    212.000000
mean       0.614170
std        0.646551
min        0.124000
25%        0.491000
50%        0.491000
75%        0.491000
max        5.980000
Name: Result_val, dtype: float64
PFTA :
99.06 % ND - Not included in analysis
count    212.000000
mean       0.507175
std        0.188821
min        0.075000
25%        0.491000
50%        0.491000
75%        0.491000
max        2.300000
Name: Result_val, dtype: float64
NMeFOSAA :
95.75 % ND - Not included in analysis
count    212.000000
mean       0.668896
std        1.520671
min        0.044000
25%        0.440000
50%        0.440000
75%        0.440000
max       15.400000
Name: Result_val, dtype: float64


In [47]:
# Hand the ROS working folders to the R session. The absolute paths are
# derived from the same relative ROS folders the rest of this notebook uses,
# instead of one user's hard-coded drive location, and use forward slashes.
# NOTE(review): assumes the kernel's working directory is this notebook's folder - confirm.
ros_base_dir = os.path.abspath('../../data/residential/ros').replace(os.sep, '/')
robjects.globalenv['ros_analysis_location'] = f'{ros_base_dir}/analysis'
robjects.globalenv['ros_inputs_location'] = f'{ros_base_dir}/input'
robjects.globalenv['ros_outputs_location'] = f'{ros_base_dir}/output'

# Source the R script through R's own `source` function
r_source = robjects.r['source']
r_source("./ros.R")

R[write to console]: Loading required package: survival

R[write to console]: 
Attaching package: 'NADA'


R[write to console]: The following object is masked from 'package:stats':

    cor




0,1
value,[RTYPES.NILSXP]
visible,[RTYPES.LGLSXP]


In [48]:
# Read the ROS results back in and overwrite the stored values with the
# ROS-modeled ones
path, dirs, files = next(os.walk("../../data/residential/ros/output"))

for file in files:
    # File names are split as "<matrix_type>_<acronym>.<extension>"
    name_parts = file.split('_')
    matrix_type = name_parts[0]
    acronym = name_parts[1].split('.')[0]

    ros_df = pd.read_csv(path + '/' + file)

    data = df_imputation_dict[acronym]
    # Fresh 0..n-1 index so the ROS column lines up row by row on assignment
    data_filtered = data[data['Acronym'] == acronym].reset_index()

    # Overwrite with the ROS-modeled sample
    data_filtered['Result_val'] = ros_df['modeled']

    # Error in ROS - dropped censored values that exceed the max of the
    # uncensored values. Use 1/2 limit imputation wherever no modeled value exists.
    data_filtered['Result_val'] = data_filtered['Result_val'].fillna(data_filtered['limit'] / 2)

    # Write out to the imputed folder
    data_filtered.to_csv(f'../../data/residential/imputed/{matrix_type}_{acronym}.csv')

In [49]:
# Read in all imputed data
path, dirs, files = next(os.walk("../../data/residential/imputed/"))

# Collect every file first and concatenate once; growing a DataFrame with
# pd.concat inside the loop re-copies all earlier rows on every pass.
imputed_frames = []
for file in files:
    df = pd.read_csv(f'../../data/residential/imputed/{file}')
    imputed_frames.append(df)

# An empty folder still yields an empty DataFrame
res_imputed_data = pd.concat(imputed_frames, axis=0) if imputed_frames else pd.DataFrame()

In [50]:
# Pivot the stacked long-format results into one column per acronym, then
# turn the identifier levels back into ordinary columns
res_df_wide = res_imputed_data.pivot_table(
    values='Result_val',
    index=id_vars,
    columns='Acronym',
).reset_index()

In [51]:
# Label every row of the wide table with a single units string.
# NOTE(review): confirm 'g/kg' is the intended unit for these results - the
# source file name covers both effluent and sludge samples.
res_df_wide['Units'] = 'g/kg'

In [52]:
# Sanity check: number of missing values in each column of the wide table
res_df_wide.isna().sum()

Acronym
index                     0
Facility                  0
Sample Collection Date    0
Sample Field ID           0
PFBS                      0
PFDA                      0
PFDoA                     0
PFHpA                     0
PFHxA                     0
PFHxS                     0
PFNA                      0
PFOA                      0
PFOS                      0
PFTrDA                    0
PFUnA                     0
Units                     0
dtype: int64

In [53]:
# Write the residential source dataset into the output folder configured at
# the top of the notebook (same location as the former hard-coded literal),
# creating the folder first if it does not exist yet.
os.makedirs(source_attribution_modeling_dataset_output_folder, exist_ok=True)
res_df_wide.to_csv(f'{source_attribution_modeling_dataset_output_folder}/residential_source.csv')