# PFAS Toolbox -  Thomas Dairy Biosolids

### 1) Libraries Import (REQUIRED)

In [9]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import chardet
import seaborn as sns
import openpyxl 
from matplotlib.font_manager import FontProperties
from matplotlib.ticker import FixedLocator, FixedFormatter
import scipy.stats as stats
from matplotlib.ticker import AutoMinorLocator

### 2) Data Import and Export (REQUIRED)

#### Define Functions for Data Import and Export (Required)

In [10]:
def read_excel_sheet(excel_file, sheet_name):
    """
    Read a specific sheet from an Excel file into a DataFrame.

    Args:
        excel_file (str): The path to the Excel file.
        sheet_name (str): The name of the sheet to read.

    Returns:
        pd.DataFrame or None: A DataFrame containing the sheet data or None if the sheet is not found.
    """
    try:
        # Load the specified sheet from the Excel file into a DataFrame
        df = pd.read_excel(excel_file, sheet_name=sheet_name, skiprows=1)
        return df
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None

In [11]:
def dfs_to_excel(dfs, filename):
    """
    Write an array of dataframes to separate tabs in an Excel file.
    
    Parameters:
    - dfs: A dictionary where keys are the tab names and values are the DataFrames to write.
    - filename: The name of the Excel file to write to.
    """
    # Create a Pandas Excel writer using XlsxWriter as the engine.
    with pd.ExcelWriter(filename, engine='openpyxl') as writer:
        for tab_name, df in dfs.items():
            # Write each dataframe to a separate sheet
            df.to_excel(writer, sheet_name=tab_name)
    
    print(f"File '{filename}' has been written with {len(dfs)} tabs.")

#### OPTION 1) Import Data from Excel Macro

In [12]:
# Example usage:

#Specify where the file is saved use '/' instead of '\'
excel_file_path = 'U:/Research_and_Innovation/_ModellingGroupProjects/PFAS Investigations/Data and analysis/Database as of 04-15-2024.xlsm'
#Specify sheet name
sheet_name = 'PFAS Database'

#Call data import function
data_df = read_excel_sheet(excel_file_path, sheet_name)

data_df.head()

Unnamed: 0,Sampling point Date Component,Sample Number,Sampling Point,Sample Name,Sampled Date,Analysis,Component,Qualifiers,Result,Units,...,ChainLength,Pseudonym,Dominant Contributor,Sector,Depth,Edit Notes,Field,Notes,Region,Crop
0,ANODIZE SOLUTIONS (FPT)-43692.3333333333-10:2F...,259452.0,ANODIZE SOLUTIONS (FPT),,2019-08-15 08:00:00,PFAS-CL,10:2Fluorotelomersulfonic Acid (Report),,<9.40,ppt_wt_v,...,12.0,Metal Finisher B,,Metal Finishing,,,,,,
1,ANODIZE SOLUTIONS (FPT)-43692.3333333333-11Cl-...,259452.0,ANODIZE SOLUTIONS (FPT),,2019-08-15 08:00:00,PFAS-CL,11Cl-PF3OUdS (Report),,<4.70,ppt_wt_v,...,10.0,Metal Finisher B,,Metal Finishing,,,,,,
2,ANODIZE SOLUTIONS (FPT)-43692.3333333333-4:2Fl...,259452.0,ANODIZE SOLUTIONS (FPT),,2019-08-15 08:00:00,PFAS-CL,4:2Fluorotelomersulfonic Acid (Report),,<4.70,ppt_wt_v,...,6.0,Metal Finisher B,,Metal Finishing,,,,,,
3,ANODIZE SOLUTIONS (FPT)-43692.3333333333-6:2Fl...,259452.0,ANODIZE SOLUTIONS (FPT),,2019-08-15 08:00:00,PFAS-CL,6:2Fluorotelomersulfonic Acid (Report),,<19.0,ppt_wt_v,...,8.0,Metal Finisher B,,Metal Finishing,,,,,,
4,ANODIZE SOLUTIONS (FPT)-43692.3333333333-8:2Fl...,259452.0,ANODIZE SOLUTIONS (FPT),,2019-08-15 08:00:00,PFAS-CL,8:2Fluorotelomersulfonic Acid (Report),,<9.40,ppt_wt_v,...,10.0,Metal Finisher B,,Metal Finishing,,,,,,


### 3) Import Other Data (REQUIRED)

In [13]:
# Set default tick label size
plt.rcParams['xtick.labelsize'] = 22
plt.rcParams['ytick.labelsize'] = 22

In [14]:
# Import PFAS Attributes

#Specify where the file is saved use '/' instead of '\'
excel_file_path = 'U:/Research_and_Innovation/_ModellingGroupProjects/PFAS Investigations/Data and analysis/Old Databases and Spreadsheets/Database as of 03-25-2024.xlsm'
#Specify sheet name
sheet_name = 'PFAS Attributes'

df_PFAS_Attributes = pd.read_excel(excel_file_path, sheet_name=sheet_name)

In [15]:
# Create a dictionary mapping PFAS Compounds to their corresponding Abbreviation (No need to change)
pfas_mapping = dict(zip(df_PFAS_Attributes['Database'], df_PFAS_Attributes['Abbrev']))

In [16]:
# Load the Excel file into a DataFrame
df = pd.read_excel("U:/Research_and_Innovation/_ModellingGroupProjects/PFAS Investigations/Data and analysis/PFAS Toolbox_ColorsPalette.xlsx",sheet_name='ColorPalette_NonTOP')

def parse_rgb(rgb_string):
    # Check if rgb_string is a string and not empty
    if isinstance(rgb_string, str) and rgb_string:
        # Remove parentheses and split the string by commas
        rgb_values = rgb_string.strip('()').split(',')
        # Convert each value to float and return as a tuple
        return tuple(int(value)/255 for value in rgb_values)
    else:
        # Handle the case where rgb_string is not a string or is empty
        return None  # or return (0, 0, 0, 0) for a default transparent color, for example



# Create a dictionary with PFAS compounds as keys and corresponding colors as values
compound_colors_dict = {compound: parse_rgb(rgb) for compound, rgb in zip(df['Database'], df['New_RGB'])}
compound_colors_dict_abbrev = {abbrev: parse_rgb(rgb) for abbrev, rgb in zip(df['Abbrev'], df['New_RGB'])}

### 22) Thomas Dairy Timeseries BQL

#### Functions

In [82]:
def create_multilevel_pivot_table(data_df, filter_type=None, filter_qual_summary=None, filter_sample_location=None, filter_dominant_contributor=None, start_date=None, end_date=None):
    """
    Create a multilevel pivot table with filters based on 'Type', 'Qual Summary (Grade for filtering)',
    'Sample Location', 'Dominant Contributor', and 'Sample Date' columns.

    Args:
        data_df (pd.DataFrame): The DataFrame containing the data.
        filter_type (list of str, optional): List of 'Type' values to filter the data (default is None).
        filter_qual_summary (list of str, optional): List of 'Qual Summary (Grade for filtering)' values to filter the data (default is None).
        filter_sample_location (list of str, optional): List of 'Sample Location' values to filter the data (default is None).
        filter_dominant_contributor (list of str, optional): List of 'Dominant Contributor' values to filter the data (default is None).
        start_date (str, optional): Start date in 'YYYY-MM-DD' format (default is None).
        end_date (str, optional): End date in 'YYYY-MM-DD' format (default is None).

    Returns:
        pd.DataFrame: The multilevel pivot table with average values.
    """
    try:
        # Apply filters based on user-provided values
        filtered_data = data_df

        if filter_type:
            filtered_data = filtered_data[filtered_data['Type'].isin(filter_type)]

        if filter_qual_summary:
            filtered_data = filtered_data[filtered_data['Qual Summary (Grade for filtering)'].isin(filter_qual_summary)]

        if filter_sample_location:
            filtered_data = filtered_data[filtered_data['Sample Location'].isin(filter_sample_location)]

        if filter_dominant_contributor:
            filtered_data = filtered_data[filtered_data['Dominant Contributor'].isin(filter_dominant_contributor)]

        if start_date and end_date:
            filtered_data = filtered_data[(filtered_data['Sample Date'] >= start_date) & (filtered_data['Sample Date'] <= end_date)]

        # Create the pivot table with multilevel rows
        pivot_table = pd.pivot_table(filtered_data, values='Quant Incl Estimates Incl Qual Flags',
                                     index=['Sample Location'],
                                     columns='PFAS Compound', aggfunc='mean',fill_value=np.nan)

        return pivot_table

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None

#### Define Filters and Settings

In [83]:
#filter_type (list of str, optional): List of 'Type' values to filter the data (default is None).
filter_type=['Soils']
#qual_summary_filter (list): Filter values for 'Qual Summary (Grade for filtering)'.
filter_qual_summary = ['Good','Probably Ok','','Probably ok','Probably okay','probably ok']

#sample_location_filter (list): Filter values for 'Sample Location'.
filter_sample_location = ['SHR4MM-Control Site Surface','SHR4MM-Control Site 0 1ft','SHR4MM-Control Site 1 2ft','SHRBTG30-DX Site1 Soil Surface','SHRBTG30-DX Site1 Soil 0 1ft',
                          'SHRBTG30-DX Site1 Soil 1 2ft','SHRBTG30-DX Site2 Soil Surface','SHRBTG30-DX Site2 Soil 0 1ft','SHRBTG30-DX Site2 Soil 1 2ft','SHRBTG30-DX Site3 Soil-Surface',
                          'SHRBTG30-DX Site3 Soil 0 1ft','SHRBTG30-DX Site3 Soil 1 2ft']  
#start_date (str): Start date for filtering 'Sample Date', in 'YYYY-MM-DD' format.
start_date = '2023-03-01'  
#end_date (str): End date for filtering 'Sample Date', in 'YYYY-MM-DD' format.
end_date = '2024-04-30' 
#filter_dominant_contributor (list of str, optional): List of 'Dominant Contributor' values to filter the data (default is None).
filter_dominant_contributor = None


#### Create Pivot Table for Average of Quant Incl Estimates Incl Qual Flags, date as rows and sample location as columns

In [81]:
pivot_table = create_multilevel_pivot_table(data_df, filter_type=filter_type, filter_qual_summary=filter_qual_summary, filter_sample_location=filter_sample_location, filter_dominant_contributor=None, start_date=start_date, end_date=end_date)
# Rename the index of data_df using the mapping (No need to change)
pivot_table.columns = pivot_table.columns.map(pfas_mapping)
pivot_table

PFAS Compound,11Cl-PF3OUdS,3:3FTCA,4:2FTS,5:3FTCA,6:2FTS,7:3FTCA,8:2FTS,9Cl-PF3ONS,ADONA,HFPO-DA,...,PFNS,PFNA,FOSA,PFOS,PFOA,PFPeS,PFPeA,PFTeDA,PFTrDA,PFUnA
Sample Location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SHR4MM-Control Site 0 1ft,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
SHR4MM-Control Site 1 2ft,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
SHR4MM-Control Site Surface,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
SHRBTG30-DX Site1 Soil 0 1ft,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
SHRBTG30-DX Site1 Soil 1 2ft,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
SHRBTG30-DX Site1 Soil Surface,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
SHRBTG30-DX Site2 Soil 0 1ft,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
SHRBTG30-DX Site2 Soil 1 2ft,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
SHRBTG30-DX Site2 Soil Surface,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
SHRBTG30-DX Site3 Soil 0 1ft,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [75]:
# Dictionary mapping index to new values
index_to_value = {
    'SHR4MM-Control Site Surface': 'Contorl Site',
    'SHR4MM-Control Site 0 1ft': 'Control Site',
    'SHR4MM-Control Site 1 2ft': 'Control Site',
    'SHRBTG30-DX Site1 Soil Surface': 'Site 1',
    'SHRBTG30-DX Site1 Soil 0 1ft': 'Site 1',
    'SHRBTG30-DX Site1 Soil 1 2ft' : 'Site 1',
    'SHRBTG30-DX Site2 Soil Surface': 'Site 2',
    'SHRBTG30-DX Site2 Soil 0 1ft': 'Site 2',
    'SHRBTG30-DX Site2 Soil 1 2ft': 'Site 2',
    'SHRBTG30-DX Site3 Soil-Surface': 'Site 3',
    'SHRBTG30-DX Site3 Soil 0 1ft' : 'Site 3',
    'SHRBTG30-DX Site3 Soil 1 2ft': 'Site 3'
}

index_to_value_Depth = {
    'SHR4MM-Control Site Surface': 'Surface',
    'SHR4MM-Control Site 0 1ft': '0-1 ft',
    'SHR4MM-Control Site 1 2ft': '1-2 ft',
    'SHRBTG30-DX Site1 Soil Surface': 'Surface',
    'SHRBTG30-DX Site1 Soil 0 1ft': '0-1 ft',
    'SHRBTG30-DX Site1 Soil 1 2ft' : '1-2 ft',
    'SHRBTG30-DX Site2 Soil Surface': 'Surface',
    'SHRBTG30-DX Site2 Soil 0 1ft': '0-1 ft',
    'SHRBTG30-DX Site2 Soil 1 2ft': '1-2 ft',
    'SHRBTG30-DX Site3 Soil-Surface': 'Surface',
    'SHRBTG30-DX Site3 Soil 0 1ft' : '0-1 ft',
    'SHRBTG30-DX Site3 Soil 1 2ft': '1-2'
}

# Create a new column 'Category' by mapping the index using the dictionary
pivot_table['Sample Location'] = pivot_table.index.map(index_to_value)
pivot_table['Depth'] = pivot_table.index.map(index_to_value_Depth)

pivot_table

PFAS Compound,11Cl-PF3OUdS,3:3FTCA,4:2FTS,5:3FTCA,6:2FTS,7:3FTCA,8:2FTS,9Cl-PF3ONS,ADONA,HFPO-DA,...,FOSA,PFOS,PFOA,PFPeS,PFPeA,PFTeDA,PFTrDA,PFUnA,Sample Location,Depth
Sample Location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SHR4MM-Control Site 0 1ft,0,0,0,0,0,0,0.0,0,0,0,...,0.0,0.076,0.067,0,0.0,0.0,0.0,0.0,Control Site,0-1 ft
SHR4MM-Control Site 1 2ft,0,0,0,0,0,0,0.0,0,0,0,...,0.0,0.0,0.13,0,0.0,0.0,0.0,0.0,Control Site,1-2 ft
SHR4MM-Control Site Surface,0,0,0,0,0,0,0.0,0,0,0,...,0.0,0.076,0.0,0,0.0,0.0,0.0,0.0,Contorl Site,Surface
SHRBTG30-DX Site1 Soil 0 1ft,0,0,0,0,0,0,0.0,0,0,0,...,0.0,0.12,0.83,0,0.0,0.0,0.0,0.0,Site 1,0-1 ft
SHRBTG30-DX Site1 Soil 1 2ft,0,0,0,0,0,0,0.0,0,0,0,...,0.0,0.0,0.11,0,0.79,0.0,0.0,0.0,Site 1,1-2 ft
SHRBTG30-DX Site1 Soil Surface,0,0,0,0,0,0,0.0,0,0,0,...,1.6,5.1,5.6,0,0.0,0.34,0.23,0.66,Site 1,Surface
SHRBTG30-DX Site2 Soil 0 1ft,0,0,0,0,0,0,0.0,0,0,0,...,0.0,0.085,5.5,0,0.33,0.0,0.0,0.0,Site 2,0-1 ft
SHRBTG30-DX Site2 Soil 1 2ft,0,0,0,0,0,0,0.0,0,0,0,...,0.0,0.0,0.5,0,1.3,0.0,0.0,0.0,Site 2,1-2 ft
SHRBTG30-DX Site2 Soil Surface,0,0,0,0,0,0,0.35,0,0,0,...,1.2,4.2,4.8,0,0.0,0.28,0.19,0.65,Site 2,Surface
SHRBTG30-DX Site3 Soil 0 1ft,0,0,0,0,0,0,0.0,0,0,0,...,0.0,0.16,2.6,0,0.44,0.0,0.0,0.0,Site 3,0-1 ft


In [76]:
# Set 'Sample Location' as the new index
pivot_table.set_index('Sample Location', inplace=True)

In [77]:
pivot_table

PFAS Compound,11Cl-PF3OUdS,3:3FTCA,4:2FTS,5:3FTCA,6:2FTS,7:3FTCA,8:2FTS,9Cl-PF3ONS,ADONA,HFPO-DA,...,PFNA,FOSA,PFOS,PFOA,PFPeS,PFPeA,PFTeDA,PFTrDA,PFUnA,Depth
Sample Location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Control Site,0,0,0,0,0,0,0.0,0,0,0,...,0.0,0.0,0.076,0.067,0,0.0,0.0,0.0,0.0,0-1 ft
Control Site,0,0,0,0,0,0,0.0,0,0,0,...,0.0,0.0,0.0,0.13,0,0.0,0.0,0.0,0.0,1-2 ft
Contorl Site,0,0,0,0,0,0,0.0,0,0,0,...,0.0,0.0,0.076,0.0,0,0.0,0.0,0.0,0.0,Surface
Site 1,0,0,0,0,0,0,0.0,0,0,0,...,0.0,0.0,0.12,0.83,0,0.0,0.0,0.0,0.0,0-1 ft
Site 1,0,0,0,0,0,0,0.0,0,0,0,...,0.0,0.0,0.0,0.11,0,0.79,0.0,0.0,0.0,1-2 ft
Site 1,0,0,0,0,0,0,0.0,0,0,0,...,1.4,1.6,5.1,5.6,0,0.0,0.34,0.23,0.66,Surface
Site 2,0,0,0,0,0,0,0.0,0,0,0,...,0.0,0.0,0.085,5.5,0,0.33,0.0,0.0,0.0,0-1 ft
Site 2,0,0,0,0,0,0,0.0,0,0,0,...,0.0,0.0,0.0,0.5,0,1.3,0.0,0.0,0.0,1-2 ft
Site 2,0,0,0,0,0,0,0.35,0,0,0,...,2.0,1.2,4.2,4.8,0,0.0,0.28,0.19,0.65,Surface
Site 3,0,0,0,0,0,0,0.0,0,0,0,...,0.086,0.0,0.16,2.6,0,0.44,0.0,0.0,0.0,0-1 ft


In [78]:
#dfs: A dictionary where keys are the tab names and values are the DataFrames to write.
dfs = {'Biosolids': pivot_table}

#filename: The name of the Excel file to write to.
filename= 'Biosolids.xlsx'
dfs_to_excel(dfs, filename)

File 'Biosolids.xlsx' has been written with 1 tabs.
