Date: 11/01/2024

Point of Contact: Abigayle Hodson, Abigayle_Hodson@lbl.gov

Organization: Lawrence Berkeley National Laboratory

Purpose: The purpose of this notebook is to use data from a variety of sources, primarily various releases of the Clean Watersheds Needs Survey (CWNS), to create a list of active unit processes for each wastewater treatment plant in the United States for a specified year.

Data Sources:
*   Clean Watersheds Needs Survey (CWNS) (U.S. EPA, [2004](https://www.epa.gov/cwns/clean-watersheds-needs-survey-cwns-2004-report-and-data), [2008](https://ordspub.epa.gov/ords/cwns2008/f?p=cwns2008:25:), [2012](https://ordspub.epa.gov/ords/cwns2012/f?p=cwns2012:25:), and [2022](https://sdwis.epa.gov/ords/sfdw_pub/r/sfdw/cwns_pub/data-download?session=9748529459785))
*   Combined Heat and Power and Microgrid Installation Databases ([U.S. DOE, 2024](https://doe.icfwebservices.com/downloads/chp))
*   Water Environment Federation Biogas Database ([WEF, 2024](https://app.powerbi.com/view?r=eyJrIjoiMGFjZDFjZmItMjQ5Yi00ZTlhLWJmNTQtODFiNjlkYjFlODJjIiwidCI6ImI3ZTk3ODAyLTJhNjktNDc3ZS1iN2QyLWY0ZDE2MWMyMTBjYiIsImMiOjF9))

In [None]:
#import necessary libraries
import pandas as pd
import numpy as np

#show every column when a dataframe is displayed (no truncation)
pd.options.display.max_columns = None

#root folder that every input/output file path in this notebook is built from
path = 'wwtp_energy_methods_comparison/'

Mounted at /content/gdrive/


In [None]:
#specify year to create cumulative unit process list for (e.g. scenario = 2012 reflects treatment configurations in 2012)
SCENARIO_OPTIONS = (2012, 2022, 2042)
scenario = 2042 #options = [2012, 2022, 2042]

#fail fast on an unsupported year: later cells select a '<scenario>_IND' column that is only created for the years above
if str(scenario) not in [str(year) for year in SCENARIO_OPTIONS]:
  raise ValueError(f'scenario must be one of {SCENARIO_OPTIONS}, got {scenario!r}')

In [None]:
#read in unit processes reported in the 2004, 2008, and 2012 releases of CWNS (CWNS numbers are read as strings)
up2012 = pd.read_csv(path + 'input_data/facility_information/cwns/2012/2012_SUMMARY_UNIT_PROCESS.csv', dtype = {'CWNS_NUMBER':str}, encoding = 'latin1')
up2008 = pd.read_csv(path + 'input_data/facility_information/cwns/2008/2008_SUMMARY_UNIT_PROCESS.csv',dtype = {'CWNS_NUMBER':str}, encoding = 'latin1')
#low_memory = False parses the file in one pass so column dtypes are inferred consistently
#NOTE(review): the saved output of this cell shows a warning raised on this read (presumably a mixed-dtype DtypeWarning) - confirm it is gone
up2004 = pd.read_csv(path + 'input_data/facility_information/cwns/2004/2004_Unit_Processes.csv', dtype = {'CWNS_NUMBER':str}, encoding = 'latin1', low_memory = False)

#aggregate 2004, 2008, and 2012 unit process lists and drop/rename columns
up_old = pd.concat([up2012, up2008, up2004], axis = 0)
up_old = up_old.drop(columns = ['BACKUP_IND','PLANNED_YEAR','ADDITIONAL_NOTES','LAST_UPDATED_TS','BLANK','CHANGE_TYPE_CAT','SORT_SEQUENCE','KEEP_UP_CODE', 'CHGTP_NAME_CAT','TREATMENT_TYPE','Notes'])
up_old = up_old.rename(columns = {'CWNS_NUMBER':'CWNS_NUM'})

#add a leading zero to CWNS ids with a length less than 11 to ensure proper merge
up_old['CWNS_NUM'] = ['0' + str(cwns) if len(str(cwns)) < 11 else str(cwns) for cwns in up_old['CWNS_NUM']]

#reconcile unit process naming conventions between different CWNS releases
upnames = pd.read_csv(path + 'input_data/facility_information/cwns/UNIT_PROCESS_NAMES.csv')
up_old = pd.merge(left = up_old, right = upnames, how = 'left', left_on = 'UNIT_PROCESS', right_on = 'ORIGINAL_UP_NAME')
up_old = up_old.drop(columns = ['ORIGINAL_UP_NAME'])

#remove processes listed for abandonment in 2004, 2008, or 2012 and processes listed as both PRES_IND = N and PROJ_IND = N
up_old = up_old.loc[up_old['CHANGE_TYPE'] != 'Abandonment']
up_old = up_old.loc[~((up_old['PRES_IND'] == 'N') & (up_old['PROJ_IND'] == 'N'))]
#take an explicit copy so the flag assignments below modify this dataframe rather than a slice of the merged one
up_old = up_old[['CWNS_NUM','REPORT_YEAR','PRES_IND','PROJ_IND','FINAL_UNIT_PROCESS_NAME']].copy()

#change formatting of present and projected indices to binary (values other than Y/N are left as-is)
for flag_col in ['PRES_IND','PROJ_IND']:
  up_old.loc[up_old[flag_col] == 'Y', flag_col] = 1
  up_old.loc[up_old[flag_col] == 'N', flag_col] = 0

#read in unit processes from the 2022 CWNS
up2022 = pd.read_csv(path + 'input_data/facility_information/cwns/2022/UNIT_PROCESSES.csv', dtype = {'CWNS_ID' : str})
up2022 = up2022.rename(columns = {'CWNS_ID':'CWNS_NUM'})

#add a leading zero to CWNS ids with a length less than 11 to ensure proper merge
up2022['CWNS_NUM'] = ['0' + str(cwns) if len(str(cwns)) < 11 else str(cwns) for cwns in up2022['CWNS_NUM']]

#change formatting of 2022 unit process names to match that of prior years
#note: 'Biological Treatment, Other' was manually corrected to be more specific. 'Chemical N Removal' was assumed to be roughly the same energy intensity as 'Chemical P removal'
upnames_2022 = pd.read_csv(path + 'input_data/facility_information/cwns/UNIT_PROCESS_NAMES_2022.csv')
up2022 = pd.merge(left = up2022, right = upnames_2022, how = 'left', left_on = 'UNIT_PROCESS', right_on = '2022_UNIT_PROCESS_NAME')

#filter to relevant columns and rename to match the formatting of old unit process dataframes
#(rename returns a new dataframe, so the flag assignments below do not act on a slice)
up2022 = up2022[['CWNS_NUM','FINAL_UNIT_PROCESS_NAME','EXISTING_FLAG','PLANNED_FLAG']].rename(columns = {'EXISTING_FLAG':'PRES_IND','PLANNED_FLAG':'PROJ_IND'})

#change formatting of present and projected indices to binary; a missing flag is treated as 0
for flag_col in ['PRES_IND','PROJ_IND']:
  up2022.loc[up2022[flag_col] == 'Y', flag_col] = 1
  up2022.loc[up2022[flag_col] == 'N', flag_col] = 0
  up2022.loc[pd.isna(up2022[flag_col]), flag_col] = 0
up2022['REPORT_YEAR'] = 2022

#create unit process list which contains information from 2004, 2008, 2012, and 2022 CWNS
uplist_all = pd.concat([up2022, up_old], axis = 0)

#sort unit processes by facility and reporting year
uplist_all = uplist_all.sort_values(by = ['CWNS_NUM','REPORT_YEAR'], ascending = True)

  up2004 = pd.read_csv(path + 'input_data/facility_information/cwns/2004/2004_Unit_Processes.csv', dtype = {'CWNS_NUMBER':str}, encoding = 'latin1')


In [None]:
#upload facilities identified as producing electricity in DOE's Combined Heat and Power Installation database, pre-filtered to wastewater treatment plants and manually assigned a CWNS number based on facility name and location
doe_biogas = pd.read_csv(path + 'input_data/facility_information/biogas/biogas_wwtps_doe.csv', dtype = {'CWNS_NUM':str})

#drop facilities without an identified CWNS number
doe_biogas = doe_biogas.dropna(subset = ['CWNS_NUM']).reset_index(drop = True)

#add a leading zero to facilities with improperly recorded CWNS number
doe_biogas['CWNS_NUM'] = ['0' + str(cwns) if len(str(cwns)) < 11 else str(cwns) for cwns in doe_biogas['CWNS_NUM']]

#use DOE biogas database to add anaerobic digestion to unit process list
doe_biogas_ad = doe_biogas[['CWNS_NUM','Latest Install Year']].rename(columns = {'Latest Install Year':'REPORT_YEAR'})
doe_biogas_ad['FINAL_UNIT_PROCESS_NAME'] = 'Biosolids Anaerobic Digestion, Other'
doe_biogas_ad['PRES_IND'] = 1
doe_biogas_ad['PROJ_IND'] = 1

#use DOE biogas database to add biogas utilization facilities to unit process list
#(label corrected: this dataframe previously reused the anaerobic digestion name from the block above)
doe_biogas_bg = doe_biogas[['CWNS_NUM','Latest Install Year']].rename(columns = {'Latest Install Year':'REPORT_YEAR'})
doe_biogas_bg['FINAL_UNIT_PROCESS_NAME'] = 'Biosolids Digestor Gas Utilization Facilities'
doe_biogas_bg['PRES_IND'] = 1
doe_biogas_bg['PROJ_IND'] = 1

#upload facilities identified as producing electricity in WEF's Biogas Database, downloaded prior to website update which removed option to download data
wef_biogas = pd.read_csv(path + 'input_data/facility_information/biogas/biogas_wwtps_wef.csv', dtype = {'CWNS_NUM':str})

#drop facilities without an identified CWNS number
wef_biogas = wef_biogas.dropna(subset = ['CWNS_NUM'])

#add a leading zero to facilities with improperly recorded CWNS number
wef_biogas['CWNS_NUM'] = ['0' + str(cwns) if len(str(cwns)) < 11 else str(cwns) for cwns in wef_biogas['CWNS_NUM']]

#upload facilities that project using energy recovery from the 2022 CWNS
#(the id column is still named CWNS_ID at read time, so that is the column whose dtype must be forced to string)
cwns_biogas = pd.read_csv(path + 'input_data/facility_information/cwns/2022/UNIT_PROCESSES.csv', dtype = {'CWNS_ID':str})

#rename id column to match the other data sources
cwns_biogas = cwns_biogas.rename(columns = {'CWNS_ID':'CWNS_NUM'})

#add a leading zero to facilities with improperly recorded CWNS number
cwns_biogas['CWNS_NUM'] = ['0' + str(cwns) if len(str(cwns)) < 11 else str(cwns) for cwns in cwns_biogas['CWNS_NUM']]

#filter to facilities that use or project using biogas for energy recovery
#(.copy() so the column assignments below act on an independent dataframe rather than a slice)
cwns_biogas = cwns_biogas.loc[(cwns_biogas['UNIT_PROCESS'] == 'Biosolids Anaerobic Digestion with Energy Recovery') & ((cwns_biogas['EXISTING_FLAG'] == 'Y') | (cwns_biogas['PLANNED_FLAG'] == 'Y'))].copy()
cwns_biogas['PROJ_IND'] = 1
cwns_biogas = cwns_biogas.rename(columns = {'EXISTING_FLAG':'PRES_IND'})

#change formatting of present index to binary; a missing flag is treated as 0
cwns_biogas.loc[cwns_biogas['PRES_IND'] == 'Y', 'PRES_IND'] = 1
cwns_biogas.loc[cwns_biogas['PRES_IND'] == 'N', 'PRES_IND'] = 0
cwns_biogas.loc[pd.isna(cwns_biogas['PRES_IND']), 'PRES_IND'] = 0

#add anaerobic digestion from WEF database to unit process list
#NOTE(review): REPORT_YEAR = 2013 presumably reflects the vintage of the WEF data - confirm
wef_biogas_ad = wef_biogas.loc[wef_biogas['AD'] == 'yes', ['CWNS_NUM', 'AD']].copy()
wef_biogas_ad['FINAL_UNIT_PROCESS_NAME'] = 'Biosolids Anaerobic Digestion, Other'
wef_biogas_ad['REPORT_YEAR'] = 2013
wef_biogas_ad['PRES_IND'] = 1
wef_biogas_ad['PROJ_IND'] = 1
wef_biogas_ad = wef_biogas_ad[['CWNS_NUM','FINAL_UNIT_PROCESS_NAME','REPORT_YEAR','PRES_IND','PROJ_IND']]

#add biogas utilization facilities from WEF database to unit process list
wef_biogas_bg = wef_biogas.loc[wef_biogas['Biogas_utilized'] == 'yes'].copy()
wef_biogas_bg['FINAL_UNIT_PROCESS_NAME'] = 'Biosolids Digestor Gas Utilization Facilities'
wef_biogas_bg['REPORT_YEAR'] = 2013
wef_biogas_bg['PRES_IND'] = 1
wef_biogas_bg['PROJ_IND'] = 1
wef_biogas_bg = wef_biogas_bg[['CWNS_NUM','FINAL_UNIT_PROCESS_NAME','REPORT_YEAR','PRES_IND','PROJ_IND']]

#add anaerobic digestion implied by energy recovery in the 2022 CWNS to unit process list
cwns_biogas_ad = cwns_biogas[['CWNS_NUM','PRES_IND','PROJ_IND']].copy()
cwns_biogas_ad['FINAL_UNIT_PROCESS_NAME'] = 'Biosolids Anaerobic Digestion, Other'
cwns_biogas_ad['REPORT_YEAR'] = 2022

#merge additional anaerobic digestion processes with cumulative unit process list
#NOTE(review): doe_biogas_bg and wef_biogas_bg (biogas utilization) are built above but not merged here - confirm whether that is intentional
uplist_all = pd.concat([uplist_all, wef_biogas_ad, doe_biogas_ad, cwns_biogas_ad], axis = 0, ignore_index = False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wef_biogas_bg['FINAL_UNIT_PROCESS_NAME'] = 'Biosolids Digestor Gas Utilization Facilities'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wef_biogas_bg['REPORT_YEAR'] = 2013
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wef_biogas_bg['PRES_IND'] = 1
A value is trying to be set on a copy of a slice

In [None]:
#look up the WERF code of each key unit process (ie. 'Activated Sludge' is assigned the code 'AS'); note, not all unit processes receive a code because not all processes are considered when forming treatment trains
up_eicodes = pd.read_csv(path + 'input_data/facility_information/cwns/UNIT_PROCESS_EI_CODES_WERF.csv')

#keep a single code per unit process name so the left merge below cannot add rows
werf_code_lookup = up_eicodes[['FINAL_UNIT_PROCESS_NAME','WERF_CODE']].drop_duplicates(subset = ['FINAL_UNIT_PROCESS_NAME'])
uplist_eicodes = pd.merge(left = uplist_all, right = werf_code_lookup, how = 'left', on = 'FINAL_UNIT_PROCESS_NAME')

#discard unit processes without an associated WERF code
uplist_eicodes = uplist_eicodes.dropna(subset = ['WERF_CODE'])

In [None]:
#upload nutrient removal columns from 2012 CWNS
#NOTE(review): unlike the other data sources, no leading zero is added to CWNS numbers here - confirm the ids in this file are already 11 characters
nutr_rem = pd.read_excel(path + 'input_data/facility_information/cwns/2012/SUMMARY_EFFLUENT.xlsx', dtype = {'CWNS_NUMBER':str})
nutr_rem.rename(columns = {'CWNS_NUMBER':'CWNS_NUM'}, inplace = True)

#unit process implied by each nutrient removal flag: (flag column, unit process name, PRES_IND, WERF code)
#present (PRES_) flags add a process with PRES_IND = 1; projected (PROJ_) flags add it with PRES_IND = 0; PROJ_IND is always 1
nutr_flag_processes = [
  ('PRES_NITROGEN_REMOVAL', 'Biological Nutrient Removal', 1, 'BNR'),
  ('PRES_PHOSPHOROUS_REMOVAL', 'Phosphorus Removal, Chemical', 1, 'CHEM-P'),
  ('PRES_AMMONIA_REMOVAL', 'Nitrification, Biological (Other)', 1, 'NIT'),
  ('PROJ_NITROGEN_REMOVAL', 'Biological Nutrient Removal', 0, 'BNR'),
  ('PROJ_PHOSPHOROUS_REMOVAL', 'Phosphorus Removal, Chemical', 0, 'CHEM-P'),
  ('PROJ_AMMONIA_REMOVAL', 'Nitrification, Biological (Other)', 0, 'NIT'),
]
nutr_flag_cols = [flag_col for flag_col, _, _, _ in nutr_flag_processes]

#use nutrient removal flags to collect the unit processes to add (facility by facility, flags in the order listed above)
nutr_records = []
for cwns_num, *flag_values in nutr_rem[['CWNS_NUM'] + nutr_flag_cols].itertuples(index = False, name = None):
  for flag_value, (_, up_name, pres_ind, werf_code) in zip(flag_values, nutr_flag_processes):
    if flag_value == 'Y':
      nutr_records.append({'CWNS_NUM':cwns_num, 'FINAL_UNIT_PROCESS_NAME':up_name, 'PRES_IND':pres_ind, 'PROJ_IND':1, 'REPORT_YEAR':2012, 'WERF_CODE':werf_code})

#append all collected processes in a single concat (growing the dataframe one row at a time is quadratic)
if nutr_records:
  uplist_eicodes = pd.concat([uplist_eicodes, pd.DataFrame(nutr_records)], axis = 0, ignore_index = True)

In [None]:
def _drop_superseded_processes(up_table, in_category):
  '''
  Helper that removes the outdated rows of a single process category (e.g. solids or secondary treatment)
    Parameters:
      up_table = dataframe of unit processes with a unique index and CWNS_NUM and REPORT_YEAR columns
      in_category = boolean series aligned with up_table that flags the rows belonging to the category
    Returns:
      up_table without the category rows reported before the most recent category row of the same facility
  '''
  category_rows = up_table.loc[in_category]

  #only facilities with more than one reported process in the category are candidates for removal
  multiple_reported = category_rows.duplicated(subset = 'CWNS_NUM', keep = False)

  #most recent reporting year of the category for each facility, broadcast back onto the category rows
  latest_year = category_rows['CWNS_NUM'].map(category_rows.groupby('CWNS_NUM')['REPORT_YEAR'].max())

  #a row is outdated when its facility has several category rows and the row is not from the latest year
  #(a missing REPORT_YEAR never equals the latest year, so such rows are also removed for these facilities)
  outdated = multiple_reported & (category_rows['REPORT_YEAR'] != latest_year)

  return up_table.drop(index = outdated[outdated].index)

def clear_old_treatment(uplist_yr_table, scenario):
  '''
  Function that removes outdated secondary/solids processes from cumulative unit process list
    Parameters:
      uplist_yr_table = dataframe of all reported unit processes relevant to treatment train assignment;
                        must contain CWNS_NUM, REPORT_YEAR, '<scenario>_IND', WERF_CODE, and FINAL_UNIT_PROCESS_NAME (not modified)
      scenario = year for treatment train assignment (2012, 2022, or 2042); used to build the '<scenario>_IND' column name
    Returns:
      uplist_werf_yr_final = modified dataframe of reported unit processes relevant to treatment train assignment, excluding old secondary/solids treatment processes,
                             with one row per facility and WERF code and a fresh 0..n-1 index
  '''
  keep_cols = ['CWNS_NUM','REPORT_YEAR',f'{scenario}_IND','WERF_CODE','FINAL_UNIT_PROCESS_NAME']

  #work on a copy with a unique index so rows can be dropped by label without touching the caller's dataframe
  uplist_werf_yr = uplist_yr_table.loc[:, keep_cols].reset_index(drop = True)

  #remove less recently reported solids processes from cumulative unit process list
  solids_codes = ['AED','AND','LIME','FBI','MHI','BIODRY','BS_LAGOON']
  is_solids = uplist_werf_yr['WERF_CODE'].isin(solids_codes)
  uplist_werf_yr = _drop_superseded_processes(uplist_werf_yr, is_solids)

  #secondary treatment processes are identified by code fragment, excluding biogas utilization, biosolids lagoons, and polishing lagoons
  #(the exclusions are needed because e.g. 'BIOGAS_CWNS' contains 'AS' and 'BS_LAGOON' contains 'LAGOON')
  werf_code = uplist_werf_yr['WERF_CODE']
  is_secondary = werf_code.str.contains('AS|TF|POND|LAGOON', na = False) & ~werf_code.isin(['BIOGAS_CWNS','BS_LAGOON','LAGOON_POL'])

  #remove less recently reported secondary processes from cumulative unit process list
  uplist_werf_yr = _drop_superseded_processes(uplist_werf_yr, is_secondary)

  #keep the first remaining row for each facility and WERF code
  uplist_werf_yr_final = uplist_werf_yr.drop_duplicates(subset = ['CWNS_NUM','WERF_CODE'], ignore_index = True)

  return uplist_werf_yr_final

In [None]:
#flag the scenario years in which each unit process counts as active:
#2012/2022 = reported as present no later than that year; 2042 = reported as present or projected in any release
is_present = uplist_eicodes['PRES_IND'] == 1
is_projected = uplist_eicodes['PROJ_IND'] == 1
report_year = uplist_eicodes['REPORT_YEAR']
uplist_eicodes.loc[(report_year <= 2012) & is_present, '2012_IND'] = 1
uplist_eicodes.loc[(report_year <= 2022) & is_present, '2022_IND'] = 1
uplist_eicodes.loc[is_present | is_projected, '2042_IND'] = 1

#keep only the unit processes flagged for the selected scenario, along with the columns needed downstream
scenario_col = f'{str(scenario)}_IND'
scenario_cols = ['CWNS_NUM','REPORT_YEAR',scenario_col,'WERF_CODE','FINAL_UNIT_PROCESS_NAME']
uplist_werf_scenario = uplist_eicodes.loc[uplist_eicodes[scenario_col] == 1, scenario_cols]
uplist_werf_scenario = uplist_werf_scenario.dropna(subset = ['WERF_CODE'])

#order by facility and reporting year, then retain only the most recently reported secondary/solids processes
uplist_werf_scenario = uplist_werf_scenario.sort_values(by = ['CWNS_NUM','REPORT_YEAR'], ascending = True, ignore_index = True)
uplist_werf_scenario_final = clear_old_treatment(uplist_werf_scenario, str(scenario))

#keep one row per facility and WERF code
uplist_werf_scenario_final = uplist_werf_scenario_final.drop_duplicates(subset = ['CWNS_NUM','WERF_CODE'], ignore_index = True)

#export cumulative unit process list for the selected scenario
export_file = path + 'input_data/process_methods/cumulative_unit_process_list_%s.csv' %str(scenario)
uplist_werf_scenario_final.to_csv(export_file, index = False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  uplist_yr_table_dig['DUP'] = uplist_yr_table_dig.duplicated(subset = 'CWNS_NUM', keep = False)
