In [1]:
# Ordered list of the column names kept in the final DCIPHER export frame.
dcipher_clms = [
    "reporting_jurisdiction",
    "county_names",
    "other_jurisdiction",
    "zipcode",
    "population_served",
    "sewage_travel_time",
    "sample_location",
    "sample_location_specify",
    "institution_type",
    "epaid",
    "wwtp_name",
    "wwtp_jurisdiction",
    "capacity_mgd",
    "industrial_input",
    "stormwater_input",
    "influent_equilibrated",
    "sample_type",
    "composite_freq",
    "sample_matrix",
    "collection_storage_time",
    "collection_storage_temp",
    "pretreatment",
    "pretreatment_specify",
    "solids_separation",
    "concentration_method",
    "extraction_method",
    "pre_conc_storage_time",
    "pre_conc_storage_temp",
    "pre_ext_storage_time",
    "pre_ext_storage_temp",
    "tot_conc_vol",
    "ext_blank",
    "rec_eff_target_name",
    "rec_eff_spike_matrix",
    "rec_eff_spike_conc",
    "pasteurized",
    "pcr_target",
    "pcr_target_ref",
    "pcr_type",
    "lod_ref",
    "hum_frac_target_mic",
    "hum_frac_target_mic_ref",
    "hum_frac_target_chem",
    "hum_frac_target_chem_ref",
    "other_norm_name",
    "other_norm_ref",
    "quant_stan_type",
    "stan_ref",
    "inhibition_method",
    "num_no_target_control",
    "sample_collect_date",
    "sample_collect_time",
    "time_zone",
    "flow_rate",
    "ph",
    "conductivity",
    "tss",
    "collection_water_temp",
    "equiv_sewage_amt",
    "sample_id",
    "lab_id",
    "test_result_date",
    "sars_cov2_units",
    "sars_cov2_avg_conc",
    "sars_cov2_std_error",
    "sars_cov2_cl_95_lo",
    "sars_cov2_cl_95_up",
    "sars_cov2_below_lod",
    "lod_sewage",
    "ntc_amplify",
    "rec_eff_percent",
    "inhibition_detect",
    "inhibition_adjust",
    "hum_frac_mic_conc",
    "hum_frac_mic_unit",
    "hum_frac_chem_conc",
    "hum_frac_chem_unit",
    "other_norm_conc",
    "other_norm_unit",
    "quality_flag",
]

# Three-letter county abbreviation -> full county name.
# condense_county_columns looks up the last three characters of each
# county checkbox column name in this mapping.
county_keys = {
    "ada": "Adams",
    "aso": "Asotin",
    "ben": "Benton",
    "che": "Chelan",
    "clm": "Clallam",
    "clk": "Clark",
    "col": "Columbia",
    "cow": "Cowlitz",
    "dou": "Douglas",
    "fer": "Ferry",
    "fra": "Franklin",
    "gar": "Garfield",
    "grt": "Grant",
    "ghb": "Grays Harbor",
    "isl": "Island",
    "jef": "Jefferson",
    "kin": "King",
    "ksp": "Kitsap",
    "ktt": "Kittitas",
    "klk": "Klickitat",
    "lew": "Lewis",
    "lin": "Lincoln",
    "mas": "Mason",
    "oka": "Okanogan",
    "pac": "Pacific",
    "por": "Pend Oreille",
    "per": "Pierce",
    "san": "San Juan",
    "skg": "Skagit",
    "skm": "Skamania",
    "sno": "Snohomish",
    "spo": "Spokane",
    "ste": "Stevens",
    "thu": "Thurston",
    "wah": "Wahkiakum",
    "wal": "Walla Walla",
    "wha": "Whatcom",
    "wit": "Whitman",
    "yak": "Yakima",
}


def condense_county_columns(df_pid170, county_lookup=None):
    """
    Collapse the per-county checkbox columns of REDCap PID170 into one column.

    REDCap PID170 contains a separate column for every possible county, each
    holding 1 (selected) or 0 (not selected). Every column whose name matches
    the pattern "county_names" is treated as such a checkbox column, and the
    last three characters of its name are taken as the county abbreviation.

    Parameters
    ----------
    df_pid170 : pandas.DataFrame
        PID170 export. It is not modified.
    county_lookup : mapping, optional
        Abbreviation -> full county name. Defaults to the module-level
        ``county_keys`` dictionary.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_pid170`` without the county checkbox columns and with a
        new last column "county_names" that holds, for each row, the list of
        full county names whose checkbox equals 1 (an empty list if none).

    Raises
    ------
    KeyError
        If a selected checkbox column ends in an abbreviation that is missing
        from ``county_lookup``.
    """
    if county_lookup is None:
        county_lookup = county_keys

    # Identify the county checkbox columns and the abbreviation each one carries.
    county_columns = df_pid170.filter(regex="county_names").columns
    abbreviations = [str(column)[-3:] for column in county_columns]

    # One boolean row per record. Anything that is not equal to 1
    # (0, missing values, text) counts as "not selected".
    selected = df_pid170[county_columns].eq(1).fillna(False).to_numpy(dtype=bool)

    def _full_name(abbreviation):
        # Same exception type as a plain dict lookup, but with a message that
        # says what was being looked up.
        try:
            return county_lookup[abbreviation]
        except KeyError:
            raise KeyError(
                f"no full county name known for abbreviation {abbreviation!r}"
            ) from None

    # Only abbreviations of selected checkboxes are translated, so an
    # unselected column with an unknown suffix does not raise.
    names_per_row = [
        [_full_name(abbr) for abbr, is_selected in zip(abbreviations, row) if is_selected]
        for row in selected
    ]

    condensed = df_pid170.loc[:, ~df_pid170.columns.isin(county_columns)].copy()
    # Build the new column positionally on the frame's own index so rows stay
    # aligned even when index labels repeat.
    condensed["county_names"] = pd.Series(
        names_per_row, index=condensed.index, dtype=object
    )

    return condensed


def wide_to_long(df_pid171):
    """
    Reshape REDCap PID171 from one row per sample to one row per sample and PCR target.

    The wide input carries the n1 and n2 results side by side in the columns
    n1_sars_cov2_avg_conc / n2_sars_cov2_avg_conc and
    n1_sars_cov2_below_lod / n2_sars_cov2_below_lod. The long output repeats
    each sample ID once per target and holds the results in single
    "sars_cov2_avg_conc" and "sars_cov2_below_lod" columns, with the target
    ("n1" or "n2") recorded in "pcr_target".

    Parameters
    ----------
    df_pid171 : pandas.DataFrame
        Wide PID171 export. After ``reset_index()`` it must expose a
        "sample_id" column plus the four n1/n2 columns named above.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        All non-result columns of the (index-reset) input, followed by
        "pcr_target", "sars_cov2_avg_conc" and "sars_cov2_below_lod".
    """
    df_wide = df_pid171.reset_index()

    # Identify the columns to melt, and all the rest.
    conc_clms = ["n1_sars_cov2_avg_conc", "n2_sars_cov2_avg_conc"]
    lod_clms = ["n1_sars_cov2_below_lod", "n2_sars_cov2_below_lod"]
    melt_clms = conc_clms + lod_clms
    not_melt_clms = list(df_wide.columns[~df_wide.columns.isin(melt_clms)])

    # Melt avg_conc and keep every other column alongside it.
    df_melt_conc = pd.melt(
        df_wide,
        id_vars=not_melt_clms,
        value_vars=conc_clms,
        var_name="pcr_target",
        value_name="sars_cov2_avg_conc",
    )
    # Melt below_lod and keep only the sample ID needed for the merge.
    df_melt_lod = pd.melt(
        df_wide,
        id_vars=["sample_id"],
        value_vars=lod_clms,
        var_name="pcr_target",
        value_name="sars_cov2_below_lod",
    )

    # Reduce pcr_target to its first two characters ("n1" or "n2").
    # Each frame derives the target from its OWN column, so the result does
    # not depend on the two melts sharing the same row order.
    for melted in (df_melt_conc, df_melt_lod):
        melted["pcr_target"] = melted["pcr_target"].str[0:2]

    # Bring concentration and below-LOD values together per (sample, target).
    return pd.merge(
        df_melt_conc,
        df_melt_lod,
        how="inner",
        on=["sample_id", "pcr_target"],
    )


In [2]:
%run -i "viral_lims_export.py"

In [3]:
# project_dtype_summary, redcap_api_url and redcap_tokens_prod are not defined
# in this notebook section (presumably they come from viral_lims_export.py).
# The result is indexed by project ID below.
ww_redcap = project_dtype_summary(redcap_api_url, redcap_tokens_prod)

df_pid171, df_pid170, df_pid176 = (
    ww_redcap[project_id] for project_id in ("PID171", "PID170", "PID176")
)

# Reshape the two projects that need it before they are joined.
df_pid170 = condense_county_columns(df_pid170)
df_pid171 = wide_to_long(df_pid171)

# Replace the coded "sars_cov2_units" values with their text label.
# Series.map turns any code that is not a key of `units` into NaN.
units = {1: "copies/L wastewater"}
df_pid171 = df_pid171.assign(sars_cov2_units=df_pid171["sars_cov2_units"].map(units))

In [4]:
# Attach the site data (PID170, indexed by sample site ID) to every sample row.
df_test = df_pid171.merge(df_pid170, left_on="sample_site_id", right_index=True)

# PID176 columns to bring over: everything except "zipcode" and "pcr_target".
not_zipcode = df_pid176.columns[~df_pid176.columns.isin(["zipcode", "pcr_target"])]

# Attach the lab data (PID176, indexed by lab ID), rename the lab key, keep
# only the DCIPHER columns in their listed order, and sort by sample ID with a
# fresh 0..n-1 index.
complete = (
    df_test
    .merge(df_pid176.loc[:, not_zipcode], left_on="micro_lab_id", right_index=True)
    .rename(columns={"micro_lab_id": "lab_id"})
    .loc[:, dcipher_clms]
    .sort_values(by=["sample_id"], ignore_index=True)
)

In [9]:
# .loc slices by label and includes both ends, so this keeps the rows
# labelled 0 through 71 (at most 72 rows) and all columns.
df = complete.loc[0:71,:]

In [10]:
# Write the sample to a CSV file in the working directory, without the index column.
df.to_csv("DCIPHER_upload_sample.csv", index = False)

# Appendix

In [None]:
%run -i "viral_lims_export.py"

In [None]:
import pandas as pd

# Values of "Field Name" that are section headings in the spreadsheet rather
# than real fields; rows carrying them are dropped below.
remove = [
    "Reporter",
    "Collection Site",
    "WWTP",
    "Collection Method",
    "Processing Method",
    "SARSCoV2 Quantification Method",
    "Sample",
    "SARSCoV2 Quantification Results",
]

# Read the "Metadata" sheet, drop the heading rows and the fully blank rows,
# then renumber the remaining rows from 0.
df_map = (
    pd.read_excel(
        "NWSS Data Dictionary_v2_0_3_20210621.xlsx",
        sheet_name="Metadata",
        header=0,
    )
    .loc[lambda sheet: ~sheet["Field Name"].isin(remove)]
    .dropna(how="all")
    .reset_index(drop=True)
)

print(f" DCIPHER data map/dictionary contains: {df_map.shape[0]} fields")

### Compare Field Names in DCIPHER dictionary to REDCap Fields

In [None]:
# Raw (untransformed) REDCap projects, indexed by project ID.
ww_redcap = project_dtype_summary(redcap_api_url, redcap_tokens_prod)

df_pid171, df_pid170, df_pid176, df_pid177 = (
    ww_redcap[project_id] for project_id in ("PID171", "PID170", "PID176", "PID177")
)

# Column names of each project as plain lists.
pid171_clms = list(df_pid171.columns)
pid170_clms = list(df_pid170.columns)
pid176_clms = list(df_pid176.columns)
pid177_clms = list(df_pid177.columns)

print(f"PID171 contains: {len(pid171_clms)} columns")
print(f"PID170 contains: {len(pid170_clms)} columns")
print(f"PID176 contains: {len(pid176_clms)} columns")
print(f"PID177 contains: {len(pid177_clms)} columns")

In [None]:
# For each project: the column names that also appear in the "Field Name"
# column of df_map. The lists are built from sets, so their order is arbitrary.
PID171_in_common = list(set(pid171_clms) & set(df_map["Field Name"]))
PID170_in_common = list(set(pid170_clms) & set(df_map["Field Name"]))
PID176_in_common = list(set(pid176_clms) & set(df_map["Field Name"]))
PID177_in_common = list(set(pid177_clms) & set(df_map["Field Name"]))

print(f"PID171 and DCHIPHER data dictionary have {len(PID171_in_common)} common fields")
print(f"PID170 and DCHIPHER data dictionary have {len(PID170_in_common)} common fields")
print(f"PID176 and DCHIPHER data dictionary have {len(PID176_in_common)} common fields")
print(f"PID177 and DCHIPHER data dictionary have {len(PID177_in_common)} common fields")


In [None]:
# Concatenate the per-project matches into one list. A field name matched by
# more than one project appears more than once, which is what the duplicate
# check in the next cell looks for.
REDCap_in_common = PID171_in_common + PID170_in_common + PID176_in_common + PID177_in_common


In [None]:
# Put the combined field names in a Series so repeated names can be listed.
s_in_redcap = pd.Series(REDCap_in_common)
# duplicated() flags the second and later occurrences of a name.
print(f"Duplicated Fields: \n{s_in_redcap[s_in_redcap.duplicated()]}")
print()

# NOTE: from here on s_in_redcap is the array returned by Series.unique(),
# no longer a Series.
s_in_redcap = s_in_redcap.unique()
print(f"length of unique fields: {len(s_in_redcap)}")

### Identify Missing Fields (DCIPHER expects 80; the REDCap projects supply 72 of them)

In [None]:
# Data-dictionary field names that were not matched by any REDCap project column.
set(df_map["Field Name"]) - set(s_in_redcap)

### DCIPHER fields from REDCap

lab_id: use WW lab ID from PID177 index (because viral lab is same for all, WW lab differs)

inhibition_method: missing from PID171, has been added to REDCap PID171 and transform script adjusted

hum_frac_chem_unit: missing from PID176, added with available choice as dropdown

ext_blank: missing from PID176, added values yes/no to PID176

county_names: PID170, need to convert the multiple per-county columns from the REDCap export into a single field

# Comparison of transformed REDCap project columns to the DCIPHER Data Dictionary

In [None]:
# Reload the REDCap projects, indexed by project ID.
ww_redcap = project_dtype_summary(redcap_api_url, redcap_tokens_prod)

df_pid171, df_pid170, df_pid176 = (
    ww_redcap[project_id] for project_id in ("PID171", "PID170", "PID176")
)

# PID177: turn the index into a regular column and expose "ww_lab_id" as "lab_id".
df_pid177 = (
    ww_redcap["PID177"]
    .reset_index()
    .rename(columns={"ww_lab_id": "lab_id"})
)

# Apply the same reshaping used for the upload before comparing column names.
df_pid170 = condense_county_columns(df_pid170)
df_pid171 = wide_to_long(df_pid171)

In [None]:
# Column names of each (transformed) project as plain lists.
pid171_clms = list(df_pid171.columns)
pid170_clms = list(df_pid170.columns)
pid176_clms = list(df_pid176.columns)
pid177_clms = list(df_pid177.columns)

print(f"PID171 contains: {len(pid171_clms)} columns")
print(f"PID170 contains: {len(pid170_clms)} columns")
print(f"PID176 contains: {len(pid176_clms)} columns")
print(f"PID177 contains: {len(pid177_clms)} columns")

In [None]:
# For each project: the column names that also appear in the "Field Name"
# column of df_map. The lists are built from sets, so their order is arbitrary.
PID171_in_common = list(set(pid171_clms) & set(df_map["Field Name"]))
PID170_in_common = list(set(pid170_clms) & set(df_map["Field Name"]))
PID176_in_common = list(set(pid176_clms) & set(df_map["Field Name"]))
PID177_in_common = list(set(pid177_clms) & set(df_map["Field Name"]))

print(f"PID171 and DCHIPHER data dictionary have {len(PID171_in_common)} common fields")
print(f"PID170 and DCHIPHER data dictionary have {len(PID170_in_common)} common fields")
print(f"PID176 and DCHIPHER data dictionary have {len(PID176_in_common)} common fields")
print(f"PID177 and DCHIPHER data dictionary have {len(PID177_in_common)} common fields")

In [None]:
# Concatenate the per-project matches into one list; a field name matched by
# more than one project appears more than once.
REDCap_in_common = PID171_in_common + PID170_in_common + PID176_in_common + PID177_in_common

In [None]:
# Put the combined field names in a Series so repeated names can be listed.
s_in_redcap = pd.Series(REDCap_in_common)
# duplicated() flags the second and later occurrences of a name.
print(f"Duplicated Fields: \n{s_in_redcap[s_in_redcap.duplicated()]}")
print()

# NOTE: from here on s_in_redcap is the array returned by Series.unique(),
# no longer a Series.
s_in_redcap = s_in_redcap.unique()
print(f"length of unique fields: {len(s_in_redcap)}")

In [None]:
# Data-dictionary field names that were not matched by any REDCap project column.
print("Unaccounted fields in DCIPHER: ")
    
print(set(df_map["Field Name"]) - set(s_in_redcap))

# Finding fields to stitch the dataframes together

Can join PID171 and PID170 using: the index of PID170 ('sample_site_id') and PID171 field 'sample_site_id'

Can join PID171 and PID176 using: the index of PID176 ('micro_lab_id') and PID171 field 'micro_lab_id'

Need to rename micro_lab_id to just lab_id

Leave PID177 alone for now.

In [None]:
# Attach the site data (PID170, indexed by sample site ID) to every sample row.
df_test = df_pid171.merge(df_pid170, left_on="sample_site_id", right_index=True)

# PID176 columns to bring over: everything except "zipcode" and "pcr_target".
not_zipcode = df_pid176.columns[~df_pid176.columns.isin(["zipcode", "pcr_target"])]

# Attach the lab data (PID176, indexed by lab ID), rename the lab key, keep
# the columns named in the data dictionary's "Field Name" column, and sort by
# sample ID with a fresh 0..n-1 index.
complete = (
    df_test
    .merge(df_pid176.loc[:, not_zipcode], left_on="micro_lab_id", right_index=True)
    .rename(columns={"micro_lab_id": "lab_id"})
    .loc[:, df_map["Field Name"]]
    .sort_values(by=["sample_id"], ignore_index=True)
)