# Explore DCIPHER upload requirements 

In [1]:
# Execute the DCIPHER transform script inside this notebook's namespace (-i), so the
# names it defines stay available to the cells below.
# NOTE(review): presumably one source of names used later without a visible
# definition (complete, county_keys, condense_county_columns, wide_to_long) - confirm.
%run -i "DCIPHER_transform.py"

In [2]:
# Execute the viral LIMS export script inside this notebook's namespace (-i), so the
# names it defines stay available to the cells below.
# NOTE(review): the next cell also imports project_dtype_summary from this same
# module - confirm whether both the %run and the import are needed.
%run -i "viral_lims_export.py"

In [3]:
from lims_login import redcap_tokens_prod
from lims_login import redcap_api_url
from viral_lims_export import project_dtype_summary

# Validating columns with DCIPHER

NPDES permit number (<2-letter abbreviation><#######>) VALIDATION COMPLETE

    - field name: epaid
    - Data source: REDCap PID170 
    - Conclusion: currently in valid form - manual entry for new WW sites carries a risk of entry errors in the future

ZIP code (#####) VALIDATION COMPLETE

    - field name: zipcode
    - Data source: REDCap PID170 
    - Conclusion: currently in valid form - manual entry for new WW sites carries a risk of entry errors in the future

category


    - field names:

        [
         'reporting_jurisdiction',
         'sample_location',
         'institution_type',
         'wwtp_jurisdiction',
         'stormwater_input',
         'influent_equilibrated',
         'sample_type',
         'sample_matrix',
         'pretreatment',
         'solids_separation',
         'concentration_method',
         'extraction_method',
         'ext_blank',
         'rec_eff_target_name',
         'rec_eff_spike_matrix',
         'pasteurized',
         'pcr_target',
         'pcr_type',
         'hum_frac_target_mic',
         'hum_frac_target_chem',
         'other_norm_name',
         'quant_stan_type',
         'num_no_target_control',
         'sars_cov2_units',
         'sars_cov2_below_lod',
         'ntc_amplify',
         'inhibition_detect',
         'inhibition_adjust',
         'hum_frac_mic_unit',
         'hum_frac_chem_unit',
         'other_norm_unit',
         'quality_flag']


date ([yyyy]-[mm]-[dd])  VALIDATION COMPLETE

    - field name: sample_collect_date
    - Data source: LIMS > PID171 
    - Conclusion: currently in valid form - no transform needed
    
    - field name: test_result_date
    - Data source: LIMS > PID171 
    - Conclusion: currently in valid form - no transform needed

float    VALIDATION COMPLETE

    - field name:

         'sewage_travel_time',
         'capacity_mgd',
         'industrial_input',
         'composite_freq',
         'collection_storage_time',
         'collection_storage_temp',
         'pre_conc_storage_time',
         'pre_conc_storage_temp',
         'pre_ext_storage_time',
         'pre_ext_storage_temp',
         'tot_conc_vol',
         'rec_eff_spike_conc',
         'flow_rate',
         'ph',
         'conductivity',
         'tss',
         'collection_water_temp',
         'equiv_sewage_amt',
         'sars_cov2_avg_conc',
         'sars_cov2_std_error',
         'sars_cov2_cl_95_lo',
         'sars_cov2_cl_95_up',
         'lod_sewage',
         'rec_eff_percent',
         'hum_frac_mic_conc',
         'hum_frac_chem_conc',
         'other_norm_conc'


integer  VALIDATION COMPLETE

    - field name: population_served
    - Data source: REDCap PID170
    - Conclusion: currently in valid form - manual entry for new WW sites carries a risk of entry errors in the future

jurisdiction id (a string 20 characters or less, containing only numbers, English alphabetic characters, underscores, and hyphens; white space is not allowed; not case sensitive) 

VALIDATION COMPLETE

    - field name: sample_id
    - Data source: REDCap PID171
    - Conclusion: currently in valid form - auto generated in REDCap
    
    
    - field name: lab_id ("micro_lab_id" in PID171)
    - Data source: REDCap PID171 
    - Conclusion: currently in valid form - manual entry for new WW sites carries a risk of entry errors in the future


list (comma-separated strings) VALIDATION COMPLETE

    - field name: county_names
    - Data source: REDCap PID170
    - Conclusion: valid form after transformation (DCIPHER import prep)
    
    
    - field name: other_jurisdiction
    - Data source: REDCap PID170
    - Conclusion: WARNING: data is not in list format and there is no transformation function in place. This field does not have any data, but this may cause problems in the future. 

string VALIDATION COMPLETE

    - field name:
    
         'sample_location_specify',
         'wwtp_name',
         'pretreatment_specify',
         'pcr_target_ref',
         'lod_ref',
         'hum_frac_target_mic_ref',
         'hum_frac_target_chem_ref',
         'other_norm_ref',
         'stan_ref',
         'inhibition_method'
         
time zone (UTC-[hh]:[mm]): VALIDATION COMPLETE

    - field name: time_zone
    - Data source: REDCap PID171 (user input)
    - Conclusion: already in valid form from source 

time, 24-hr ([hh]:[mm]) VALIDATION COMPLETE                                                                                                                                              

    - field name: sample_collect_time (REDCap)
    - Data source: LIMS > PID171 
    - Conclusion: Transformation from LIMS to REDCap imports correct format into REDCap



In [None]:
import pandas as pd

# Rows in the Excel sheet that only label a group of fields (section headers);
# they are not fields themselves and are filtered out below.
remove = [
    "Reporter",
    "Collection Site",
    "WWTP",
    "Collection Method",
    "Processing Method",
    "SARSCoV2 Quantification Method",
    "Sample",
    "SARSCoV2 Quantification Results",
]

# Read the "Metadata" sheet, discard the section-header rows, drop fully blank
# rows, and renumber the index from zero.
df_map = (
    pd.read_excel(
        "NWSS Data Dictionary_v2_0_3_20210621.xlsx",
        sheet_name="Metadata",
        header=0,
    )
    .loc[lambda sheet: ~sheet["Field Name"].isin(remove)]
    .dropna(how="all")
    .reset_index(drop=True)
)

print(f" DCIPHER data map/dictionary contains: {df_map.shape[0]} fields")

In [None]:
# Work on an independent copy of the first two dictionary columns; they are
# addressed below by the labels "Field Name" and "Data Type".
df_map_01 = df_map.iloc[:, :2].copy()

# Number of fields declared for each DCIPHER data type.
df_map_01.groupby("Data Type")["Field Name"].count()

In [None]:
# Show the dictionary rows whose data type is "category".
df_map_01.loc[df_map_01["Data Type"] == "category"]

In [None]:
# Names of every field the dictionary types as "category".
category_fields = df_map_01.loc[
    df_map_01["Data Type"] == "category", "Field Name"
].tolist()

# dtypes of those same columns in `complete`.
complete[category_fields].dtypes

# Category Review

    [
     'reporting_jurisdiction': ["REDCap PID170"], #good
     'sample_location': ["REDCap PID170"], #good
     'institution_type': ["REDCap PID170"], #good, PID170 needed conversion from keys to values
     'wwtp_jurisdiction': ["REDCap PID170"], #good
     'stormwater_input': ["REDCap PID170"], #good, PID170 needed conversion from keys to values
     'influent_equilibrated': ["REDCap PID170"], #good, PID170 needed conversion from keys to values
     'sample_type': ["REDCap PID170"], #good, PID170 needed conversion from keys to values
     'sample_matrix': ["REDCap PID170"], #good, PID170 needed conversion from keys to values
     'pretreatment': ["REDCap PID171"], #good, PID171 needed conversion from keys to values
     
     
     'solids_separation', #OK RIGHT NOW, NEED TO COME BACK TO THIS, REDCAP MISSING DROPDOWN VALUES
     'concentration_method', #OK RIGHT NOW, NEED TO COME BACK TO THIS, REDCAP MISSING DROPDOWN VALUES
     'extraction_method', #OK RIGHT NOW, NEED TO COME BACK TO THIS, REDCAP MISSING DROPDOWN VALUES
     'ext_blank', #OK RIGHT NOW, NEED TO COME BACK TO THIS, REDCAP MISSING DROPDOWN VALUES
     
     
     'rec_eff_target_name': ["REDCap PID176"], #good, PID176 needed conversion from keys to values
     'rec_eff_spike_matrix': ["REDCap PID176"], #good, PID176 needed conversion from keys to values
     'pasteurized': ["REDCap PID176"], #good, PID176 needed conversion from keys to values
     'pcr_target': ["REDCap PID176"], #good, prior transformation allows only "n1" or "n2" as possibilities
     'pcr_type': ["REDCap PID176"], #good, PID176 needed conversion from keys to values
     'hum_frac_target_mic': ["REDCap PID176"], #good, PID176 needed conversion from keys to values
     
     'hum_frac_target_chem': ["REDCap PID176"], #OK RIGHT NOW, NEED TO COME BACK TO THIS, REDCAP MISSING DROPDOWN VALUES
     
     'other_norm_name': ["REDCap PID176"], #good, PID176 needed conversion from keys to values
     'quant_stan_type': ["REDCap PID170"], #good
     'num_no_target_control': ["REDCap PID176"], #good, PID176 needed conversion from keys to values
     'sars_cov2_units' : ["REDCap PID171"], #good, needed conversion from keys to values
     'sars_cov2_below_lod': ["REDCap PID171"], #this field is resolved by wide_to_long transformation
     'ntc_amplify': ["REDCap PID171"], #good
     'inhibition_detect': ["REDCap PID171"], #good
     'inhibition_adjust': ["REDCap PID171"], #good
     'hum_frac_mic_unit': ["REDCap PID171"], #this field is resolved by wide_to_long transformation
     'hum_frac_chem_unit': ["REDCap PID176"], #good, PID176 needed conversion from keys to values
     'other_norm_unit': ["REDCap PID171"], #good, PID171 needed conversion from keys to values
     'quality_flag': ["REDCap PID171"], #good
     ]


In [None]:
# Display the county_names column of `complete`.
# NOTE(review): `complete` is not assigned in any visible cell of this notebook;
# presumably it is created by one of the `%run -i` scripts at the top - confirm.
complete["county_names"]

In [None]:
# List the distinct values present in PID170's sample_location_specify column.
# NOTE(review): within this notebook df_pid170 is first assigned in the Appendix
# cells further down, so on a fresh top-to-bottom run this cell relies on the name
# being supplied elsewhere (possibly by the `%run -i` scripts) - confirm.
df_pid170["sample_location_specify"].unique()

In [None]:
# List the distinct values present in PID171's extraction_method column.
# NOTE(review): within this notebook df_pid171 is first assigned in the Appendix
# cells further down, so on a fresh top-to-bottom run this cell relies on the name
# being supplied elsewhere (possibly by the `%run -i` scripts) - confirm.
df_pid171["extraction_method"].unique()

In [None]:
# Load the county lookup table from the working directory; the following cell reads
# its "county" and "code" columns.
df_county_codes = pd.read_csv("county_codes.csv")

In [None]:
# Lookup of county -> code, pairing the two columns row by row
# (if a county appears more than once, its last row wins).
county_codes = {
    county: code
    for county, code in zip(df_county_codes["county"], df_county_codes["code"])
}

In [None]:
# Display the county -> code lookup for a visual check.
county_codes

In [None]:
# Split the county_keys mapping into two positionally aligned Series:
# S holds its keys, V holds the corresponding values.
# NOTE(review): county_keys is not defined in any visible cell - presumably it comes
# from one of the `%run -i` scripts; confirm.
S = pd.Series(list(county_keys.keys()))
V = pd.Series(list(county_keys.values()))

In [None]:
# Place the keys (S) and values (V) side by side; later cells address the two
# resulting columns as 0 and 1.
df = pd.concat([S, V], axis="columns")

In [None]:
# Translate the values column (column 1) through the county_codes lookup.
# The mapping is taken from V - the untouched Series that column 1 was built from -
# rather than from df[1] itself, so re-running this cell gives the same result
# instead of pushing already-translated codes through the lookup a second time
# (values missing from the lookup become NaN).
df[1] = V.map(county_codes)

In [None]:
# Final lookup: each entry of column 0 paired with the mapped code in column 1.
keys_dict = {key: code for key, code in zip(df[0], df[1])}

In [None]:
# Display the resulting key -> county code lookup for a visual check.
keys_dict

# Import DCIPHER sample columns to confirm all fields:

Conclusions:

The following fields are missing in our REDCap merge:

{'analysis_ignore',
 'dashboard_ignore',
 'major_lab_method',
 'major_lab_method_desc',
 'qc_ignore'}
 

In [None]:
import pandas as pd
import os  # not needed by this cell any more; kept in case other cells rely on it

# Location of the DCIPHER CSV template, without the ".csv" extension.
# NOTE(review): this is an absolute, user-specific path; the notebook will not run
# on another machine until it is made configurable.
filepath = r"C:\Users\AXG5303\OneDrive - Washington State Executive Branch Agencies\Projects\WW_DCIPHER\NWSS_DCIPHER_Wastewater Data_CSV Template_v2_1_All Fields"

# Read the template by its full path so the kernel's working directory is never
# changed; a failed read therefore cannot leave the notebook in another directory
# and break the relative paths used by other cells.
sample = pd.read_csv(filepath + ".csv")


In [None]:
# Check whether PID171 has a column named "sample_location": the result is the
# matching column label(s), or an empty Index when there is none.
df_pid171.columns[df_pid171.columns == "sample_location"]

In [None]:
# Check whether PID170 has a column named "sample_location": the result is the
# matching column label(s), or an empty Index when there is none.
df_pid170.columns[df_pid170.columns == "sample_location"]

In [None]:
# Display PID170's sample_location column as a one-column DataFrame.
df_pid170[["sample_location"]]

# Appendix

In [None]:
# Execute the viral LIMS export script inside this notebook's namespace (-i).
# NOTE(review): the same script is already run near the top of the notebook;
# confirm whether this second run is still required.
%run -i "viral_lims_export.py"

In [None]:
import pandas as pd

# Rows in the Excel sheet that only label a group of fields (section headers);
# they are not fields themselves and are filtered out below.
remove = [
    "Reporter",
    "Collection Site",
    "WWTP",
    "Collection Method",
    "Processing Method",
    "SARSCoV2 Quantification Method",
    "Sample",
    "SARSCoV2 Quantification Results",
]

# Read the "Metadata" sheet, discard the section-header rows, drop fully blank
# rows, and renumber the index from zero.
df_map = (
    pd.read_excel(
        "NWSS Data Dictionary_v2_0_3_20210621.xlsx",
        sheet_name="Metadata",
        header=0,
    )
    .loc[lambda sheet: ~sheet["Field Name"].isin(remove)]
    .dropna(how="all")
    .reset_index(drop=True)
)

print(f" DCIPHER data map/dictionary contains: {df_map.shape[0]} fields")

### Compare Field Names in DCIPHER dictionary to REDCap Fields

In [None]:
# Fetch the REDCap project summaries; each project is then looked up by its ID.
ww_redcap = project_dtype_summary(redcap_api_url, redcap_tokens_prod)

df_pid171, df_pid170, df_pid176, df_pid177 = (
    ww_redcap[project_id]
    for project_id in ("PID171", "PID170", "PID176", "PID177")
)

# Column names of each project as plain lists.
pid171_clms, pid170_clms, pid176_clms, pid177_clms = (
    list(frame.columns)
    for frame in (df_pid171, df_pid170, df_pid176, df_pid177)
)

print(f"PID171 contains: {len(pid171_clms)} columns")
print(f"PID170 contains: {len(pid170_clms)} columns")
print(f"PID176 contains: {len(pid176_clms)} columns")
print(f"PID177 contains: {len(pid177_clms)} columns")

In [None]:
# Field names defined by the DCIPHER data dictionary, built once and reused.
dcipher_fields = set(df_map["Field Name"])

# For each REDCap project, the column names that are also dictionary fields.
PID171_in_common = list(set(pid171_clms) & dcipher_fields)
PID170_in_common = list(set(pid170_clms) & dcipher_fields)
PID176_in_common = list(set(pid176_clms) & dcipher_fields)
PID177_in_common = list(set(pid177_clms) & dcipher_fields)

print(f"PID171 and DCHIPHER data dictionary have {len(PID171_in_common)} common fields")
print(f"PID170 and DCHIPHER data dictionary have {len(PID170_in_common)} common fields")
print(f"PID176 and DCHIPHER data dictionary have {len(PID176_in_common)} common fields")
print(f"PID177 and DCHIPHER data dictionary have {len(PID177_in_common)} common fields")


In [None]:
# Checking if there are duplicate fields in REDCap project fields:
# join the per-project matches into one list, keeping repeats so that a field
# matched in more than one project shows up more than once.
REDCap_in_common = [
    *PID171_in_common,
    *PID170_in_common,
    *PID176_in_common,
    *PID177_in_common,
]


In [None]:
# Report dictionary fields that were matched in more than one REDCap project
# (duplicated() flags the second and later occurrences of a value).
s_in_redcap = pd.Series(REDCap_in_common)
print(f"Duplicated Fields: \n{s_in_redcap[s_in_redcap.duplicated()]}")
print()

# From here on s_in_redcap holds only the distinct field names; note that the name
# is rebound from the Series to the result of unique().
s_in_redcap = s_in_redcap.unique()
print(f"length of unique fields: {len(s_in_redcap)}")

### Identify Missing Fields (DCIPHER expects 80; the REDCap projects account for 72 of these)

In [None]:
# Dictionary fields that no REDCap project provides.
dcipher_fields = set(df_map["Field Name"])
redcap_fields = set(s_in_redcap)
dcipher_fields - redcap_fields

### DCIPHER fields from REDCap

lab_id: use WW lab ID from PID177 index (because viral lab is same for all, WW lab differs)

inhibition_method: missing from PID171, has been added to REDCap PID171 and transform script adjusted

hum_frac_chem_unit: missing from PID176, added with available choice as dropdown

ext_blank: missing from PID176, added values yes/no to PID176

county_names: PID170, need to convert the multiple columns in the REDCap export into a single field

# Comparison of transformed REDCap project column names to the DCIPHER Data Dictionary

In [None]:
# Fetch the REDCap project summaries again and apply the DCIPHER prep transforms.
ww_redcap = project_dtype_summary(redcap_api_url, redcap_tokens_prod)

# PID176 is used as exported.
df_pid176 = ww_redcap["PID176"]

# PID177: move the index into regular columns, then expose ww_lab_id under the
# name lab_id.
df_pid177 = (
    ww_redcap["PID177"]
    .reset_index()
    .rename(columns={"ww_lab_id": "lab_id"})
)

# NOTE(review): condense_county_columns and wide_to_long are not defined in any
# visible cell - presumably provided by the `%run -i` scripts; confirm.
df_pid170 = condense_county_columns(ww_redcap["PID170"])
df_pid171 = wide_to_long(ww_redcap["PID171"])

In [None]:
# Column names of each (transformed) project as plain lists.
pid171_clms, pid170_clms, pid176_clms, pid177_clms = (
    list(frame.columns)
    for frame in (df_pid171, df_pid170, df_pid176, df_pid177)
)

print(f"PID171 contains: {len(pid171_clms)} columns")
print(f"PID170 contains: {len(pid170_clms)} columns")
print(f"PID176 contains: {len(pid176_clms)} columns")
print(f"PID177 contains: {len(pid177_clms)} columns")

In [None]:
# Field names defined by the DCIPHER data dictionary, built once and reused.
dcipher_fields = set(df_map["Field Name"])

# For each REDCap project, the column names that are also dictionary fields.
PID171_in_common = list(set(pid171_clms) & dcipher_fields)
PID170_in_common = list(set(pid170_clms) & dcipher_fields)
PID176_in_common = list(set(pid176_clms) & dcipher_fields)
PID177_in_common = list(set(pid177_clms) & dcipher_fields)

print(f"PID171 and DCHIPHER data dictionary have {len(PID171_in_common)} common fields")
print(f"PID170 and DCHIPHER data dictionary have {len(PID170_in_common)} common fields")
print(f"PID176 and DCHIPHER data dictionary have {len(PID176_in_common)} common fields")
print(f"PID177 and DCHIPHER data dictionary have {len(PID177_in_common)} common fields")

In [None]:
# Join the per-project matches into one list, keeping repeats so that a field
# matched in more than one project shows up more than once.
REDCap_in_common = [
    *PID171_in_common,
    *PID170_in_common,
    *PID176_in_common,
    *PID177_in_common,
]

In [None]:
# Report dictionary fields that were matched in more than one REDCap project
# (duplicated() flags the second and later occurrences of a value).
s_in_redcap = pd.Series(REDCap_in_common)
print(f"Duplicated Fields: \n{s_in_redcap[s_in_redcap.duplicated()]}")
print()

# From here on s_in_redcap holds only the distinct field names; note that the name
# is rebound from the Series to the result of unique().
s_in_redcap = s_in_redcap.unique()
print(f"length of unique fields: {len(s_in_redcap)}")

In [None]:
print("Unaccounted fields in DCIPHER: ")

# Dictionary fields that none of the transformed REDCap projects provide.
dcipher_fields = set(df_map["Field Name"])
print(dcipher_fields - set(s_in_redcap))