In [1]:
# Run viral_lims_export.py with -i, i.e. inside this notebook's own namespace, so the
# script can see existing notebook variables and everything it defines stays available
# to the cells below.
# NOTE(review): presumably this is what supplies project_dtype_summary, redcap_api_url
# and redcap_tokens_prod used later — confirm against the script.
%run -i "viral_lims_export.py"

In [2]:
import pandas as pd

# Read the "Metadata" sheet of the NWSS data dictionary; row 0 supplies the column headers.
df_map = pd.read_excel(
    "NWSS Data Dictionary_v2_0_3_20210621.xlsx",
    sheet_name="Metadata",
    header=0,
)

# "Field Name" values that are category headings in the spreadsheet rather than real fields.
remove = [
    "Reporter",
    "Collection Site",
    "WWTP",
    "Collection Method",
    "Processing Method",
    "SARSCoV2 Quantification Method",
    "Sample",
    "SARSCoV2 Quantification Results",
]

# Drop the heading rows, then the completely blank rows, and renumber the index from 0.
df_map = (
    df_map[~df_map["Field Name"].isin(remove)]
    .dropna(how="all")
    .reset_index(drop=True)
)

print(f" DCIPHER data map/dictionary contains: {df_map.shape[0]} fields")

 DCIPHER data map/dictionary contains: 80 fields


### Compare Field Names in DCIPHER dictionary to REDCap Fields

In [3]:
# Fetch the per-project summaries; the result is indexed by project ID strings such as "PID171".
# NOTE(review): project_dtype_summary is defined outside this notebook — each value is
# assumed to be DataFrame-like (only .columns is relied on in this cell); confirm.
ww_redcap = project_dtype_summary(redcap_api_url, redcap_tokens_prod)

df_pid171, df_pid170, df_pid176, df_pid177 = (
    ww_redcap[project_id]
    for project_id in ("PID171", "PID170", "PID176", "PID177")
)

# Plain Python lists of each project's column labels, compared against the dictionary later.
pid171_clms = list(df_pid171.columns)
pid170_clms = list(df_pid170.columns)
pid176_clms = list(df_pid176.columns)
pid177_clms = list(df_pid177.columns)

print(f"PID171 contains: {len(pid171_clms)} columns")
print(f"PID170 contains: {len(pid170_clms)} columns")
print(f"PID176 contains: {len(pid176_clms)} columns")
print(f"PID177 contains: {len(pid177_clms)} columns")

PID171 contains: 94 columns
PID170 contains: 85 columns
PID176 contains: 74 columns
PID177 contains: 26 columns


In [4]:
# Build a set of the dictionary's "Field Name" values to intersect with each project's columns.
dcipher_fields = set(df_map["Field Name"])

# Fields present both in a project's columns and in the dictionary (list order is arbitrary).
PID171_in_common = list(set(pid171_clms) & set(dcipher_fields))
PID170_in_common = list(set(pid170_clms) & set(dcipher_fields))
PID176_in_common = list(set(pid176_clms) & set(dcipher_fields))
PID177_in_common = list(set(pid177_clms) & set(dcipher_fields))

print(f"PID171 and DCHIPHER data dictionary have {len(PID171_in_common)} common fields")
print(f"PID170 and DCHIPHER data dictionary have {len(PID170_in_common)} common fields")
print(f"PID176 and DCHIPHER data dictionary have {len(PID176_in_common)} common fields")
print(f"PID177 and DCHIPHER data dictionary have {len(PID177_in_common)} common fields")


PID171 and DCHIPHER data dictionary have 37 common fields
PID170 and DCHIPHER data dictionary have 18 common fields
PID176 and DCHIPHER data dictionary have 21 common fields
PID177 and DCHIPHER data dictionary have 1 common fields


In [5]:
# Pool the per-project matches into one list so duplicates across projects can be detected:
# each per-project list is duplicate-free, so a repeated name means several projects share it.
REDCap_in_common = [
    *PID171_in_common,
    *PID170_in_common,
    *PID176_in_common,
    *PID177_in_common,
]


In [6]:
# Wrap the pooled field names in a Series to use pandas' duplicate detection.
s_in_redcap = pd.Series(REDCap_in_common)

# duplicated() flags every occurrence after the first, so only the repeats are printed.
is_repeat = s_in_redcap.duplicated()
print(f"Duplicated Fields: \n{s_in_redcap[is_repeat]}")
print()

# From here on s_in_redcap is the array of distinct field names, not the original Series.
s_in_redcap = s_in_redcap.unique()
print(f"length of unique fields: {len(s_in_redcap)}")

Duplicated Fields: 
75    zipcode
76    zipcode
dtype: object

length of unique fields: 75


### Identify Missing Fields (DCIPHER expects 80; the REDCap projects cover 75 of these)

In [7]:
# Dictionary field names that none of the REDCap projects matched.
set(df_map["Field Name"]).difference(s_in_redcap)

{'county_names',
 'lab_id',
 'sample_id',
 'sars_cov2_avg_conc',
 'sars_cov2_below_lod'}

### DCIPHER fields from REDCap

lab_id: use WW lab ID from PID177 index (because viral lab is same for all, WW lab differs)

inhibition_method: was missing from PID171; it has been added to REDCap PID171 and the transform script adjusted accordingly

hum_frac_chem_unit: missing from PID176, added with available choice as dropdown

ext_blank: missing from PID176, added values yes/no to PID176

county_names: PID170; the multiple columns in the REDCap export need to be combined into a single field

In [None]:
# Show the distinct matched field names as a plain Python list.
list(s_in_redcap)

In [None]:
# Scratch inspection: display the column labels of df_pid170.
df_pid170.columns

In [None]:
# Scratch inspection: preview the first five rows of df_pid176.
df_pid176.head(5)

In [None]:
# Scratch inspection: preview the first five rows of df_pid177.
df_pid177.head(5)

In [None]:
# Show every column label of df_pid176 as a plain Python list (avoids the truncated Index repr).
list(df_pid176.columns)

In [None]:
# Narrow df_pid176 to the four hum_frac_target_* columns for side-by-side inspection.
hum_frac_target_columns = [
    'hum_frac_target_mic',
    'hum_frac_target_mic_ref',
    'hum_frac_target_chem',
    'hum_frac_target_chem_ref',
]
df_pid176[hum_frac_target_columns]