# Development of Wastewater Surveillance Data Automation Script 

In [1]:
#run script that executes export of LIMS data

%run -i "viral_lims_export.py"

####Export all lims data####
df_lims = export_df_from_LIMS()

### Critical convert long to wide ####
df_lims = (
    drop_null_sample_ID(df_lims)# #remove artifiact data from LIMS (missing sample ID's) 
    .pipe(drop_all_but_N1_N2)#Remove full rows where PCR Target is enything except N1 or N2
    .pipe(below_lod_to_yes_no)
    .pipe(long_to_wide) #meat and potatos! everything not under PCRTarget N1 or N2 will not be imported
    )

####Transform lims dataframe#### 
df_lims = (
    rename_lims_columns(df_lims) 
    .pipe(verify_time_field) #change time values to None if dont fit format HH:MM
    .pipe(convert_numeric) #convert numeric columns to floats, coerce errors
    .pipe(freetext_transform)
    .pipe(validate_yes_no_clms)
    .pipe(validate_choice_fields))

####export all dataframes from 4 WW Redcap Projects####
ww_redcap = project_dtype_summary(redcap_api_url, redcap_tokens_prod)

#### Export accepted values in restricted fields from metadata API ####
fields_restricted = (
    redcap_metadata_export(redcap_api_url, redcap_tokens_prod["PID171"])
    .pipe(accepted_redcap_fields)
    )

####Export date and time fields from REDCap
fields_datetime = (
    redcap_metadata_export(redcap_api_url, redcap_tokens_prod["PID171"])
    .pipe(date_time_redcap_fields)
    )

# Uploading Records

In [2]:
# Connect to the REDCap project whose token is stored under "PID171".
import redcap
project = redcap.Project(redcap_api_url, redcap_tokens_prod["PID171"])

# Send the transformed LIMS frame to REDCap. NOTE: this writes to the connected project.
# force_auto_number=False is passed explicitly; presumably record IDs then come from
# the data itself rather than being assigned by REDCap — TODO confirm against PyCap docs.
response = project.import_records(df_lims, force_auto_number=False)
# Display the API response (the saved output below shows a record count).
response

{'count': 114}

# Prep for DCIPHER

In [3]:
# Execute both helper scripts inside this notebook's namespace (-i).
# Presumably DCIPHER_transform.py defines the transform helpers called in the
# following cells (e.g. clean_merge) — confirm against the script.
%run -i "DCIPHER_transform.py"
%run -i "viral_lims_export.py"

In [4]:
# Export the REDCap project frames (keyed by project ID) and bind the three
# that the DCIPHER preparation cells work on.
ww_redcap = project_dtype_summary(redcap_api_url, redcap_tokens_prod)

df_pid171, df_pid170, df_pid176 = (
    ww_redcap[project_id] for project_id in ("PID171", "PID170", "PID176")
)


In [5]:
# Build each transformed frame from the frames held in ww_redcap, so that
# re-running this cell gives the same result. The previous form fed each
# variable back into its own transform, so a second run transformed
# already-transformed data.
# NOTE(review): assumes the helpers return new frames rather than mutating
# their input — confirm in DCIPHER_transform.py.
df_pid170 = pid170_values_transform(condense_county_columns(ww_redcap["PID170"]))

df_pid171 = pid171_transform(wide_to_long(ww_redcap["PID171"]))

df_pid176 = pid176_transform(ww_redcap["PID176"])

In [6]:
# Merge the three transformed project frames into one upload table.
complete = clean_merge(df_pid170, df_pid171, df_pid176)

# patch 3.0 modifications
complete = DCIPHER_v3_modifications(complete)

# filter and save: keep only rows whose sample_id appears in the df_lims index
rows_to_keep = complete["sample_id"].isin(df_lims.index)
complete = complete.loc[rows_to_keep, :]

# local copy in the working directory
complete.to_csv("DCIPHER_upload.csv", index=False)

# copy on the shared drive; the file-name suffix is a raw string because "\D"
# is an invalid escape sequence in a normal string literal
path = r"Y:\Confidential\DCHS\PHOCIS\Surveillance\COVID-19 Wastewater Surveillance\DCIPHER_upload\ww_files"
complete.to_csv(path + r"\DCIPHER_upload.csv", index=False)

In [7]:
from datetime import datetime

# Date stamp for the archive file name in the form YYYY_MM_DD.
date = datetime.now().strftime("%Y_%m_%d")

historic_path = r"Y:\Confidential\DCHS\PHOCIS\Surveillance\COVID-19 Wastewater Surveillance\DCIPHER_upload\ww_files\historic_uploads"

# Archive a dated copy of the upload; r"\DCIPHER_" is raw because "\D" is an
# invalid escape sequence in a normal string literal.
complete.to_csv(historic_path + r"\DCIPHER_" + date + ".csv", index=False)

# Explore DCIPHER 2nd upload Issues

# Patch 3.0 fields names changes

In [None]:
from lims_login import redcap_api_url
from lims_login import redcap_tokens_prod

print(redcap_api_url)
# Do not echo the API token itself: cell output is saved with the notebook and
# would leak the credential. Only confirm that a PID171 token is present.
print("PID171 token loaded:", bool(redcap_tokens_prod["PID171"]))

In [None]:
# Export the metadata of the project whose token is stored under "PID171".
pid171_meta = redcap_metadata_export(redcap_api_url,redcap_tokens_prod["PID171"])

In [None]:
# Reduce the PID171 metadata with accepted_redcap_fields (the first cell uses the
# same helper to obtain the accepted values of restricted fields).
pid_171_meta_fields = accepted_redcap_fields(pid171_meta)

In [None]:
# Open a PyCap connection using the token stored under "PID171".
import redcap
project = redcap.Project(redcap_api_url, redcap_tokens_prod["PID171"])

In [None]:
# Export the project's records as a DataFrame, requesting labels instead of raw codes.
test = project.export_records(format = "df", raw_or_label = "label")

In [None]:
# Inspect the exported extraction_method column.
test["extraction_method"]

# Exploring LIMS data map (data dictionary) compared to REDCap fields

### Do the fields in LIMS database match the fields in the data map (data dictionary)?

In [None]:
# Import the LIMS data map Excel file (data dictionary): sheet "DD for SARS in WW",
# columns A:C, header taken from row index 3, first column used as the index.
df_LIMS_map = pd.read_excel("LIMS_Data_Dictionary_for_SARS_in_WW.xlsx", sheet_name = "DD for SARS in WW", index_col = 0, header = 3, usecols = "A:C")
LIMS_map_fields = list(df_LIMS_map.index) #data map fields

# Import the raw LIMS export and take its column names.
lims_raw = export_df_from_LIMS()
lims_values = list(lims_raw.columns)

# Do the values in the data dictionary correspond to the raw LIMS columns?
print(f"number of raw fields in LIMS database: {len(lims_values)}")
print(f"number of raw fields in LIMS data map (dictionary): {len(LIMS_map_fields)}")

# List equality: True only when both lists hold the same names in the same order.
print(f" Are the raw fields identical? {lims_values == LIMS_map_fields}")

### Isolating only the data map fields that will be moved to REDCap 

In [None]:
# Column map from raw LIMS field names to REDCap field names.
# This is the original map, before SARSCoV2AvgConc and SARSCoV2BelowLOD receive
# the N1_ / N2_ prefix during the long-to-wide transform (see the commented
# entries at the end of the dict).
dict_lims_column_map = {
    'SampleCollectDate': 'sample_collect_date',
    'SampleCollectTime': 'sample_collect_time',
    'pH': 'ph',
    'Conductivity': 'conductivity',
    'CollectionWaterTemp': 'collection_water_temp',
    'TSS': 'tss',
    'CollectionStorageTime': 'collection_storage_time',
    'CollectionStorageTemp': 'collection_storage_temp',
    'Pretreatment': 'pretreatment',
    'PretreatmentSpecify': 'pretreatment_specify',
    'EquivSewageAmt': 'equiv_sewage_amt',
    'TestResultDate': 'test_result_date',
    'FlowRate': 'flow_rate',
    'SARSCoV2Units': 'sars_cov2_units',
    'SARSCoV2AvgConc': 'sars_cov2_avg_conc',
    'SARSCoV2StdError': 'sars_cov2_std_error',
    'SARSCoV2CI95lo': 'sars_cov2_cl_95_lo',
    'SARSCoV2CI95up': 'sars_cov2_cl_95_up',
    'SARSCoV2BelowLOD': 'sars_cov2_below_lod',
    'LODSewage': 'lod_sewage',
    'NTCAmplify': 'ntc_amplify',
    'RecEffSpikeConc': 'rec_eff_percent',
    'InhibitionDetect': 'inhibition_detect',
    'InhibitionAdjust': 'inhibition_adjust',
    'ConcentrationMethod': 'concentration_method',
    'ExtractionMethod': 'extraction_method',
    'PreConcStorageTime': 'pre_conc_storage_time',
    'PreConcStorageTemp': 'pre_conc_storage_temp',
    'PreExtStorageTime': 'pre_ext_storage_time',
    'PreExtStorageTemp': 'pre_ext_storage_temp',
    'TotConcVol': 'tot_conc_vol',
    'QualityFlag': 'quality_flag',
    # 'N1_SARSCoV2AvgConc': 'n1_sars_cov2_avg_conc',
    # 'N2_SARSCoV2AvgConc': 'n2_sars_cov2_avg_conc',
    # 'N1_SARSCoV2BelowLOD': 'n1_sars_cov2_below_lod',
    # 'N2_SARSCoV2BelowLOD': 'n2_sars_cov2_below_lod'
}


# Select only the mapped fields that also exist in the LIMS data map.
LIMS_map_fields_v2 = set(dict_lims_column_map.keys()) & set(LIMS_map_fields)
# Did all the fields get captured in the variable above? (the two counts should match)
print(len(dict_lims_column_map))
print(len(LIMS_map_fields_v2))

# Relevant data-map rows, renamed to the REDCap field names.
# .loc is given a list (in dict order): pandas 2.0 no longer accepts a set as
# an indexer, and a set has no stable row order anyway.
relevant_fields = [field for field in dict_lims_column_map if field in LIMS_map_fields_v2]
df_LIMS_map_v2 = df_LIMS_map.loc[relevant_fields].rename(index=dict_lims_column_map)
df_LIMS_map_v2.head()

### Completed transformations: Numeric columns

In [None]:
# Data-map rows for the fields listed in numeric_clms (not defined in this
# notebook; presumably provided by viral_lims_export.py — confirm).
df_LIMS_map_v2.loc[numeric_clms]

### Completed transformations: Text to columns

In [None]:
# Data-map rows for the fields listed in text_to_numeric (not defined in this
# notebook; presumably provided by viral_lims_export.py — confirm).
df_LIMS_map_v2.loc[text_to_numeric]

### Completed transformations: Dates were not changed, time field is transformed elsewhere

In [None]:
# Date/time fields; per the heading above, dates are left unchanged and the
# time field is transformed elsewhere.
time_and_date = ["sample_collect_date", "test_result_date","sample_collect_time"]
df_LIMS_map_v2.loc[time_and_date]

### Completed transformations: special fields, do not need transform

In [None]:
# Fields treated as "special" (no transform needed, per the heading above).
special = ["sars_cov2_avg_conc","sars_cov2_below_lod" ]
df_LIMS_map_v2.loc[special]

## Completed transformations: yes/no columns

In [None]:
# yes_no_clms is not defined in this notebook (presumably provided by
# viral_lims_export.py — confirm).
print(yes_no_clms)

# Remove the N1/N2-prefixed below-LOD names, which are not index labels of the
# original (un-prefixed) data map. Built via set difference, so the order of
# the resulting list is not guaranteed.
yes_no_clms_original = list(set(yes_no_clms) - set(['n1_sars_cov2_below_lod', 'n2_sars_cov2_below_lod']))

df_LIMS_map_v2.loc[yes_no_clms_original]

### Explore remaining fields

In [None]:
# Every field already examined in the sections above.
covered_clms = numeric_clms + text_to_numeric + time_and_date + special + yes_no_clms_original

# Boolean mask of data-map rows that are covered; show the rows that are not.
covered = df_LIMS_map_v2.index.isin(covered_clms)
df_LIMS_map_v2.loc[~covered]

In [None]:
# Column labels of the transformed LIMS frame.
df_lims.columns

In [None]:
# Look up the "Field/Data Type" entry of the data map for extraction_method.
df_LIMS_map_v2.loc["extraction_method", "Field/Data Type"]

In [None]:
# Recompute the coverage mask and keep the index labels of the uncovered fields.
covered = df_LIMS_map_v2.index.isin(covered_clms)
not_covered = df_LIMS_map_v2.loc[~covered].index

In [None]:
# Show the uncovered field names as a plain list.
list(not_covered)

In [None]:
# Iterating a DataFrame yields its column labels, so this collects the unique
# values of each uncovered column of df_lims.
[df_lims[clm].unique() for clm in df_lims[not_covered]]

Notes for REDCap import:
'extraction_method' has a list of possibilities, the LIMS value is not present in this list
'lod_sewage' is just a text value, test importing native data
'pre_conc_storage_temp' is just a text value, test importing native data
'sars_cov2_units' has list of possibilities, number inputs, need dict to convert
'concentration_method' has a list of possibilities, the LIMS value is not present in this list

# Appendix

## A-I) Explore datatype stored in LIMS database

The pyodbc cursor object lets you interact with database parameters. cursor.columns() returns information about every column in the database table.

In [None]:
cnxn = pyodbc.connect(credentials) # credentials = 'DSN=LIMS_DATA;UID=xxxxxxx;PWD=xxxxxxx'
try:
    cursor = cnxn.cursor()

    # (column name, database type name) for every column of the LIMS view
    dtype_list = [
        (column.column_name, column.type_name)
        for column in cursor.columns(table="vz_Epi_ELS_SARS-CoV-2 ddPCR")
    ]
finally:
    # always release the database connection, even if the catalog query fails
    cnxn.close()

dtype_list


RESULT: Two columns have datetime type, remaining columns are varchar type
('TestResultDate', 'datetime')
('SampleCollectDate', 'datetime')

## A-II) Explore converting LIMS dataframe to numeric type - may not be necessary.

In [None]:
# Candidate columns for numeric conversion.
potential_numeric = ["NumNoTargetControl", "SARSCoV2AvgConc"]

In [None]:
# Convert the candidate columns to numeric; errors="coerce" turns values that
# cannot be parsed into NaN. This overwrites the columns of df_lims in place.
# NOTE(review): these are raw LIMS column names; df_lims from the first cell has
# already been renamed, so this presumably targets an un-renamed export — confirm.
df_lims[potential_numeric] = df_lims[potential_numeric].apply(pd.to_numeric, errors = "coerce")

In [None]:
# Summarize column dtypes and non-null counts after the conversion.
df_lims.info()

## B-I) REDCap Manual data export

Exploring manual csv data export - column ID's, Datatypes, Exporting Survey ID and Survey Timestamp

**Conclusion**: 2 additional columns are present when manually exporting the csv with Survey ID and Survey Timestamp selected

In [None]:
import pandas as pd

# import data: the full manual export, and one made without the survey
# identifier / survey timestamp fields
df_PID177_manual = pd.read_csv("./redcap_manual_export/PID177_ww_labs.csv")
df_PID177_manual_noID_noTimeStamp = pd.read_csv("./redcap_manual_export/PID177_ww_labs_minus_SurTimestamp_SurIdentifier.csv")

# make sets of column names
columns_PID177_full = set(df_PID177_manual.columns)
columns_PID177_minimal = set(df_PID177_manual_noID_noTimeStamp.columns)

# compare column sets
print("additional columns present: " + str(columns_PID177_full - columns_PID177_minimal))
#print(columns_PID177_minimal - columns_PID177_full) #returns empty set

print("\n")
print(df_PID177_manual[['redcap_survey_identifier', 'a3_ww_lab_set_up_timestamp']])
print("\n")
# info() writes its report itself and returns None, so wrapping it in print()
# only appended a stray "None" line
df_PID177_manual.info()

## B-II) REDCap API Data Export

Explore data export via native REDCap API pull

**Conclusion**: The API call returns data without the additional columns: [redcap_survey_identifier, a3_ww_lab_set_up_timestamp]. These columns can be pulled when exporting data manually by checking a box.

**Conclusion**: API export columns and manual export columns are identical when the survey_identifier and survey_timestamp fields remain unchecked during manual export.

**Conclusion**: During API export, all column fields are objects. Manual export to csv and load to pandas, yields some numeric fields.

**Conclusion**: Datetime format is different between API export, and manual csv export. 

In [None]:
# export all PID177 data via API
df_PID177_API = redcap_API_export(redcap_api_url, redcap_tokens_prod["PID177"])

API_columns_set = set(df_PID177_API.columns)

# compare columns of the manual csv export (with identifier and timestamp fields)
# against the standard API export
print("additional columns present: " + str(columns_PID177_full - API_columns_set))
#print(API_columns_set - columns_PID177_full) #empty set
print()
# are all the columns identical?
print("Are all the columns identical between standard csv export and API export?")
# Index.equals gives False when the column counts differ, where the previous
# element-wise == comparison would raise instead
print(df_PID177_manual_noID_noTimeStamp.columns.equals(df_PID177_API.columns))

print()
# converting both manually pulled csv and API data to numeric datatypes (if possible)
#df_PID177_API = df_PID177_API.apply(pd.to_numeric, errors = "ignore")
#df_PID177_manual_noID_noTimeStamp = df_PID177_manual_noID_noTimeStamp.apply(pd.to_numeric, errors = "ignore")

# converting timestamp
df_PID177_API["ww_lab_setup_date"] = pd.to_datetime(df_PID177_API["ww_lab_setup_date"])
df_PID177_manual_noID_noTimeStamp["ww_lab_setup_date"] = pd.to_datetime(df_PID177_manual_noID_noTimeStamp["ww_lab_setup_date"])
print("after converting all columns to numeric, and 'ww_lab_setup_date' columns to datetime, are the dataframes identical?")

# compare the manual export with the API export (the frame was previously
# compared with itself, which is always True)
print(df_PID177_manual_noID_noTimeStamp.equals(df_PID177_API))


# B-III) RedCap Pycap Data Export

**Conclusion**: PyCap API calls to export record DO NOT contain record status data

In [None]:
# Open a PyCap connection using the token stored under "PID177".
import redcap
PID177_project = redcap.Project(redcap_api_url, redcap_tokens_prod["PID177"])

In [None]:
# Export PID177 records through PyCap as a DataFrame. The commented keyword
# arguments are optional export switches that were left disabled.
PID177_pycap = PID177_project.export_records(format="df",
                                             event_name = "unique",
                                             #export_checkbox_labels = True,
                                             #export_data_access_groups = True,
                                             #export_survey_fields = True
                                            )



# Columns present in the ww_redcap["PID177"] frame but missing from the PyCap export ...
print("Additional columns present: " + str(set(ww_redcap["PID177"].columns) - set(PID177_pycap.columns)))
# ... and columns present only in the PyCap export.
print(set(PID177_pycap.columns) - set(ww_redcap["PID177"].columns))

# C-I) Exploring LIMS data columns and REDCap data columns

**Conclusion**: Only certain columns from LIMS data map to PID171

**Conclusion**: Created an excel file, to map column name conversion between LIMS and REDCap PID171, created a function to generate dictionary for LIMS data transformation (renaming columns)

In [None]:
# PID171 frame from the ww_redcap export; print its (rows, columns) shape.
df_WWSamples = ww_redcap["PID171"]
print(df_WWSamples.shape)

In [None]:
# Write the column dtypes of both frames to CSV files in the working directory
# for side-by-side comparison.
df_lims.dtypes.to_csv("lims_datatypes.csv")
df_WWSamples.dtypes.to_csv("PID171_WW_Samples.csv")

In [None]:
# Preview the first five columns (all rows) of the PID171 frame.
df_WWSamples.iloc[:,:5]

Code to generate the column name transform dictionary. The data lives in the xlsx file "LIMS_REDCap_columns_IDs.xlsx".

In [None]:
# Load the LIMS <-> REDCap column-name workbook (sheet "COMBINED").
df_lims_redcap_column_map = pd.read_excel("LIMS_REDCap_columns_IDs.xlsx", sheet_name="COMBINED")

# Rows that carry a LIMS column name, i.e. LIMS columns that map to REDCap.
not_null_lims = df_lims_redcap_column_map["LIMS_COLUMNS_NAME"].notna()

# Keep the two name columns for those rows, indexed by the LIMS name.
df_lims_map = (
    df_lims_redcap_column_map
    .loc[not_null_lims, ["PID171_COLUMNS_NAME", "LIMS_COLUMNS_NAME"]]
    .set_index("LIMS_COLUMNS_NAME")
)

# {LIMS column name: PID171 column name}
dict_lims_map = df_lims_map["PID171_COLUMNS_NAME"].to_dict()

dict_lims_map

# C-II) Explore LIMS date column types in preparation for REDCap Import
Conclusion: Only 2 columns from LIMS contain dates; both columns contain dates in the form YYYY-MM-DD, so nothing else is needed.

In [None]:
# All REDCap PID171 fields whose validation type mentions "date".
is_date_validated = fields_datetime["text_validation_type_or_show_slider_number"].str.contains("date")
mdy = fields_datetime[is_date_validated]

redcap_date_fields = set(mdy.index)
lims_sourced_fields = set(dict_lims_column_map.values())  # all REDCap fields that come from LIMS
dates = list(redcap_date_fields & lims_sourced_fields)

print(dates)  # only two columns from LIMS that go into REDCap have date values


# Show the string form of every value in each date column that must be
# imported into REDCap, to check the format.
for date_column in dates:
    print(list(df_lims[date_column].astype("str")))

# D-I) Demonstration of Upload 1 row of data into REDCap

In [None]:
# Build a single throw-away record (ID 999999) shaped like the LIMS upload, for
# a test import into PID171.
from datetime import date

# TEST Import 1 row of data (PID171 format, all blanks)
# Take the PID171 export restricted to the df_lims columns and keep only its first row.
df_test_01 = ww_redcap["PID171"][df_lims.columns].iloc[0:1].copy()   #take only 1 row of data 


# setting values to df_test
df_test_01.set_index(pd.Series([999999]), inplace = True)  #set index to 999999
# Fill every column of the row with "test", then overwrite the fields listed below.
df_test_01.loc[999999] = "test"
df_test_01["test_result_date"] = date.today()
df_test_01['sample_collect_date'] = date.today()
df_test_01['sample_collect_time'] = "10:30"
# NOTE(review): the "(yes, no)" order in the next comment may be reversed relative to "0 or 1" — confirm the coding.
df_test_01["pretreatment"] = 1 # 0 or 1 (yes, no)
df_test_01["sars_cov2_units"] = 1 # number 1-6 (drop down categories)
df_test_01["sars_cov2_below_lod"] = "yes" # "yes" or "no"
df_test_01["ntc_amplify"] = "yes" # "yes" or "no"
df_test_01["inhibition_detect"] =  "yes" # "yes" or "no" or "not_tested"
df_test_01["inhibition_adjust"] = "yes" # "yes" or "no" (only if inhibition_detect = "yes")
df_test_01["concentration_method"] = "mf-mgcl2" #long list of drop down values
df_test_01["extraction_method"] = "qiagen-fecal" #long list of drop down values
df_test_01["quality_flag"] = "yes" # "yes" or "no"



In [None]:
# Connect to the REDCap project whose token is stored under "PID171".
import redcap
project = redcap.Project(redcap_api_url, redcap_tokens_prod["PID171"])

# Import the single test row. NOTE: this writes to the connected project.
response = project.import_records(df_test_01, force_auto_number=False)
# Display the API response.
response

# D-II) Testing obligate fields import into REDCap

In [None]:
# Build a simple dataframe to import obligate value fields only.

# Take the first accepted value of every field.
# NOTE(review): `fields` is not defined anywhere in this notebook; presumably it
# is the mapping produced by accepted_redcap_fields (cf. fields_restricted) — confirm.
row = {field_name: next(iter(choices)) for field_name, choices in fields.items()}

single_row = pd.DataFrame(row, index=[1])
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to
# stack the row onto itself.
two_rows = pd.concat([single_row, single_row]).reset_index(drop=True)
two_rows.index.name = "sample_id"

In [None]:
#Attempt to import two rows
import redcap
project = redcap.Project(redcap_api_url, redcap_tokens_prod["PID171"])

response = project.import_records(two_rows, force_auto_number=False)
response

# D-III) Combination of new column names due to tests for multiple PCR targets per sample

Conclusion: 2 unique columns must be present for every PCR target: ["sars_cov2_below_lod", "sars_cov2_avg_conc"]

There are 14 PCR targets. Resulting in 28 combinations. 

In [None]:
# PID176 metadata
# NOTE: this rebinds fields_restricted, which the first cell derives from the
# PID171 metadata; re-run that cell before relying on the PID171 values again.
fields_restricted = (
    redcap_metadata_export(redcap_api_url, redcap_tokens_prod["PID176"])
    .pipe(accepted_redcap_fields)
    )

# Lists that need to be combined
pcr_target = list(fields_restricted['pcr_target'].values())
result_fields = ["sars_cov2_below_lod", "sars_cov2_avg_conc"]

# Every "<pcr target>_<result field>" combination, target-major. (A leftover
# stand-alone expression that only built the first combination and discarded
# it has been removed.)
combined_fields = [
    target + "_" + result_field
    for target in pcr_target
    for result_field in result_fields
]

combined_fields

# D-IV) Exploring New Column names for PCR gene Targets

In [None]:
df = ww_redcap["PID171"]
below_lod = []
avg_conc = []

# Route every column whose name ends in "below_lod" or "avg_conc" to its list,
# echoing each match in column order; all other columns are skipped.
for column_name in df.columns:
    if re.search(r"below_lod$", column_name):
        matched_list = below_lod
    elif re.search(r"avg_conc$", column_name):
        matched_list = avg_conc
    else:
        continue
    matched_list.append(column_name)
    print(column_name)
    

In [None]:
# Column names found in the PID171 export by the previous cell.
combined = avg_conc + below_lod

# Expected result columns: one below_lod / avg_conc pair per PCR target.
master_key = [
    'n1_sars_cov2_below_lod', 'n1_sars_cov2_avg_conc',
    'n2_sars_cov2_below_lod', 'n2_sars_cov2_avg_conc',
    'n3_sars_cov2_below_lod', 'n3_sars_cov2_avg_conc',
    'e_sarbeco_sars_cov2_below_lod', 'e_sarbeco_sars_cov2_avg_conc',
    'n_sarbeco_sars_cov2_below_lod', 'n_sarbeco_sars_cov2_avg_conc',
    'rdrp_sarsr_sars_cov2_below_lod', 'rdrp_sarsr_sars_cov2_avg_conc',
    'niid_2019ncov_n_sars_cov2_below_lod', 'niid_2019ncov_n_sars_cov2_avg_conc',
    'rdrp_genencov_ip2_sars_cov2_below_lod', 'rdrp_genencov_ip2_sars_cov2_avg_conc',
    'rdrp_genencov_ip4_sars_cov2_below_lod', 'rdrp_genencov_ip4_sars_cov2_avg_conc',
    'taqpathn_sars_cov2_below_lod', 'taqpathn_sars_cov2_avg_conc',
    'taqpaths_sars_cov2_below_lod', 'taqpaths_sars_cov2_avg_conc',
    'orf1b_sars_cov2_below_lod', 'orf1b_sars_cov2_avg_conc',
    'orf1ab_sars_cov2_below_lod', 'orf1ab_sars_cov2_avg_conc',
    'n1andn2combined_sars_cov2_below_lod', 'n1andn2combined_sars_cov2_avg_conc',
]

# Report every exported column that the master key does not list.
for column_name in combined:
    if column_name not in master_key:
        print(f'ERROR The following value is not present in master key {column_name}')

n = set(master_key)
m = set(combined)

# Differences in both directions, then the four list lengths, one per line.
for report_item in (n - m, m - n, len(below_lod), len(avg_conc), len(master_key), len(combined)):
    print(report_item)