# Development of Wastewater Surveillance Data Automation Script 

## 1) Export All Data from LIMS Database

In [1]:
# Run the helper script that implements the LIMS / REDCap export.
# `-i` executes the script inside this notebook's interactive namespace.
# NOTE(review): the names used below (export_df_from_LIMS, project_dtype_summary,
# redcap_api_url, redcap_tokens_prod) are presumably defined by that script - confirm.

%run -i "viral_lims_export.py"

# Export all LIMS data; the result is kept in `df` and reused by later cells.
df = export_df_from_LIMS()
# Optional sanity check (disabled): df.info()

# Export the dataframes of the 4 wastewater (WW) REDCap projects.
# Later cells index the result by project ID, e.g. ww_redcap["PID177"].
ww_redcap = project_dtype_summary(redcap_api_url, redcap_tokens_prod)

# Appendix 

## A-I) Explore datatype stored in LIMS database

The pyodbc cursor object allows us to interact with the database. `cursor.columns()` returns information about every column in the given database table.

In [None]:
# Collect (column name, SQL type name) pairs for every column of the LIMS view.
# `credentials` is an ODBC connection string of the form
# 'DSN=LIMS_DATA;UID=xxxxxxx;PWD=xxxxxxx'.
# NOTE(review): `pyodbc` and `credentials` are not defined in this notebook;
# presumably they come from the %run -i script in section 1 - confirm.
cnxn = pyodbc.connect(credentials)
try:
    cursor = cnxn.cursor()
    dtype_list = [
        (column.column_name, column.type_name)
        for column in cursor.columns(table="vz_Epi_ELS_SARS-CoV-2 ddPCR")
    ]
finally:
    # Always release the ODBC connection, even if the metadata query fails;
    # the original cell left it open for the lifetime of the kernel.
    cnxn.close()

dtype_list


RESULT: Two columns have datetime type, remaining columns are varchar type
('TestResultDate', 'datetime')
('SampleCollectDate', 'datetime')

## A-II) Explore converting LIMS dataframe to numeric type - may not be necessary.

In [None]:
# LIMS columns that are candidates for conversion to a numeric dtype.
potential_numeric = [
    "NumNoTargetControl",
    "SARSCoV2AvgConc",
]

In [None]:
# Convert each candidate column to a numeric dtype; values that cannot be
# parsed are coerced to NaN instead of raising.
df[potential_numeric] = df[potential_numeric].apply(
    lambda column: pd.to_numeric(column, errors="coerce")
)

In [None]:
# Print the column dtypes and non-null counts of the LIMS dataframe.
df.info()

## B-I) REDCap Manual data export

Exploring manual CSV data export - column IDs, datatypes, and exporting the survey identifier and survey timestamp

**Conclusion**: 2 additional columns are present when manually exporting the CSV with the survey identifier and survey timestamp fields selected

In [None]:
import pandas as pd

# Load the two manual REDCap CSV exports for PID177: the full export, and the
# export made without the survey timestamp / survey identifier fields.
df_PID177_manual = pd.read_csv("./redcap_manual_export/PID177_ww_labs.csv")
df_PID177_manual_noID_noTimeStamp = pd.read_csv("./redcap_manual_export/PID177_ww_labs_minus_SurTimestamp_SurIdentifier.csv")

# Column names of each export, as sets, so they can be compared with set algebra.
columns_PID177_full = set(df_PID177_manual.columns)
columns_PID177_minimal = set(df_PID177_manual_noID_noTimeStamp.columns)

# Columns that only exist in the full export.
print("additional columns present: " + str(columns_PID177_full - columns_PID177_minimal))
# The reverse difference (columns_PID177_minimal - columns_PID177_full) was
# noted as an empty set in the original exploration.

print("\n")
print(df_PID177_manual[['redcap_survey_identifier', 'a3_ww_lab_set_up_timestamp']])
print("\n")
# DataFrame.info() writes its report to stdout and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df_PID177_manual.info()

## B-II) REDCap API Data Export

Explore data export via native REDCap API pull

**Conclusion**: The API call returns data without the additional columns: [redcap_survey_identifier, a3_ww_lab_set_up_timestamp]. These columns can be pulled when exporting data manually by checking a box.

**Conclusion**: API export columns and manual export columns are identical when the survey_identifier and survey_timestamp fields remain unchecked during manual export.

**Conclusion**: During API export, all column fields are objects. Manually exporting to CSV and loading into pandas yields some numeric fields.

**Conclusion**: Datetime format is different between API export, and manual csv export. 

In [None]:
# Export all PID177 data via the API.
df_PID177_API = redcap_API_export(redcap_api_url, redcap_tokens_prod["PID177"])

API_columns_set = set(df_PID177_API.columns)

# Compare the columns of the manual CSV export (with identifier and timestamp
# fields) against the standard API export.
print("additional columns present: " + str(columns_PID177_full - API_columns_set))
# The reverse difference (API_columns_set - columns_PID177_full) was noted as
# an empty set in the original exploration.
print()
# Are all the columns identical (same names, same order)?
print("Are all the columns identical between standard csv export and API export?")
# Index.equals returns False when the column counts differ, whereas an
# element-wise `==` between two Index objects of different length raises.
print(df_PID177_manual_noID_noTimeStamp.columns.equals(df_PID177_API.columns))

print()
# Converting both the manually pulled CSV and the API data to numeric dtypes
# (where possible) is currently disabled:
#df_PID177_API = df_PID177_API.apply(pd.to_numeric, errors = "ignore")
#df_PID177_manual_noID_noTimeStamp = df_PID177_manual_noID_noTimeStamp.apply(pd.to_numeric, errors = "ignore")

# Convert the setup date column to datetime in both dataframes.
df_PID177_API["ww_lab_setup_date"] = pd.to_datetime(df_PID177_API["ww_lab_setup_date"])
df_PID177_manual_noID_noTimeStamp["ww_lab_setup_date"] = pd.to_datetime(df_PID177_manual_noID_noTimeStamp["ww_lab_setup_date"])
# The message only mentions the datetime conversion because the numeric
# conversion above is commented out.
print("after converting 'ww_lab_setup_date' columns to datetime, are the dataframes identical?")

# Compare the manual export against the API export; the original compared the
# manual dataframe with itself, which is always True.
print(df_PID177_manual_noID_noTimeStamp.equals(df_PID177_API))


## B-III) REDCap PyCap Data Export

**Conclusion**: PyCap API calls to export records DO NOT contain record status data (the `*_complete` fields)

In [3]:
# PyCap client library for the REDCap API.
import redcap
# Project handle for PID177, built from the API URL and that project's token.
# NOTE(review): redcap_api_url / redcap_tokens_prod are presumably provided by
# the %run -i script in section 1 - confirm.
PID177_project = redcap.Project(redcap_api_url, redcap_tokens_prod["PID177"])

In [17]:
# Pull the PID177 records through PyCap as a DataFrame. The optional export
# flags are kept below, disabled, for reference.
PID177_pycap = PID177_project.export_records(
    format="df",
    event_name="unique",
    # export_checkbox_labels=True,
    # export_data_access_groups=True,
    # export_survey_fields=True,
)

# Columns found in ww_redcap["PID177"] but missing from the PyCap export.
print("Additional columns present: " + str(set(ww_redcap["PID177"].columns).difference(PID177_pycap.columns)))
# Reverse direction: columns found only in the PyCap export.
print(set(PID177_pycap.columns).difference(ww_redcap["PID177"].columns))

Additional columns present: {'lab_tracking_complete', 'a3_ww_lab_set_up_complete'}
set()


## C-I) Exploring LIMS data columns and REDCap data columns

In [2]:
# Select the PID171 project's dataframe from the REDCap export dictionary.
df_WWSamples = ww_redcap["PID171"]
# Show its (rows, columns) shape.
print(df_WWSamples.shape)

(9, 67)


In [3]:
# Write the column dtypes of the LIMS dataframe and of the PID171 dataframe to
# CSV files in the working directory so they can be compared outside the notebook.
df.dtypes.to_csv("lims_datatypes.csv")
df_WWSamples.dtypes.to_csv("PID171_WW_Samples.csv")

In [5]:
# Display all rows of the first five columns of the PID171 dataframe.
df_WWSamples.iloc[:,:5]

Unnamed: 0_level_0,sample_site_name,sample_site_id,label_date,ww_lab_name,ww_lab_id
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
210001,1,1,2021-10-04,1,1
210002,1,1,2021-10-04,1,1
210003,1,1,2021-10-04,1,1
210004,2,2,2021-10-04,2,2
210005,2,2,2021-10-04,2,2
210006,2,2,2021-10-04,2,2
210007,3,3,2021-10-04,3,3
210008,3,3,2021-10-04,3,3
210009,3,3,2021-10-04,3,3


In [6]:
# Display the full PID171 dataframe (small here: shape printed above is (9, 67)).
df_WWSamples

Unnamed: 0_level_0,sample_site_name,sample_site_id,label_date,ww_lab_name,ww_lab_id,micro_lab_name,micro_lab_id,collection_date,sampler_initial,sample_setup_complete,...,latestrejectcomment,sample_flagged,reason_flagged,c2_viral_lab_report_form_complete,label_printed,ww_received,micro_received,viral_data_flag,data_flag_notes,sample_progress_tracker_complete
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
210001,1,1,2021-10-04,1,1,1,1,2021-10-11,,2,...,,,,0,,,,,,0
210002,1,1,2021-10-04,1,1,1,1,2021-10-13,,2,...,,,,0,,,,,,0
210003,1,1,2021-10-04,1,1,1,1,2021-10-15,,2,...,,,,0,,,,,,0
210004,2,2,2021-10-04,2,2,1,1,2021-10-11,,2,...,,,,0,,,,,,0
210005,2,2,2021-10-04,2,2,1,1,2021-10-13,,2,...,,,,0,,,,,,0
210006,2,2,2021-10-04,2,2,1,1,2021-10-15,,2,...,,,,0,,,,,,0
210007,3,3,2021-10-04,3,3,1,1,2021-10-11,,2,...,,,,0,,,,,,0
210008,3,3,2021-10-04,3,3,1,1,2021-10-13,,2,...,,,,0,,,,,,0
210009,3,3,2021-10-04,3,3,1,1,2021-10-15,,2,...,,,,0,,,,,,0
