In [1]:
# Export LIMS Data for Demo/Summary
# `%run -i` executes the script in this notebook's interactive namespace (not a
# fresh one); the names the script defines are available to the cells below.
# NOTE(review): export_df_from_LIMS and the helpers/column lists used later
# (e.g. drop_null_sample_ID, long_to_wide, numeric_clms) are presumably defined
# by viral_lims_export.py — confirm, since nothing is imported explicitly.
%run -i "viral_lims_export.py"
# Raw LIMS export; every later cell reads from and reassigns `df_lims`.
df_lims = export_df_from_LIMS()

# Summary of LIMS Waste-Water Data Transformation and Import into REDCap
### 2021/11/14
### Alexey Gilman

## Raw LIMS Data Columns:

In [2]:
# Report how many columns the raw export has, then display every column name.
print(f" Number of columns in raw LIMS dataset: {len(df_lims.columns)}")
# list(...) yields the same list as copying the columns one element at a time.
list(df_lims.columns)

 Number of columns in raw LIMS dataset: 64


['PHLAccessionNumber',
 'EPAID',
 'WWTPName',
 'State',
 'CountiesServiced',
 'Zipcode',
 'CapacityMGD',
 'PopulationServed',
 'IndustrialInput',
 'StormwaterInput',
 'StorageonSite',
 'SewageTravelTime',
 'SampleCollectDate',
 'SampleCollectTime',
 'FlowRate',
 'CollectionWaterTemp',
 'SampleType',
 'CompositeFreq',
 'SampleMatrix',
 'SampleLocation',
 'SampleLocationSpecified',
 'CollectionStorageTime',
 'CollectionStorageTemp',
 'Pretreatment',
 'PretreatmentSpecify',
 'WWTPComments',
 'ConcentrationMethod',
 'ExtractionMethod',
 'PreConcStorageTime',
 'PreConcStorageTemp',
 'PreExtStorageTime',
 'PreExtStorageTemp',
 'TotConcVol',
 'ExtBlank',
 'RecEffTargetName',
 'RecEffSpikeMatrix',
 'RecEffSpikeConc',
 'PCRTarget',
 'PCRTargetRef',
 'PCRType',
 'LODRef',
 'QuantStanType',
 'StanRef',
 'InhibitionMethod',
 'NumNoTargetControl',
 'ReportingState',
 'pH',
 'Conductivity',
 'TSS',
 'EquivSewageAmt',
 'TestResultDate',
 'SARSCoV2Units',
 'SARSCoV2AvgConc',
 'SARSCoV2StdError',
 'SAR

## Preliminary Transformation: preparation for long-to-wide transformation
    1. Rows with missing sample IDs are removed.
    2. Rows where PCRTarget column is NOT "N1" or "N2" are removed.
    
##### WARNING: Anything that is not N1 or N2 in the PCRTarget field will be removed!

In [3]:
# Preliminary clean-up ahead of the long-to-wide pivot. Per this notebook's
# notes: rows without a sample ID are dropped, then only rows whose PCRTarget
# is N1 or N2 are kept.
# NOTE(review): below_lod_to_yes_no presumably recodes SARSCoV2BelowLOD as
# Yes/No — confirm in viral_lims_export.py.
df_lims = (
    drop_null_sample_ID(df_lims)
    .pipe(drop_all_but_N1_N2)
    .pipe(below_lod_to_yes_no)
)
# Preview the columns that take part in the pivot.
df_lims[['SubmitterSampleNumber', 'PCRTarget', 'SARSCoV2BelowLOD', 'SARSCoV2AvgConc']].head(10)

Unnamed: 0,SubmitterSampleNumber,PCRTarget,SARSCoV2BelowLOD,SARSCoV2AvgConc
82,210004,N1,Yes,9506.87
83,210004,N2,Yes,30835.54
84,210009,N1,Yes,38439.54
85,210009,N2,Yes,71740.81
86,210001,N1,Yes,222585.26
87,210001,N2,Yes,159718.38
88,210007,N1,Yes,71815.51
89,210007,N2,Yes,47314.22
90,210002,N1,Yes,46533.5
91,210002,N2,Yes,40145.98


## Long-to-wide transformation around PCRTarget

In [4]:
# Preview only (the result is not assigned): one row per sample, with one
# column per (value field, PCR target) pair.
df_lims.pivot(
    index="SubmitterSampleNumber",
    columns="PCRTarget",
    values=["SARSCoV2AvgConc", "SARSCoV2BelowLOD"],
).head(10)

Unnamed: 0_level_0,SARSCoV2AvgConc,SARSCoV2AvgConc,SARSCoV2BelowLOD,SARSCoV2BelowLOD
PCRTarget,N1,N2,N1,N2
SubmitterSampleNumber,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
210001,222585.26,159718.38,Yes,Yes
210002,46533.5,40145.98,Yes,Yes
210003,386402.0,393295.0,Yes,Yes
210004,9506.87,30835.54,Yes,Yes
210005,78735.15,87655.11,Yes,Yes
210006,280038.0,501467.0,Yes,Yes
210007,71815.51,47314.22,Yes,Yes
210008,128865.0,89725.0,Yes,Yes
210009,38439.54,71740.81,Yes,Yes
210010,19388.0,19302.0,Yes,Yes


## Clean up post long-to-wide transformation
    1. Rename columns and remove multi-index
    2. Remove all duplicates (columns that are not part of long-to-wide transformation)

In [5]:
# Apply the long-to-wide transformation, then show the resulting per-target
# concentration and below-LOD columns.
df_lims = df_lims.pipe(long_to_wide)
df_lims[
    [
        "N1_SARSCoV2AvgConc",
        "N2_SARSCoV2AvgConc",
        "N1_SARSCoV2BelowLOD",
        "N2_SARSCoV2BelowLOD",
    ]
].head(10)

Unnamed: 0_level_0,N1_SARSCoV2AvgConc,N2_SARSCoV2AvgConc,N1_SARSCoV2BelowLOD,N2_SARSCoV2BelowLOD
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
210001,222585.26,159718.38,Yes,Yes
210002,46533.5,40145.98,Yes,Yes
210003,386402.0,393295.0,Yes,Yes
210004,9506.87,30835.54,Yes,Yes
210005,78735.15,87655.11,Yes,Yes
210006,280038.0,501467.0,Yes,Yes
210007,71815.51,47314.22,Yes,Yes
210008,128865.0,89725.0,Yes,Yes
210009,38439.54,71740.81,Yes,Yes
210010,19388.0,19302.0,Yes,Yes


## Isolate only columns to be imported into REDCap and rename columns to match REDCap Fields
#### Column Names Dictionary:

In [6]:
# Rename the LIMS columns to their REDCap field names.
# NOTE(review): rename_lims_columns presumably uses dict_lims_column_map and
# also drops columns that are not imported — confirm in viral_lims_export.py.
df_lims = df_lims.pipe(rename_lims_columns)
# Display the LIMS -> REDCap column-name mapping.
dict_lims_column_map

{'SampleCollectDate': 'sample_collect_date',
 'SampleCollectTime': 'sample_collect_time',
 'pH': 'ph',
 'Conductivity': 'conductivity',
 'CollectionWaterTemp': 'collection_water_temp',
 'TSS': 'tss',
 'CollectionStorageTime': 'collection_storage_time',
 'CollectionStorageTemp': 'collection_storage_temp',
 'Pretreatment': 'pretreatment',
 'PretreatmentSpecify': 'pretreatment_specify',
 'EquivSewageAmt': 'equiv_sewage_amt',
 'TestResultDate': 'test_result_date',
 'FlowRate': 'flow_rate',
 'SARSCoV2Units': 'sars_cov2_units',
 'SARSCoV2StdError': 'sars_cov2_std_error',
 'SARSCoV2CI95lo': 'sars_cov2_cl_95_lo',
 'SARSCoV2CI95up': 'sars_cov2_cl_95_up',
 'LODSewage': 'lod_sewage',
 'NTCAmplify': 'ntc_amplify',
 'RecEffSpikeConc': 'rec_eff_percent',
 'InhibitionDetect': 'inhibition_detect',
 'InhibitionAdjust': 'inhibition_adjust',
 'ConcentrationMethod': 'concentration_method',
 'ExtractionMethod': 'extraction_method',
 'PreConcStorageTime': 'pre_conc_storage_time',
 'PreConcStorageTemp': 'pre

## Data Validation: "sample_collect_time" must be in the format "HH:MM"; any value that does not match this pattern is changed to NaN (missing value)

##### WARNING: "sample_collect_time" values not in the format "HH:MM" will be changed to missing values

In [7]:
# Inspect the distinct collection-time strings before validation; the saved
# output includes a blank '  :' entry that does not fit HH:MM.
df_lims["sample_collect_time"].unique()

array(['06:30', '08:00', '08:42', '08:19', '08:45', '07:34', '07:35',
       '00:18', '07:00', '06:55', '08:33', '08:43', '  :', '07:22',
       '07:19', '07:13', '06:45', '07:20', '08:32', '08:48', '08:34',
       '07:24', '07:16', '07:18', '07:40', '08:57', '08:38', '07:33',
       '07:32'], dtype=object)

In [8]:
# Validate sample_collect_time; per this notebook's notes, values that are not
# in HH:MM form become NaN.
df_lims = df_lims.pipe(verify_time_field)

## Data Validation: convert select columns to numeric values or NaN (missing value)

##### WARNING: for all fields below, presence of any non-numeric character will result in missing value

In [9]:
# Display the list of columns slated for numeric conversion.
# NOTE(review): numeric_clms is not defined in this notebook — presumably it
# comes from viral_lims_export.py via `%run -i`; confirm.
numeric_clms

['ph',
 'conductivity',
 'collection_water_temp',
 'tss',
 'collection_storage_temp',
 'equiv_sewage_amt',
 'flow_rate',
 'sars_cov2_std_error',
 'sars_cov2_cl_95_lo',
 'sars_cov2_cl_95_up',
 'pre_ext_storage_temp']

In [15]:
# Show the last 10 rows of the numeric-conversion columns.
# NOTE(review): the saved output still contains text such as "< 6" and
# "856 uS/cm", and no cell in this notebook applies the numeric conversion —
# confirm whether this preview is meant to show the data before conversion.
df_lims[numeric_clms].tail(10)

Unnamed: 0_level_0,ph,conductivity,collection_water_temp,tss,collection_storage_temp,equiv_sewage_amt,flow_rate,sars_cov2_std_error,sars_cov2_cl_95_lo,sars_cov2_cl_95_up,pre_ext_storage_temp
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
210027,,,7.3,,1.0,,6.305,,,,
210028,,,15.0,,< 6,,25.3,,,,
210029,,,15.1,,< 6,,24.8,,,,
210030,,,7.0,,<6,,29.5,,,,
210031,,,3.6,,3.6,,7.7,,,,
210032,,,3.9,,3.9,,8.0,,,,
210033,,,4.1,,4.1,,8.2,,,,
210034,,,6.1,,1.0,,6.381,,,,
210035,,,6.2,,1.0,,6.465,,,,
210036,7.78,856 uS/cm,6.3,,,,,,,,


## Data Validation: Transform select free-text fields to numeric

##### WARNING: for all fields below, digits will be preserved but all text will be removed.

In [16]:
# Display the free-text columns that are reduced to their digits.
# NOTE(review): text_to_numeric is presumably defined in viral_lims_export.py
# and consumed by freetext_transform — confirm.
text_to_numeric

['pretreatment_specify',
 'pre_conc_storage_time',
 'pre_ext_storage_time',
 'rec_eff_percent',
 'collection_storage_time',
 'tot_conc_vol']

In [18]:
# Strip text from the free-text fields, keeping digits (per this notebook's
# warning for this step).
df_lims = df_lims.pipe(freetext_transform)

## Data Validation: Transform select fields to either yes/no or NaN

##### WARNING: for all fields below, any value other than yes/no will be changed to missing value

In [19]:
# Display the list of columns restricted to yes/no values.
# NOTE(review): yes_no_clms is presumably defined in viral_lims_export.py — confirm.
yes_no_clms

['quality_flag',
 'inhibition_adjust',
 'ntc_amplify',
 'pretreatment',
 'inhibition_detect',
 'n1_sars_cov2_below_lod',
 'n2_sars_cov2_below_lod']

In [21]:
# Show the last 10 rows of the yes/no columns.
# NOTE(review): no cell in this notebook applies the yes/no transformation
# before this preview — confirm where it happens.
df_lims[yes_no_clms].tail(10)

Unnamed: 0_level_0,quality_flag,inhibition_adjust,ntc_amplify,pretreatment,inhibition_detect,n1_sars_cov2_below_lod,n2_sars_cov2_below_lod
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
210027,No,,No,No,,Yes,Yes
210028,No,,No,No,,No,No
210029,No,,No,No,,No,No
210030,No,,No,No,,Yes,Yes
210031,No,,No,No,,No,No
210032,No,,No,No,,No,No
210033,No,,No,No,,Yes,Yes
210034,No,,No,No,,No,No
210035,No,,No,No,,No,No
210036,No,,No,,,Yes,Yes


## Data Validation: "extraction_method" must be one of the following values in REDCap; anything else will be changed to NaN (missing value)

#### WARNING: currently ALL values in "extraction_method" are changed to NaN because values do not match REDCap available choices.

In [27]:
# Export the REDCap metadata for project PID171 and pass it through
# accepted_redcap_fields.
# NOTE(review): the result is used below as a mapping of field name -> accepted
# choices; the exact structure comes from accepted_redcap_fields — confirm.
fields_restricted = (
    redcap_metadata_export(redcap_api_url, redcap_tokens_prod["PID171"])
    .pipe(accepted_redcap_fields)
)
# Accepted REDCap choices for "extraction_method".
list(fields_restricted["extraction_method"].keys())

['qiagen-viral',
 'qiagen-fecal',
 'qiagen',
 'qiagen-rneasy-power',
 'qiagen-powerwater',
 'qiagen-rneasy',
 'qiagen-qiaamp-epoch',
 'promega-ht-tna',
 'promega-ht-auto',
 'promega-manual-tna',
 'promega-ww-largevol-tna',
 'nuclisens-auto-magbead',
 'phenol',
 'chemagic300',
 'trizol-zymomagbeads-zymo',
 '4smethod',
 'zymoquick-r2014',
 'none']

## Data Validation: 'concentration_method' must be one of the following values in REDCap; anything else will be changed to NaN (missing value)

#### WARNING: currently ALL values in "concentration_method" are changed to NaN because values do not match REDCap available choices.

In [32]:
# Accepted REDCap choices for 'concentration_method'.
list(fields_restricted['concentration_method'].keys())

['mf-mgcl2',
 'mf-acid',
 'mf-acid-mgcl2',
 'mf',
 'mf-mgcl2-addsolids',
 'mf-acid-addsolids',
 'mf-acid-mgcl2-addsolids',
 'mf-addsolids',
 'peg',
 'ultracentrifugation',
 'skimmilk',
 'beefextract',
 'promega-tna',
 'uf-centricon',
 'uf-amicon',
 'uf-hf-deadend',
 'uf-innovaprep',
 'noconc-addsolids',
 '13']