In [1]:
# Run lims_export_v2.py inside this notebook's namespace (-i), so names defined by the
# script become available to the cells below.
# NOTE(review): presumably this is where export_df_from_LIMS and the transform helpers
# used below come from - confirm against lims_export_v2.py.
%run -i "lims_export_v2.py"

In [2]:
# Order-dependent step 1: export the raw LIMS data into a dataframe.
df_lims = export_df_from_LIMS()



In [3]:
# Order-dependent step 2: keep only the relevant fields.
df_lims = isolate_relavent_data(df_lims)

#### Order-independent transformations

In [4]:
# Order-independent operations, applied as a single chain.
# Each helper takes the dataframe and returns the transformed dataframe.
df_lims = (
    df_lims
    .pipe(convert_numeric)
    .pipe(freetext_transform)
    .pipe(convert_choice_fields)
)

In [None]:
# Final transformation: force selected column values first, then set the column dtypes.
df_lims = (
    df_lims
    .pipe(force_values)
    .pipe(set_dtypes)
)

#### pipeline update progress

In [None]:
### Critical: convert long to wide ###
# NOTE: drop_all_but_N1_N2 and long_to_wide are defined in the "Development Cell" section
# further down this notebook; those cells must be run before this one on a fresh kernel.
df_lims = (
    drop_null_sample_ID(df_lims)  # complete (closing parenthesis was missing -> SyntaxError)
    .pipe(drop_all_but_N1_N2)  # remove full rows where PCR target is anything except N1 or N2
    .pipe(below_lod_to_yes_no)  # complete
    .pipe(long_to_wide)  # core step: everything not under PCRTarget N1 or N2 will not be imported
)

### Transform LIMS dataframe ###
df_lims = (
    rename_lims_columns(df_lims)  # complete
    .pipe(convert_numeric)  # complete
    .pipe(freetext_transform)  # complete
    .pipe(validate_yes_no_clms)  # complete
    .pipe(validate_choice_fields)  # complete
)

# Development Cell

In [None]:
def drop_all_but_N1_N2(df_lims):
    """
    Keep only the rows whose PCRTarget is "N1" or "N2".

    args:
        df_lims: DataFrame with a 'PCRTarget' column
    return:
        New DataFrame holding the matching rows; the input frame is not modified
    """
    allowed_targets = ["N1", "N2"]
    keep_row = df_lims['PCRTarget'].isin(allowed_targets)

    # Filter a copy so the caller's frame is left as it was
    return df_lims.copy().loc[keep_row]

In [None]:
def long_to_wide(df_lims):
    """
    Convert long-form LIMS results (one row per sample per PCR target) into wide form
    (one row per sample).

    The two target-dependent fields, 'SARSCoV2AvgConc' and 'SARSCoV2BelowLOD', get one column
    per PCR target present in the data, named '<PCRTarget>_<field>' (for example
    'N1_SARSCoV2AvgConc'). Every other column is kept once per sample: taken from the
    sample's N1 row when it has one, otherwise from its first remaining row. 'PCRTarget'
    itself is not part of the output.

    args:
        df_lims: DataFrame containing at least 'SubmitterSampleNumber', 'PCRTarget',
                 'SARSCoV2AvgConc' and 'SARSCoV2BelowLOD'. It is not modified.
    return:
        DataFrame indexed by 'sample_id' (the SubmitterSampleNumber values), one row per sample
    """
    id_col = "SubmitterSampleNumber"
    target_col = "PCRTarget"
    critical_cols = ["SARSCoV2AvgConc", "SARSCoV2BelowLOD"]

    df_long = df_lims.copy()

    # Make sure SARSCoV2AvgConc is numeric before reshaping; unparseable values become NaN
    df_long["SARSCoV2AvgConc"] = pd.to_numeric(df_long["SARSCoV2AvgConc"], errors="coerce")

    # A (sample, target) pair must be unique for the reshape; the first occurrence is kept
    df_long = df_long.drop_duplicates(subset=[id_col, target_col])

    # Spread the target-dependent fields into one column per PCR target.
    # set_index + unstack is used instead of pivot(values=[...]) so that each field keeps
    # its own dtype (pivot over a list of mixed-dtype columns can fall back to object dtype).
    df_critical = df_long.set_index([id_col, target_col])[critical_cols].unstack(target_col)
    # Flatten the (field, target) column pairs into single names: '<target>_<field>'
    df_critical.columns = [f"{target}_{field}" for field, target in df_critical.columns]

    # All remaining columns are expected to be duplicated across a sample's N1 and N2 rows
    # (except 'PCRTargetRef', which does not go to REDCap), so one row per sample is enough.
    # N1 rows are listed first so they win; a sample without an N1 row falls back to its
    # next row instead of losing its values, and data without any N2 rows no longer fails.
    shared_cols = [
        col for col in df_long.columns
        if col not in critical_cols and col != target_col
    ]
    is_n1 = df_long[target_col].eq("N1")
    df_shared = (
        pd.concat([df_long[is_n1], df_long[~is_n1]])[shared_cols]
        .drop_duplicates(subset=id_col, keep="first")
        .set_index(id_col)
    )

    # Both frames are indexed by sample; every sample is present in both
    df_final = pd.merge(df_critical, df_shared, how="left", left_index=True, right_index=True)
    df_final.index.name = "sample_id"

    return df_final

In [22]:
# If the same PCR target was tested more than once for a sample ID, keep only the last row
df_lims = df_lims.drop_duplicates(subset=["sample_id", "pcr_target"], keep="last")

# Long -> wide: one row per sample_id, one column per (field, pcr_target) pair
df_pivot = df_lims.pivot(index="sample_id", columns="pcr_target")

# Split the wide frame in two parts: the critical values that depend on pcr_target,
# and everything else
critical_fields = ['sars_cov2_below_lod', 'sars_cov2_avg_conc']
remaining_fields = df_pivot.columns.get_level_values(0).difference(critical_fields)
df_pivot_critical = df_pivot[critical_fields].copy()
df_pivot_remaining = df_pivot[remaining_fields].copy()

# Flatten the two-level column names of the critical fields into '<target>_<field>', lower-cased
new_cols = [f"{target}_{field}" for field, target in df_pivot_critical.columns]
df_pivot_critical.columns = [name.lower() for name in new_cols]



KeyError: 'N1'

# Scratch Sheet

In [None]:
import os

import redcap

# Credentials for PID171.
# The API token is read from the REDCAP_API_TOKEN environment variable instead of being
# stored in the notebook. A token that was previously committed in this cell should be
# treated as exposed and regenerated in REDCap.
token = os.environ.get("REDCAP_API_TOKEN")
if not token:
    raise RuntimeError(
        "Set the REDCAP_API_TOKEN environment variable to the REDCap API token for PID171."
    )
url = 'https://redcap.doh.wa.gov/api/'

# Create Project object
project = redcap.Project(url, token)


In [None]:
def _parse_choice_string(raw_choices):
    """Parse a choices string of the form 'code, label | code, label' into a {code: label} dict."""
    parsed = {}
    for entry in str(raw_choices).split("|"):
        # Split on the first comma only, so labels may themselves contain commas
        code, _, label = entry.partition(",")
        code = code.strip()
        if code:  # skip empty fragments, e.g. from a trailing '|'
            parsed[code] = label.strip()
    return parsed


def describe_fields(project):
    """
    Create a summary dataframe describing every standard field from the default API export.

    The summary is indexed by export field name and carries the columns returned by
    project.export_field_names plus 'form_name', 'field_type' and 'Choice Values'.
    'Choice Values' holds str(dict) of the possible choices for dropdown / radio / checkbox
    fields and "{'1': 'Yes', '0': 'No'}" for yesno fields; it is NaN for all other fields.

    args:
        project: pycap Project object (needs export_field_names and export_metadata,
                 both called with format_type="df")
    return:
        Dataframe
    """
    df_fields = project.export_field_names(format_type="df")
    df_meta = project.export_metadata(format_type="df")

    # Set aside the raw choice strings of the multiple-choice field types
    is_choice_field = df_meta["field_type"].isin(['dropdown', 'radio', 'checkbox'])
    choice_fields = df_meta[is_choice_field]["select_choices_or_calculations"].copy()
    choice_fields = choice_fields[choice_fields.notnull()]  # remove possibility of NA fields

    # Drop the first row of each export: it is the index of the dataframe export
    # (unique identifier: record_id, sample_id)
    df_meta = df_meta.drop(df_meta.index[0])
    df_fields = df_fields.drop(df_fields.index[0])

    # Only 2 fields are needed from the metadata export
    df_meta = df_meta[["form_name", "field_type"]].copy()

    # Combine fields from metadata and from export_field_names
    df_fields_2 = df_fields.join(df_meta)

    # Add field names that have type "file" (taken from the metadata)
    files_df = df_meta[df_meta["field_type"] == "file"].copy()
    files_df['export_field_name'] = files_df.index
    df_fields_3 = pd.concat([df_fields_2, files_df])

    # Set index to export_field_name
    df_fields_3 = df_fields_3.set_index("export_field_name")

    # New column: str(dict) of the possible choices for multiple-choice fields.
    # Series.items() is used because Series.iteritems() no longer exists in pandas >= 2.0.
    fields_dict = {
        field_name: str(_parse_choice_string(raw_choices))
        for field_name, raw_choices in choice_fields.items()
    }
    choice_fields_series = pd.Series(fields_dict)

    complete = pd.concat([df_fields_3, choice_fields_series.to_frame("Choice Values")], axis=1)

    # Special case of a multiple-choice field: the "yesno" field type always has the same choices
    yesno_dict = {"1": "Yes", "0": "No"}
    yesno_index = df_meta[df_meta["field_type"] == "yesno"].index
    yesno_series = pd.Series(str(yesno_dict), yesno_index)  # same value for every yesno field
    yesno_frame = yesno_series.to_frame("Choice Values")

    df_joined = complete.join(yesno_frame, lsuffix='_l', rsuffix='_r')

    # Prefer the parsed choices; fall back to the yes/no choices where there are none
    complete["Choice Values"] = df_joined["Choice Values_l"].fillna(df_joined["Choice Values_r"])

    return complete


In [None]:
# Build the field summary dataframe for the REDCap project created above
df = describe_fields(project)

In [None]:
# Names of the columns treated as yes/no fields
yes_no_clms = [
    'quality_flag',
    'inhibition_adjust',
    'ntc_amplify',
    'pretreatment',
    'inhibition_detect',
    'sars_cov2_below_lod',
    'n1_sars_cov2_below_lod',
    'n2_sars_cov2_below_lod',
]


In [None]:
# Rows of the field summary whose field name is one of the yes/no columns
df.loc[df.index.isin(yes_no_clms)]

In [None]:
# Rows of the field summary whose field type is "dropdown"
df.loc[df["field_type"].eq("dropdown")]

In [None]:

# Field names handled as REDCap dropdown fields
redcap_dropdown = [
    'sars_cov2_units',
    'n1_sars_cov2_below_lod',
    'n2_sars_cov2_below_lod',
    'ntc_amplify',
    'inhibition_detect',
    'inhibition_adjust',
    'concentration_method',
    'extraction_method',
    'hum_frac_mic_unit',
    'other_norm_unit',
    'quality_flag',
]

In [None]:
# Print the dropdown field names, one per line
for field_name in redcap_dropdown:
    print(field_name)

In [None]:
# Dropdown fields that are not in the yes/no column list
set(redcap_dropdown).difference(yes_no_clms)