In [None]:
# Load the v2 LIMS export script into this notebook.
#   -i  runs the file in the notebook's interactive namespace, so whatever it defines is visible to later cells
#   -n  does not set __name__ to "__main__", so any `if __name__ == "__main__":` section of the script is skipped
# NOTE(review): presumably this script defines export_df_from_LIMS and the transform helpers
# used in the next cell (and imports pandas as pd) — confirm in lims_export_v2.py.
%run -i -n "lims_export_v2.py"

In [None]:
### Export LIMS and isolate relevant data ###
df_lims = isolate_relevant_data(export_df_from_LIMS())

### Order-independent transformations ###
df_lims = (
    df_lims
    .pipe(convert_numeric)
    .pipe(freetext_transform)
    .pipe(convert_choice_fields)
    .pipe(standardize_time_fields)
)

### Critical: convert long to wide ###
df_lims = (
    df_lims
    .pipe(long_to_wide)
    .pipe(force_values)
    .pipe(set_dtypes)
)

#### pipeline update progress

In [None]:
# Load the original LIMS export script into this notebook.
#   -i  runs the file in the notebook's interactive namespace, so whatever it defines is visible to later cells
#   -n  does not set __name__ to "__main__", so any `if __name__ == "__main__":` section of the script is skipped
# NOTE(review): because of -i, names defined here replace same-named ones loaded earlier from
# lims_export_v2.py (e.g. export_df_from_LIMS, long_to_wide are called in both pipelines) — confirm.
%run -i -n "viral_lims_export.py"

In [None]:
#### Export all LIMS data ####
df_lims = export_df_from_LIMS()

### Critical: convert long to wide ###
df_lims = (
    df_lims
    .pipe(drop_null_sample_ID)    # remove artifact data from LIMS (missing sample IDs)
    .pipe(drop_all_but_N1_N2)     # remove full rows where PCR Target is anything except N1 or N2
    .pipe(below_lod_to_yes_no)
    .pipe(long_to_wide)           # meat and potatoes! everything not under PCRTarget N1 or N2 will not be imported
)

#### Transform LIMS dataframe ####
df_lims = (
    df_lims
    .pipe(rename_lims_columns)
    .pipe(verify_time_field)      # author's note: "remove from pipeline 04/12/2022" (the step is still applied here)
    .pipe(convert_numeric)        # convert numeric columns to floats, coerce errors
    .pipe(freetext_transform)
    .pipe(validate_yes_no_clms)
    .pipe(validate_choice_fields)
)

In [None]:
# Persist the frame built by the pipeline cell above as the baseline ("origin") snapshot;
# the comparison section below reloads this same file with pd.read_pickle.
df_lims.to_pickle("df_lims_origin.pkl")

# Development - Validate that all fields are identical to V1

In [None]:
# Take an independent copy of the current df_lims to act as the "v2" side of the comparison.
# NOTE(review): when the notebook is run top to bottom, df_lims at this point is the frame produced
# by the viral_lims_export.py pipeline (the one just written to df_lims_origin.pkl), not the
# lims_export_v2.py output, so the comparison below would be trivially equal. Re-run the v2
# pipeline cell immediately before this one — TODO confirm the intended execution order.
df_lims_v2 = df_lims.copy()

In [None]:
# Reload the baseline frame saved earlier with df_lims.to_pickle("df_lims_origin.pkl").
# NOTE(review): `pd` is not imported in any visible cell — presumably it comes from the %run -i scripts; confirm.
df_lims_original = pd.read_pickle("df_lims_origin.pkl")

In [None]:
# Compare shapes: v2 on the first line, original on the second
print(df_lims_v2.shape, df_lims_original.shape, sep="\n")

In [None]:
# Compare column labels as sets (order and duplicates are ignored): True when nothing is unique to either frame
print(not set(df_lims_v2.columns).symmetric_difference(df_lims_original.columns))

# Compare index labels the same way
print(not set(df_lims_v2.index).symmetric_difference(df_lims_original.index))

In [None]:
### Create dataframe of dtypes that differ between pipeline versions ###

# One Series of dtypes per pipeline version, named so they become the two columns below
original_dtypes = df_lims_original.dtypes.rename("original")
v2_dtypes = df_lims_v2.dtypes.rename("v2")

# Inner join on the column labels: only columns present in both frames are compared
df_dtypes = original_dtypes.to_frame().join(v2_dtypes, how="inner")

# Display only the columns whose dtype changed
df_dtypes.loc[df_dtypes["original"].ne(df_dtypes["v2"])]

In [None]:
# Record 210042 differs because this sample was analyzed twice: the V2 code drops duplicates and
# keeps the last row (by index, not by timestamp), whereas the V1 code keeps the first row by index.
# Special case: this makes n1_sars_cov2_below_lod unequal, along with many other fields.

# Exclude the record from both frames before comparing.
# Reassigning with errors="ignore" (instead of inplace=True) keeps this cell safe to re-run:
# a second run no longer raises KeyError because the label is already gone.
df_lims_v2 = df_lims_v2.drop(index=210042, errors="ignore")
df_lims_original = df_lims_original.drop(index=210042, errors="ignore")

In [None]:
# 06/14/2022: record 220369 is another record that does not match between code versions.
# TODO: verify the source — is this another duplicated sample?

# Exclude the record from both frames before comparing.
# Reassigning with errors="ignore" (instead of inplace=True) keeps this cell safe to re-run:
# a second run no longer raises KeyError because the label is already gone.
df_lims_v2 = df_lims_v2.drop(index=220369, errors="ignore")
df_lims_original = df_lims_original.drop(index=220369, errors="ignore")

In [None]:
# 06/14/2022: record 220381 is another record that does not match between code versions.
# TODO: verify the source — is this another duplicated sample?

# Exclude the record from both frames before comparing.
# Reassigning with errors="ignore" (instead of inplace=True) keeps this cell safe to re-run:
# a second run no longer raises KeyError because the label is already gone.
df_lims_v2 = df_lims_v2.drop(index=220381, errors="ignore")
df_lims_original = df_lims_original.drop(index=220381, errors="ignore")

Columns are unequal due to dtype issues

In [None]:
# Re-run the comparison process:
# check every column of the two frames for equality and print the name of each column that differs.
# dtype differences are ignored on purpose (check_dtype=False); only values and index are compared.

# Iterate over the columns of the frame actually being compared, not over df_lims,
# which may have been rebuilt by another cell since the snapshot was taken.
for i in df_lims_v2.columns:

    original = df_lims_original[i]
    v2 = df_lims_v2[i]

    try:
        pd.testing.assert_series_equal(original, v2, check_dtype=False)
    except AssertionError:
        # the two columns differ: report the column name
        print(i)
    except Exception as err:
        # the comparison itself failed; still report the column, but say why.
        # (A bare `except:` would also swallow KeyboardInterrupt and hide real errors.)
        print(i, f"(could not compare: {type(err).__name__}: {err})")


#need to find the source of the differences in the following fields:

"pretreatment" - resolved; the difference was caused by an error in the original code


# Scratch Sheet

In [None]:
import getpass
import os

import redcap

# Credentials for PID171.
# The API token is a secret and must not be stored in the notebook: it is read from the
# REDCAP_API_TOKEN environment variable, with an interactive (non-echoing) prompt as fallback.
# NOTE(review): the token that was previously hardcoded in this cell should be treated as
# leaked and regenerated in REDCap.
token = os.environ.get("REDCAP_API_TOKEN") or getpass.getpass("REDCap API token (PID171): ")
url = 'https://redcap.doh.wa.gov/api/'

# Create the Project object
project = redcap.Project(url, token)

In [None]:
def describe_fields(project):
    """
    create a summary dataframe to describe every standard field from default API export
    
    args:
        pycap Project object
    return:
        Dataframe
        
    """  
    df_fields = project.export_field_names(format_type = "df")
    df_meta = project.export_metadata(format_type = "df")
    
    #set aside choice fields Series
    choice_fields = df_meta[df_meta["field_type"].isin(['dropdown', 'radio', 'checkbox'])]["select_choices_or_calculations"].copy()#select field type "dropdown","radio", "checkbox"
    choice_fields = choice_fields[choice_fields.notnull()] #remove possibility of NA fields

    #drop the first row, it is the index of dataframe export (unique identified: record_id, sample_id)
    df_meta = df_meta.drop(df_meta.index[0])
    df_fields = df_fields.drop(df_fields.index[0])

    #only need 2 fields from metadata export
    df_meta = df_meta[["form_name", "field_type", "text_validation_type_or_show_slider_number"]].copy()

    #combine fields from metadata and from export_field_names 
    df_fields_2 = df_fields.join(df_meta)
    
    #add field names that have type "file" (from metadata)
    files_df = df_meta[df_meta["field_type"] == "file"].copy()
    files_df['export_field_name'] = files_df.index
    df_fields_3 = pd.concat([df_fields_2, files_df])
    
    #set index to export_field_name
    df_fields_3 = df_fields_3.set_index("export_field_name")
    
    ##### add a new columns: str(dict) of possible choices for multiple choice fields####
    fields_dict = {}
    for i in choice_fields.iteritems():

        string_to_process = i[1] #the value element of the series (as opposed to index element)
        list_of_strings = string_to_process.split("|") # split the string 
        keys_values_list = [i.split(", ", 1) for i in list_of_strings]# split each list once for list of lists [key, values]
        values_dict = {t[0]:t[1] for t in keys_values_list} #dictionary of key value pairs
        fields_dict[i[0]] =  str(values_dict)

    choice_fields_series = pd.Series(fields_dict)

    complete = pd.concat([df_fields_3 ,choice_fields_series.to_frame("Choice Values")], axis = 1)
    
    ## add special case for multiple choice fields - "yesno" field type 
    yesno_dict = {"1":"Yes", "0":"No"}
    yesno_index = df_meta[df_meta["field_type"] == "yesno"].index
    yesno_series = pd.Series(str(yesno_dict), yesno_index) #create series, yes/values and index for all yesno field type
    yesno_frame = yesno_series.to_frame("Choice Values")
    
    df_joined = complete.join(yesno_frame, lsuffix='_l', rsuffix='_r')
    
    complete["Choice Values"] = df_joined["Choice Values_l"].fillna(df_joined["Choice Values_r"])
    
    return complete



In [None]:
# Build the field summary for the REDCap project created above (one row per export field name).
df = describe_fields(project)

In [None]:
# Show the full summary row for the sample_collect_time field
df.loc["sample_collect_time"]

In [None]:
# Count how many fields use each text validation type (fields with no validation type are NaN and are not counted).
df["text_validation_type_or_show_slider_number"].value_counts()