# Export project data from Prod to QA for pipeline testing

Production PIDs: 100, 278

QA PIDs (copies of the production projects above): 75, 74

In [1]:
import redcap

In [2]:
#Load the API URLs and API keys by running credentials.py
#(IPython %run: -i runs the file in the notebook namespace, -n leaves __name__ != "__main__")
%run -i -n "credentials.py"

print(dev_url)
print(prod_url)
#prints the key variable names as literal strings, not the key values
print(["dev_pid100", "dev_pid278", "prod_pid100", "prod_pid278"])

https://qa-redcap.doh.wa.gov/api/
https://redcap.doh.wa.gov/api/
['dev_pid100', 'dev_pid278', 'prod_pid100', 'prod_pid278']


In [3]:
#Production projects: the records are exported from these
prod100, prod278 = (redcap.Project(prod_url, api_key) for api_key in (prod_pid100, prod_pid278))

#QA projects: the records are imported into these
dev100, dev278 = (redcap.Project(dev_url, api_key) for api_key in (dev_pid100, dev_pid278))

## Explore PID100 Dataset, Export Text Fields to QA

In [None]:
#All production records of PID 100 with raw (coded) values
prod100_df = prod100.export_records(raw_or_label="raw", format_type="df")
#Project metadata without the "record_id" row (record_id is the REDCap index)
p100meta = prod100.export_metadata(format_type="df").drop("record_id")

In [None]:
#collect the names of all fields that are "text" type (the 500 rows are sliced in a later cell)
filt_txt = p100meta["field_type"].eq("text")
fields_txt = set(p100meta.loc[filt_txt].index)

In [None]:
#text fields that are also columns of the exported records
common = fields_txt.intersection(prod100_df.columns)

#both counts are equal when no text field is missing from the export
print(len(fields_txt))
print(len(common))

#True when all txt fields exist in dataset
common == fields_txt

In [None]:
#independent copy of the text-field columns for the first 500 rows
txt_columns = list(fields_txt)
df_import_txt = prod100_df.iloc[:500][txt_columns].copy()

In [None]:
#Convert all float type columns that are defined as text fields in redcap, into Int64 type.
#Float columns are detected through dtype.kind: this needs no numpy import (the visible
#cells never import np) and avoids comparing a dtype with the abstract np.floating class.
floating = df_import_txt.dtypes.apply(lambda dtype: dtype.kind == "f")
float_cols_txt = list(floating.index[floating.to_numpy(dtype=bool)])

#Assign whole columns instead of using .loc[:, mask] = ...: on recent pandas the .loc form
#writes into the existing float arrays, so the columns would stay float.
#NOTE(review): astype("Int64") is expected to raise if a column holds non-whole numbers.
if float_cols_txt:
    df_import_txt[float_cols_txt] = df_import_txt[float_cols_txt].astype("Int64")

In [None]:
dev100.import_records(df_import_txt, import_format = "df")

## Explore PID100 Dataset, Export radio fields to QA

In [None]:
p100meta["field_type"].value_counts()

In [None]:
#metadata grouped by field type; later cells pull their field lists from meta_groups too
meta_groups = p100meta.groupby("field_type")
radio_meta = meta_groups.get_group("radio")
fields_radio = radio_meta.index

#both counts are equal when all radio fields exist as columns of the dataframe
print(len(fields_radio))
print(len(set(fields_radio).intersection(prod100_df.columns)))

In [None]:
#radio-field columns for the first 500 rows
df_import_radio = prod100_df.iloc[0:500,:].loc[:, list(fields_radio)].copy()

#changing float dtypes to Int
#Float columns are detected through dtype.kind: this needs no numpy import (the visible
#cells never import np) and avoids comparing a dtype with the abstract np.floating class.
floating_radio = df_import_radio.dtypes.apply(lambda dtype: dtype.kind == "f")
float_cols_radio = list(floating_radio.index[floating_radio.to_numpy(dtype=bool)])

#Assign whole columns instead of using .loc[:, mask] = ...: on recent pandas the .loc form
#writes into the existing float arrays, so the columns would stay float.
if float_cols_radio:
    df_import_radio[float_cols_radio] = df_import_radio[float_cols_radio].astype("Int64")

In [None]:
dev100.import_records(df_import_radio, import_format = "df")

## Explore PID100 Dataset, File fields (not exported to QA)

File fields are exported as strings containing the names of the uploaded files, so these fields will not be exported to QA.

In [None]:
fields_file = meta_groups.get_group("file").index

In [None]:
print(len(fields_file))
print(len(set(prod100_df.columns) & set(fields_file)))

In [None]:
prod100_df.loc[:, list(fields_file)]

## Explore PID100 Dataset, Export dropdown fields to QA



In [None]:
fields_dropdown = meta_groups.get_group("dropdown").index

In [None]:
print(len(fields_dropdown))
print(len(set(prod100_df.columns) & set(fields_dropdown)))

In [None]:
#dropdown-field columns for the first 500 rows
df_import_dropdown = prod100_df.iloc[0:500,:].loc[:, list(fields_dropdown)].copy()

#changing float dtypes to Int
#Float columns are detected through dtype.kind: this needs no numpy import (the visible
#cells never import np) and avoids comparing a dtype with the abstract np.floating class.
floating_dropdown = df_import_dropdown.dtypes.apply(lambda dtype: dtype.kind == "f")
float_cols_dropdown = list(floating_dropdown.index[floating_dropdown.to_numpy(dtype=bool)])

#Assign whole columns instead of using .loc[:, mask] = ...: on recent pandas the .loc form
#writes into the existing float arrays, so the columns would stay float.
if float_cols_dropdown:
    df_import_dropdown[float_cols_dropdown] = df_import_dropdown[float_cols_dropdown].astype("Int64")

In [None]:
#Replace the misspelled county value "Pend_Orielle" with "Pend_Oreille" in org_wa_county.
#Open question from the author: why do some fields import raw values and some fields
#import labels?
org_wa_county = {"Pend_Orielle": "Pend_Oreille"}
df_import_dropdown["org_wa_county"] = df_import_dropdown["org_wa_county"].replace(org_wa_county)

#show the value distribution after the replacement
df_import_dropdown["org_wa_county"].value_counts()


In [None]:
#apply the same county spelling replacement to all three county dropdowns
county_columns = ["org_wa_county", "org_cmo_county", "org_ceo_county"]
df_import_dropdown[county_columns] = df_import_dropdown[county_columns].replace(org_wa_county)

In [None]:
df_import_dropdown[["org_wa_county","org_cmo_county", "org_ceo_county"]].value_counts()

In [None]:
dev100.import_records(df_import_dropdown, import_format = "df")

## Explore PID100 Dataset, Export yes/no fields to QA


In [None]:
#names of all fields with field type "yesno"
yesno_meta = meta_groups.get_group("yesno")
fields_yesno = yesno_meta.index

#both counts are equal when all yes/no fields exist as columns of the dataframe
print(len(fields_yesno))
print(len(set(fields_yesno).intersection(prod100_df.columns)))

In [None]:
df_import_yesno = prod100_df.iloc[0:500,:].loc[:, list(fields_yesno)].copy()

In [None]:
df_import_yesno = df_import_yesno.astype("Int64")

In [None]:
dev100.import_records(df_import_yesno, import_format = "df")

## Explore PID100 Dataset, Export Notes fields to QA

In [None]:
#names of all fields with field type "notes"
notes_meta = meta_groups.get_group("notes")
fields_notes = notes_meta.index

#both counts are equal when all notes fields exist as columns of the dataframe
print(len(fields_notes))
print(len(set(fields_notes).intersection(prod100_df.columns)))

In [None]:
df_import_notes = prod100_df.iloc[0:500,:].loc[:, list(fields_notes)].copy()

In [None]:
dev100.import_records(df_import_notes, import_format = "df")

## Explore PID100 Dataset, Export Checkbox fields to QA

In [None]:
#names of all fields with field type "checkbox", as they appear in the metadata
checkbox_meta = meta_groups.get_group("checkbox")
fields_checkbox_orig = checkbox_meta.index

#the second count shows how many of those names are columns of the dataframe as-is
print(len(fields_checkbox_orig))
print(len(set(fields_checkbox_orig).intersection(prod100_df.columns)))

In [None]:
import re

#Checkbox fields are exported as one column per choice, named "<field>___<code>"
#(three underscores), so build the column prefix for every checkbox field.
fields_checkbox = list(fields_checkbox_orig + "___")

#Escaped alternation of the prefixes. str.match anchors at the start of the column name,
#so a prefix can no longer match in the middle of an unrelated column. "(?!)" never
#matches: with no checkbox fields nothing is selected (an empty pattern would match all).
a = "|".join(re.escape(prefix) for prefix in fields_checkbox) or "(?!)"
b = prod100_df.columns.str.match(a)

#checkbox columns for the first 500 rows
df_import_checkbox = prod100_df.iloc[0:500,:].loc[:, list(prod100_df.columns[b])].copy()

In [None]:
len(df_import_checkbox.columns)

In [None]:
dev100.import_records(df_import_checkbox, import_format = "df")

## Evaluate All Imported Fields

Find the fields that still remain after importing by field type (as defined in the metadata), then import them.

In [None]:
#every dataframe that was imported into QA by field type
imported_dfs = [df_import_checkbox, df_import_dropdown, df_import_notes, df_import_radio, df_import_txt, df_import_yesno]

#list of lists: the column names of each imported dataframe
imported_fields = [list(frame.columns) for frame in imported_dfs]
    


In [None]:

#flatten the per-dataframe column lists into one list of field names
all_imported_fields = []
for field_group in imported_fields:
    all_imported_fields.extend(field_group)

In [None]:
len(all_imported_fields)

In [None]:
#All standard columns except for fields that contain files
prod100_clms = set(prod100_df.columns) - set(fields_file)
len(prod100_clms)

In [None]:
#are all imported fields contained in the standard columns?
common_0 = set(prod100_clms) & set(all_imported_fields)
common_0 == set(all_imported_fields)

In [None]:
len(set(prod100_clms) & set(all_imported_fields))

In [None]:
#what columns make up the difference? 
diff = prod100_clms - set(all_imported_fields)

In [None]:
diff

In [None]:
prod100_df.loc[:,list(diff)].dtypes

In [None]:
df_import_complete = prod100_df.iloc[0:500].loc[:,list(diff)]

In [None]:
dev100.import_records(df_import_complete, import_format = "df")

## Explore PID278 Dataset, Export Records to QA


In [4]:
def describe_fields(project):
    """
    Create a summary dataframe describing every standard field of the default API export.

    The export field names are joined (on the field name index) with the form name and
    field type from the project metadata. Fields of type "file" that are not already
    listed are appended from the metadata, using their own name as export field name.

    args:
        project: pycap Project object
    return:
        Dataframe with the export_field_names columns plus "form_name" and "field_type"

    """
    #local import: pandas is not imported by the visible notebook cells
    import pandas as pd

    df_fields = project.export_field_names(format_type = "df")
    df_meta = project.export_metadata(format_type = "df")

    #The record identifier is the row index of the dataframe export, not a data field.
    #Ask the project for its name instead of assuming "record_id" (it can be renamed);
    #fall back to "record_id" when the object does not expose def_field.
    record_field = getattr(project, "def_field", None) or "record_id"
    df_meta = df_meta.drop(record_field)
    df_fields = df_fields.drop(record_field)

    #only need 2 fields from metadata export
    df_meta = df_meta[["form_name", "field_type"]].copy()

    #combine fields from metadata and from export_field_names
    df_fields_2 = df_fields.join(df_meta)

    #add field names that have type "file" (from metadata), skipping any that are
    #already listed so that no field is described twice
    is_file = df_meta["field_type"] == "file"
    already_listed = df_meta.index.isin(df_fields_2.index)
    files_df = df_meta[is_file & ~already_listed].copy()
    if files_df.empty:
        #nothing to append; also avoids concatenating an empty frame
        return df_fields_2

    files_df["export_field_name"] = files_df.index
    return pd.concat([df_fields_2, files_df])

In [5]:
df_fields_278 = describe_fields(prod278)

In [6]:
df_fields_278.shape

(133, 4)

In [12]:
df_records_278 = prod278.export_records(format_type = "df")

  dataframe = pd.read_csv(buf, **df_kwargs)


In [25]:
#columns of the PID 278 export whose dtype is float
float_columns = df_records_278.dtypes[df_records_278.dtypes == float].index

#Convert them to the nullable Int64 dtype. Assign whole columns instead of using
#.loc[:, float_columns] = ...: on recent pandas the .loc form writes into the existing
#float arrays, so the columns would stay float.
if len(float_columns) > 0:
    df_records_278[float_columns] = df_records_278[float_columns].astype("Int64")

#first 500 rows of the export
df_records_import = df_records_278.iloc[0:500,:]

In [34]:
#reset_index() turns the index (record_id, redcap_event_name) back into regular columns
#for the import. It is called here directly because df_records_import_resett_index is
#only defined in a later cell, so running the notebook top to bottom raised NameError.
dev278.import_records(df_records_import.reset_index(), import_format = "df")

{'count': 173}

In [32]:
df_records_import_resett_index = df_records_import.reset_index()

In [33]:
df_records_import_resett_index

Unnamed: 0,record_id,redcap_event_name,pause___1,org_name,waiis_fac_name,org_email,loc_vtrcks_id,loc_vfc_pin,loc_type,batch_group,...,followup_unit_5,approved_followup_5,notes_unit_5,update,other_info_v2203,monthly_upload_complete,date_reviewed,reviewer,reviewer_notes,review_complete
0,1,facility_arm_1,0,Kindred Hospital - Seattle,KINDRED HOSPTIAL-SEATTLE,Shannon.Stone@kindred.com,WAAP81021,P81021,9,1,...,,,,,,,,,,
1,1,march_2022_arm_1,,,,,,,,,...,,,,,,0,,,,0
2,1,april_2022_arm_1,,,,,,,,,...,,,,,,0,,,,0
3,1-TEST,facility_arm_1,1,Wischnesky Hospital,Wischnesky Hospital,Shanae.Wischnesky@doh.wa.gov,,,,,...,,,,,,,,,,
4,1-TEST,march_2022_arm_1,,,,,,,,,...,,,,1,,2,2022-03-29,1,completed,0
5,1-TEST,april_2022_arm_1,,,,,,,,,...,,,,,,0,2022-04-04,5,,2
6,2,facility_arm_1,0,Adams County Hospital District #2,Ritzville Medical Clinic,jpepperd@earh.org,WAA150006,150006,12,1,...,,,,,,,,,,
7,2,march_2022_arm_1,,,,,,,,,...,,,,0,,2,2022-04-11,8,Sent email in regards to deleting storage unit...,2
8,2,april_2022_arm_1,,,,,,,,,...,,,,,,0,,,,0
9,3,facility_arm_1,0,Virginia Mason Franciscan Health,VIRGINIA MASON DOWNTOWN MED CT,VMCommandCenter@virginiamason.org,WAAP81003,P81003,28,1,...,,,,,,,,,,
