### This Jupyter notebook includes scripts that
- build the import CSV used by the REDCap database from the three Synthea CSV files (patients, conditions, medications), where conditions and medications are repeatable instruments
- use the REDCap API to import the CSV file when it is too large for the web import tool

#### Before running the script, please make sure that
- The corresponding REDCap database has already been set up with 3 instruments
    - patient
    - medication (repeatable instrument)
    - condition (repeatable instrument)
- The import template has been downloaded from the REDCap database
- A valid API token is available for importing data into the database through the API

In [2]:
import numpy as np
import pandas as pd
import csv
import requests
import time, sys
from IPython.display import clear_output
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Build the import rows for one repeatable instrument
def get_repetitive_instrument_format(instrument_df,instrument_name,selected_data):
    """
    Return the rows of a repeatable instrument with the redcap repeat columns added.

    instrument_df: dataframe for the instrument; only its column names are used,
        to pick the columns kept from selected_data
    instrument_name: instrument name in redcap database, written to the
        'redcap_repeat_instrument' column of every row
    selected_data: dataframe holding the selected data that need to be stored;
        it must contain every column of instrument_df

    Returns a new dataframe made of the instrument columns plus
    'redcap_repeat_instrument' and 'redcap_repeat_instance' (always 'new'),
    with duplicated rows removed. selected_data itself is left untouched.
    """

    # Take an explicit copy: assigning new columns to a bare column selection is
    # chained assignment and pandas may treat the selection as a slice of selected_data.
    df = selected_data[instrument_df.columns].copy()
    df['redcap_repeat_instrument'] = instrument_name
    df['redcap_repeat_instance'] = 'new'

    return df.drop_duplicates(keep='first')

# Build the csv file used to import data into the redcap database, where condition and medication are repeatable instruments.
# By default the import data is saved as insert_dataset.csv under the current folder.
def get_import_data_format(Patient,Conditions,Medications,chosen_condition,Template,output_path='insert_dataset.csv'):

    """
    Combine the patient, condition and medication data into one import dataframe and save it as csv.

    Patient: dataframe for the patient.csv in Synthea format
    Conditions: dataframe for the conditions.csv in Synthea format
    Medications: dataframe for the medication.csv in Synthea format
    chosen_condition: list of condition descriptions. When it is not empty, only
        conditions in the list are kept, only medications whose reason description
        matches such a condition of the same patient are kept, and only patients
        having such a medication are kept. When it is empty, all the data is imported.
    Template: the import template downloaded from the redcap database; its columns
        define the columns and the column order of the result
    output_path: where the csv file is written (default 'insert_dataset.csv')

    Returns the concatenated dataframe that was written to output_path.
    The dataframes passed in are not modified.
    """

    filter_by_condition = len(chosen_condition) != 0

    # Work on copies: the original code renamed columns and remapped values directly on
    # the caller's dataframes, so running the function twice on the same data corrupted it.
    patients = Patient.copy()
    medications = Medications.copy()
    conditions = Conditions
    if filter_by_condition:
        # keep only the conditions the cohort has to be diagnosed with
        conditions = conditions[conditions['DESCRIPTION'].isin(chosen_condition)]
    conditions = conditions.copy()

    # change the text to the related number of the dropdown box defined in redcap
    patients['MARITAL'] = patients['MARITAL'].map({'M': '1', 'S': '2'})
    patients['GENDER'] = patients['GENDER'].map({'M': '1', 'F': '2'})

    # align the names with the column names in redcap: lower case plus an instrument suffix
    patients.columns = patients.columns.str.lower() + '_patient'
    conditions.columns = conditions.columns.str.lower() + '_condition'
    medications.columns = medications.columns.str.lower() + '_medication'

    # join patient, condition and medication to choose rows with all three pieces of information
    # (done before 'id_patient' is added below so the joined frames do not share a column name)
    if filter_by_condition:
        selected_data = medications.merge(
            conditions,
            how='inner',
            left_on=['patient_medication', 'reasondescription_medication'],
            right_on=['patient_condition', 'description_condition'],
        )
        selected_data = patients.merge(
            selected_data,
            how='inner',
            left_on=['id_patient'],
            right_on=['patient_medication'],
        )

    # every repeatable row carries the id of the patient it belongs to
    medications['id_patient'] = medications['patient_medication']
    medication_source = selected_data if filter_by_condition else medications
    medication_format = get_repetitive_instrument_format(medications, 'medications', medication_source)

    conditions['id_patient'] = conditions['patient_condition']
    condition_source = selected_data if filter_by_condition else conditions
    condition_format = get_repetitive_instrument_format(conditions, 'conditions', condition_source)

    patient_source = selected_data if filter_by_condition else patients
    patient_format = patient_source[patients.columns].drop_duplicates(keep='first')

    # insert the "<instrument>_complete" columns that the import csv file should include
    patient_format['patients_complete'] = '0'
    medication_format['medications_complete'] = '0'
    condition_format['conditions_complete'] = '0'

    # concatenated data include all the information that need to upload
    concatenated = pd.concat([Template, patient_format, medication_format, condition_format])

    # redcap only recognizes dates in format Y-M-D when using the data import tool,
    # so dates in format Y/M/D are changed to Y-M-D (non-string values such as NaN are kept)
    date_columns = ['birthdate_patient', 'start_condition', 'stop_condition', 'deathdate_patient']
    for column in date_columns:
        concatenated[column] = concatenated[column].apply(
            lambda value: value.replace('/', '-') if isinstance(value, str) else value
        )

    # use the column order of the template; the trailing unnamed column is dropped,
    # but only when it really is unnamed so a real field is never lost
    columns_order = Template.columns
    if len(columns_order) > 0 and str(columns_order[-1]).startswith('Unnamed'):
        columns_order = columns_order[:-1]
    concatenated = concatenated[columns_order]

    # save the concatenated data as csv
    concatenated.to_csv(output_path, index=False)

    return concatenated




    

In [3]:
# Template downloaded from the redcap Data Import Tool -> CSV import page
Template = pd.read_csv("ClinicalDashboardsDataset_ImportTemplate.csv")
Medications = pd.read_csv("csv/medications.csv")
Patient = pd.read_csv("csv/patients.csv")
Conditions = pd.read_csv("csv/conditions.csv")
# An empty list imports all patients, conditions and medications.
# To import only the cohort with given conditions, list their descriptions instead,
# e.g. chosen_condition = ['Malignant neoplasm of breast (disorder)']
chosen_condition = []
insert_data = get_import_data_format(Patient,Conditions,Medications,chosen_condition,Template)


#### Import the CSV file
- The first way is through the Data Import Tool on the REDCap web page, which has the advantage that it detects added rows and provides feedback; however, if the CSV file is too large it needs to be split into smaller CSV files (the maximum accepted CSV file size is around 30MB)
- The other way is to use the REDCap API to import records. Note that an API token is required for this method

In [9]:
def _iter_line_chunks(lines, chunk_size):
    '''Yield lists of at most chunk_size consecutive lines; the last list may be shorter.'''
    chunk = []
    for line in lines:
        chunk.append(line)
        if len(chunk) == chunk_size:
            yield chunk
            chunk = []
    if chunk:
        yield chunk


def import_data_to_redcap(api_token,path_to_csv,api_url='https://redcap.wehi.edu.au/api/',chunk_size=10000):
    '''
    Import a csv file into the redcap database through the API, one chunk of rows per request.

    api_token: token used for secure authentication in the redcap database
    path_to_csv: csv file whose data is in the correct csv format to be imported into our defined redcap database
    api_url: address of the redcap API endpoint
    chunk_size: the number of rows to import in one http post request

    Every request repeats the header line followed by the rows of the chunk.
    A request answered with a status other than 200 is reported and the import
    goes on with the next chunk. Nothing is returned; progress is printed.

    NOTE: the file is split line by line, so every record is assumed to fit on one
    line (no quoted field containing a line break).
    '''
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    # only used to show the progress
    total_records = pd.read_csv(path_to_csv).shape[0]
    rows_sent = 0
    rows_uploaded = 0
    total_time = 0

    # the context manager closes the file even when a request raises
    with open(path_to_csv) as csv_file:
        header = csv_file.readline()
        for chunk in _iter_line_chunks(csv_file, chunk_size):
            data = {
                    'token': api_token,
                    'content': 'record',
                    'action': 'import',
                    'format': 'csv',
                    'type': 'flat',
                    'overwriteBehavior': 'normal',
                    'forceAutoNumber': 'false',
                    'data': header + ''.join(chunk),
                    'returnContent': 'count',
                    'returnFormat': 'json'
                    }
            start_time = time.time()
            r = requests.post(api_url,data=data)
            total_time += time.time()-start_time
            rows_sent += len(chunk)

            if(r.status_code != 200):
                print("Error occur at row {}".format(rows_sent))
            else:
                # only the rows of accepted requests count as uploaded
                rows_uploaded += len(chunk)
                clear_output(wait = True)
                print("Already {}/{} records upload successful".format(rows_sent,total_records))

    print("There are {} records in total has been upload to the redcap database and time used is around {} hours".format(rows_uploaded,round(total_time/3600,3)))

    return

In [10]:
import os
from getpass import getpass

# Never store the API token in the notebook: it gives write access to the database.
# It is read from the REDCAP_API_TOKEN environment variable, or asked for when not set.
API = os.environ.get('REDCAP_API_TOKEN') or getpass('REDCap API token: ')
path_to_csv = 'insert_dataset.csv'
import_data_to_redcap(API,path_to_csv)

Already 545172/545172 records upload successful
There are 545172 records in total has been upload to the redcap database and time used is around 1.714 hours
