In [1]:
# Import dependencies
from pathlib import Path

import requests
import pandas as pd
import numpy as np

In [2]:
# Base url for the api call: the study_fields query endpoint that create_df
# passes to requests.get and whose JSON reply it reads under 'StudyFieldsResponse'
url = "https://clinicaltrials.gov/api/query/study_fields"

In [3]:
# HTTP headers sent with every request: both the declared content type and
# the accepted response type are JSON
headers = dict.fromkeys(('Content-Type', 'Accept'), 'application/json')

In [4]:
# Extract data and create DataFrame
def create_df(x):
    """Download every matching study record from the study_fields endpoint.

    Pages through the results 1000 ranks at a time for a fixed search
    expression ('breast cancer' with a start date from 01/01/2018 onwards)
    and collects the per-study dictionaries of every page.

    Note: despite its name, this function returns a plain list; callers wrap
    the result in ``pd.DataFrame`` themselves.

    Parameters
    ----------
    x : iterable of str
        Field names to request; they are comma-joined into the ``fields``
        query parameter.

    Returns
    -------
    list of dict
        The ``StudyFields`` entries of all pages, concatenated in the order
        the pages were fetched. Empty when the first page reports no studies.

    Raises
    ------
    requests.HTTPError
        If the server answers with an HTTP error status.
    """
    # Number of ranks requested per call
    page_size = 1000

    # Holds the dictionaries gathered from every page of the json response
    sourced_data = []

    # Page counter used to shift the rank window with each iteration
    counter = 0

    while True:
        # Define the parameters of the url
        # Search criteria limited to 'breast cancer' and from 01/01/2018
        params = {'expr': 'breast cancer AND AREA[StartDate]RANGE[01/01/2018,MAX]',
                  'fields': ','.join(x),
                  'min_rnk': 1 + page_size * counter,
                  'max_rnk': page_size + page_size * counter,
                  'fmt': 'json'}

        # Make the call and fail loudly on an HTTP error instead of trying
        # to parse an error page as JSON
        response = requests.get(url,
                                headers=headers,
                                params=params)
        response.raise_for_status()

        # Parse the body once and reuse it for both the check and the data
        payload = response.json()['StudyFieldsResponse']

        # An empty page means the previous page was the last one
        if payload['NStudiesReturned'] == 0:
            break

        # Add the data from the api call to the sourced_data list
        sourced_data += payload['StudyFields']

        # Move the rank window to the next page
        counter += 1

    # Return only after every page has been collected (returning inside the
    # loop would stop after the first 1000 ranks)
    return sourced_data

In [5]:
# Define function to clean DataFrame removing unnecessary characters for analysis

def clean_data(df, fields):
    """Strip list formatting from the given columns of ``df``, in place.

    Each listed column is converted to its string form, the leading bracket
    plus the character after it, the trailing bracket plus the character
    before it, and every single quote are removed, and values that end up
    empty are replaced with NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; it is modified in place.
    fields : list of str
        Names of the columns to clean.

    Returns
    -------
    None
    """
    # Change datatype of the values, preparing for reg expression
    df[fields] = df[fields].astype(str)

    # Remove any unnecessary characters and turn blank values into NaN (null).
    # regex=True is passed explicitly: the pattern is a regular expression and
    # newer pandas versions treat the pattern as a literal string by default,
    # which would leave the values untouched. The raw string keeps the
    # backslashes without relying on an invalid escape sequence.
    for i in fields:
        df[i] = df[i].str.replace(r"^\[.|.\]$|'", "", regex=True)
        df[i] = df[i].replace('', np.nan)

In [21]:
# Fields requested for table D
table_d_cols = [
    'OrgStudyId', 'StudyType', 'ArmGroupType', 'InterventionType',
    'DesignInterventionModel', 'DesignObservationalModel',
    'TargetDuration', 'SamplingMethod', 'Phase',
]

# Fetch the table D records and build a DataFrame indexed by 'Rank'
study_method_df = pd.DataFrame(create_df(table_d_cols)).set_index('Rank')

In [22]:
# Clean DataFrame for table D (clean_data modifies study_method_df in place),
# then display its first rows
clean_data(study_method_df, table_d_cols)
study_method_df.head()

  if __name__ == "__main__":


Unnamed: 0_level_0,OrgStudyId,StudyType,ArmGroupType,InterventionType,DesignInterventionModel,DesignObservationalModel,TargetDuration,SamplingMethod,Phase
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,BTX-BCI-016-PRT,Observational,,Diagnostic Test,,Case-Only,5 Years,Non-Probability Sample,
2,2018-TJ-BCD,Observational,,"Diagnostic Test, Diagnostic Test",,Case-Control,,Non-Probability Sample,
3,Breast cancer,Observational,,"Genetic, Other",,Case-Control,,Probability Sample,
4,BC-BOMET,Observational,,"Other, Other",,Case-Control,,Non-Probability Sample,
5,241391,Observational,,Diagnostic Test,,Cohort,1 Year,Non-Probability Sample,


In [23]:
# Fields requested for table E
table_e_cols = [
    'OrgStudyId', 'WhyStopped', 'EnrollmentCount',
    'PrimaryOutcomeMeasure', 'FlowDropWithdrawType',
]

# Fetch the table E records and build a DataFrame indexed by 'Rank'
free_text_df = pd.DataFrame(create_df(table_e_cols)).set_index('Rank')


In [24]:
# Clean DataFrame for table E (clean_data modifies free_text_df in place),
# then display its first rows
clean_data(free_text_df, table_e_cols)
free_text_df.head()

  if __name__ == "__main__":


Unnamed: 0_level_0,OrgStudyId,WhyStopped,EnrollmentCount,PrimaryOutcomeMeasure,FlowDropWithdrawType
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,BTX-BCI-016-PRT,,3000,To determine BCI test performance by evaluatin...,
2,2018-TJ-BCD,,2300,Diagnostic potential of SEMA4C as a biomarker ...,
3,Breast cancer,,80,Role of SORCIN in patients with breast cancer,
4,BC-BOMET,,30,SENP1 expression,
5,241391,,600,Performance of the Syantra DX Breast Cancer te...,


In [25]:
# Export DataFrames as csv files
# Create the output folder first: to_csv does not create missing directories,
# so the export would fail on a fresh checkout without it
Path('Tables').mkdir(parents=True, exist_ok=True)
study_method_df.to_csv('Tables/study_method_df.csv', index=True)
free_text_df.to_csv('Tables/free_text_df.csv', index=True)