In [1]:
# Import dependencies
from pathlib import Path

import numpy as np
import pandas as pd
import requests

In [2]:
# Endpoint every request in this notebook is sent to (the "study_fields" query API)
# NOTE(review): this is the classic ClinicalTrials.gov API path - confirm it is still served
url = 'https://clinicaltrials.gov/api/query/study_fields'

In [3]:
# HTTP headers sent with every API call: declare and request JSON content
headers = {
    "Content-Type": "application/json",
    "Accept": "application/json",
}

In [4]:
# Download every matching study record, one page of ranks at a time
def create_df(x, timeout=60):
    """Fetch the requested study fields for all matching studies.

    Despite the name, this returns the raw records; callers wrap the result
    in ``pd.DataFrame`` themselves.

    Parameters
    ----------
    x : list of str
        Field names to request; they are comma-joined into the ``fields``
        query parameter.
    timeout : float, optional
        Seconds to wait for each HTTP response before ``requests`` gives up.

    Returns
    -------
    list of dict
        The concatenated ``StudyFields`` entries of every page, in the order
        they were returned.

    Raises
    ------
    requests.HTTPError
        If the server answers any page with an error status.
    """
    # Number of ranks requested per call (min_rnk..max_rnk window)
    page_size = 1000

    # Accumulates the dictionaries from each json response
    sourced_data = []

    # Page counter; shifts the rank window on every iteration
    counter = 0

    while True:
        # Search criteria limited to 'breast cancer' and start dates from 01/01/2018
        params = {'expr': 'breast cancer AND AREA[StartDate]RANGE[01/01/2018,MAX]',
                  'fields': ','.join(x),
                  'min_rnk': 1 + page_size * counter,
                  'max_rnk': page_size + page_size * counter,
                  'fmt': 'json'}

        response = requests.get(url,
                                headers=headers,
                                params=params,
                                timeout=timeout)

        # Fail loudly on HTTP errors instead of crashing later on a missing key
        response.raise_for_status()

        # Parse the body once and reuse it for both the check and the data
        payload = response.json()['StudyFieldsResponse']

        # An empty page means the previous page was the last one
        if payload['NStudiesReturned'] == 0:
            break

        sourced_data += payload['StudyFields']
        counter += 1

    return sourced_data

In [5]:
# Define function to clean DataFrame removing unnecessary characters for analysis

def clean_data(df, fields):
    """Flatten list-valued columns into plain text, in place.

    Each column in ``fields`` is converted to its string form, the list
    wrapper (``['`` ... ``']``) and every single quote are removed, and values
    that end up empty are replaced with NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; it is modified in place.
    fields : list of str
        Names of the columns to clean.

    Returns
    -------
    None
    """
    # Change datatype of the values, preparing for reg expression
    df[fields] = df[fields].astype(str)

    # Remove any unnecessary characters and replace blank values with NaN (null)
    for i in fields:
        # regex=True must be explicit: newer pandas treats the pattern as a
        # literal string by default, which would leave the values untouched.
        # NOTE: the pattern also removes apostrophes inside the text itself.
        df[i] = df[i].str.replace(r"^\[.|.\]$|'", "", regex=True)
        df[i] = df[i].replace('', np.nan)
        

In [6]:
# Define function to strip the "Year"/"Years" unit text from DataFrame columns

def clean_data_years(df, fields):
    """Remove the words "Years"/"Year" from the given columns, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; it is modified in place.
    fields : list of str
        Names of the columns to process.

    Returns
    -------
    None
    """
    # Change datatype of the values so the .str accessor can be used
    # NOTE(review): astype(str) may turn missing values into the text 'nan'
    # (pandas-version dependent) - confirm before relying on NaN downstream
    df[fields] = df[fields].astype(str)

    for i in fields:
        # Literal (non-regex) removal; "Years" first so no stray "s" remains
        df[i] = df[i].str.replace('Years', '', regex=False)
        df[i] = df[i].str.replace('Year', '', regex=False)
        # Removing the unit leaves a dangling space ("18 Years" -> "18 ")
        df[i] = df[i].str.strip()

In [7]:
# Fields requested from the API for table A (study registration)
table_a_cols = [
    "OrgStudyId",
    "BriefTitle",
    "StartDate",
    "CompletionDate",
    "OverallStatus",
    "StudyType",
]

In [8]:
# Download the table A records and index the resulting DataFrame by 'Rank'
registration_df = pd.DataFrame(create_df(table_a_cols)).set_index('Rank')

In [9]:
# Flatten the list-valued cells of table A (clean_data edits the frame in place)
clean_data(df=registration_df, fields=table_a_cols)

  df[i] = df[i].str.replace("^\[.|.\]$|'","")


In [10]:
# Rename the columns for table A, convert the dates to datetime and remove duplicates
registration_df.columns = ['ID', 'Title', 'Start_Date', 'Completion_Date', 'Status', 'Study_Type']
registration_df.index.names = ['Index']
# The date columns mix day-level and month-level text (the rendered output shows
# both mid-month and first-of-month dates). Parsing value by value avoids newer
# pandas inferring one format from the first value and rejecting the others.
registration_df['Start_Date'] = registration_df['Start_Date'].apply(pd.to_datetime)
registration_df['Completion_Date'] = registration_df['Completion_Date'].apply(pd.to_datetime)
# Keep only the first record for each study ID
registration_df = registration_df.drop_duplicates(subset=['ID'], keep='first')
registration_df

Unnamed: 0_level_0,ID,Title,Start_Date,Completion_Date,Status,Study_Type
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,BTX-BCI-016-PRT,Breast Cancer Index (BCI) Registry,2021-04-14,2028-12-01,Recruiting,Observational
2,2018-TJ-BCD,Diagnosis Value of SEMA4C in Breast Cancer,2023-09-01,2024-09-01,Not yet recruiting,Observational
3,Breast cancer,Role of Sorcin and Annexin A3 in Breast Cancer...,2019-01-20,2019-09-30,Unknown status,Observational
4,BC-BOMET,Evaluation of Prognostic Factors: From Breast ...,2020-01-13,2024-11-12,Recruiting,Observational
5,241391,A Study to Identify Breast Cancer (IDBC),2019-01-24,2022-12-31,Unknown status,Observational
...,...,...,...,...,...,...
5004,32900654326,"TPVB, PECSB, ESPB for Postmastectmy Pain",2019-04-10,2021-08-10,Completed,Interventional
5005,ReDA 13176,A Randomized Phase III Trial of Stereotactic A...,2023-04-01,2029-04-01,Not yet recruiting,Interventional
5006,64121317.4.1001.5330,Nivolumab in Prostate Cancer With DNA Repair D...,2018-06-01,2022-03-01,"Active, not recruiting",Interventional
5007,3-2020-0038,Intraoperative Radiation Therapy for Resectabl...,2020-04-02,2026-04-01,Recruiting,Interventional


In [11]:
# Fields requested from the API for table B (participant eligibility)
table_b_cols = [
    "OrgStudyId",
    "Gender",
    "MinimumAge",
    "HealthyVolunteers",
]

In [12]:
# Download the table B records and index the resulting DataFrame by 'Rank'
participant_df = pd.DataFrame(create_df(table_b_cols)).set_index('Rank')

In [13]:
# Flatten the list-valued cells of table B, then remove the "Year(s)" unit text
# (both helpers edit the frame in place)
clean_data(df=participant_df, fields=table_b_cols)
clean_data_years(df=participant_df, fields=table_b_cols)

  df[i] = df[i].str.replace("^\[.|.\]$|'","")


In [14]:
# Rename the columns for table B, drop rows whose minimum age is given in months,
# and remove duplicates
participant_df.columns = ['ID', 'Gender', 'Minimum_Age', 'Healthy_Volunteers']
participant_df.index.names = ['Index']
# na=False keeps the mask strictly boolean: a missing age counts as "not in months"
# instead of the row being dropped by a NaN comparison
participant_df = participant_df[~participant_df['Minimum_Age'].str.contains('Months', na=False)]
# Keep only the first record for each study ID
participant_df = participant_df.drop_duplicates(subset=['ID'], keep='first')
participant_df

Unnamed: 0_level_0,ID,Gender,Minimum_Age,Healthy_Volunteers
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,BTX-BCI-016-PRT,Female,18,No
2,2018-TJ-BCD,Female,18,No
3,Breast cancer,Female,20,Accepts Healthy Volunteers
4,BC-BOMET,Female,18,No
5,241391,Female,30,Accepts Healthy Volunteers
...,...,...,...,...
5004,32900654326,Female,18,No
5005,ReDA 13176,All,18,No
5006,64121317.4.1001.5330,Male,18,No
5007,3-2020-0038,All,20,No


In [15]:
# Fields requested from the API for table C (study details)
table_c_cols = [
    "OrgStudyId",
    "IsFDARegulatedDrug",
    "IsFDARegulatedDevice",
    "ResponsiblePartyType",
]

In [16]:
# Download the table C records and index the resulting DataFrame by 'Rank'
study_details_df = pd.DataFrame(create_df(table_c_cols)).set_index('Rank')

In [17]:
# Flatten the list-valued cells of table C (clean_data edits the frame in place)
clean_data(df=study_details_df, fields=table_c_cols)

  df[i] = df[i].str.replace("^\[.|.\]$|'","")


In [18]:
# Give table C its final column/index labels, then keep the first row per study ID
study_details_df.columns = [
    'ID',
    'FDA_Regulated_Drug',
    'FDA_Regulated_Device',
    'Responsible_Party',
]
study_details_df.index.name = 'Index'
study_details_df = study_details_df.drop_duplicates(subset='ID')
study_details_df

Unnamed: 0_level_0,ID,FDA_Regulated_Drug,FDA_Regulated_Device,Responsible_Party
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,BTX-BCI-016-PRT,No,No,Sponsor
2,2018-TJ-BCD,No,No,Principal Investigator
3,Breast cancer,No,No,Principal Investigator
4,BC-BOMET,No,No,Sponsor
5,241391,No,No,Sponsor
...,...,...,...,...
5004,32900654326,No,No,Principal Investigator
5005,ReDA 13176,No,No,Sponsor-Investigator
5006,64121317.4.1001.5330,Yes,No,Sponsor
5007,3-2020-0038,No,No,Sponsor-Investigator


In [19]:
# Fields requested from the API for table D (study method)
table_d_cols = [
    "OrgStudyId",
    "StudyType",
    "ArmGroupType",
    "InterventionType",
    "DesignInterventionModel",
    "DesignObservationalModel",
    "TargetDuration",
    "SamplingMethod",
    "Phase",
]

# Download the table D records and index the resulting DataFrame by 'Rank'
study_method_df = pd.DataFrame(create_df(table_d_cols)).set_index('Rank')

In [20]:
# Flatten the list-valued cells of table D (clean_data edits the frame in place)
clean_data(df=study_method_df, fields=table_d_cols)

  df[i] = df[i].str.replace("^\[.|.\]$|'","")


In [21]:
# Give table D its final column/index labels, then keep the first row per study ID
study_method_df.columns = [
    'ID',
    'Study_Type',
    'Arm_Group_Type',
    'Intervention_Type',
    'Interventional_Study_Model',
    'Observational_Study_Model',
    'Target_Duration',
    'Sampling_Method',
    'Phase',
]
study_method_df.index.name = 'Index'
study_method_df = study_method_df.drop_duplicates(subset='ID')
study_method_df

Unnamed: 0_level_0,ID,Study_Type,Arm_Group_Type,Intervention_Type,Interventional_Study_Model,Observational_Study_Model,Target_Duration,Sampling_Method,Phase
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,BTX-BCI-016-PRT,Observational,,Diagnostic Test,,Case-Only,5 Years,Non-Probability Sample,
2,2018-TJ-BCD,Observational,,"Diagnostic Test, Diagnostic Test",,Case-Control,,Non-Probability Sample,
3,Breast cancer,Observational,,"Genetic, Other",,Case-Control,,Probability Sample,
4,BC-BOMET,Observational,,"Other, Other",,Case-Control,,Non-Probability Sample,
5,241391,Observational,,Diagnostic Test,,Cohort,1 Year,Non-Probability Sample,
...,...,...,...,...,...,...,...,...,...
5004,32900654326,Interventional,"Placebo Comparator, Active Comparator, Active ...","Procedure, Drug",Parallel Assignment,,,,Not Applicable
5005,ReDA 13176,Interventional,"Active Comparator, Experimental","Radiation, Drug, Drug, Drug, Drug, Other, Radi...",Parallel Assignment,,,,Phase 3
5006,64121317.4.1001.5330,Interventional,Experimental,Drug,Single Group Assignment,,,,Phase 2
5007,3-2020-0038,Interventional,Experimental,Radiation,Single Group Assignment,,,,Not Applicable


In [22]:
# Fields requested from the API for table E (free-text / enrollment data)
table_e_cols = [
    "OrgStudyId",
    "EnrollmentCount",
    "PrimaryOutcomeMeasure",
]

# Download the table E records and index the resulting DataFrame by 'Rank'
free_text_df = pd.DataFrame(create_df(table_e_cols)).set_index('Rank')


In [23]:
# Flatten the list-valued cells of table E (clean_data edits the frame in place)
clean_data(df=free_text_df, fields=table_e_cols)

  df[i] = df[i].str.replace("^\[.|.\]$|'","")


In [24]:
# Give table E its final column/index labels, then keep the first row per study ID
free_text_df.columns = [
    'ID',
    'Enrollment_Count',
    'Primary_Outcome_Measure',
]
free_text_df.index.name = 'Index'
free_text_df = free_text_df.drop_duplicates(subset='ID')
free_text_df

Unnamed: 0_level_0,ID,Enrollment_Count,Primary_Outcome_Measure
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,BTX-BCI-016-PRT,3000,To determine BCI test performance by evaluatin...
2,2018-TJ-BCD,2300,Diagnostic potential of SEMA4C as a biomarker ...
3,Breast cancer,80,Role of SORCIN in patients with breast cancer
4,BC-BOMET,30,SENP1 expression
5,241391,600,Performance of the Syantra DX Breast Cancer te...
...,...,...,...
5004,32900654326,80,The primary outcome of the study will be durat...
5005,ReDA 13176,180,Overall Survival
5006,64121317.4.1001.5330,38,PSA response rate
5007,3-2020-0038,80,3-year local recurrence rate


In [25]:
# Export DataFrames as csv files (the index is not written)
output_dir = Path('Tables')
# Create the target folder up front so a fresh checkout does not fail on the first write
output_dir.mkdir(parents=True, exist_ok=True)

tables_to_export = {
    'study_method_df': study_method_df,
    'free_text_df': free_text_df,
    'registration_df': registration_df,
    'participant_df': participant_df,
    'study_details_df': study_details_df,
}
for table_name, table_df in tables_to_export.items():
    table_df.to_csv(output_dir / f'{table_name}.csv', index=False)