In [1]:
# Import dependencies
import os

import numpy as np
import pandas as pd
import requests

In [2]:
# Base url for the api call (the "study_fields" query endpoint requested by create_df)
# NOTE(review): this looks like the legacy ClinicalTrials.gov query API path -- confirm it is still served
url = "https://clinicaltrials.gov/api/query/study_fields"

In [3]:
# Request headers: declare a JSON body and ask for a JSON reply
headers = {
    'Content-Type': 'application/json',
    'Accept': 'application/json',
}

In [4]:
# Extract data and create DataFrame
def create_df(x, timeout=60):
    """Fetch every study record matching the fixed search, one page at a time.

    The search expression is hard-coded to 'breast cancer' studies whose
    StartDate is on or after 01/01/2018. Results are requested in windows of
    1000 ranks (1-1000, 1001-2000, ...) until a page reports zero studies.

    Despite the name, this returns a plain list; callers wrap it in
    ``pd.DataFrame`` themselves.

    Parameters
    ----------
    x : iterable of str
        Field names to request; they are comma-joined into the 'fields'
        query parameter.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    list
        The concatenated 'StudyFields' entries from every non-empty page.

    Raises
    ------
    requests.HTTPError
        If any page request comes back with a 4xx/5xx status.
    requests.Timeout
        If a page request exceeds ``timeout``.
    """
    # Accumulates the dictionaries from each page of the json response
    sourced_data = []

    # Number of ranks requested per call
    page_size = 1000

    # Zero-based page counter; shifts the rank window on every iteration
    page = 0

    while True:
        # Define the parameters of the url
        params = {'expr': 'breast cancer AND AREA[StartDate]RANGE[01/01/2018,MAX]',
                  'fields': ','.join(x),
                  'min_rnk': 1 + page_size * page,
                  'max_rnk': page_size + page_size * page,
                  'fmt': 'json'}

        # Make the call
        response = requests.get(url,
                                headers=headers,
                                params=params,
                                timeout=timeout)

        # Fail loudly on an HTTP error instead of on an obscure JSON/KeyError below
        response.raise_for_status()

        # Decode the body once and reuse it for both the check and the data
        payload = response.json()['StudyFieldsResponse']

        # An empty page means every matching study has been collected
        if payload['NStudiesReturned'] == 0:
            break

        # Add the data from the api call to the sourced_data list
        sourced_data += payload['StudyFields']

        # Move on to the next window of ranks
        page += 1

    return sourced_data

In [5]:
# Define function to clean DataFrame removing unnecessary characters for analysis

def clean_data(df, fields):
    """Strip list-style punctuation from the given columns of ``df``, in place.

    Every value in ``fields`` is converted to ``str``. From each string the
    function then removes a leading '[' together with the character after it,
    a trailing ']' together with the character before it, and every
    apostrophe (including apostrophes inside the text itself). Values that
    end up empty are replaced with NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; it is modified in place.
    fields : list of str
        Names of the columns to clean.

    Returns
    -------
    None
    """
    # Change datatype of the values, preparing for reg expression
    df[fields] = df[fields].astype(str)

    # Remove any unnecessary characters and turn blank values with NaN (null)
    for i in fields:
        # regex=True must be explicit: without it newer pandas treats the
        # pattern as a literal substring and nothing is removed. The raw
        # string keeps the same pattern without the invalid '\[' escape.
        df[i] = df[i].str.replace(r"^\[.|.\]$|'", "", regex=True)
        df[i] = df[i].replace('', np.nan)

In [21]:
# Fields requested from the API for table A
table_a_cols = [
    'OrgStudyId',
    'BriefTitle',
    'StartDate',
    'CompletionDate',
    'OverallStatus',
    'StudyType',
]

In [18]:
# Build the table A DataFrame from the fetched records, indexed by Rank
registration_df = pd.DataFrame(create_df(table_a_cols)).set_index('Rank')

In [20]:
# Clean DataFrame for table A (clean_data modifies registration_df in place),
# then display the cleaned frame
clean_data(registration_df, table_a_cols)
registration_df

  if __name__ == "__main__":


Unnamed: 0_level_0,OrgStudyId,BriefTitle,StartDate,CompletionDate,OverallStatus,StudyType
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,BTX-BCI-016-PRT,Breast Cancer Index (BCI) Registry,"April 14, 2021",December 2028,Recruiting,Observational
2,2018-TJ-BCD,Diagnosis Value of SEMA4C in Breast Cancer,"September 1, 2023","September 1, 2024",Not yet recruiting,Observational
3,Breast cancer,Role of Sorcin and Annexin A3 in Breast Cancer...,"January 20, 2019","September 30, 2019",Unknown status,Observational
4,BC-BOMET,Evaluation of Prognostic Factors: From Breast ...,"January 13, 2020","November 12, 2024",Recruiting,Observational
5,241391,A Study to Identify Breast Cancer (IDBC),"January 24, 2019","December 31, 2022",Unknown status,Observational
...,...,...,...,...,...,...
5009,NCI-2022-02915,Testing the Combination of the Anti-cancer Dru...,"August 4, 2022","August 4, 2025",Recruiting,Interventional
5010,4-2017-0840,Efficacy and Safety of Modified Nab-Paclitaxel...,"April 1, 2018",June 2020,Unknown status,Interventional
5011,21953,A Study Called ARAMON to Learn to What Extent ...,"December 19, 2022","October 20, 2025",Recruiting,Interventional
5012,J1940,Patient Response to Immunotherapy Using Splice...,"September 17, 2020",August 2025,Recruiting,Observational


In [24]:
# Fields requested from the API for table B
table_b_cols = [
    'OrgStudyId',
    'Gender',
    'MinimumAge',
    'MaximumAge',
    'HealthyVolunteers',
]

In [25]:
# Build the table B DataFrame from the fetched records, indexed by Rank
participant_df = pd.DataFrame(create_df(table_b_cols)).set_index('Rank')

In [26]:
# Clean DataFrame for table B (clean_data modifies participant_df in place),
# then display the cleaned frame
clean_data(participant_df, table_b_cols)
participant_df

  if __name__ == "__main__":


Unnamed: 0_level_0,OrgStudyId,Gender,MinimumAge,MaximumAge,HealthyVolunteers
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,BTX-BCI-016-PRT,Female,18 Years,,No
2,2018-TJ-BCD,Female,18 Years,80 Years,No
3,Breast cancer,Female,20 Years,75 Years,Accepts Healthy Volunteers
4,BC-BOMET,Female,18 Years,,No
5,241391,Female,30 Years,75 Years,Accepts Healthy Volunteers
...,...,...,...,...,...
4999,32900654326,Female,18 Years,60 Years,No
5000,ReDA 13176,All,18 Years,,No
5001,64121317.4.1001.5330,Male,18 Years,,No
5002,3-2020-0038,All,20 Years,,No


In [6]:
# Fields requested from the API for table D
table_d_cols = [
    'OrgStudyId',
    'StudyType',
    'ArmGroupType',
    'InterventionType',
    'DesignInterventionModel',
    'DesignObservationalModel',
    'TargetDuration',
    'SamplingMethod',
    'Phase',
]

# Build the table D DataFrame from the fetched records, indexed by Rank
study_method_df = pd.DataFrame(create_df(table_d_cols)).set_index('Rank')

In [7]:
# Clean DataFrame for table D (clean_data modifies study_method_df in place),
# then display the cleaned frame
clean_data(study_method_df, table_d_cols)
study_method_df

  if __name__ == "__main__":


Unnamed: 0_level_0,OrgStudyId,StudyType,ArmGroupType,InterventionType,DesignInterventionModel,DesignObservationalModel,TargetDuration,SamplingMethod,Phase
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,BTX-BCI-016-PRT,Observational,,Diagnostic Test,,Case-Only,5 Years,Non-Probability Sample,
2,2018-TJ-BCD,Observational,,"Diagnostic Test, Diagnostic Test",,Case-Control,,Non-Probability Sample,
3,Breast cancer,Observational,,"Genetic, Other",,Case-Control,,Probability Sample,
4,BC-BOMET,Observational,,"Other, Other",,Case-Control,,Non-Probability Sample,
5,241391,Observational,,Diagnostic Test,,Cohort,1 Year,Non-Probability Sample,
...,...,...,...,...,...,...,...,...,...
4999,32900654326,Interventional,"Placebo Comparator, Active Comparator, Active ...","Procedure, Drug",Parallel Assignment,,,,Not Applicable
5000,ReDA 13176,Interventional,"Active Comparator, Experimental","Radiation, Drug, Drug, Drug, Drug, Other, Radi...",Parallel Assignment,,,,Phase 3
5001,64121317.4.1001.5330,Interventional,Experimental,Drug,Single Group Assignment,,,,Phase 2
5002,3-2020-0038,Interventional,Experimental,Radiation,Single Group Assignment,,,,Not Applicable


In [8]:
# Fields requested from the API for table E
table_e_cols = [
    'OrgStudyId',
    'WhyStopped',
    'EnrollmentCount',
    'PrimaryOutcomeMeasure',
    'FlowDropWithdrawType',
]

# Build the table E DataFrame from the fetched records, indexed by Rank
free_text_df = pd.DataFrame(create_df(table_e_cols)).set_index('Rank')


In [9]:
# Clean DataFrame for table E (clean_data modifies free_text_df in place),
# then display the cleaned frame
clean_data(free_text_df, table_e_cols)
free_text_df

  if __name__ == "__main__":


Unnamed: 0_level_0,OrgStudyId,WhyStopped,EnrollmentCount,PrimaryOutcomeMeasure,FlowDropWithdrawType
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,BTX-BCI-016-PRT,,3000,To determine BCI test performance by evaluatin...,
2,2018-TJ-BCD,,2300,Diagnostic potential of SEMA4C as a biomarker ...,
3,Breast cancer,,80,Role of SORCIN in patients with breast cancer,
4,BC-BOMET,,30,SENP1 expression,
5,241391,,600,Performance of the Syantra DX Breast Cancer te...,
...,...,...,...,...,...
4999,32900654326,,80,The primary outcome of the study will be durat...,
5000,ReDA 13176,,180,Overall Survival,
5001,64121317.4.1001.5330,,38,PSA response rate,
5002,3-2020-0038,,80,3-year local recurrence rate,


In [10]:
# Export DataFrames as csv files
# to_csv does not create missing folders, so make sure the output directory exists first
os.makedirs('Tables', exist_ok=True)
study_method_df.to_csv('Tables/study_method_df.csv', index=True)
free_text_df.to_csv('Tables/free_text_df.csv', index=True)