In [1]:
# Import dependencies
import requests
import pandas as pd

In [2]:
# Endpoint that every API request in this notebook is sent to.
# NOTE(review): this is the classic (pre-v2) ClinicalTrials.gov API path —
# confirm it is still being served before relying on a fresh run.
url = (
    "https://clinicaltrials.gov"
    "/api/query/study_fields"
)

In [3]:
# HTTP headers sent with each request: both the declared content type and the
# accepted response type are JSON
headers = {
    'Content-Type': 'application/json',
    'Accept': 'application/json',
}

In [4]:
# Names of the study fields to request from the API, grouped by topic
fields = [
    # Identification
    'OrgStudyId', 'BriefTitle', 'StudyType',
    # Timeline and status
    'StartDate', 'CompletionDate', 'OverallStatus',
    # Responsible party
    'ResponsiblePartyType',
    # Eligibility
    'Gender', 'GenderBased',
    'MinimumAge', 'MaximumAge',
]

In [5]:
# Accumulator for the study dictionaries taken from each JSON response
sourced_data = list()

In [6]:
# Page through the study_fields endpoint, PAGE_SIZE ranks at a time, until the
# API reports that a page contains no studies.
PAGE_SIZE = 1000       # width of the rank window (min_rnk..max_rnk) per request
REQUEST_TIMEOUT = 30   # seconds; an arbitrary but finite limit so a stalled call cannot hang the kernel

# Start from an empty list so that re-running this cell does not append the
# same studies a second time
sourced_data = []

# Set a counter to increase the ranks with each iteration
counter = 0
while True:
    # Define the parameters of the url
    params = {'expr': 'breast cancer AND SEARCH[Location](AREA[LocationState] California)',
              'fields': ','.join(fields),
              'min_rnk': 1 + PAGE_SIZE * counter,
              'max_rnk': PAGE_SIZE + PAGE_SIZE * counter,
              'fmt': 'json'}
    # Make the call
    response = requests.get(url,
                            headers=headers,
                            params=params,
                            timeout=REQUEST_TIMEOUT)
    # Fail loudly on an HTTP error status instead of trying to parse an error
    # body as if it were a normal result
    response.raise_for_status()
    # Decode the JSON body once and reuse it for both the check and the data
    study_fields_response = response.json()['StudyFieldsResponse']
    # Increment the counter
    counter += 1
    # Check to see if more data was returned for the current response
    if study_fields_response['NStudiesReturned'] == 0:
        # If there was no new data returned, break out of the while loop
        break
    # Add the data from the api call to the sourced_data list
    sourced_data.extend(study_fields_response['StudyFields'])

In [11]:
# Preview only the first few records; printing the whole list floods the
# notebook output with every study that was downloaded
sourced_data[:3]

[{'Rank': 1, 'OrgStudyId': ['CCT1043'], 'BriefTitle': ['Breast Cancer Trials Education Program'], 'StudyType': ['Interventional'], 'StartDate': ['May 2007'], 'CompletionDate': ['December 2015'], 'OverallStatus': ['Completed'], 'ResponsiblePartyType': ['Principal Investigator'], 'Gender': ['Female'], 'GenderBased': [], 'MinimumAge': ['21 Years'], 'MaximumAge': []}, {'Rank': 2, 'OrgStudyId': ['150B-0158'], 'BriefTitle': ['Breast Cancer Risk Reduction: A Patient Doctor Intervention'], 'StudyType': ['Interventional'], 'StartDate': ['June 2011'], 'CompletionDate': ['December 2012'], 'OverallStatus': ['Completed'], 'ResponsiblePartyType': ['Sponsor'], 'Gender': ['All'], 'GenderBased': [], 'MinimumAge': ['40 Years'], 'MaximumAge': ['74 Years']}, {'Rank': 3, 'OrgStudyId': ['CHUM-CRYOABLATION'], 'BriefTitle': ['Anti-Tumor Immunity Induced by Cryoablation of Invasive Breast Cancers'], 'StudyType': ['Interventional'], 'StartDate': ['April 2013'], 'CompletionDate': ['October 2014'], 'OverallStatus

In [25]:
# Turn the list of study dictionaries into a table: one row per dictionary,
# one column per key
df = pd.DataFrame(data=sourced_data)


Unnamed: 0_level_0,OrgStudyId,BriefTitle,StudyType,StartDate,CompletionDate,OverallStatus,ResponsiblePartyType,Gender,GenderBased,MinimumAge,MaximumAge
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,[CCT1043],[Breast Cancer Trials Education Program],[Interventional],[May 2007],[December 2015],[Completed],[Principal Investigator],[Female],[],[21 Years],[]
2,[150B-0158],[Breast Cancer Risk Reduction: A Patient Docto...,[Interventional],[June 2011],[December 2012],[Completed],[Sponsor],[All],[],[40 Years],[74 Years]
3,[CHUM-CRYOABLATION],[Anti-Tumor Immunity Induced by Cryoablation o...,[Interventional],[April 2013],[October 2014],[Terminated],[Sponsor],[All],[],[18 Years],[]
4,[20107812],[Monitoring and Predicting Breast Cancer Neoad...,[Observational],[December 2010],[October 2013],[Withdrawn],[Principal Investigator],[Female],[],[18 Years],[]
5,[234870],[Blood Glycan Biomarkers in Women With Stage I...,[Observational],[September 2006],"[January 12, 2016]",[Completed],[Sponsor],[Female],[],[18 Years],[]


In [None]:
# Set the rank as the index.
# Reassign instead of using inplace=True, and skip the step when 'Rank' is
# already the index, so that re-running this cell does not raise a KeyError.
# If 'Rank' is neither the index nor a column, set_index still raises.
if df.index.name != 'Rank':
    df = df.set_index('Rank')
df.head()

In [9]:
# Print a summary of df: index range, per-column non-null counts and dtypes,
# and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1685 entries, 1 to 1685
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   OrgStudyId            1685 non-null   object
 1   BriefTitle            1685 non-null   object
 2   StudyType             1685 non-null   object
 3   StartDate             1685 non-null   object
 4   CompletionDate        1685 non-null   object
 5   OverallStatus         1685 non-null   object
 6   ResponsiblePartyType  1685 non-null   object
 7   Gender                1685 non-null   object
 8   GenderBased           1685 non-null   object
 9   MinimumAge            1685 non-null   object
 10  MaximumAge            1685 non-null   object
dtypes: object(11)
memory usage: 158.0+ KB
