In [1]:
# Import dependencies
import requests
import pandas as pd

In [2]:
# Endpoint of the ClinicalTrials.gov "study_fields" query; every request below targets it.
# NOTE(review): this looks like the classic (pre-v2) API path - confirm it is still served.
url = "https://clinicaltrials.gov/api/query/study_fields"

In [3]:
# HTTP headers sent with each request: both the body type we declare and the
# response type we accept are JSON, so the two keys share one value
headers = dict.fromkeys(('Content-Type', 'Accept'), 'application/json')

In [10]:
# Study fields requested from the API; the same names are later used
# to select the DataFrame columns that get cleaned
fields = [
    'OrgStudyId', 'StudyType', 'ArmGroupType',
    'InterventionType', 'DesignInterventionModel', 'DesignObservationalModel',
    'TargetDuration', 'SamplingMethod', 'Phase',
]

In [11]:
# Accumulator for the per-study dictionaries taken from each json response
sourced_data = list()

In [177]:
# Download every matching study, one page of 1000 ranks per request.
# Table search criteria limited to 'breast cancer' and from 01/01/2018

# Start from an empty list so that re-running this cell does not append
# a second copy of every study to sourced_data
sourced_data.clear()

# Page counter: page k covers ranks 1000*k + 1 through 1000*k + 1000
counter = 0

while True:
    # Define the parameters of the url
    params = {'expr': 'breast cancer AND AREA[StartDate]RANGE[01/01/2018,MAX]',
              'fields': ','.join(fields),
              'min_rnk': 1 + 1000 * counter,
              'max_rnk': 1000 + 1000 * counter,
              'fmt': 'json'}

    # Make the call; the timeout (seconds) keeps a stalled connection from
    # hanging this otherwise unbounded loop forever
    response = requests.get(url,
                            headers = headers,
                            params = params,
                            timeout = 60)

    # Fail loudly on an HTTP error instead of trying to parse an error body
    response.raise_for_status()

    # Decode the body once and reuse it for both the check and the data
    study_fields_response = response.json()['StudyFieldsResponse']

    # Increment the counter
    counter += 1

    # Check to see if more data was returned for the current response
    if study_fields_response['NStudiesReturned'] == 0:
        # If there was no new data returned, break out of the while loop
        break

    # Add the data from the api call to the sourced_data list
    sourced_data += study_fields_response['StudyFields']
    

In [211]:
# Build the table from the collected records (one dictionary per row)
df = pd.DataFrame(data=sourced_data)

In [212]:
# Promote the 'Rank' column to the index, then preview the first rows
df = df.set_index('Rank')
df.head()

Unnamed: 0_level_0,OrgStudyId,StudyType,ArmGroupType,InterventionType,DesignInterventionModel,DesignObservationalModel,TargetDuration,SamplingMethod,Phase
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,[BTX-BCI-016-PRT],[Observational],[],[Diagnostic Test],[],[Case-Only],[5 Years],[Non-Probability Sample],[]
2,[2018-TJ-BCD],[Observational],[],"[Diagnostic Test, Diagnostic Test]",[],[Case-Control],[],[Non-Probability Sample],[]
3,[Breast cancer],[Observational],[],"[Genetic, Other]",[],[Case-Control],[],[Probability Sample],[]
4,[BC-BOMET],[Observational],[],"[Other, Other]",[],[Case-Control],[],[Non-Probability Sample],[]
5,[241391],[Observational],[],[Diagnostic Test],[],[Cohort],[1 Year],[Non-Probability Sample],[]


In [213]:
# Cast the requested columns to text so string (regular expression)
# operations can be applied to them
df[fields] = df[fields].astype(dtype=str)

In [214]:
# Run through each column in the DataFrame to remove unnecessary characters
# Also replaces blank values with NaN (null)
import numpy as np

# Matches the opening bracket plus the character after it, the closing bracket
# plus the character before it, and every single quote.
# Raw string so that "\[" is not treated as an invalid escape sequence.
list_markup = r"^\[.|.\]$|'"

for col in fields:
    # regex=True must be explicit: since pandas 2.0 the default is literal
    # matching, which would leave every value untouched
    df[col] = df[col].str.replace(list_markup, "", regex=True)
    # Values that are empty after stripping become missing values
    df[col] = df[col].replace('', np.nan)

  """


In [220]:
# Write the cleaned table to disk as csv, keeping the index as the first column
# (study_method_df is a second name for df, not a copy)
study_method_df = df
study_method_df.to_csv(path_or_buf='Tables/study_method_df.csv', index=True)