In [16]:
import datetime
import os
from requests_cache import CachedSession
from tqdm.notebook import tqdm
import pandas as pd

# Shared HTTP session backed by requests_cache. POST is listed as cacheable
# because the trials endpoint is queried with session.post (see
# get_ctsapi_trials); cached entries are configured to expire after one day.
session = CachedSession(
    allowable_methods=["GET", "POST"],
    expire_after=datetime.timedelta(days=1),
)

# API key read from the environment; this is None when CTS_V2_API_KEY is unset.
key = os.environ.get("CTS_V2_API_KEY")

In [24]:
# Trial-level statuses sent as the "current_trial_status" filter in
# get_ctsapi_trials.
OPEN_TRIALS = [
    "Active",
    "Approved",
    "Enrolling by Invitation",
    "In Review",
    "Temporarily Closed to Accrual",
    "Temporarily Closed to Accrual and Intervention",
]
# Site-level statuses sent as the "sites.recruitment_status" filter in
# get_ctsapi_trials.
# NOTE(review): there is no site-level counterpart here for
# "Temporarily Closed to Accrual and Intervention" -- presumably intentional;
# confirm before "fixing" the asymmetry.
OPEN_SITES = [
    "active",
    "approved",
    "enrolling_by_invitation",
    "in_review",
    "temporarily_closed_to_accrual",
]
# Date stamp (YYYYMMDD) computed once, when this cell runs; it is embedded in
# the names of the CSV files written by the export functions below.
TODAY = datetime.date.today().strftime("%Y%m%d")


def get_ctsapi_trials(start: int, **others):
    """Request one page of trials from the CTS API v2 ``/trials`` endpoint.

    Args:
        start: Offset of the first result, sent as the ``from`` field.
        **others: Extra request-body fields. They are merged last, so they
            override any of the default fields with the same name.

    Returns:
        The decoded JSON body of the response. Callers in this file read its
        ``"total"`` and ``"data"`` entries.

    Raises:
        Whatever ``raise_for_status()`` raises for an HTTP error response.
    """
    payload = {
        "current_trial_status": OPEN_TRIALS,
        "include": ["nci_id", "nct_id", "brief_title", "current_trial_status"],
        "from": start,
        "sites.recruitment_status": OPEN_SITES,
        "size": 50,
    }
    # Caller-supplied fields win over the defaults above.
    payload.update(others)

    response = session.post(
        "https://clinicaltrialsapi.cancer.gov/api/v2/trials",
        json=payload,
        headers={"X-API-KEY": key},
    )
    response.raise_for_status()
    return response.json()


def gather_trials(**kwargs):
    """Page through the trials endpoint and return every result as one list.

    The first page supplies the reported ``"total"``; further pages are
    requested with ``start`` set to the number of trials collected so far,
    until that total is reached.

    Args:
        **kwargs: Extra request-body fields forwarded unchanged to
            ``get_ctsapi_trials`` on every page request.

    Returns:
        A list of the ``"data"`` items from all pages, in the order received.
        If the API returns an empty page before the reported total is
        reached, the partial list is returned and a ``RuntimeWarning`` is
        issued.
    """
    import warnings

    first_page = get_ctsapi_trials(start=0, **kwargs)
    total = first_page["total"]
    # Copy so the response payload itself is never mutated by extend() below.
    trials = list(first_page["data"])

    # The context manager closes the bar even if a page request raises.
    with tqdm(total=total) as pbar:
        pbar.update(len(trials))
        while len(trials) < total:
            batch = get_ctsapi_trials(start=len(trials), **kwargs)["data"]
            if not batch:
                # An empty page means no further progress is possible.
                # Without this guard the same offset would be requested
                # forever.
                warnings.warn(
                    f"API returned an empty page at offset {len(trials)} "
                    f"although it reported {total} trials; "
                    "returning a partial result.",
                    RuntimeWarning,
                    stacklevel=2,
                )
                break
            trials.extend(batch)
            pbar.update(len(batch))
    return trials


# Column order shared by every exported CSV.
_TRIAL_COLUMNS = ["nct_id", "nci_id", "current_trial_status", "brief_title"]


def _write_trials_csv(trials, prefix):
    """Write ``trials`` to ``<prefix>_<TODAY>.csv`` in the working directory.

    Args:
        trials: Trial records as returned by ``gather_trials``; only the
            columns named in ``_TRIAL_COLUMNS`` are kept.
        prefix: Leading part of the output filename.
    """
    df = pd.DataFrame(trials, columns=_TRIAL_COLUMNS)
    # utf_8_sig writes a leading byte-order mark (presumably so spreadsheet
    # tools detect the encoding -- confirm if the BOM is still required).
    df.to_csv(f"{prefix}_{TODAY}.csv", index=False, encoding="utf_8_sig")


def get_nci_campus_trials():
    """Export open trials with a site in postal code 20892 to CSV.

    Writes ``nih_onsite_trials_<TODAY>.csv``. The postal code presumably
    identifies the NIH campus -- verify before reuse. Returns ``None``.
    """
    trials = gather_trials(**{"sites.org_postal_code": "20892"})
    _write_trials_csv(trials, "nih_onsite_trials")


def get_nci_funded_trials():
    """Export open trials matching the NCI funding filters to CSV.

    Writes ``nci_funded_trials_<TODAY>.csv``. The filters sent are
    ``nci_funded="Direct"`` and ``outer_or_nci_funded="Indirect"``; their
    exact combination semantics are defined by the API, not by this code.
    Returns ``None``.
    """
    trials = gather_trials(
        **{"nci_funded": "Direct", "outer_or_nci_funded": "Indirect"}
    )
    _write_trials_csv(trials, "nci_funded_trials")

In [26]:
# Only the NCI-funded export runs here; uncomment the next line to also write
# the campus export.
# get_nci_campus_trials()
get_nci_funded_trials()

  0%|          | 0/21631 [00:00<?, ?it/s]