In [1]:
import os

from requests_cache import CachedSession
import pandas as pd

In [2]:
# Shared HTTP session backed by a SQLite response cache. POST is listed as a
# cacheable method because the trial search in this notebook is sent as a POST.
# NOTE(review): expire_after=1 is presumably one second, which leaves cached
# responses stale almost immediately — confirm that is the intent.
session = CachedSession(
    backend="sqlite",
    allowable_methods=("GET", "POST"),
    expire_after=1,
)

In [3]:
def get_trials(body, timeout=60):
    """POST a trial search to the clinicaltrialsapi.cancer.gov v2 endpoint.

    Every request carries a fixed filter on trial status and on the
    "treatment" primary purpose, a fixed list of fields to include, and a
    page size of 50. The entries of ``body`` are merged in last, so they
    extend or override any of those defaults.

    Args:
        body: Mapping of additional search parameters (the notebook passes
            keys such as "maintype", "subtype", "stage" and "from").
            ``None`` is treated as an empty mapping.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout a stalled connection would block the kernel forever.

    Returns:
        The decoded JSON body of the response.

    Raises:
        Whatever ``raise_for_status()`` raises for an error status code, or
        a timeout error from the HTTP layer if the server does not answer
        within ``timeout`` seconds.
    """
    payload = {
        "current_trial_status": [
            "Active",
            "Approved",
            "Active, not recruiting",
            "Enrolling by Invitation",
            "In Review",
            "Temporarily Closed to Accrual",
            "Temporarily Closed to Accrual and Intervention",
        ],
        "primary_purpose": ["treatment"],
        "include": [
            "nct_id",
            "official_title",
            "brief_summary",
            "detail_description",
            "anatomic_sites",
            "diseases.name",
            "diseases.nci_thesaurus_concept_id",
            "diseases.inclusion_indicator",
            "primary_purpose",
        ],
        "size": 50,
        # Caller-supplied keys come last so they win over the defaults above.
        **(body or {}),
    }
    res = session.post(
        "https://clinicaltrialsapi.cancer.gov/api/v2/trials",
        json=payload,
        # os.getenv returns None when CTS_V2_API_KEY is unset; presumably the
        # API then rejects the call — verify.
        headers={"X-API-KEY": os.getenv("CTS_V2_API_KEY")},
        timeout=timeout,
    )
    res.raise_for_status()
    return res.json()

In [4]:
# Collect the NCT IDs of every trial returned for the metastatic NSCLC subtype.
body = {
    "maintype": ["C2926"],  # Lung Non-Small Cell Carcinoma
    "subtype": ["C156094"],  # Metastatic Lung Non-Small Cell Carcinoma
    "from": 0,
}
met_ids = set()
# Paging offset = number of rows received so far. Using the count of unique
# IDs instead would stall the offset (and could loop forever) if a page ever
# repeated an ID that was already collected.
fetched = 0
while True:
    met_trials = get_trials(body={**body, "from": fetched})
    data = met_trials["data"]
    fetched += len(data)
    met_ids.update(t["nct_id"] for t in data)
    print("\rMet Trials:", len(met_ids), flush=True, end="")
    if not data:  # an empty page means everything has been read
        break

met_ids, len(met_ids)

Met Trials: 338

({'NCT01639508',
  'NCT02133196',
  'NCT02264678',
  'NCT02484404',
  'NCT02715284',
  'NCT02817633',
  'NCT02955290',
  'NCT03025256',
  'NCT03175224',
  'NCT03190941',
  'NCT03191149',
  'NCT03259867',
  'NCT03260491',
  'NCT03391869',
  'NCT03412877',
  'NCT03425279',
  'NCT03476681',
  'NCT03485209',
  'NCT03527108',
  'NCT03581487',
  'NCT03647163',
  'NCT03739710',
  'NCT03745326',
  'NCT03797391',
  'NCT03801902',
  'NCT03808337',
  'NCT03821935',
  'NCT03937154',
  'NCT03986606',
  'NCT04007744',
  'NCT04042701',
  'NCT04045496',
  'NCT04083599',
  'NCT04105270',
  'NCT04137900',
  'NCT04140526',
  'NCT04143711',
  'NCT04143789',
  'NCT04145622',
  'NCT04157985',
  'NCT04158336',
  'NCT04163432',
  'NCT04165070',
  'NCT04180371',
  'NCT04181060',
  'NCT04197934',
  'NCT04198766',
  'NCT04219254',
  'NCT04253964',
  'NCT04266730',
  'NCT04282044',
  'NCT04300556',
  'NCT04340882',
  'NCT04370587',
  'NCT04374877',
  'NCT04389632',
  'NCT04410796',
  'NCT04418167',
  'NCT04423029

In [5]:
# Collect the NCT IDs of every trial returned for the AJCC v7 stage IV concept.
body = {
    "maintype": ["C2926"],  # Lung Non-Small Cell Carcinoma
    "stage": ["C4012"],  # Stage IV Lung Non-Small Cell Cancer AJCC v7
    "from": 0,
}
stage_v7_ids = set()
# Paging offset = number of rows received so far. Using the count of unique
# IDs instead would stall the offset (and could loop forever) if a page ever
# repeated an ID that was already collected.
fetched = 0
while True:
    stage_v7_trials = get_trials(body={**body, "from": fetched})
    data = stage_v7_trials["data"]
    fetched += len(data)
    stage_v7_ids.update(t["nct_id"] for t in data)
    print("\rStage v7 Trials:", len(stage_v7_ids), flush=True, end="")
    if not data:  # an empty page means everything has been read
        break

stage_v7_ids, len(stage_v7_ids)

Stage v7 Trials: 9

({'NCT00246727',
  'NCT00706862',
  'NCT01630733',
  'NCT01639508',
  'NCT02264678',
  'NCT02321501',
  'NCT02715284',
  'NCT03066206',
  'NCT03191149'},
 9)

In [6]:
# Collect the NCT IDs of every trial returned for the AJCC v8 stage IV concept.
body = {
    "maintype": ["C2926"],  # Lung Non-Small Cell Carcinoma
    # Stage IV Lung Cancer AJCC v8. NOTE: this concept is more generic than
    # NSCLC.
    "stage": ["C136482"],
    "from": 0,
}
stage_v8_ids = set()
# Paging offset = number of rows received so far. Using the count of unique
# IDs instead would stall the offset (and could loop forever) if a page ever
# repeated an ID that was already collected.
fetched = 0
while True:
    stage_v8_trials = get_trials(body={**body, "from": fetched})
    data = stage_v8_trials["data"]
    fetched += len(data)
    stage_v8_ids.update(t["nct_id"] for t in data)
    print("\rStage v8 Trials:", len(stage_v8_ids), flush=True, end="")
    if not data:  # an empty page means everything has been read
        break

stage_v8_ids, len(stage_v8_ids)

Stage v8 Trials: 371

({'NCT02133196',
  'NCT02178163',
  'NCT02484404',
  'NCT02817633',
  'NCT02955290',
  'NCT02991651',
  'NCT03025256',
  'NCT03175224',
  'NCT03190941',
  'NCT03260491',
  'NCT03391869',
  'NCT03412877',
  'NCT03425279',
  'NCT03476681',
  'NCT03485209',
  'NCT03518554',
  'NCT03527108',
  'NCT03581487',
  'NCT03645928',
  'NCT03647163',
  'NCT03735095',
  'NCT03739710',
  'NCT03744468',
  'NCT03745326',
  'NCT03797391',
  'NCT03808337',
  'NCT03819296',
  'NCT03821935',
  'NCT03824327',
  'NCT03905148',
  'NCT03937154',
  'NCT03986606',
  'NCT04007744',
  'NCT04042701',
  'NCT04045496',
  'NCT04073745',
  'NCT04083599',
  'NCT04085315',
  'NCT04093167',
  'NCT04105270',
  'NCT04137900',
  'NCT04140526',
  'NCT04143711',
  'NCT04143789',
  'NCT04152499',
  'NCT04157985',
  'NCT04158336',
  'NCT04163432',
  'NCT04165070',
  'NCT04180371',
  'NCT04181060',
  'NCT04198766',
  'NCT04219254',
  'NCT04253964',
  'NCT04266730',
  'NCT04282044',
  'NCT04300556',
  'NCT04332367',
  'NCT04340882

In [7]:
# Union of the trials found through either stage concept (AJCC v7 or v8).
all_stage_ids = stage_v7_ids | stage_v8_ids
# Trials found through the metastatic subtype but through neither stage concept.
diff = met_ids - all_stage_ids
diff, len(diff)

({'NCT03259867',
  'NCT03801902',
  'NCT04145622',
  'NCT04197934',
  'NCT04585490',
  'NCT04717375',
  'NCT05136846',
  'NCT05170204',
  'NCT05358691',
  'NCT05579366',
  'NCT05624996',
  'NCT05718557',
  'NCT06124118',
  'NCT06194448',
  'NCT06333678',
  'NCT06667908'},
 16)

In [9]:
# Size of the combined stage set, then one unmatched trial ID per line.
print(len(all_stage_ids))
# Sorted so the listing is stable between runs (set order is not), and
# `nct_id` is used so the built-in `id` is not rebound in the notebook namespace.
for nct_id in sorted(diff):
    print(nct_id)

380
NCT04717375
NCT06333678
NCT06124118
NCT05358691
NCT04197934
NCT05136846
NCT04585490
NCT03259867
NCT05624996
NCT03801902
NCT05718557
NCT05170204
NCT04145622
NCT06667908
NCT06194448
NCT05579366


In [18]:
# Sort before exporting: a set has no defined order, so writing it directly
# changes the CSV row order from run to run and makes the files hard to diff.
pd.DataFrame(sorted(met_ids)).to_csv('mets.csv', index=False)
# NOTE(review): only the AJCC v8 IDs are written here, not all_stage_ids —
# confirm that is intended.
pd.DataFrame(sorted(stage_v8_ids)).to_csv('stages.csv', index=False)