In [None]:
!pip install groq

In [None]:
import pandas as pd
from tqdm import tqdm

import re
import requests
import json
import time
import copy
from typing import Dict, Any

# ClinicalTrials.gov Fetcher (Data Acquisition Layer)

Programmatically fetch lung-cancer clinical trial metadata from the ClinicalTrials.gov API, with pagination, optional status filtering, and basic safety controls (request timeouts, a progress bar).

In [None]:

# ============= LUNG CANCER-ONLY FETCHER =============
BASE_URL = "https://clinicaltrials.gov/api/v2/studies"

def fetch_lung_cancer_trials(
    page_size=10,
    max_total=50,
    status_filter=None         # e.g., "RECRUITING", "ACTIVE_NOT_RECRUITING"
):
    """
    Download lung-cancer study records from the ClinicalTrials.gov v2 API.

    Studies are selected with a MeSH-term filter on lung neoplasms and are
    retrieved page by page using the token returned with each response.

    Args:
        page_size (int): Number of studies requested per page.
        max_total (int | None): Stop once at least this many studies have been
            collected and truncate the result to this length. A falsy value
            (None or 0) disables the cap and fetches every page.
        status_filter (str | None): Optional value for the "filter.status"
            query parameter, e.g. "RECRUITING".

    Returns:
        list: The study records taken from the "studies" key of each page.

    Raises:
        requests.HTTPError: If a page request returns an error status
            (raised by ``raise_for_status``). Other ``requests`` exceptions,
            such as timeouts, propagate as well.
    """
    params = {
        "format": "json",
        "pageSize": page_size,
        # MeSH filter for lung neoplasms
        "filter.advanced": "AREA[ConditionMeshTerm]Neoplasms/Lung",
    }
    if status_filter:
        params["filter.status"] = status_filter

    all_studies = []
    next_token = None
    pbar = tqdm(desc="Fetching lung cancer trials", unit="trial")

    # try/finally guarantees the progress bar is closed even when a request
    # times out or returns an error status part-way through the download.
    try:
        while True:
            if next_token:
                # Token-based pagination (not offset pagination)
                params["pageToken"] = next_token

            resp = requests.get(BASE_URL, params=params, timeout=60)
            resp.raise_for_status()
            data = resp.json()

            batch = data.get("studies", [])
            all_studies.extend(batch)
            pbar.update(len(batch))

            next_token = data.get("nextPageToken")
            if not next_token or (max_total and len(all_studies) >= max_total):
                break

            # Short pause between pages to avoid hammering the API
            time.sleep(0.1)
    finally:
        pbar.close()

    # The last page may overshoot max_total, so trim to the requested size
    studies = all_studies[:max_total] if max_total else all_studies
    print(f"\nFetched {len(studies)} lung cancer trials")
    return studies

# Usage
# Pull up to 100 lung-cancer studies, requesting 100 records per page.
studies = fetch_lung_cancer_trials(page_size=100, max_total=100)
# Optional: pass status_filter="RECRUITING|ACTIVE_NOT_RECRUITING" to restrict by status.

print("Done!")


Fetching lung cancer trials: 100trial [00:00, 104.23trial/s]


Fetched 100 lung cancer trials
Done!





# Flattening & Preprocessing Clinical Trial Records

Transforming messy, nested API responses into clean, machine-friendly tabular structures
for downstream modeling, LLM extraction, and DataFrame storage.

The ClinicalTrials.gov API returns a nested JSON structure: arrays inside dictionaries inside arrays.
For ML workflows, however, we need flat rows, such as:
| nct_id | title | status | eligibility_text | conditions | locations | ... |

This is what the following preprocessing code accomplishes.

In [None]:
# Flatten helper
def safe_get(d, path, default=""):
    """
    Walk a nested dictionary along a sequence of keys without raising.

    Args:
        d (dict): Starting container (may be None or any non-dict value).
        path (list): Keys to follow in order, e.g. ["designModule", "phases"].
        default: Fallback returned when a key is absent, when a non-dict is
            reached before the path is exhausted, or when the final value
            is None.

    Returns:
        The value found at the end of the path, otherwise `default`.
    """
    node = d
    for key in path:
        if isinstance(node, dict):
            node = node.get(key, default)
        else:
            # Cannot descend any further into a non-dict value
            return default

    if node is None:
        return default
    return node


# Flattening

"""    
Phases and Conditions may each be either a list or a single value. The code below
ensures a consistent string representation by joining list values with ", ";
if the value is not a list, the single value (or an empty string) is used as-is.
"""

# (column name, key path inside protocolSection, value may be a list)
_FIELD_SPECS = [
    ("NCT_ID", ["identificationModule", "nctId"], False),
    ("Title", ["identificationModule", "briefTitle"], False),
    ("Official_Title", ["identificationModule", "officialTitle"], False),
    ("Status", ["statusModule", "overallStatus"], False),
    ("Start_Date", ["statusModule", "startDateStruct", "date"], False),
    ("Primary_Completion", ["statusModule", "primaryCompletionDateStruct", "date"], False),
    ("Study_Type", ["designModule", "studyType"], False),
    ("Phases", ["designModule", "phases"], True),
    ("Enrollment", ["designModule", "enrollmentInfo", "count"], False),
    ("Conditions", ["conditionsModule", "conditions"], True),
    ("Sex", ["eligibilityModule", "sex"], False),
    ("Min_Age", ["eligibilityModule", "minimumAge"], False),
    ("Max_Age", ["eligibilityModule", "maximumAge"], False),
    ("Lead_Sponsor", ["sponsorCollaboratorsModule", "leadSponsor", "name"], False),
    ("Brief_Summary", ["descriptionModule", "briefSummary"], False),
    ("Eligibility", ["eligibilityModule", "eligibilityCriteria"], False),
]


def _flatten_field(section, path, may_be_list):
    """Look up one field; list-valued fields are joined with ", "."""
    if not may_be_list:
        return safe_get(section, path)
    value = safe_get(section, path, [])
    return ", ".join(value) if isinstance(value, list) else value


def _join_measures(outcomes):
    """Join the "measure" text of each outcome with " | " ("" when there are none)."""
    if not outcomes:
        return ""
    return " | ".join(outcome.get("measure", "") for outcome in outcomes)


records = []
for study in studies:
    protocol = study.get("protocolSection", {})

    flat = {
        column: _flatten_field(protocol, path, may_be_list)
        for column, path, may_be_list in _FIELD_SPECS
    }

    # Outcomes and interventions are lists of dicts; keep only their key text
    flat["Primary_Measures"] = _join_measures(
        safe_get(protocol, ["outcomesModule", "primaryOutcomes"], [])
    )
    flat["Secondary_Measures"] = _join_measures(
        safe_get(protocol, ["outcomesModule", "secondaryOutcomes"], [])
    )
    interventions = safe_get(protocol, ["armsInterventionsModule", "interventions"], [])
    if interventions:
        flat["Interventions"] = " | ".join(
            f"{item.get('type','')}:{item.get('name','')}" for item in interventions
        )
    else:
        flat["Interventions"] = ""

    records.append(flat)

df = pd.DataFrame(records)
print("Raw df shape:", df.shape)

Raw df shape: (100, 19)


# Clinical Trial Text Normalization

Clean and normalize the raw text fields, especially the eligibility criteria, by removing artifacts, whitespace noise, and metadata markup.

In [None]:
# ============= Clean text ============
def clean_text(t):
    """
    Normalize a free-text field to a single-spaced, trimmed string.

    Bullet glyphs ("•") and "(Version ...)" markers are removed first and
    whitespace is collapsed last, so the gaps left behind by the removed
    pieces cannot survive as runs of spaces.

    Args:
        t: Value to clean. Anything that is not a str yields "".

    Returns:
        str: The cleaned text, with every whitespace run reduced to one space
        and no leading or trailing whitespace.
    """
    if not isinstance(t, str):
        return ""
    t = t.replace("•", " ")
    t = re.sub(r"\(Version [^\)]*\)", "", t)
    # Collapsing last also covers newlines, tabs and carriage returns
    t = re.sub(r"\s+", " ", t)
    return t.strip()

# Normalize every free-text column that is present in the frame
text_columns = ("Title", "Brief_Summary", "Official_Title", "Eligibility")
for name in text_columns:
    if name not in df.columns:
        continue
    df[name] = df[name].apply(clean_text)

# Character count of the cleaned eligibility text (0 for empty values)
df["elig_len"] = df["Eligibility"].apply(lambda text: len(text or ""))



In [None]:
# Preview the first five rows of the flattened frame.
df.head(5)

Unnamed: 0,NCT_ID,Title,Official_Title,Status,Start_Date,Primary_Completion,Study_Type,Phases,Enrollment,Conditions,Sex,Min_Age,Max_Age,Lead_Sponsor,Brief_Summary,Eligibility,Primary_Measures,Secondary_Measures,Interventions,elig_len
0,NCT01189968,A Study of Carboplatin and Pemetrexed Plus Dem...,A Phase 1b Study of Carboplatin and Pemetrexed...,COMPLETED,2010-09,2016-09,INTERVENTIONAL,PHASE1,50,Non Small Cell Lung Cancer,ALL,21 Years,,"OncoMed Pharmaceuticals, Inc.",The purpose of this study is to test the safet...,Inclusion criteria 1. Subjects must have histo...,To the determine the maximum tolerated dose of...,To determine the safety of carboplatin and pem...,DRUG:Demcizumab,4781
1,NCT03199586,Clinical Trial of Metastasis Inhibitor NP-G2-0...,"First-in-Human, Dose Finding, Open Label Phase...",COMPLETED,2017-12-21,2020-05-07,INTERVENTIONAL,PHASE1,23,"Breast Cancer, Pancreas Cancer, Prostate Cance...",ALL,18 Years,,"Novita Pharmaceuticals, Inc.",Phase 1 A: First-in-human phase 1 study to det...,Inclusion Criteria: 1. Signed informed consent...,Establish the safe recommended phase 2 dose,Identify and characterize preliminary anti tum...,DRUG:NP-G2-044,3702
2,NCT00305786,Gemcitabine and Oxaliplatin as Second-Line The...,Phase II Study of Oxaliplatin in Combination W...,COMPLETED,2005-08,2007-12,INTERVENTIONAL,PHASE2,30,Lung Cancer,ALL,18 Years,120 Years,University of Miami,"RATIONALE: Drugs used in chemotherapy, such as...",DISEASE CHARACTERISTICS: * Histologically prov...,Response rate as measured by RECIST criteria,Toxicity as monitored by DSMC,DRUG:gemcitabine hydrochloride | DRUG:oxaliplatin,2228
3,NCT06508307,A Phase I Clinical Study of Intratumoral Injec...,"A Phase I Study Evaluating the Safety, Tolerab...",RECRUITING,2023-04-26,2026-01,INTERVENTIONAL,PHASE1,21,"Sarcoma, Cervical Cancer, Colon Cancer, Lung C...",ALL,18 Years,75 Years,"GONGCHU Biotechnology Co., Ltd","The present trial is an open, single-arm phase...",Inclusion Criteria: To be eligible for partici...,Evaluate the safety and tolerability of GC001 ...,Anti-tumor activity of GC001: overall response...,BIOLOGICAL:A Phase I Clinical Study of Intratu...,8665
4,NCT05926336,The Effects of Using Different Anesthetics on ...,To Compare the Effects of Intraoperative Use o...,RECRUITING,2023-05-23,2026-07,INTERVENTIONAL,PHASE4,1316,"Lung Cancer, Brain Tumor, Liver Cancer, Ovaria...",ALL,20 Years,80 Years,Kaohsiung Medical University Chung-Ho Memorial...,1. Eligible participants were assessed prior t...,Inclusion Criteria: * eighteen to eighty-year-...,Overall survival | The presence of disease pro...,Postoperative complications | Karnofsky perfor...,DRUG:Propofol | DRUG:Sevoflurane,726


# Filtering for Systemic Anticancer Trials

This section implements high-precision filtering to ensure the dataset contains only relevant interventional systemic anticancer trials, focusing on therapeutic, drug-based interventions and avoiding noise in ML/LLM training.

In [None]:
def _compile_keywords(phrases, patterns=()):
    """Compile one regex matching any phrase (or raw pattern) as a whole word.

    Literal phrases are escaped and may carry a plural "s". The surrounding
    look-arounds stop short acronyms from matching inside longer words
    (e.g. "os" inside "dose", "rfa" inside "surface"). The regex expects
    lowercase text.
    """
    alternatives = [re.escape(phrase.strip()) + "s?" for phrase in phrases]
    alternatives.extend(patterns)
    return re.compile(
        r"(?<![a-z0-9])(?:" + "|".join(alternatives) + r")(?![a-z0-9])"
    )


# HARD EXCLUDE — non-drug / procedural / local therapy
_JUNK_RE = _compile_keywords(
    [
        # Locoregional / ablation / device
        "hifu", "radiofrequency ablation", "rfa", "cryoablation", "microwave ablation",
        "tace", "deb-tace", "y90", "radioembolization", "sir-spheres", "therasphere",
        "hepatic arterial", "haic", "chemoembolization", "embolization",
        "sbrt", "cyberknife", "gamma knife", "stereotactic body", "stereotactic radiosurgery",
        "brachytherapy", "hdr", "intrathecal", "lumbar puncture", "pleurodesis",
        "thoracentesis", "paracentesis", "pleural catheter", "chest tube", "pleurx",

        # Surgical / diagnostic procedures
        "surgery", "resection", "lobectomy", "wedge resection", "thoracoscopic", "vats", "ivats",
        "biopsy only", "diagnostic biopsy", "tissue collection",

        # Prevention / screening / supportive care
        "prevention", "chemoprevention", "smoking cessation", "aspirin", "statin",
        "screening", "early detection", "surveillance",

        # Non-cancer or minimal anticancer intent
        "supportive care", "palliative care only", "best supportive care", "placebo",
        "radiation only", "radiotherapy alone", "radiation therapy as single modality",

        # Others
        "inoperable locally advanced", "concurrent chemoradiation", "definitive chemoradiation",
        "no study drug", "thoracic radiation",
    ],
    patterns=[
        # Stage III with an optional sub-stage letter (IIIA, IIIB, IIIC)
        r"stage iii[a-c]?",
        # These two were written as regexes but previously tested with `in`,
        # so they could never match. The gap is limited to one sentence.
        r"induction chemotherapy[^.]*allowed",
        r"radiation therapy[^.]*primary",
    ],
)

# Must have at least ONE real anticancer drug keyword
_DRUG_RE = _compile_keywords([
    "chemotherapy", "targeted therapy", "immunotherapy", "pd-1", "pd-l1", "ctla-4",
    "parp inhibitor", "tkis", "egfr", "alk", "braf", "mek", "mtor", "pi3k",
    "cdk4/6", "bcl-2", "antibody-drug conjugate", "adc", "bispecific",
    "car-t", "tcr-t", "til therapy", "vaccine",  # (vaccine often therapeutic in oncology)
    "olaparib", "osimertinib", "pembrolizumab", "nivolumab", "atezolizumab",
    "bevacizumab", "trastuzumab", "cetuximab", "ramucirumab", "everolimus",
    "lenvatinib", "sunitinib", "pazopanib", "cabozantinib", "regorafenib",
])

# Must contain at least one of these strong signals
_STRONG_SIGNAL_RE = _compile_keywords(
    [
        "dose escalation", "maximum tolerated dose", "recommended phase 2 dose",
        "progression-free survival", "overall survival", "objective response rate",
        "recist", "irrc", "pfs", "os", "orr", "dcr",
    ],
    patterns=[
        # Phase 1-3 in Arabic or Roman numerals, optional a/b suffix ("phase 1b")
        r"phase (?:[1-3]|i{1,3})[ab]?",
    ],
)


def is_systemic_anticancer_trial(row):
    """
    Decide whether a trial record looks like a systemic anticancer drug trial.

    The title, official title, summary, eligibility, conditions and
    interventions fields are concatenated and lowercased. A record is kept
    only if the text contains no exclusion keyword, at least one drug
    keyword, and at least one strong trial signal. Keywords are matched as
    whole words or phrases, not as raw substrings.

    Args:
        row: Mapping-like object exposing ``.get`` (a dict or a pandas
            Series) with the keys 'Title', 'Official_Title', 'Brief_Summary',
            'Eligibility', 'Conditions' and 'Interventions'. Missing keys are
            treated as empty strings.

    Returns:
        bool: True if the record passes all three checks, False otherwise.
    """
    text = " ".join([
        str(row.get('Title', '')),
        str(row.get('Official_Title', '')),
        str(row.get('Brief_Summary', '')),
        str(row.get('Eligibility', '')),
        str(row.get('Conditions', '')),
        str(row.get('Interventions', ''))
    ]).lower()

    if _JUNK_RE.search(text):
        return False

    if not _DRUG_RE.search(text):
        return False

    return _STRONG_SIGNAL_RE.search(text) is not None


# Apply
# Keep interventional studies that also pass the keyword-based screen
interventional_mask = df['Study_Type'] == 'INTERVENTIONAL'
anticancer_mask = df.apply(is_systemic_anticancer_trial, axis=1)
df = df[interventional_mask & anticancer_mask].copy()

In [None]:

# Retain rows whose eligibility text is between 300 and 3500 characters (inclusive)
long_enough = df['elig_len'] >= 300
short_enough = df['elig_len'] <= 3500
df_filtered = df[short_enough & long_enough].copy()

# Cap the working set at 100 rows, sampling reproducibly when there are more
if len(df_filtered) <= 100:
    print(f"After filtering, only {len(df_filtered)} rows remain (≤100), keeping all.")
else:
    df_filtered = df_filtered.sample(n=100, random_state=2)

# Renumber rows 0..n-1 and continue with the filtered frame as df
df = df_filtered.reset_index(drop=True)

After filtering, only 4 rows remain (≤100), keeping all.


# LLM-Based Eligibility Extraction
This section defines extract_eligibility, a robust wrapper to convert unstructured trial eligibility text into a strict JSON schema using a Large Language Model (LLM).

In [None]:
# Setting up the Groq client

from groq import Groq
from getpass import getpass

# Prompt for the API key interactively instead of hard-coding it in the notebook.
GROQ_API_KEY = getpass("Get your free key at https://console.groq.com/keys → ")
# Client object passed to extract_eligibility for the LLM calls.
client = Groq(api_key=GROQ_API_KEY)

Get your free key at https://console.groq.com/keys → ··········


In [None]:
# ─────────────────────────────────────────────────────────────
# FINAL SYSTEM PROMPT
# ─────────────────────────────────────────────────────────────
# Instruction text sent as the "system" message by extract_eligibility; the
# trial's eligibility text is sent separately as the "user" message.
# The prompt embeds the full target JSON schema followed by normalization rules.
# extract_eligibility reads the "inclusion" and "performance_status"/"scale"
# keys of the returned object, so those schema keys must stay in sync with it.
# NOTE(review): rule 6 maps "Bilirubin ≤1.5 mg/dL" to "bilirubin_x_uln": 1.5,
# although mg/dL is an absolute unit rather than a multiple of ULN — confirm intent.
# NOTE(review): rule 2 maps "Karnofsky ≥70" to ECOG max 1 — confirm this
# conversion against the intended clinical mapping.
SYSTEM_PROMPT = """
You are a world-class, high-precision clinical trial eligibility parser specialized in oncology interventional therapeutic trials. Your only job is to output perfect, strictly valid JSON using the exact schema below. Never add, remove, or rename any field. Never output explanations, markdown, or extra text.

### MANDATORY SCHEMA (use EXACTLY this structure)
{
  "trial_id": null,
  "trial_category": "therapeutic_interventional",
  "age": {"min": 18, "max": null},
  "region_specific_age": {"japan_min": null},
  "inclusion": {
    "disease": {
      "confirmed_by": null,
      "cancer_type": "",
      "histology_subtype": "",
      "stage": "",
      "stage_list": [],
      "metastatic": null,
      "measurable_disease_recist": null,
      "biomarker_required": []
    },
    "performance_status": {
      "scale": null,
      "min": null,
      "max": null
    },
    "life_expectancy_weeks": null,
    "prior_therapy": {
      "required": [],
      "allowed": [],
      "disallowed": [],
      "max_lines_systemic": null,
      "washout_weeks": {
        "chemotherapy": null,
        "targeted_therapy": null,
        "immunotherapy": null,
        "investigational": null,
        "radiation": null,
        "major_surgery": null
      }
    },
    "brain_metastases": null,
    "brain_mets_stable_duration_weeks": null,
    "organ_function": {
      "anc": null,
      "platelets": null,
      "hemoglobin_g_per_dl": null,
      "creatinine_clearance_ml_min": null,
      "bilirubin_x_uln": null,
      "ast_alt_x_uln": null,
      "albumin_g_per_dl": null
    },
    "cardiac": {
      "qtcf_ms_max": null,
      "recent_mi_months_exclusion": null,
      "nyha_class_max": null,
      "lvef_percent_min": null
    },
    "contraception_required": null,
    "other_inclusions": []
  },
  "exclusion": {
    "pregnant_or_breastfeeding": null,
    "active_cns_metastases": null,
    "uncontrolled_intercurrent_illness": null,
    "grade_2_or_higher_neuropathy": null,
    "history_of": [],
    "concurrent_medications_disallowed": [],
    "other_exclusions": []
  }
}

### STRICT RULES & CONVENTIONS (follow exactly)

1. Age
   - Default "min": 18 unless explicitly different
   - Japan-specific pediatric trials → fill japan_min only

2. Performance Status
   - Convert everything to ECOG 0–5 scale
   - Karnofsky 70–100 → ECOG 0–1 ("min": 0, "max": 1)
   - Karnofsky ≥70 → "max": 1
   - ECOG ≤1 → "max": 1
   - Always fill "scale": "ECOG"

3. Brain Metastases – ONLY use these 5 values:
   null | "excluded" | "allowed_if_asymptomatic" | "allowed_if_stable" | "allowed_if_treated_and_stable"
   - If stable duration specified → fill brain_mets_stable_duration_weeks
   - "Symptomatic", "uncontrolled", "requiring steroids" → "excluded"

4. Biomarkers
   - Use exact wording from trial: "ALK positive by FDA-approved test", "EGFR exon 19 del or L858R", "PD-L1 TPS ≥50%", "BRCA1/2 mutated", "MSI-H/dMMR"

5. Prior Therapy
   - "max_lines_systemic" = total systemic lines (not including adjuvant)
   - "required" = must have received
   - "disallowed" = must NOT have received

6. Lab Values – ALWAYS normalize
   - Bilirubin ≤1.5 mg/dL → "bilirubin_x_uln": 1.5
   - AST/ALT ≤3×ULN (≤5× if liver mets) → "ast_alt_x_uln": 3 + note exception in other_inclusions
   - Hemoglobin ≥9 g/dL → "hemoglobin_g_per_dl": 9

7. Life Expectancy
   - ≥3 months = 12, ≥6 months = 24, ≥12 months = 52

8. Contraception
   - Any mention of highly effective contraception → "contraception_required": true

9. cancer_type & histology_subtype
   - cancer_type: "non-small cell lung cancer", "hepatocellular carcinoma", "urothelial carcinoma"
   - histology_subtype: "adenocarcinoma", "squamous", "small cell", "clear cell RCC"

10. NEVER leave critical fields null if information exists
    - If ECOG mentioned → fill performance_status
    - If any labs mentioned → fill organ_function
    - If any prior therapy rules → fill prior_therapy

11. other_inclusions / other_exclusions
    - Short, precise bullets only
    - Examples:
      "Archival tumor tissue required"
      "Liver metastases: AST/ALT ≤5×ULN allowed"
      "Able to swallow tablets"
      "No active autoimmune disease requiring systemic treatment in past 2 years"

Only output raw JSON. No ```json wrapper, no extra characters, no thinking step.

Now parse the following trial eligibility text:
"""


In [None]:

# ─────────────────────────────────────────────────────────────
# FINAL EXTRACTION FUNCTION
# ─────────────────────────────────────────────────────────────
def _parse_eligibility_json(raw: str) -> Dict[str, Any]:
    """Convert one raw LLM reply into a validated eligibility dictionary.

    Raises ValueError (json.JSONDecodeError is a subclass) when the reply is
    not a JSON object with a dict under "inclusion".
    """
    # Remove any accidental markdown fence around the JSON
    cleaned = re.sub(r"^```json\s*", "", raw.strip(), flags=re.IGNORECASE)
    cleaned = re.sub(r"^```\s*", "", cleaned)
    cleaned = re.sub(r"```$", "", cleaned).strip()

    if not cleaned.startswith("{"):
        raise ValueError("Output does not start with {")

    # raw_decode parses the first complete JSON value and ignores trailing
    # text. Unlike counting braces by hand, it is not confused by "{" or "}"
    # characters that appear inside string values.
    result, _ = json.JSONDecoder().raw_decode(cleaned)

    # Basic validation
    if not isinstance(result, dict) or not isinstance(result.get("inclusion"), dict):
        raise ValueError("Invalid top-level structure")

    # Final safety: ensure performance_status exists and its scale is ECOG.
    # A missing/null block is created in place so the default is actually
    # stored in the returned result.
    inclusion = result["inclusion"]
    ps = inclusion.get("performance_status")
    if ps is None:
        ps = {"scale": None, "min": None, "max": None}
        inclusion["performance_status"] = ps
    elif not isinstance(ps, dict):
        raise ValueError("performance_status is not an object")
    if ps.get("scale") in [None, ""]:
        ps["scale"] = "ECOG"

    return result


def extract_eligibility(
    text: str,
    client,
    model: str = "meta-llama/llama-4-maverick-17b-128e-instruct",
    max_retries: int = 3
) -> Dict[str, Any]:
    """
    Parse unstructured clinical trial eligibility text into a structured
    dictionary using an LLM with a strict schema.

    The system prompt (SYSTEM_PROMPT) and the eligibility text are sent to
    the chat-completions endpoint of `client`. The reply is stripped of any
    markdown fence, decoded as JSON and minimally validated. Any failure
    (API error or invalid reply) is printed and retried with exponential
    backoff (1s, 2s, 4s, ...).

    Args:
        text (str): Raw eligibility criteria text from a clinical trial.
        client: LLM API client exposing ``chat.completions.create`` (e.g., Groq).
        model (str, optional): LLM model identifier.
        max_retries (int, optional): Maximum number of attempts. Defaults to 3.

    Returns:
        Dict[str, Any]: The decoded JSON object. It always contains an
        "inclusion" dict whose "performance_status" dict has a non-empty
        "scale" ("ECOG" is filled in when the model left it blank).

    Raises:
        ValueError: If the input text is empty or whitespace only.
        RuntimeError: If every attempt fails (the last error is chained as
            the cause), or if max_retries is less than 1.
    """
    if not text or not text.strip():
        raise ValueError("Empty eligibility text")

    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": text}
                ],
                temperature=0.0,  # deterministic output for reproducible JSON
                top_p=1.0,  # use full distribution for completeness
                max_tokens=3000,  # upper bound on the length of the JSON reply
                timeout=90
            )

            # The message content may be None; treat that as an empty reply
            content = response.choices[0].message.content
            return _parse_eligibility_json(content or "")

        except Exception as e:
            # Broad on purpose: API errors and malformed replies are both retried
            print(f"[Attempt {attempt + 1}] Failed: {e}")
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)
                continue
            raise RuntimeError(f"Eligibility parsing failed after {max_retries} attempts") from e

    # Only reachable when the loop body never ran
    raise RuntimeError("extract_eligibility made no attempts (max_retries must be >= 1)")

In [None]:
# Register progress_apply on pandas objects
tqdm.pandas()


def _extract_for_row(eligibility_text):
    """Run the LLM extraction for one eligibility string using the shared client."""
    return extract_eligibility(eligibility_text, client=client)


# One LLM call per row; the parsed dictionaries are stored in a new column
df["eligibility_json"] = df["Eligibility"].progress_apply(_extract_for_row)

100%|██████████| 4/4 [00:21<00:00,  5.29s/it]


In [None]:
# Show full cell contents so long eligibility text and JSON are not truncated.
pd.set_option('display.max_colwidth', None)

In [None]:
# Show the extracted JSON next to its source eligibility text for the first rows.
df[["eligibility_json","Eligibility"] ].head()

Unnamed: 0,eligibility_json,Eligibility
0,"{'trial_id': None, 'trial_category': 'therapeutic_interventional', 'age': {'min': 18, 'max': None}, 'region_specific_age': {'japan_min': None}, 'inclusion': {'disease': {'confirmed_by': 'histology', 'cancer_type': 'non-small cell lung cancer', 'histology_subtype': '', 'stage': 'T1 to T4, N0-N3, M0-M1', 'stage_list': [], 'metastatic': None, 'measurable_disease_recist': None, 'biomarker_required': []}, 'performance_status': {'scale': 'ECOG', 'min': None, 'max': 3}, 'life_expectancy_weeks': None, 'prior_therapy': {'required': [], 'allowed': [], 'disallowed': [], 'max_lines_systemic': None, 'washout_weeks': {'chemotherapy': 4, 'targeted_therapy': None, 'immunotherapy': None, 'investigational': None, 'radiation': 4, 'major_surgery': None}}, 'brain_metastases': 'excluded', 'brain_mets_stable_duration_weeks': None, 'organ_function': {'anc': None, 'platelets': 100000, 'hemoglobin_g_per_dl': None, 'creatinine_clearance_ml_min': None, 'bilirubin_x_uln': None, 'ast_alt_x_uln': None, 'albumin_g_per_dl': None}, 'cardiac': {'qtcf_ms_max': None, 'recent_mi_months_exclusion': None, 'nyha_class_max': None, 'lvef_percent_min': None}, 'contraception_required': True, 'other_inclusions': ['Able to tolerate repeated bronchial endoscopy', 'Written consent given']}, 'exclusion': {'pregnant_or_breastfeeding': True, 'active_cns_metastases': True, 'uncontrolled_intercurrent_illness': True, 'grade_2_or_higher_neuropathy': None, 'history_of': [], 'concurrent_medications_disallowed': [], 'other_exclusions': ['Tracheal lesions or lesions affecting carina tracheae', 'Painful bone metastases', 'Previous pneumonectomy', 'Risk of large vessel erosion or perforation', 'Allergy to photosensitizer', 'Leukopenia (WBC<2000)', 'Thrombocytopenia (<100000)', 'PT >1.5 normal', 'Fibrinogen <2g/l', 'PTT >1.5 ULN', 'Renal insufficiency', 'Hepatic insufficiency', 'Previous 70 Gy radiation on lesion', 'Existing tracheoesophageal or bronchoesophageal fistula', 'Severe acute respiratory distress']}}","Inclusion 
Criteria: * Histologically proven lung cancer * Inoperable * Non-Small Cell Cancer * Partial or total bronchial obstruction responsible for functional signs * T1 to T4, N0-N3, M0-M1 * Patients with functional signs: hemoptysis, infection, cough and, above all, dyspnea * Contralateral metastases not representing a contraindication insofar as they do not represent a risk of impairment of respiratory function during treatment * The Karnofsky index should be greater than or equal to 40 * Patients should agree to and tolerate repeated bronchial endoscopy (a disadvantage of all endoscopic treatments) * Male or female patients aged over 18 years, female patients should not be pregnant (menopause or contraception) * Patients should have given their written consent to take part in the study Exclusion Criteria: * Tracheal lesions and lesions affecting the carina tracheae * Patients with painful bone metastases (not an absolute criterion since the extent of dyspnea is the decisive element) * Patients with brain metastases * Patients having undergone pneumonectomy * Patients undergoing chemotherapy or radiotherapy or having undergone chemotherapy less than 4 weeks before the procedure or radiotherapy less than 4 weeks before the procedure * Patients with risk of large vessel erosion or perforation resulting from lesion topography * In case of allergy to the photosensitizer * Leukopenia (WBC\<2000), Thrombocytopenia (\< 100 000), PT \> 1.5 normal, Fibrinogen \< 2g/l, a PTT \> 1.5 ULN (Upper Limit of Normal) * Renal insufficiency * Hepatic insufficiency * Patients having already received 70 Gy on the lesion * Existing tracheoesophageal or bronchoesophageal fistula * Emergency treatment of patients with severe acute respiratory distress caused by an obstructing endobronchial lesion"
1,"{'trial_id': None, 'trial_category': 'therapeutic_interventional', 'age': {'min': 18, 'max': 75}, 'region_specific_age': {'japan_min': None}, 'inclusion': {'disease': {'confirmed_by': 'pathology', 'cancer_type': 'lung cancer', 'histology_subtype': '', 'stage': '', 'stage_list': [], 'metastatic': None, 'measurable_disease_recist': True, 'biomarker_required': []}, 'performance_status': {'scale': 'ECOG', 'min': None, 'max': 2}, 'life_expectancy_weeks': 24, 'prior_therapy': {'required': [], 'allowed': [], 'disallowed': [], 'max_lines_systemic': None, 'washout_weeks': {'chemotherapy': None, 'targeted_therapy': None, 'immunotherapy': None, 'investigational': None, 'radiation': 2, 'major_surgery': None}}, 'brain_metastases': None, 'brain_mets_stable_duration_weeks': None, 'organ_function': {'anc': 1.5, 'platelets': 100, 'hemoglobin_g_per_dl': 9, 'creatinine_clearance_ml_min': None, 'bilirubin_x_uln': 1.5, 'ast_alt_x_uln': 2.5, 'albumin_g_per_dl': None}, 'cardiac': {'qtcf_ms_max': None, 'recent_mi_months_exclusion': None, 'nyha_class_max': None, 'lvef_percent_min': None}, 'contraception_required': None, 'other_inclusions': ['FEV1 >1L and >50% normal value', 'AST/ALT ≤5×ULN allowed if liver metastases']}, 'exclusion': {'pregnant_or_breastfeeding': True, 'active_cns_metastases': None, 'uncontrolled_intercurrent_illness': True, 'grade_2_or_higher_neuropathy': None, 'history_of': ['serious psychological/psychiatric disorders', 'drug addiction', 'alcohol dependence'], 'concurrent_medications_disallowed': [], 'other_exclusions': ['Previous radiotherapy to lung/mediastinum within 2 weeks', 'Previous treatment with Compound Kushen Injection within 2 weeks', 'Participating in other clinical trials within last 30 days', 'Hypersensitivity to trial regimen']}}","Inclusion Criteria: * Before the start of the study, All patients have been fully understood the reseach and the must sign the informed consent * To be aged from 18 to 75 years old, both gender * The lung cancer diagnosis 
must be proved by pathology * According to RECIST (version 1.1), At least 1 objectively measurable Tumor lesion (iconography: CT, MRI), the assessable lesion can be measured accurately, maximum diameter more than at least 10mm (Malignant lymph nodes on CT scans short diameter less than at least 15 mm) * Eastern Cooperative Oncology Group (ECOG) performance status less than or equal to 2 * Lung function FEV1 more than at least 1Land more than 50% A normal value * The function of each organ is basically normal :ANC more than 1.5\*10\^9/L, Platelet count more than 100\*10\^9/L, Hb more than 9.0g/dl, BIL at normal level or less than 1.5\*ULN, AST (SGOT), ALT (SGPT) less than 2.5\*ULN(less than 5\*ULN, if with liver metastases), SCr less than 1.5\*ULN * The expected survival tme must more than 6 months. Exclusion Criteria: * Lung or mediastinal have received radiotherapy before or ever treated with Compound Kushen Injection within 2weeks * Pregnancy or lactation women * Patients with severe, uncontrolled organic lesions or infection, such as decompensated heart, lung, kidney failure can lead to tolerance of chemotherapy - Participating or within the last 30 days participated in other clinical trials * Hypersensitiveness to any kind of trial regime * Had a history of serious Psychological or Psychiatric disorders, Drug addiction or Alcohol dependence * Estimating the compliance of patients to participate in this clinical trial is insufficient."
2,"{'trial_id': None, 'trial_category': 'therapeutic_interventional', 'age': {'min': 18, 'max': None}, 'region_specific_age': {'japan_min': None}, 'inclusion': {'disease': {'confirmed_by': 'histology or cytology', 'cancer_type': 'solid tumors', 'histology_subtype': '', 'stage': 'advanced or metastatic', 'stage_list': [], 'metastatic': True, 'measurable_disease_recist': True, 'biomarker_required': []}, 'performance_status': {'scale': 'ECOG', 'min': 0, 'max': 1}, 'life_expectancy_weeks': None, 'prior_therapy': {'required': [], 'allowed': [], 'disallowed': [], 'max_lines_systemic': None, 'washout_weeks': {'chemotherapy': None, 'targeted_therapy': None, 'immunotherapy': None, 'investigational': None, 'radiation': 2, 'major_surgery': None}}, 'brain_metastases': 'excluded', 'brain_mets_stable_duration_weeks': None, 'organ_function': {'anc': None, 'platelets': None, 'hemoglobin_g_per_dl': None, 'creatinine_clearance_ml_min': None, 'bilirubin_x_uln': None, 'ast_alt_x_uln': None, 'albumin_g_per_dl': None}, 'cardiac': {'qtcf_ms_max': None, 'recent_mi_months_exclusion': None, 'nyha_class_max': None, 'lvef_percent_min': None}, 'contraception_required': None, 'other_inclusions': ['Toxic effects of previous therapy recovered to ≤ Grade 1']}, 'exclusion': {'pregnant_or_breastfeeding': True, 'active_cns_metastases': True, 'uncontrolled_intercurrent_illness': True, 'grade_2_or_higher_neuropathy': None, 'history_of': [], 'concurrent_medications_disallowed': ['Live vaccine within 30 days of study therapy start'], 'other_exclusions': ['Active infection requiring systemic therapy', 'Active or inactive autoimmune disease or syndrome']}}",Inclusion Criteria: * Histologically or cytologically confirmed diagnosis of selected advanced or metastatic solid tumors. * Presence of measurable disease per RECIST v1.1. * Eastern Cooperative Oncology Group (ECOG) performance status of 0 or 1. Exclusion Criteria: * Laboratory and medical history parameters not within the Protocol-defined range. 
* Receipt of anticancer medications or investigational drugs within the Protocol-defined intervals before the first administration of study drug. * Previous radiotherapy within 2 weeks of starting study therapy. * Known active central nervous system (CNS) metastases and/or carcinomatous meningitis. * Has not recovered to ≤ Grade 1 from toxic effects of previous therapy and/or complications from previous surgical intervention before starting study therapy. * Receipt of a live vaccine within 30 days of planned start of study therapy. * Active infection requiring systemic therapy. * Subjects who have any active or inactive autoimmune disease or syndrome. * Women who are pregnant or breastfeeding.
3,"{'trial_id': None, 'trial_category': 'therapeutic_interventional', 'age': {'min': 18, 'max': None}, 'region_specific_age': {'japan_min': None}, 'inclusion': {'disease': {'confirmed_by': 'histology or cytology', 'cancer_type': 'non-small cell lung cancer', 'histology_subtype': '', 'stage': 'IV', 'stage_list': ['IV'], 'metastatic': True, 'measurable_disease_recist': True, 'biomarker_required': ['PD-L1 expression ≥1%']}, 'performance_status': {'scale': 'ECOG', 'min': 0, 'max': 1}, 'life_expectancy_weeks': 12, 'prior_therapy': {'required': ['PD-1/PD-L1 inhibitor and platinum-based chemotherapy'], 'allowed': [], 'disallowed': ['docetaxel for NSCLC', '4-1BB targeted agent', 'antitumor vaccine', 'autologous cell immunotherapy', 'unapproved immunotherapy'], 'max_lines_systemic': 2, 'washout_weeks': {'chemotherapy': 4, 'targeted_therapy': 4, 'immunotherapy': 4, 'investigational': 4, 'radiation': None, 'major_surgery': None}}, 'brain_metastases': 'excluded', 'brain_mets_stable_duration_weeks': None, 'organ_function': {'anc': None, 'platelets': None, 'hemoglobin_g_per_dl': None, 'creatinine_clearance_ml_min': None, 'bilirubin_x_uln': None, 'ast_alt_x_uln': None, 'albumin_g_per_dl': None}, 'cardiac': {'qtcf_ms_max': None, 'recent_mi_months_exclusion': None, 'nyha_class_max': None, 'lvef_percent_min': None}, 'contraception_required': None, 'other_inclusions': ['Tumor sample from metastatic setting required for PD-L1 testing', 'Adequate organ and bone marrow function']}, 'exclusion': {'pregnant_or_breastfeeding': None, 'active_cns_metastases': True, 'uncontrolled_intercurrent_illness': None, 'grade_2_or_higher_neuropathy': None, 'history_of': ['carcinomatous meningitis'], 'concurrent_medications_disallowed': [], 'other_exclusions': ['Known targetable EGFR mutations', 'Known ALK rearrangement', 'Known RET rearrangement', 'Known ROS1 rearrangement', 'Known MET exon 14 skipping mutations/MET amplification', 'Known KRAS/BRAF mutations with access to approved targeted 
therapies']}}","Key Inclusion Criteria: * Participant has histologically or cytologically confirmed metastatic NSCLC (stage IV with known subtype). * Participant has progressed radiographically on or after receiving: * One prior line of therapy (PD-1/PD-L1 inhibitor and platinum-based chemotherapy concomitantly) in the metastatic disease setting; OR * No more than 2 prior lines of therapy (PD-1/PD-L1 inhibitor and platinum-based chemotherapy sequentially, irrespective of the order) in the metastatic disease setting. * Participant must have positive tumor PD-L1 expression (tumor cells ≥1%) determined prospectively on a tumor sample from the metastatic setting at a sponsor-designated central laboratory. * Participant has measurable disease according to RECIST v1.1 as assessed by the investigator at baseline. * Participant has an Eastern Cooperative Oncology Group (ECOG) performance status 0 or 1 within 7 days of Cycle 1 Day 1. * Participant has a life expectancy of ≥3 months. * Participant must have adequate organ and bone marrow function, per laboratory test results within 7 days of trial treatment. Key Exclusion Criteria: * Documentation of known targetable epidermal growth factor receptor (EGFR) sensitizing mutations, anaplastic lymphoma kinase (ALK), RET proto-oncogene (RET), ROS proto-oncogene 1; receptor tyrosine kinase (ROS1) rearrangement, Kirsten rat sarcoma virus (KRAS), B-Raf proto-oncogene (BRAF) mutations, and MET proto-oncogene; receptor tyrosine kinase (MET) exon 14 skipping mutations/MET amplification. NOTE: MET amplification testing is optional based on local availability of the test. * Participants with known KRAS/BRAF mutations are eligible for the trial if they do not have access to approved targeted therapies. * Participants with newly identified or known unstable or symptomatic central nervous system (CNS) metastases or history of carcinomatous meningitis. * Prior treatment with docetaxel for NSCLC. 
* Prior treatment with a 4-1BB (CD137) targeted agent, any type of antitumor vaccine, autologous cell immunotherapy, or any unapproved immunotherapy. * Treatment with an anticancer agent within 28 days prior to the first dose of trial treatment. Note: Other protocol-defined inclusion and exclusion criteria may apply."


In [None]:
# ============= Save CSV ============
# Persist the parsed-trials DataFrame (`df`, built in an earlier cell) to disk.
from pathlib import Path

csv_name = "./data/trials_parsed.csv"

# pandas does not create missing parent directories; without this the save
# fails with OSError on a fresh checkout/runtime where ./data/ is absent.
Path(csv_name).parent.mkdir(parents=True, exist_ok=True)

# index=False: the positional row index is not written as a column.
df.to_csv(csv_name, index=False)
print("Saved:", csv_name)