In [None]:
# Install the Groq SDK with the %pip magic so it targets the running kernel's
# environment (a plain `!pip` can install into a different interpreter).
%pip install groq

In [None]:
import pandas as pd
from tqdm import tqdm

import re
import requests
import json
import time
import copy
from typing import Dict, Any

In [None]:

# ============= LUNG CANCER-ONLY FETCHER =============
BASE_URL = "https://clinicaltrials.gov/api/v2/studies"

def fetch_lung_cancer_trials(
    page_size=10,
    max_total=50,
    status_filter=None         # e.g., "RECRUITING", "ACTIVE_NOT_RECRUITING"
):
    """Download study records from BASE_URL, following page tokens.

    Args:
        page_size: Value sent as the ``pageSize`` query parameter.
        max_total: Stop paging once at least this many studies were collected
            and truncate the result to this length. A falsy value (``None``
            or ``0``) disables the cap.
        status_filter: Optional value sent as ``filter.status``.

    Returns:
        A list of study dicts taken from the ``studies`` key of each response.

    Raises:
        Whatever ``requests.get`` / ``Response.raise_for_status`` raise on a
        failed request. The progress bar is closed in that case as well.
    """
    params = {
        "format": "json",
        "pageSize": page_size,
        # MeSH filter for lung neoplasms
        "filter.advanced": "AREA[ConditionMeshTerm]Neoplasms/Lung",
    }
    if status_filter:
        params["filter.status"] = status_filter

    all_studies = []
    next_token = None

    # The context manager closes the bar even if a request raises part-way.
    with tqdm(desc="Fetching lung cancer trials", unit="trial") as pbar:
        while True:
            if next_token:
                params["pageToken"] = next_token

            resp = requests.get(BASE_URL, params=params, timeout=60)
            resp.raise_for_status()
            data = resp.json()

            batch = data.get("studies") or []
            all_studies.extend(batch)
            pbar.update(len(batch))

            next_token = data.get("nextPageToken")
            reached_cap = bool(max_total) and len(all_studies) >= max_total
            # An empty page that still carries a token would otherwise loop forever.
            if not next_token or not batch or reached_cap:
                break

            time.sleep(0.1)  # being polite to the API

    studies = all_studies[:max_total] if max_total else all_studies
    print(f"\nFetched {len(studies)} lung cancer trials")
    return studies

# Usage: collect at most 100 studies, requesting 100 per page.
# To restrict by recruitment status, additionally pass e.g.
#     status_filter="RECRUITING|ACTIVE_NOT_RECRUITING"
studies = fetch_lung_cancer_trials(page_size=100, max_total=100)

# Optional: persist the raw payload to disk.
# with open("lung_cancer_trials_2025.json", "w", encoding="utf-8") as f:
#     json.dump(studies, f, indent=2)

print("Done!")


In [None]:
# ============= Flatten helper =============
def safe_get(d, path, default=""):
    """Follow ``path`` (a sequence of keys) through nested dicts.

    Returns ``default`` as soon as a non-dict is reached while keys remain,
    when a key is missing, or when the value finally reached is ``None``.
    """
    current = d
    for key in path:
        if isinstance(current, dict):
            current = current.get(key, default)
        else:
            return default
    if current is None:
        return default
    return current

def _joined_field(ps, path, sep=", "):
    """Look up ``path`` once; join list values with ``sep``, pass other values through."""
    value = safe_get(ps, path, [])
    return sep.join(value) if isinstance(value, list) else value


def flatten_study(study):
    """Flatten one study dict into a single-level row dict.

    Reads everything from ``study["protocolSection"]`` via ``safe_get``, so
    missing modules yield empty strings rather than raising.
    """
    ps = study.get("protocolSection", {})
    row = {
        "NCT_ID": safe_get(ps, ["identificationModule", "nctId"]),
        "Title": safe_get(ps, ["identificationModule", "briefTitle"]),
        "Official_Title": safe_get(ps, ["identificationModule", "officialTitle"]),
        "Status": safe_get(ps, ["statusModule", "overallStatus"]),
        "Start_Date": safe_get(ps, ["statusModule", "startDateStruct", "date"]),
        "Primary_Completion": safe_get(ps, ["statusModule", "primaryCompletionDateStruct", "date"]),
        "Study_Type": safe_get(ps, ["designModule", "studyType"]),
        "Phases": _joined_field(ps, ["designModule", "phases"]),
        "Enrollment": safe_get(ps, ["designModule", "enrollmentInfo", "count"]),
        "Conditions": _joined_field(ps, ["conditionsModule", "conditions"]),
        "Sex": safe_get(ps, ["eligibilityModule", "sex"]),
        "Min_Age": safe_get(ps, ["eligibilityModule", "minimumAge"]),
        "Max_Age": safe_get(ps, ["eligibilityModule", "maximumAge"]),
        "Lead_Sponsor": safe_get(ps, ["sponsorCollaboratorsModule", "leadSponsor", "name"]),
        "Brief_Summary": safe_get(ps, ["descriptionModule", "briefSummary"]),
        "Eligibility": safe_get(ps, ["eligibilityModule", "eligibilityCriteria"]),
    }

    # outcomes & interventions
    # `or ""` keeps the join from failing when a measure is present but None.
    prim = safe_get(ps, ["outcomesModule", "primaryOutcomes"], [])
    row["Primary_Measures"] = " | ".join((p.get("measure") or "") for p in prim) if prim else ""
    sec = safe_get(ps, ["outcomesModule", "secondaryOutcomes"], [])
    row["Secondary_Measures"] = " | ".join((p.get("measure") or "") for p in sec) if sec else ""
    ints = safe_get(ps, ["armsInterventionsModule", "interventions"], [])
    row["Interventions"] = " | ".join(f"{i.get('type','')}:{i.get('name','')}" for i in ints) if ints else ""
    return row


# One row per fetched study.
records = [flatten_study(s) for s in studies]

df = pd.DataFrame(records)
print("Raw df shape:", df.shape)

In [None]:
# ============= Clean text ============
def clean_text(t):
    """Normalize free text to a single line.

    Non-string input yields ``""``. Removes "(Version ...)" tags and bullet
    characters, then collapses every whitespace run to one space and trims.
    Whitespace is collapsed last so the removals cannot leave double spaces.
    """
    if not isinstance(t, str):
        return ""
    t = re.sub(r"\(Version [^\)]*\)", "", t)
    t = t.replace("•", " ")
    # \s+ also covers the \n, \t and \r characters.
    t = re.sub(r"\s+", " ", t)
    return t.strip()

for col in ["Title", "Brief_Summary", "Official_Title", "Eligibility"]:
    if col in df.columns:
        df[col] = df[col].apply(clean_text)

# Character length of the cleaned eligibility text.
df["elig_len"] = df["Eligibility"].apply(lambda x: len(x or ""))
# optional: drop trials that have no eligibility text or extremely short eligibility
# df = df[df["elig_len"] > 50].copy()
# The filter above is disabled, so report the shape without claiming rows were dropped.
print("After cleaning text (short-eligibility filter disabled):", df.shape)

In [None]:
# Preview a few rows instead of rendering the whole frame.
df.head()

In [None]:
# Keywords this short are matched as whole tokens. As raw substrings they hit
# inside ordinary words ("os" in "dose", "rfa" in "surface", "orr" in
# "hemorrhage"), which made the checks below fire on almost any text.
_SHORT_KEYWORD_MAX_LEN = 3


def _compile_keyword_matcher(keywords, patterns=()):
    """Build one regex that matches any keyword (literal) or pattern (regex).

    Keywords are stripped and escaped. Those of at most
    ``_SHORT_KEYWORD_MAX_LEN`` characters must stand alone as a token
    (optionally pluralized with "s"); longer ones keep substring semantics.
    """
    alternatives = list(patterns)
    for kw in keywords:
        token = kw.strip()
        escaped = re.escape(token)
        if len(token) <= _SHORT_KEYWORD_MAX_LEN:
            escaped = r"(?<![a-z0-9])" + escaped + r"s?(?![a-z])"
        alternatives.append(escaped)
    return re.compile("|".join(alternatives))


# Row fields concatenated into the text that is searched.
_TRIAL_TEXT_COLUMNS = (
    'Title', 'Official_Title', 'Brief_Summary',
    'Eligibility', 'Conditions', 'Interventions',
)

# 1. HARD EXCLUDE — non-drug / procedural / local therapy
_JUNK_KEYWORDS = [
    # Locoregional / ablation / device
    "hifu", "radiofrequency ablation", "rfa", "cryoablation", "microwave ablation",
    "tace", "deb-tace", "y90", "radioembolization", "sir-spheres", "therasphere",
    "hepatic arterial", "haic", "chemoembolization", "embolization",
    "sbrt", "cyberknife", "gamma knife", "stereotactic body", "stereotactic radiosurgery",
    "brachytherapy", "hdr ", "intrathecal", "lumbar puncture", "pleurodesis",
    "thoracentesis", "paracentesis", "pleural catheter", "chest tube", "pleurx",

    # Surgical / diagnostic procedures
    "surgery", "resection", "lobectomy", "wedge resection", "thoracoscopic", "vats", "ivats",
    "biopsy only", "diagnostic biopsy", "tissue collection",

    # Prevention / screening / supportive care
    "prevention", "chemoprevention", "smoking cessation", "aspirin", "statin",
    "screening", "early detection", "surveillance",

    # Non-cancer or minimal anticancer intent
    "supportive care", "palliative care only", "best supportive care", "placebo",
    "radiation only", "radiotherapy alone", "radiation therapy as single modality",

    # Others
    "stage iiia", "stage iii", "inoperable locally advanced", "concurrent chemoradiation", "definitive chemoradiation",
    "no study drug", "thoracic radiation",
]

# These two entries were written as regexes ("a.*b") but were tested as literal
# substrings, so they could never match. They are now real patterns; the gap is
# bounded so both halves must occur close together rather than anywhere in the
# concatenated text.
# NOTE(review): the 40-character window is a judgement call — confirm it matches the intended scope.
_JUNK_PATTERNS = [
    r"induction chemotherapy.{0,40}allowed",
    r"radiation therapy.{0,40}primary",
]

# 2. Must have at least ONE real anticancer drug keyword
_DRUG_KEYWORDS = [
    "chemotherapy", "targeted therapy", "immunotherapy", "pd-1", "pd-l1", "ctla-4",
    "parp inhibitor", "tkis", "egfr", "alk ", "braf", "mek", "mtor", "pi3k",
    "cdk4/6", "bcl-2", "antibody-drug conjugate", "adc", "bispecific",
    "car-t", "tcr-t", "til therapy", "vaccine",  # (vaccine often therapeutic in oncology)
    "olaparib", "osimertinib", "pembrolizumab", "nivolumab", "atezolizumab",
    "bevacizumab", "trastuzumab", "cetuximab", "ramucirumab", "everolimus",
    "lenvatinib", "sunitinib", "pazopanib", "cabozantinib", "regorafenib",
]

# 3. Final safety net: must contain at least one of these strong signals
_STRONG_SIGNALS = [
    "phase 1", "phase i", "phase 2", "phase ii", "phase 3", "phase iii",
    "dose escalation", "maximum tolerated dose", "recommended phase 2 dose",
    "progression-free survival", "overall survival", "objective response rate",
    "recist", "irrc", "pfs", "os", "orr", "dcr",
]

# Compiled once instead of rebuilding the keyword lists for every row.
_JUNK_RE = _compile_keyword_matcher(_JUNK_KEYWORDS, _JUNK_PATTERNS)
_DRUG_RE = _compile_keyword_matcher(_DRUG_KEYWORDS)
_SIGNAL_RE = _compile_keyword_matcher(_STRONG_SIGNALS)


def is_systemic_anticancer_trial(row):
    """Keyword screen for systemic anticancer drug trials.

    Args:
        row: Any object with a dict-style ``.get(key, default)`` (a pandas row
            or a plain dict). The fields in ``_TRIAL_TEXT_COLUMNS`` are
            stringified, joined with spaces and lower-cased.

    Returns:
        ``True`` only when the text contains no exclusion keyword, at least
        one drug keyword and at least one strong trial signal.
    """
    text = " ".join(str(row.get(col, '')) for col in _TRIAL_TEXT_COLUMNS).lower()

    if _JUNK_RE.search(text):
        return False
    if not _DRUG_RE.search(text):
        return False
    return _SIGNAL_RE.search(text) is not None


# Apply it: keep interventional studies that also pass the keyword screen
interventional_mask = df['Study_Type'] == 'INTERVENTIONAL'
systemic_mask = df.apply(is_systemic_anticancer_trial, axis=1)
df = df[interventional_mask & systemic_mask].copy()

In [None]:

# Step 1: keep rows whose eligibility length lies in [300, 3500]
length_ok = (df['elig_len'] <= 3500) & (df['elig_len'] >= 300)
df_filtered = df[length_ok].copy()

# Step 2: cap the set at 100 rows with a seeded (reproducible) random sample
if len(df_filtered) <= 100:
    print(f"After filtering, only {len(df_filtered)} rows remain (≤100), keeping all.")
else:
    df_filtered = df_filtered.sample(n=100, random_state=2)   # drop random_state for a fresh sample each run

# Renumber rows from 0
df = df_filtered.reset_index(drop=True)

In [None]:
from groq import Groq
from getpass import getpass


# Read the Groq API key through a getpass prompt rather than hard-coding it in the notebook.
GROQ_API_KEY = getpass("Get your free key at https://console.groq.com/keys → ")

# Groq client built from that key.
client = Groq(api_key=GROQ_API_KEY)

In [None]:
# Independent copy of the filtered frame, so changes made to df_1 do not alter df.
df_1 = df.copy()

In [None]:
# ─────────────────────────────────────────────────────────────
# SYSTEM PROMPT FOR ELIGIBILITY EXTRACTION
# Text of the instruction given to the model: it fixes the JSON schema to be
# returned and the normalization conventions to apply.
# NOTE(review): rule 6 maps an absolute bilirubin value (mg/dL) onto the
# "bilirubin_x_uln" field — confirm this unit mixing is intended.
# ─────────────────────────────────────────────────────────────
SYSTEM_PROMPT = """
You are a world-class, high-precision clinical trial eligibility parser specialized in oncology interventional therapeutic trials. Your only job is to output perfect, strictly valid JSON using the exact schema below. Never add, remove, or rename any field. Never output explanations, markdown, or extra text.

### MANDATORY SCHEMA (use EXACTLY this structure)
{
  "trial_id": null,
  "trial_category": "therapeutic_interventional",
  "age": {"min": 18, "max": null},
  "region_specific_age": {"japan_min": null},
  "inclusion": {
    "disease": {
      "confirmed_by": null,
      "cancer_type": "",
      "histology_subtype": "",
      "stage": "",
      "stage_list": [],
      "metastatic": null,
      "measurable_disease_recist": null,
      "biomarker_required": []
    },
    "performance_status": {
      "scale": null,
      "min": null,
      "max": null
    },
    "life_expectancy_weeks": null,
    "prior_therapy": {
      "required": [],
      "allowed": [],
      "disallowed": [],
      "max_lines_systemic": null,
      "washout_weeks": {
        "chemotherapy": null,
        "targeted_therapy": null,
        "immunotherapy": null,
        "investigational": null,
        "radiation": null,
        "major_surgery": null
      }
    },
    "brain_metastases": null,
    "brain_mets_stable_duration_weeks": null,
    "organ_function": {
      "anc": null,
      "platelets": null,
      "hemoglobin_g_per_dl": null,
      "creatinine_clearance_ml_min": null,
      "bilirubin_x_uln": null,
      "ast_alt_x_uln": null,
      "albumin_g_per_dl": null
    },
    "cardiac": {
      "qtcf_ms_max": null,
      "recent_mi_months_exclusion": null,
      "nyha_class_max": null,
      "lvef_percent_min": null
    },
    "contraception_required": null,
    "other_inclusions": []
  },
  "exclusion": {
    "pregnant_or_breastfeeding": null,
    "active_cns_metastases": null,
    "uncontrolled_intercurrent_illness": null,
    "grade_2_or_higher_neuropathy": null,
    "history_of": [],
    "concurrent_medications_disallowed": [],
    "other_exclusions": []
  }
}

### STRICT RULES & CONVENTIONS (follow exactly)

1. Age
   - Default "min": 18 unless explicitly different
   - Japan-specific pediatric trials → fill japan_min only

2. Performance Status
   - Convert everything to ECOG 0–5 scale
   - Karnofsky 70–100 → ECOG 0–1 ("min": 0, "max": 1)
   - Karnofsky ≥70 → "max": 1
   - ECOG ≤1 → "max": 1
   - Always fill "scale": "ECOG"

3. Brain Metastases – ONLY use these 5 values:
   null | "excluded" | "allowed_if_asymptomatic" | "allowed_if_stable" | "allowed_if_treated_and_stable"
   - If stable duration specified → fill brain_mets_stable_duration_weeks
   - "Symptomatic", "uncontrolled", "requiring steroids" → "excluded"

4. Biomarkers
   - Use exact wording from trial: "ALK positive by FDA-approved test", "EGFR exon 19 del or L858R", "PD-L1 TPS ≥50%", "BRCA1/2 mutated", "MSI-H/dMMR"

5. Prior Therapy
   - "max_lines_systemic" = total systemic lines (not including adjuvant)
   - "required" = must have received
   - "disallowed" = must NOT have received

6. Lab Values – ALWAYS normalize
   - Bilirubin ≤1.5 mg/dL → "bilirubin_x_uln": 1.5
   - AST/ALT ≤3×ULN (≤5× if liver mets) → "ast_alt_x_uln": 3 + note exception in other_inclusions
   - Hemoglobin ≥9 g/dL → "hemoglobin_g_per_dl": 9

7. Life Expectancy
   - ≥3 months = 12, ≥6 months = 24, ≥12 months = 52

8. Contraception
   - Any mention of highly effective contraception → "contraception_required": true

9. cancer_type & histology_subtype
   - cancer_type: "non-small cell lung cancer", "hepatocellular carcinoma", "urothelial carcinoma"
   - histology_subtype: "adenocarcinoma", "squamous", "small cell", "clear cell RCC"

10. NEVER leave critical fields null if information exists
    - If ECOG mentioned → fill performance_status
    - If any labs mentioned → fill organ_function
    - If any prior therapy rules → fill prior_therapy

11. other_inclusions / other_exclusions
    - Short, precise bullets only
    - Examples:
      "Archival tumor tissue required"
      "Liver metastases: AST/ALT ≤5×ULN allowed"
      "Able to swallow tablets"
      "No active autoimmune disease requiring systemic treatment in past 2 years"

Only output raw JSON. No ```json wrapper, no extra characters, no thinking step.

Now parse the following trial eligibility text:
"""

# ─────────────────────────────────────────────────────────────
# ELIGIBILITY EXTRACTION
# ─────────────────────────────────────────────────────────────
def _parse_first_json_object(raw: str) -> Any:
    """Strip optional ``` fences and decode the first JSON value in ``raw``.

    Uses ``json.JSONDecoder.raw_decode`` so braces inside string values are
    handled correctly and any text after the first object is ignored.

    Raises:
        ValueError: if the cleaned text does not start with "{" or is not
            valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
    """
    cleaned = raw.strip()
    # Remove any accidental wrappers
    cleaned = re.sub(r"^```json\s*", "", cleaned, flags=re.IGNORECASE)
    cleaned = re.sub(r"^```\s*", "", cleaned)
    cleaned = re.sub(r"```$", "", cleaned)
    cleaned = cleaned.strip()

    if not cleaned.startswith("{"):
        raise ValueError("Output does not start with {")

    parsed, _end = json.JSONDecoder().raw_decode(cleaned)
    return parsed


def extract_eligibility_perfect(
    text: str,
    client,
    model: str = "meta-llama/llama-4-maverick-17b-128e-instruct",
    max_retries: int = 3
) -> Dict[str, Any]:
    """Ask the chat model to turn eligibility text into a structured dict.

    Args:
        text: Eligibility criteria text, sent as the user message.
        client: Object exposing ``chat.completions.create(...)``.
        model: Model identifier passed through to the API call.
        max_retries: Number of attempts; waits ``2 ** attempt`` seconds
            between failed attempts.

    Returns:
        The decoded JSON object. ``result["inclusion"]["performance_status"]``
        is always a dict, and its ``"scale"`` is set to ``"ECOG"`` when the
        model left it empty.

    Raises:
        ValueError: if ``text`` is empty or whitespace-only.
        RuntimeError: if every attempt fails (API error, non-JSON output or
            missing/invalid ``"inclusion"`` section).
    """
    if not text or not text.strip():
        raise ValueError("Empty eligibility text")

    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": text}
                ],
                temperature=0.0,
                top_p=1.0,
                max_tokens=3000,
                timeout=90
            )

            # content may be None; treat that as an empty (invalid) reply.
            content = response.choices[0].message.content
            result = _parse_first_json_object(content or "")

            # Basic validation
            if not isinstance(result, dict) or not isinstance(result.get("inclusion"), dict):
                raise ValueError("Invalid top-level structure")

            # Final safety: ensure performance_status exists and its scale is ECOG.
            # Attach a fresh dict when the field is missing or null so the
            # default is actually stored in the result.
            inclusion = result["inclusion"]
            ps = inclusion.get("performance_status")
            if not isinstance(ps, dict):
                ps = {"scale": None, "min": None, "max": None}
                inclusion["performance_status"] = ps
            if ps.get("scale") in (None, ""):
                ps["scale"] = "ECOG"

            return result

        except Exception as e:
            print(f"[Attempt {attempt + 1}] Failed: {e}")
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)
                continue
            else:
                raise RuntimeError(f"Eligibility parsing failed after {max_retries} attempts") from e

    raise RuntimeError("extract_eligibility_perfect exited unexpectedly")

In [None]:
# Register tqdm's progress_apply on pandas objects.
tqdm.pandas()

# Parse every eligibility text with the LLM, showing a progress bar.
eligibility_texts = df_1["Eligibility"]
df_1["eligibility_json"] = eligibility_texts.progress_apply(
    lambda elig_text: extract_eligibility_perfect(elig_text, client=client)
)

In [None]:
# Parsed output next to its source text, first 10 rows.
preview_columns = ["eligibility_json", "Eligibility"]
df_1[preview_columns].head(10)

In [None]:
# ============= Save CSV ============
csv_name = "mavrik_trials_parsed_latest.csv"

# Serialize the parsed dicts as JSON text on a copy before writing. Written
# directly, they would land in the CSV as Python repr (single quotes, None),
# which json.loads cannot read back.
df_export = df_1.copy()
df_export["eligibility_json"] = df_export["eligibility_json"].apply(
    lambda parsed: json.dumps(parsed, ensure_ascii=False) if isinstance(parsed, (dict, list)) else parsed
)
df_export.to_csv(csv_name, index=False)
print("Saved:", csv_name)