In [None]:
!pip install -q "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install -q --no-deps xformers==0.0.28 trl peft accelerate bitsandbytes
!pip install -q huggingface_hub transformers sentencepiece

In [None]:
from unsloth import FastLanguageModel
import torch
import json

In [3]:
import os
from getpass import getpass

from huggingface_hub import login

# Never keep an access token as a literal in the notebook: take it from the
# HF_TOKEN environment variable, or ask for it with a hidden prompt when unset.
login(token=os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: "))

In [None]:
from huggingface_hub import whoami
# Sanity check for the login cell: print whatever whoami() reports for the current session.
print(whoami())

In [None]:
from unsloth import FastLanguageModel

# Fetch the eligibility model together with its tokenizer.
# dtype=None leaves the dtype choice to Unsloth; weights are loaded in 4-bit.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="ayham0010/LungTrial-Eligible",
    max_seq_length=4096,
    dtype=None,
    load_in_4bit=True,
)
# Switch the loaded model into Unsloth's inference mode (advertised as ~2x faster).
FastLanguageModel.for_inference(model)

In [None]:
!pip install groq

from groq import Groq
from getpass import getpass


GROQ_API_KEY = getpass("Get your free key at https://console.groq.com/keys → ")

client = Groq(api_key=GROQ_API_KEY)

In [7]:
from typing import Dict, Any
import re
import time

# System prompt used by extract_eligibility for every extraction request.
# It pins the exact JSON schema the LLM must emit and the per-field normalization
# rules; the eligibility text itself is sent separately as the user message.
# The literal begins and ends with a newline because of the triple-quote layout.
# NOTE(review): rule 6 maps "Bilirubin ≤1.5 mg/dL" to "bilirubin_x_uln": 1.5, but
# mg/dL is an absolute concentration rather than a multiple of ULN — confirm that
# this example expresses the intended normalization.
SYSTEM_PROMPT = """
You are a world-class, high-precision clinical trial eligibility parser specialized in oncology interventional therapeutic trials. Your only job is to output perfect, strictly valid JSON using the exact schema below. Never add, remove, or rename any field. Never output explanations, markdown, or extra text.

### MANDATORY SCHEMA (use EXACTLY this structure)
{
  "trial_id": null,
  "trial_category": "therapeutic_interventional",
  "age": {"min": 18, "max": null},
  "region_specific_age": {"japan_min": null},
  "inclusion": {
    "disease": {
      "confirmed_by": null,
      "cancer_type": "",
      "histology_subtype": "",
      "stage": "",
      "stage_list": [],
      "metastatic": null,
      "measurable_disease_recist": null,
      "biomarker_required": []
    },
    "performance_status": {
      "scale": null,
      "min": null,
      "max": null
    },
    "life_expectancy_weeks": null,
    "prior_therapy": {
      "required": [],
      "allowed": [],
      "disallowed": [],
      "max_lines_systemic": null,
      "washout_weeks": {
        "chemotherapy": null,
        "targeted_therapy": null,
        "immunotherapy": null,
        "investigational": null,
        "radiation": null,
        "major_surgery": null
      }
    },
    "brain_metastases": null,
    "brain_mets_stable_duration_weeks": null,
    "organ_function": {
      "anc": null,
      "platelets": null,
      "hemoglobin_g_per_dl": null,
      "creatinine_clearance_ml_min": null,
      "bilirubin_x_uln": null,
      "ast_alt_x_uln": null,
      "albumin_g_per_dl": null
    },
    "cardiac": {
      "qtcf_ms_max": null,
      "recent_mi_months_exclusion": null,
      "nyha_class_max": null,
      "lvef_percent_min": null
    },
    "contraception_required": null,
    "other_inclusions": []
  },
  "exclusion": {
    "pregnant_or_breastfeeding": null,
    "active_cns_metastases": null,
    "uncontrolled_intercurrent_illness": null,
    "grade_2_or_higher_neuropathy": null,
    "history_of": [],
    "concurrent_medications_disallowed": [],
    "other_exclusions": []
  }
}

### STRICT RULES & CONVENTIONS (follow exactly)

1. Age
   - Default "min": 18 unless explicitly different
   - Japan-specific pediatric trials → fill japan_min only

2. Performance Status
   - Convert everything to ECOG 0–5 scale
   - Karnofsky 70–100 → ECOG 0–1 ("min": 0, "max": 1)
   - Karnofsky ≥70 → "max": 1
   - ECOG ≤1 → "max": 1
   - Always fill "scale": "ECOG"

3. Brain Metastases – ONLY use these 5 values:
   null | "excluded" | "allowed_if_asymptomatic" | "allowed_if_stable" | "allowed_if_treated_and_stable"
   - If stable duration specified → fill brain_mets_stable_duration_weeks
   - "Symptomatic", "uncontrolled", "requiring steroids" → "excluded"

4. Biomarkers
   - Use exact wording from trial: "ALK positive by FDA-approved test", "EGFR exon 19 del or L858R", "PD-L1 TPS ≥50%", "BRCA1/2 mutated", "MSI-H/dMMR"

5. Prior Therapy
   - "max_lines_systemic" = total systemic lines (not including adjuvant)
   - "required" = must have received
   - "disallowed" = must NOT have received

6. Lab Values – ALWAYS normalize
   - Bilirubin ≤1.5 mg/dL → "bilirubin_x_uln": 1.5
   - AST/ALT ≤3×ULN (≤5× if liver mets) → "ast_alt_x_uln": 3 + note exception in other_inclusions
   - Hemoglobin ≥9 g/dL → "hemoglobin_g_per_dl": 9

7. Life Expectancy
   - ≥3 months = 12, ≥6 months = 24, ≥12 months = 52

8. Contraception
   - Any mention of highly effective contraception → "contraception_required": true

9. cancer_type & histology_subtype
   - cancer_type: "non-small cell lung cancer", "hepatocellular carcinoma", "urothelial carcinoma"
   - histology_subtype: "adenocarcinoma", "squamous", "small cell", "clear cell RCC"

10. NEVER leave critical fields null if information exists
    - If ECOG mentioned → fill performance_status
    - If any labs mentioned → fill organ_function
    - If any prior therapy rules → fill prior_therapy

11. other_inclusions / other_exclusions
    - Short, precise bullets only
    - Examples:
      "Archival tumor tissue required"
      "Liver metastases: AST/ALT ≤5×ULN allowed"
      "Able to swallow tablets"
      "No active autoimmune disease requiring systemic treatment in past 2 years"

Only output raw JSON. No ```json wrapper, no extra characters, no thinking step.

Now parse the following trial eligibility text:
"""






def extract_eligibility(
    text: str,
    client,
    model: str = "meta-llama/llama-4-maverick-17b-128e-instruct",
    max_retries: int = 3
) -> Dict[str, Any]:
    """Turn free-text trial eligibility criteria into the structured JSON schema.

    The text is sent as the user message of a chat completion whose system
    message is ``SYSTEM_PROMPT``. The reply is stripped of Markdown code
    fences, parsed as JSON, and minimally validated. Any failure (API error,
    malformed reply, unexpected structure) is retried with exponential
    back-off.

    Args:
        text: Raw eligibility criteria; must contain non-whitespace characters.
        client: Object exposing ``chat.completions.create(...)`` (the Groq
            client in this notebook).
        model: Name of the chat model to query.
        max_retries: Maximum number of attempts before giving up.

    Returns:
        The parsed criteria as a dict. ``inclusion.performance_status.scale``
        is guaranteed to be set ("ECOG" when the model left it empty).

    Raises:
        ValueError: If ``text`` is empty or whitespace-only.
        RuntimeError: If every attempt failed (chained to the last error), or
            if ``max_retries`` is below 1 so no attempt was made.
    """
    if not text or not text.strip():
        raise ValueError("Empty eligibility text")

    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": text}
                ],
                temperature=0.0,
                top_p=1.0,
                max_tokens=3000,
                timeout=90
            )

            # Treat a missing message body as an empty reply so it fails the
            # "starts with {" check below with a clear message.
            raw = (response.choices[0].message.content or "").strip()

            # Remove any accidental Markdown code-fence wrappers.
            raw = re.sub(r"^```json\s*", "", raw, flags=re.IGNORECASE)
            raw = re.sub(r"^```\s*", "", raw)
            raw = re.sub(r"```$", "", raw)
            raw = raw.strip()

            if not raw.startswith("{"):
                raise ValueError("Output does not start with {")

            # raw_decode parses the first complete JSON value and ignores any
            # trailing text. Unlike counting braces by hand, it is not confused
            # by "{" or "}" characters that appear inside JSON strings.
            result, _ = json.JSONDecoder().raw_decode(raw)

            # Basic validation
            if not isinstance(result, dict) or not isinstance(result.get("inclusion"), dict):
                raise ValueError("Invalid top-level structure")

            # Final safety: ensure performance_status exists and its scale is ECOG.
            inclusion = result["inclusion"]
            ps = inclusion.get("performance_status")
            if ps is None:
                # Missing or null: attach a schema-shaped object to the result
                # so the default below is actually visible to the caller.
                ps = {"scale": None, "min": None, "max": None}
                inclusion["performance_status"] = ps
            elif not isinstance(ps, dict):
                raise ValueError("performance_status is not a JSON object")
            if ps.get("scale") in (None, ""):
                ps["scale"] = "ECOG"

            return result

        except Exception as e:
            print(f"[Attempt {attempt + 1}] Failed: {e}")
            if attempt < max_retries - 1:
                # Exponential back-off: 1s, 2s, 4s, ...
                time.sleep(2 ** attempt)
                continue
            else:
                raise RuntimeError(f"Eligibility parsing failed after {max_retries} attempts") from e

    # Only reachable when the loop body never ran (max_retries < 1).
    raise RuntimeError("extract_eligibility made no attempts (max_retries must be >= 1)")

In [8]:

# =====================================================
# Test function
# =====================================================
def is_eligible(patient_and_trial_dict):
    """Ask the loaded model whether a patient is eligible for a trial.

    The dict is serialized to JSON and embedded in a chat prompt; the reply is
    generated greedily with the notebook-level ``model`` and ``tokenizer``.

    Args:
        patient_and_trial_dict: JSON-serializable dict holding the patient data
            and the trial eligibility criteria.

    Returns:
        The model's reply as a stripped string. The reply is also printed.
    """
    messages = [
        {"role": "system", "content": "You are an expert in assessing eligibility for oncology clinical trials. Decide whether the patient is eligible and explain your reasoning step by step."},
        {"role": "user", "content": "Decide whether the following patient is eligible for the given clinical trial, and provide reasoning for your decision.\n\nPatient data and trial criteria:\n" + json.dumps(patient_and_trial_dict, indent=2)},
    ]
    # Place the prompt on whatever device the model lives on instead of assuming "cuda".
    inputs = tokenizer.apply_chat_template(
        messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)
    # Greedy decoding. temperature/top_p only apply when sampling, so passing
    # them together with do_sample=False had no effect and is dropped.
    outputs = model.generate(
        inputs,
        max_new_tokens=512,
        do_sample=False,
    )
    # Decode only the newly generated tokens. Splitting the full decoded text on
    # the word "assistant" would truncate any reply that contains that word.
    prompt_length = inputs.shape[-1]
    answer = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()
    print(answer)
    return answer

In [23]:
# Sample eligibility criteria used to exercise extract_eligibility.
# Written as adjacent string literals inside parentheses: a plain "..." literal
# cannot span physical lines, and one literal per criterion is easier to review.
# The pieces concatenate to a single line of text with no embedded newlines.
eligibility_text = (
    "Key Inclusion Criteria: "
    "* Participant has histologically or cytologically confirmed metastatic NSCLC (stage IV with known subtype). "
    "* Participant has progressed radiographically on or after receiving: "
    "* One prior line of therapy (PD-1/PD-L1 inhibitor and platinum-based chemotherapy concomitantly) in the metastatic disease setting; OR "
    "* No more than 2 prior lines of therapy (PD-1/PD-L1 inhibitor and platinum-based chemotherapy sequentially, irrespective of the order) in the metastatic disease setting. "
    "* Participant must have positive tumor PD-L1 expression (tumor cells ≥1%) determined prospectively on a tumor sample from the metastatic setting at a sponsor-designated central laboratory. "
    "* Participant has measurable disease according to RECIST v1.1 as assessed by the investigator at baseline. "
    "* Participant has an Eastern Cooperative Oncology Group (ECOG) performance status 0 or 1 within 7 days of Cycle 1 Day 1. "
    "* Participant has a life expectancy of ≥3 months. "
    "* Participant must have adequate organ and bone marrow function, per laboratory test results within 7 days of trial treatment. "
    "Key Exclusion Criteria: "
    "* Documentation of known targetable epidermal growth factor receptor (EGFR) sensitizing mutations, anaplastic lymphoma kinase (ALK), RET proto-oncogene (RET), ROS proto-oncogene 1; receptor tyrosine kinase (ROS1) rearrangement, Kirsten rat sarcoma virus (KRAS), B-Raf proto-oncogene (BRAF) mutations, and MET proto-oncogene; receptor tyrosine kinase (MET) exon 14 skipping mutations/MET amplification. NOTE: MET amplification testing is optional based on local availability of the test. "
    "* Participants with known KRAS/BRAF mutations are eligible for the trial if they do not have access to approved targeted therapies. "
    "* Participants with newly identified or known unstable or symptomatic central nervous system (CNS) metastases or history of carcinomatous meningitis. "
    "* Prior treatment with docetaxel for NSCLC. "
    "* Prior treatment with a 4-1BB (CD137) targeted agent, any type of antitumor vaccine, autologous cell immunotherapy, or any unapproved immunotherapy. "
    "* Treatment with an anticancer agent within 28 days prior to the first dose of trial treatment. "
    "Note: Other protocol-defined inclusion and exclusion criteria may apply."
)


In [24]:
# Parse the sample criteria into the structured schema using the Groq client.
eligibility_json = extract_eligibility(eligibility_text, client=client)

In [67]:
# Display the parsed criteria dict as this cell's output.
eligibility_json

{'trial_id': None,
 'trial_category': 'therapeutic_interventional',
 'age': {'min': 18, 'max': None},
 'region_specific_age': {'japan_min': None},
 'inclusion': {'disease': {'confirmed_by': 'histology or cytology',
   'cancer_type': 'non-small cell lung cancer',
   'histology_subtype': '',
   'stage': 'IV',
   'stage_list': ['IV'],
   'metastatic': True,
   'measurable_disease_recist': True,
   'biomarker_required': ['PD-L1 expression ≥1%']},
  'performance_status': {'scale': 'ECOG', 'min': 0, 'max': 1},
  'life_expectancy_weeks': 12,
  'prior_therapy': {'required': ['PD-1/PD-L1 inhibitor and platinum-based chemotherapy'],
   'allowed': [],
   'disallowed': ['docetaxel for NSCLC',
    '4-1BB targeted agent',
    'antitumor vaccine',
    'autologous cell immunotherapy',
    'unapproved immunotherapy'],
   'max_lines_systemic': 2,
   'washout_weeks': {'chemotherapy': 4,
    'targeted_therapy': 4,
    'immunotherapy': 4,
    'investigational': 4,
    'radiation': None,
    'major_surgery'

In [106]:
# Two hand-written patient records used to try out the matcher.
# Key order is kept as-is because the dicts are serialized into the model prompt.
patient_1 = {
    "age": 77,
    "gender": "M",
    "diagnosis": "metastatic NSCLC squamous",
    "prior_lines": 3,
    "last_chemo_weeks_ago": 10,
    "pd_l1_tps": 30,
    "brain_mets": "none",
    "ecog": 2,
    "pregnant": False,
}
patient_2 = {
    "age": 55,
    "gender": "M",
    "diagnosis": "metastatic NSCLC adenocarcinoma",
    "prior_lines": 1,
    "last_chemo_weeks_ago": 5,
    "pd_l1_tps": 1,
    "brain_mets": "none",
    "ecog": 1,
    "pregnant": False,
}


In [107]:
def pateint_trial_match(patient, trial):
  """Print a banner, then run is_eligible on the patient merged with the trial.

  The patient's fields stay at the top level of the payload and the trial
  criteria are added under the "trial_eligibility" key.
  """
  print("=== MODEL OUTPUT ON THE CASE ===\n")
  payload = patient | {"trial_eligibility": trial}
  is_eligible(payload)


In [109]:
# Check the second sample patient against the parsed trial criteria.
pateint_trial_match(patient_2, eligibility_json)

=== MODEL OUTPUT ON THE CASE ===

{
  "is_eligible": true,
  "reasoning": "Eligible: Metastatic NSCLC adenocarcinoma, 1 prior line (PD-1/PD-L1 inhibitor and platinum-based chemotherapy), ECOG 1, no brain mets, adequate organ function, and meets PD-L1 expression \u22651% criteria."
}
