In [None]:
import pandas as pd
import json
import ast

In [None]:
# Show full cell contents (no truncation) when displaying DataFrames with long text/JSON columns.
pd.set_option('display.max_colwidth', None)

In [None]:
# Restore pandas' default column-width truncation for DataFrame display.
pd.reset_option('display.max_colwidth')

In [None]:
# Display every row of a DataFrame instead of pandas' abbreviated view.
pd.set_option("display.max_rows", None)

In [None]:
# Restore pandas' default row limit for DataFrame display.
pd.reset_option('display.max_rows')

# Preparing Patient-Trial Eligibility Dataset
This section merges patient-level data with the parsed trial eligibility JSON to create a training-ready dataset.

In [23]:
# Load the parsed trials table; later cells read its NCT_ID and eligibility_json columns.
trials_df = pd.read_csv("./data/trials_parsed.csv")

In [5]:
# Keep only the first row seen for each NCT_ID and renumber the index from 0.
trials_df = trials_df.drop_duplicates(subset="NCT_ID", ignore_index=True)

In [None]:
# trials_df[["NCT_ID","Brief_Summary","Eligibility","eligibility_json"]].loc[59]

In [None]:
trials_df.head()

In [24]:
# Load the patient–trial eligibility table (later cells read trial_id, patient_id,
# patient_json, is_eligible and reasoning from it).
elgibiltiy_df = pd.read_csv("./data/patiens_trials_elgibiltiy.csv")

In [None]:
# Re-point rows recorded under trial NCT03346728 to NCT03361228 (in-place edit of the column).
# NOTE(review): presumably this corrects a mislabelled trial ID so the rows can match trials_df — confirm.
# This must run before the rename below: once "trial_id" is renamed, re-running this cell
# raises KeyError because the column no longer exists.
elgibiltiy_df.loc[elgibiltiy_df["trial_id"] == "NCT03346728", "trial_id"] = "NCT03361228"
# Use the same key column name as trials_df so the two frames can be merged on "NCT_ID".
elgibiltiy_df = elgibiltiy_df.rename(columns={"trial_id": "NCT_ID"})
# Composite identifier of the form "<NCT_ID>_<patient_id>", e.g. "NCT01951469_P002".
elgibiltiy_df["unique_patient_id"] = elgibiltiy_df["NCT_ID"] + "_" + elgibiltiy_df["patient_id"]

This step links each patient record to the structured eligibility JSON of its trial:

In [11]:
# Attach each trial's structured eligibility JSON to its patient rows.
# - how="left" keeps every patient row; a trial missing from trials_df yields NaN
#   in eligibility_json.
# - validate="many_to_one" makes pandas raise MergeError if NCT_ID is duplicated in
#   trials_df, instead of silently multiplying patient rows.
elgibiltiy_df = elgibiltiy_df.merge(
    trials_df[["NCT_ID", "eligibility_json"]],
    on="NCT_ID",
    how="left",
    validate="many_to_one",
)

Final data set preview:

In [14]:
elgibiltiy_df.sample(5)

Unnamed: 0,NCT_ID,patient_id,patient_json,is_eligible,reasoning,unique_patient_id,eligibility_json
231,NCT01951469,P002,"{""age"":42,""gender"":""M"",""diagnosis"":""stage IV N...",True,Eligible: EGFR mutant NSCLC with 1-2 brain les...,NCT01951469_P002,"{'trial_id': None, 'trial_category': 'therapeu..."
379,NCT04311710,P010,"{""age"":61,""gender"":""F"",""trial_part"":""Part 2, A...",True,"Eligible (NSCLC): Although ALK-positive, the m...",NCT04311710_P010,"{'trial_id': None, 'trial_category': 'therapeu..."
686,NCT02772107,P012,"{""age"":58,""gender"":""F"",""sclc_stage"":""Extensive...",True,"Eligible: Extensive-stage SCLC, ECOG 1, LFTs u...",NCT02772107_P012,"{'trial_id': None, 'trial_category': 'therapeu..."
383,NCT07006727,P004,"{""age"": 45, ""diagnosis"": ""LCNEC of the lung"", ...",True,"Eligible (LCNEC, DE): LCNEC is an inclusion on...",NCT07006727_P004,"{'trial_id': None, 'trial_category': 'therapeu..."
159,NCT06963502,P010,"{""age"":60,""gender"":""F"",""diagnosis"":""metastatic...",True,"Eligible: textbook KRAS G12C NSCLC, ECOG 0, no...",NCT06963502_P010,"{'trial_id': None, 'trial_category': 'therapeu..."


In [None]:
# Save the merged patient–trial eligibility training data to CSV (index column omitted).
elgibiltiy_df.to_csv("./data/patiens_trials_elgibiltiy_training_data.csv", index=False)

In [None]:
# Reload the saved patient–trial eligibility training data, replacing the in-memory frame.
elgibiltiy_df = pd.read_csv("./data/patiens_trials_elgibiltiy_training_data.csv")

Inspecting patient distribution and eligibility labels:

In [None]:
# Count how many patient records each trial has.
counts_df = elgibiltiy_df.groupby("NCT_ID").size().reset_index(name="record_count")
unique_counts = counts_df["record_count"].unique()
if len(unique_counts) == 1:
    # Every trial has the same number of patients: report that single value.
    print(f"Patients per trial: {unique_counts[0]}")
else:
    # Trials differ in size (or the frame is empty): list every distinct count
    # rather than reporting only the first one.
    print(f"Patients per trial varies: {sorted(unique_counts.tolist())}")

In [16]:
elgibiltiy_df['is_eligible'].value_counts()

Unnamed: 0_level_0,count
is_eligible,Unnamed: 1_level_1
True,467
False,433


# Preparing JSON Examples for LLM Fine-Tuning
This section prepares the final dataset used to train the clinical-trial eligibility model.

It includes:
* Cleaning and standardizing JSON fields
* Structuring patient–trial pairs into well-defined prompt/response examples
* Converting those examples into LLaMA-3–compatible training text that follows the Llama 3 chat prompt format (special header and end-of-turn tokens)

The end result is a HuggingFace Dataset fully ready for supervised fine-tuning (SFT).

Standardizing JSON columns:

In [None]:
# Work on a copy so the merged DataFrame (elgibiltiy_df) is not modified by the
# JSON normalization applied to df below.
df =elgibiltiy_df.copy()


# Function to standardize JSON columns
def fix_json_column(col):
    """Normalize every value of a column to a JSON-encoded string.

    Each value is handled in this order:

    1. a ``dict`` is serialized directly with ``json.dumps``;
    2. a string containing valid JSON is parsed and re-serialized;
    3. otherwise the value is parsed as a Python literal with
       ``ast.literal_eval`` (e.g. a dict repr using single quotes and
       ``None``) and then serialized.

    Args:
        col: A pandas Series (any object exposing ``apply`` works).

    Returns:
        Whatever ``col.apply`` returns; for a Series, a Series of JSON strings.

    Raises:
        Exception: Whatever ``ast.literal_eval`` / ``json.dumps`` raised for a
            value that is neither valid JSON nor a Python literal. The
            offending value is printed first, then the error is re-raised.
    """
    def convert(x):
        # Already a dict: just serialize it.
        if isinstance(x, dict):
            return json.dumps(x)

        # Try strict JSON first.
        try:
            return json.dumps(json.loads(x))
        except (TypeError, ValueError):
            # TypeError: x is not str/bytes (e.g. a float NaN for a missing value).
            # ValueError: invalid JSON text (json.JSONDecodeError subclasses it).
            # Anything else (KeyboardInterrupt, SystemExit, ...) must propagate.
            pass

        # Fall back to Python-literal syntax; literal_eval never executes code.
        try:
            return json.dumps(ast.literal_eval(x))
        except Exception:
            print("BAD JSON:", x)
            raise  # keep the original traceback

    return col.apply(convert)

# Normalize the patient and trial-eligibility columns (in that order) so both
# hold JSON strings that json.loads can parse downstream.
for json_col in ("patient_json", "eligibility_json"):
    df[json_col] = fix_json_column(df[json_col])


Converting each row into a structured training example:

In [None]:

def make_example(row):
    """Turn one patient–trial row into an instruction/input/output record.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) providing
            ``patient_json`` and ``eligibility_json`` as JSON strings, plus
            ``is_eligible`` and ``reasoning``.

    Returns:
        dict with keys ``instruction`` (fixed task prompt), ``input``
        (decoded patient and trial eligibility data) and ``output``
        (boolean label plus the reasoning text).
    """
    # Decode both JSON payloads first, patient before trial.
    patient_record = json.loads(row["patient_json"])
    trial_criteria = json.loads(row["eligibility_json"])

    # Target the model should produce: the label coerced to a plain bool.
    expected_answer = dict(
        is_eligible=bool(row["is_eligible"]),
        reasoning=row["reasoning"],
    )

    task_prompt = (
        "Decide whether the following patient is eligible for the given clinical trial, "
        "and provide reasoning for your decision."
    )

    return dict(
        instruction=task_prompt,
        input=dict(patient=patient_record, trial_eligibility=trial_criteria),
        output=expected_answer,
    )

# Build one training record per DataFrame row (axis=1 passes each row to
# make_example) and collect the resulting dicts into a plain Python list.
examples = df.apply(make_example, axis=1).tolist()


Convert the clinical-trial examples into the Llama-3.1-Instruct chat format (system + user + assistant) and wrap them in a Hugging Face Dataset ready for SFT training.

In [None]:
from datasets import Dataset

# No train/validation split: every example is used for training, so this
# notebook has no held-out set to evaluate the fine-tuned model on.
train_examples = examples

# =====================================================
# Formatting function
# =====================================================

def formatting_prompts_func(example):
    """Render one training example as a single Llama-3.1-Instruct chat string.

    The conversation has three turns: a fixed system prompt, a user turn
    holding the instruction plus the pretty-printed patient/trial JSON, and an
    assistant turn holding the pretty-printed expected answer.

    Layout of the returned text:
        ``<|begin_of_text|>`` once at the very start, then for each turn
        ``<|start_header_id|>{role}<|end_header_id|>\\n\\n{content}<|eot_id|>``.
        The text ends with the assistant turn's ``<|eot_id|>``. No empty
        assistant header is appended, because that generation prompt belongs
        at inference time, not after a completed answer in training data.

    Args:
        example: dict with ``instruction`` (str), ``input`` and ``output``
            (both JSON-serializable).

    Returns:
        dict ``{"text": <formatted conversation>}``.
    """
    messages = [
        {"role": "system", "content": "You are an expert in assessing eligibility for oncology clinical trials. Decide whether the patient is eligible and explain your reasoning step by step."},
        {"role": "user", "content": example["instruction"] + "\n\nPatient data and trial criteria:\n" + json.dumps(example["input"], indent=2)},
        {"role": "assistant", "content": json.dumps(example["output"], indent=2)}
    ]

    turns = "".join(
        f"<|start_header_id|>{msg['role']}<|end_header_id|>\n\n{msg['content']}<|eot_id|>"
        for msg in messages
    )

    # The begin-of-text token marks the start of the whole sequence, so it is
    # emitted once rather than before every turn.
    # NOTE(review): if the trainer's tokenizer also prepends its own BOS token,
    # the sequence will start with two of them — confirm the tokenization settings.
    text = "<|begin_of_text|>" + turns

    return {"text": text}

# Format every example into its chat string, then wrap the records in a Dataset.
formatted_records = list(map(formatting_prompts_func, train_examples))
train_dataset = Dataset.from_list(formatted_records)

print(f"Train size: {len(train_dataset)}")

# Setting Up the LLM for Fine-Tuning (Unsloth + LoRA)
This section installs dependencies, loads the base model, and prepares it for parameter-efficient fine-tuning on a T4 GPU (which has limited VRAM).

In [None]:
# The first line installs the latest Unsloth directly from its GitHub repository (typically newer than the PyPI release).
# The [colab-new] extra selects Unsloth's dependency set intended for Colab environments.

!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers trl peft accelerate bitsandbytes

In [None]:
from unsloth import FastLanguageModel
import torch

# Load the 8B base model and its tokenizer in a configuration meant to fit on a T4 GPU.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit", # pre-quantized 4-bit checkpoint published by Unsloth
    max_seq_length=4096, # large enough for long trial eligibility text; the trainer and reload cells use the same value
    dtype=None, # NOTE(review): None presumably lets Unsloth choose the compute dtype for the GPU — confirm
    
    load_in_4bit=True, 
    use_gradient_checkpointing="unsloth", # Unsloth's gradient checkpointing mode (intended to save VRAM)

    # Intent: 8B in 4-bit + Unsloth checkpointing should fit in T4 memory.
)

# Wrap the base model with LoRA adapters; `model` is rebound to the PEFT-wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r=32,              # LoRA rank; lowered from 64 to save VRAM
    # Adapters are attached to the attention projections (q/k/v/o) and the MLP projections (gate/up/down).
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16, # alpha / r = 16 / 32 = 0.5 scaling of the adapter update
    lora_dropout=0, # no dropout on the adapter inputs
    bias="none", # bias terms are not trained
    use_gradient_checkpointing="unsloth", 
    random_state=3407, # fixed seed for adapter initialization
)

# Fine-Tuning the Model (SFT Training Loop)

This section launches supervised fine-tuning (SFT) using the `trl` SFTTrainer.  

In [None]:
# =====================================================
# Training
# =====================================================

from trl import SFTTrainer
from transformers import TrainingArguments

# Configure supervised fine-tuning on the formatted "text" column of train_dataset.
# NOTE(review): the installed trl version is not pinned; confirm it still accepts
# tokenizer / dataset_text_field / max_seq_length / packing as SFTTrainer arguments.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    dataset_text_field="text",
    max_seq_length=4096,
    packing=True, # reduces padding waste by concatenating short examples into full-length sequences
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4, # 2 × 4 = effective batch size of 8
        warmup_steps=10,
        max_steps=90, # training stops after 90 optimizer steps regardless of dataset size
        learning_rate=2e-4, # commonly used LoRA learning rate
        fp16=True, # half-precision training, chosen for the T4 GPU
        bf16=False, # bf16 disabled; fp16 is used instead
        logging_steps=10,
        save_strategy="steps",
        save_steps=30, # checkpoints at steps 30, 60 and 90
        output_dir="/content/llama3.1-8b-clinical-eligibility",
        optim="adamw_8bit",# 8-bit AdamW to reduce optimizer-state memory
        seed=3407,  # same seed value as the LoRA random_state above
        report_to="none", # no experiment-tracking integration
    ),
)

In [None]:
trainer.train()

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Save the final LoRA adapter and the tokenizer to the same Google Drive folder
# (requires Drive to be mounted at /content/drive first).
model.save_pretrained("/content/drive/MyDrive/Clinical_Trial_LLM/final_lora_adapter_3")
tokenizer.save_pretrained("/content/drive/MyDrive/Clinical_Trial_LLM/final_lora_adapter_3")

In [None]:
!pip install -q huggingface_hub transformers sentencepiece

In [None]:
from huggingface_hub import login
from huggingface_hub import whoami

import os

# Never paste a real token into the notebook: read it from the HF_TOKEN
# environment variable instead. When the variable is unset, token=None makes
# login() fall back to its interactive prompt.
login(token=os.environ.get("HF_TOKEN"))
# Show the authenticated account to confirm the login worked.
print(whoami())

In [None]:
# Reload the saved LoRA adapter from Google Drive; `model` and `tokenizer` are rebound.
# Requires FastLanguageModel to have been imported in an earlier cell.

model, tokenizer = FastLanguageModel.from_pretrained(
    "/content/drive/MyDrive/Clinical_Trial_LLM/final_lora_adapter_3",
    max_seq_length=4096,
    dtype=None,           
    load_in_4bit=True,    # load quantized first (intended to save RAM during the merge)
)

# Switch the model to Unsloth's inference mode.
# NOTE(review): the original comment says this is needed before merging — confirm against the Unsloth docs.
FastLanguageModel.for_inference(model)

In [None]:
# Merge the LoRA adapter into the base weights and upload the full model to the Hugging Face Hub.

# save_method="merged_16bit" requests merged 16-bit weights.
model.push_to_hub_merged(
    "ayham0010/LungTrial-Eligible", # target Hub repository
    tokenizer,
    save_method="merged_16bit",
    token=True, # NOTE(review): presumably reuses the token stored by the earlier login() call — confirm
)