In [None]:
import pandas as pd

In [None]:
# Load the parsed trial records; later cells use the "NCT_ID" and
# "eligibility_json" columns of this frame.
trials_df = pd.read_csv("/content/mavrik_trials_parsed_latest.csv")

In [None]:
trials_df

In [None]:
# Keep only the first row for each NCT_ID and renumber the index from 0,
# so NCT_ID is unique in trials_df before it is used as a join key.
trials_df = trials_df.drop_duplicates(subset=['NCT_ID'], keep='first').reset_index(drop=True)

In [None]:
trials_df

In [None]:
# Put the column-width and row-count display options back to pandas' defaults.
# NOTE(review): the next two cells immediately override both options again.
pd.reset_option('display.max_colwidth')
pd.reset_option('display.max_rows')

In [None]:
# NOTE(review): None presumably removes the row limit, so any bare-DataFrame
# cell after this point would render every row — confirm that is intended.
pd.set_option("display.max_rows", None)

In [None]:
# NOTE(review): None presumably disables cell truncation, so long text/JSON
# columns would be shown in full — confirm that is intended.
pd.set_option('display.max_colwidth', None)

In [None]:
# trials_df[["NCT_ID","Brief_Summary","Eligibility","eligibility_json"]].loc[59]

In [None]:
# Load the patient/trial eligibility records; later cells read the columns
# "trial_id", "patient_id", "patient_json", "is_eligible" and "reasoning".
elgibiltiy_df = pd.read_csv("/content/patiens_trials_elgibiltiy.csv")

In [None]:
# Relabel every row whose trial_id is NCT03346728 as NCT03361228 (in place).
# NOTE(review): the notebook does not record why this remap is needed —
# presumably the source file carries the wrong ID for that trial; confirm.
elgibiltiy_df.loc[elgibiltiy_df["trial_id"] == "NCT03346728", "trial_id"] = "NCT03361228"

In [None]:
# Rename "trial_id" to "NCT_ID" so the column matches the key used in trials_df.
elgibiltiy_df = elgibiltiy_df.rename(columns={"trial_id": "NCT_ID"})

In [None]:
# Build a per-row key of the form "<NCT_ID>_<patient_id>".
# NOTE(review): assumes patient_id was read as strings; string + numeric
# concatenation would fail here — confirm the column dtype.
elgibiltiy_df["unique_patient_id"] = elgibiltiy_df["NCT_ID"] + "_" + elgibiltiy_df["patient_id"]

In [None]:
# Attach each trial's "eligibility_json" to the patient/trial rows (left join,
# so rows whose NCT_ID is absent from trials_df get a missing value).
#
# Any "eligibility_json" column already present is dropped first so this cell
# can be re-run: a second merge would otherwise produce
# "eligibility_json_x"/"eligibility_json_y" and break later lookups of
# df["eligibility_json"].
#
# validate="many_to_one" makes pandas raise MergeError if NCT_ID is ever not
# unique in trials_df, instead of silently duplicating patient rows.
elgibiltiy_df = elgibiltiy_df.drop(columns=["eligibility_json"], errors="ignore").merge(
    trials_df[["NCT_ID", "eligibility_json"]],
    on="NCT_ID",
    how="left",
    validate="many_to_one",
)

In [None]:
elgibiltiy_df

In [None]:
# Persist the merged frame to CSV, omitting the index column.
elgibiltiy_df.to_csv("/content/patiens_trials_elgibiltiy_final.csv", index=False)

In [None]:
# Reload the frame from the CSV written in the previous cell.
# NOTE(review): presumably a checkpoint so later cells can start from the
# saved file; after a CSV round trip any JSON-like columns come back as
# text — confirm that is the intent.
elgibiltiy_df = pd.read_csv("/content/patiens_trials_elgibiltiy_final.csv")

In [None]:
elgibiltiy_df.sample(5)

In [None]:
# Tally how many eligibility records exist for each trial and print the table.
counts_df = (
    elgibiltiy_df
    .groupby("NCT_ID")
    .size()
    .rename("record_count")
    .reset_index()
)
print(counts_df)

In [None]:
elgibiltiy_df['is_eligible'].value_counts()

In [None]:
import json
import ast
# Work on a copy so the JSON normalization applied to df below does not
# modify elgibiltiy_df.
df =elgibiltiy_df.copy()
def fix_json_column(col):
    """Normalize every cell of ``col`` to a canonical JSON string.

    Each cell is handled as follows:
      * a ``dict`` is serialized directly with ``json.dumps``;
      * a string containing valid JSON is parsed and re-serialized;
      * anything else is parsed with ``ast.literal_eval`` (this covers
        Python-repr dicts such as ``"{'a': 1}"``) and then serialized.

    Args:
        col: A pandas Series (or any object exposing ``apply``) whose cells
            are dicts or strings.

    Returns:
        Whatever ``col.apply`` returns: one JSON string per input cell.

    Raises:
        Exception: The error raised by ``ast.literal_eval`` / ``json.dumps``
            for a cell that is neither JSON nor a Python literal (for example
            a missing value). The offending cell is printed before the error
            propagates.
    """
    def convert(x):
        # Already a dict -> serialize as-is.
        if isinstance(x, dict):
            return json.dumps(x)

        # Try strict JSON first.
        try:
            return json.dumps(json.loads(x))
        except (TypeError, ValueError):
            # ValueError includes json.JSONDecodeError (not valid JSON text);
            # TypeError means the cell is not str/bytes (e.g. NaN).
            # Deliberately NOT a bare ``except``: KeyboardInterrupt and
            # SystemExit must still be able to stop a long ``apply``.
            pass

        # Fall back to Python literal syntax (single-quoted dicts, None, True...).
        try:
            return json.dumps(ast.literal_eval(x))
        except Exception:
            print("BAD JSON:", x)
            raise  # re-raise with the original traceback intact

    return col.apply(convert)


# Rewrite both JSON-bearing columns as canonical JSON strings so that
# json.loads can parse them when the training examples are built.
df["patient_json"] = fix_json_column(df["patient_json"])
df["eligibility_json"] = fix_json_column(df["eligibility_json"])


In [None]:
def make_example(row):
    """Turn one eligibility record into an instruction-tuning example.

    Args:
        row: Mapping-like record (DataFrame row or dict) with the keys
            ``patient_json`` and ``eligibility_json`` (JSON strings),
            ``is_eligible`` (bool, 0/1, or a textual label) and ``reasoning``.

    Returns:
        dict with three keys: ``instruction`` (fixed prompt string),
        ``input`` (parsed patient and trial criteria) and ``output``
        (``is_eligible`` as a real bool plus the reasoning).

    Raises:
        ValueError: If ``is_eligible`` is missing or is a string that is not
            a recognized yes/no label.
    """
    instruction = (
        "Decide whether the following patient is eligible for the given clinical trial, "
        "and provide reasoning for your decision."
    )

    input_data = {
        "patient": json.loads(row["patient_json"]),
        "trial_eligibility": json.loads(row["eligibility_json"]),
    }

    output = {
        # Plain bool() would map "False", "no" and NaN to True, silently
        # flipping the training target, so the label is parsed explicitly.
        "is_eligible": _parse_eligibility_label(row["is_eligible"]),
        "reasoning": row["reasoning"]
    }

    return {
        "instruction": instruction,
        "input": input_data,
        "output": output,
    }


def _parse_eligibility_label(value):
    """Convert a raw ``is_eligible`` cell to ``True``/``False``, rejecting missing or unknown labels."""
    if isinstance(value, str):
        token = value.strip().lower()
        if token in {"true", "t", "yes", "y", "1", "eligible"}:
            return True
        if token in {"false", "f", "no", "n", "0", "ineligible", "not eligible"}:
            return False
        raise ValueError(f"Unrecognized is_eligible label: {value!r}")
    # NaN is the only value that is not equal to itself.
    if value is None or value != value:
        raise ValueError("Missing is_eligible label")
    return bool(value)

# Build one example dict per row of df and collect them into a plain list.
examples = df.apply(make_example, axis=1).tolist()


In [None]:
# Install Unsloth with its "colab-new" extra directly from the GitHub repository.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# Install the remaining training libraries without resolving their own
# dependencies (--no-deps).
# NOTE(review): no versions are pinned, so a fresh run may pick up different
# releases than the ones this notebook was developed against.
!pip install --no-deps xformers trl peft accelerate bitsandbytes

In [None]:
# Load the 4-bit Llama 3.1 8B Instruct checkpoint with Unsloth and attach LoRA adapters

from unsloth import FastLanguageModel
import torch

# Load the pre-quantized 4-bit checkpoint together with its tokenizer.
# Per the original note, this call is what lets the 8B model fit on a T4.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    max_seq_length=4096,  # the same value is passed to SFTTrainer further down
    dtype=None,  # NOTE(review): presumably lets Unsloth choose the dtype — confirm
    load_in_4bit=True,
    # NOTE(review): a previous comment claimed this argument enables DeepSpeed
    # ZeRO-3. Its name refers to gradient checkpointing and no DeepSpeed
    # configuration appears anywhere in this notebook — confirm against the
    # Unsloth documentation.
    use_gradient_checkpointing="unsloth",
)

# Wrap the loaded model with LoRA adapters; `model` is rebound to the wrapped
# model, which is what gets trained and saved below.
model = FastLanguageModel.get_peft_model(
    model,
    r=32,              # ↓ lowered from 64 to save VRAM
    # Adapters are attached to the attention projections (q/k/v/o) and the
    # MLP projections (gate/up/down).
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,  # same value as the trainer seed below
)

In [None]:
from datasets import Dataset
import json

# No train/validation split: every example is used for training, so this
# notebook keeps no held-out data for evaluation.
train_examples = examples

# =====================================================
# Formatting function
# =====================================================

def formatting_prompts_func(example):
    """Render one example as a single Llama-3-style chat transcript.

    The transcript holds three turns (system, user, assistant). Each turn is
    wrapped as ``<|start_header_id|>{role}<|end_header_id|>\\n\\n{content}<|eot_id|>``.

    Two defects of the previous rendering are fixed here:
      * ``<|begin_of_text|>`` is emitted once at the start of the sequence
        instead of before every turn;
      * no empty assistant header is appended after the completed assistant
        turn. That header is a generation prompt for inference; in training
        text it teaches the model to open another turn after ``<|eot_id|>``.

    NOTE(review): if the tokenizer also prepends its own BOS token during
    training, the leading ``<|begin_of_text|>`` may end up duplicated —
    confirm against the trainer's tokenization settings.

    Args:
        example: dict with ``instruction`` (str), ``input`` (JSON-serializable)
            and ``output`` (JSON-serializable).

    Returns:
        dict with a single key ``"text"`` holding the rendered transcript.
    """
    messages = [
        {"role": "system", "content": "You are an expert in assessing eligibility for oncology clinical trials. Decide whether the patient is eligible and explain your reasoning step by step."},
        {"role": "user", "content": example["instruction"] + "\n\nPatient data and trial criteria:\n" + json.dumps(example["input"], indent=2)},
        {"role": "assistant", "content": json.dumps(example["output"], indent=2)}
    ]

    turns = "".join(
        f"<|start_header_id|>{msg['role']}<|end_header_id|>\n\n{msg['content']}<|eot_id|>"
        for msg in messages
    )

    # Exactly one beginning-of-text marker; the text ends on the assistant's <|eot_id|>.
    return {"text": "<|begin_of_text|>" + turns}

# Render every training example to its prompt text and wrap the resulting
# {"text": ...} records in a Dataset.
train_dataset = Dataset.from_list(
    list(map(formatting_prompts_func, train_examples))
)

print(f"Train size: {len(train_dataset)}")


# =====================================================
# Training
# =====================================================

from trl import SFTTrainer
from transformers import TrainingArguments

# Configure supervised fine-tuning of the LoRA-wrapped model on the
# pre-rendered "text" column of train_dataset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    dataset_text_field="text",
    max_seq_length=4096,  # same value as used when loading the model
    # NOTE(review): presumably concatenates several examples into each
    # sequence — confirm against the TRL documentation.
    packing=True,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,  # 2 x 4 = 8 sequences per optimizer step per device
        warmup_steps=5,
        max_steps=40,
        learning_rate=2e-4,
        # NOTE(review): precision is hard-coded to fp16 with bf16 off; comments
        # elsewhere in this notebook mention a T4 — confirm before running on
        # other hardware.
        fp16=True,
        bf16=False,
        logging_steps=10,
        save_strategy="steps",
        save_steps=40,  # equals max_steps, so presumably only the final step is checkpointed
        output_dir="/content/llama3.1-8b-clinical-eligibility",
        optim="adamw_8bit",
        seed=3407,
        report_to="none",
    ),
)

In [None]:
# Run fine-tuning with the trainer configured in the previous cell.
trainer.train()

In [None]:
# Save the final LoRA adapter and the tokenizer to the same Google Drive folder.
# NOTE(review): assumes Drive is already mounted at /content/drive — no mount
# call is visible in this notebook; confirm before running.
model.save_pretrained("/content/drive/MyDrive/Clinical_Trial_LLM/final_lora_adapter_3")
tokenizer.save_pretrained("/content/drive/MyDrive/Clinical_Trial_LLM/final_lora_adapter_3")