In [None]:
# Show the attached NVIDIA GPU(s); fall back to a short message when the command fails.
!nvidia-smi || echo "No GPU available"
# Print the metadata pip has for the installed transformers package.
!pip show transformers

In [None]:
# Quietly install the fine-tuning stack: PEFT, Accelerate and bitsandbytes.
!pip install -q peft accelerate bitsandbytes
# Pin fsspec to exactly 2023.9.2.
!pip install -q fsspec==2023.9.2
# Upgrade datasets to the newest release pip can resolve.
!pip install -q -U datasets

# NOTE(review): this message is printed unconditionally. The `!pip` lines above do not
# abort the cell when an install fails, so check their output before trusting it.
print("All packages installed successfully.")

In [None]:
# The dependencies are installed once in the previous cell. This cell used to
# repeat the exact same three `pip install` commands and the same message,
# which only added run time without changing the resulting environment.

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
import os
import torch

# Restrict this process to the CUDA device with index 0.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Stop right away unless exactly one CUDA device is usable.
assert torch.cuda.is_available(), "CUDA GPU not available!"
assert torch.cuda.device_count() == 1, f"Multiple GPUs detected: {torch.cuda.device_count()}"

# Report the device name and its total memory (bytes / 1e9, rounded to two decimals).
gpu_properties = torch.cuda.get_device_properties(0)
total_vram_gb = round(gpu_properties.total_memory / 1e9, 2)
print("Using GPU:", torch.cuda.get_device_name(0))
print("VRAM (total):", total_vram_gb, "GB")

# Print the CUDA memory summary for device 0.
print(torch.cuda.memory_summary(device=0))

In [None]:
# Load the "easy" and "medium" training files (JSON Lines) from the competition input.
from datasets import load_dataset

train_easy = load_dataset(
    "json",
    data_files="/kaggle/input/goedel-machines-x-iitm-clinical-llm-challenge/train_easy.jsonl",
    split="train",
)
train_medium = load_dataset(
    "json",
    data_files="/kaggle/input/goedel-machines-x-iitm-clinical-llm-challenge/train_medium.jsonl",
    split="train",
)

# Report how many questions each file contributed.
print(f"train_easy: {len(train_easy)} questions")
print(f"train_medium: {len(train_medium)} questions")

# Show the first record of each dataset as a quick schema check.
print("Sample from train_easy:\n", train_easy[0])
print("Sample from train_medium:\n", train_medium[0])

In [None]:
# Load the tokenizer and the 4-bit quantized base model from the local Kaggle input.
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
from peft import prepare_model_for_kbit_training

MODEL_PATH = "/kaggle/input/phi-3/pytorch/phi-3.5-mini-instruct/2"

# 4-bit NF4 weights with double quantization; compute dtype is float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, local_files_only=True)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    quantization_config=bnb_config,
    device_map={"": 0},  # map the whole model onto device 0
    local_files_only=True,
    trust_remote_code=True,
)

# Let PEFT prepare the quantized model for k-bit training.
model = prepare_model_for_kbit_training(model)

In [None]:
#preprocessing
def preprocess_clinical_mcq(example, max_length=256):
    """Turn one multiple-choice record into fixed-length causal-LM features.

    The prompt is ``Question: <question>``, a ``Choices:`` header, one
    ``(<key>) <value>`` line per option and a trailing ``Answer: `` marker.
    The training text is that prompt followed by the answer label.

    Args:
        example: Mapping with the keys ``"question"``, ``"options"`` (a mapping
            from option key to option text) and ``"answer"`` (appended verbatim
            after the prompt).
        max_length: Length every sequence is padded or truncated to.

    Returns:
        Dict with ``input_ids``, ``attention_mask`` and ``labels``, each
        ``max_length`` long. ``labels`` is ``-100`` on padding and on prompt
        tokens, and equals ``input_ids`` on the remaining (answer) tokens.

    Notes:
        If the prompt alone fills ``max_length`` tokens the answer is truncated
        away and every label is ``-100``.

        NOTE(review): the Trainer cell uses
        ``DataCollatorForLanguageModeling(mlm=False)``, which builds its own
        labels from ``input_ids``; these labels only reach the loss with a
        collator that keeps an existing ``labels`` column - confirm against the
        installed transformers version.
    """
    question = example["question"]
    options = example["options"]
    answer_label = example["answer"]

    option_texts = [f"({key}) {value}" for key, value in options.items()]
    prompt = f"Question: {question}\nChoices:\n" + "\n".join(option_texts) + "\nAnswer: "
    full_text = prompt + answer_label

    encoded = tokenizer(
        full_text,
        padding="max_length",
        truncation=True,
        max_length=max_length
    )
    input_ids = list(encoded["input_ids"])
    attention_mask = list(encoded["attention_mask"])

    # Tokenize the prompt WITHOUT padding: a padded prompt is always
    # `max_length` long, which would mask every label.
    prompt_ids = tokenizer(prompt, truncation=True, max_length=max_length)["input_ids"]

    # Positions of real (non-padding) tokens, in order. Using the attention
    # mask keeps this correct for both left- and right-padding tokenizers.
    real_positions = [pos for pos, flag in enumerate(attention_mask) if flag == 1]

    # The prompt span is the longest common token prefix of prompt and full
    # text. Comparing tokens (instead of trusting len(prompt_ids)) stays right
    # when the trailing space of "Answer: " merges with the answer token.
    prompt_token_count = 0
    for pos, prompt_token in zip(real_positions, prompt_ids):
        if input_ids[pos] != prompt_token:
            break
        prompt_token_count += 1

    # Supervise only the tokens after the prompt; everything else is ignored.
    labels = [-100] * len(input_ids)
    for pos in real_positions[prompt_token_count:]:
        labels[pos] = input_ids[pos]

    return {
        "input_ids": encoded["input_ids"],
        "attention_mask": encoded["attention_mask"],
        "labels": labels
    }

print("preprocessing done")

In [None]:
from datasets import concatenate_datasets

# Upper bound on how many examples are taken from the combined pool.
MAX_TRAIN_EXAMPLES = 64000

train_combined = concatenate_datasets([train_easy, train_medium])

# Clamp the subset size: selecting range(64000) from a smaller pool would fail
# with an out-of-range index.
# NOTE(review): this keeps the FIRST rows in concatenation order (easy before
# medium) without shuffling - confirm that ordering is intended.
subset_size = min(MAX_TRAIN_EXAMPLES, len(train_combined))
train_combined = train_combined.select(range(subset_size))

# Tokenize every record and drop the raw columns so only model inputs remain.
tokenized_dataset = train_combined.map(
    preprocess_clinical_mcq,
    remove_columns=train_combined.column_names
)

# Hold out 10% for evaluation; the fixed seed makes the split reproducible.
split = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split["train"]
eval_dataset = split["test"]

print(f"Train size: {len(train_dataset)}")
print(f"Eval size: {len(eval_dataset)}")

In [None]:
from transformers import EvalPrediction
import numpy as np

def _sanitize_token_rows(rows, fill_id):
    """Convert each row to a list of non-negative token ids that can be decoded."""
    cleaned = []
    for row in rows:
        row_ids = np.asarray(row)
        if row_ids.ndim == 2:
            # A (seq, vocab) row holds logits: keep the most likely token id.
            row_ids = row_ids.argmax(axis=-1)
        # Negative ids (e.g. the -100 ignore marker) are not valid vocabulary
        # entries; swap them for a token that decoding can handle.
        cleaned.append(np.where(row_ids < 0, fill_id, row_ids).tolist())
    return cleaned


def decode_labels(preds, labels):
    """Decode predicted and reference token ids into text.

    Args:
        preds: Per-example token ids, or per-example logits with a trailing
            vocabulary axis (reduced with argmax). A tuple is accepted and its
            first element is used.
        labels: Per-example reference token ids; negative entries such as
            ``-100`` are allowed.

    Returns:
        Tuple ``(decoded_preds, decoded_labels)`` of string lists produced by
        ``tokenizer.batch_decode(..., skip_special_tokens=True)``.
    """
    if isinstance(preds, tuple):
        preds = preds[0]

    # Replacement id for negative entries: pad token, else EOS, else 0.
    fill_id = tokenizer.pad_token_id
    if fill_id is None:
        fill_id = tokenizer.eos_token_id
    if fill_id is None:
        fill_id = 0

    pred_rows = _sanitize_token_rows(preds, fill_id)
    label_rows = _sanitize_token_rows(labels, fill_id)

    decoded_preds = tokenizer.batch_decode(pred_rows, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(label_rows, skip_special_tokens=True)
    return decoded_preds, decoded_labels

def compute_metrics(eval_pred: EvalPrediction):
    """Score answer-letter accuracy for a batch of predictions.

    Predictions and labels are decoded to text with ``decode_labels``. For each
    pair, the text after the last ``"Answer:"`` marker is stripped, upper-cased
    and cut to its first character; a pair counts as correct when those
    characters are equal.

    Args:
        eval_pred: Pair that unpacks into ``(predictions, labels)``.

    Returns:
        ``{"accuracy": correct / total}``, or ``{"accuracy": 0}`` when there
        are no examples.
    """
    predictions, labels = eval_pred
    pred_texts, label_texts = decode_labels(predictions, labels)

    def first_answer_char(text):
        # Keep only what follows the last "Answer:" marker, then its first character.
        return text.strip().split("Answer:")[-1].strip().upper()[:1]

    matches = [
        first_answer_char(pred_text) == first_answer_char(label_text)
        for pred_text, label_text in zip(pred_texts, label_texts)
    ]
    total = len(matches)
    correct = sum(matches)

    accuracy = correct / total if total > 0 else 0
    return {"accuracy": accuracy}
print("computed metrics")

In [None]:
# Wrap the quantized model with LoRA adapters on the `qkv_proj` and `o_proj` modules.
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,               # adapter rank
    lora_alpha=32,      # LoRA scaling numerator (alpha / r = 2)
    lora_dropout=0.05,
    target_modules=["qkv_proj", "o_proj"],
    bias="none",
)

model = get_peft_model(model, lora_config)
# Print how many parameters are trainable after wrapping.
model.print_trainable_parameters()

In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

training_args = TrainingArguments(
    output_dir="/kaggle/working/lora_phi3_stage1",
    # --- optimisation ---
    num_train_epochs=2,
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,   # 2 x 8 = 16 sequences per device per optimizer step
    # --- memory ---
    fp16=True,
    gradient_checkpointing=True,
    # --- logging / checkpoints ---
    logging_steps=500,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    report_to="none",
    # Keep every dataset column when batches are built.
    remove_unused_columns=False,
)

# NOTE(review): with mlm=False this collator derives `labels` from `input_ids`
# itself, so the `labels` column produced during preprocessing is presumably
# not what the loss sees - confirm against the installed transformers version.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# NOTE(review): neither `eval_dataset` nor `compute_metrics` is passed here,
# so this Trainer is configured for training only.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

print("training args defined")

In [None]:
def train_model():
    """Run training, then save the model and tokenizer to the stage-1 directory.

    Uses the notebook-level ``trainer`` and ``tokenizer``. On success the model
    and tokenizer are written to ``/kaggle/working/lora_phi3_stage1``.

    Failures are reported, not raised: the error message is printed and the
    full traceback is written to stderr, so the exception type and the failing
    call stay visible while the cells after this one can still run.

    Returns:
        None.
    """
    # Local import keeps this cell runnable on its own.
    import traceback

    print("Training started...")
    try:
        trainer.train()
        print("Training complete.")
        trainer.save_model("/kaggle/working/lora_phi3_stage1")
        tokenizer.save_pretrained("/kaggle/working/lora_phi3_stage1")
        print("Model and tokenizer saved to /kaggle/working/lora_phi3_stage1")
    except Exception as e:
        print("Training failed with error:", str(e))
        # str(e) alone drops the exception type and the stack; print both.
        traceback.print_exc()

# ⛔️ Don't use threading on Kaggle commit
train_model()

In [None]:
import shutil

# Zip the saved output directory; the archive is written next to it as
# /kaggle/working/lora_phi3_stage1.zip.
shutil.make_archive(
    base_name="/kaggle/working/lora_phi3_stage1",
    format="zip",
    root_dir="/kaggle/working/lora_phi3_stage1",
)
