**Note:** I've cleared all the outputs from this notebook because I downloaded it from Google Colab and it was causing some metadata issues when I tried to push it to Git.


## Mount Google Drive

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; later cells read files from /content/drive/MyDrive.
drive.mount(mountpoint='/content/drive')

## Install Dependencies

In [None]:
!pip install unsloth

## Import Libraries

In [None]:
import json
from datasets import Dataset # This gives the dataset class to tranform our json data into hugging face dataset
from unsloth import FastLanguageModel
from trl import SFTTrainer, SFTConfig
from transformers import TrainingArguments

from dotenv import load_dotenv
import os

# Load environment variables from the .env file kept on the mounted Drive.
load_dotenv(dotenv_path="/content/drive/MyDrive/.env")

In [None]:
import wandb
# Authenticate with Weights & Biases using the key stored in the
# WANDB_API_KEY environment variable.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

## Hugging Face CLI Setup


In [None]:
!pip install -U "huggingface_hub[cli]" --quiet
!hf auth login

## Load Base Model and Tokenizer


In [None]:
# Load the pre-quantized Phi-3 mini instruct checkpoint and its tokenizer via Unsloth.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Phi-3-mini-4k-instruct-bnb-4bit",
    load_in_4bit=True,    # request 4-bit loading
    dtype=None,           # no explicit dtype requested
    max_seq_length=2048,  # same sequence length that is later passed to the trainer
)

## Converting JSON Data into a Chat-Formatted Dataset Using the Tokenizer's Chat Template

In [None]:
# Read the raw records (each is later expected to carry "prompt" and "response") from Drive.
with open("/content/drive/MyDrive/patient_admission.json", "r", encoding="utf-8") as json_file:
    data = json.load(json_file)

# Where a record has a "metadata" dict, drop its "widgets" entry (no-op if absent).
for record in data:
    if "metadata" in record:
        record["metadata"].pop("widgets", None)

# Wrap the list of records in a Hugging Face Dataset object.
ds = Dataset.from_list(data)

def to_text(ex):
    """Render one prompt/response example as a single chat-formatted string.

    Args:
        ex: Mapping with a "response" entry (a string, or any JSON-serializable
            object) and a "prompt" entry.

    Returns:
        A dict with one key, "text", holding the two-turn conversation
        (user prompt, assistant response) rendered by the module-level
        tokenizer's chat template.
    """
    answer = ex["response"]
    # Non-string responses are serialized to a JSON string; ensure_ascii=False
    # keeps non-ASCII characters unescaped.
    assistant_content = answer if isinstance(answer, str) else json.dumps(answer, ensure_ascii=False)

    conversation = [
        dict(role="user", content=ex["prompt"]),
        dict(role="assistant", content=assistant_content),
    ]

    # tokenize=False returns text rather than token ids; no generation prompt
    # is appended because the assistant turn is already part of the conversation.
    rendered = tokenizer.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=False
    )
    return {"text": rendered}

# Apply to_text to every row; remove_columns drops all of the original columns
# so that only the new "text" column remains.
dataset = ds.map(function=to_text, remove_columns=ds.column_names)

# Show the first row before and after the chat template is applied.
print("=== BEFORE (raw JSON row) ===")
print(ds[0])  # original row

print("\n=== AFTER (chat template applied) ===")
print(dataset[0]["text"])  # same row rendered as a user/assistant conversation

## Load PEFT (QLoRA) Model

In [None]:
# Wrap the loaded model with LoRA adapters (the original note describes these
# settings as the default LoRA configuration from GitHub).
model = FastLanguageModel.get_peft_model(
    model,
    # Rank of the low-rank LoRA matrices.
    r=64,
    # Scaling factor, set to twice the rank.
    lora_alpha=2 * 64,
    # No dropout is applied to the LoRA layers.
    lora_dropout=0,
    # Bias terms are left out of training; only the low-rank matrices are learned.
    bias='none',
    # Projection layers that receive LoRA adapters.
    target_modules=[
        'q_proj',
        'k_proj',
        'v_proj',
        'o_proj',
        'gate_proj',
        'up_proj',
        'down_proj',
    ],
    # Unsloth's own gradient-checkpointing scheme: more compute in exchange
    # for less GPU memory during backpropagation.
    use_gradient_checkpointing='unsloth',
)

## Supervised Fine-Tuning Trainer Setup with SFTTrainer

In [None]:
# Build the supervised fine-tuning trainer over the chat-formatted dataset.
trainer = SFTTrainer(
    model = model,
    train_dataset = dataset,
    tokenizer = tokenizer,
    dataset_text_field = 'text',  # column of `dataset` that holds the rendered chat text
    max_seq_length = 2048,  # same value that was used when loading the model
    args = SFTConfig(
        per_device_train_batch_size = 2,  # 2 sequences per device per forward/backward pass
        gradient_accumulation_steps = 4,  # accumulate 4 batches per optimizer step -> effective batch 2 * 4 = 8 per device
        warmup_steps = 10,  # linear warmup of the learning rate over the first 10 steps
        # Training length is controlled by max_steps alone: a positive max_steps
        # overrides num_train_epochs, and the dataset is re-iterated as often as
        # needed to reach it. The previous `num_train_epochs = 3` therefore had
        # no effect and was removed. To train by epochs instead, set
        # max_steps = -1 and pass num_train_epochs.
        max_steps = 60,
        logging_steps = 1,  # log every single step
        output_dir = "checkpoints/",  # where checkpoints and trainer state are written
        optim = "adamw_8bit",  # 8-bit AdamW optimizer
        # Empty list: no logging integrations are attached to this trainer.
        # NOTE(review): the notebook logs in to wandb earlier, but with this
        # setting the run is not reported there -- confirm that is intended.
        report_to = [],
    ),
)

## Train and Save the Fine-Tuned Model

In [None]:
# Run the fine-tuning loop configured on `trainer`.
trainer.train()

# Save the fine-tuned model and the tokenizer into the local "finetuned_model"
# directory; a later cell loads and uploads the model from this path.
model.save_pretrained("finetuned_model")
tokenizer.save_pretrained("finetuned_model")

## Upload Model to Hugging Face Hub

In [None]:
# Upload the local ./finetuned_model directory to the Hub repo r0hxt/phi3-mini-medical-admission.
!hf upload r0hxt/phi3-mini-medical-admission ./finetuned_model

__________________

## Output Comparison of Base and Fine-Tuned Models 

##### Load both Models

In [None]:
# Untouched base checkpoint, loaded fresh so it can serve as the comparison baseline.
base_model, base_tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Phi-3-mini-4k-instruct-bnb-4bit",
    load_in_4bit=True,
    dtype=None,
    max_seq_length=2048,
)

# Fine-tuned model, loaded from the local directory written after training.
ft_model, ft_tokenizer = FastLanguageModel.from_pretrained(
    model_name="finetuned_model",  # path to saved LoRA model
    load_in_4bit=True,
    dtype=None,
    max_seq_length=2048,
)

# Switch both models into Unsloth's inference mode before generating.
FastLanguageModel.for_inference(base_model)
FastLanguageModel.for_inference(ft_model)

##### Output Comparison

In [None]:
def generate_response(model, tokenizer, messages, max_new_tokens=256):
    """Generate a reply to `messages` with `model` and return the decoded text.

    Args:
        model: Model exposing `.device` and `.generate(...)`.
        tokenizer: Tokenizer paired with `model`; must provide
            `apply_chat_template` and `batch_decode`.
        messages: Chat messages as a list of {"role": ..., "content": ...} dicts.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The first decoded sequence returned by `generate`, decoded with
        `batch_decode` defaults (no special-token stripping requested).
    """
    # return_dict=True makes the tokenizer return the attention mask together
    # with input_ids. Deriving the mask by comparing ids against pad_token_id
    # breaks when the tokenizer has no pad token and would wrongly mask any
    # real prompt token whose id equals the pad id.
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    return tokenizer.batch_decode(outputs)[0]


# Single test prompt sent to both models.
messages = [
    {"role": "user", "content": "Mike is 30 years old, admitted on Sept 6, 2025 with severe headache. bp: 140/70."}
]

# Same prompt, same generation settings, for the base and the fine-tuned model.
base_response = generate_response(base_model, base_tokenizer, messages)
ft_response = generate_response(ft_model, ft_tokenizer, messages)


print("Base Model Output:\n", base_response)
print("\n\nFinetuned Model Output:\n", ft_response)