## Mount Google Drive

In [None]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


## Install Dependencies

In [None]:
!pip install unsloth

Collecting unsloth
  Downloading unsloth-2025.9.1-py3-none-any.whl.metadata (52 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/52.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.3/52.3 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unsloth_zoo>=2025.9.1 (from unsloth)
  Downloading unsloth_zoo-2025.9.2-py3-none-any.whl.metadata (9.5 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.32.post2-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (1.1 kB)
Collecting bitsandbytes (from unsloth)
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.9.31-py3-none-any.whl.metadata (11 kB)
Collecting datasets<4.0.0,>=3.4.1 (from unsloth)
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting trl!=0.15.0,!=0.19.0,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3,>=0.7.9 (from unsloth)
  Do

## Import Libraries

In [None]:
import json
from datasets import Dataset # This gives the dataset class to tranform our json data into hugging face dataset
from unsloth import FastLanguageModel
from trl import SFTTrainer, SFTConfig
from transformers import TrainingArguments

from dotenv import load_dotenv
import os

load_dotenv("/content/drive/MyDrive/.env")
print(os.getenv("WANDB_API_KEY"))


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
f111083f50f6309c930fc9da1726e8aee6da1eec


In [None]:
import wandb
wandb.login(key=os.getenv("WANDB_API_KEY"))

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mrkbeo5[0m ([33mrkbeo5-personal[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

## Load Base Model and Tokenizer


In [None]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = 'unsloth/Phi-3-mini-4k-instruct-bnb-4bit',
    max_seq_length = 2048,
    dtype = None,
    load_in_4bit = True
)

==((====))==  Unsloth 2025.9.1: Fast Mistral patching. Transformers: 4.56.0.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.26G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/194 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/458 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

## Converting JSON Data into a Chat-Formatted Dataset provided by the Tokenize

In [None]:
with open("/content/drive/MyDrive/patient_admission.json", "r", encoding="utf-8") as f:
    data = json.load(f)

ds = Dataset.from_list(data) # Huggingface dataset object

def to_text(ex):
    resp = ex["response"]
    if not isinstance(resp, str):
        resp = json.dumps(resp, ensure_ascii=False) # "response" dictionary is converted into JSON string
    msgs = [
        {"role": "user", "content": ex["prompt"]}, # for the prompt role will be user, content --> our prompt
        {"role": "assistant", "content": resp}, # for the response role will be assistant, content --> our response in json string
    ]
    return {
        "text": tokenizer.apply_chat_template(
            msgs, tokenize=False, add_generation_prompt=False
        )                          # this is where our json data is wrapped with the model's chat template
    }

dataset = ds.map(to_text, remove_columns=ds.column_names) # dromp columns drops original columns (prompt, respone)

print("=== BEFORE (raw JSON row) ===")
print(ds[0])   # has "prompt" and "response"

print("\n=== AFTER (chat template applied) ===")
print(dataset[0]["text"])

Map:   0%|          | 0/234 [00:00<?, ? examples/s]

=== BEFORE (raw JSON row) ===
{'prompt': 'With joint pain, Rohit, aged 24 was admitted on 5 Sept whose BP was at 140/70.', 'response': {'age': '24', 'bp': '140/70', 'name': 'Rohit', 'symptom': 'joint pain'}}

=== AFTER (chat template applied) ===
<|user|>
With joint pain, Rohit, aged 24 was admitted on 5 Sept whose BP was at 140/70.<|end|>
<|assistant|>
{"age": "24", "bp": "140/70", "name": "Rohit", "symptom": "joint pain"}<|end|>
<|endoftext|>


## Load PEFT (qLoRA) Model

In [None]:
# Default LoRA Configuration from github
model = FastLanguageModel.get_peft_model(
    model,
    r = 64,  # rank of matrices (for LoRA)
    target_modules=[
        'q_proj', 'k_proj', 'v_proj', 'o_proj',
        'gate_proj', 'up_proj', 'down_proj',
    ],  # which layers to inject LoRA into
    lora_alpha = 64 * 2,  # scaling factor, usually 2x rank
    lora_dropout = 0,  # no regularization, but still since LoRA is often small there is no risk of overfitting
    bias = 'none',  # bias stays frozen, only learn the low-rank matrices
    use_gradient_checkpointing = 'unsloth',  # activate custom checkpointing scheme of Unsloth -> higher compute but less GPU memory when backpropagating
)

Unsloth 2025.9.1 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


## Trainer

In [None]:
trainer = SFTTrainer(  # supervised fine-tuning trainer
    model = model,
    train_dataset = dataset,
    tokenizer = tokenizer,
    dataset_text_field = 'text', # tells trainer which column in the dataset contains text sequences.
    max_seq_length = 2048,
    args = SFTConfig(
        per_device_train_batch_size = 2,  # each GPU reads 2 tokenized sequences at once
        gradient_accumulation_steps = 4,  # accumulate loss for 4 iterations before optimizer step -> effective batch 2 * 4 = 8
        warmup_steps = 10,  # linearly "climb" to the learning rate from 0 in the first 10 steps
        max_steps = 60,  # max steps before stopping (unless epochs out before that)
        logging_steps = 1,  # log every single step
        output_dir = "finetuned_model",  # where to store checkpoints, logs etc.
        optim = "adamw_8bit",  # 8-bit AdamW optimizer
        num_train_epochs = 3,  # number of epochs, unless we reach 60 steps first
        report_to=[])
)
trainer.train()

Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/234 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 234 | Num Epochs = 2 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 119,537,664 of 3,940,617,216 (3.03% trained)


Step,Training Loss
1,2.481
2,2.2183
3,2.1606
4,2.1264
5,2.1053
6,2.0916
7,1.8593
8,1.6878
9,1.4403
10,1.3187


TrainOutput(global_step=60, training_loss=0.7653514718015989, metrics={'train_runtime': 192.6611, 'train_samples_per_second': 2.491, 'train_steps_per_second': 0.311, 'total_flos': 1380948919713792.0, 'train_loss': 0.7653514718015989, 'epoch': 2.0})

In [None]:
FastLanguageModel.for_inference(model)

messages = [
    {
        "role": "user",
        "content": "Mike is 30 years old, admitted on sept 6, 2025. bp: 140/70."
    },
]

inputs = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")

outputs = model.generate(input_ids=inputs, max_new_tokens=512, use_cache=True, temperature=0.7, do_sample=True, top_p=0.9)

response = tokenizer.batch_decode(outputs)[0]

print(response)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


<|user|> Mike is 30 years old, admitted on sept 6, 2025. bp: 140/70.<|end|><|assistant|> {"age": "30", "bp": "140/70", "name": "Mike", "symptom": ""}<|end|>
