# Optimized fine-tuning with Unsloth

In [None]:
import digitalhub as dh

In [None]:
# Name of the project this notebook works in.
PROJECT_NAME = "ner-fine-tuning"
# Obtain a handle to that project; going by the function name, it is created
# if it does not exist yet.
project = dh.get_or_create_project(PROJECT_NAME)

In [None]:
%%writefile "train_model.py"

import os
import numpy as np
import json
from datasets import Dataset
from digitalhub_runtime_python import handler
from unsloth import FastLanguageModel
from trl import SFTTrainer, SFTConfig
from transformers import TrainingArguments

@handler()
def train(project):
    """Fine-tune a 4-bit Phi-3 mini model with LoRA adapters using Unsloth.

    Loads the quantized base model, reads prompt/response records from
    ``data/people_data.json``, renders each record through the tokenizer's
    chat template, attaches LoRA adapters and runs supervised fine-tuning
    with TRL's ``SFTTrainer``.

    Args:
        project: Project object passed in by the caller (presumably the
            digitalhub runtime -- TODO confirm). Not used by the body.

    Returns:
        None. Checkpoints and logs are written under ``outputs``.
    """
    # Single source of truth for the context length used by both the model
    # loader and the trainer.
    max_seq_length = 2048

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="unsloth/Phi-3-mini-4k-instruct-bnb-4bit",
        max_seq_length=max_seq_length,
        dtype=None,
        load_in_4bit=True,
    )

    # The JSON file must be parsed while the file handle is still open.
    with open("data/people_data.json", "r", encoding="utf-8") as f:
        data = json.load(f)
    ds = Dataset.from_list(data)

    def to_text(ex):
        """Render one prompt/response record as a chat-template string."""
        resp = ex["response"]
        # Structured responses are serialized to JSON text so that the
        # assistant turn is always a string.
        if not isinstance(resp, str):
            resp = json.dumps(resp, ensure_ascii=False)
        msgs = [
            {"role": "user", "content": ex["prompt"]},
            {"role": "assistant", "content": resp},
        ]
        return {
            "text": tokenizer.apply_chat_template(
                msgs, tokenize=False, add_generation_prompt=False
            )
        }

    # Keep only the rendered "text" column; the raw columns are dropped.
    dataset = ds.map(to_text, remove_columns=ds.column_names)

    # Config From GitHub (without seed)
    model = FastLanguageModel.get_peft_model(
        model,
        r=64,  # rank of matrices (for LoRA)
        target_modules=[
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
        ],  # which layers to inject LoRA into
        lora_alpha=64 * 2,  # scaling factor, usually 2x rank
        lora_dropout=0,  # no dropout, increase for regularization
        bias="none",  # bias stays frozen, only learn the low-rank matrices
        # Unsloth's custom checkpointing scheme: more compute, but less GPU
        # memory during backpropagation.
        use_gradient_checkpointing="unsloth",
    )

    trainer = SFTTrainer(  # supervised fine-tuning trainer
        model=model,
        train_dataset=dataset,
        tokenizer=tokenizer,
        dataset_text_field="text",
        max_seq_length=max_seq_length,
        args=SFTConfig(
            per_device_train_batch_size=2,  # each GPU reads 2 tokenized sequences at once
            # Accumulate gradients over 4 iterations before each optimizer
            # step -> effective batch size 2 * 4 = 8.
            gradient_accumulation_steps=4,
            warmup_steps=10,  # linearly "climb" to the learning rate from 0 in the first 10 steps
            # Total number of optimizer steps. A positive max_steps takes
            # precedence over num_train_epochs: the dataset is re-iterated
            # as often as needed until 60 steps have been performed.
            max_steps=60,
            logging_steps=1,  # log every single step
            output_dir="outputs",  # where to store checkpoints, logs etc.
            optim="adamw_8bit",  # 8-bit AdamW optimizer
            # Overridden by max_steps above; only takes effect if max_steps
            # is removed or set to a non-positive value.
            num_train_epochs=3,
        ),
    )
    trainer.train()

    

In [None]:
# Switch the fine-tuned model into Unsloth's inference mode.
# NOTE(review): `FastLanguageModel`, `model` and `tokenizer` are not defined
# by this cell; they must already exist in the notebook session -- confirm.
FastLanguageModel.for_inference(model)

# A single-turn conversation used as a smoke test for the fine-tuned model.
messages = [
    {
        "role": "user",
        "content": "Mike is 30 years old, loves hiking and works as a coder.",
    },
]

# Render the conversation with the chat template, tokenize it into a PyTorch
# tensor, then move that tensor to the GPU.
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")

# Sampling settings: at most 512 new tokens, temperature 0.7, and nucleus
# sampling that keeps the smallest token set whose cumulative probability
# reaches 0.9.
generation_kwargs = {
    "max_new_tokens": 512,
    "use_cache": True,
    "temperature": 0.7,
    "do_sample": True,
    "top_p": 0.9,
}
outputs = model.generate(input_ids=inputs, **generation_kwargs)

# Decode the generated batch and keep its only sequence.
response = tokenizer.batch_decode(outputs)[0]

print(response)