# Preparing the dataset for fine-tuning

In [30]:
from datasets import load_dataset

# Fetch the "imdb" dataset; its "train" and "test" splits are used further down.
raw_datasets = load_dataset(path="imdb")


In [8]:
from transformers import AutoTokenizer

# Build the tokenizer once, outside the mapped function: `map(batched=True)` calls
# `tokenize_function` once per batch, so creating it inside the function would
# re-run `from_pretrained` for every batch.
BERT_CHECKPOINT = "bert-base-cased"
bert_tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)


def tokenize_function(example):
    """Tokenize the ``text`` column of a batch of examples.

    Args:
        example: A batch as passed by ``map(batched=True)``; ``example["text"]``
            holds the raw strings to tokenize.

    Returns:
        The tokenizer output for the batch, with every sequence padded or
        truncated to exactly 128 tokens.
    """
    return bert_tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )


tokenized_dataset = raw_datasets.map(tokenize_function, batched=True)


In [9]:
# Full tokenized splits, plus fixed-seed 1000-example subsets of each for quick runs.
full_train_ds, full_eval_ds = (tokenized_dataset[split] for split in ("train", "test"))
small_train_ds = full_train_ds.shuffle(seed=42).select(range(1000))
small_eval_ds = full_eval_ds.shuffle(seed=42).select(range(1000))

# Fine-tuning in PyTorch with the Trainer API

In [10]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Pretrained BERT body with a sequence-classification head for two labels.
checkpoint = "bert-base-cased"
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
import numpy as np
import evaluate

# Accuracy metric loaded through the `evaluate` library; used by compute_metrics below.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score model outputs with the accuracy metric.

    Args:
        eval_pred: A two-element ``(logits, labels)`` pair.

    Returns:
        The result of ``metric.compute`` for the arg-max class of each row of
        logits compared against the labels.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return metric.compute(references=gold, predictions=predicted)
    
    

# Five epochs, evaluating at the end of each one; checkpoints go to "ft_model".
training_args = TrainingArguments(
    output_dir="ft_model",
    num_train_epochs=5,
    eval_strategy="epoch",
)

# Train and evaluate on the 1000-example subsets, reporting accuracy via compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=small_train_ds,
    eval_dataset=small_eval_ds,
)

Downloading builder script: 0.00B [00:00, ?B/s]

In [12]:
# Fine-tune on `small_train_ds`; validation metrics are reported once per epoch (eval_strategy="epoch").
trainer.train()

  return FileStore(store_uri, store_uri)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.479302,0.801
2,No log,0.876515,0.771
3,No log,0.725097,0.822
4,0.291900,0.834087,0.825
5,0.291900,0.951187,0.82


TrainOutput(global_step=625, training_loss=0.23629844341278075, metrics={'train_runtime': 58.0811, 'train_samples_per_second': 86.086, 'train_steps_per_second': 10.761, 'total_flos': 328888819200000.0, 'train_loss': 0.23629844341278075, 'epoch': 5.0})

In [13]:
# Score the fine-tuned model on the evaluation subset handed to the Trainer (`small_eval_ds`).
trainer.evaluate()

{'eval_loss': 0.9511868357658386,
 'eval_accuracy': 0.82,
 'eval_runtime': 2.7835,
 'eval_samples_per_second': 359.256,
 'eval_steps_per_second': 44.907,
 'epoch': 5.0}

# Optimized fine-tuning with Unsloth

In [None]:
from unsloth import FastLanguageModel

# Load the 4-bit Phi-3 mini instruct checkpoint together with its tokenizer.
# NOTE: this rebinds `model`, replacing the BERT classifier loaded earlier in the notebook.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Phi-3-mini-4k-instruct-bnb-4bit",
    load_in_4bit=True,
    max_seq_length=2048,
    dtype=None,  # no explicit dtype requested — presumably Unsloth picks one itself; confirm in its docs
)

==((====))==  Unsloth 2025.11.6: Fast Mistral patching. Transformers: 4.57.2.
   \\   /|    Tesla V100-SXM2-32GB. Num GPUs = 1. Max memory: 31.733 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu128. CUDA: 7.0. CUDA Toolkit: 12.8. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [15]:
import json
from datasets import Dataset

# Read the prompt/response records from disk and wrap them in a `Dataset`.
with open("data/people_data.json", encoding="utf-8") as people_file:
    data = json.load(people_file)

ds = Dataset.from_list(data)

def to_text(ex):
    """Render one prompt/response record as a single chat-formatted string.

    Args:
        ex: A record with ``"prompt"`` and ``"response"`` entries. A response
            that is not already a string is serialized to JSON text first.

    Returns:
        A dict with one key, ``"text"``, holding the user/assistant exchange
        rendered by the tokenizer's chat template (untokenized, with no
        generation prompt appended).
    """
    raw_response = ex["response"]
    answer = (
        raw_response
        if isinstance(raw_response, str)
        else json.dumps(raw_response, ensure_ascii=False)
    )
    conversation = [
        {"role": "user", "content": ex["prompt"]},
        {"role": "assistant", "content": answer},
    ]
    rendered = tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=False,
    )
    return {"text": rendered}


# Replace every original column with the single rendered "text" column.
dataset = ds.map(to_text, remove_columns=ds.column_names)

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [16]:
# LoRA configuration taken from GitHub (no explicit seed is passed).
# NOTE: `model` is rebound to its own wrapped version, so run this cell only once per loaded model.
model = FastLanguageModel.get_peft_model(
    model,
    r=64,  # rank of the LoRA matrices
    lora_alpha=64 * 2,  # scaling factor, usually 2x the rank
    lora_dropout=0,  # no dropout; increase for regularization
    bias="none",  # bias stays frozen, only the low-rank matrices are learned
    target_modules=[  # layers that receive LoRA adapters
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    # Unsloth's custom checkpointing scheme: more compute, less GPU memory during backprop.
    use_gradient_checkpointing="unsloth",
)

Unsloth 2025.11.6 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
from trl import SFTTrainer, SFTConfig
from transformers import TrainingArguments

# Supervised fine-tuning on the rendered "text" column of `dataset`.
# NOTE: this rebinds `trainer`, replacing the BERT Trainer built earlier in the notebook.
trainer = SFTTrainer(
    model = model,
    train_dataset = dataset,
    tokenizer = tokenizer,
    dataset_text_field = 'text',
    max_seq_length = 2048,
    args = SFTConfig(
        per_device_train_batch_size = 2,  # each GPU reads 2 tokenized sequences at once
        gradient_accumulation_steps = 4,  # accumulate gradients over 4 batches per optimizer step -> effective batch 2 * 4 = 8
        warmup_steps = 10,  # linearly ramp the learning rate up from 0 over the first 10 steps
        # A positive max_steps fixes the run length at exactly 60 optimizer steps and
        # overrides num_train_epochs, so num_train_epochs is deliberately not set here
        # (it would be ignored, not act as an "epochs or steps, whichever comes first" limit).
        max_steps = 60,
        logging_steps = 1,  # log every single step
        output_dir = "outputs",  # where to store checkpoints, logs etc.
        optim = "adamw_8bit",  # 8-bit AdamW optimizer
    ),
)

trainer.train()

Unsloth: Tokenizing ["text"] (num_proc=64):   0%|          | 0/300 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 300 | Num Epochs = 2 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 119,537,664 of 3,940,617,216 (3.03% trained)


Step,Training Loss
1,2.5389
2,2.4953
3,2.4146
4,2.4759
5,2.5492
6,2.2056
7,2.3067
8,2.1339
9,2.073
10,1.7677


TrainOutput(global_step=60, training_loss=1.1442357003688812, metrics={'train_runtime': 130.6207, 'train_samples_per_second': 3.675, 'train_steps_per_second': 0.459, 'total_flos': 833723935469568.0, 'train_loss': 1.1442357003688812, 'epoch': 1.5866666666666667})

In [31]:
# Put the fine-tuned model into Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

messages = [
    {"role": "user", "content": "Mike is 30 years old, loves hiking and works as a coder."},
]

# Render the chat with the generation prompt appended, tokenize to a tensor, and move it to the GPU.
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_tensors="pt",
).to("cuda")

# Sample at most 512 new tokens: temperature 0.7, nucleus sampling over the smallest
# token set whose cumulative probability is >= 0.9.
outputs = model.generate(
    input_ids=inputs,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    max_new_tokens=512,
    use_cache=True,
)

# Decode the batch and keep its first (only) sequence.
response = tokenizer.batch_decode(outputs)[0]

print(response)

OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 31.73 GiB of which 20.50 MiB is free. Process 1148060 has 6.06 GiB memory in use. Process 1397764 has 25.64 GiB memory in use. Of the allocated memory 5.55 GiB is allocated by PyTorch, and 122.32 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Save the fine-tuned model and tokenizer under "finetuned_model".
# NOTE(review): with save_method="lora" this presumably writes only the LoRA adapters
# rather than merged weights — confirm against the Unsloth saving docs.
model.save_pretrained_merged("finetuned_model", tokenizer, save_method = "lora")
# GGUF export is kept disabled: on this machine it failed while installing llama.cpp
# because cmake was missing and apt-get was not available to install it.
#model.save_pretrained_gguf("gguf_model_scratch_fixed", tokenizer, quantization_method="q4_k_m", maximum_memory_usage = 0.3)

Unsloth: Merging model weights to 16-bit format...


config.json:   0%|          | 0.00/724 [00:00<?, ?B/s]

Found HuggingFace hub cache directory: /home/acelepija/.cache/huggingface/hub


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Checking cache directory for required files...
Cache check failed: model-00001-of-00002.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.
Checking cache directory for required files...
Cache check failed: tokenizer.model not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Preparing safetensor model files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files:  50%|█████     | 1/2 [02:38<02:38, 158.16s/it]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.65G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files: 100%|██████████| 2/2 [03:49<00:00, 114.71s/it]
Unsloth: Merging weights into 16bit: 100%|██████████| 2/2 [00:22<00:00, 11.44s/it]


Unsloth: Merge process complete. Saved to `/home/acelepija/enhanced_mlops/framework/library/use_cases/nlp/src/local_platform/experiments/gguf_model_scratch_fixed`
Unsloth: Converting to GGUF format...
==((====))==  Unsloth: Conversion from HF to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF f16 might take 3 minutes.
\        /    [2] Converting GGUF f16 to ['q4_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: Updating system package directories
Unsloth: Missing packages: cmake
Unsloth: Will attempt to install missing system packages.
Unsloth: Installing packages: cmake


RuntimeError: Unsloth: GGUF conversion failed: [FAIL] Unsloth: apt-get does not exist when installing cmake? Is this NOT a Linux / Mac based computer?