# 2. Model Fine-Tuning (Student Model)

This notebook fine-tunes a small model (`Qwen/Qwen2.5-3B-Instruct`) on the labeled dataset.
The goal is to produce a model that fits in 4GB RAM (via 4-bit quantization).

**Hardware for Training:**
- GPU: RTX 4070 Super (12GB VRAM).

**Hardware for Inference (Target):**
- CPU: 4 Cores
- RAM: 4GB

To achieve this, we will:
1. Load the 3B model.
2. Fine-tune with LoRA/QLoRA.
3. Merge adapters.
4. Export to GGUF format for efficient CPU inference.


In [2]:
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline
)
from trl import SFTTrainer, SFTConfig # Updated import
import json

# Configuration
MODEL_ID = "Qwen/Qwen2.5-3B-Instruct"
NEW_MODEL_NAME = "qwen2.5-3b-reminder-bot"
DATASET_FILE = "labeled_dataset.json"

# Check GPU
print(f"GPU Available: {torch.cuda.is_available()}")

GPU Available: True


In [3]:
# 1. Prepare Dataset
def format_instruction(sample):
    # Convert our JSON format to ChatML format for Qwen
    
    system_prompt = """Ты — система для извлечения параметров напоминаний.
Твоя задача: извлечь текст, дату, время и периодичность из сообщения пользователя и вернуть JSON.
Используй текущую дату (Context Date) для разрешения относительных дат."""
    
    user_content = f"Context Date: {sample['context_date']}\nMessage: \"{sample['input']}\"\n\nJSON:"
    assistant_content = json.dumps(sample['output'], ensure_ascii=False)
    
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_content},
        {"role": "assistant", "content": assistant_content}
    ]
    
    return {"messages": messages}

# Load JSON
try:
    dataset = load_dataset("json", data_files=DATASET_FILE, split="train")
    dataset = dataset.map(format_instruction)
    print(f"Dataset loaded: {len(dataset)} samples")
    
    # DEBUG: Inspect the first sample structure
    print("First sample messages type:", type(dataset[0]['messages']))
    print("First sample messages content:", dataset[0]['messages'])
    
except Exception as e:
    print(f"Error loading dataset: {e}")
    pass

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/4944 [00:00<?, ? examples/s]

Dataset loaded: 4944 samples
First sample messages type: <class 'list'>
First sample messages content: [{'content': 'Ты — система для извлечения параметров напоминаний.\nТвоя задача: извлечь текст, дату, время и периодичность из сообщения пользователя и вернуть JSON.\nИспользуй текущую дату (Context Date) для разрешения относительных дат.', 'role': 'system'}, {'content': 'Context Date: 2026-01-21T14:01:25.137854+00:00\nMessage: "[релизы]\nwiki.domrf.ru/pages/viewpage.action?pageid=267595334\n\nбанк:\n‼️21.01.2026  в 21-00 (среда)\nhttps://jira.domrf.ru/browse/integratio-4771\nелк-эскроу синхронизация данных из на \nбыла rfc-9906, её недокатили, катим сейчас без каба получается\n@hopheylalalei сделай страницу по этому релизу (по возможности, либо просто отпиши есть ли какие дополнительные задачи по конфигурированию в этом релизе) и пропиши ветки плз в табличке https://wiki.domrf.ru/pages/viewpage.action?pageid=315611369 \n\n [elk-facade] [athena-adapter] [athena-facade] \n\n@alexzodiac"

In [4]:
# 2. Load Base Model (Quantized for Training Efficiency)
# NOTE: Changing compute_dtype to bfloat16 to match the hardware capabilities (Ampere+)
# and satisfy the optimizer requirements if it decides to use bf16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16, # Changed from float16 to bfloat16
    bnb_4bit_use_double_quant=True,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

model.config.use_cache = False  # Silence warnings during training
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

print("Base model loaded.")



Loading weights:   0%|          | 0/434 [00:00<?, ?it/s]

Base model loaded.


In [5]:
# 3. LoRA Configuration
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"] 
)

# NOTE: We do NOT apply get_peft_model() here manually anymore.
# SFTTrainer will apply it automatically using the config above.
# If you run this cell, make sure 'model' is the raw base model from step 2.
print("LoRA config prepared.")

LoRA config prepared.


In [6]:
# 4. Training Arguments with SFTConfig
# In recent trl versions (like 0.28.0), arguments have changed slightly.
# 'max_seq_length' is now 'max_length' in SFTConfig.

training_args = SFTConfig(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    fp16=False,      # Disable fp16
    bf16=True,       # Enable bf16 (native support on RTX 4070)
    logging_steps=10,
    save_strategy="steps",
    save_steps=50,
    optim="paged_adamw_8bit",
    report_to="none",
    max_length=512, 
    dataset_text_field="text",
    packing=False 
)

# Debug wrapper for formatting func
def safe_formatting_func(example):
    # Check if we have a list of messages (batched) or single
    # datasets.map with batched=True usually provides a dict of lists
    if 'messages' not in example:
        raise ValueError("Feature 'messages' not found in dataset")
        
    outputs = []
    for msgs in example['messages']:
        # Ensure msgs is a list of dicts
        if not isinstance(msgs, list):
             # This might happen if dataset loading is weird
             print(f"Warning: msgs is {type(msgs)}")
             continue
        
        # Apply template
        try:
            text = tokenizer.apply_chat_template(msgs, tokenize=False)
            outputs.append(text)
        except Exception as e:
            print(f"Error formatting message: {e}")
            # Fallback or empty
            outputs.append("")
            
    return outputs

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    args=training_args,
    peft_config=peft_config,
    processing_class=tokenizer, 
    formatting_func=safe_formatting_func
)

# Start Training
print("Starting training...")
trainer.train()

# Save Adapters
trainer.model.save_pretrained(NEW_MODEL_NAME)
tokenizer.save_pretrained(NEW_MODEL_NAME)
print(f"Model adapters saved to {NEW_MODEL_NAME}")

Applying formatting function to train dataset:   0%|          | 0/4944 [00:00<?, ? examples/s]



Tokenizing train dataset:   0%|          | 0/4944 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/4944 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151645}.


Starting training...


Step,Training Loss
10,1.59646
20,0.837205
30,0.687365
40,0.568794
50,0.481584
60,0.449471
70,0.421026
80,0.366211
90,0.344257
100,0.332063


Model adapters saved to qwen2.5-3b-reminder-bot


In [7]:
# 5. Merge and Save (Optional but recommended for GGUF export)
# To export to GGUF, we need the merged model.
# Reload base model in FP16 (CPU or GPU if enough VRAM) to merge.
# Since 3B is small (6GB FP16), we can do this on 12GB GPU.

del model
del trainer
torch.cuda.empty_cache()

print("Merging model for export...")

from peft import PeftModel

base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="auto"
)

model = PeftModel.from_pretrained(base_model, NEW_MODEL_NAME)
model = model.merge_and_unload()

# Save merged model
merged_model_path = f"{NEW_MODEL_NAME}_merged"
model.save_pretrained(merged_model_path)
tokenizer.save_pretrained(merged_model_path)
print(f"Merged model saved to {merged_model_path}")

Merging model for export...


`torch_dtype` is deprecated! Use `dtype` instead!


Loading weights:   0%|          | 0/434 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Merged model saved to qwen2.5-3b-reminder-bot_merged


In [8]:
import os

# 6. Export to GGUF (for 4GB RAM Target)
# This cell automates the GGUF conversion process using llama.cpp

# Define paths
cwd = os.getcwd()
llama_cpp_dir = os.path.join(cwd, "llama.cpp")
merged_model_dir = "qwen2.5-3b-reminder-bot_merged"
gguf_model_path_f16 = "qwen2.5-3b-reminder-bot.gguf"
gguf_model_path_q4 = "qwen2.5-3b-reminder-bot-q4_k_m.gguf"

# 1. Clone llama.cpp if not present
if not os.path.exists(llama_cpp_dir):
    print("Cloning llama.cpp...")
    !git clone https://github.com/ggerganov/llama.cpp
else:
    print("llama.cpp already exists.")

# 2. Build llama.cpp with cmake
print("Building llama.cpp...")
!cd {llama_cpp_dir} && cmake -B build && cmake --build build --config Release -j4

# 3. Convert to GGUF (f16)
convert_script = os.path.join(llama_cpp_dir, "convert_hf_to_gguf.py")
if not os.path.exists(convert_script):
     # Some versions might have a different name or path, check standard one first
     pass

print(f"Converting {merged_model_dir} to GGUF...")
if not os.path.exists(merged_model_dir):
    print(f"Error: Merged model directory {merged_model_dir} not found!")
else:
    # Use standard python invocation for convert script.
    # Note: convert_hf_to_gguf.py is typically in the root of llama.cpp repo
    !python3 {llama_cpp_dir}/convert_hf_to_gguf.py {merged_model_dir} --outfile {gguf_model_path_f16} --outtype f16

# 4. Quantize to q4_k_m
# Binary location depends on build system (cmake puts it in build/bin)
quantize_bin = os.path.join(llama_cpp_dir, "build", "bin", "llama-quantize")

print(f"Quantizing to {gguf_model_path_q4}...")
if os.path.exists(gguf_model_path_f16):
    if os.path.exists(quantize_bin):
        !{quantize_bin} {gguf_model_path_f16} {gguf_model_path_q4} q4_k_m
        print("Quantization complete!")
    else:
        print(f"Error: Quantize binary not found at {quantize_bin}")
else:
    print("Error: GGUF file not found, cannot quantize.")


llama.cpp already exists.
Building llama.cpp...
[0mCMAKE_BUILD_TYPE=Release[0m
-- CMAKE_SYSTEM_PROCESSOR: x86_64
-- GGML_SYSTEM_ARCH: x86
-- Including CPU backend
-- x86 detected
-- Adding CPU backend variant ggml-cpu: -march=native 
-- ggml version: 0.9.7
-- ggml commit:  b55dcdef5
-- OpenSSL found: 3.0.18
-- Generating embedded license file for target: common
-- Configuring done (1.2s)
-- Generating done (0.4s)
-- Build files have been written to: /home/zodiac/stadygit/MLOps/ml_final/llama.cpp/build
[  0%] Built target build_info
[  1%] Built target sha256
[  2%] Built target xxhash
[  2%] Built target cpp-httplib
[  4%] Built target ggml-base
[  4%] Built target sha1
[  4%] Built target llama-llava-cli
[  5%] Built target llama-gemma3-cli
[  6%] Built target llama-minicpmv-cli
[  6%] Built target llama-qwen2vl-cli
[ 10%] Built target ggml-cpu
[ 11%] Built target ggml
[ 11%] Built target llama-gguf-hash
[ 12%] Built target llama-gguf
[ 44%] Built target llama
[ 44%] Built target te