In [None]:
!pip install -U tensorflow -q
!pip install -U unsloth vllm -q
!pip install bitsandbytes accelerate peft -q

In [None]:
import unsloth
from unsloth import FastModel, is_bfloat16_supported
from unsloth.chat_templates import get_chat_template, train_on_responses_only
import argparse
import logging
import sys
from transformers import TrainingArguments, DataCollatorForSeq2Seq
import os, glob, shutil, logging
import torch
from datasets import load_dataset
from huggingface_hub import login
from trl import SFTTrainer

In [None]:
# Checkpoint identifier handed to FastModel.from_pretrained when the base model is loaded.
MODEL_NAME = "unsloth/gemma-3-4b-it"
# Directory that receives the merged model and the GGUF export at the end of the notebook.
OUTPUT_DIR = "gemma-3-finetuned"

# Info about the system

In [None]:
# Report the PyTorch build and whether a CUDA device is visible to it.
print(
    f"PyTorch version: {torch.__version__}",
    f"CUDA available: {torch.cuda.is_available()}",
    sep="\n",
)

# Device name and total memory (in GB, base 10) are only queried when CUDA is usable.
if torch.cuda.is_available():
    print(
        f"GPU: {torch.cuda.get_device_name(0)}",
        f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB",
        sep="\n",
    )

In [None]:
# Load the base checkpoint named by MODEL_NAME together with its tokenizer.
model, tokenizer = FastModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=2048,
    load_in_4bit=True,      # 4-bit quantization to reduce memory
    load_in_8bit=False,     # 8-bit would be a bit more accurate but uses 2x memory
    full_finetuning=False,  # False: train adapters rather than all model weights
)

In [None]:
# Wrap the loaded model with LoRA adapters; `model` is rebound to the wrapped version.
model = FastModel.get_peft_model(
    model,
    # Which parts of the network are fine-tuned.
    finetune_vision_layers=False,     # off: this run is text-only
    finetune_language_layers=True,    # should stay on
    finetune_attention_modules=True,  # should stay on
    finetune_mlp_modules=True,        # should stay on
    # LoRA hyperparameters.
    r=8,             # larger = higher accuracy, but might overfit
    lora_alpha=8,    # recommended: alpha at least equal to r
    lora_dropout=0,
    bias="none",
    random_state=3407,
)

<a name="Data"></a>
# Data Prep
We now use the `Gemma-3` format for conversation-style finetunes. We use the [rewoo/planner_instruction_tuning_2k](https://huggingface.co/datasets/rewoo/planner_instruction_tuning_2k) dataset, in which each example is an <**Instruction, Input, Output**> triple.

Gemma-3 renders multi-turn conversations as shown below:

```
<bos><start_of_turn>user
Hello!<end_of_turn>
<start_of_turn>model
Hey there!<end_of_turn>
```

We use the `get_chat_template` function to get the correct chat template. Unsloth natively supports `zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, phi3, llama3, phi4, qwen2.5, gemma3` and more.

In [None]:
from unsloth.chat_templates import get_chat_template

# Rebind `tokenizer` to the version configured with the "gemma-3" chat template.
tokenizer = get_chat_template(tokenizer, chat_template="gemma-3")

# Bare expression so the notebook displays the tokenizer's repr.
tokenizer

In [None]:
from datasets import load_dataset

# Load the train split, keep only the first 100 rows to shorten training
# (drop the `.select(...)` step to use the full dataset), then hold out 10%
# for evaluation with a fixed seed so the split is reproducible.
dataset = (
    load_dataset("rewoo/planner_instruction_tuning_2k", split="train")
    .select(range(100))
    .train_test_split(test_size=0.1, seed=3407)
)

train_dataset, eval_dataset = dataset["train"], dataset["test"]

In [None]:
from transformers import AutoTokenizer

# NOTE(review): this rebinds `tokenizer`, replacing the object returned by
# get_chat_template(...) earlier in the notebook; every later cell uses this
# tokenizer instead. Confirm that the replacement is intended.
# The checkpoint is taken from MODEL_NAME so the tokenizer always matches the
# model being fine-tuned, instead of repeating the identifier as a literal.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# Display the chat template carried by the freshly loaded tokenizer.
tokenizer.get_chat_template()

In [None]:
def formatting_prompts_func(examples):
    """Convert a batch of dataset rows into chat-formatted training text.

    Each row becomes a two-turn conversation (user prompt, assistant answer)
    that is rendered to a single string with the chat template of the
    notebook-level `tokenizer`.

    Args:
        examples: Batched mapping with parallel "instruction", "input" and
            "output" sequences, as passed by `Dataset.map(..., batched=True)`.

    Returns:
        A dict with one key, "text", holding the rendered string for each row
        in the same order as the input batch.
    """
    texts = []

    for instr, inp, out in zip(examples["instruction"], examples["input"], examples["output"]):
        # Build the user prompt. A missing (None), empty or whitespace-only
        # input contributes nothing, so the instruction is used on its own.
        if inp and inp.strip():
            user_content = f"{instr}\n\nInput: {inp}"
        else:
            user_content = instr

        # Conversational format: one user turn followed by the expected answer.
        conversation = [
            {"role": "user", "content": user_content},
            {"role": "assistant", "content": out}
        ]

        # Render to plain text; no generation prompt is appended because the
        # assistant turn is already part of the conversation.
        text = tokenizer.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=False
        )
        texts.append(text)

    return {"text": texts}

# Render both splits to chat text, dropping every original column so that only
# the "text" field produced by formatting_prompts_func remains.
train_dataset, eval_dataset = (
    split.map(formatting_prompts_func, batched=True, remove_columns=split.column_names)
    for split in (train_dataset, eval_dataset)
)

In [None]:
# Print the first rendered training example to check the chat formatting by eye.
print(train_dataset[0]["text"])

# Start Training

In [None]:
# Training arguments
# Every hyperparameter is spelled out in this cell so that it runs on a fresh
# kernel without relying on names defined outside the notebook.
# NOTE(review): the numeric values below are starting points for the small
# 90/10-example split used here; tune them for a full-size run.
training_args = TrainingArguments(
    # Batch size and schedule (effective batch = 2 x 4 = 8 sequences per step).
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_ratio=0.03,
    num_train_epochs=1,
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    # Mixed precision: bf16 when the hardware supports it, fp16 otherwise.
    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    # Optimizer.
    optim="adamw_8bit",
    weight_decay=0.01,
    # Logging, evaluation and checkpointing. Saving uses the same strategy and
    # step interval as evaluation, which load_best_model_at_end needs in order
    # to restore the checkpoint with the lowest eval_loss.
    logging_steps=1,
    eval_strategy="steps",
    eval_steps=5,
    save_strategy="steps",
    save_steps=5,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    seed=3407,
    output_dir=OUTPUT_DIR,
    report_to="none",
    # Left at the TrainingArguments default.
    # NOTE(review): set to True if the run is memory-bound.
    gradient_checkpointing=False,
)

In [None]:
# Trainer
# Supervised fine-tuning over the "text" column produced by formatting_prompts_func.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="text",
    # Same limit as the one passed to FastModel.from_pretrained when loading the model.
    max_seq_length=2048,
    # Pads each batch; the padded length is rounded up to a multiple of 8.
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, pad_to_multiple_of=8),
    dataset_num_proc=2,
    # Each example stays its own sequence (no packing of several examples together).
    packing=False,
    args=training_args,
)

In [None]:
# Restrict training to the model's responses: `trainer` is rebound to the
# version returned by train_on_responses_only, which is told how a user turn
# and a model turn begin in the Gemma-3 chat format.
trainer = train_on_responses_only(
    trainer,
    response_part="<start_of_turn>model\n",
    instruction_part="<start_of_turn>user\n",
)

Decode one tokenized training example to check that the user input and the model output are rendered as separate turns:

In [None]:
# Decode the token ids of training example 1 back to text; as the last
# expression of the cell, the decoded string is displayed by the notebook.
tokenizer.decode(trainer.train_dataset[1]["input_ids"])

In the labels, only the model response is shown; every masked position appears as `[MASK]`:

In [None]:
# Decode the labels of training example 1 (the same example whose input_ids are
# decoded in the previous code cell). The index must stay inside the training
# split, which holds only 90 rows after the 100-row subset and 10% hold-out.
# Positions labelled -100 are swapped for the pad token before decoding and
# then rendered as "[MASK]" so the masked span is visible.
tokenizer.decode(
    [tokenizer.pad_token_id if x == -100 else x for x in trainer.train_dataset[1]["labels"]]
).replace(tokenizer.pad_token, "[MASK]")

In [None]:
# Training
# Run the fine-tuning loop; the value returned by trainer.train() is kept in
# `trainer_stats` so it can be inspected after the run.
print("Starting training...")
trainer_stats = trainer.train()
print("Training completed successfully!")

In [None]:
# Save model and artifacts
print("Saving model and artifacts...")

# Save the merged model (LoRA weights folded into the base model).
print("Merging LoRA weights into base model...")
model.save_pretrained_merged(OUTPUT_DIR, tokenizer)

# Export to GGUF (the llama.cpp format).
# OUTPUT_DIR is the Hugging Face folder containing config.json; other
# quantization_method values include "q4_k_m" and "q8_0".
print("Saving model in GGUF format...")
model.save_pretrained_gguf(OUTPUT_DIR, tokenizer, quantization_method="f16")