In [2]:
# !pip install transformers
# !pip install bitsandbytes accelerate peft
# !pip install datasets
# !pip install trl

In [3]:
import argparse
import logging
import sys
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoProcessor, BitsAndBytesConfig
import os, glob, shutil, logging
import torch
from datasets import load_dataset
from trl import SFTTrainer

2025-10-27 16:38:40.394684: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Skipping import of cpp extensions due to incompatible torch version 2.8.0+cu128 for torchao version 0.14.1             Please see https://github.com/pytorch/ao/issues/2919 for more info


In [4]:
if torch.cuda.is_available():
    device = "cuda"
    # Check if GPU benefits from bfloat16
    if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8:
        torch_dtype = torch.bfloat16
    else:
        torch_dtype = torch.float16

elif getattr(torch.backends, 'mps', None) and torch.backends.mps.is_available():
    device = "mps"
    torch_dtype = torch.float32
else:
    device = "cpu"
    torch_dtype = torch.float32

device

'cuda'

In [5]:
OUTPUT_DIR = "gemma-3-finetuned"
MODEL_NAME = "google/gemma-3-4b-it"

# Info about the system

In [6]:
# Log system info
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

PyTorch version: 2.8.0+cu128
CUDA available: True
GPU: NVIDIA L4
GPU memory: 23.6 GB


In [7]:
# BitsAndBytesConfig: Enables 4-bit quantization to reduce model size/memory usage
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_quant_storage=torch_dtype,
)

In [8]:
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    dtype="auto",                          # What torch dtype to use, defaults to auto
    device_map="auto",                     # Let torch decide how to load the model
    load_in_8bit=False,
    quantization_config=quantization_config # Enable 4-bit quantization
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [10]:
# Se è un VLM carica il processor dal modello base
processor = AutoProcessor.from_pretrained(MODEL_NAME, use_fast=True)

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
from peft import LoraConfig, TaskType

# Parametri LoRA adattati per memoria MPS
rank_dimension = 8        # ridotto da 16
lora_alpha = 32           # ridotto da 64
lora_dropout = 0.05

target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj"
]

peft_config = LoraConfig(
    r=rank_dimension,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=target_modules,
    task_type=TaskType.CAUSAL_LM,
)

<a name="Data"></a>
# Data Prep
We now use the `Gemma-3` format for conversation style finetunes. We use [rewoo/planner_instruction_tuning_2k](https://huggingface.co/datasets/rewoo/planner_instruction_tuning_2k) dataset composed of <**Instruction, Input, Output**>.

Gemma-3 renders multi turn conversations like below:

```
<bos><start_of_turn>user
Hello!<end_of_turn>
<start_of_turn>model
Hey there!<end_of_turn>
```

We use `get_chat_template` function to get the correct chat template. Unsloth natively supports `zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, phi3, llama3, phi4, qwen2.5, gemma3` and more.

In [12]:
from datasets import load_dataset
dataset = load_dataset("rewoo/planner_instruction_tuning_2k", split = "train")

# To reduce the training time, we will use a smaller dataset. You can remove this line to use the full dataset.
# dataset = dataset.select(range(100))

dataset = dataset.train_test_split(test_size=0.1, seed=3407)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

In [13]:
print(tokenizer.get_chat_template())

{{ bos_token }}
{%- if messages[0]['role'] == 'system' -%}
    {%- if messages[0]['content'] is string -%}
        {%- set first_user_prefix = messages[0]['content'] + '

' -%}
    {%- else -%}
        {%- set first_user_prefix = messages[0]['content'][0]['text'] + '

' -%}
    {%- endif -%}
    {%- set loop_messages = messages[1:] -%}
{%- else -%}
    {%- set first_user_prefix = "" -%}
    {%- set loop_messages = messages -%}
{%- endif -%}
{%- for message in loop_messages -%}
    {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
        {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
    {%- endif -%}
    {%- if (message['role'] == 'assistant') -%}
        {%- set role = "model" -%}
    {%- else -%}
        {%- set role = message['role'] -%}
    {%- endif -%}
    {{ '<start_of_turn>' + role + '
' + (first_user_prefix if loop.first else "") }}
    {%- if message['content'] is string -%}
        {{ message['content'] | trim }}


In [13]:
def formatting_prompts_func(examples):
    """Converte il dataset in formato conversazionale Gemma-3"""
    texts = []
    
    for instr, inp, out in zip(examples["instruction"], examples["input"], examples["output"]):
        # Costruisci il prompt utente
        if inp.strip():
            user_content = f"{instr}\n\nInput: {inp}"
        else:
            user_content = instr
        
        # Formato conversazionale
        conversation = [
            {"role": "user", "content": user_content},
            {"role": "assistant", "content": out}
        ]

        text = tokenizer.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=False
        )
        texts.append(text)
    
    return {"text": texts}

train_dataset = train_dataset.map(formatting_prompts_func, batched=True, remove_columns=train_dataset.column_names)
eval_dataset = eval_dataset.map(formatting_prompts_func, batched=True, remove_columns=eval_dataset.column_names)

In [15]:
print(train_dataset[0]['text'])

<bos><start_of_turn>user
For the following tasks, make plans that can solve the problem step-by-step. For each plan, indicate which external tool together with tool input to retrieve evidence. You can store the evidence into a variable #E that can be called by later tools. (Plan, #E1, Plan, #E2, Plan, ...)

Tools can be one of the following:
Wikipedia[input]: Worker that search for similar page contents from Wikipedia. Useful when you need to get holistic knowledge about people, places, companies, historical events, or other subjects. The response are long and might contain some irrelevant information. Input should be a search query.
LLM[input]: A pretrained LLM like yourself. Useful when you need to act with general world knowledge and common sense. Prioritize it when you are confident in solving the problem yourself. Input can be any instruction.

Input: Nick Moran is an English actor, writer, and producer who appeard as Scabior in what 2010 British-American fantasy film?<end_of_turn

# Start Training

In [16]:
from trl import SFTConfig, SFTTrainer

per_device_train_batch_size = 1
per_device_eval_batch_size = 1
gradient_accumulation_steps = 8  # aumentato per ridurre memoria
logging_steps = 10
learning_rate = 1e-4

max_grad_norm = 1.0
num_train_epochs = 3
warmup_ratio = 0.1
lr_scheduler_type = "cosine"
max_seq_length = 1024  # ridotto da 1500 per memoria MPS

# Nota: su MPS accelerate non supporta fp16 mixed precision automatico.
# Manteniamo il modello caricato in float16 ma disabilitiamo il flag fp16/bf16 nel trainer.
training_arguments = SFTConfig(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    save_strategy="no",
    eval_strategy="epoch",
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    max_grad_norm=max_grad_norm,
    weight_decay=0.1,
    warmup_ratio=warmup_ratio,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard",
    bf16=False,
    fp16=False,  # disattivato per evitare errore accelerate su MPS
    hub_private_repo=False,
    push_to_hub=False,
    num_train_epochs=num_train_epochs,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    packing=True
)

In [17]:
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,
    peft_config=peft_config,
)

Padding-free training is enabled, but the attention implementation is not set to a supported flash attention variant. Padding-free training flattens batches into a single sequence, and only the following implementations are known to reliably support this: flash_attention_2, flash_attention_3, kernels-community/flash-attn, kernels-community/flash-attn3, kernels-community/vllm-flash-attn3. Using other implementations may lead to unexpected behavior. To ensure compatibility, set `attn_implementation` in the model configuration to one of these supported options or verify that your attention mechanism can handle flattened sequences.
You are using packing, but the attention implementation is not set to a supported flash attention variant. Packing gathers multiple samples into a single sequence, and only the following implementations are known to reliably support this: flash_attention_2, flash_attention_3, kernels-community/flash-attn, kernels-community/flash-attn3, kernels-community/vllm-fla

In [18]:
trainer.train()

Epoch,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
1,2.1529,0.260286,0.267382,554638.0,0.942252
2,1.7765,0.248613,0.238063,1109276.0,0.944805
3,1.7938,0.250881,0.218696,1663914.0,0.945018


TrainOutput(global_step=225, training_loss=4.165387308332655, metrics={'train_runtime': 3259.6774, 'train_samples_per_second': 0.547, 'train_steps_per_second': 0.069, 'total_flos': 3.634489985839776e+16, 'train_loss': 4.165387308332655, 'epoch': 3.0})

In [19]:
messages = [{
    "role": "system",
    "content": """For the following tasks, make plans that can solve the problem step-by-step. For each plan, indicate which external tool together with tool input to retrieve evidence. You can store the evidence into a variable #E that can be called by later tools. (Plan, #E1, Plan, #E2, Plan, ...)
 
Tools can be one of the following:
Wikipedia[input]: Worker that search for similar page contents from Wikipedia. Useful when you need to get holistic knowledge about people, places, companies, historical events, or other subjects. The response are long and might contain some irrelevant information. Input should be a search query.
LLM[input]: A pretrained LLM like yourself. Useful when you need to act with general world knowledge and common sense. Prioritize it when you are confident in solving the problem yourself. Input can be any instruction.
 """,
    },
         {
    "role": "user",
    "content": """Who is Pelé?""",
    }
]
text = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True, # Must add for generation,
    tokenize=False
).removeprefix('<bos>')

outputs = model.generate(
    **tokenizer([text], return_tensors = "pt").to("cuda"),
    max_new_tokens = 300, # Increase for longer outputs!
    # Recommended Gemma-3 settings!
    temperature = 1, top_p = 0.95, top_k = 64,
)

output_finetuning = tokenizer.batch_decode(outputs)[0]
print(output_finetuning)

<bos><start_of_turn>user
For the following tasks, make plans that can solve the problem step-by-step. For each plan, indicate which external tool together with tool input to retrieve evidence. You can store the evidence into a variable #E that can be called by later tools. (Plan, #E1, Plan, #E2, Plan, ...)
 
Tools can be one of the following:
Wikipedia[input]: Worker that search for similar page contents from Wikipedia. Useful when you need to get holistic knowledge about people, places, companies, historical events, or other subjects. The response are long and might contain some irrelevant information. Input should be a search query.
LLM[input]: A pretrained LLM like yourself. Useful when you need to act with general world knowledge and common sense. Prioritize it when you are confident in solving the problem yourself. Input can be any instruction.
 

Who is Pelé?<end_of_turn>
<start_of_turn>model
Plan: Search for more information about Pelé
#E1 = Wikipedia[Pelé]
Plan: Identify Pelé's

In [None]:
trainer.model.save_pretrained("outputs/adapter")
tokenizer.save_pretrained("outputs/adapter")
processor.save_pretrained("outputs/adapter")

In [21]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Carica il modello base in float16 (non quantizzato)
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    device_map="auto"
)

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [24]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

SAVE_DIR = "outputs/merged-model"

# Carica il modello base in float16 (non quantizzato)
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    device_map="auto"
)

# 2. Applica l'adapter LoRA
model = PeftModel.from_pretrained(base_model, "outputs/adapter")

# 3. Esegui il merge e scarica gli adapter
model = model.merge_and_unload()

# 4. Salva il modello mergiato
model.save_pretrained(SAVE_DIR)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.save_pretrained(SAVE_DIR)

processor = AutoProcessor.from_pretrained(MODEL_NAME, use_fast=True)
processor.save_pretrained(SAVE_DIR)

print(f"✅ Modello salvato fuso in: {SAVE_DIR}")



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Modello salvato fuso in: outputs/merged-model


In [None]:
from huggingface_hub import HfApi, create_repo
import os

# Nome del repository su Hugging Face
repo_name = f"HUGGING-FACE-USERNAME/MODEL-NAME"  # Sostituisci con il tuo username e il nome che vuoi dare al modello

# Crea il repository su Hugging Face
api = HfApi()
create_repo(repo_name, private=False)

# Prepara i metadati per SageMaker
model_card = f"""
---
tags:
- text-generation
- pytorch
library_name: transformers
pipeline_tag: text-generation
inference: true
deployment: sagemaker
---

# Model Card for {repo_name}

Questo modello è una versione fine-tuned di Gemma-3 per la generazione di testo.
Il modello è stato addestrato per generare piani step-by-step utilizzando strumenti esterni.

## Uso con SageMaker

Il modello è configurato per essere deployato su Amazon SageMaker.

## Dataset
Il modello è stato addestrato su [rewoo/planner_instruction_tuning_2k](https://huggingface.co/datasets/rewoo/planner_instruction_tuning_2k).

## Training
- Base model: unsloth/gemma-3-4b-it
- Training framework: PyTorch con LoRA
- Hardware: NVIDIA L4 GPU
"""

# Salva il model card
with open(f"{SAVE_DIR}/README.md", "w") as f:
    f.write(model_card)

# Push su Hugging Face
api.upload_folder(
    folder_path=SAVE_DIR,
    repo_id=repo_name,
    commit_message="Add fine-tuned model with SageMaker configuration"
)

print(f"Modello caricato con successo su: https://huggingface.co/{repo_name}")
