In [1]:
from unsloth import FastModel
import torch
from unsloth.chat_templates import get_chat_template
from datasets import load_dataset
from unsloth.chat_templates import standardize_data_formats
from trl import SFTTrainer, SFTConfig
from unsloth.chat_templates import train_on_responses_only

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!


In [None]:
import wandb

wandb.login()

%env WANDB_WATCH=all
%env WANDB_SILENT=true

### Unsloth

`FastModel` supports loading nearly any model now! This includes Vision and Text models!

In [2]:

fourbit_models = [
    # 4bit dynamic quants for superior accuracy and low memory use
    "unsloth/gemma-3-1b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-4b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-12b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-27b-it-unsloth-bnb-4bit",

    # Other popular models!
    "unsloth/Llama-3.1-8B",
    "unsloth/Llama-3.2-3B",
    "unsloth/Llama-3.3-70B",
    "unsloth/mistral-7b-instruct-v0.3",
    "unsloth/Phi-4",
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastModel.from_pretrained(
    model_name = "unsloth/gemma-3-4b-it-unsloth-bnb-4bit",
    max_seq_length = 2048, # Choose any for long context!
    load_in_4bit = True,  # 4 bit quantization to reduce memory
    load_in_8bit = False, # [NEW!] A bit more accurate, uses 2x memory
    full_finetuning = False, # [NEW!] We have full finetuning now!
    # token = "hf_...", # use one if using gated models
)

==((====))==  Unsloth 2025.3.19: Fast Gemma3 patching. Transformers: 4.50.3.
   \\   /|    NVIDIA GeForce RTX 4060 Ti. Num GPUs = 1. Max memory: 15.697 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


We now add LoRA adapters so we only need to update a small amount of parameters!

In [3]:
model = FastModel.get_peft_model(
    model,
    finetune_vision_layers     = False, # Turn off for just text!
    finetune_language_layers   = True,  # Should leave on!
    finetune_attention_modules = True,  # Attention good for GRPO
    finetune_mlp_modules       = True,  # SHould leave on always!

    r = 64,           # Larger = higher accuracy, but might overfit
    lora_alpha = 64,  # Recommended alpha == r at least
    lora_dropout = 0,
    bias = "none",
    random_state = 3407,
)

Unsloth: Making `model.base_model.model.language_model.model` require gradients


<a name="Data"></a>
### Data Prep
We now use the `Gemma-3` format for conversation style finetunes. We use [Maxime Labonne's FineTome-100k](https://huggingface.co/datasets/mlabonne/FineTome-100k) dataset in ShareGPT style. Gemma-3 renders multi turn conversations like below:

```
<bos><start_of_turn>user
Hello!<end_of_turn>
<start_of_turn>model
Hey there!<end_of_turn>
```

We use our `get_chat_template` function to get the correct chat template. We support `zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, phi3, llama3, phi4, qwen2.5, gemma3` and more.

In [4]:
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "gemma-3",
)

In [None]:
dataset = load_dataset("EdwardSJ151/mario-1-2", split = "train")

We now use `standardize_data_formats` to try converting datasets to the correct format for finetuning purposes!

In [6]:
dataset = standardize_data_formats(dataset)

Let's see how row 100 looks like!

In [7]:
dataset[100]

{'conversations': [{'content': 'Please generate a level with some pipes, no special enemies, a few ground blocks, many hard blocks, some coin blocks, a few breakable blocks, no koopas, some goombas, some powerups, no coins, high elevation, Medium difficulty, overworld level',
   'role': 'user'},
  {'content': '--------------------------------------------------\n--------------------------------------------------\n--------------------------------------------------\n------------------------------------------------g-\n----------------------------------------------LSSS\n--------------------------------------------------\n--------------------------------------------------\n---##--------------------------------------g------\n---###-------g-----2-----!---!-----------SSUSS----\n---####------##-----------------------------------\n---#####-----##<>-------------------<>------------\n---######---g##[]-------------------[]------------\n---XXXXXXXXXXXXXXXXXXXX--X---X--XXXXXXXXXXXXXXXXXX\n---XXXXXXXXX

We now have to apply the chat template for `Gemma-3` onto the conversations, and save it to `text`

In [8]:
def apply_chat_template(examples):
    texts = tokenizer.apply_chat_template(examples["conversations"])
    return { "text" : texts }
pass
dataset = dataset.map(apply_chat_template, batched = True)

Map: 100%|██████████| 10891/10891 [00:00<00:00, 35210.47 examples/s]


Let's see how the chat template did! Notice `Gemma-3` default adds a `<bos>`!

In [9]:
dataset[100]["text"]

'<bos><start_of_turn>user\nPlease generate a level with some pipes, no special enemies, a few ground blocks, many hard blocks, some coin blocks, a few breakable blocks, no koopas, some goombas, some powerups, no coins, high elevation, Medium difficulty, overworld level<end_of_turn>\n<start_of_turn>model\n--------------------------------------------------\n--------------------------------------------------\n--------------------------------------------------\n------------------------------------------------g-\n----------------------------------------------LSSS\n--------------------------------------------------\n--------------------------------------------------\n---##--------------------------------------g------\n---###-------g-----2-----!---!-----------SSUSS----\n---####------##-----------------------------------\n---#####-----##<>-------------------<>------------\n---######---g##[]-------------------[]------------\n---XXXXXXXXXXXXXXXXXXXX--X---X--XXXXXXXXXXXXXXXXXX\n---XXXXXXXXXXXXXXX

<a name="Train"></a>
### Train the model
Now let's use Huggingface TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). We do 60 steps to speed things up, but you can set `num_train_epochs=1` for a full run, and turn off `max_steps=None`.

In [None]:
project_name = "mario_llm_finetunning" 
wandb.init(entity="edwardsj_151", project=project_name, name="qwen-2.5-14b-horizontal-newline-1epoch-teste5")

In [10]:
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    eval_dataset = None, # Can set up evaluation!
    args = SFTConfig(
        dataset_text_field = "text",
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4, # Use GA to mimic batch size!
        warmup_steps = 5,
        num_train_epochs = 1, # Set this for 1 full training run.
        # max_steps = 30,
        learning_rate = 2e-4, # Reduce to 2e-5 for long training runs
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        report_to = "wandb", # Use this for WandB etc
    ),
)

Unsloth: Tokenizing ["text"] (num_proc=12): 100%|██████████| 10891/10891 [00:09<00:00, 1129.36 examples/s]


We also use Unsloth's `train_on_completions` method to only train on the assistant outputs and ignore the loss on the user's inputs. This helps increase accuracy of finetunes!

In [11]:
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<start_of_turn>user\n",
    response_part = "<start_of_turn>model\n",
)

Map (num_proc=12): 100%|██████████| 10891/10891 [00:00<00:00, 24950.37 examples/s]


Let's verify masking the instruction part is done! Let's print the 100th row again:

In [12]:
tokenizer.decode(trainer.train_dataset[100]["input_ids"])

'<bos><bos><start_of_turn>user\nPlease generate a level with some pipes, no special enemies, a few ground blocks, many hard blocks, some coin blocks, a few breakable blocks, no koopas, some goombas, some powerups, no coins, high elevation, Medium difficulty, overworld level<end_of_turn>\n<start_of_turn>model\n--------------------------------------------------\n--------------------------------------------------\n--------------------------------------------------\n------------------------------------------------g-\n----------------------------------------------LSSS\n--------------------------------------------------\n--------------------------------------------------\n---##--------------------------------------g------\n---###-------g-----2-----!---!-----------SSUSS----\n---####------##-----------------------------------\n---#####-----##<>-------------------<>------------\n---######---g##[]-------------------[]------------\n---XXXXXXXXXXXXXXXXXXXX--X---X--XXXXXXXXXXXXXXXXXX\n---XXXXXXXXXX

Now let's print the masked out example - you should see only the answer is present:

In [13]:
tokenizer.decode([tokenizer.pad_token_id if x == -100 else x for x in trainer.train_dataset[100]["labels"]]).replace(tokenizer.pad_token, " ")

'                                                                  --------------------------------------------------\n--------------------------------------------------\n--------------------------------------------------\n------------------------------------------------g-\n----------------------------------------------LSSS\n--------------------------------------------------\n--------------------------------------------------\n---##--------------------------------------g------\n---###-------g-----2-----!---!-----------SSUSS----\n---####------##-----------------------------------\n---#####-----##<>-------------------<>------------\n---######---g##[]-------------------[]------------\n---XXXXXXXXXXXXXXXXXXXX--X---X--XXXXXXXXXXXXXXXXXX\n---XXXXXXXXXXXXXXXXXXXX--X---X--XXXXXXXXXXXXXXXXXX<end_of_turn>\n'

In [14]:
# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA GeForce RTX 4060 Ti. Max memory = 15.697 GB.
4.732 GB of memory reserved.


Let's train the model! To resume a training run, set `trainer.train(resume_from_checkpoint = True)`

In [15]:
trainer_stats = trainer.train()
wandb.finish()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 10,891 | Num Epochs = 1 | Total steps = 1,361
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 119,209,984/4,000,000,000 (2.98% trained)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
1,3.768
2,3.9854
3,3.2113
4,2.3687
5,2.1366
6,1.9071
7,1.8275
8,1.7176
9,1.8741
10,1.6421


Unsloth: Will smartly offload gradients to save VRAM!


In [16]:
# @title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

2171.4369 seconds used for training.
36.19 minutes used for training.
Peak reserved memory = 5.703 GB.
Peak reserved memory for training = 0.971 GB.
Peak reserved memory % of max memory = 36.332 %.
Peak reserved memory for training % of max memory = 6.186 %.


<a name="Inference"></a>
### Inference
Let's run the model via Unsloth native inference! According to the `Gemma-3` team, the recommended settings for inference are `temperature = 1.0, top_p = 0.95, top_k = 64`

In [None]:
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "gemma-3",
)
messages = [{
    "role": "user",
    "content": [{
        "type" : "text",
        "text" : "Create a level",
    }]
}]
text = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True, # Must add for generation
)
outputs = model.generate(
    **tokenizer([text], return_tensors = "pt").to("cuda"),
    max_new_tokens = 1024, # Increase for longer outputs!
    # Recommended Gemma-3 settings!
    temperature = 1.2, top_p = 0.95, top_k = 64,
)
tokenizer.batch_decode(outputs)

['<bos><start_of_turn>user\nMake a level with many coins and many koopas<end_of_turn>\n<start_of_turn>model\n--------------------------------------------------\n---------oo---------------------------------------\n--------o--o--------------------------------------\n---------------------------------oo--------------\n--------S?S!S!S!S!S!S!S------o--o-----------------\n-------------------------------------<>----------\n-------------------------------------[]----------\no-o-o-----g--------------------------[]--o-------\n--------SSSSSSSSS------R----R----R----[]--------R\n-SSSS---------------------------------[]------o---\n------------------<>----R----R----R----[]--------\n------------------[]--------------------[]--o-----\n-XXXXXXX---------XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX--\n-XXXXXXX---------XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX--<end_of_turn>']

<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!

In [18]:
save_model_name = "gemma-3-4b-it-unsloth-bnb-4bit-mariogpt-teste1"

model.save_pretrained(save_model_name)  # Local saving
tokenizer.save_pretrained(save_model_name)
# model.push_to_hub("HF_ACCOUNT/gemma-3", token = "...") # Online saving
# tokenizer.push_to_hub("HF_ACCOUNT/gemma-3", token = "...") # Online saving

['unsloth/gemma-3-4b-it-unsloth-bnb-4bit-mariogpt-teste1/processor_config.json']

### Saving to float16 for VLLM

We also support saving to `float16` directly for deployment! We save it in the folder `gemma-3-finetune`. Set `if False` to `if True` to let it run!

In [19]:
if False: # Change to True to save finetune!
    model.save_pretrained_merged("gemma-3-finetune", tokenizer)

If you want to upload / push to your Hugging Face account, set `if False` to `if True` and add your Hugging Face token and upload location!

In [20]:
if False: # Change to True to upload finetune
    model.push_to_hub_merged(
        "HF_ACCOUNT/gemma-3-finetune", tokenizer,
        token = "hf_..."
    )