In [1]:
from unsloth import FastLanguageModel
import torch
import os
from huggingface_hub import login
from unsloth.chat_templates import get_chat_template
from datasets import load_dataset
from unsloth.chat_templates import standardize_sharegpt
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported
from trl import SFTTrainer, SFTConfig
from unsloth.chat_templates import train_on_responses_only
import pandas as pd
from datasets import Dataset

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
# Authenticate with Weights & Biases before any run is created.
import wandb

wandb.login()

# Environment variables for the W&B integration, set with the IPython %env magic.
# NOTE(review): WANDB_WATCH=all presumably makes the Trainer's W&B callback log
# both gradients and parameters — confirm against the transformers docs.
%env WANDB_WATCH=all
# NOTE(review): this is set after wandb.login(), so it cannot silence that call.
%env WANDB_SILENT=true

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


env: WANDB_WATCH=all
env: WANDB_SILENT=true


In [3]:
# Log in to the Hugging Face Hub with a token read from the environment instead
# of a hard-coded secret; a missing HUGGINGFACE_TOKEN variable raises KeyError.
mytoken = os.environ["HUGGINGFACE_TOKEN"]
login(token=mytoken)

In [4]:
# Reference list of Unsloth checkpoint names. It is informational only: the
# `from_pretrained` call below names its checkpoint directly and never reads it.
fourbit_models = [
    "unsloth/Qwen3-1.7B-unsloth-bnb-4bit", # Qwen3 bnb-4bit checkpoints, 1.7B up to 32B
    "unsloth/Qwen3-4B-unsloth-bnb-4bit",
    "unsloth/Qwen3-8B-unsloth-bnb-4bit",
    "unsloth/Qwen3-14B-unsloth-bnb-4bit",
    "unsloth/Qwen3-32B-unsloth-bnb-4bit",

    # Other model families (not every name below carries the bnb-4bit suffix)
    "unsloth/gemma-3-12b-it-unsloth-bnb-4bit",
    "unsloth/Phi-4",
    "unsloth/Llama-3.1-8B",
    "unsloth/Llama-3.2-3B",
    "unsloth/orpheus-3b-0.1-ft-unsloth-bnb-4bit" # [NEW] We support TTS models!
] # More models at https://huggingface.co/unsloth

# Load the base model and its tokenizer. The checkpoint name used here is not one
# of the entries listed above; 4-bit loading is requested through `load_in_4bit`.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen3-14B",
    max_seq_length = 2048,   # Context length - can be longer, but uses more memory
    load_in_4bit = True,     # 4bit uses much less memory
    load_in_8bit = False,    # A bit more accurate, uses 2x memory
    full_finetuning = False, # LoRA is used instead (adapters are attached in the next cell)
    # token = "hf_...",      # use one if using gated models
)

==((====))==  Unsloth 2025.4.7: Fast Qwen3 patching. Transformers: 4.51.3.
   \\   /|    NVIDIA GeForce RTX 4060 Ti. Num GPUs = 1. Max memory: 15.697 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards: 100%|██████████| 3/3 [00:01<00:00,  1.70it/s]


We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [5]:
# Wrap the loaded model with LoRA adapters on the attention and MLP projections.
# NOTE(review): this rebinds `model` from itself, so re-running the cell passes the
# already-wrapped model back in — reload the base model first if you need to re-run.
model = FastLanguageModel.get_peft_model(
    model,
    r = 64,           # Choose any number > 0! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 64,  # Best to choose alpha = rank or rank*2 (here alpha = rank)
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407, # same value as the trainer seed used later in the notebook
    use_rslora = False,   # We support rank stabilized LoRA
    loftq_config = None,  # And LoftQ
)

Unsloth 2025.4.7 patched 40 layers with 40 QKV layers, 40 O layers and 40 MLP layers.


<a name="Data"></a>
### Data Prep


In [None]:
# Load the "str_horizontal_newline_nopath" split of the Mario level dataset.
# NOTE(review): this name is rebound further down to a text-only Dataset built
# from the rendered conversations, so cells run out of order see a different object.
non_reasoning_dataset = load_dataset("EdwardSJ151/mario-1-2", split = "str_horizontal_newline_nopath")

In [7]:
# Normalize the records with Unsloth's `standardize_sharegpt` helper, then render
# every conversation to one string with the tokenizer's chat template
# (tokenize = False keeps text instead of token ids).
# Requires `non_reasoning_dataset` to still be the raw dataset that has a
# "conversations" column, i.e. run this before the cell that rebinds that name.
dataset = standardize_sharegpt(non_reasoning_dataset)

non_reasoning_conversations = tokenizer.apply_chat_template(
    dataset["conversations"],
    tokenize = False,
)

In [8]:
# Inspect the first rendered conversation to check the chat-template markup.
non_reasoning_conversations[0]

'<|im_start|>user\nI need a level that has some pipes, no special enemies, some ground blocks, a few hard blocks, some coin blocks, a few breakable blocks, no koopas, many goombas, a few powerups, no coins, high elevation, Medium difficulty, overworld level<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n--------------------------------------------------\n--------------------------------------------------\n--------------------------------------------------\n--------------------------------------------------\n-----------------------------SSSSSS---------------\n--------------------------------------------------\n--------------------------------------------------\n--------------------------------------------------\n--------------------?------S!S----S!S-------------\n---------------##----------------------<>-------<>\n---------------##------<>--------------[]-------[]\n---------------##------[]--g-g---------[]-g-g-g-[]\nXXXXXXXXXXXXXXXXX--XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX\nXXXXXXXXX

In [9]:
# Put the rendered strings in a single "text" column and convert it to a
# `datasets.Dataset`, which is what the trainer consumes.
# NOTE(review): this rebinds `non_reasoning_dataset` (previously the raw dataset),
# so the chat-template cell above cannot be re-run without reloading the data.
non_reasoning_subset = pd.Series(non_reasoning_conversations, name="text")
non_reasoning_dataset = Dataset.from_pandas(non_reasoning_subset.to_frame())

In [10]:
# Sanity check: show the column names and row count of the training dataset.
print(non_reasoning_dataset)

Dataset({
    features: ['text'],
    num_rows: 17071
})


<a name="Train"></a>
### Train the model


In [None]:
# Start the Weights & Biases run that the trainer reports to.
# The run name identifies the base model and the dataset variant; the notes
# record the LoRA and optimizer settings used in this notebook.
project_name = "akcit_games_finetunning"
wandb.init(
    entity="edwardsj_151",
    project=project_name,
    # Name matches the base model loaded in this notebook (unsloth/Qwen3-14B);
    # the previous "qwen-2.5-14b" label named a different model family.
    name="qwen3-14b-horizontal-newline-1epoch-teste1",
    notes="rank=64, alpha=64, warmup ratio 5%, weight decay 1%",
)

[34m[1mwandb[0m: Currently logged in as: [33medwardsj151[0m ([33mwahoo_pav2025[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [12]:
# Hyperparameters for the supervised fine-tuning run.
sft_config = SFTConfig(
    dataset_text_field="text",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # Use GA to mimic a larger batch size
    warmup_ratio=0.05,  # ratio-based warmup rather than a fixed warmup_steps
    num_train_epochs=1,
    learning_rate=2e-4,  # Reduce to 2e-5 for long training runs
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    report_to="wandb",  # Use this for WandB etc
)

# Trainer over the chat-formatted text dataset; no evaluation set is supplied.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=non_reasoning_dataset,
    eval_dataset=None,
    args=sft_config,
)

Unsloth: Tokenizing ["text"] (num_proc=12): 100%|██████████| 17071/17071 [00:03<00:00, 5305.92 examples/s]


In [13]:
# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA GeForce RTX 4060 Ti. Max memory = 15.697 GB.
11.395 GB of memory reserved.


Let's train the model! To resume a training run, call `trainer.train(resume_from_checkpoint = True)` instead.

In [None]:
# Run the fine-tuning loop and keep the returned stats for the summary cell below,
# then close the W&B run.
# NOTE(review): if train() raises, wandb.finish() is skipped and the run stays open.
trainer_stats = trainer.train()
wandb.finish()

In [15]:
# @title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

15642.2005 seconds used for training.
260.7 minutes used for training.
Peak reserved memory = 13.67 GB.
Peak reserved memory for training = 2.275 GB.
Peak reserved memory % of max memory = 87.087 %.
Peak reserved memory for training % of max memory = 14.493 %.


<a name="Inference"></a>
### Inference
Let's run the model via Unsloth native inference! According to the `Qwen-3` team, the recommended settings for reasoning inference are `temperature = 0.6, top_p = 0.95, top_k = 20`.

For normal chat-based (non-thinking) inference, use `temperature = 0.7, top_p = 0.8, top_k = 20`.

In [16]:
from transformers import TextStreamer

# Single-turn request for a level, rendered to a prompt string with the chat template.
messages = [{"role": "user", "content": "Give me a level with many coins, many powerups"}]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,  # Must add for generation
    enable_thinking=False,  # Disable thinking
)

model_inputs = tokenizer(text, return_tensors="pt").to("cuda")
streamer = TextStreamer(tokenizer, skip_prompt=True)
# Sampling settings recommended for non-thinking chat; raise max_new_tokens for longer outputs.
generation_kwargs = dict(max_new_tokens=512, temperature=0.7, top_p=0.8, top_k=20)
_ = model.generate(**model_inputs, streamer=streamer, **generation_kwargs)

--------------------------------------------------
--------------------------------------------------
--------------------------------------------------
-----------------------ooo------------------------
---------------------SSSSSSS----------------------
--------------------------------------------------
--------------------------------------------------
-------------------ooooooo------------------------
-------------------SSSSSSSSS----------------------
-----------?--------------------------------------
---L-------------------------------------()-------
----r-----------r-----------g-#----------[]-----oo
XXXXXXX--------XXXXXXXXXXXXXXX#XXXXXXXXXXXXXX--XXX
XXXXXXX--------XXXXXXXXXXXXXXX#XXXXXXXXXXXXXX--XXX<|im_end|>


<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Hugging Face's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!

In [None]:
# Save the model and the tokenizer files to a local directory named after the run.
# NOTE(review): the recorded output below lists paths under
# 'Qwen3-14B-Instruct-bnb-4bit-mariogpt-teste1', which does not match this name —
# the output looks stale from an earlier run; re-run the cell to refresh it.
save_model_name = "Qwen3-14B-Instruct-bnb-4bit-teste1"

model.save_pretrained(save_model_name)
tokenizer.save_pretrained(save_model_name)

('Qwen3-14B-Instruct-bnb-4bit-mariogpt-teste1/tokenizer_config.json',
 'Qwen3-14B-Instruct-bnb-4bit-mariogpt-teste1/special_tokens_map.json',
 'Qwen3-14B-Instruct-bnb-4bit-mariogpt-teste1/vocab.json',
 'Qwen3-14B-Instruct-bnb-4bit-mariogpt-teste1/merges.txt',
 'Qwen3-14B-Instruct-bnb-4bit-mariogpt-teste1/added_tokens.json',
 'Qwen3-14B-Instruct-bnb-4bit-mariogpt-teste1/tokenizer.json')