In [1]:
import os
from datasets import load_dataset
#os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import os
os.environ["WANDB_PROJECT"] = "qwen-coder-llm-fine-tuning"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import wandb
wandb.login()

wandb: Currently logged in as: casvi-sanchez (virtualtek) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin


True

In [3]:
import wandb
wandb.login()

True

In [4]:
dataset_name = "squad_v2"
dataset = load_dataset(dataset_name, split="train")
eval_dataset = load_dataset(dataset_name, split="validation")
print("dataset: ",dataset)
print("eval_dataset: ",eval_dataset)


dataset:  Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 130319
})
eval_dataset:  Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 11873
})


In [5]:
import torch
cuda_available = torch.cuda.is_available()

if cuda_available:
    device_id = 0  # You can change to 1,2,3 if you want other GPUs
    torch.cuda.set_device(device_id)
    # device = torch.device(f"cuda:{device_id}")
    device = torch.device(f"cuda:{device_id}")
    print(f"🖥️ Using GPU {device_id}: {torch.cuda.get_device_name(device_id)}")
else:
    device = torch.device("cpu")
    print("⚙️ No GPU available, using CPU.")

print(f"Device selected: {device}")

🖥️ Using GPU 0: NVIDIA GeForce RTX 4070 SUPER
Device selected: cuda:0


In [6]:
from unsloth import FastLanguageModel
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen3-1.7B-unsloth-bnb-4bit",
    max_seq_length = 2048,   # Context length - can be longer, but uses more memory
    load_in_4bit = True,     # 4bit uses much less memory
    load_in_8bit = False,    # A bit more accurate, uses 2x memory
    full_finetuning = False, # We have full finetuning now!
    # token = "hf_...",      # use one if using gated models

)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.4.7: Fast Qwen3 patching. Transformers: 4.51.3.
   \\   /|    NVIDIA GeForce RTX 4070 SUPER. Num GPUs = 2. Max memory: 11.994 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [7]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 32,           # Choose any number > 0! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 32,  # Best to choose alpha = rank or rank*2
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,   # We support rank stabilized LoRA
    loftq_config = None,  # And LoftQ
)

Unsloth 2025.4.7 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [21]:
import evaluate
import numpy as np
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return accuracy_metric.compute(predictions=predictions, references=labels)

In [22]:
import time
start = time.time()
from trl import SFTTrainer, SFTConfig

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    eval_dataset = eval_dataset,
    compute_metrics=compute_metrics,
    args = SFTConfig(
        dataset_num_proc = 1,
        dataset_text_field = "question",
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4, # Use GA to mimic batch size!
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 30,
        learning_rate = 2e-4, # Reduce to 2e-5 for long training runs
        logging_steps = 1,

        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        report_to = "none", # Use this for WandB etc
    ),
)

Unsloth: Tokenizing ["question"]: 100%|██████████| 130319/130319 [00:07<00:00, 17238.23 examples/s]
Unsloth: Tokenizing ["question"]: 100%|██████████| 11873/11873 [00:00<00:00, 18400.11 examples/s]


In [23]:
# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA GeForce RTX 4070 SUPER. Max memory = 11.994 GB.
2.367 GB of memory reserved.


In [24]:
# Start training
trainer_stats=trainer.train()

end = time.time()
length = end - start

hours = int(length // 3600)
minutes = int((length % 3600) // 60)
seconds = int(length % 60)

print(f"It took {hours} hours, {minutes} minutes, and {seconds} seconds to train the model!")

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 130,319 | Num Epochs = 1 | Total steps = 30
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 34,865,152/7,000,000,000 (0.50% trained)


Step,Training Loss
1,3.0617
2,2.9884
3,2.9369
4,2.976
5,2.9969
6,3.369
7,2.8302
8,2.6095
9,3.4397
10,2.5578


It took 0 hours, 1 minutes, and 4 seconds to train the model!


In [19]:
# @title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

56.7882 seconds used for training.
0.95 minutes used for training.
Peak reserved memory = 2.367 GB.
Peak reserved memory for training = 0.232 GB.
Peak reserved memory % of max memory = 19.735 %.
Peak reserved memory for training % of max memory = 1.934 %.


In [20]:
print(trainer_stats.metrics)

{'train_runtime': 56.7882, 'train_samples_per_second': 8.452, 'train_steps_per_second': 0.528, 'total_flos': 69221221785600.0, 'train_loss': 3.5615405480066937}


In [None]:
print(trainer.evaluate())

Unsloth: Not an error, but Qwen3ForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient
