In [None]:
!pip install unsloth
!pip install -q --no-deps xformers trl peft accelerate bitsandbytes
!pip install -q wandb

In [None]:

import subprocess
import sys
import os

import torch
from unsloth import FastLanguageModel
import wandb

# Sanity-check the runtime before any heavy work: confirm an NVIDIA GPU is
# visible, then report the library versions in use.
#
# If the `nvidia-smi` executable is not on PATH, subprocess.run raises
# FileNotFoundError instead of returning. That case is folded into the
# "no GPU" branch so the user sees the actionable hint rather than a traceback.
try:
    result = subprocess.run(['nvidia-smi'], capture_output=True, text=True)
    smi_output = result.stdout
except FileNotFoundError:
    result = None
    smi_output = ""

if 'GPU' in smi_output:
    print("[OK] GPU detected!")
    smi_lines = smi_output.split('\n')
    # Index 8 is presumably the first device row of the nvidia-smi table in
    # the layout this notebook was written against -- TODO confirm. The bounds
    # check keeps a shorter/different layout from raising IndexError; the GPU
    # name is printed again below via torch in any case.
    if len(smi_lines) > 8:
        gpu_name = smi_lines[8]
        print(f"     {gpu_name}")
    else:
        gpu_name = ""
else:
    print("[ERROR] No GPU! Go to: Runtime -> Change runtime type -> T4 GPU")
    # Stop this cell here; nothing below can work without a GPU.
    sys.exit(1)

# Versions and device as seen by the Python libraries themselves.
print(f"[OK] PyTorch: {torch.__version__}")
print(f"[OK] CUDA: {torch.cuda.is_available()}")
print(f"[OK] GPU: {torch.cuda.get_device_name(0)}")
print(f"[OK] WandB: {wandb.__version__}")

print("\n" + "="*70)
print("SETUP COMPLETE")
print("="*70)


In [None]:

# Authenticate with Weights & Biases, then open the run that the rest of the
# notebook logs to (`run` is referenced again by later cells).
wandb.login()

# Hyperparameters recorded on the run for later comparison. These are plain
# metadata: the values actually used are passed separately to the model,
# LoRA and trainer set-up cells, so keep the two in sync when editing.
run_config = dict(
    model="Qwen2.5-7B-Instruct",
    max_seq_length=1024,
    lora_r=16,
    lora_alpha=16,
    learning_rate=2e-4,
    batch_size=2,
    gradient_accumulation=4,
    epochs=3,
    optimizer="adamw_8bit",
)

run = wandb.init(
    project="tiki-chatbot",
    name="qwen-2.5-7b-finetune",
    config=run_config,
)

In [None]:
# Load model
# Options for the base checkpoint: a 1024-token context window, 4-bit
# loading requested, and dtype=None so the dtype choice is left to Unsloth.
base_model_options = {
    "model_name": "unsloth/Qwen2.5-7B-Instruct",
    "max_seq_length": 1024,
    "dtype": None,
    "load_in_4bit": True,
}
model, tokenizer = FastLanguageModel.from_pretrained(**base_model_options)
print("Base model loaded")

# Apply LoRA
# Names of the sub-modules that receive a LoRA adapter.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Rebinds `model` to the adapter-wrapped model, so this cell is not meant to
# be re-run without first reloading the base model.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,                                  # adapter rank
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    target_modules=lora_target_modules,
    use_gradient_checkpointing="unsloth",
    random_state=42,                       # fixed seed for reproducibility
)

# Show trainable parameters
# Walk the parameters once, counting every element and, separately, the
# elements that will receive gradients.
# NOTE(review): counts come from numel(); with 4-bit loading the stored
# tensors may be packed, so `total` could understate the logical model size
# -- TODO confirm.
trainable = 0
total = 0
for param in model.parameters():
    count = param.numel()
    total += count
    if param.requires_grad:
        trainable += count
trainable_pct = 100 * trainable / total

print(f"Trainable: {trainable:,} / {total:,} ({trainable_pct:.2f}%)")

# Log model info to WandB
param_summary = {
    "trainable_params": trainable,
    "total_params": total,
    "trainable_percentage": trainable_pct,
}
wandb.config.update(param_summary)

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq

# Root directory for checkpoints. An empty string is not a usable directory,
# so this uses the location implied by the resume path kept in the training
# cell (".../qwen_ecommerce/checkpoint-2000").
# Assumes Google Drive is mounted at /content/drive -- TODO confirm a mount
# cell runs before this one.
OUTPUT_DIR = "/content/drive/MyDrive/qwen_ecommerce"

# Use bf16 where the GPU supports it, otherwise fall back to fp16.
use_bf16 = torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # Effective batch size = 2 per device x 4 accumulation steps.
    gradient_accumulation_steps=4,

    # Optimizer
    optim="adamw_8bit",
    learning_rate=2e-4,
    weight_decay=0.01,
    lr_scheduler_type="linear",
    warmup_steps=10,

    # Logging
    # `logging_dir` is intentionally not passed: an empty string is not a
    # valid directory, and the default places logs under `output_dir`.
    logging_steps=10,
    logging_first_step=True,

    # Evaluation
    eval_steps=100,
    eval_strategy="steps",

    # Saving
    # Same cadence as evaluation, so each saved checkpoint has an eval_loss
    # that `load_best_model_at_end` can compare.
    save_steps=100,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",

    # Performance
    fp16=not use_bf16,
    bf16=use_bf16,

    # WandB integration
    report_to="wandb",
    run_name=run.name,

    seed=42,
)

# Data collator
# Batches examples with the model's tokenizer and pads each batch to a
# length that is a multiple of 8.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, pad_to_multiple_of=8)

# Trainer
# Dataset-side settings. `train_data` / `eval_data` are not defined in this
# part of the notebook and must already exist when this cell runs; each
# example is read from its "text" field.
sft_dataset_options = dict(
    train_dataset=train_data,
    eval_dataset=eval_data,
    dataset_text_field="text",
    max_seq_length=1024,
    packing=False,
)

# Wire model, tokenizer, collator and training arguments together.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    args=training_args,
    **sft_dataset_options,
)

Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/8152 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/906 [00:00<?, ? examples/s]

In [None]:
def _to_gib(num_bytes):
    """Convert a byte count to GiB, rounded to two decimals."""
    return round(num_bytes / 1024**3, 2)


# Snapshot GPU memory before training: peak reserved so far vs. total on
# device 0.
gpu_stats = torch.cuda.get_device_properties(0)
max_mem = _to_gib(gpu_stats.total_memory)
start_mem = _to_gib(torch.cuda.max_memory_reserved())

print(f"\nGPU: {gpu_stats.name}")
print(f"Memory: {start_mem}GB / {max_mem}GB")
print(f"Training examples: {len(train_data)}")
print(f"Validation examples: {len(eval_data)}")

# Start fine-tuning from scratch.
# To continue an interrupted run instead, pass a checkpoint directory, e.g.
#   trainer.train(resume_from_checkpoint="/content/drive/MyDrive/qwen_ecommerce/checkpoint-2000")
trainer_stats = trainer.train()


The model is already on multiple devices. Skipping the move to device specified in `args`.



GPU: Tesla T4
Memory: 6.9GB / 14.74GB
Training examples: 8152
Validation examples: 906


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 8,152 | Num Epochs = 3 | Total steps = 3,057
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 40,370,176 of 7,655,986,688 (0.53% trained)


Step,Training Loss,Validation Loss
100,0.6942,0.724311
200,0.6754,0.66736
300,0.6377,0.633623
400,0.637,0.602086
500,0.5798,0.568958
600,0.6456,0.541182
700,0.5144,0.511445
800,0.5269,0.485981
900,0.5137,0.462672
1000,0.5015,0.433347


Unsloth: Will smartly offload gradients to save VRAM!


Unsloth: Not an error, but Qwen2ForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


In [None]:
# Run one evaluation pass over the validation set with the current model.
print("\nEvaluating on validation set...")
eval_results = trainer.evaluate()

# (label, metric key, number format, unit suffix) for each reported metric.
summary_rows = (
    ("Eval Loss", "eval_loss", ".4f", ""),
    ("Eval Runtime", "eval_runtime", ".2f", "s"),
    ("Eval Samples/sec", "eval_samples_per_second", ".2f", ""),
)

print("\nValidation Results:")
for label, key, fmt, suffix in summary_rows:
    print(f"  {label}: {eval_results[key]:{fmt}}{suffix}")

# Log to WandB
final_metrics = {"final/eval_loss": eval_results["eval_loss"]}
wandb.log(final_metrics)


Evaluating on validation set...



Validation Results:
  Eval Loss: 0.2156
  Eval Runtime: 149.55s
  Eval Samples/sec: 6.06


In [None]:
# Directory that receives the saved model and tokenizer files. An empty
# path cannot be created or uploaded as an artifact directory, so this uses
# the location shown in this cell's recorded output.
# Assumes Google Drive is mounted at /content/drive -- TODO confirm.
model_save_path = "/content/drive/MyDrive/qwen_ecommerce/tiki_chatbot"
os.makedirs(model_save_path, exist_ok=True)

# `model` is the LoRA-wrapped model built earlier in the notebook.
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

# Save model artifact to WandB
artifact = wandb.Artifact(
    name="tiki-chatbot-model",
    type="model",
    description="Fine-tuned Qwen 2.5 7B for Tiki ecommerce chatbot"
)
# Attach everything written to the save directory and upload it with the run.
artifact.add_dir(model_save_path)
run.log_artifact(artifact)

[34m[1mwandb[0m: Adding directory to artifact (/content/drive/MyDrive/qwen_ecommerce/tiki_chatbot)... Done. 0.3s


<Artifact tiki-chatbot-model>