In [1]:
from unsloth import FastModel
from datasets import load_dataset
from PIL import Image

# Load the local checkpoint. The second return value is bound to `tokenizer`,
# but later cells read `.image_processor` and `.tokenizer` from it.
model, tokenizer = FastModel.from_pretrained(
    model_name="/root/autodl-tmp/kaggle408/checkpoints/gek_e2b",
    max_seq_length=2048,  # sequence-length budget handed to the loader
    load_in_4bit=False,  # 4-bit quantization is OFF here; set True to reduce memory
    attn_implementation="eager",  # flagged as necessary by the notebook author
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.7.2: Fast Gemma3N patching. Transformers: 4.54.1.
   \\   /|    NVIDIA RTX 5880 Ada Generation. Num GPUs = 1. Max memory: 47.383 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: QLoRA and full finetuning all not selected. Switching to 16bit LoRA.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

<img src="https://files.worldwildlife.org/wwfcmsprod/images/Sloth_Sitting_iStock_3_12_2014/story_full_width/8l7pbjmj29_iStock_000011145477Large_mini__1_.jpg" alt="Sitting sloth" height="256">

In [2]:
# Chat-template sanity check: render a single user turn (one image + one question).
sloth_link = "https://files.worldwildlife.org/wwfcmsprod/images/Sloth_Sitting_iStock_3_12_2014/story_full_width/8l7pbjmj29_iStock_000011145477Large_mini__1_.jpg"

user_content = [
    {"type": "image", "image": sloth_link},
    {"type": "text", "text": "Which films does this animal feature in?"},
]
messages = [{"role": "user", "content": user_content}]

# Render the conversation, then drop a leading '<bos>' marker if one is present.
# NOTE: `messages` is rebound from the list of turns to the rendered string here.
rendered = tokenizer.apply_chat_template(messages)
messages = rendered.removeprefix('<bos>')
print(messages)

<start_of_turn>user
<image_soft_token>Which films does this animal feature in?<end_of_turn>



In [4]:
from datasets import load_dataset
# Read only the first 20% of the train split from the local dataset directory.
train_set = load_dataset(
    "/root/autodl-tmp/kaggle408/dataset/rlaif-v",
    split="train[:20%]",
)

In [5]:
def format(example):
    """Convert one raw preference row into the conversational layout used for DPO.

    NOTE(review): the name shadows the builtin ``format``; it is kept because a
    later cell passes this function to ``train_set.map`` by name.

    Args:
        example: A dataset row providing ``"question"``, ``"chosen"`` and
            ``"rejected"`` text plus an ``"image"`` entry (expected to be a PIL image).

    Returns:
        dict: ``"images"`` (single-element list holding the resized image) and the
        ``"prompt"``, ``"chosen"`` and ``"rejected"`` message lists.
    """
    prompt = [
        {
            "role": "user",
            "content": [{"type": "image"}, {"type": "text", "text": example["question"]}],
        },
    ]
    chosen = [
        {
            "role": "assistant",
            "content": [{"type": "text", "text": example["chosen"]}],
        },
    ]
    rejected = [
        {
            "role": "assistant",
            "content": [{"type": "text", "text": example["rejected"]}],
        },
    ]

    image = example["image"]

    # Convert to RGB *before* downsizing. Pillow always falls back to
    # nearest-neighbour resampling for "1" and "P" mode images, so resizing first
    # (as this function used to do) degrades palette/bilevel images needlessly.
    if isinstance(image, Image.Image) and image.mode != "RGB":
        image = image.convert("RGB")

    # Cap the longest side at the largest dimension the image processor declares.
    # `thumbnail` resizes in place and preserves the aspect ratio.
    max_size = max(tokenizer.image_processor.size.values())
    image.thumbnail((max_size, max_size))

    return {"images": [image], "prompt": prompt, "chosen": chosen, "rejected": rejected}

In [6]:
# `format` emits an "images" column and every raw column is dropped, so running
# this cell a second time on the already-mapped `train_set` would raise a KeyError
# on the missing "question" field. Skip the map once the output column exists.
if "images" not in train_set.column_names:
    train_set = train_set.map(format, remove_columns=train_set.column_names)

In [7]:
# Wrap the loaded model with LoRA adapters.
# NOTE(review): `model` is rebound to the wrapped model, so re-running this cell
# feeds the already-wrapped model back in — restart the kernel before re-running.
model = FastModel.get_peft_model(
    model,
    # Which parts of the network receive adapters.
    finetune_vision_layers=True,  # set False to train the text side only
    finetune_language_layers=True,  # should stay on
    finetune_attention_modules=True,
    finetune_mlp_modules=True,  # should always stay on
    # LoRA hyperparameters.
    r=8,  # larger rank = higher accuracy, but might overfit
    lora_alpha=8,  # recommended: alpha >= r
    lora_dropout=0,
    bias="none",
    random_state=3407,
)

Unsloth: Making `model.base_model.model.model.language_model` require gradients


In [8]:
from unsloth import PatchDPOTrainer

# Apply the Unsloth patch before importing the trainer classes from trl,
# in the same order as the original cell.
PatchDPOTrainer()

from trl import DPOTrainer, DPOConfig

dpo_trainer = DPOTrainer(
    model = model,
    ref_model = None,  # no separate reference model is supplied
    args = DPOConfig(
        # DPO-specific settings belong on DPOConfig; passing them straight to
        # DPOTrainer relies on a compatibility shim rather than the TRL API.
        beta = 0.1,
        max_length = 2048,
        max_prompt_length = 512,

        gradient_checkpointing = True,
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,  # 2 x 4 = 8 sequences per optimizer step
        warmup_ratio = 0.1,
        # num_train_epochs = 1,
        max_steps = 50,
        learning_rate = 5e-6,
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.0,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        dataloader_num_workers = 8,
        dataset_num_proc = 8,
    ),
    # NOTE(review): this hands the trainer the inner text tokenizer rather than the
    # full processor, while the dataset carries an "images" column — confirm the
    # images are actually consumed during training.
    processing_class = tokenizer.tokenizer,
    train_dataset = train_set,
)

In [None]:
# Run the DPO fine-tuning; the returned value is shown as the cell result.
train_output = dpo_trainer.train()
train_output

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 16,626 | Num Epochs = 1 | Total steps = 50
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 10,567,680 of 5,450,005,952 (0.19% trained)


Output()

Output()

Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / chosen,logps / rejected,logits / chosen,logits / rejected,eval_logits / chosen,eval_logits / rejected,nll_loss,aux_loss
1,0.6931,0.0,0.0,0.0,0.0,-295.3909,-365.700012,-4.570601,-4.835527,0,0,0,0
2,0.6931,0.0,0.0,0.0,0.0,-329.520447,-452.861694,-4.205674,-4.359941,No Log,No Log,No Log,No Log
3,0.6811,-0.06048,-0.087853,0.5,0.027373,-276.599884,-371.08374,-3.796412,-4.415313,No Log,No Log,No Log,No Log
4,0.7571,-0.091486,0.029122,0.125,-0.120608,-342.322479,-310.961487,-4.326503,-4.270564,No Log,No Log,No Log,No Log
5,0.7101,-0.112256,-0.07981,0.375,-0.032446,-344.78479,-337.127197,-4.44068,-4.732671,No Log,No Log,No Log,No Log
6,0.6937,-0.084758,-0.08822,0.625,0.003461,-500.740295,-449.412445,-4.444498,-4.640773,No Log,No Log,No Log,No Log
7,0.7334,-0.156626,-0.081477,0.25,-0.075148,-469.506714,-447.463074,-4.860947,-4.587072,No Log,No Log,No Log,No Log
8,0.7278,-0.150597,-0.089778,0.375,-0.060819,-526.976868,-658.174927,-4.172904,-4.389082,No Log,No Log,No Log,No Log
9,0.7181,-0.176807,-0.132912,0.375,-0.043896,-394.988007,-341.076233,-4.806297,-4.380426,No Log,No Log,No Log,No Log
10,0.6742,-0.129302,-0.182011,0.625,0.052709,-615.226929,-547.148804,-4.857585,-4.70545,No Log,No Log,No Log,No Log


TrainOutput(global_step=50, training_loss=0.6946284055709839, metrics={'train_runtime': 123.0591, 'train_samples_per_second': 3.25, 'train_steps_per_second': 0.406, 'total_flos': 0.0, 'train_loss': 0.6946284055709839, 'epoch': 0.024058703235895585})

Exception in thread Thread-77:
Traceback (most recent call last):
  File "/root/.local/share/uv/python/cpython-3.11.13-linux-x86_64-gnu/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/root/.local/share/uv/python/cpython-3.11.13-linux-x86_64-gnu/lib/python3.11/threading.py", line 1401, in run
    self.function(*self.args, **self.kwargs)
  File "/root/autodl-tmp/kaggle408/.venv/lib/python3.11/site-packages/swanlab/data/run/helper.py", line 148, in _
    monitor_func()
  File "/root/autodl-tmp/kaggle408/.venv/lib/python3.11/site-packages/swanlab/data/run/main.py", line 116, in monitor_func
    self.__exp.add(
  File "/root/autodl-tmp/kaggle408/.venv/lib/python3.11/site-packages/swanlab/data/run/exp.py", line 200, in add
    m = self._add(key, name, column_class, column_config, section_type, data, step)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/autodl-tmp/kaggle408/.venv/lib/python3.11/site-package