In [1]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [2]:
from unsloth import FastVisionModel # FastLanguageModel for LLMs
import torch

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit", # Llama 3.2 vision support
    "unsloth/Llama-3.2-11B-Vision-bnb-4bit",
    "unsloth/Llama-3.2-90B-Vision-Instruct-bnb-4bit", # Can fit in a 80GB card!
    "unsloth/Llama-3.2-90B-Vision-bnb-4bit",

    "unsloth/Pixtral-12B-2409-bnb-4bit",              # Pixtral fits in 16GB!
    "unsloth/Pixtral-12B-Base-2409-bnb-4bit",         # Pixtral base model

    "unsloth/Qwen2-VL-2B-Instruct-bnb-4bit",          # Qwen2 VL support
    "unsloth/Qwen2-VL-7B-Instruct-bnb-4bit",
    "unsloth/Qwen2-VL-72B-Instruct-bnb-4bit",

    "unsloth/llava-v1.6-mistral-7b-hf-bnb-4bit",      # Any Llava variant works!
    "unsloth/llava-1.5-7b-hf-bnb-4bit",
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastVisionModel.from_pretrained(
    "unsloth/Qwen2.5-VL-7B-Instruct-bnb-4bit",
    load_in_4bit = True, # Use 4bit to reduce memory use. False for 16bit LoRA.
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for long context
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.6.1: Fast Qwen2 patching. Transformers: 4.51.3.
   \\   /|    NVIDIA GeForce RTX 3090. Num GPUs = 1. Max memory: 23.567 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.6. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [3]:
model = FastVisionModel.get_peft_model(
    model,
    finetune_vision_layers     = True, # False if not finetuning vision layers
    finetune_language_layers   = True, # False if not finetuning language layers
    finetune_attention_modules = True, # False if not finetuning attention layers
    finetune_mlp_modules       = True, # False if not finetuning MLP layers

    r = 16,           # The larger, the higher the accuracy, but might overfit
    lora_alpha = 16,  # Recommended alpha == r at least
    lora_dropout = 0,
    bias = "none",
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
    # target_modules = "all-linear", # Optional now! Can specify a list if needed
)

# Data Prep

In [4]:
from torch.utils.data import Dataset
from PIL import Image
import json
class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = json.load(open("./gpt_4o_coco.json", "r", encoding="utf-8"))

    def __len__(self):
        return len(self.data)

    def _get_message(self, item, image_obj):
        question = item['questions']
        answer = item['answers']
        message = [
            {
                "role": "user",
                "content": [{'type': 'text', 'text': f"Question: {question}\n"}, {'type': 'image', 'image':image_obj}]
            },
            {
                "role": "assistant",
                "content": [
                    {'type': 'text', 'text': answer.replace("\\n", "\n")}
                ],
            }
        ]
        return message

    def _get_image(self, item):
        image_path = item['image_path']
        image = [Image.open(image_path)]
        return image
        
    def __getitem__(self, idx):
        if isinstance(idx, slice):
            indices = range(*idx.indices(len(self)))
            batch_messages = []
            for i in indices:
                item = self.data[i]
                image = self._get_image(item)[0]
                batch_messages.append(self._get_message(item, image))
            return {'messages': batch_messages}
        else:
            item = self.data[idx]
            image = self._get_image(item)[0]
            return {
                'messages': self._get_message(item, image),
            }

In [5]:
dataset = CustomDataset(None)
print(len(dataset))
print(dataset[:2])

3156
{'messages': [[{'role': 'user', 'content': [{'type': 'text', 'text': 'Question: How are the Motorcycle in the foreground and Motorcycle in the background positioned?\n'}, {'type': 'image', 'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x427 at 0x7FF802A28700>}]}, {'role': 'assistant', 'content': [{'type': 'text', 'text': "Overall Image Description: The image shows two vintage motorcycles parked on a grassy field. The motorcycle in the foreground is black with a cream fuel tank, and it has a sidecar attached. The motorcycle in the background is also black and appears to be a classic model. Both bikes have license plates, and there are wooden blocks under the front wheels to stabilize them. The setting is outdoors, likely at a motorcycle exhibition or rally, as indicated by the ropes marking the area.\n\nReference Object: Motorcycle in the background  \nTarget Object: Motorcycle in the foreground\n\nReference Object Absolute Direction: The motorcycle in the backg

In [6]:
dataset = list(dataset)[:200]
print(len(dataset))
print(dataset[:2])

200
[{'messages': [{'role': 'user', 'content': [{'type': 'text', 'text': 'Question: How are the Motorcycle in the foreground and Motorcycle in the background positioned?\n'}, {'type': 'image', 'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x427 at 0x7FF8029E0E50>}]}, {'role': 'assistant', 'content': [{'type': 'text', 'text': "Overall Image Description: The image shows two vintage motorcycles parked on a grassy field. The motorcycle in the foreground is black with a cream fuel tank, and it has a sidecar attached. The motorcycle in the background is also black and appears to be a classic model. Both bikes have license plates, and there are wooden blocks under the front wheels to stabilize them. The setting is outdoors, likely at a motorcycle exhibition or rally, as indicated by the ropes marking the area.\n\nReference Object: Motorcycle in the background  \nTarget Object: Motorcycle in the foreground\n\nReference Object Absolute Direction: The motorcycle in the backgr

### Format: 
```python
[
{ "role": "user",
  "content": [{"type": "text",  "text": Q}, {"type": "image", "image": image} ]
},
{ "role": "assistant",
  "content": [{"type": "text",  "text": A} ]
},
]
```

In [7]:
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig

FastVisionModel.for_training(model) # Enable for training!

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    data_collator = UnslothVisionDataCollator(model, tokenizer), # Must use!
    train_dataset = dataset,
    args = SFTConfig(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # max_steps = 30,
        num_train_epochs = 3, # Set this instead of max_steps for full training runs
        learning_rate = 2e-4,
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none",     # For Weights and Biases

        # You MUST put the below items for vision finetuning:
        remove_unused_columns = False,
        dataset_text_field = "",
        dataset_kwargs = {"skip_prepare_dataset": True},
        dataset_num_proc = 4,
        max_seq_length = 2048,
    ),
)

Unsloth: Model does not have a default image size - using 512


Unsloth: Model does not have a default image size - using 512

In [8]:
# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA GeForce RTX 3090. Max memory = 23.567 GB.
7.736 GB of memory reserved.


In [9]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 200 | Num Epochs = 3 | Total steps = 75
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 51,521,536/7,000,000,000 (0.74% trained)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,2.7621
2,2.6746
3,2.5629
4,2.5016
5,2.4059
6,2.0432
7,1.8655
8,1.5237
9,1.4545
10,1.2516


In [10]:
model.save_pretrained("Qwen2.5_VL_7B_lora_coco_only_200_datasets")  # Local saving
tokenizer.save_pretrained("Qwen2.5_VL_7B_lora_coco_only_200_datasets")

[]

In [14]:
from PIL import Image
FastVisionModel.for_inference(model) # Enable for inference!

image = Image.open("./catdog.jpg")
instruction = "Can you describe where the dog is?"

messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": instruction}
    ]}
]

input_text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)
inputs = tokenizer(
    image,
    input_text,
    add_special_tokens = False,
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 512,use_cache = True, temperature = 1.5, min_p = 0.1)

Overall Image Description: The image depicts an indoor scene featuring two animals, a golden retriever and a gray and white cat, positioned on a light-colored floor. Behind them is a cabinet with shelves holding various items. The setting appears to be a cozy home environment.

Reference Object: Cat  
Target Object: Dog

Reference Object Absolute Direction: The cat is facing the observer  
Target Object Absolute Direction: The dog is facing the observer

Perspective: Egocentric (from the observer's point of view)  
Answer: The cat is on the center right of the image, the dog is on the left middle of the image, and in the image, the cat is to the right of the dog.

Perspective: Allocentric (from the dog's point of view)  
Answer: From the dog's perspective, the cat is behind it.<|im_end|>
