In [1]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth

In [1]:
# Keep unsloth first: it patches the libraries imported after it.
from unsloth import FastVisionModel # FastLanguageModel for LLMs
from unsloth.trainer import UnslothVisionDataCollator

from transformers import TextStreamer
from trl import SFTTrainer, SFTConfig

import torch
from datasets import load_dataset

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
# Load the "train" split of the inventory dataset; each record carries a
# file name, an image and a caption.
dataset = load_dataset(
    path = "aseeransari/ventory",
    split = "train",
)

In [18]:
# Load the 4-bit quantised Qwen2-VL-2B checkpoint. The second return value is
# called `tokenizer` throughout this notebook; it also exposes an
# `.image_processor`.
model, tokenizer = FastVisionModel.from_pretrained(
    "unsloth/Qwen2-VL-2B-Instruct-bnb-4bit",
    load_in_4bit = True, # Use 4bit to reduce memory use. False for 16bit LoRA.
    use_gradient_checkpointing = True, # True or "unsloth" for long context
)

# Attach LoRA adapters.
#
# NOTE: with target_modules = "all-linear" the finetune_* switches do not narrow
# the selection. The recorded run had finetune_language_layers = False and
# finetune_mlp_modules = False, yet trained 28,950,528 parameters. That is
# exactly rank-16 LoRA on every attention and MLP projection of both towers:
#   language: 28 layers x 659,456 = 18,464,768
#   vision:   32 blocks x 327,680 = 10,485,760
# The flags are therefore set to True so the cell states what is actually
# trained. To restrict training (e.g. vision attention only), remove
# `target_modules` and set the flags accordingly, then check the
# "Trainable parameters" line printed by trainer.train().
model = FastVisionModel.get_peft_model(
    model,
    finetune_vision_layers     = True, # Vision tower is adapted
    finetune_language_layers   = True, # Language tower is adapted as well
    finetune_attention_modules = True, # Attention projections are adapted
    finetune_mlp_modules       = True, # MLP projections are adapted as well

    r = 16,           # The larger, the higher the accuracy, but might overfit
    lora_alpha = 16,  # Recommended alpha == r at least
    lora_dropout = 0.1,
    bias = "none",
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
    target_modules = "all-linear", # Selects every linear layer (see note above)
)


==((====))==  Unsloth 2025.8.4: Fast Qwen2_Vl patching. Transformers: 4.55.0.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!




In [None]:
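# Optional: uncomment to terminate the kernel process immediately (e.g. to
# release GPU memory). All in-memory state is lost; re-run from the top afterwards.
# import os
# os._exit(0)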

In [22]:
def convert_to_conversation(sample, instruction = "What is this ?", image_size = (512, 512)):
    """Turn one dataset record into a two-turn chat example for vision SFT.

    Args:
        sample: Mapping with an "image" entry exposing ``.resize(size)`` (the
            dataset yields PIL images) and a "caption" entry holding the
            target text.
        instruction: Text of the user turn. Defaults to the prompt this
            notebook trains with.
        image_size: ``(width, height)`` the image is resized to. Pass ``None``
            to keep the image at its original resolution.

    Returns:
        ``{"messages": [...]}`` with a user turn (instruction text followed by
        the image) and an assistant turn (the caption).
    """
    image = sample["image"]
    if image_size is not None:
        # Aspect ratio is not preserved; every image ends up exactly image_size.
        image = image.resize(tuple(image_size))

    user_turn = {
        "role": "user",
        "content": [
            {"type": "text", "text": instruction},
            {"type": "image", "image": image},
        ],
    }
    assistant_turn = {
        "role": "assistant",
        "content": [
            {"type": "text", "text": sample["caption"]},
        ],
    }
    return {"messages": [user_turn, assistant_turn]}

# Convert every record to the chat format and keep the result as a plain list.
converted_dataset = list(map(convert_to_conversation, dataset))

# Preview the first two converted examples.
converted_dataset[:2]

[{'messages': [{'role': 'user',
    'content': [{'type': 'text', 'text': 'What is this ?'},
     {'type': 'image',
      'image': <PIL.Image.Image image mode=RGB size=512x512>}]},
   {'role': 'assistant',
    'content': [{'type': 'text', 'text': 'This is Airpump'}]}]},
 {'messages': [{'role': 'user',
    'content': [{'type': 'text', 'text': 'What is this ?'},
     {'type': 'image',
      'image': <PIL.Image.Image image mode=RGB size=512x512>}]},
   {'role': 'assistant',
    'content': [{'type': 'text', 'text': 'This is Airpump'}]}]}]

In [23]:
# Set the image processor's `size` attribute to 512x512, the same resolution
# convert_to_conversation resizes the training images to.
# NOTE(review): presumably meant to pin the model's input resolution; confirm
# that this image processor honours a plain 2-tuple here — TODO verify.
tokenizer.image_processor.size = (512, 512)

In [24]:
# Switch the model to training mode before the trainer is built.
FastVisionModel.for_training(model)

# Collator that batches the image + text chat examples.
vision_collator = UnslothVisionDataCollator(model, tokenizer) # Must use!

# Hyper-parameters for the supervised fine-tuning run.
training_args = SFTConfig(
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 2,
    warmup_steps = 5,
    # max_steps = 30,
    num_train_epochs = 5, # Set this instead of max_steps for full training runs
    learning_rate = 2e-4,
    logging_steps = 1,
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 3407,
    output_dir = "outputs",
    report_to = "none",     # For Weights and Biases

    # You MUST put the below items for vision finetuning:
    remove_unused_columns = False,
    dataset_text_field = "",
    dataset_kwargs = {"skip_prepare_dataset": True},
    max_length = 2048,
)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    data_collator = vision_collator,
    train_dataset = converted_dataset,
    args = training_args,
)

# Release unused cached GPU memory before reading the memory statistics.
torch.cuda.empty_cache()
torch.cuda.ipc_collect()

# Report the device name, its total memory and the memory reserved so far (GiB).
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / (1024 ** 3), 3)
max_memory = round(gpu_stats.total_memory / (1024 ** 3), 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

Unsloth: Model does not have a default image size - using 512
GPU = Tesla T4. Max memory = 14.741 GB.
3.623 GB of memory reserved.


In [25]:
# Run the fine-tuning loop; whatever train() returns is kept in `trainer_stats`.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 240 | Num Epochs = 5 | Total steps = 600
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 2 x 1) = 2
 "-____-"     Trainable parameters = 28,950,528 of 2,237,936,128 (1.29% trained)


Step,Training Loss
1,1.0373
2,0.8742
3,0.85
4,0.8791
5,0.5354
6,0.6463
7,0.492
8,0.4222
9,0.2341
10,0.6009


In [26]:

# Write the model and the tokenizer/processor to local directories.
# NOTE(review): the two artifacts go to different folders; presumably whoever
# reloads them has to point at both paths — confirm this split is intended.
model.save_pretrained("./ventory_model")  # Local saving
tokenizer.save_pretrained("./ventory_tokenizer")

[]

In [29]:
 dataset[128]

{'file_name': '95fc7725-Bikelamp_01_02.png',
 'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=512x512>,
 'caption': 'This is Bikelamp_back'}

In [31]:
# inference
FastVisionModel.for_inference(model) # Enable for inference!

# NOTE: record 128 comes from the training split, so this is a sanity check
# that the captions were learned, not a measure of generalisation.
# The image gets the same 512x512 resize that convert_to_conversation applies
# to the training examples, so both paths feed identically sized images.
image = dataset[128]["image"].resize((512, 512))
instruction = "What is this ?"  # Same prompt as used for training

messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": instruction}
    ]}
]

# Render the chat template to text, then let the processor combine it with the image.
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)
inputs = tokenizer(
    image,
    input_text,
    add_special_tokens = False,
    return_tensors = "pt",
).to("cuda")

# Stream the generated text to the cell output; skip_prompt hides the prompt echo.
# TextStreamer is imported in the imports cell at the top of the notebook.
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
# NOTE(review): temperature / min_p only matter when sampling is active, in
# which case the answer may vary between runs — confirm this is desired for a
# labelling task.
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128,
                   use_cache = True, temperature = 1.5, min_p = 0.1)



This is Bikelamp_back<|im_end|>
