# Finetuning Qwen2.5-VL on benthic images
The code below was used to fine-tune the models for the thesis project. It is adapted from the notebook provided by Unsloth:

https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen2.5_VL_(7B)-Vision.ipynb

Author: Aidan Murray

Date: 2025-09-26


### Installing dependencies

In [None]:
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
!pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
!pip install --no-deps unsloth

import os
# Set in this first cell so the variable is already in the environment when
# unsloth is imported in the following cell.
# NOTE(review): presumably switches off Triton's JIT optimisation to work
# around a kernel-compilation problem — confirm against the Triton docs.
os.environ['TRITON_JIT_DISABLE_OPT'] = '1' # Likely the most critical change

### Imports

In [None]:
from unsloth import FastVisionModel
import torch
import pandas as pd
from PIL import Image
import requests
from io import BytesIO
from google.colab import drive
from unsloth.trainer import UnslothVisionDataCollator
from unsloth import get_chat_template
from trl import SFTTrainer, SFTConfig
import time
import matplotlib.pyplot as plt
from pathlib import Path

# Mount Google Drive at /content/drive; the data paths defined in the
# data-prep cell all live under this mount point.
drive.mount('/content/drive')

## Data Prep


In [None]:
# Number of training images to use; change N to train on a different size.
N = 6500

# All project inputs are read from a single Google Drive folder.
BASE_PATH = Path("/content/drive/MyDrive/llm_finetuning")
# NOTE(review): PROMPT_PATH is not referenced by the visible cells; the prompt
# actually used is the hard-coded PROMPT string below.
PROMPT_PATH = BASE_PATH.joinpath("prompt.txt")

TRAIN_PARTIAL = BASE_PATH.joinpath("train_partial.csv")
VALIDATION = BASE_PATH.joinpath("validation.csv")

IMAGE_FOLDER = BASE_PATH.joinpath("all_images")

# Base instruction sent to the model with every image.
PROMPT = "Analyse the entire image carefully and decide which of the label names correspond to features that are clearly visible in the image."

# Load both annotation tables, discarding rows that have no label.
df_train, df_val = (
    pd.read_csv(csv_path).dropna(subset=['label.name'])
    for csv_path in (TRAIN_PARTIAL, VALIDATION)
)

In [None]:
def create_dataset(df, folder_path, n):
    """Create a dataset in the conversation format required to train with unsloth.

    Distinct media IDs are drawn at random with a fixed seed, so the selection
    is reproducible. Every drawn ID whose ``<id>.jpg`` file exists in
    ``folder_path`` becomes one two-turn conversation: the user turn holds the
    prompt text and the opened image, the assistant turn holds the label text.

    Args:
        df: Annotation table with a ``point.media.id`` column and a
            ``label.name`` column.
        folder_path: ``pathlib.Path`` of the directory containing the images.
        n: Maximum number of samples to build.

    Returns:
        A list of ``{"messages": [...]}`` dicts with at most ``n`` entries. The
        list is shorter when too few of the drawn IDs have an image on disk.
    """
    df_indexed = df.set_index('point.media.id')
    unique_media = df_indexed.index.to_series().drop_duplicates()

    # Draw 100 spare IDs so missing image files can be skipped without falling
    # short of n, but never request more IDs than exist: sampling without
    # replacement beyond the population size raises ValueError in pandas.
    sample_size = min(n + 100, len(unique_media))
    media = unique_media.sample(n=sample_size, random_state=42)

    i = 0
    dataset = []
    for media_id in media:
        if i >= n:
            break

        image_path = folder_path / f"{media_id}.jpg"
        if not image_path.exists():
            print("Image not found, skipping path...")
            continue
        i += 1
        print(f"Getting image {i}...")
        img = Image.open(image_path)

        # NOTE(review): if a media id occurs on several rows, .loc returns a
        # Series rather than a single string — confirm the CSV has one row per
        # media id, or aggregate the labels before this point.
        y_true = df_indexed.loc[media_id, 'label.name']

        # uncomment the context for different fine-tuning technique
        prompt_context = ""                                                                                    # basic prompt
        # prompt_context = f"\n### Context ###\n(latitude, longitude): ({latitude},{longitude})\ndepth: {depth}" # numerical context
        # prompt_context = f"\n### Context ###\nRealm: {realm}\nProvince: {province}\nEcoregion: {ecoregion}"    # hierarchical
        # prompt_context = f"\n### Context ###\nEcoregion: {ecoregion}"                                          # ecoregions only

        new_prompt = PROMPT + prompt_context

        conversation = [
            { "role": "user",
              "content" : [
                {"type" : "text",  "text"  : new_prompt},
                {"type" : "image", "image" : img} ]
            },
            { "role" : "assistant",
              "content" : [
                {"type" : "text",  "text"  : y_true} ]
            },
        ]
        messages = { "messages" : conversation }

        dataset.append(messages)

    print(f"Finished. Successfully downloaded {i} images and their annotations")
    return dataset

In [None]:
# Build up to N training conversations from the training annotations.
train_dataset = create_dataset(df=df_train, folder_path=IMAGE_FOLDER, n=N)

In [None]:
# Build up to 300 validation conversations from the validation annotations.
val_dataset = create_dataset(df=df_val, folder_path=IMAGE_FOLDER, n=300)

### Prepare model with unsloth

In [None]:
# Checkpoint to fine-tune; swap in the 7B name below to train the larger model.
model_name = "unsloth/Qwen2.5-VL-3B-Instruct-bnb-4bit"
# model_name = "unsloth/Qwen2.5-VL-7B-Instruct-bnb-4bit"

# NOTE(review): the checkpoint name ends in "bnb-4bit" while load_in_4bit is
# False — confirm which precision unsloth actually loads for this combination.
model, tokenizer = FastVisionModel.from_pretrained(
    model_name,
    load_in_4bit=False,
    use_gradient_checkpointing=False,
)

# Which parts of the network are fine-tuned; set an entry to False to exclude
# that part.
finetune_targets = {
    "finetune_vision_layers": True,
    "finetune_language_layers": True,
    "finetune_attention_modules": True,
    "finetune_mlp_modules": True,
}

# Adapter hyperparameters.
lora_hyperparameters = {
    "r": 16,            # The larger, the higher the accuracy, but might overfit
    "lora_alpha": 16,   # Recommended alpha == r at least
    "lora_dropout": 0,
    "bias": "none",
    "random_state": 3407,
    "use_rslora": False,   # rank-stabilized LoRA is available but not used
    "loftq_config": None,  # LoftQ is available but not used
}

model = FastVisionModel.get_peft_model(
    model,
    **finetune_targets,
    **lora_hyperparameters,
)

### Train the model

In [None]:
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig

# Switch the model into training mode before the trainer is built.
FastVisionModel.for_training(model)

# The unsloth vision collator must be used for this image + text data.
vision_collator = UnslothVisionDataCollator(model, tokenizer)

training_config = SFTConfig(
    # --- optimisation ---
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    num_train_epochs=1,  # full pass over the data; a max_steps cap is the alternative for short runs
    learning_rate=2e-4,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,

    # --- logging / output ---
    logging_steps=1,
    output_dir="outputs",
    report_to="none",  # e.g. Weights and Biases could be named here instead

    # --- required for vision fine-tuning ---
    remove_unused_columns=False,
    dataset_text_field="",
    dataset_kwargs={"skip_prepare_dataset": True},
    dataset_num_proc=4,

    # --- evaluation on the validation set every 10 steps ---
    eval_strategy="steps",
    eval_steps=10,

    assistant_only_loss=True,
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=vision_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    max_seq_length=4096,
    args=training_config,
)

In [None]:
# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
# Run the fine-tuning; the result is kept because the following cell reads
# its 'train_runtime' metric.
trainer_stats = trainer.train()

In [None]:
# @title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

## Show effect of epochs

In [None]:
log_history = trainer.state.log_history

# Training records are those with a 'loss' key and no 'eval_loss' key;
# evaluation records are those with an 'eval_loss' key.
train_records = [
    rec for rec in log_history if 'loss' in rec and 'eval_loss' not in rec
]
eval_records = [rec for rec in log_history if 'eval_loss' in rec]

train_losses = [rec['loss'] for rec in train_records]
train_epochs = [rec['epoch'] for rec in train_records]
eval_losses = [rec['eval_loss'] for rec in eval_records]
eval_epochs = [rec['epoch'] for rec in eval_records]

# Draw both curves against the epoch value stored in each record.
plt.figure(figsize=(10, 5))
for epochs, losses, series_label in (
    (train_epochs, train_losses, "Training Loss"),
    (eval_epochs, eval_losses, "Validation Loss"),
):
    plt.plot(epochs, losses, label=series_label)
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training vs Validation Loss")
plt.legend()
plt.show()

## Check the output after training

In [None]:
# Take the first validation conversation apart: the image is the second
# content item of the user turn, the label text is the first content item of
# the assistant turn.
first_conversation = val_dataset[0]['messages']
img = first_conversation[0]['content'][1]['image']

print(first_conversation[1]['content'][0]['text'])
img

In [None]:
from transformers import TextStreamer

# Switch the model into inference mode before generating.
FastVisionModel.for_inference(model)

image = img
instruction = PROMPT

# NOTE(review): the content order here is image then text, whereas
# create_dataset builds the user turn as text then image — confirm the
# difference is intended.
user_content = [
    {"type": "image"},
    {"type": "text", "text": instruction},
]
messages = [{"role": "user", "content": user_content}]

input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)

inputs = tokenizer(
    image,
    input_text,
    add_special_tokens=False,
    return_tensors="pt",
)
inputs = inputs.to("cuda")

# Stream the generated text as it is produced, leaving out the prompt.
text_streamer = TextStreamer(tokenizer, skip_prompt=True)

generation_options = {
    "streamer": text_streamer,
    "max_new_tokens": 1028,
    "use_cache": True,
    "temperature": 1.0,
    "min_p": 0.1,
}
_ = model.generate(**inputs, **generation_options)

# Save the model to Hugging Face

In [None]:
# Push the model and tokenizer to the Hugging Face Hub.
# "username/model-name" and "token" are placeholders and must be replaced
# before running; with the condition set to True this cell always executes.
if True:
    model.push_to_hub_merged("username/model-name", tokenizer, token="token")