# Here we test our fine-tuned model's performance against 300 images

Author: Aidan Murray

Date: 2025-09-26


In [None]:
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
!pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
!pip install --no-deps unsloth



### First, mount Google Drive and load the dataset

In [None]:
from pathlib import Path
from google.colab import drive
import pandas as pd
from PIL import Image
import ast

# Mount Drive into the Colab filesystem so the project files are reachable.
drive.mount("/content/drive")

# Project locations on Drive.
BASE_PATH = Path("/content/drive/MyDrive/llm_finetuning")
VAL_PATH = BASE_PATH.joinpath("data", "validation.csv")  # use "test.csv" for the test dataset
IMAGE_FOLDER = BASE_PATH.joinpath("all_images")

# Instruction text that accompanies every image.
PROMPT = (
    "Analyse the entire image carefully and decide which of the label names "
    "correspond to features that are clearly visible in the image."
)

# Number of images to evaluate; use 1000 for the test dataset.
N = 300

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def prepare_dataset(n, data_path, im_path, prompt):
  """Prepare the dataset for inference by the model.

  Reads the CSV at `data_path`, draws a reproducible random sample of media ids
  (random_state=42) and collects the first `n` of them whose image file
  `<im_path>/<id>.jpg` exists.

  Args:
    n: Number of samples wanted.
    data_path: CSV file with the columns 'point.media.id', 'point.pose.lat',
      'point.pose.lon', 'point.pose.dep', 'REALM', 'PROVINCE', 'ECOREGION'
      and 'label.name'.
    im_path: pathlib.Path of the folder that holds the `<id>.jpg` images.
    prompt: Base instruction text stored with every sample.

  Returns:
    A list of dicts with the keys "id", "labels", "image" and "prompt". It has
    `n` entries unless fewer than `n` of the sampled ids have an image on disk.
  """

  df = pd.read_csv(data_path)
  df_indexed = df.set_index('point.media.id')

  # Sample up to 100 spare ids so that missing images can be replaced. The
  # request is capped at the population size because sample() raises when
  # asked for more rows than exist.
  n_candidates = min(n + 100, len(df_indexed))
  media = df_indexed.index.to_series().sample(n=n_candidates, random_state=42)

  dataset = []
  for media_id in media:
    # Stop on the number of samples actually collected, not on the position in
    # the candidate list, so skipped images are made up for by the spares.
    if len(dataset) >= n:
      break

    row = df_indexed.loc[media_id]

    # Metadata used only by the optional context variants below.
    lat = row['point.pose.lat']
    lon = row['point.pose.lon']
    dep = row['point.pose.dep']
    realm = row['REALM']
    province = row['PROVINCE']
    ecoregion = row['ECOREGION']

    prompt_context = ""
    # prompt_context = f"\n### Context ###\n(latitude, longitude): ({lat},{lon})\ndepth: {dep}"
    # prompt_context = f"\n### Context ###\nRealm: {realm}\nProvince: {province}\nEcoregion: {ecoregion}"
    # prompt_context = f"\n### Context ###\nEcoregion: {ecoregion}"

    # Build the prompt from the `prompt` argument rather than a module-level global.
    new_prompt = prompt + prompt_context

    path = im_path / f"{media_id}.jpg"
    if not path.exists():
      print("Image not found, skipping ...")
      continue

    print(f"Getting image {len(dataset) + 1} ...")
    img = Image.open(path)

    dataset.append({"id": media_id,
                    "labels": row['label.name'],
                    "image": img,
                    "prompt": new_prompt})

  return dataset

In [None]:
# Use the sample size from the configuration cell instead of repeating the number here.
dataset = prepare_dataset(N, VAL_PATH, IMAGE_FOLDER, PROMPT)

Getting image 1 ...
Getting image 2 ...
Getting image 3 ...
Getting image 4 ...
Getting image 5 ...
Getting image 6 ...
Getting image 7 ...
Getting image 8 ...
Getting image 9 ...
Getting image 10 ...
Getting image 11 ...
Getting image 12 ...
Getting image 13 ...
Getting image 14 ...
Getting image 15 ...
Getting image 16 ...
Getting image 17 ...
Getting image 18 ...
Getting image 19 ...
Getting image 20 ...
Getting image 21 ...
Getting image 22 ...
Getting image 23 ...
Getting image 24 ...
Getting image 25 ...
Getting image 26 ...
Getting image 27 ...
Getting image 28 ...
Getting image 29 ...
Getting image 30 ...
Getting image 31 ...
Getting image 32 ...
Getting image 33 ...
Getting image 34 ...
Getting image 35 ...
Getting image 36 ...
Getting image 37 ...
Getting image 38 ...
Getting image 39 ...
Getting image 40 ...
Getting image 41 ...
Getting image 42 ...
Getting image 43 ...
Getting image 44 ...
Getting image 45 ...
Getting image 46 ...
Getting image 47 ...
Getting image 48 ...
G

## Now we prepare the model

In [None]:
from unsloth import FastLanguageModel

# Load the fine-tuned checkpoint with 4-bit weights.
# `tokenizer` is later called with an image as well as text, and is also used
# for the chat template and for decoding.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "username/model-name", # YOUR MODEL YOU USED FOR TRAINING
    max_seq_length = 4096,
    load_in_4bit = True,
)

# Enable native 2x faster inference. The trailing semicolon stops the notebook
# from echoing the full model architecture as the cell output.
FastLanguageModel.for_inference(model);

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


    PyTorch 2.6.0+cu124 with CUDA 1204 (you have 2.8.0+cu126)
    Python  3.12.9 (you have 3.12.11)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details


🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.9.2: Fast Qwen2_5_Vl patching. Transformers: 4.56.0.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.69G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/211 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/791 [00:00<?, ?B/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

video_preprocessor_config.json:   0%|          | 0.00/935 [00:00<?, ?B/s]

Qwen2_5_VLForConditionalGeneration(
  (model): Qwen2_5_VLModel(
    (visual): Qwen2_5_VisionTransformerPretrainedModel(
      (patch_embed): Qwen2_5_VisionPatchEmbed(
        (proj): Conv3d(3, 1280, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)
      )
      (rotary_pos_emb): Qwen2_5_VisionRotaryEmbedding()
      (blocks): ModuleList(
        (0-31): 32 x Qwen2_5_VLVisionBlock(
          (norm1): Qwen2RMSNorm((1280,), eps=1e-06)
          (norm2): Qwen2RMSNorm((1280,), eps=1e-06)
          (attn): Qwen2_5_VLVisionAttention(
            (qkv): Linear4bit(in_features=1280, out_features=3840, bias=True)
            (proj): Linear4bit(in_features=1280, out_features=1280, bias=True)
          )
          (mlp): Qwen2_5_VLMLP(
            (gate_proj): Linear4bit(in_features=1280, out_features=3420, bias=True)
            (up_proj): Linear4bit(in_features=1280, out_features=3420, bias=True)
            (down_proj): Linear4bit(in_features=3420, out_features=1280, bias=True)
        

## Make an inference for each image

In [None]:
import time
import warnings

# Per-image results. The four lists are appended to together at the end of each
# iteration, so they always stay index-aligned with one another.
ids = []
true_labels = []
predicted_labels = []
raw_predicted_labels = []
failed_parse = 0  # number of model outputs that could not be read as a collection of labels

MAX_NEW_TOKENS = 1028  # generation budget per image (value unchanged from the earlier runs)


def _parse_predicted_labels(text):
  """Convert raw model output into a list of labels.

  The output is expected to be a Python literal list/tuple/set of hashable
  items. Raises ValueError, SyntaxError, TypeError, MemoryError or
  RecursionError when it is not.
  """
  parsed = ast.literal_eval(text.strip())
  if not isinstance(parsed, (list, tuple, set)):
    # A bare string or number is a valid literal but not a label collection;
    # set() on a string would silently score it character by character.
    raise ValueError(f"expected a collection of labels, got {type(parsed).__name__}")
  set(parsed)  # raises TypeError for unhashable items, which f1_score cannot handle
  return list(parsed)


start_time = time.time()
for i, element in enumerate(dataset):
  print(f"Generating prompt for image {i}...")

  img = element['image']
  messages = [
      {"role": "user", "content": [
          {"type": "image"},
          {"type": "text", "text": element["prompt"]}
      ]},
  ]

  input_text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)
  inputs = tokenizer(
      img,
      input_text,
      add_special_tokens = False,
      return_tensors = "pt",
  ).to("cuda")

  output_tokens = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)
  # Drop the first `input_length` tokens (the prompt) so only the newly
  # generated part is decoded.
  input_length = inputs["input_ids"].shape[1]
  generated_tokens = output_tokens[0][input_length:]
  output_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)

  try:
    y_pred = _parse_predicted_labels(output_text)
  except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
    warnings.warn(f"Warning: Failed to parse model output from image {element['id']}. Error: {e}")
    y_pred = ['Failed']  # sentinel prediction; counted in failed_parse
    failed_parse += 1

  y_true = ast.literal_eval(element['labels'])

  ids.append(element['id'])
  raw_predicted_labels.append(output_text)
  predicted_labels.append(y_pred)
  true_labels.append(y_true)

end_time = time.time()

Generating prompt for image 0...
Generating prompt for image 1...
Generating prompt for image 2...
Generating prompt for image 3...
Generating prompt for image 4...
Generating prompt for image 5...
Generating prompt for image 6...
Generating prompt for image 7...
Generating prompt for image 8...
Generating prompt for image 9...
Generating prompt for image 10...
Generating prompt for image 11...
Generating prompt for image 12...
Generating prompt for image 13...
Generating prompt for image 14...
Generating prompt for image 15...
Generating prompt for image 16...
Generating prompt for image 17...
Generating prompt for image 18...
Generating prompt for image 19...
Generating prompt for image 20...
Generating prompt for image 21...
Generating prompt for image 22...
Generating prompt for image 23...
Generating prompt for image 24...
Generating prompt for image 25...
Generating prompt for image 26...
Generating prompt for image 27...
Generating prompt for image 28...
Generating prompt for im

## Evaluate the accuracy of the model

In [None]:
def f1_score(seq1, seq2):
  """Return the F1 score between two label collections, compared as sets.

  Both inputs are converted to sets, so duplicates and ordering are ignored.
  The score is 2 * |A & B| / (|A| + |B|).

  Args:
    seq1: Iterable of hashable labels.
    seq2: Iterable of hashable labels.

  Returns:
    A float in [0, 1]. Two empty collections agree perfectly and score 1.0;
    this case previously raised ZeroDivisionError.
  """

  set_1 = set(seq1)
  set_2 = set(seq2)

  total = len(set_1) + len(set_2)
  if total == 0:
    return 1.0

  return (2 * len(set_1 & set_2)) / total

In [None]:
# One F1 score per image, pairing each prediction with the true labels at the same position.
evals = [
    f1_score(true_labels[idx], predicted)
    for idx, predicted in enumerate(predicted_labels)
]

## Write output to files

In [None]:
# Folder that receives every artefact of this evaluation run.
OUTPUT_PATH = BASE_PATH / "output_7B_1300_justeco"
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)  # also create missing parent folders

# Files are written as UTF-8 explicitly so non-ASCII model output cannot fail
# on a runtime whose default encoding is something else.

# Parsed predictions, one Python literal per line.
with open(OUTPUT_PATH / "predicted_labels.txt", "w", encoding="utf-8") as f:
  f.writelines(f"{labels}\n" for labels in predicted_labels)

# Raw model text, written back to back. An output that itself contains
# newlines spans several lines of this file.
with open(OUTPUT_PATH / "raw_predicted_labels.txt", "w", encoding="utf-8") as f:
  f.writelines(f"{text}\n" for text in raw_predicted_labels)

# Per-image scores and their summary statistics.
df_eval = pd.DataFrame({"ID": ids, "F1 Score": evals})
df_eval.to_csv(OUTPUT_PATH / "evals.csv")
df_eval["F1 Score"].describe().to_csv(OUTPUT_PATH / "eval_stats.csv")

# Run-level bookkeeping.
with open(OUTPUT_PATH / "info.txt", "w", encoding="utf-8") as f:
  f.write(f"Number of failed parses: {failed_parse}\n")
  f.write(f"Execution time: {end_time - start_time}")

print("done")

done
