# Here we test our fine-tuned model's performance against 300 images

Author: Aidan Murray

Date: 2025-09-26


In [None]:
# Install the runtime dependencies for this notebook (Colab shell commands).
# `--no-deps` tells pip not to pull in each package's own dependencies; only
# xformers is pinned to an exact version here.
# NOTE(review): the remaining packages are unpinned, so a fresh run may resolve
# to different versions than the ones used for training — consider pinning.
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
!pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
!pip install --no-deps unsloth

### First, mount Google Drive and load the dataset

In [None]:
from pathlib import Path
from google.colab import drive
import pandas as pd
from PIL import Image
import ast

# Mount Google Drive so the dataset and images below are reachable from Colab.
drive.mount('/content/drive')

# Root folder of the project on Drive; every other path is derived from it.
BASE_PATH = Path('/content/drive/MyDrive/llm_finetuning')
# CSV holding the annotations to evaluate against.
VAL_PATH = BASE_PATH / "data" / "validation.csv"       # "test.csv" for test dataset
# Folder containing the image files, looked up as "<media id>.jpg".
IMAGE_FOLDER = BASE_PATH /  "all_images"

# Instruction text sent to the model together with every image.
PROMPT = "Analyse the entire image carefully and decide which of the label names correspond to features that are clearly visible in the image."

# Number of images to evaluate.
N = 300      # N = 1000 for test dataset

In [None]:
def prepare_dataset(n, data_path, im_path, prompt):
  """Build the list of samples the model will run inference on.

  Reads the annotation CSV, draws a reproducible random sample of media ids
  (fixed seed 42) and, for every sampled id whose image file exists, records
  the id, its labels, the opened image and the prompt.

  Args:
    n: Number of samples wanted.
    data_path: Path to the annotation CSV. It must contain the columns
      'point.media.id' and 'label.name'.
    im_path: `pathlib.Path` of the folder holding the images, stored as
      "<media id>.jpg".
    prompt: Instruction text attached to every sample.

  Returns:
    A list of at most `n` dicts with the keys "id", "labels", "image" and
    "prompt". Fewer than `n` are returned only when not enough image files
    could be found among the sampled ids.
  """

  df = pd.read_csv(data_path)
  df_indexed = df.set_index('point.media.id')

  # Sample up to 100 spare ids so that ids with a missing image file can be
  # skipped without shrinking the result. The request is capped at the number
  # of rows because `sample` raises when asked for more than the population.
  n_candidates = min(n + 100, len(df_indexed))
  media = df_indexed.index.to_series().sample(n=n_candidates, random_state=42)

  dataset = []
  for media_id in media:
    # Stop on the number of samples actually collected, not on how many ids
    # were visited, so skipped images are replaced from the spare ids.
    if len(dataset) >= n:
      break

    path = im_path / f"{media_id}.jpg"
    if not path.exists():
      print(f"Image not found for {media_id}, skipping ...")
      continue

    # NOTE(review): assumes 'point.media.id' is unique, so `row` is a single
    # record — confirm against the CSV.
    row = df_indexed.loc[media_id]

    # Optional extra context appended to the prompt; enable one variant to
    # experiment with location metadata.
    prompt_context = ""
    # prompt_context = f"\n### Context ###\n(latitude, longitude): ({row['point.pose.lat']},{row['point.pose.lon']})\ndepth: {row['point.pose.dep']}"
    # prompt_context = f"\n### Context ###\nRealm: {row['REALM']}\nProvince: {row['PROVINCE']}\nEcoregion: {row['ECOREGION']}"
    # prompt_context = f"\n### Context ###\nEcoregion: {row['ECOREGION']}"

    # Use the caller-supplied prompt rather than the module-level constant.
    new_prompt = prompt + prompt_context

    print(f"Getting image {len(dataset) + 1} ...")
    img = Image.open(path)

    dataset.append({"id": media_id,
                    "labels": row['label.name'],
                    "image": img,
                    "prompt": new_prompt})

  if len(dataset) < n:
    print(f"Only {len(dataset)} of the requested {n} images were found.")

  return dataset

In [None]:
# Use the N constant from the config cell so the sample size is set in one place.
dataset = prepare_dataset(N, VAL_PATH, IMAGE_FOLDER, PROMPT)

## Now we prepare the model

In [None]:
from unsloth import FastLanguageModel

# Load the fine-tuned checkpoint and the object used to build model inputs.
# NOTE(review): the inference cell calls `tokenizer(img, text, ...)`, so the
# second return value is expected to accept images. Confirm that
# FastLanguageModel is the right loader for this checkpoint — Unsloth also
# provides FastVisionModel for vision models.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "username/model-name", # YOUR MODEL YOU USED FOR TRAINING
    max_seq_length = 4096,
    # dtype = dtype,
    load_in_4bit = True,
)
# Switch the model to inference mode before generating.
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

## Make an inference for each image

In [None]:
import time
import warnings

def _parse_label_list(text):
  """Parse the model's raw output into a collection of labels.

  Args:
    text: Decoded model output, expected to be a Python list literal.

  Returns:
    The parsed list, tuple or set.

  Raises:
    ValueError: If the text is not a valid literal, is not a list-like
      collection, or contains elements that cannot be put into a set.
  """
  try:
    parsed = ast.literal_eval(text.strip())
  except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
    # literal_eval can raise any of these on malformed input.
    raise ValueError(f"not a Python literal: {e}") from e

  # A bare string, number or dict parses fine but is not a label list and
  # would crash or distort the set-based F1 computation later on.
  if not isinstance(parsed, (list, tuple, set)):
    raise ValueError(f"expected a list of labels, got {type(parsed).__name__}")
  try:
    set(parsed)
  except TypeError as e:
    raise ValueError(f"labels are not hashable: {e}") from e

  return parsed


# Parallel result lists: entry k of each list belongs to the same image.
ids = []
true_labels = []
predicted_labels = []
raw_predicted_labels = []
failed_parse = 0

start_time = time.time()
for i, element in enumerate(dataset):
  print(f"Generating prompt for image {i}...")

  img = element['image']
  # Single-turn chat message: an image placeholder followed by the prompt text.
  messages = [
      {"role": "user", "content": [
          {"type": "image"},
          {"type": "text", "text": element["prompt"]}
      ]},
  ]

  input_text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)
  inputs = tokenizer(
      img,
      input_text,
      add_special_tokens = False,
      return_tensors = "pt",
  ).to("cuda")

  # NOTE(review): 1028 may have been meant as 1024 — confirm.
  output_tokens = model.generate(**inputs, max_new_tokens=1028)
  # Slice off the prompt tokens so only the newly generated text is decoded.
  input_length = inputs["input_ids"].shape[1]
  generated_tokens = output_tokens[0][input_length:]
  output_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)

  try:
    y_pred = _parse_label_list(output_text)
  except ValueError as e:
    warnings.warn(f"Warning: Failed to parse model output from image {element['id']}. Error: {e}")
    # Sentinel prediction so the image still gets a score.
    y_pred = ['Failed']
    failed_parse += 1

  y_true = ast.literal_eval(element['labels'])

  # Append to all result lists together so they stay aligned.
  ids.append(element['id'])
  raw_predicted_labels.append(output_text)
  predicted_labels.append(y_pred)
  true_labels.append(y_true)

end_time = time.time()

## Evaluate the accuracy of the model

In [None]:
def f1_score(seq1, seq2):
  """Return the F1 score between two label collections, compared as sets.

  Both inputs are converted to sets, so duplicates and ordering are ignored.
  The score is 2 * |A ∩ B| / (|A| + |B|).

  Args:
    seq1: Iterable of hashable labels (e.g. the ground truth).
    seq2: Iterable of hashable labels (e.g. the prediction).

  Returns:
    A float in [0, 1]. Two empty collections agree perfectly and score 1.0.
  """

  set_1 = set(seq1)
  set_2 = set(seq2)

  total = len(set_1) + len(set_2)
  if total == 0:
    # Both empty: the formula would divide by zero; treat as a perfect match.
    return 1.0

  return (2 * len(set_1 & set_2)) / total

In [None]:
# Score every image by pairing its ground-truth labels with the prediction.
# The two lists are parallel; fail loudly if they ever get out of step.
assert len(true_labels) == len(predicted_labels), "true_labels and predicted_labels are misaligned"
evals = [f1_score(y_true, y_pred) for y_true, y_pred in zip(true_labels, predicted_labels)]

## Write output to files

In [None]:
# Folder that receives all result files for this run.
OUTPUT_PATH = BASE_PATH / "output_7B_1300_justeco"
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

# Model output is arbitrary text, so write with an explicit UTF-8 encoding
# instead of relying on the platform default.
with open(OUTPUT_PATH / "predicted_labels.txt", "w", encoding="utf-8") as f:
  # One parsed prediction per line, in dataset order.
  for line in predicted_labels:
    f.write(str(line) + '\n')

with open(OUTPUT_PATH / "raw_predicted_labels.txt", "w", encoding="utf-8") as f:
  # Unparsed model output, in dataset order.
  for line in raw_predicted_labels:
    f.write(line + '\n')

# Per-image scores plus their summary statistics.
df_eval = pd.DataFrame({"ID": ids, "F1 Score": evals})
df_eval.to_csv(OUTPUT_PATH / "evals.csv")
df_eval["F1 Score"].describe().to_csv(OUTPUT_PATH / "eval_stats.csv")

with open(OUTPUT_PATH / "info.txt", "w", encoding="utf-8") as f:
  f.write(f"Number of failed parses: {failed_parse}\n")
  f.write(f"Execution time: {end_time - start_time}")

print("done")