### Setup

#### Imports

In [1]:
import os
import copy
import json
import time

import numpy as np

from nimbro_utils.lazy import read_json, escape, format_number
from nimbro_utils.lazy import remove_ansi_escape, draw_text, show_image, save_image # for saving as image

#### Functions

In [2]:
# Evaluation runs (paths relative to the dataset base path) that were conducted with an infinite number
# of retry attempts until a valid structured description was obtained. The evaluation protocol was later
# changed to allow a maximum of four attempts in order to accommodate models that rarely or never succeed,
# so failure rate and retry count are reported as NaN for the runs listed here.
_INFINITE_RETRY_EVALS = frozenset((
    "claude_sonnet_4/fo_2025_06_09_01_25_22_237_label_match",
    "gemini_2_5_flash_high/fo_2025_08_28_14_21_16_422_label_match",
    "gemini_2_5_flash_low/fo_2025_08_28_14_24_39_880_label_match",
    "gemini_2_5_flash_medium/fo_2025_08_28_14_42_54_220_label_match",
    "gemini_2_5_flash_none/fo_2025_08_28_15_08_42_343_label_match",
    "gemini_2_5_flash_preview/fo_2025_06_09_01_43_46_408_label_match",
    "gemini_2_5_pro_high/fo_2025_08_28_15_26_51_137_label_match",
    "gemini_2_5_pro_none/fo_2025_08_28_15_51_46_277_label_match",
    "gemini_2_5_pro_preview/fo_2025_06_09_02_04_38_468_label_match",
    "glm_4_5_reason/fo_2025_08_28_17_06_55_532_label_match",
    "gpt_4_1/fo_2025_06_09_09_54_44_572_label_match",
    "gpt_4_1_mini/fo_2025_06_09_10_01_01_783_label_match",
    "gpt_4_1_nano/fo_2025_06_09_11_31_50_761_label_match",
    "gpt_5_high/fo_2025_08_28_16_19_05_062_label_match",
    "gpt_5_mini_none/fo_2025_08_28_17_00_14_730_label_match",
    "gpt_5_nano_high/fo_2025_08_28_17_37_39_974_label_match",
    "gpt_5_nano_none/fo_2025_08_28_17_53_26_811_label_match",
    "gpt_5_none/fo_2025_08_28_18_06_02_775_label_match",
    "grok_4/fo_2025_08_28_19_05_16_834_label_match",
    "internvl_3_5_2b_none/fo_2025_08_28_18_57_59_521_label_match",
    "internvl_3_5_2b_reason/fo_2025_08_28_20_36_39_816_label_match",
    "internvl_3_5_4b_reason/fo_2025_08_28_19_20_26_476_label_match",
    "internvl_3_5_20b_a4b_reason/fo_2025_08_29_18_37_44_078_label_match",
    "internvl_3_5_30b_a3b_none/fo_2025_08_28_19_33_16_248_label_match",
    "internvl_3_5_30b_a3b_reason/fo_2025_08_28_19_44_42_109_label_match",
    "internvl_3_5_38b_reason/fo_2025_08_28_20_10_41_190_label_match",
    "mistral_medium_3_1_temp_high/fo_2025_08_28_21_07_24_394_label_match",
    "mistral_medium_3_1_temp_low/fo_2025_08_28_21_19_05_965_label_match",
    "ovis_2_5_9b_none/fo_2025_08_26_14_37_31_222_label_match",
    "ovis_2_5_9b_reason/fo_2025_08_28_21_36_05_082_label_match",
    "claude_sonnet_4/fo_2025_06_08_22_11_36_197_label_match",
    "gemini_2_5_flash_high/fo_2025_08_28_22_08_17_562_label_match",
    "gemini_2_5_flash_low/fo_2025_08_29_15_11_06_237_label_match",
    "gemini_2_5_flash_medium/fo_2025_08_29_15_18_30_354_label_match",
    "gemini_2_5_flash_none/fo_2025_08_25_15_21_08_854_label_match",
    "gemini_2_5_flash_preview/fo_2025_06_08_22_43_08_886_label_match",
    "gemini_2_5_pro_high/fo_2025_08_29_15_26_11_360_label_match",
    "gemini_2_5_pro_none/fo_2025_08_28_22_25_19_287_label_match",
    "gemini_2_5_pro_preview/fo_2025_06_08_22_35_47_157_label_match",
    "glm_4_5_reason/fo_2025_08_28_23_09_06_888_label_match",
    "gpt_4_1/fo_2025_06_08_22_29_53_001_label_match",
    "gpt_4_1_mini/fo_2025_06_08_22_47_47_312_label_match",
    "gpt_4_1_nano/fo_2025_06_08_22_52_38_103_label_match",
    "gpt_5_high/fo_2025_08_29_15_56_15_093_label_match",
    "gpt_5_mini_high/fo_2025_08_29_16_08_33_024_label_match",
    "gpt_5_mini_none/fo_2025_08_28_22_54_34_989_label_match",
    "gpt_5_nano_high/fo_2025_08_29_16_21_15_467_label_match",
    "gpt_5_nano_none/fo_2025_08_29_16_41_22_600_label_match",
    "gpt_5_none/fo_2025_08_28_22_43_40_097_label_match",
    "grok_4/fo_2025_08_28_23_01_10_250_label_match",
    "internvl_3_5_2b_none/fo_2025_08_28_23_22_43_290_label_match",
    "internvl_3_5_2b_reason/fo_2025_08_29_16_50_49_757_label_match",
    "internvl_3_5_4b_none/fo_2025_08_29_17_40_43_317_label_match",
    "internvl_3_5_4b_reason/fo_2025_08_29_17_47_57_187_label_match",
    "internvl_3_5_30b_a3b_none/fo_2025_08_29_17_00_39_604_label_match",
    "internvl_3_5_30b_a3b_reason/fo_2025_08_29_17_20_54_905_label_match",
    "internvl_3_5_38b_reason/fo_2025_08_29_17_31_43_513_label_match",
    "mistral_medium_3_1_temp_high/fo_2025_08_28_23_42_47_130_label_match",
    "mistral_medium_3_1_temp_low/fo_2025_08_29_17_57_35_128_label_match",
    "ovis_2_5_9b_none/fo_2025_08_28_23_51_45_603_label_match",
    "ovis_2_5_9b_reason/fo_2025_08_29_18_04_13_326_label_match",
))

# Metric name -> (integers, decimals) arguments passed to format_number().
# The insertion order is also the order in which the metrics are collected and formatted.
_METRIC_FORMAT = {
    "failure": (1, 2),
    "attempts": (1, 2),
    "described_instances": (2, 1),
    "description_time": (3, 1),
    "matched": (1, 2),
    "mAP": (1, 3),
    "precision": (1, 3),
    "recall": (1, 3),
    "fscore": (1, 3),
}

# Metric name -> True if larger values are better. 'matched' is deliberately absent: it is never highlighted.
_HIGHER_IS_BETTER = {
    "failure": False,
    "attempts": False,
    "described_instances": True,
    "description_time": False,
    "mAP": True,
    "precision": True,
    "recall": True,
    "fscore": True,
}

# Metrics that appear (in this order) as columns of a LaTeX row.
_LATEX_COLUMNS = ("described_instances", "description_time", "mAP", "precision", "recall", "fscore")


def _read_evaluation_metrics(model, base_dir, eval_dir, weight=None):
    """Read '<base_dir>/<eval_dir>/evaluation.json' and return its metrics as a dict keyed like _METRIC_FORMAT.

    If `weight` is given, every metric is multiplied by it (used for weighted averaging across datasets).
    Any failure while reading or extracting is re-raised as an Exception naming the model and file.
    """
    file_path = os.path.abspath(os.path.join(base_dir.rstrip(os.path.sep), eval_dir.lstrip(os.path.sep), "evaluation.json"))
    try:
        success, message, results = read_json(file_path=file_path)
        if not success:
            raise RuntimeError(f"{model}: {message}")

        images = results['images']
        infinite_retries = eval_dir in _INFINITE_RETRY_EVALS
        detected_instances = images.get('detected_instances_mean', np.nan)
        # The label-matching ratio needs the image statistics, so compute it before touching 'results'.
        matched_ratio = images.get('label_matching_accept_mean', np.nan) / detected_instances
        performance = results['results'].get('performance_metrics', {})

        metrics = {
            # Failure rate and retry count were not tracked for runs with infinite retries.
            "failure": np.nan if infinite_retries else images.get('described_instances_NaN', 0.0),
            "attempts": np.nan if infinite_retries else images.get('failures_structured_description_mean', np.nan),
            "described_instances": detected_instances,
            "description_time": images.get('time_description_median', np.nan),
            "matched": matched_ratio,
            "mAP": performance.get('mAP', np.nan),
            "precision": performance.get('precision', np.nan),
            "recall": performance.get('recall', np.nan),
            "fscore": performance.get('fscore', np.nan),
        }
        if weight is not None:
            metrics = {key: value * weight for key, value in metrics.items()}
        return metrics
    except Exception as e:
        raise Exception(f"Failed obtaining '{model}' results from '{file_path}': {repr(e)}") from e


def _best_and_worst(values, higher_is_better):
    """Return the (best, worst) indices of `values` ignoring NaNs, or (None, None) if no ranking is possible."""
    try:
        lowest = np.nanargmin(values)
        highest = np.nanargmax(values)
    except (ValueError, TypeError):
        # ValueError: empty or all-NaN input; TypeError: non-numeric entries (e.g. a JSON null).
        return None, None
    return (highest, lowest) if higher_is_better else (lowest, highest)


def write_results(base_path, model_name_to_eval_dict, weights=None, latex=False):
    """Build a results table (plain text with ANSI highlighting, or LaTeX rows) from 'evaluation.json' files.

    Two modes are supported:

    * Single dataset: `base_path` is a string and `model_name_to_eval_dict` maps a display name to the
      evaluation directory relative to `base_path`.
    * Weighted average over several datasets: `base_path` is a list of strings and `model_name_to_eval_dict`
      a list of such dicts (one per dataset). Only models present in every dict are considered, and each
      metric is the weighted sum over the datasets.

    Args:
        base_path: Dataset base directory, or a list of them in weighted mode.
        model_name_to_eval_dict: Display name -> evaluation directory mapping, or a list of such mappings.
        weights: Per-dataset weights for weighted mode; they are normalized to sum to one.
            Defaults to equal weights. Ignored in single-dataset mode.
        latex: If True, emit LaTeX table rows with the best value per column in bold.
            Otherwise, emit a text table where the best/worst value per column is blue/red.

    Returns:
        The table as a single string, with rows sorted by F-score in descending order.

    Raises:
        Exception: If an 'evaluation.json' cannot be read or lacks the expected structure.
    """
    weighted = isinstance(model_name_to_eval_dict, list)

    if weighted:
        # Only models evaluated on every dataset can be averaged.
        considered_models = [
            model for model in model_name_to_eval_dict[0]
            if all(model in other for other in model_name_to_eval_dict[1:])
        ]

        if weights is None:
            weights = [1.0] * len(model_name_to_eval_dict)
        num_weights = len(weights)
        sum_weights = sum(weights)
        weights = [w / sum_weights for w in weights]

        values = {key: [0] * len(considered_models) for key in _METRIC_FORMAT}
        for i, model in enumerate(considered_models):
            for j, w in enumerate(weights):
                metrics = _read_evaluation_metrics(model, base_path[j], model_name_to_eval_dict[j][model], weight=w)
                for key in _METRIC_FORMAT:
                    values[key][i] += metrics[key]
    else:
        considered_models = list(model_name_to_eval_dict.keys())

        values = {key: [] for key in _METRIC_FORMAT}
        for model in considered_models:
            metrics = _read_evaluation_metrics(model, base_path, model_name_to_eval_dict[model])
            for key in _METRIC_FORMAT:
                values[key].append(metrics[key])

    longest_name = max((len(model) for model in considered_models), default=0)
    extrema = {key: _best_and_worst(values[key], higher) for key, higher in _HIGHER_IS_BETTER.items()}

    rows = []
    for i, model in enumerate(considered_models):
        name = f"{model}{' ' * (longest_name - len(model))}"

        cells = {
            key: format_number(values[key][i], integers=integers, decimals=decimals, signed=False)
            for key, (integers, decimals) in _METRIC_FORMAT.items()
        }
        cells["description_time"] = f"{cells['description_time']}s"

        for key, (best, worst) in extrema.items():
            if best is None:
                continue
            value = values[key][i]
            if latex:
                if key not in _LATEX_COLUMNS:
                    continue
                if value == values[key][best]:
                    cells[key] = r"\bf{" + cells[key] + r"}"
                else:
                    # Pad to the width of the '\bf{...}' wrapper so that the LaTeX source stays aligned.
                    cells[key] = f"    {cells[key]} "
            elif value == values[key][best]:
                cells[key] = f"{escape['bold']}{escape['blue']}{cells[key]}{escape['end']}"
            elif value == values[key][worst]:
                cells[key] = f"{escape['bold']}{escape['red']}{cells[key]}{escape['end']}"

        if latex:
            # The evaluation directories are appended as a LaTeX comment for traceability.
            if weighted:
                source = ' '.join(os.path.join(base_path[j], model_name_to_eval_dict[j][model]) for j in range(num_weights))
            else:
                source = model_name_to_eval_dict[model]
            columns = " & ".join(cells[key] for key in _LATEX_COLUMNS)
            row = f"& {name} & {columns} \\\\ % {source}"
        else:
            row = f"{name} | {cells['fscore']} | {cells['recall']} | {cells['precision']} | {cells['mAP']} || {cells['described_instances']} | {cells['matched']} | {cells['description_time']} || {cells['failure']} | {cells['attempts']}"

        rows.append(row)

    if latex:
        result = [f"% {base_path}"]
    else:
        header = f"Model{' ' * (longest_name - len('model'))} |  F1   |  Rec. | Prec. |  mAP  || Ins. | Mat. |  Time  || Fail | Ret. "
        result = [header, len(header) * "-"]

    # Sort rows by F-score in descending order.
    order = np.argsort(-np.array(values["fscore"]))
    result.extend(rows[i] for i in order)

    return "\n".join(result)

In [3]:
def save_results(text, suffix, path="."):
    """Save a results table as a rendered image and as a plain-text file next to it.

    ANSI escape sequences are stripped from `text` before rendering and writing.

    Args:
        text: The table text, e.g. as returned by `write_results`.
        suffix: Passed to `save_image` as the file name suffix.
        path: Passed to `save_image` as the target directory.
    """
    text_normal = remove_ansi_escape(text)

    # Canvas size is tuned to the 22pt mono font below: ~24.6 px per text line, 1315 px wide.
    height = int(round(len(text.splitlines()) * 24.6))
    image = draw_text(
        image=np.zeros((height, 1315), dtype=np.uint8),
        text=text_normal,
        anchor=(0, 0),
        font_path="/usr/share/fonts/truetype/liberation/LiberationMono-Regular.ttf",
        font_size=22,
        # background_color=(255, 255, 255),
        # text_color=(0, 0, 0)
    )
    image_path = save_image(image=image, suffix=suffix, path=path)
    print(f"Results saved to '{os.path.abspath(image_path)}'")

    # Swap only the file extension. A plain str.replace(".png", ".txt") would also rewrite matching
    # directory names, and would leave the path unchanged (appending text to the image) without '.png'.
    text_path = os.path.splitext(image_path)[0] + ".txt"
    with open(text_path, "a") as f:
        f.write(text_normal)
    print(f"Results saved to '{os.path.abspath(text_path)}'")

### Description Models

#### COCO 500

In [4]:
# Base directory of the COCO evaluation runs (relative to this notebook).
base_path_coco = "./../../data/keep/coco"

# Display name -> evaluation directory relative to base_path_coco.
# Entries marked with '# *' were evaluated with infinite retries.
model_name_to_eval_dict_coco = {
    'Claude Sonnet 4': "claude_sonnet_4/fo_2025_06_09_01_25_22_237_label_match", # *
    'Gemini 2.5 Flash (high)': "gemini_2_5_flash_high/fo_2025_08_28_14_21_16_422_label_match", # *
    'Gemini 2.5 Flash Lite (high)': "gemini_2_5_flash_lite_high/fo_2025_09_24_17_21_36_580_label_match",
    'Gemini 2.5 Flash Lite': "gemini_2_5_flash_lite_none/fo_2025_09_24_16_32_10_614_label_match",
    'Gemini 2.5 Flash (low)': "gemini_2_5_flash_low/fo_2025_08_28_14_24_39_880_label_match", # *
    'Gemini 2.5 Flash (medium)': "gemini_2_5_flash_medium/fo_2025_08_28_14_42_54_220_label_match", # *
    'Gemini 2.5 Flash': "gemini_2_5_flash_none/fo_2025_08_28_15_08_42_343_label_match", # *
    'Gemini 2.5 Flash (preview)': "gemini_2_5_flash_preview/fo_2025_06_09_01_43_46_408_label_match", # *
    'Gemini 2.5 Pro (high)': "gemini_2_5_pro_high/fo_2025_08_28_15_26_51_137_label_match", # *
    'Gemini 2.5 Pro': "gemini_2_5_pro_none/fo_2025_08_28_15_51_46_277_label_match", # *
    'Gemini 2.5 Pro (preview)': "gemini_2_5_pro_preview/fo_2025_06_09_02_04_38_468_label_match", # *
    'Gemma 4B': "gemma_4b/fo_2025_09_29_17_03_54_913_label_match",
    'Gemma 12B': "gemma_12b/fo_2025_09_29_16_41_04_519_label_match",
    'Gemma 27B': "gemma_27b/fo_2025_09_29_16_54_08_445_label_match",
    'GLM 4.5V': "glm_4_5_none/fo_2025_09_29_17_11_17_392_label_match",
    'GLM 4.5V (think)': "glm_4_5_reason/fo_2025_08_28_17_06_55_532_label_match", # *
    'GPT-4.1': "gpt_4_1/fo_2025_06_09_09_54_44_572_label_match", # *
    'GPT-4.1 mini': "gpt_4_1_mini/fo_2025_06_09_10_01_01_783_label_match", # *
    'GPT-4.1 nano': "gpt_4_1_nano/fo_2025_06_09_11_31_50_761_label_match", # *
    'GPT-5 (high)': "gpt_5_high/fo_2025_08_28_16_19_05_062_label_match", # *
    'GPT-5 mini (high)': "gpt_5_mini_high/fo_2025_09_26_12_06_47_231_label_match",
    'GPT-5 mini': "gpt_5_mini_none/fo_2025_08_28_17_00_14_730_label_match", # *
    'GPT-5 nano (high)': "gpt_5_nano_high/fo_2025_08_28_17_37_39_974_label_match", # *
    'GPT-5 nano': "gpt_5_nano_none/fo_2025_08_28_17_53_26_811_label_match", # *
    'GPT-5': "gpt_5_none/fo_2025_08_28_18_06_02_775_label_match", # *
    'Grok 4': "grok_4/fo_2025_08_28_19_05_16_834_label_match", # *
    'InternVL 3.5 2B': "internvl_3_5_2b_none/fo_2025_08_28_18_57_59_521_label_match", # *
    'InternVL 3.5 2B (think)': "internvl_3_5_2b_reason/fo_2025_08_28_20_36_39_816_label_match", # *
    'InternVL 3.5 4B (think)': "internvl_3_5_4b_reason/fo_2025_08_28_19_20_26_476_label_match", # *
    'InternVL 3.5 20B A4B (think)': "internvl_3_5_20b_a4b_reason/fo_2025_08_29_18_37_44_078_label_match", # *
    'InternVL 3.5 30B A3B': "internvl_3_5_30b_a3b_none/fo_2025_08_28_19_33_16_248_label_match", # *
    'InternVL 3.5 30B A3B (think)': "internvl_3_5_30b_a3b_reason/fo_2025_08_28_19_44_42_109_label_match", # *
    'InternVL 3.5 38B (think)': "internvl_3_5_38b_reason/fo_2025_08_28_20_10_41_190_label_match", # *
    'Mistral Medium 3.1 (t=1.00)': "mistral_medium_3_1_temp_high/fo_2025_08_28_21_07_24_394_label_match", # *
    'Mistral Medium 3.1 (t=0.15)': "mistral_medium_3_1_temp_low/fo_2025_08_28_21_19_05_965_label_match", # *
    # No trailing space in the name: the key must match the custom-dataset dict for weighted averaging.
    'OVIS 2.5 9B': "ovis_2_5_9b_none/fo_2025_08_26_14_37_31_222_label_match", # *
    'OVIS 2.5 9B (think)': "ovis_2_5_9b_reason/fo_2025_08_28_21_36_05_082_label_match", # *
    'Qwen3-VL 235B A22B': "qwen_3_vl_235b_A22b_none/fo_2025_09_29_17_24_19_003_label_match",
    'Qwen3-VL 235B A22B (reason)': "qwen_3_vl_235b_A22b_reason/fo_2025_09_29_17_35_57_241_label_match",
    'Qwen-VL Plus': "qwen_vl_plus/fo_2025_09_29_17_54_53_346_label_match",
}

# *infinite retries

text = write_results(base_path_coco, model_name_to_eval_dict_coco, latex=False)
print(text)

Model                        |  F1   |  Rec. | Prec. |  mAP  || Ins. | Mat. |  Time  || Fail | Ret. 
----------------------------------------------------------------------------------------------------
Gemini 2.5 Pro (high)        | [1m[94m0.541[0m | 0.489 | 0.606 | 0.338 || 11.4 | 0.50 |  14.5s ||  nan |  nan
Gemini 2.5 Pro (preview)     | 0.537 | 0.487 | 0.599 | [1m[94m0.350[0m || 11.1 | 0.51 |  13.3s ||  nan |  nan
Gemini 2.5 Pro               | 0.537 | 0.487 | 0.598 | 0.337 || 11.4 | 0.50 |  14.7s ||  nan |  nan
GLM 4.5V (think)             | 0.526 | 0.445 | 0.642 | 0.334 ||  8.2 | 0.60 |  14.9s ||  nan |  nan
Grok 4                       | 0.524 | [1m[94m0.531[0m | 0.518 | 0.340 || 12.1 | 0.59 |  58.2s ||  nan |  nan
InternVL 3.5 38B (think)     | 0.523 | 0.464 | 0.599 | 0.336 ||  9.8 | 0.55 |  59.1s ||  nan |  nan
Gemini 2.5 Flash (medium)    | 0.520 | 0.487 | 0.558 | 0.323 || 12.7 | 0.48 |   6.6s ||  nan |  nan
Gemini 2.5 Flash Lite (high) | 0.520 | 0.449 | 0.616 | 0.31

Models are sorted in descending order by F1 score.<br>
Best model(s) per column is/are highlighted in blue.<br>
Worst model(s) per column is/are highlighted in red.<br>

Legend:
- F1: The achieved F1 score of detections that passed label matching compared to ground-truth annotations.<br>
- Rec.: The achieved recall score of detections that passed label matching compared to ground-truth annotations.<br>
- Prec.: The achieved precision score of detections that passed label matching compared to ground-truth annotations.<br>
- mAP: The achieved mAP score of detections that passed label matching compared to ground-truth annotations.<br>
- Ins.: The average number of object instances in a valid structured description per image.<br>
- Mat.: The ratio of matched detections by the label matching procedure over all detections.<br>
- Time: The median time to generate a valid structured description over all images.<br>
- Fail: The rate of invalid structured descriptions after all (4) generation attempts.<br>
- Ret.: The average number of retry attempts to generate a valid structured description per image (0 to 3).<br>
 
Remarks:
- Models where the last two columns report nan were evaluated with an infinite and untracked number of retry attempts, until a valid structured description was obtained.<br>
- All models were used and interpreted at best effort, limiting parallel usage, attempting to extract JSON from within markdown tags or reasoning content, etc.<br>
- Reasons for failed attempts may include rate limits, content moderation, timeouts, reaching max. token limits, etc.<br>
- All reported times may be heavily affected by the hardware used, rate limits, server load, etc.<br>

In [5]:
# print("ros2 run vlm_gist fiftyone_eval " + " ".join([f'"{os.path.abspath(os.path.join(base_path_coco.rstrip(os.path.sep), model_name_to_eval_dict_coco[model].lstrip(os.path.sep)))}"' for model in model_name_to_eval_dict_coco]))

In [6]:
# Persist the COCO table from `text` as a rendered image plus a plain-text copy (ANSI codes stripped).
save_results(text=text, suffix="coco")

Results saved to '/home/paetzoldb0/ws/jazzy/main/src/vlm_gist/notebooks/evaluation/2025_10_03T00_23_01_26eac753_coco.png'
Results saved to '/home/paetzoldb0/ws/jazzy/main/src/vlm_gist/notebooks/evaluation/2025_10_03T00_23_01_26eac753_coco.txt'


#### Custom Dataset

In [7]:
# Base directory of the custom-dataset evaluation runs (relative to this notebook).
base_path_custom = "./../../data/keep/custom"

# Display name -> evaluation directory relative to base_path_custom.
# Entries marked with '# *' were evaluated with infinite retries.
model_name_to_eval_dict_custom = {
    "Claude Sonnet 4": "claude_sonnet_4/fo_2025_06_08_22_11_36_197_label_match",  # *
    "Gemini 2.5 Flash (high)": "gemini_2_5_flash_high/fo_2025_08_28_22_08_17_562_label_match",  # *
    "Gemini 2.5 Flash Lite (high)": "gemini_2_5_flash_lite_high/fo_2025_09_24_12_52_14_681_label_match",
    "Gemini 2.5 Flash Lite": "gemini_2_5_flash_lite_none/fo_2025_09_24_12_51_42_507_label_match",
    "Gemini 2.5 Flash (low)": "gemini_2_5_flash_low/fo_2025_08_29_15_11_06_237_label_match",  # *
    "Gemini 2.5 Flash (medium)": "gemini_2_5_flash_medium/fo_2025_08_29_15_18_30_354_label_match",  # *
    "Gemini 2.5 Flash": "gemini_2_5_flash_none/fo_2025_08_25_15_21_08_854_label_match",  # *
    "Gemini 2.5 Flash (preview)": "gemini_2_5_flash_preview/fo_2025_06_08_22_43_08_886_label_match",  # *
    "Gemini 2.5 Pro (high)": "gemini_2_5_pro_high/fo_2025_08_29_15_26_11_360_label_match",  # *
    "Gemini 2.5 Pro": "gemini_2_5_pro_none/fo_2025_08_28_22_25_19_287_label_match",  # *
    "Gemini 2.5 Pro (preview)": "gemini_2_5_pro_preview/fo_2025_06_08_22_35_47_157_label_match",  # *
    "Gemma 4B": "gemma_4b/fo_2025_09_26_12_24_16_991_label_match",
    "Gemma 12B": "gemma_12b/fo_2025_09_30_13_04_45_707_label_match",
    "Gemma 27B": "gemma_27b/fo_2025_09_26_12_21_24_437_label_match",
    "GLM 4.5V": "glm_4_5_none/fo_2025_09_26_12_34_34_430_label_match",
    "GLM 4.5V (think)": "glm_4_5_reason/fo_2025_08_28_23_09_06_888_label_match",  # *
    "GPT-4.1": "gpt_4_1/fo_2025_06_08_22_29_53_001_label_match",  # *
    "GPT-4.1 mini": "gpt_4_1_mini/fo_2025_06_08_22_47_47_312_label_match",  # *
    "GPT-4.1 nano": "gpt_4_1_nano/fo_2025_06_08_22_52_38_103_label_match",  # *
    "GPT-5 (high)": "gpt_5_high/fo_2025_08_29_15_56_15_093_label_match",  # *
    "GPT-5 mini (high)": "gpt_5_mini_high/fo_2025_08_29_16_08_33_024_label_match",  # *
    "GPT-5 mini": "gpt_5_mini_none/fo_2025_08_28_22_54_34_989_label_match",  # *
    "GPT-5 nano (high)": "gpt_5_nano_high/fo_2025_08_29_16_21_15_467_label_match",  # *
    "GPT-5 nano": "gpt_5_nano_none/fo_2025_08_29_16_41_22_600_label_match",  # *
    "GPT-5": "gpt_5_none/fo_2025_08_28_22_43_40_097_label_match",  # *
    "Grok 4": "grok_4/fo_2025_08_28_23_01_10_250_label_match",  # *
    "InternVL 3.5 2B": "internvl_3_5_2b_none/fo_2025_08_28_23_22_43_290_label_match",  # *
    "InternVL 3.5 2B (think)": "internvl_3_5_2b_reason/fo_2025_08_29_16_50_49_757_label_match",  # *
    "InternVL 3.5 4B": "internvl_3_5_4b_none/fo_2025_08_29_17_40_43_317_label_match",  # *
    "InternVL 3.5 4B (think)": "internvl_3_5_4b_reason/fo_2025_08_29_17_47_57_187_label_match",  # *
    "InternVL 3.5 30B A3B": "internvl_3_5_30b_a3b_none/fo_2025_08_29_17_00_39_604_label_match",  # *
    "InternVL 3.5 30B A3B (think)": "internvl_3_5_30b_a3b_reason/fo_2025_08_29_17_20_54_905_label_match",  # *
    "InternVL 3.5 38B (think)": "internvl_3_5_38b_reason/fo_2025_08_29_17_31_43_513_label_match",  # *
    "Mistral Medium 3.1 (t=1.00)": "mistral_medium_3_1_temp_high/fo_2025_08_28_23_42_47_130_label_match",  # *
    "Mistral Medium 3.1 (t=0.15)": "mistral_medium_3_1_temp_low/fo_2025_08_29_17_57_35_128_label_match",  # *
    "OVIS 2.5 9B": "ovis_2_5_9b_none/fo_2025_08_28_23_51_45_603_label_match",  # *
    "OVIS 2.5 9B (think)": "ovis_2_5_9b_reason/fo_2025_08_29_18_04_13_326_label_match",  # *
    "Qwen3-VL 235B A22B": "qwen_3_vl_235b_A22b_none/fo_2025_09_26_12_38_16_410_label_match",
    "Qwen3-VL 235B A22B (reason)": "qwen_3_vl_235b_A22b_reason/fo_2025_09_30_13_02_46_896_label_match",
    "Qwen-VL Plus": "qwen_vl_plus/fo_2025_09_26_12_40_59_911_label_match",
}

# Render the plain-text (ANSI-highlighted) table for the custom dataset and show it.
text = write_results(
    base_path=base_path_custom,
    model_name_to_eval_dict=model_name_to_eval_dict_custom,
    latex=False,
)
print(text)

Model                        |  F1   |  Rec. | Prec. |  mAP  || Ins. | Mat. |  Time  || Fail | Ret. 
----------------------------------------------------------------------------------------------------
Gemini 2.5 Pro               | [1m[94m0.464[0m | [1m[94m0.417[0m | 0.523 | 0.363 || 14.7 | 1.00 |  16.0s ||  nan |  nan
Gemini 2.5 Pro (high)        | 0.460 | 0.415 | 0.515 | [1m[94m0.366[0m || 14.8 | 1.00 |  17.7s ||  nan |  nan
Gemini 2.5 Pro (preview)     | 0.451 | 0.402 | 0.515 | 0.357 || 14.3 | 1.00 |  14.9s ||  nan |  nan
Gemini 2.5 Flash (low)       | 0.435 | 0.395 | 0.485 | 0.344 || 15.0 | 1.00 |   7.1s ||  nan |  nan
Grok 4                       | 0.428 | 0.385 | 0.482 | 0.340 || 14.7 | 1.00 |  34.7s ||  nan |  nan
GPT-5 mini (high)            | 0.422 | 0.401 | 0.445 | 0.342 || 16.6 | 1.00 |  82.7s ||  nan |  nan
GPT-5                        | 0.418 | 0.413 | 0.424 | 0.352 || 17.9 | 1.00 |  54.2s ||  nan |  nan
OVIS 2.5 9B (think)          | 0.415 | 0.318 | [1m[94m0.5

Models are sorted in descending order by F-1 score.<br>
Best model(s) per column is/are highlighted in blue.<br>
Worst model(s) per column is/are highlighted in red.<br>

Legend:
- F1: The achieved F-1 score of detections that passed label matching compared to groundtruth annotations.<br>
- Rec.: The achieved recall score of detections that passed label matching compared to groundtruth annotations.<br>
- Prec.: The achieved precision score of detections that passed label matching compared to groundtruth annotations.<br>
- mAP: The achieved mAP score of detections that passed label matching compared to groundtruth annotations.<br>
- Ins.: The average number of object instances in a valid structured description per image.<br>
- Mat.: The ratio of matched detections by the label matching procedure over all detections.<br>
- Time: The median time to generate a valid structured description over all images.<br>
- Fail: The rate of invalid structured descriptions after all (4) generation attempts.<br>
- Ret.: The average number of retry attempts to generate a valid structured description per image (0 to 3).<br>
 
Remarks:
- Models for which the last two columns report nan were evaluated with an infinite and untracked number of retry attempts, until a valid structured description was obtained.<br>
- All models were used and their outputs interpreted on a best-effort basis, e.g., by limiting parallel usage and attempting to extract JSON from within markdown tags or reasoning content.<br>
- Reasons for failed attempts may include rate limits, content moderation, timeouts, reaching maximum token limits, etc.<br>
- All reported times may be heavily affected by the hardware used, rate limits, server load, etc.<br>

In [8]:
# Disabled helper: when uncommented, prints a `ros2 run vlm_gist fiftyone_eval` command line
# followed by the quoted absolute path (base_path_custom joined with the dict value) of every
# entry in model_name_to_eval_dict_custom.
# print("ros2 run vlm_gist fiftyone_eval " + " ".join([f'"{os.path.abspath(os.path.join(base_path_custom.rstrip(os.path.sep), model_name_to_eval_dict_custom[model].lstrip(os.path.sep)))}"' for model in model_name_to_eval_dict_custom]))

In [9]:
# Persist the custom-dataset table held in `text`; the cell output reports a .png and a .txt
# file whose names end in the given suffix.
save_results(text=text, suffix="custom")

Results saved to '/home/paetzoldb0/ws/jazzy/main/src/vlm_gist/notebooks/evaluation/2025_10_03T00_23_01_f000e22c_custom.png'
Results saved to '/home/paetzoldb0/ws/jazzy/main/src/vlm_gist/notebooks/evaluation/2025_10_03T00_23_01_f000e22c_custom.txt'


#### Weighted Datasets

In [10]:
# Build one results table over both the COCO and the custom dataset, giving each
# dataset the same weight (0.5), and print it as plain text (non-LaTeX).
# NOTE: this rebinds `text`, which the following save_results cell reads.
text = write_results(
    base_path=[base_path_coco, base_path_custom],
    model_name_to_eval_dict=[model_name_to_eval_dict_coco, model_name_to_eval_dict_custom],
    weights=[0.5, 0.5],
    latex=False,
)
print(text)

Model                        |  F1   |  Rec. | Prec. |  mAP  || Ins. | Mat. |  Time  || Fail | Ret. 
----------------------------------------------------------------------------------------------------
Gemini 2.5 Pro (high)        | [1m[94m0.500[0m | 0.452 | 0.560 | 0.352 || 13.1 | 0.75 |  16.1s ||  nan |  nan
Gemini 2.5 Pro               | 0.500 | 0.452 | 0.560 | 0.350 || 13.0 | 0.75 |  15.3s ||  nan |  nan
Gemini 2.5 Pro (preview)     | 0.494 | 0.444 | 0.557 | [1m[94m0.353[0m || 12.7 | 0.76 |  14.1s ||  nan |  nan
Gemini 2.5 Flash (low)       | 0.477 | 0.437 | 0.524 | 0.335 || 13.5 | 0.75 |   6.4s ||  nan |  nan
Grok 4                       | 0.476 | 0.458 | 0.500 | 0.340 || 13.4 | 0.80 |  46.4s ||  nan |  nan
Gemini 2.5 Flash Lite (high) | 0.466 | 0.395 | 0.569 | 0.303 || 10.5 | 0.79 |  10.7s || [1m[94m0.00[0m | 0.03
Gemini 2.5 Flash (high)      | 0.465 | 0.436 | 0.499 | 0.325 || 14.1 | 0.75 |   7.5s ||  nan |  nan
InternVL 3.5 38B (think)     | 0.464 | 0.398 | 0.560 | 0.31

Models are sorted in descending order by F-1 score.<br>
Best model(s) per column is/are highlighted in blue.<br>
Worst model(s) per column is/are highlighted in red.<br>

Legend:
- F1: The achieved F-1 score of detections that passed label matching compared to groundtruth annotations.<br>
- Rec.: The achieved recall score of detections that passed label matching compared to groundtruth annotations.<br>
- Prec.: The achieved precision score of detections that passed label matching compared to groundtruth annotations.<br>
- mAP: The achieved mAP score of detections that passed label matching compared to groundtruth annotations.<br>
- Ins.: The average number of object instances in a valid structured description per image.<br>
- Mat.: The ratio of matched detections by the label matching procedure over all detections.<br>
- Time: The median time to generate a valid structured description over all images.<br>
- Fail: The rate of invalid structured descriptions after all (4) generation attempts.<br>
- Ret.: The average number of retry attempts to generate a valid structured description per image (0 to 3).<br>
 
Remarks:
- Models for which the last two columns report nan were evaluated with an infinite and untracked number of retry attempts, until a valid structured description was obtained.<br>
- All models were used and their outputs interpreted on a best-effort basis, e.g., by limiting parallel usage and attempting to extract JSON from within markdown tags or reasoning content.<br>
- Reasons for failed attempts may include rate limits, content moderation, timeouts, reaching maximum token limits, etc.<br>
- All reported times may be heavily affected by the hardware used, rate limits, server load, etc.<br>

In [11]:
# Persist the weighted (COCO + custom) table held in `text`; the cell output reports a .png
# and a .txt file whose names end in the given suffix.
save_results(text=text, suffix="weighted")

Results saved to '/home/paetzoldb0/ws/jazzy/main/src/vlm_gist/notebooks/evaluation/2025_10_03T00_23_01_6361e58f_weighted.png'
Results saved to '/home/paetzoldb0/ws/jazzy/main/src/vlm_gist/notebooks/evaluation/2025_10_03T00_23_01_6361e58f_weighted.txt'
