### Download LEGO-VLM dataset

In [None]:
%pip install huggingface_hub
from huggingface_hub import snapshot_download

# Fetch the dataset repository into the current working directory; the
# value returned by snapshot_download is kept in local_folder.
download_options = {
    "repo_id": "PPPPPeter/arta",
    "repo_type": "dataset",
    "local_dir": "./",
}
local_folder = snapshot_download(**download_options)



### Grounding benchmark

In [None]:
#!pip install rouge-score sacrebleu nltk pandas
import json
import nltk
import pandas as pd
from rouge_score import rouge_scorer
from sacrebleu import sentence_bleu

# Download tokenizer models. Newer NLTK releases look up the 'punkt_tab'
# resource for word_tokenize while older ones use 'punkt', so fetch both.
for _resource in ('punkt', 'punkt_tab'):
    nltk.download(_resource, quiet=True)

# Load all examples from the JSONL file (one JSON object per line).
# Blank lines (e.g. a trailing newline) are skipped instead of crashing json.loads.
with open('eval_1/eval_grounding.jsonl', 'r', encoding='utf-8') as f:
    examples = [json.loads(line) for line in f if line.strip()]

# Function to compute token-overlap F1 (F1-Theme)
def compute_f1_theme(gold, pred):
    """Return the token-overlap F1 between a reference and a prediction.

    Both strings are lower-cased and tokenized with ``nltk.word_tokenize``.
    The overlap is counted as a multiset intersection, so a token repeated
    in both texts is credited once per shared occurrence. Counting the
    overlap on sets while dividing by list lengths would score two identical
    texts containing repeated tokens below 1.0.

    Args:
        gold: Reference text.
        pred: Predicted text.

    Returns:
        F1 in [0, 1]; 0.0 when the texts share no token.
    """
    # Local import: this cell does not import collections at the top.
    from collections import Counter

    gt_tokens = nltk.word_tokenize(gold.lower())
    pr_tokens = nltk.word_tokenize(pred.lower())
    overlap = sum((Counter(gt_tokens) & Counter(pr_tokens)).values())
    if overlap == 0:
        return 0.0
    precision = overlap / len(pr_tokens)
    recall = overlap / len(gt_tokens)
    return 2 * (precision * recall) / (precision + recall)

# ROUGE scorer shared by every example (stemming enabled)
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Reported column name -> key of the matching entry in the scorer result
rouge_columns = {"ROUGE-1": "rouge1", "ROUGE-2": "rouge2", "ROUGE-L": "rougeL"}

# One row of metrics per example
rows = []
for ex in examples:
    gt, pr = ex["ground_truth"], ex["prediction"]
    f1 = compute_f1_theme(gt, pr)
    rouge = scorer.score(gt, pr)
    # Prediction is passed first, the ground truth as a one-element list
    bleu = sentence_bleu(pr, [gt]).score

    row = {"F1-Theme": f1}
    for column, key in rouge_columns.items():
        row[column] = rouge[key].fmeasure
    row["BLEU"] = bleu
    rows.append(row)

# Per-metric averages over all rows
df = pd.DataFrame(rows)
avg_metrics = df.mean()

# BLEU is printed with two decimals, every other metric with four
print("\nAverage Metrics:")
for metric, value in avg_metrics.items():
    number_format = ".2f" if metric == "BLEU" else ".4f"
    print(f"{metric}: {format(value, number_format)}")

### State benchmark

In [None]:
import json
from sklearn.metrics import f1_score, confusion_matrix

import re  # needed in this cell for whole-word yes/no matching

_YES_RE = re.compile(r"\byes\b")
_NO_RE = re.compile(r"\bno\b")


def _to_yes_no(text):
    """Map free text to "yes" / "no", or None when neither word occurs.

    Matching is on whole words, so "eyes" no longer counts as "yes" and
    "not" / "know" no longer count as "no". As before, "yes" takes
    precedence when both words are present.
    """
    normalized = (text or "").strip().lower()
    if _YES_RE.search(normalized):
        return "yes"
    if _NO_RE.search(normalized):
        return "no"
    return None


# Load JSONL file (blank lines are skipped so they cannot crash json.loads)
with open('eval_1/eval_state.jsonl', 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

ground_truths = []
predictions = []

for example in data:
    gt_binary = _to_yes_no(example.get("ground_truth", ""))
    if gt_binary is None:
        continue  # skip malformed GT

    # A prediction without a recognisable answer defaults to "no"
    pred_binary = _to_yes_no(example.get("prediction", "")) or "no"

    ground_truths.append(gt_binary)
    predictions.append(pred_binary)

# Convert to binary labels
y_true = [1 if x == "yes" else 0 for x in ground_truths]
y_pred = [1 if x == "yes" else 0 for x in predictions]

# Compute metrics. labels=[0, 1] forces a 2x2 matrix even when only one
# class occurs in the data; otherwise the four-way unpacking below fails.
f1 = f1_score(y_true, y_pred, average='binary', pos_label=1)
tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

# False Positive Rate: FP / (FP + TN)
fpr = fp / (fp + tn) if (fp + tn) > 0 else 0.0

# Print results
print(f"Total examples processed: {len(ground_truths)}")
print(f"F1-State: {f1:.4f}")
print(f"False Positive Rate (FPR): {fpr:.4f}")
print("\nConfusion Matrix:")
print(f"TN: {tn}, FP: {fp}, FN: {fn}, TP: {tp}")

### Object detection benchmark

In [None]:
import json
import re
import math
from pathlib import Path
from collections import defaultdict
import numpy as np
from PIL import Image
from scipy.optimize import linear_sum_assignment
import matplotlib.pyplot as plt
import seaborn as sns  

#Functions from process_vision_info
# Defaults used by smart_resize
IMAGE_FACTOR = 28                                  # resized sides are multiples of this
MIN_PIXELS = 4 * IMAGE_FACTOR * IMAGE_FACTOR       # 4 * 28 * 28
MAX_PIXELS = 2600 * IMAGE_FACTOR * IMAGE_FACTOR    # 2600 * 28 * 28
MAX_RATIO = 200                                    # largest accepted long-side / short-side ratio

def round_by_factor(number: int, factor: int) -> int:
    """Return the multiple of *factor* closest to *number* (via built-in ``round``)."""
    multiples = round(number / factor)
    return multiples * factor

def ceil_by_factor(number: int, factor: int) -> int:
    """Return the smallest multiple of *factor* that is >= *number*."""
    multiples = math.ceil(number / factor)
    return multiples * factor

def floor_by_factor(number: int, factor: int) -> int:
    """Return the largest multiple of *factor* that is <= *number*."""
    multiples = math.floor(number / factor)
    return multiples * factor

def smart_resize(
    height: int, width: int, factor: int = IMAGE_FACTOR, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS
) -> tuple[int, int]:
    """Compute resized (height, width) that are multiples of *factor*.

    Both sides are first rounded to the nearest multiple of *factor* (never
    below *factor*). If the resulting area exceeds *max_pixels* the sides
    are scaled down and floored to a multiple; if it is below *min_pixels*
    they are scaled up and ceiled to a multiple.

    Returns:
        ``(h_bar, w_bar)``, or ``(0, 0)`` when a side is zero or the aspect
        ratio exceeds ``MAX_RATIO`` (a warning is printed in the latter case).
    """
    short_side = min(height, width)
    if short_side == 0:
        return 0, 0
    if max(height, width) / short_side > MAX_RATIO:
        print(f"Warning: Image with extreme aspect ratio {width}x{height} skipped.")
        return 0, 0

    h_bar = max(factor, round_by_factor(height, factor))
    w_bar = max(factor, round_by_factor(width, factor))
    rounded_area = h_bar * w_bar

    if rounded_area > max_pixels:
        # Too large: shrink both sides by the same ratio, rounding down.
        beta = math.sqrt((height * width) / max_pixels)
        return (
            max(factor, floor_by_factor(height / beta, factor)),
            max(factor, floor_by_factor(width / beta, factor)),
        )
    if rounded_area < min_pixels:
        # Too small: enlarge both sides by the same ratio, rounding up.
        beta = math.sqrt(min_pixels / (height * width))
        return ceil_by_factor(height * beta, factor), ceil_by_factor(width * beta, factor)
    return h_bar, w_bar


def parse_and_validate_boxes(s):
    """Extract well-formed ``{<x1><y1><x2><y2>}`` boxes from *s*.

    Each match is converted to a list of four floats. Boxes are kept only
    when ``x1 < x2`` and ``y1 < y2``; everything else is dropped.
    """
    valid_box_regex = r'\{<(-?\d+(?:\.\d+)?)><(-?\d+(?:\.\d+)?)><(-?\d+(?:\.\d+)?)><(-?\d+(?:\.\d+)?)>\}'
    candidates = (list(map(float, groups)) for groups in re.findall(valid_box_regex, s))
    return [box for box in candidates if box[0] < box[2] and box[1] < box[3]]

def first_existing_image(rel_path: str, prefixes=("", "ARTA_LEGO", "ARTA_LEGO/ARTA_LEGO")):
    """Return the first ``Path(prefix) / rel_path`` that is an existing file.

    Prefixes are tried in order.

    Raises:
        FileNotFoundError: if no prefix yields an existing file.
    """
    candidates = (Path(pfx) / rel_path for pfx in prefixes)
    found = next((candidate for candidate in candidates if candidate.is_file()), None)
    if found is None:
        raise FileNotFoundError(f"No image found for {rel_path} in {prefixes}")
    return found

def match_and_debug(gt, pr):
    """Match ground-truth and predicted boxes one-to-one by maximum total IoU.

    A cost matrix of ``1 - IoU`` is built for every (gt, pr) pair and handed
    to ``linear_sum_assignment``.

    Args:
        gt: Ground-truth boxes, each ``[x1, y1, x2, y2]``.
        pr: Predicted boxes in the same format.

    Returns:
        ``(matched, iou_sum)`` where ``matched`` is a list of
        ``(gt_index, pr_index, iou)`` tuples and ``iou_sum`` is the sum of
        the matched IoUs; ``([], 0.0)`` when either list is empty.
    """
    n_gt, n_pr = len(gt), len(pr)
    if n_gt == 0 or n_pr == 0:
        return [], 0.0

    cost_matrix = np.ones((n_gt, n_pr))
    for i, g in enumerate(gt):
        for j, p in enumerate(pr):
            overlap_w = max(0, min(g[2], p[2]) - max(g[0], p[0]))
            overlap_h = max(0, min(g[3], p[3]) - max(g[1], p[1]))
            inter_area = overlap_w * overlap_h
            g_area = (g[2] - g[0]) * (g[3] - g[1])
            p_area = (p[2] - p[0]) * (p[3] - p[1])
            union_area = g_area + p_area - inter_area
            if union_area > 0:
                iou = inter_area / union_area
            else:
                iou = 0
            cost_matrix[i, j] = 1 - iou

    gt_idx, pr_idx = linear_sum_assignment(cost_matrix)
    matched = []
    for r, c in zip(gt_idx, pr_idx):
        matched.append((r, c, 1 - cost_matrix[r, c]))
    return matched, sum(entry[2] for entry in matched)


# Load the evaluation records. The encoding is explicit (as in the other
# cells) and blank lines are skipped so they cannot crash json.loads.
with open('eval_object_thinking.jsonl', encoding='utf-8') as f:
    data = [json.loads(l) for l in f if l.strip()]

total_iou, all_count, skipped = 0.0, 0, 0
nonzero_iou_examples = []  # per-example mean IoUs that are > 0

for ex in data:
    # Image entry of the FIRST user message; stays None when that message
    # has no image part or its content is not a list of parts.
    user_img_dict = None
    for m in ex["messages"]:
        if m["role"] == "user":
            content = m["content"]
            if isinstance(content, list):
                user_img_dict = next(
                    (c for c in content if isinstance(c, dict) and c.get("type") == "image"),
                    None,
                )
            break

    if not user_img_dict:
        skipped += 1
        continue

    img_rel_path = user_img_dict["image"].replace("file://", "")
    gt_boxes_norm = parse_and_validate_boxes(ex['ground_truth'])
    pr_boxes_raw = parse_and_validate_boxes(ex['prediction'])

    # Only the image size is needed; examples whose image is missing are skipped.
    try:
        with Image.open(first_existing_image(img_rel_path)) as img:
            w_orig, h_orig = img.size
    except FileNotFoundError:
        skipped += 1
        continue

    # Size of the image as the model saw it: taken from the record when
    # present, otherwise recomputed with smart_resize.
    if "resized_width" in user_img_dict and "resized_height" in user_img_dict:
        w_new = int(user_img_dict["resized_width"])
        h_new = int(user_img_dict["resized_height"])
    else:
        h_new, w_new = smart_resize(h_orig, w_orig)

    if any(v == 0 for v in (w_orig, h_orig, w_new, h_new)):
        skipped += 1
        continue

    if (all(0.0 <= c <= 100.0 for b in pr_boxes_raw for c in b) and
            all(0.0 <= c <= 100.0 for b in gt_boxes_norm for c in b)):
        # Every coordinate lies in 0-100: match directly in percent space.
        matched, sum_iou = match_and_debug(gt_boxes_norm, pr_boxes_raw)
    else:
        # Otherwise bring everything into pixels of the original image.
        gt_boxes_px = [[
            b[0] / 100 * w_orig, b[1] / 100 * h_orig,
            b[2] / 100 * w_orig, b[3] / 100 * h_orig,
        ] for b in gt_boxes_norm]

        pr_boxes_px = []
        for b in pr_boxes_raw:
            if any(c > 100.0 for c in b):            # already pixels (resized)
                x1, y1, x2, y2 = b
                pr_boxes_px.append([
                    x1 / w_new * w_orig, y1 / h_new * h_orig,
                    x2 / w_new * w_orig, y2 / h_new * h_orig,
                ])
            else:                                     # 0-100 of resized image
                pr_boxes_px.append([
                    b[0] / 100 * w_orig, b[1] / 100 * h_orig,
                    b[2] / 100 * w_orig, b[3] / 100 * h_orig,
                ])

        matched, sum_iou = match_and_debug(gt_boxes_px, pr_boxes_px)

    # Examples without any matched pair still count, contributing an IoU of 0.
    all_count += 1
    if matched:
        # Mean IoU over the matched pairs of this example. Not named
        # `mean_iou`, which is the name of a function defined later in the file.
        example_mean_iou = sum_iou / len(matched)
        total_iou += example_mean_iou
        if example_mean_iou > 0.0:
            nonzero_iou_examples.append(example_mean_iou)

# Plotting
# Plot styling
sns.set_style("whitegrid")
plt.rcParams.update({'font.size': 12})

print(f"\nProcessed {all_count} examples (skipped {skipped}).")

if all_count <= 0:
    print("No examples were processed.")
else:
    mean_all = total_iou / all_count
    print(f"Overall Mean IoU (all examples)        : {mean_all:.4f}")

    if not nonzero_iou_examples:
        print("No examples with IoU > 0")
    else:
        mean_nozeros = np.mean(nonzero_iou_examples)
        print(f"Overall Mean IoU (non-zero examples) : {mean_nozeros:.4f}")

    # Histogram of IoU values: every processed example that is not in the
    # non-zero list is represented by a 0.0 entry.
    zero_count = all_count - len(nonzero_iou_examples)
    iou_values = [0.0] * zero_count + nonzero_iou_examples
    histogram_mean = np.mean(iou_values)

    plt.figure(figsize=(10, 6))
    plt.hist(iou_values, bins=20, color="#FF9F43", alpha=0.8, edgecolor="black")
    plt.xlabel('IoU Values', fontsize=14)
    plt.ylabel('Frequency', fontsize=14)
    plt.title('Distribution of IoU', fontsize=16, pad=20)
    plt.grid(axis='y', alpha=0.75)
    plt.axvline(histogram_mean, color='red', linestyle='--', linewidth=2,
                label=f'Mean IoU: {histogram_mean:.4f}')
    plt.legend(loc='upper right')
    plt.tight_layout()
    plt.show()

In [None]:
# ===== Visualise N worst & N best with detailed metrics including demos =====
import json, re, numpy as np, matplotlib.pyplot as plt
from pathlib import Path
from PIL import Image, ImageDraw, ImageFont
from scipy.optimize import linear_sum_assignment
import matplotlib.patches as mpatches
import math

# How many examples to show from each end of the IoU ranking
N = 10

# One "<number>" group: optional minus sign, digits, optional decimal part
_COORD = r'<(-?\d+(?:\.\d+)?)>'

# A box is four such groups wrapped in braces: {<x1><y1><x2><y2>}
box_re = re.compile(r'\{' + _COORD * 4 + r'\}')

def boxes(txt):
    """Return every ``{<x1><y1><x2><y2>}`` match in *txt* as a list of four floats."""
    found = []
    for groups in box_re.findall(txt):
        found.append([float(value) for value in groups])
    return found

def last_user_img(msgs):
    """Return the image path of the most recent user message that has an image.

    Messages are scanned from last to first. A leading ``file://`` scheme is
    removed from the path with ``removeprefix``; ``lstrip("file://")`` would
    strip any leading run of the characters ``f i l e : /`` and therefore
    eat the start of paths such as ``images/...`` or ``eval/...``.

    Args:
        msgs: Chat messages; each has a ``"role"`` and a ``"content"`` that
            is expected to be a list of typed parts (non-list content is
            ignored).

    Returns:
        The path string, or None when no user message contains an image.
    """
    for m in reversed(msgs):
        if m["role"] != "user" or not isinstance(m["content"], list):
            continue
        for c in m["content"]:
            if c.get("type") == "image":
                return c["image"].removeprefix("file://")
    return None

def get_demo_message(messages):
    """Return the first image found in a system or user message, with its text.

    Messages are scanned in order and only list-valued contents are
    inspected. A leading ``file://`` scheme is removed with ``removeprefix``
    (``lstrip("file://")`` strips a character set and corrupts paths that
    start with any of ``f i l e : /``).

    Returns:
        ``{"demo_image": path, "demo_prompt": text}`` for the first image
        part found, where ``text`` is the first text part of the same
        message or ``""``. ``"demo_prompt"`` is present for system messages
        too, so callers can read it unconditionally. ``{}`` when no image
        is found.
    """
    for m in messages:
        if m["role"] not in ("system", "user") or not isinstance(m["content"], list):
            continue
        for c in m["content"]:
            if c.get("type") == "image":
                img_path = c["image"].removeprefix("file://")
                txt = next((ct["text"] for ct in m["content"] if ct.get("type") == "text"), "")
                return {"demo_image": img_path, "demo_prompt": txt}
    return {}

def first_path(rel, roots=("", "ARTA_LEGO", "ARTA_LEGO/ARTA_LEGO")):
    """Return the first ``Path(root) / rel`` that is an existing file, else None."""
    candidates = (Path(root) / rel for root in roots)
    return next((candidate for candidate in candidates if candidate.is_file()), None)

def mean_iou(gt, pr):
    """Mean IoU of the one-to-one box assignment found by ``linear_sum_assignment``.

    Args:
        gt: Ground-truth boxes, each ``[x1, y1, x2, y2]``.
        pr: Predicted boxes in the same format.

    Returns:
        Mean IoU over the matched pairs as a float; 0.0 when either list is
        empty.
    """
    n_gt, n_pr = len(gt), len(pr)
    if n_gt == 0 or n_pr == 0:
        return 0.0

    # cost = 1 - IoU; a pair whose union is zero gets the maximum cost of 1
    cost = np.zeros((n_gt, n_pr))
    for i, g in enumerate(gt):
        for j, p in enumerate(pr):
            overlap_w = max(0, min(g[2], p[2]) - max(g[0], p[0]))
            overlap_h = max(0, min(g[3], p[3]) - max(g[1], p[1]))
            inter = overlap_w * overlap_h
            area_g = (g[2] - g[0]) * (g[3] - g[1])
            area_p = (p[2] - p[0]) * (p[3] - p[1])
            union = area_g + area_p - inter
            if union:
                cost[i, j] = 1 - inter / union
            else:
                cost[i, j] = 1

    gt_idx, pr_idx = linear_sum_assignment(cost)
    matched_ious = 1 - cost[gt_idx, pr_idx]
    return float(np.mean(matched_ious))

def ensure_pct(pred, w, h):
    """Return each predicted box both in percent and in pixel coordinates.

    A box whose four coordinates all lie in 0-100 is treated as percent and
    scaled by *w* / *h* to get pixels; any other box is treated as pixels
    and divided by *w* / *h* to get percent.

    Returns:
        ``(pct_boxes, px_boxes)``, two lists aligned with *pred*.
    """
    pct, px = [], []
    for box in pred:
        x1, y1, x2, y2 = box
        in_percent_range = all(0 <= value <= 100 for value in (x1, y1, x2, y2))
        if in_percent_range:
            as_pct = [x1, y1, x2, y2]
            as_px = [x1 / 100 * w, y1 / 100 * h, x2 / 100 * w, y2 / 100 * h]
        else:
            as_pct = [x1 / w * 100, y1 / h * 100, x2 / w * 100, y2 / h * 100]
            as_px = [x1, y1, x2, y2]
        pct.append(as_pct)
        px.append(as_px)
    return pct, px

def get_last_turn(messages):
    """Return the latest user text and the latest assistant content.

    Messages are walked from last to first. The assistant value is the
    content of the most recent assistant message with truthy content. The
    user value is the last text part of the most recent user message that
    yields a non-empty text.

    Returns:
        ``{"user": ..., "assistant": ...}``; a value is ``""`` when nothing
        was found.
    """
    user_text, assistant_text = "", ""
    for m in reversed(messages):
        role = m["role"]
        if role == "assistant" and not assistant_text:
            assistant_text = m["content"]
            continue
        if role == "user" and not user_text:
            texts = [part["text"] for part in m["content"] if part["type"] == "text"]
            if texts:
                user_text = texts[-1]
    return {"user": user_text, "assistant": assistant_text}

# -------------------------------------------------------------------------
# One entry per record that has an image on disk and at least one
# ground-truth box and one predicted box.
ex_list = []

with open("eval_object_thinking.jsonl", encoding="utf-8") as fh:
    for line in fh:
        if not line.strip():
            continue  # tolerate blank lines instead of crashing json.loads
        rec = json.loads(line)

        img_rel = last_user_img(rec["messages"])
        if not img_rel:
            continue
        path = first_path(img_rel)
        if not path:
            continue
        gt_raw = boxes(rec["ground_truth"])
        pr_raw = boxes(rec["prediction"])
        if not (gt_raw and pr_raw):
            continue

        # Only the size is needed; the context manager releases the file
        # handle instead of leaving it open.
        with Image.open(path) as im:
            w, h = im.size
        gt_pct = gt_raw  # GT already in percent
        # NOTE(review): pixel-valued predictions are normalised by the
        # ORIGINAL image size here, while the evaluation cell rescales them
        # from the resized size - confirm which one is intended.
        pr_pct, pr_px = ensure_pct(pr_raw, w, h)

        iou_score = mean_iou(gt_pct, pr_pct)
        last_turn = get_last_turn(rec["messages"])
        demo_info = get_demo_message(rec["messages"])

        ex_list.append({
            "iou": iou_score,
            "path": path,
            "gt_raw": gt_raw,
            "pr_raw": pr_raw,
            "pr_px": pr_px,
            "pr_pct": pr_pct,
            "gt_pct": gt_pct,
            "width": w,
            "height": h,
            "last_turn": last_turn,
            "demo": demo_info,
            "ground_truth": rec["ground_truth"],
            "prediction": rec["prediction"]
        })

all_scores = [e["iou"] for e in ex_list]
if all_scores:
    print(f"Mean IoU over {len(all_scores)} examples: {np.mean(all_scores):.4f}")
else:
    # np.mean([]) would emit a RuntimeWarning and print nan
    print("Mean IoU over 0 examples: n/a")

# Legend patches
gt_patch = mpatches.Patch(color='lime', label='Ground Truth')
pr_patch = mpatches.Patch(color='red', label='Prediction')

# Fallback font used when a per-image font cannot be loaded. truetype raises
# OSError when the font file cannot be read, so fall back to Pillow's
# built-in font rather than aborting the whole cell.
try:
    base_font = ImageFont.truetype("arial.ttf", size=200)
except OSError:
    base_font = ImageFont.load_default()


# Sort by IoU (ascending)
ex_list.sort(key=lambda x: x["iou"])
worst_examples = ex_list[:N]
# Reverse the top slice so that "Best #1" is the highest-IoU example
best_examples = ex_list[-N:][::-1]

# Function to draw image with boxes
def draw_boxes_and_print_info(example, rank_title):
    """Show one example with its boxes drawn and print its details.

    Ground-truth boxes are drawn in lime and predicted boxes in red on a
    copy of the image, with an aspect-ratio banner in the top-left corner.
    If the example carries a demo image it is shown in a second panel.

    Args:
        example: An entry of ``ex_list``; reads the keys ``path``, ``width``,
            ``height``, ``gt_raw``, ``pr_raw``, ``pr_px``, ``iou``, ``demo``,
            ``last_turn``, ``ground_truth`` and ``prediction``.
        rank_title: Label used in the plot title, e.g. ``"Best #1"``.
    """
    path = example["path"]
    w, h = example["width"], example["height"]
    # convert() returns an independent copy, so the file can be closed here
    with Image.open(path) as src:
        img = src.convert("RGBA")
    draw = ImageDraw.Draw(img, "RGBA")

    try:
        # At least 1: int(h * 0.05) is 0 for images under 20 px high
        font_size = max(1, int(h * 0.05))
        font = ImageFont.truetype("arial.ttf", size=font_size)
    except IOError:
        font = base_font

    # Draw aspect ratio banner
    common_divisor = math.gcd(w, h)
    ar_w = w // common_divisor
    ar_h = h // common_divisor
    text = f"{ar_w}:{ar_h}"
    bbox = draw.textbbox((0, 0), text, font=font)
    pad = 4
    bg = (0, 0, 0, 160)
    draw.rectangle((2, 2, bbox[2] + 2 * pad, bbox[3] + 2 * pad), fill=bg)
    draw.text((2 + pad, 2 + pad), text, fill="yellow", font=font)

    # Draw ground truth boxes (percent -> pixels); inverted or empty boxes
    # are skipped, as for predictions
    for x1, y1, x2, y2 in example["gt_raw"]:
        if x2 > x1 and y2 > y1:
            draw.rectangle([x1 / 100 * w, y1 / 100 * h, x2 / 100 * w, y2 / 100 * h],
                           outline="lime", width=3)

    # Draw prediction boxes from the pixel-space list. Scaling pr_raw as
    # percent would misplace predictions that were given in pixels.
    for x1, y1, x2, y2 in example["pr_px"]:
        if x2 > x1 and y2 > y1:
            draw.rectangle([x1, y1, x2, y2], outline="red", width=3)

    # Plotting: two panels when a demo image is attached, otherwise one
    has_demo = "demo_image" in example["demo"]
    fig, ax = plt.subplots(1, 2 if has_demo else 1, figsize=(12, 6))
    if has_demo:
        demo_path = first_path(example["demo"]["demo_image"])
        if demo_path:
            with Image.open(demo_path) as demo_src:
                demo_img = demo_src.convert("RGBA")
            ax[0].imshow(demo_img)
            ax[0].axis("off")
            ax[0].set_title("Demo Example", fontsize=12)

            # Print demo info; .get because "demo_prompt" may be absent
            print("\nDEMO EXAMPLE:")
            print("-"*60)
            print(f"IMAGE PATH: {demo_path}")
            print(f"PROMPT: {example['demo'].get('demo_prompt', '')}")
            print("-"*60)
        else:
            ax[0].text(0.5, 0.5, "No Demo Available", ha="center", va="center")
        ax[1].imshow(img)
        ax[1].axis("off")
        ax[1].set_title(f"{rank_title} | Mean IoU = {example['iou']:.3f}", fontsize=12)
    else:
        ax.imshow(img)
        ax.axis("off")
        ax.set_title(f"{rank_title} | Mean IoU = {example['iou']:.3f}", fontsize=12)

    plt.tight_layout()
    plt.show()

    # Print additional info
    print(f"\n{'='*60}\n")
    print(f"IMAGE PATH: {path}")
    print(f"SIZE: {w}x{h} | ASPECT RATIO: {ar_w}:{ar_h}")
    print(f"GROUND TRUTH ({len(example['gt_raw'])} boxes):")
    for i, box in enumerate(example["gt_raw"]):
        print(f"  GT Box {i+1}: {box}")
    print(f"PREDICTION ({len(example['pr_raw'])} boxes):")
    for i, box in enumerate(example["pr_raw"]):
        print(f"  PR Box {i+1}: {box} | Pixel: {example['pr_px'][i]}")
    print("\nUSER PROMPT:")
    print(example["last_turn"]["user"])
    print("\nASSISTANT RESPONSE:")
    print(example["last_turn"]["assistant"])
    print("\nRAW GROUND TRUTH STRING:")
    print(example["ground_truth"])
    print("\nRAW PREDICTION STRING:")
    # The record's prediction field, not the assistant message printed above
    print(example["prediction"])
    print(f"\n{'='*60}\n")

# Display best and worst
# Show the best group first, then the worst; each example is numbered from 1
display_groups = (
    ("\n\n=== BEST EXAMPLES ===", "Best", best_examples),
    ("\n\n=== WORST EXAMPLES ===", "Worst", worst_examples),
)
for heading, label, group in display_groups:
    print(heading)
    for idx, ex in enumerate(group, 1):
        draw_boxes_and_print_info(ex, f"{label} #{idx}")