# 🧪 SEE Evaluator
This notebook evaluates VLM-generated screen descriptions using GPT-4o and DeepEval. It measures both Reference Coverage and Must-Include Coverage against the SEE-Benchmark annotations.

# 🛠️ Configuration

In [14]:
# Excel file holding the model outputs to score. Step 4 reads it and uses its
# `image_id` and `model_output` columns. Step 5 splits the file stem on "__"
# to name the result file, i.e. "<model>__<prompt_version>.xlsx"; when the stem
# contains no "__" the prompt version is recorded as "unknown".
EVAL_INPUT_XLSX = "descriptions/example_descriptions.xlsx"

# 📦 Step 1: Setup and Imports

In [15]:
import os
import base64
import json
import pandas as pd
from pathlib import Path
from datetime import datetime
from dotenv import load_dotenv
from together import Together
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr, spearmanr

# Load variables from a .env file into the process environment.
# NOTE(review): presumably this supplies the API key for the gpt-4o judge used
# by the metrics below — confirm which variables the .env file must define.
load_dotenv()

True

# 📂 Step 2: Load Annotations

In [16]:
# Parse the benchmark annotation file (UTF-8 JSON) once; later cells look up
# the reference description and must-include list per image in this object.
annotations = json.loads(
    Path("annotation_dataset/annotations.json").read_text(encoding="utf-8")
)

# 📏 Step 3: Define SEE Evaluation Metrics using GEval (DeepEval)

In [17]:
# Part 1: GEval metric definitions used to score every generated description.
# Both metrics are judged by gpt-4o; `threshold` is the minimum score that
# counts as a pass for that metric.
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams
from deepeval.evaluate import AsyncConfig  # kept from the original cell; not referenced in this cell

# Reference coverage: compares the model output (ACTUAL_OUTPUT) with the
# annotated reference description (EXPECTED_OUTPUT).
reference_coverage_eval = GEval(
    name="Referentie-dekking (SEE-Benchmark)",
    criteria=(
        "Beoordeel hoe goed de modeloutput overeenkomt met de essentie van de referentiebeschrijving. "
        "Vergelijk het met het nakijken van een open vraag: parafraseren en andere woordvolgorde mogen, zolang de kern van het antwoord gelijk blijft. "
        "Hoe meer belangrijke en relevante elementen correct aanwezig zijn, hoe hoger de score. "
        "De beschrijving moet functioneel bruikbaar zijn voor een blinde gebruiker en volledig in het Nederlands zijn."
    ),
    evaluation_params=[
        LLMTestCaseParams.ACTUAL_OUTPUT,
        LLMTestCaseParams.EXPECTED_OUTPUT
    ],
    evaluation_steps=[
        "De beschrijving moet volledig in het Nederlands zijn. Engelse termen zijn alleen toegestaan als ze letterlijk ook in de referentie staan. Als er Engelse termen voorkomen die niet in de referentie staan, geef een lage score (onder 0.3).",
        "Controleer of alle belangrijke en functioneel relevante elementen uit de referentie aanwezig zijn in de modeloutput, ook als ze net anders genoemd worden, zolang de essentie van het functioneel relevante element maar overkomt. Als er iets ontbreekt, trek dan punten af.",
        "Parafrasering is toegestaan, zolang betekenis en inhoud behouden blijven. Als de betekenis verandert of vager wordt, trek dan punten af.",
        "Als er elementen genoemd worden die niet in de referentie staan (zoals extra knoppen of functionaliteiten), trek dan punten af.",
        "Als de modeloutput visuele of stilistische details bevat die niet in de referentie staan én geen functioneel doel dienen, trek dan punten af.",
        "Als de modeloutput veel langer is dan nodig of onnodige omschrijvingen bevat, trek dan punten af."
    ],
    model="gpt-4o",
    threshold=0.7
)

# Must-include coverage: checks the model output (ACTUAL_OUTPUT) against the
# list of mandatory elements, which Step 4 passes in through the INPUT field.
must_include_coverage_eval = GEval(
    name="Verplichte Elementen Dekking",
    criteria="Alle verplichte elementen uit de lijst moeten herkenbaar en in het Nederlands aanwezig zijn in de modeloutput, ook als synoniem of parafrase. Het belangrijkste is dat de essentie van het element behouden blijft.",
    evaluation_params=[
        LLMTestCaseParams.ACTUAL_OUTPUT,
        LLMTestCaseParams.INPUT
    ],
    evaluation_steps=[
        # The comma after the first step is required: without it Python joins
        # the two adjacent literals into a single step with no separating space.
        "Controleer voor elk verplicht element of het in de output staat, ook synoniemen of parafraseringen tellen even zwaar mee.",
        "Als alles aanwezig is telt dit sterk mee."
    ],
    model="gpt-4o",
    threshold=0.8
)

# 🧮 Step 4: Generate GEval evaluations from model outputs and annotations

In [18]:
# Part 2: build one test case per spreadsheet row and metric, then run both
# evaluations.
from deepeval import evaluate
from deepeval.test_case import LLMTestCase
import pandas as pd


def _normalize_image_id(raw_id):
    """Return ``raw_id`` as a string left-padded with zeros to three characters.

    Integer-valued floats such as ``3.0`` are converted to ``int`` first so
    they map to ``"003"`` instead of ``"3.0"``; pandas yields such floats when
    an integer column is upcast (for example because it contains blank cells).
    """
    if isinstance(raw_id, float) and raw_id.is_integer():
        raw_id = int(raw_id)
    return str(raw_id).zfill(3)


# Load the model outputs; the loop below reads the `image_id` and
# `model_output` columns.
model_outputs_df = pd.read_excel(EVAL_INPUT_XLSX)

# Index the annotations by normalised id once instead of rescanning the whole
# list for every row. `setdefault` keeps the first annotation for an id, which
# matches the previous "stop at the first match" lookup.
annotations_by_id = {}
for annotation in annotations:
    annotations_by_id.setdefault(
        _normalize_image_id(annotation.get('image_id', '')), annotation
    )

# Test cases for both metrics
reference_test_cases = []
must_include_test_cases = []

for _, row in model_outputs_df.iterrows():
    image_id = _normalize_image_id(row['image_id'])
    model_output = row['model_output']

    # Look up the reference description and mandatory elements for this image
    annotation = annotations_by_id.get(image_id)
    if annotation is None:
        reference = None
        must_include_elements = []
    else:
        reference = annotation.get('reference_description', None)
        must_include_elements = annotation.get('must_include', [])

    if reference is None:
        print(f"Warning: No reference found for image_id {image_id}")
        continue

    # A blank spreadsheet cell is read as NaN, which is not a description that
    # can be scored. Skip it loudly; skipped rows are not part of the averages
    # and pass rate reported in Step 5.
    if pd.isna(model_output):
        print(f"Warning: No model output found for image_id {image_id}")
        continue

    # Test case for reference coverage
    reference_test_cases.append(
        LLMTestCase(
            input="",
            actual_output=model_output,
            expected_output=reference,
            additional_metadata={"image_id": image_id, "metric": "reference_coverage"}
        )
    )

    # Test case for must-include coverage (only if must_include elements exist)
    if must_include_elements:
        # The mandatory elements travel in the `input` field, which is what the
        # must-include metric is configured to read.
        must_include_input = f"Verplichte elementen die aanwezig moeten zijn: {must_include_elements}"

        must_include_test_cases.append(
            LLMTestCase(
                input=must_include_input,
                actual_output=model_output,
                expected_output="",  # Not used for this metric
                additional_metadata={"image_id": image_id, "metric": "must_include_coverage", "must_include": must_include_elements}
            )
        )

print(f"Created {len(reference_test_cases)} reference coverage test cases")
print(f"Created {len(must_include_test_cases)} must-include coverage test cases")

print("Running reference coverage evaluation...")
reference_results = evaluate(test_cases=reference_test_cases, metrics=[reference_coverage_eval])

print("Running must-include coverage evaluation...")
must_include_results = evaluate(test_cases=must_include_test_cases, metrics=[must_include_coverage_eval])


Created 5 reference coverage test cases
Created 5 must-include coverage test cases
Running reference coverage evaluation...


Evaluating 5 test case(s) in parallel: |██████    | 60% (3/5) [Time Taken: 00:03,  1.25test case/s]

False !!!!!!!!!!!!
False !!!!!!!!!!!!
False !!!!!!!!!!!!


Evaluating 5 test case(s) in parallel: |████████  | 80% (4/5) [Time Taken: 00:03,  1.65test case/s]

False !!!!!!!!!!!!


Evaluating 5 test case(s) in parallel: |██████████|100% (5/5) [Time Taken: 00:03,  1.26test case/s]

False !!!!!!!!!!!!


Metrics Summary

  - ❌ Referentie-dekking (SEE-Benchmark) (GEval) (score: 0.02, threshold: 0.7, strict: False, evaluation model: gpt-4o, reason: The description is mostly in Dutch but includes English terms not in the reference, like 'Assignment 1: Portfolio'. Important elements like authors and specific word markings are missing. The output mentions a status bar, which is not in the reference., error: None)

For test case:

  - input: 
  - actual output: De schermafbeelding toont een Microsoft Word-document op een computer.

Het document is getiteld "Assignment 1: Portfolio" en bevat een inleiding over een patiënt met Colitis Ulcerosa. De tekst is geschreven in het Engels en bevat markeringen en annotaties.

De belangrijkste elementen in het scherm zijn:
* De menubalk bovenaan met opties zoals "Bestand", "Start", "Invoegen", enz.
* De werkbalk onder de menubalk met knoppen voor tekstbewerking, zoals "Calibri", "Lettertype", "Tekst uitlijnen", enz.
* Het document z




Running must-include coverage evaluation...


Evaluating 5 test case(s) in parallel: |██        | 20% (1/5) [Time Taken: 00:02,  2.21s/test case]

False !!!!!!!!!!!!
False !!!!!!!!!!!!
False !!!!!!!!!!!!


Evaluating 5 test case(s) in parallel: |██████████|100% (5/5) [Time Taken: 00:02,  1.98test case/s]

False !!!!!!!!!!!!
False !!!!!!!!!!!!


Metrics Summary

  - ✅ Verplichte Elementen Dekking (GEval) (score: 0.9599051067671297, threshold: 0.8, strict: False, evaluation model: gpt-4o, reason: All required elements are present in the output, including the navigation elements, four information tiles with specific details, and the section with important themes and links., error: None)

For test case:

  - input: Verplichte elementen die aanwezig moeten zijn: ['Navigatiebalk (Home, Menu, Zoekbalk, Inloggen)', 'Vier informatietegels', "Informatietegel: 'Aangifte inkomstenbelasting doen' met deadline en inlogknop", "Informatietegel: 'Aangifte 2024 gedaan - wanneer hoor ik iets?' met link", "Informatietegel: 'Arbeidsrelaties: zzp - ja of nee?' met link", "Informatietegel: 'Uitbetalen voorlopige aanslag' met datum en link", "Sectie met belangrijke thema's: 'Aangifte doen', 'Toeslagen', 'Betalen en ontvangen'", "Links naar onderwerpen zoals 'Voorlopige aanslag', 'Erfbelasting', 'Schenkbelastin




# 📊 Step 5: Aggregate evaluation results and export scored overview

In [19]:
# Collect the per-image scores of both evaluations, merge them into one table,
# print a summary and export the table as CSV.


def _collect_scores(evaluation_result, prefix):
    """Flatten one evaluation result into a list of per-test-case dicts.

    Args:
        evaluation_result: Object returned by ``evaluate``; its
            ``test_results`` are iterated.
        prefix: Column prefix, giving the keys ``<prefix>_score``,
            ``<prefix>_pass`` and ``<prefix>_reason``.

    Returns:
        One dict per test result, holding ``image_id`` plus the three
        prefixed keys.
    """
    rows = []
    for test_result in evaluation_result.test_results:
        # Each evaluation ran with exactly one metric, so index 0 is that metric.
        metric_data = test_result.metrics_data[0]
        # Guard against missing metadata so the lookup falls back to 'unknown'.
        metadata = test_result.additional_metadata or {}
        rows.append({
            'image_id': metadata.get('image_id', 'unknown'),
            f'{prefix}_score': metric_data.score,
            f'{prefix}_pass': metric_data.success,
            f'{prefix}_reason': metric_data.reason
        })
    return rows


reference_scores = _collect_scores(reference_results, 'reference')
must_include_scores = _collect_scores(must_include_results, 'must_include')

# Create DataFrames with explicit columns: an empty score list would otherwise
# give a frame without an `image_id` column and make the merge below fail.
ref_df = pd.DataFrame(
    reference_scores,
    columns=['image_id', 'reference_score', 'reference_pass', 'reference_reason']
)
must_df = pd.DataFrame(
    must_include_scores,
    columns=['image_id', 'must_include_score', 'must_include_pass', 'must_include_reason']
)

# Merge the results; images without a must-include test case keep their
# reference row and get 'N/A' placeholders.
scores_df = ref_df.merge(must_df, on='image_id', how='left')
scores_df['must_include_score'] = scores_df['must_include_score'].fillna('N/A')
scores_df['must_include_pass'] = scores_df['must_include_pass'].fillna('N/A')

# Overall pass: the reference metric must pass, and the must-include metric
# too whenever it was evaluated for that image.
scores_df['overall_pass'] = [
    bool(reference_pass) if must_include_pass == 'N/A'
    else bool(reference_pass and must_include_pass)
    for reference_pass, must_include_pass in zip(
        scores_df['reference_pass'], scores_df['must_include_pass']
    )
]

# Display individual scores
print("="*80)
print("INDIVIDUAL TEST SCORES")
print("="*80)
print(scores_df[['image_id', 'reference_score', 'reference_pass', 'must_include_score', 'must_include_pass', 'overall_pass']].to_string(index=False))

# Display summary
print(f"\n{'='*80}")
print("OVERALL SUMMARY")
print("="*80)
total_tests = len(scores_df)
passed_tests = scores_df['overall_pass'].sum()
# Avoid dividing by zero when no test case was scored.
overall_pass_rate = (passed_tests / total_tests) * 100 if total_tests else 0.0

print(f"Overall Pass Rate: {overall_pass_rate:.2f}%")
print(f"Passed Tests: {passed_tests}/{total_tests}")
print(f"Average Reference Score: {scores_df['reference_score'].mean():.4f}")

# The column mixes floats with 'N/A' placeholders; coerce to numeric and drop
# the placeholders before averaging.
must_include_numeric = pd.to_numeric(scores_df['must_include_score'], errors='coerce').dropna()
if len(must_include_numeric) > 0:
    print(f"Average Must-Include Score: {must_include_numeric.mean():.4f}")

# Save results to results/
from pathlib import Path
results_dir = "results"
os.makedirs(results_dir, exist_ok=True)

# Extract model and prompt version from the input filename, which is expected
# to look like "<model>__<prompt_version>.xlsx".
parts = Path(EVAL_INPUT_XLSX).stem.split('__')
model_name_raw = parts[0]
prompt_version = parts[1] if len(parts) > 1 else "unknown"
timestamp = datetime.now().strftime("%Y%m%d_%H%M")

file_basename = f"{model_name_raw}__{prompt_version}__{timestamp}"
eval_path = os.path.join(results_dir, f"{file_basename}__eval.csv")
scores_df.to_csv(eval_path, index=False)

print(f"\n✅ Detailed results saved to {eval_path}")


INDIVIDUAL TEST SCORES
image_id  reference_score  reference_pass  must_include_score  must_include_pass  overall_pass
     003         0.020000           False            0.275077              False         False
     002         0.788565            True            0.959905               True          True
     005         0.635280           False            0.779498              False         False
     004         0.040000           False            0.764173              False         False
     001         0.750391            True            0.857039               True          True

OVERALL SUMMARY
Overall Pass Rate: 40.00%
Passed Tests: 2/5
Average Reference Score: 0.4468
Average Must-Include Score: 0.7271

✅ Detailed results saved to results\example_descriptions__unknown__20250629_1127__eval.csv
