# Visual-TableQA Evaluation Notebook
----
Evaluating models on a synthetic dataset of table images for QA reasoning and recognition tasks

----
authors: Marc Haraoui, Aser Lompo

date: 07/06/2025

### Imports

In [None]:
import json
from typing import Dict, Any
from groq import Groq
from openai import OpenAI
import requests
import google.generativeai as genai
from evaluate import load
from io import BytesIO
import base64
from datasets import load_dataset
import time
import random
from utils import extract_json_blocks, read_Exception


  from .autonotebook import tqdm as notebook_tqdm


### Constants

In [None]:
# Credentials for each model host ("..." are placeholders to fill in)
groq_key = "..."
google_key = "..."
openai_key = "..."
openrouter_key = "..."

# Host name -> key lookup used when the API clients are created
api_keys = dict(
    groq=groq_key,
    google=google_key,
    openai=openai_key,
    openrouter=openrouter_key,
)


## Visual-TableQA Evaluation Class

In [None]:
class TableQA_Evaluation:
    """
    Visual-TableQA Evaluation with VLMs-as-Juries System:
    VLM jury system for evaluating model responses against ground truth answers.
    Multiple VLMs act as juries giving binary scores (correct/incorrect) with majority voting.

    Each jury is prompted with ``prompt_template`` and is expected to reply
    with a JSON object ``{"verdict": 1}`` (correct) or ``{"verdict": 0}``
    (incorrect).
    """

    # Jury whose verdict settles a tie between the successful juries.
    TIE_BREAKER_MODEL = "gemini-2.5-pro"
    # Attempts granted to each jury before it is reported as failed.
    MAX_JURY_ATTEMPTS = 3

    def __init__(self, api_keys: Dict[str, str]):
        """
        Create the API clients and the jury configuration.

        Args:
            api_keys: mapping with the keys 'groq', 'google', 'openai' and
                'openrouter', each holding the credential for that host.
        """
        self.groq_client = Groq(api_key=api_keys['groq'])
        genai.configure(api_key=api_keys['google'])
        self.openai_client = OpenAI(api_key=api_keys['openai'])
        self.openrouter_key = api_keys['openrouter']

        # One entry per jury: hosting API, sampling parameters, and how JSON
        # output is requested from that host.
        self.models_spec = {
            "qwen/qwen3-32b": {"api": "groq", "temperature": 0.6, "top_p": 0.95, "json_format": True},
            "deepseek-r1-distill-llama-70b": {"api": "groq", "temperature": 0.6, "top_p": 0.95, "json_format": False},
            "gemini-2.5-pro": {"api": "google", "temperature": 0.3, "top_p": 1.0, "json_format": True},
            "gpt-4.1": {"api": "openai", "temperature": 0.1, "top_p": 0.9, "json_format": "json_object"},
            "deepseek/deepseek-prover-v2": {"api": "openrouter", "temperature": 0.6, "top_p": 0.95, "json_format": True}
        }

        # Literal braces are doubled so that str.format leaves them in place.
        self.prompt_template = ("Evaluate if the answer matches the ground truth. "
                                "To do so, read the question and determine whether the provided answer conveys the same meaning as the ground truth."
                                "Output a JSON response as follows:\n\n"
                                "{{\"verdict\": 1}} for correct or {{\"verdict\": 0}} for incorrect.\n\n"
                                "Question: {question}\n"
                                "Ground Truth: {ground_truth}\n"
                                "Answer: {prediction}\n\n"
                                "Response:")

    @staticmethod
    def _require_stop(finish_reason):
        """Print the reason and raise unless the completion finished with "stop"."""
        if finish_reason != "stop":
            print(finish_reason)
            raise Exception(f"API call ended before task. Reason: {finish_reason}")

    @staticmethod
    def _parse_verdict(decision) -> int:
        """
        Extract a binary verdict from a jury reply.

        Args:
            decision: JSON text, or an already decoded dict.

        Returns:
            0 or 1.

        Raises:
            ValueError, TypeError, KeyError: when the reply is not a JSON
                object holding a 'verdict' equal to 0 or 1 (booleans and the
                strings "0"/"1" are accepted).
        """
        if isinstance(decision, (str, bytes, bytearray)):
            decision = json.loads(decision)
        if not isinstance(decision, dict):
            raise ValueError(f"expected a JSON object, got {type(decision).__name__}")

        raw = decision["verdict"]
        if isinstance(raw, str):
            raw = raw.strip()
            if raw in ("0", "1"):
                return int(raw)
        elif raw in (0, 1):  # also matches True/False and 1.0/0.0
            return int(raw)
        raise ValueError(f"verdict must be 0 or 1, got {raw!r}")

    def call_llm(self, model_name, prompt, max_tokens) -> Any:
        """
        Send ``prompt`` to ``model_name`` through the host named in ``models_spec``.

        Args:
            model_name: key of ``self.models_spec``.
            prompt: text prompt sent as a single user message.
            max_tokens: completion token budget.
                NOTE(review): it is not forwarded to the OpenRouter request.

        Returns:
            The model's message content, or the result of
            ``extract_json_blocks`` applied to it, depending on the host and
            on the model's 'json_format' setting.

        Raises:
            Exception: when the completion did not finish with "stop", the
                prompt was blocked, the host returned no choices, or the
                host name is unknown.
        """
        spec = self.models_spec[model_name]
        json_format = spec['json_format']
        # Normalise once so that every comparison below agrees on the case.
        api_host = spec['api'].lower()
        temperature, top_p = spec['temperature'], spec['top_p']

        # Gemini takes the raw prompt; the chat-style hosts take a message list.
        messages = prompt if api_host == 'google' else [{"role": "user", "content": prompt}]

        if api_host == "groq":
            completion = self.groq_client.chat.completions.create(
                model=model_name,
                messages=messages,
                temperature=temperature,
                top_p=top_p,
                max_completion_tokens=max_tokens,
                stream=False,
                reasoning_format='hidden' if model_name in ["deepseek-r1-distill-llama-70b"] else None,
                response_format={"type": "json_object"} if json_format else None,
            )
            response = completion.choices[0].message.content
            self._require_stop(completion.choices[0].finish_reason)
            return response if json_format else extract_json_blocks(response)

        elif api_host == "google":
            model = genai.GenerativeModel(model_name)
            generation_config = {
                "max_output_tokens": max_tokens,
                "temperature": temperature,
                "top_p": top_p,
                "response_mime_type": "application/json" if json_format else None,
            }
            response = model.generate_content(contents=messages, generation_config=generation_config)
            if (response.prompt_feedback is None or
                    response.prompt_feedback.block_reason.name == "BLOCK_REASON_UNSPECIFIED"):
                return response.text if json_format else extract_json_blocks(response.text)
            else:
                print(response.prompt_feedback.block_reason.name)
                raise Exception(f"API call ended before task. Reason: {response.prompt_feedback.block_reason.name}")

        elif api_host == "openai":
            completion = self.openai_client.chat.completions.create(
                model=model_name,
                messages=messages,
                temperature=temperature,
                top_p=top_p,
                max_completion_tokens=max_tokens,
                response_format={"type": json_format},
            )
            self._require_stop(completion.choices[0].finish_reason)
            return completion.choices[0].message.content

        elif api_host == "openrouter":
            if model_name == 'thudm/glm-z1-32b:free':
                args = {"model": model_name, "messages": messages}
            else:
                args = {"model": model_name, "messages": messages, "temperature": temperature, "top_p": top_p, "response_format": {"type": "json_object"}}

            http_response = requests.post(url="https://openrouter.ai/api/v1/chat/completions",
                                          headers={"Authorization": f"Bearer {self.openrouter_key}", "Content-Type": "application/json"},
                                          data=json.dumps(args))
            payload = http_response.json()
            choices = payload.get('choices') if isinstance(payload, dict) else None
            if not choices:
                # Surface the status code and body so the caller can see the
                # actual cause instead of a bare KeyError on 'choices'.
                raise Exception(f"OpenRouter returned no choices (HTTP {http_response.status_code}): {payload}")
            self._require_stop(choices[0]['finish_reason'])
            return extract_json_blocks(choices[0]['message']['content'])

        else:
            raise Exception("Please choose api cloud host from 'google', 'groq', 'openai' and 'openrouter'.")

    def safe_call_llm(self, model_name, prompt, max_tokens=5000):
        """
        Call ``call_llm`` after a random 1-3 second pause, without raising.

        Returns:
            ``(response, None)`` on success, or ``(None, message)`` where
            ``message`` is the lower-cased text produced by ``read_Exception``.
        """
        time.sleep(random.uniform(1, 3))
        try:
            return self.call_llm(model_name, prompt, max_tokens), None
        except Exception as e:
            # Handle API calls limits
            msg = read_Exception(e).lower()
            if any(keyword in msg for keyword in ["quota", "billing", "insufficient", "resource exhausted", "429", "402"]):
                print(model_name, "has reached its limit")
            elif 'ended' in msg:
                print(model_name, "needs more tokens to think")
            return None, msg

    def evaluate_single_jury(self, model_name: str, question: str, prediction: str, ground_truth: str) -> Dict[str, Any]:
        """
        Ask one jury whether ``prediction`` matches ``ground_truth``.

        A failed API call and an unparsable reply both count as a failed
        attempt; up to ``MAX_JURY_ATTEMPTS`` attempts are made.

        Returns:
            ``{"jury_model", "verdict", "success": True}`` with a verdict of
            0 or 1, or ``{"jury_model", "verdict": 0, "success": False,
            "error"}`` when every attempt failed.
        """
        prompt = self.prompt_template.format(question=question, prediction=prediction, ground_truth=ground_truth)
        call_status = None
        for _ in range(self.MAX_JURY_ATTEMPTS):
            decision, call_status = self.safe_call_llm(model_name=model_name, prompt=prompt)
            if call_status is not None:
                continue
            try:
                verdict = self._parse_verdict(decision)
            except (ValueError, TypeError, KeyError) as e:
                # A malformed reply must not abort the other juries: retry.
                call_status = f"unparsable jury response: {e!r}"
                continue
            return {"jury_model": model_name, "verdict": verdict, "success": True}
        # Exceeded retries, skip this jury
        return {"jury_model": model_name, "verdict": 0, "success": False, "error": call_status}

    def compute_llm_juries(self, question: str, pred: str, ref: str) -> Dict[str, Any]:
        """
        Collect every jury's verdict and reduce them by majority vote.

        Only successful juries vote. On a tie, the verdict of
        ``TIE_BREAKER_MODEL`` is used when that jury succeeded; otherwise the
        answer is conservatively scored 0.

        Returns:
            dict with 'evaluations' (per-jury results), 'final_verdict'
            (1, 0, or None when no jury succeeded), 'confidence' (share of
            successful juries agreeing with the final verdict),
            'successful_juries' and 'total_juries'.
        """
        evaluations = {model: self.evaluate_single_jury(model, question, pred, ref)
                       for model in self.models_spec}

        # Keep only successful juries
        successful = [result for result in evaluations.values() if result.get("success")]
        verdicts = [result["verdict"] for result in successful]
        ones, zeros = verdicts.count(1), verdicts.count(0)

        if not verdicts:
            majority = None
            print("no verdict")
        elif ones > zeros:
            majority = 1
        elif ones < zeros:
            majority = 0
        else:
            tie_breaker = evaluations.get(self.TIE_BREAKER_MODEL)
            if tie_breaker is not None and tie_breaker.get("success"):
                majority = tie_breaker["verdict"]
            else:
                # A failed jury only carries a placeholder verdict, so it
                # must not be read as an actual opinion.
                majority = 0

        confidence = 0.0 if not verdicts else verdicts.count(majority) / len(verdicts)

        return {
            "evaluations": evaluations,
            "final_verdict": majority,
            "confidence": confidence,
            "successful_juries": len(successful),
            "total_juries": len(evaluations)
        }


## Functions used

In [None]:
def convert_image_to_bytes(image, format="JPEG"):
    """
    Serialize an image into an in-memory buffer.

    Args:
        image: object exposing ``mode``, ``convert(mode)`` and
            ``save(buffer, format=...)`` (presumably a PIL image - confirm
            against the dataset's 'image' feature).
        format: target file format passed to ``image.save``.

    Returns:
        ``(buffer, data)``: the ``BytesIO`` written to and its full content.
    """
    # Modes the JPEG encoder can write directly; anything else (RGBA, LA, P,
    # ...) has to be converted before saving.
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
    saving_jpeg = str(format).upper() == "JPEG"
    # RGBA is converted for every format, as before.
    if image.mode == "RGBA" or (saving_jpeg and image.mode not in jpeg_modes):
        image = image.convert("RGB")
    buffer = BytesIO()
    image.save(buffer, format=format)
    return buffer, buffer.getvalue()


def encode_image_to_b64(image):
    """Return the bytes produced by ``convert_image_to_bytes(image)`` as a base64 string."""
    _, raw_bytes = convert_image_to_bytes(image)
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')


# Llama 4 models (the "groq" branch of generate_content)
def llama4_vision(query, image, model_name, client):
    """
    Ask a chat-completions vision model a question about an image.

    Args:
        query: question text.
        image: image handed to ``encode_image_to_b64``.
        model_name: model identifier passed to the client.
        client: object exposing ``chat.completions.create``.

    Returns:
        The content of the first choice's message.

    Raises:
        Exception: when the first choice did not finish with "stop".
    """
    data_url = f"data:image/jpeg;base64,{encode_image_to_b64(image)}"
    text_part = {"type": "text", "text": query}
    image_part = {"type": "image_url", "image_url": {"url": data_url}}

    completion = client.chat.completions.create(
        model=model_name,
        messages=[{"role": "user", "content": [text_part, image_part]}],
        temperature=0.6,
        max_completion_tokens=2000,
        top_p=0.95,
        stream=False,
    )

    first_choice = completion.choices[0]
    answer = first_choice.message.content
    if first_choice.finish_reason != "stop":
        print(first_choice.finish_reason)
        raise Exception(f"API call ended before task. Reason: {first_choice.finish_reason}")
    return answer


# Gemini models (the "google" branch of generate_content)
def gemini_vision(query, image, model_name, client):
    """
    Ask a Gemini model a question about an image.

    Args:
        query: question text.
        image: image handed to ``convert_image_to_bytes``.
        model_name: unused here; the model is already bound to ``client``.
        client: object exposing ``generate_content``.

    Returns:
        ``response.text``.

    Raises:
        Exception: when the prompt feedback reports a block reason other
            than "BLOCK_REASON_UNSPECIFIED".
    """
    image_bytes = convert_image_to_bytes(image)[1]
    response = client.generate_content(
        contents=[{"mime_type": "image/jpeg", "data": image_bytes}, query],
        generation_config={"max_output_tokens": 2000, "temperature": 0.3},
    )

    feedback = response.prompt_feedback
    if feedback is not None and feedback.block_reason.name != "BLOCK_REASON_UNSPECIFIED":
        print(feedback.block_reason.name)
        raise Exception(f"API call ended before task. Reason: {feedback.block_reason.name}")
    return response.text


# OpenAI models (the "openai" branch of generate_content)
def openai_vision(query, image, model_name, client):
    """
    Ask a chat-completions vision model a question about an image.

    Args:
        query: question text.
        image: image handed to ``encode_image_to_b64``.
        model_name: model identifier passed to the client.
        client: object exposing ``chat.completions.create``.

    Returns:
        The content of the first choice's message.

    Raises:
        Exception: when the first choice did not finish with "stop".
    """
    data_url = f"data:image/jpeg;base64,{encode_image_to_b64(image)}"
    user_content = [
        {"type": "text", "text": query},
        {"type": "image_url", "image_url": {"url": data_url}},
    ]

    completion = client.chat.completions.create(
        model=model_name,
        messages=[{"role": "user", "content": user_content}],
        temperature=0.2,
        max_completion_tokens=2000,
    )

    first_choice = completion.choices[0]
    if first_choice.finish_reason != "stop":
        print(first_choice.finish_reason)
        raise Exception(f"API call ended before task. Reason: {first_choice.finish_reason}")
    return first_choice.message.content

# OpenRouter models (the "openrouter" branch of generate_content)
def openrouter_vision(query, image, model_name, client):
    """
    Ask a chat-completions vision model a question about an image.

    Args:
        query: question text.
        image: image handed to ``encode_image_to_b64``.
        model_name: model identifier passed to the client.
        client: object exposing ``chat.completions.create``.

    Returns:
        The content of the first choice's message.

    Raises:
        Exception: when the first choice did not finish with "stop".
    """
    data_url = f"data:image/jpeg;base64,{encode_image_to_b64(image)}"
    user_content = [
        {"type": "text", "text": query},
        {"type": "image_url", "image_url": {"url": data_url}},
    ]

    completion = client.chat.completions.create(
        model=model_name,
        messages=[{"role": "user", "content": user_content}],
        temperature=0.4,
        max_completion_tokens=2000,
    )

    first_choice = completion.choices[0]
    if first_choice.finish_reason != "stop":
        print(first_choice.finish_reason)
        raise Exception(f"API call ended before task. Reason: {first_choice.finish_reason}")
    return first_choice.message.content


def generate_content(query, image, model_name, client, api):
    """
    Route a vision query to the function matching ``api`` without raising.

    Args:
        query: question text.
        image: image forwarded to the backend function.
        model_name: model identifier forwarded to the backend function.
        client: API client forwarded to the backend function.
        api: one of "groq", "google", "openai", "openrouter".

    Returns:
        ``(response, None)`` on success, or ``(None, message)`` on any
        exception, where ``message`` is the lower-cased text produced by
        ``read_Exception``.
    """
    try:
        backends = {
            "groq": llama4_vision,
            "google": gemini_vision,
            "openai": openai_vision,
            "openrouter": openrouter_vision,
        }
        backend = backends.get(api)
        if backend is None:
            raise Exception("API client not found")
        return backend(query, image, model_name, client), None
    except Exception as e:
        # Handle API calls limits
        msg = read_Exception(e).lower()
        limit_keywords = ["quota", "billing", "insufficient", "resource exhausted", "429", "402"]
        if any(keyword in msg for keyword in limit_keywords):
            print(model_name, "has reached its limit")
        elif 'ended' in msg:
            print(model_name, "needs more tokens to think")
        else:
            print(msg)
        return None, msg

## Model Evaluation using a dataset.

You can replace the Visual-TableQA dataset with any dataset you need.

#### Set up available models and dataset

In [None]:
# Load the test split of the Visual-TableQA dataset
dataset = load_dataset('AI-4-Everyone/Visual-TableQA', split='test')

# Show the dataset summary (features and number of rows)
print(dataset)

Generating train split: 100%|██████████| 3604/3604 [00:03<00:00, 1036.80 examples/s]
Generating validation split: 100%|██████████| 777/777 [00:00<00:00, 1277.15 examples/s]
Generating test split: 100%|██████████| 784/784 [00:00<00:00, 1225.22 examples/s]

Dataset({
    features: ['image', 'question', 'answer'],
    num_rows: 784
})





In [None]:
# Model name -> API host through which the model is queried
models_config = {
    # Groq
    "meta-llama/llama-4-scout-17b-16e-instruct": "groq",
    "meta-llama/llama-4-maverick-17b-128e-instruct": "groq",

    # Google
    "gemini-2.5-flash": "google",
    "gemini-2.5-pro": "google",

    # OpenAI and Anthropic models, queried through OpenRouter
    "openai/gpt-4o": "openrouter",
    "openai/gpt-4o-mini": "openrouter",
    "anthropic/claude-3.5-sonnet": "openrouter",

    # Other OpenRouter models
    "opengvlab/internvl3-14b:free": "openrouter",
    "qwen/qwen2.5-vl-32b-instruct:free": "openrouter",
    "mistralai/mistral-small-3.1-24b-instruct:free": "openrouter",
    "google/gemma-3-27b-it:free": "openrouter",
}

#### Function to select provider

In [None]:
def api_provider(model_name):
    """
    Build the API client for ``model_name`` from its ``models_config`` host.

    Returns:
        A ``Groq`` client, a ``genai.GenerativeModel``, or an ``OpenAI``
        client (pointed at the OpenRouter base URL for "openrouter").

    Raises:
        KeyError: when ``model_name`` is not a key of ``models_config``.
        Exception: when the configured host is none of the four known ones.
    """
    host = models_config[model_name]

    if host == "groq":
        return Groq(api_key=api_keys['groq'])

    if host == "google":
        genai.configure(api_key=api_keys['google'])
        return genai.GenerativeModel(model_name)

    if host == "openai":
        return OpenAI(api_key=api_keys['openai'])

    if host == "openrouter":
        return OpenAI(
            api_key=api_keys['openrouter'],
            base_url="https://openrouter.ai/api/v1",
        )

    raise Exception(rf"{model_name} is unknown.")

In [None]:
# Instantiate the jury-based evaluator; its constructor builds the Groq,
# Google and OpenAI clients from the api_keys mapping.
evaluator = TableQA_Evaluation(api_keys)

In [None]:
# Model under evaluation: leave exactly one assignment uncommented.
# The name must be a key of models_config, which decides the API host.
#model_name = "meta-llama/llama-4-scout-17b-16e-instruct"
#model_name = "meta-llama/llama-4-maverick-17b-128e-instruct"
#model_name = "gemini-2.5-flash"
#model_name = "gemini-2.5-pro"
model_name = "openai/gpt-4o"
#model_name = "openai/gpt-4o-mini"
#model_name = "opengvlab/internvl3-14b:free"
#model_name = "qwen/qwen2.5-vl-32b-instruct:free"
#model_name = "mistralai/mistral-small-3.1-24b-instruct:free"
#model_name = "google/gemma-3-27b-it:free"

In [None]:
# Metrics record: reuse the saved results file when it exists, otherwise
# start from an empty record. This replaces the manual comment toggle and
# no longer fails on a first run, when the file has not been written yet.
# NOTE(review): the file name is fixed to the gpt-4o run; adjust it when
# evaluating another model.
try:
    with open("gpt-4o-results.json", "r") as f:
        results = json.load(f)
except FileNotFoundError:
    results = {
        "correctness": [],  # one score per completed batch
        "skipped": [],      # indices of the batches that were skipped
    }

In [None]:
# Client
client = api_provider(model_name)
api = models_config[model_name]

# Batch size and number of batches
# NOTE(review): rows beyond the last full batch are not evaluated.
b_size = 4
num_b  = len(dataset)//b_size

# Start evaluation
# NOTE(review): the loop always starts at batch 0, so running it on a
# non-empty `results` record appends to the scores already stored there.
for batch in range(num_b):
    skip_batch = False
    batch_verdicts = []
    for i in range(batch*b_size, (batch+1)*b_size):
        # Retrieve data from row (single lookup instead of one per field)
        row = dataset[i]
        image = row['image']
        question = row['question']
        answer = row['answer']

        # Predict response
        pred, verdict = None, None
        for attempt in range(3):
            try:
                time.sleep(random.uniform(1, 3)) # Sleep for a few seconds to not exceed max requests per min
                pred, _ = generate_content(question, image, model_name, client, api)
            except Exception as e:
                if attempt == 2:
                    print(f"Issue in batch {batch}, row {i}: {e}")
                continue
            # generate_content reports a failure by returning (None, msg)
            # rather than raising, so only stop retrying on a real answer.
            if pred is not None:
                break
        if pred is None:
            # every attempt failed: skip the whole batch
            print(f"Skipping entire batch {batch} due to failure at row {i}.")
            skip_batch = True
            break  # break the iteration loop

        try:
            verdict = evaluator.compute_llm_juries(question, pred, answer)['final_verdict']
        except Exception as e:
            print(e.__class__.__name__, " An Error occured while computing juries verdict")
        if verdict is None:
            # no jury verdict available: skip the whole batch
            print(f"Skipping entire batch {batch} due to failure at row {i}.")
            skip_batch = True
            break  # break the iteration loop

        batch_verdicts.append(verdict)

    if skip_batch:
        results['skipped'].append(batch)
    else:
        # Evaluate model with metrics
        batch_correctness = sum(batch_verdicts) / b_size
        results['correctness'].append(batch_correctness)

        print(f"------Batch {batch} ----------")
        print(f"Correctness {results['correctness'][-1]}")

    # Persist after every batch, skipped ones included, so the record on
    # disk does not lose skips when no later batch succeeds.
    with open("gpt-4o-results.json", "w") as f:
        json.dump(results, f, indent=2)

#### Final results

In [None]:
# Final results

def mean_correctness(batch_scores):
    """Return the mean of the per-batch scores, or None when there are none."""
    if not batch_scores:
        return None
    return sum(batch_scores) / len(batch_scores)


for record in ["gemini-2.5-flash-results.json", "gemini-2.5-pro-results.json", "gpt-4o-results.json",
               "gpt-4o-mini-results.json", "llama-4-16e-results.json", "llama-4-128e-results.json",
               "mistral-small-3.1-24b-results.json", "qwen2.5-vl-32b-results.json"]:
    # Separate names keep the `results` and `num_b` globals of the evaluation
    # loop intact when this cell is run.
    try:
        with open(record, "r") as f:
            record_results = json.load(f)
    except FileNotFoundError:
        # A model that has not been evaluated yet must not abort the summary.
        print(f'{record[:-5]} \n      * no results file found.')
        continue

    correctness = mean_correctness(record_results["correctness"])
    if correctness is None:
        # Avoid dividing by zero when no batch was completed.
        print(f'{record[:-5]} \n      * no completed batch.')
        continue

    # Print results
    print(f'{record[:-5]} \n      * correctness: {correctness*100:.2f}%.')