# TableQA Evaluation Notebook
----
Evaluating models on a synthetic dataset of table images for question-answering (reasoning and recognition) tasks

----
authors: Marc Haraoui, Aser Lompo

date: 07/06/2025

## Setup

### Imports

In [1]:
import json
from typing import Dict, Any
from groq import Groq
from openai import OpenAI
import requests
import google.generativeai as genai
from io import BytesIO
import base64
from datasets import load_dataset
import time
import random
from utils import extract_json_blocks, read_Exception
import os
from pprint import pprint

  from .autonotebook import tqdm as notebook_tqdm


### Constants

In [2]:
# API keys for model hosts.
# Each key is read from an environment variable when one is set, so real
# credentials do not have to be written into (and committed with) the notebook.
# The "..." placeholders are kept as fallbacks for manual editing.
groq_key = os.environ.get("GROQ_API_KEY", "...")
google_key = os.environ.get("GOOGLE_API_KEY", "...")
openai_key = os.environ.get("OPENAI_API_KEY", "...")
openrouter_key = os.environ.get("OPENROUTER_API_KEY", "...")

# Mapping consumed by TableQA_Evaluation.__init__ (keyed by API host name).
api_keys = {'groq': groq_key, 'google': google_key, 'openai': openai_key, 'openrouter': openrouter_key}


## TableQA Evaluation Class

In [13]:
from typing import Dict, Any
from utils import extract_json_blocks, read_Exception
class TableQA_Evaluation:
    """
    TableQA Evaluation with VLMs-as-Juries System:
    - VLM jury system for evaluating model responses against ground truth answers.
      Multiple VLMs act as juries giving binary scores (correct/incorrect) with majority voting.

    Every model listed in ``models_spec`` is prompted with the question, the
    candidate answer and the reference answer, and is expected to reply with a
    JSON object of the form ``{"verdict": [score, criterion_index]}``.
    """

    # Jury whose verdict decides a tie between the successful juries.
    TIE_BREAKER_MODEL = "gpt-4.1"

    def __init__(self, api_keys: Dict[str, str]):
        """Create the API clients and register the jury models.

        Args:
            api_keys: mapping with the keys 'groq', 'google', 'openai' and
                'openrouter', each holding the credential for that host.
        """
        self.groq_client = Groq(api_key=api_keys['groq'])
        genai.configure(api_key=api_keys['google'])
        self.openai_client = OpenAI(api_key=api_keys['openai'])
        self.openrouter_key = api_keys['openrouter']

        # Per-jury settings: API host, sampling parameters and whether (or how)
        # the host is asked to enforce JSON output.
        self.models_spec = {
            "gemini-2.5-pro": {"api": "google", "temperature": 0.1, "top_p": 1.0, "json_format": True},
            "gpt-4.1": {"api": "openai", "temperature": 0.1, "top_p": 0.9, "json_format": "json_object"},
            "mistralai/mistral-large-2411": {"api": "openrouter", "temperature": 0.0, "top_p": 0.95, "json_format": False},
            "deepseek/deepseek-chat-v3.1": {"api": "openrouter", "temperature": 0.1, "top_p": 0.95, "json_format": False},
            "deepcogito/cogito-v2-preview-deepseek-671b": {"api": "openrouter", "temperature": 0.1, "top_p": 0.95, "json_format": False}
        }

        # NOTE(review): the adjacent literals below are concatenated without
        # separators between sentences. The text is deliberately left untouched
        # because changing the prompt would change the jury verdicts.
        self.prompt_template = ("You are an expert evaluator of question-answer pairs. You will be given a question a model's answer and a ground truth answer (reference)."
                                "Evaluate the answer based on these criteria: "
                                "1) Is the model's answer logically consistent?"
                                "2) Does the model's answer convey the same meaning as the ground truth?"
                                "If the two criteria are true, mark the pair as correct. If one of the criteria is not met, mark it as incorrect."
                                "Think step by step and conclude with your verdict and the index of the criterium not met (if none, index is 0) as follows:"
                                "{JSON_mention}"
                                "{{\"verdict\": [0, index_of_the_criterium_not_met]}} for incorrect or {{\"verdict\": [1, 0]}} for correct.\n\n"
                                "Question: {question}\n"
                                "Answer: {answer}\n"
                                "Ground Truth: {ground_truth}\n\n"
                                "Response:")

    def call_llm(self, model_name, prompt, max_tokens) -> Any:
        """Send ``prompt`` to ``model_name`` through the host configured in ``models_spec``.

        Args:
            model_name: key of ``self.models_spec``.
            prompt: full text prompt.
            max_tokens: completion budget forwarded to the Groq, Google and
                OpenAI hosts.

        Returns:
            The model reply: the raw message text when the host was asked to
            enforce JSON output, otherwise the result of ``extract_json_blocks``
            applied to the message text.

        Raises:
            Exception: when generation stops for a reason other than "stop",
                when the prompt is blocked, when OpenRouter returns no
                choices, or when the configured host is unknown.
        """
        spec = self.models_spec[model_name]
        json_format = spec['json_format']
        # Normalise once so the message shape and the branch below always agree.
        api_host = spec['api'].lower()
        temperature, top_p = spec['temperature'], spec['top_p']

        # The Google SDK receives the bare prompt; the other hosts use chat messages.
        messages = prompt if api_host == 'google' else [{"role": "user", "content": prompt}]

        if api_host == "groq":
            completion = self.groq_client.chat.completions.create(
                model=model_name, messages=messages, temperature=temperature, top_p=top_p,
                max_completion_tokens=max_tokens, stream=False,
                reasoning_format='hidden' if model_name in ["deepseek-r1-distill-llama-70b", "qwen/qwen3-32b", "openai/gpt-oss-120b"] else None,
                response_format={"type": "json_object"} if json_format else None,
                reasoning_effort='high' if model_name == "openai/gpt-oss-120b" else None,
            )
            choice = completion.choices[0]
            response = choice.message.content
            if choice.finish_reason == "stop":
                return response if json_format else extract_json_blocks(response)
            print(choice.finish_reason)
            raise Exception(f"API call ended before task. Reason: {choice.finish_reason}")

        elif api_host == "google":
            model = genai.GenerativeModel(model_name)
            generation_config = {
                "max_output_tokens": max_tokens,
                "temperature": temperature,
                "top_p": top_p,
                "response_mime_type": "application/json" if json_format else None,
            }
            response = model.generate_content(contents=messages, generation_config=generation_config)
            if (response.prompt_feedback is None or
                    response.prompt_feedback.block_reason.name == "BLOCK_REASON_UNSPECIFIED"):
                return response.text if json_format else extract_json_blocks(response.text)
            print(response.prompt_feedback.block_reason.name)
            raise Exception(f"API call ended before task. Reason: {response.prompt_feedback.block_reason.name}")

        elif api_host == "openai":
            completion = self.openai_client.chat.completions.create(
                model=model_name, messages=messages, temperature=temperature,
                top_p=top_p, max_completion_tokens=max_tokens, response_format={"type": json_format})
            choice = completion.choices[0]
            if choice.finish_reason == "stop":
                return choice.message.content
            print(choice.finish_reason)
            raise Exception(f"API call ended before task. Reason: {choice.finish_reason}")

        elif api_host == "openrouter":
            # NOTE(review): max_tokens is not forwarded to OpenRouter; confirm
            # whether that is intentional before adding it to the request.
            args = {"model": model_name, "messages": messages, "temperature": temperature, "top_p": top_p}
            if model_name == "deepseek/deepseek-chat-v3.1":
                args["reasoning"] = {"effort": "high", "exclude": False}
            if json_format:
                args["response_format"] = {"type": "json_object"}

            http_response = requests.post(url="https://openrouter.ai/api/v1/chat/completions",
                                          headers={"Authorization": f"Bearer {self.openrouter_key}", "Content-Type": "application/json"},
                                          data=json.dumps(args))
            payload = http_response.json()
            if not payload.get("choices"):
                # Surface the HTTP status and the API error body instead of a bare
                # KeyError, so safe_call_llm can recognise quota/billing failures.
                raise Exception(f"OpenRouter API error (HTTP {http_response.status_code}): {payload.get('error', payload)}")
            choice = payload['choices'][0]
            if choice['finish_reason'] == "stop":
                return extract_json_blocks(choice['message']['content'])
            print(choice['finish_reason'])
            raise Exception(f"API call ended before task. Reason: {choice['finish_reason']}")

        else:
            raise Exception("Please choose api cloud host from 'google', 'groq', 'openai' and 'openrouter'.")

    def safe_call_llm(self, model_name, prompt, max_tokens=10000):
        """Call ``call_llm`` and convert any failure into a return value.

        Returns:
            ``(result, None)`` on success, or ``(None, message)`` on failure,
            where ``message`` is the lower-cased text produced by
            ``read_Exception`` for the raised exception.
        """
        try:
            return self.call_llm(model_name, prompt, max_tokens), None
        except Exception as e:
            # Handle API calls limits
            msg = read_Exception(e).lower()
            if any(keyword in msg for keyword in ["quota", "billing", "insufficient", "resource exhausted", "429", "402"]):
                print(model_name, "has reached its limit")
            elif 'ended' in msg:
                print(model_name, "needs more tokens to think")
            return None, msg

    def evaluate_single_jury(self, model_name: str, question: str, answer: str, ground_truth: str) -> Dict[str, Any]:
        """Ask one jury model for a verdict, retrying up to three times.

        A failed API call and an unparsable reply both count as a failed
        attempt, so a single malformed reply no longer aborts the whole row.

        Returns:
            On success: ``{"jury_model", "verdict", "reason", "success": True}``.
            After three failed attempts: ``{"jury_model", "verdict": 0,
            "reason": None, "success": False, "error": <last failure>}``.
        """
        # Only gpt-4.1 gets the literal word "JSON" in its prompt; presumably
        # because it is the one jury called with response_format "json_object".
        JSON_mention = 'JSON\n' if model_name == 'gpt-4.1' else '\n'

        prompt = self.prompt_template.format(JSON_mention=JSON_mention, question=question, answer=answer, ground_truth=ground_truth)
        last_error = None
        for _ in range(3):
            decision, call_status = self.safe_call_llm(model_name=model_name, prompt=prompt)
            if call_status is not None:
                last_error = call_status
                continue
            try:
                # Replies normally arrive as JSON text; tolerate an already-parsed object.
                parsed = json.loads(decision) if isinstance(decision, (str, bytes, bytearray)) else decision
                verdict, reason = parsed['verdict'][0], parsed['verdict'][1]
            except (TypeError, ValueError, KeyError, IndexError) as e:
                last_error = f"unparsable jury response: {e!r}"
                continue
            return {"jury_model": model_name, "verdict": verdict, "reason": reason, "success": True}
        # Exceeded retries, skip this jury
        return {"jury_model": model_name, "verdict": 0, "reason": None, "success": False, "error": last_error}

    def compute_llm_juries(self, question: str, answer: str, ground_truth: str) -> Dict[str, Any]:
        """Collect every jury's verdict and aggregate them by majority vote.

        Only juries that answered successfully take part in the vote. A tie is
        decided by the ``TIE_BREAKER_MODEL`` jury when it answered successfully,
        and otherwise resolves to 0 (incorrect).

        Returns:
            Dict with ``evaluations`` (per-jury results), ``final_verdict``
            (1, 0, or None when no jury succeeded), ``confidence`` (share of
            successful juries agreeing with the final verdict),
            ``successful_juries`` and ``total_juries``.
        """
        evaluations = {model: self.evaluate_single_jury(model, question, answer, ground_truth)
                       for model in self.models_spec}

        # Keep only successful juries
        successful = [jury for jury in evaluations.values() if jury.get("success")]
        verdicts = [jury["verdict"] for jury in successful]
        positives, negatives = verdicts.count(1), verdicts.count(0)
        if not verdicts:
            majority = None
            print("no verdict")
        elif positives > negatives:
            majority = 1
        elif positives < negatives:
            majority = 0
        else:
            tie_breaker = evaluations.get(self.TIE_BREAKER_MODEL)
            if tie_breaker is not None and tie_breaker.get("success"):
                majority = tie_breaker["verdict"]
            else:
                # The tie-breaker is missing or failed: fall back to "incorrect".
                majority = 0

        confidence = 0.0 if not verdicts else verdicts.count(majority) / len(verdicts)

        return {
            "evaluations": evaluations,
            "final_verdict": majority,
            "confidence": confidence,
            "successful_juries": len(successful),
            "total_juries": len(evaluations)
        }


## Set up dataset

In [None]:
# Load the 'test' split of the Visual-TableQA dataset and print its summary.
DATASET_ID = 'AI-4-Everyone/Visual-TableQA'
dataset = load_dataset(DATASET_ID, split='test')
print(dataset)

In [None]:
from transformers import AutoProcessor,  AutoTokenizer, Qwen2_5_VLForConditionalGeneration, LlavaNextForConditionalGeneration
from PIL import Image, ImageOps
import torch

In [None]:
# System prompt used as the "system" turn of every chat built for the evaluated
# vision-language model (see format_data / build_messages).
system_message = """You are a Vision Language Model specialized in interpreting visual data from charts and diagrams images.
Answer the questions strictly from the image, with clear, rigorous step-by-step justification. Stay concise, but include all reasoning that’s relevant."""

def to_pil(img):
    """Return ``img`` unchanged if it is already a PIL image, else build one via ``Image.fromarray``."""
    if isinstance(img, Image.Image):
        return img
    return Image.fromarray(img)

### Setup Qwen2.5-VL

In [10]:
MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct"

# Lower/upper pixel budgets handed to the processor.
min_pixels = 256 * 28 * 28
max_pixels = 2560 * 28 * 28
processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    min_pixels=min_pixels,
    max_pixels=max_pixels,
)

# Use bfloat16 when the GPU supports it, float16 otherwise.
use_bf16 = torch.cuda.is_bf16_supported()
model_dtype = torch.bfloat16 if use_bf16 else torch.float16

model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype=model_dtype,
    # Optional (if your setup supports it):
    attn_implementation="flash_attention_2",
    low_cpu_mem_usage=True,
)


##Uncomment for LoRA finetuned version
#MERGED_DIR = "qwen-vl-merged-tierA-tableqa"
#OUTPUT_DIR_TIER_B= "qwen-vl-sft-lora-tableqa-tierB"

#base = Qwen2_5_VLForConditionalGeneration.from_pretrained(
#    MERGED_DIR, device_map="auto", torch_dtype="auto",
#    attn_implementation="flash_attention_2", low_cpu_mem_usage=True
#)

#model = PeftModel.from_pretrained(base, OUTPUT_DIR_TIER_B)

#del base                       
#torch.cuda.empty_cache()    
#model.eval()

def format_data(sample):
    """Build the three-turn chat (system, user, assistant) for one dataset row.

    The user turn carries the row image (converted with ``to_pil``) followed by
    the question; the assistant turn carries the reference answer.
    """
    system_turn = {
        "role": "system",
        "content": [{"type": "text", "text": system_message}],
    }
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image", "image": to_pil(sample["image"])},
            {"type": "text", "text": sample["question"]},
        ],
    }
    assistant_turn = {
        "role": "assistant",
        "content": [{"type": "text", "text": sample["answer"]}],
    }
    return [system_turn, user_turn, assistant_turn]

def generate_text_from_sample(model, sample, max_new_tokens=5000, device="cuda"):
    """Generate the model's answer for one dataset row using greedy decoding.

    Only the system and user turns are sent; the reference answer is left out.
    Returns the decoded text of the newly generated tokens.
    """
    # NOTE(review): process_vision_info is not imported anywhere in the visible
    # notebook (presumably it comes from qwen_vl_utils) - confirm it is in scope.
    prompt_messages = format_data(sample)[:2]

    # Render the chat into the prompt string and pull out the image inputs.
    chat_text = processor.apply_chat_template(prompt_messages, tokenize=False, add_generation_prompt=True)
    image_inputs, _ = process_vision_info(prompt_messages)

    # Tokenise text + images and move the tensors to the target device.
    model_inputs = processor(text=[chat_text], images=image_inputs, return_tensors="pt").to(device)

    output_ids = model.generate(**model_inputs, max_new_tokens=max_new_tokens, do_sample=False)

    # Drop the prompt tokens so only the generated continuation is decoded.
    continuation_ids = []
    for prompt_ids, full_ids in zip(model_inputs.input_ids, output_ids):
        continuation_ids.append(full_ids[len(prompt_ids):])

    decoded = processor.batch_decode(
        continuation_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    return decoded[0]

The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.
`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|██████████| 5/5 [00:08<00:00,  1.71s/it]


### Setup LLaVA

In [None]:
MODEL_ID = "llava-hf/llama3-llava-next-8b-hf"

processor = AutoProcessor.from_pretrained(MODEL_ID)

# Use bfloat16 when the GPU supports it, float16 otherwise.
use_bf16 = torch.cuda.is_bf16_supported()
model_dtype = torch.bfloat16 if use_bf16 else torch.float16

model = LlavaNextForConditionalGeneration.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype=model_dtype,
    low_cpu_mem_usage=True,
)

#Uncomment for LoRA finetuned version
#adapter_path = "PATH_TO_LORA_ADAPTER"
#model.load_adapter(adapter_path)

def build_messages(sample):
    """Return two chats for one dataset row: (prompt only, prompt + reference answer).

    The prompt consists of the system turn and a user turn holding the image and
    the question; the second chat appends the assistant turn with the answer.
    """
    reference = sample["answer"]
    system_turn = {
        "role": "system",
        "content": [{"type": "text", "text": system_message}],
    }
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image", "image": to_pil(sample['image'])},
            {"type": "text", "text": sample['question']},
        ],
    }
    assistant_turn = {
        "role": "assistant",
        "content": [{"type": "text", "text": reference}],
    }
    prompt_only = [system_turn, user_turn]
    with_answer = [system_turn, user_turn, assistant_turn]
    return prompt_only, with_answer

def generate_text_from_sample(model, sample, max_new_tokens=5000, device="cuda"):
    """Generate the model's answer for one dataset row using greedy decoding.

    Only the prompt chat (system + user turns) is sent. Returns the decoded,
    whitespace-stripped text of the newly generated tokens.
    """
    # NOTE(review): process_vision_info is not imported anywhere in the visible
    # notebook (presumably it comes from qwen_vl_utils) - confirm it is in scope.
    prompt_messages = build_messages(sample)[0]

    # Render the chat into the prompt string and pull out the image inputs.
    chat_text = processor.apply_chat_template(prompt_messages, tokenize=False, add_generation_prompt=True)
    image_inputs, _ = process_vision_info(prompt_messages)

    # Tokenise text + images and move the tensors to the target device.
    model_inputs = processor(text=[chat_text], images=image_inputs, return_tensors="pt").to(device)

    tokenizer = processor.tokenizer
    with torch.inference_mode():
        output_ids = model.generate(
            **model_inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Keep only the tokens produced after the prompt.
    prompt_length = model_inputs["input_ids"].shape[1]
    new_token_ids = output_ids[:, prompt_length:]

    decoded = processor.decode(
        new_token_ids[0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    return decoded.strip()

### Evaluation Loop

In [16]:
# Instantiate the jury evaluator with the API keys collected in `api_keys`
evaluator = TableQA_Evaluation(api_keys)

In [1]:
# Set up the results container:
#   "skipped"     -> row indices that could not be evaluated
#   "evaluations" -> one record per successfully evaluated row
results = dict(skipped=[], evaluations=[])

#Uncomment to resume evaluation in case of interruption
#with open("qwen2.5-vl-7b-detailed-results.json", "r") as f:
#    results = json.load(f)

In [None]:
# Resume right after the highest row index already handled. Skipped rows are
# not stored in results['evaluations'], so counting evaluations alone would
# restart too early and re-evaluate (and duplicate) rows after any skip.
handled_rows = [entry["row"] for entry in results['evaluations']] + list(results['skipped'])
id_start = max(handled_rows) + 1 if handled_rows else 0
for i in range(id_start, len(dataset)):
    # Retrieve data from row (fetch the row once instead of once per field)
    sample = dataset[i]
    image = sample['image']
    question = sample['question']
    answer = sample['answer']

    # Predict response
    pred, verdict = None, None
    for attempt in range(3):
        try:
            pred = generate_text_from_sample(model=model, sample=sample)
            break # success
        except Exception as e:
            if attempt == 2:
                print(f"Issue during inference with row {i}: {e}")
    if pred is None:
        # all three inference attempts failed → skip the row
        results['skipped'].append(i)
        continue

    try:
        evaluations = evaluator.compute_llm_juries(question, pred, answer)
        verdict = evaluations['final_verdict']
    except Exception as e:
        print(e.__class__.__name__, f" An Error occured while computing juries verdict with row {i}")
    if verdict is None:
        # no jury produced a verdict (or the jury computation raised) → skip the row
        results['skipped'].append(i)
        continue
    results['evaluations'].append({"row":i, "question":question, "pred":pred, "answer":answer, "evals":evaluations})
    correctness = verdict
    confidence = evaluations["confidence"]

    print(f"------Row {i} : corr= {correctness}, conf= {confidence} ----------")
    # Checkpoint after every evaluated row so an interrupted run can be resumed.
    with open("qwen2.5-vl-7b-detailed-results.json", "w") as f:
        json.dump(results, f, indent=2)

print(f"skipped:{results['skipped']}")

In [20]:
# Final jury verdict (1 = correct, 0 = incorrect) of every evaluated row.
corr = [
    row_result['evals']["final_verdict"]
    for row_result in results["evaluations"]
]

In [None]:
# Share of evaluated rows judged correct. Guard against an empty run, which
# would otherwise raise ZeroDivisionError.
if corr:
    final = sum(corr) / len(corr)
    print(f'correctness: {final*100:.2f}%.')
else:
    final = None
    print('correctness: n/a (no evaluated rows).')