In [None]:
!pip install pandasql
!pip install natsort

In [None]:
# ReAcTable Evaluation: Code for Iterative Evaluation on the Entire TableBench Dataset

import sys
import os
sys.path.append(os.path.abspath("../"))
import json
import pandas as pd
from datasets import load_dataset
from Tablebench_loader import Tablebench_Filtered
from dotenv import load_dotenv

dotenv_path = "/workspace/ReAcTable/notebooks/.env"
load_dotenv(dotenv_path=dotenv_path)

from tabqa.GptCOTPrompter_BeamSeach import CodexAnswerCOTExecutor_HighTemperaturMajorityVote

# ========== Setting ==========
# Paths handed to the executor (base_path, demo_file, prompt_template_json)
# and the locations this script writes to (per-sample table CSVs and the
# aggregated results log). All output paths are relative to the current
# working directory.
base_path = './'
demo_file = 'few-shot-demo/WikiTQ-sql-py.json'
prompt_template_json = 'prompt_template/original-sql-py-no-intermediate.json'
table_output_dir = 'tablebench_eval_logs/table_csv'
log_output_dir = 'tablebench_eval_logs/'

# Neither DataFrame.to_csv nor open(..., "w") creates missing parent
# directories, so create both output directories up front. exist_ok=True
# makes this safe to re-run.
os.makedirs(log_output_dir, exist_ok=True)
os.makedirs(table_output_dir, exist_ok=True)

# ========== Define Evaluation Function ==========
def evaluate_single_sample(index,
                           model_name='gpt-4o-mini',
                           temperature=0.6,
                           max_tokens=128,
                           repeat_times=5):
    """Perform LLM-based evaluation on a single TableBench sample.

    Selects sample ``index`` on the module-level ``filtered`` loader, writes
    its table to ``<table_output_dir>/tablebench_<index>.csv`` and runs a
    ``CodexAnswerCOTExecutor_HighTemperaturMajorityVote`` over that CSV.

    Side effects:
        * Mutates the global ``filtered`` (sets ``filtered.index``).
        * Creates ``table_output_dir`` if missing and (over)writes the
          sample's CSV file inside it.

    Args:
        index: Position of the sample to evaluate; assigned to
            ``filtered.index`` and used to build the qid / CSV file name.
        model_name: Value assigned to ``executor.model``.
        temperature: Value assigned to ``executor.temperature``.
        max_tokens: Value assigned to ``executor.max_tokens``.
        repeat_times: Forwarded to
            ``executor._get_gpt_prediction_majority_vote``.

    Returns:
        Whatever ``executor._log_dict()`` returns (presumably a
        JSON-serializable dict describing the run -- confirm against the
        executor implementation).
    """
    # The loader is shared module state: setting `index` decides which sample
    # the get_table / get_question / get_answer calls below refer to.
    filtered.index = index
    table_df = filtered.get_table()

    # to_csv does not create missing parent directories, so make sure the
    # target directory exists before writing (no-op when already present).
    os.makedirs(table_output_dir, exist_ok=True)
    table_path = os.path.join(table_output_dir, f'tablebench_{index}.csv')
    table_df.to_csv(table_path, index=False)

    qid = f"tablebench_{index}"
    utterance = filtered.get_question()
    target_value = filtered.get_answer()

    # The executor is given the path of the CSV written above, not the
    # in-memory table.
    executor = CodexAnswerCOTExecutor_HighTemperaturMajorityVote(
        prompt_template_json=prompt_template_json,
        qid=qid,
        utterance=utterance,
        source_csv=table_path,
        target_value=target_value,
        base_path=base_path,
        demo_file=demo_file,
        sep=','
    )

    # Generation settings are applied as attributes after construction.
    executor.model = model_name
    executor.temperature = temperature
    executor.max_tokens = max_tokens

    # Run the executor's steps in order: read data, build the prompt,
    # then sample `repeat_times` predictions for the majority vote.
    executor._read_data()
    executor._gen_gpt_prompt()
    executor._get_gpt_prediction_majority_vote(repeat_times=repeat_times)

    return executor._log_dict()

# ========== TableBench Dataset Load ==========
dataset = load_dataset(path="Multilingual-Multimodal-NLP/TableBench", data_files="TableBench.jsonl", split="train")
# `filtered` is the module-level loader that evaluate_single_sample reads and
# re-points at each sample (the 0 is presumably the initial index -- confirm
# against Tablebench_Filtered).
filtered = Tablebench_Filtered(dataset, 0)

results = []
output_path = os.path.join(log_output_dir, "tablebench_results.json")
# open(..., "w") does not create missing parent directories; make sure the
# log directory exists before the first incremental save.
os.makedirs(log_output_dir, exist_ok=True)

# ========== Repeat the process ==========
start_idx = 0
end_idx = 2  # exclusive upper bound; widen the range to evaluate more samples

for i in range(start_idx, end_idx):
    try:
        print(f"▶️ Evaluating index {i}")
        result = evaluate_single_sample(
            index=i,
            model_name='gpt-4o-mini',
            temperature=0.7,
            max_tokens=150,
            repeat_times=3
        )

        # Serialize BEFORE mutating `results` or opening the file. If this
        # result is not JSON-serializable, the error is raised here, so the
        # bad entry never enters `results` (which would break every later
        # save) and the previously saved file is not truncated.
        payload = json.dumps(results + [result], indent=2)
        results.append(result)

        # Incrementally save results to file after each loop
        with open(output_path, "w") as f:
            f.write(payload)

    except Exception as e:
        # Deliberate best-effort: report the failing index and continue with
        # the remaining samples instead of aborting the whole run.
        print(f"❌ Error at index {i}: {e}")