In [None]:
# 🧩 STEP 1 — Install dependencies
!pip install -q datasets openai tqdm pandas

In [None]:
from google.colab import userdata
import os

from getpass import getpass

# Preferred source: Colab Secrets. Run this cell in the Colab UI with a secret
# named 'Openai_key' that this notebook is allowed to read.
try:
    api_key = userdata.get("Openai_key")
except Exception as exc:
    # Reading the secret can fail (for example with 'TimeoutException').
    # Prompt for the key instead of pasting it into the notebook source,
    # so the key is never saved or shared together with the file.
    print(f"Could not read 'Openai_key' from Colab Secrets ({type(exc).__name__}).")
    api_key = getpass("Paste your OpenAI API key: ")

if not api_key:
    raise ValueError("No OpenAI API key was provided.")

os.environ["OPENAI_API_KEY"] = api_key

print("API key loaded:", bool(os.environ.get("OPENAI_API_KEY")))

API key loaded: True


In [None]:
import os
import random
from datasets import load_dataset
from openai import AsyncOpenAI
from tqdm import tqdm
from tqdm.asyncio import tqdm_asyncio
import asyncio
import re # Import re for regular expressions

# -------------------------------------------------
# CONFIG - change only this block
# -------------------------------------------------
DATASET_NAME = "openai/gsm8k"   # HF dataset ID
DATASET_CONFIG = "main"         # HF config name passed to load_dataset for openai/gsm8k
TEXT_COLUMN = None              # input column; leave as None to auto-detect
LABEL_COLUMN = None             # ground-truth column; leave as None to auto-detect

SAMPLE_SIZE = 500
SEED = 42

# Display label -> OpenAI model ID sent to the API
MODELS = {
    "gpt-5.2": "gpt-5.2",
    "gpt-5-mini": "gpt-5-mini"
}

CONCURRENCY_LIMIT = 5  # maximum number of API requests in flight at once
# -------------------------------------------------

client = AsyncOpenAI(api_key=os.environ["OPENAI_API_KEY"])
semaphore = asyncio.Semaphore(CONCURRENCY_LIMIT)

# -------------------------------------------------
# LOAD DATASET AND RESOLVE COLUMNS
# -------------------------------------------------
print(f"Loading dataset: {DATASET_NAME}")
dataset = load_dataset(DATASET_NAME, DATASET_CONFIG)

# Prefer the "train" split; otherwise use whichever split is listed first.
split = "train" if "train" in dataset else list(dataset.keys())[0]
data = list(dataset[split])

features = dataset[split].features.keys()

# Column names tried, in order, for any column the CONFIG block left as None.
common_text_columns = ["text", "question", "prompt", "sentence"]
common_label_columns = ["label", "answer", "target", "score"]

# Auto-detect only what was left unset, so a manual choice in CONFIG is never overwritten.
if TEXT_COLUMN is None:
    TEXT_COLUMN = next((col for col in common_text_columns if col in features), None)
if LABEL_COLUMN is None:
    LABEL_COLUMN = next((col for col in common_label_columns if col in features), None)

if not TEXT_COLUMN or not LABEL_COLUMN:
    raise ValueError(
        f"Could not auto-detect text or label columns. \n"
        f"Please manually specify TEXT_COLUMN and LABEL_COLUMN in the CONFIG block.\n"
        f"Available features: {list(features)}"
    )

# A manually configured column must actually exist in the dataset.
unknown_columns = [col for col in (TEXT_COLUMN, LABEL_COLUMN) if col not in features]
if unknown_columns:
    raise ValueError(
        f"Configured column(s) {unknown_columns} not found in the dataset.\n"
        f"Available features: {list(features)}"
    )

print(f"Detected TEXT_COLUMN: '{TEXT_COLUMN}' and LABEL_COLUMN: '{LABEL_COLUMN}'")

# Seed the global RNG so the same subset is drawn on every run.
random.seed(SEED)
samples = random.sample(data, min(SAMPLE_SIZE, len(data)))

print(f"Using {len(samples)} samples from '{split}' split")

# -------------------------------------------------
# SYSTEM PROMPT (INTENTIONALLY MINIMAL)
# -------------------------------------------------
SYSTEM_PROMPT = """
You are a question-answering system.
Answer the user query.
Return only the final answer.
"""

def normalize(text):
    """Return ``text`` as a string with surrounding whitespace removed and all letters lowercased."""
    as_string = str(text)
    trimmed = as_string.strip()
    return trimmed.lower()

# New helper function to extract final numerical answer
def extract_final_answer(text):
    """Extract the final numeric answer from ``text``.

    Resolution order:
      1. the number following a ``#### `` marker (the GSM8K ground-truth convention);
      2. otherwise the last number that appears anywhere in the text.

    Thousands separators are removed first, so ``"1,250"`` is read as 1250
    rather than being split at the comma into ``1`` and ``250``.

    Args:
        text: Any value; it is converted with ``str()`` before matching.

    Returns:
        The extracted number as a ``float``, or ``None`` when the text contains no number.
    """
    # Remove a comma only when it sits between digits and is followed by exactly
    # three digits, so lists such as "1, 2, 3" or "3,4" are left untouched.
    text = re.sub(r'(?<=\d),(?=\d{3}(?!\d))', '', str(text))

    # 1. "#### X" pattern (common in GSM8K ground truth)
    match = re.search(r'#### (\-?\d+(?:\.\d+)?)', text)
    if match:
        return float(match.group(1))

    # 2. Last numeric value in the text: optional minus sign, digits, optional decimal part.
    numbers = re.findall(r'\-?\d+(?:\.\d+)?', text)
    if numbers:
        return float(numbers[-1])

    return None

# Refactored to be an async function and use semaphore
async def run_model(model_id, question):
    """Send ``question`` to ``model_id`` and return the stripped text of the first choice.

    The request is made while holding the module-level ``semaphore`` so the number
    of concurrent API calls stays bounded.

    Args:
        model_id: Model identifier passed to the chat completions endpoint.
        question: User message content.

    Returns:
        The reply text with surrounding whitespace removed, or ``""`` when the
        response carries no text content.
    """
    async with semaphore:  # hold a slot only for the duration of the API call
        response = await client.chat.completions.create(
            model=model_id,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": question}
            ]
        )
    # The content field is nullable; calling .strip() on None would raise and
    # abort every other request gathered alongside this one.
    content = response.choices[0].message.content
    return (content or "").strip()

# -------------------------------------------------
# EVALUATION (Refactored for async)
# -------------------------------------------------
results = {name: 0 for name in MODELS}

async def main_async():
    """Query every model on every sample and count correct answers in ``results``.

    A prediction counts as correct when its extracted number equals the number
    extracted from the ground truth, or, failing that, when the normalized
    ground-truth text occurs inside the normalized prediction.

    Side effects:
        Resets and then updates the module-level ``results`` dict
        (model label -> number of correct answers).
    """
    # Start from zero on every call so that running `await main_async()` again
    # does not add to the counts of an earlier run.
    for name in MODELS:
        results[name] = 0

    tasks = []
    sample_info = []  # (true_numeric_answer, true_normalized_answer, model_name), aligned with tasks

    for item in samples:
        question = item[TEXT_COLUMN]
        true_numeric_answer = extract_final_answer(item[LABEL_COLUMN])
        true_normalized_answer = normalize(item[LABEL_COLUMN])

        for name, model_id in MODELS.items():
            tasks.append(run_model(model_id, question))
            sample_info.append((true_numeric_answer, true_normalized_answer, name))

    print(f"\nRunning models asynchronously with {CONCURRENCY_LIMIT} concurrent requests...")
    # The scoring loop pairs each prediction with the sample_info entry at the same index.
    predictions = await tqdm_asyncio.gather(*tasks, desc="Evaluating models")

    for pred_raw, info in zip(predictions, sample_info):
        true_numeric_answer, true_normalized_answer, model_name = info
        pred_normalized = normalize(pred_raw)
        pred_numeric_answer = extract_final_answer(pred_normalized)

        # Numerical comparison first (exact equality after extraction).
        is_correct = False
        if true_numeric_answer is not None and pred_numeric_answer is not None:
            is_correct = (true_numeric_answer == pred_numeric_answer)

        # Fallback: the normalized ground-truth text appears inside the prediction.
        if not is_correct:
            is_correct = true_normalized_answer in pred_normalized

        if is_correct:
            results[model_name] += 1

# Run the asynchronous main function directly using await
await main_async()

# -------------------------------------------------
# RESULTS
# -------------------------------------------------
print("\n==============================")
print(f"4CA BENCHMARK RESULTS â€” {DATASET_NAME}")
print("==============================")

for name in MODELS:
    acc = (results[name] / len(samples)) * 100
    print(f"{name} accuracy: {acc:.2f}%")


Loading dataset: openai/gsm8k
Detected TEXT_COLUMN: 'question' and LABEL_COLUMN: 'answer'
Using 500 samples from 'train' split

Running models asynchronously with 5 concurrent requests...


Evaluating models: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1000/1000 [10:17<00:00,  1.62it/s]


4CA BENCHMARK RESULTS â€” openai/gsm8k
gpt-5.2 accuracy: 91.00%
gpt-5-mini accuracy: 89.00%





In [None]:
import os
import random
from datasets import load_dataset
from openai import AsyncOpenAI
from tqdm import tqdm
from tqdm.asyncio import tqdm_asyncio
import asyncio
import re # Import re for regular expressions

# -------------------------------------------------
# CONFIG - change only this block
# -------------------------------------------------
DATASET_NAME = "openai/gsm8k"   # HF dataset ID
DATASET_CONFIG = "main"         # HF config name passed to load_dataset for openai/gsm8k
TEXT_COLUMN = None              # input column; leave as None to auto-detect
LABEL_COLUMN = None             # ground-truth column; leave as None to auto-detect

SAMPLE_SIZE = 500
SEED = 42

# Display label -> OpenAI model ID sent to the API
MODELS = {
    "gpt-5.2": "gpt-5.2",
    "gpt-5-mini": "gpt-5-mini"
}

CONCURRENCY_LIMIT = 5  # maximum number of API requests in flight at once
# -------------------------------------------------

client = AsyncOpenAI(api_key=os.environ["OPENAI_API_KEY"])
semaphore = asyncio.Semaphore(CONCURRENCY_LIMIT)

# -------------------------------------------------
# LOAD DATASET AND RESOLVE COLUMNS
# -------------------------------------------------
print(f"Loading dataset: {DATASET_NAME}")
dataset = load_dataset(DATASET_NAME, DATASET_CONFIG)

# Prefer the "train" split; otherwise use whichever split is listed first.
split = "train" if "train" in dataset else list(dataset.keys())[0]
data = list(dataset[split])

features = dataset[split].features.keys()

# Column names tried, in order, for any column the CONFIG block left as None.
common_text_columns = ["text", "question", "prompt", "sentence"]
common_label_columns = ["label", "answer", "target", "score"]

# Auto-detect only what was left unset, so a manual choice in CONFIG is never overwritten.
if TEXT_COLUMN is None:
    TEXT_COLUMN = next((col for col in common_text_columns if col in features), None)
if LABEL_COLUMN is None:
    LABEL_COLUMN = next((col for col in common_label_columns if col in features), None)

if not TEXT_COLUMN or not LABEL_COLUMN:
    raise ValueError(
        f"Could not auto-detect text or label columns. \n"
        f"Please manually specify TEXT_COLUMN and LABEL_COLUMN in the CONFIG block.\n"
        f"Available features: {list(features)}"
    )

# A manually configured column must actually exist in the dataset.
unknown_columns = [col for col in (TEXT_COLUMN, LABEL_COLUMN) if col not in features]
if unknown_columns:
    raise ValueError(
        f"Configured column(s) {unknown_columns} not found in the dataset.\n"
        f"Available features: {list(features)}"
    )

print(f"Detected TEXT_COLUMN: '{TEXT_COLUMN}' and LABEL_COLUMN: '{LABEL_COLUMN}'")

# Seed the global RNG so the same subset is drawn on every run.
random.seed(SEED)
samples = random.sample(data, min(SAMPLE_SIZE, len(data)))

print(f"Using {len(samples)} samples from '{split}' split")

# -------------------------------------------------
# SYSTEM PROMPT (INTENTIONALLY MINIMAL)
# -------------------------------------------------
SYSTEM_PROMPT = """
You are a question-answering system.
Answer the user query.
Return only the final answer.
"""

def normalize(text):
    """Return ``text`` as a string with surrounding whitespace removed and all letters lowercased."""
    as_string = str(text)
    trimmed = as_string.strip()
    return trimmed.lower()

# New helper function to extract final numerical answer
def extract_final_answer(text):
    """Extract the final numeric answer from ``text``.

    Resolution order:
      1. the number following a ``#### `` marker (the GSM8K ground-truth convention);
      2. otherwise the last number that appears anywhere in the text.

    Thousands separators are removed first, so ``"1,250"`` is read as 1250
    rather than being split at the comma into ``1`` and ``250``.

    Args:
        text: Any value; it is converted with ``str()`` before matching.

    Returns:
        The extracted number as a ``float``, or ``None`` when the text contains no number.
    """
    # Remove a comma only when it sits between digits and is followed by exactly
    # three digits, so lists such as "1, 2, 3" or "3,4" are left untouched.
    text = re.sub(r'(?<=\d),(?=\d{3}(?!\d))', '', str(text))

    # 1. "#### X" pattern (common in GSM8K ground truth)
    match = re.search(r'#### (\-?\d+(?:\.\d+)?)', text)
    if match:
        return float(match.group(1))

    # 2. Last numeric value in the text: optional minus sign, digits, optional decimal part.
    numbers = re.findall(r'\-?\d+(?:\.\d+)?', text)
    if numbers:
        return float(numbers[-1])

    return None

# Refactored to be an async function and use semaphore
async def run_model(model_id, question):
    """Send ``question`` to ``model_id`` and return the stripped text of the first choice.

    The request is made while holding the module-level ``semaphore`` so the number
    of concurrent API calls stays bounded.

    Args:
        model_id: Model identifier passed to the chat completions endpoint.
        question: User message content.

    Returns:
        The reply text with surrounding whitespace removed, or ``""`` when the
        response carries no text content.
    """
    async with semaphore:  # hold a slot only for the duration of the API call
        response = await client.chat.completions.create(
            model=model_id,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": question}
            ]
        )
    # The content field is nullable; calling .strip() on None would raise and
    # abort every other request gathered alongside this one.
    content = response.choices[0].message.content
    return (content or "").strip()

# -------------------------------------------------
# EVALUATION (Refactored for async)
# -------------------------------------------------
results = {name: 0 for name in MODELS}

async def main_async():
    """Query every model on every sample and count correct answers in ``results``.

    A prediction counts as correct when its extracted number equals the number
    extracted from the ground truth, or, failing that, when the normalized
    ground-truth text occurs inside the normalized prediction.

    Side effects:
        Resets and then updates the module-level ``results`` dict
        (model label -> number of correct answers).
    """
    # Start from zero on every call so that running `await main_async()` again
    # does not add to the counts of an earlier run.
    for name in MODELS:
        results[name] = 0

    tasks = []
    sample_info = []  # (true_numeric_answer, true_normalized_answer, model_name), aligned with tasks

    for item in samples:
        question = item[TEXT_COLUMN]
        true_numeric_answer = extract_final_answer(item[LABEL_COLUMN])
        true_normalized_answer = normalize(item[LABEL_COLUMN])

        for name, model_id in MODELS.items():
            tasks.append(run_model(model_id, question))
            sample_info.append((true_numeric_answer, true_normalized_answer, name))

    print(f"\nRunning models asynchronously with {CONCURRENCY_LIMIT} concurrent requests...")
    # The scoring loop pairs each prediction with the sample_info entry at the same index.
    predictions = await tqdm_asyncio.gather(*tasks, desc="Evaluating models")

    for pred_raw, info in zip(predictions, sample_info):
        true_numeric_answer, true_normalized_answer, model_name = info
        pred_normalized = normalize(pred_raw)
        pred_numeric_answer = extract_final_answer(pred_normalized)

        # Numerical comparison first (exact equality after extraction).
        is_correct = False
        if true_numeric_answer is not None and pred_numeric_answer is not None:
            is_correct = (true_numeric_answer == pred_numeric_answer)

        # Fallback: the normalized ground-truth text appears inside the prediction.
        if not is_correct:
            is_correct = true_normalized_answer in pred_normalized

        if is_correct:
            results[model_name] += 1

# Run the asynchronous main function directly using await
await main_async()

# -------------------------------------------------
# RESULTS
# -------------------------------------------------
print("\n==============================")
print(f"\u00104CA BENCHMARK RESULTS â€” {DATASET_NAME}")
print("==============================")

for name in MODELS:
    acc = (results[name] / len(samples)) * 100
    print(f"{name} accuracy: {acc:.2f}%")


Loading dataset: openai/gsm8k
Detected TEXT_COLUMN: 'question' and LABEL_COLUMN: 'answer'
Using 500 samples from 'train' split

Running models asynchronously with 5 concurrent requests...


Evaluating models: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1000/1000 [09:49<00:00,  1.70it/s]


4CA BENCHMARK RESULTS â€” openai/gsm8k
gpt-5.2 accuracy: 90.20%
gpt-5-mini accuracy: 88.80%





In [None]:
import os
import random
from datasets import load_dataset
from openai import AsyncOpenAI
from tqdm import tqdm
from tqdm.asyncio import tqdm_asyncio
import asyncio
import re # Import re for regular expressions
import pandas as pd # Import pandas for tabular output
from google.colab import userdata # Import userdata to get API key

# -------------------------------------------------
# CONFIG - change only this block
# -------------------------------------------------
# Dataset name and columns are not set here: each entry of BENCHMARK_DATASETS
# carries its own dataset ID, config name and column names.

SEED = 42
SAMPLE_SIZE = 50  # questions drawn per dataset

CONCURRENCY_LIMIT = 5  # maximum number of API requests in flight at once

# Display label -> OpenAI model ID sent to the API
MODELS = {
    "gpt-5.2": "gpt-5.2",
    "gpt-5-mini": "gpt-5-mini",
}
# -------------------------------------------------

# Fetch the key from Colab Secrets only when the environment does not already
# provide it, so this cell also works after a kernel restart or when run on its own.
if os.environ.get("OPENAI_API_KEY") is None:
    os.environ["OPENAI_API_KEY"] = userdata.get("Openai_key")

semaphore = asyncio.Semaphore(CONCURRENCY_LIMIT)
client = AsyncOpenAI(api_key=os.environ["OPENAI_API_KEY"])

# Common column names kept for an auto-detection fallback.
common_label_columns = ["label", "answer", "target", "score", "answerKey"]
common_text_columns = ["text", "question", "prompt", "sentence", "content"]

# -------------------------------------------------
# HELPER FUNCTIONS
# -------------------------------------------------
# Deliberately minimal system prompt shared by every request.
SYSTEM_PROMPT = """
You are a question-answering system.
Answer the user query.
Return only the final answer.
"""

def normalize(text):
    """Return ``text`` as a string with surrounding whitespace removed and all letters lowercased."""
    as_string = str(text)
    trimmed = as_string.strip()
    return trimmed.lower()

def extract_final_answer(text):
    """Extract the final numeric answer from ``text``.

    Resolution order:
      1. the number following a ``#### `` marker (the GSM8K ground-truth convention);
      2. otherwise the last number that appears anywhere in the text.

    Thousands separators are removed first, so ``"1,250"`` is read as 1250
    rather than being split at the comma into ``1`` and ``250``.

    Args:
        text: Any value; it is converted with ``str()`` before matching.

    Returns:
        The extracted number as a ``float``, or ``None`` when the text contains no number.
    """
    # Remove a comma only when it sits between digits and is followed by exactly
    # three digits, so lists such as "1, 2, 3" or "3,4" are left untouched.
    text = re.sub(r'(?<=\d),(?=\d{3}(?!\d))', '', str(text))

    # 1. "#### X" pattern (common in GSM8K ground truth)
    match = re.search(r'#### (\-?\d+(?:\.\d+)?)', text)
    if match:
        return float(match.group(1))

    # 2. Last numeric value in the text: optional minus sign, digits, optional decimal part.
    numbers = re.findall(r'\-?\d+(?:\.\d+)?', text)
    if numbers:
        return float(numbers[-1])

    return None

async def run_model(model_id, question):
    """Send ``question`` to ``model_id`` and return the stripped text of the first choice.

    The request is made while holding the module-level ``semaphore`` so the number
    of concurrent API calls stays bounded.

    Args:
        model_id: Model identifier passed to the chat completions endpoint.
        question: User message content.

    Returns:
        The reply text with surrounding whitespace removed, or ``""`` when the
        response carries no text content.
    """
    async with semaphore:  # hold a slot only for the duration of the API call
        response = await client.chat.completions.create(
            model=model_id,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": question}
            ]
        )
    # The content field is nullable; calling .strip() on None would raise and
    # abort every other request gathered alongside this one.
    content = response.choices[0].message.content
    return (content or "").strip()

# -------------------------------------------------
# BENCHMARK DATASET DEFINITIONS - Curated for strict 'string containment works' metric
# -------------------------------------------------
# One entry per benchmark. "config_name" is optional; the two *_override keys
# name the question column and the ground-truth column of that dataset.
BENCHMARK_DATASETS = [
    dict(
        dataset_name="openai/gsm8k",
        config_name="main",
        text_col_override="question",
        label_col_override="answer",
    ),
    dict(
        dataset_name="nq_open",
        config_name="nq_open",
        text_col_override="question",
        label_col_override="answer",
    ),
    dict(
        dataset_name="web_questions",
        text_col_override="question",
        label_col_override="answers",
    ),
    # 'wiki_qa' is intentionally absent: it is an answer-sentence selection task
    # with 0/1 labels, not direct answer generation.
    dict(
        dataset_name="trivia_qa",
        config_name="rc.nocontext",
        text_col_override="question",
        label_col_override="answer",
    ),
    dict(
        dataset_name="truthful_qa",
        config_name="generation",
        text_col_override="question",
        label_col_override="best_answer",
    ),
]

# Summary rows (one dict per dataset) collected across the whole benchmark run.
all_benchmark_results = list()

def _collect_gold_answers(raw_label_content):
    """Flatten a dataset label (string, number, list or dict) into a list of raw answer candidates."""
    if isinstance(raw_label_content, dict):
        if isinstance(raw_label_content.get('text'), list):
            return list(raw_label_content['text'])
        for key in ('normalized_value', 'best_answer'):
            if isinstance(raw_label_content.get(key), str):
                return [raw_label_content[key]]
        # No known key: fall back to the dict's string form (less ideal but generic).
        # NOTE(review): some datasets may ship alias lists next to the canonical
        # answer; they are not consulted here - confirm whether they should be.
        return [str(raw_label_content)]
    if isinstance(raw_label_content, list):
        return [str(x) for x in raw_label_content]
    return [str(raw_label_content)]


def _gold_numeric_answer(candidate):
    """Return the gold answer as a float only when the gold answer itself is a number.

    A gold text qualifies when it carries the GSM8K ``####`` marker or consists of a
    single number. Free text that merely contains digits (e.g. "Apollo 11") returns
    ``None`` so that it is scored by string containment instead.
    """
    text = str(candidate)
    if '####' in text or re.fullmatch(r'\s*\-?\d+(?:\.\d+)?\s*', text):
        return extract_final_answer(text)
    return None


def _is_prediction_correct(pred_raw, true_numeric_answer, true_normalized_answers):
    """Score one prediction: numeric equality first, then containment of any gold answer."""
    pred_normalized = normalize(pred_raw)
    if true_numeric_answer is not None:
        pred_numeric_answer = extract_final_answer(pred_normalized)
        if pred_numeric_answer is not None and pred_numeric_answer == true_numeric_answer:
            return True
    return any(answer in pred_normalized for answer in true_normalized_answers)


async def run_benchmark_for_dataset(dataset_info):
    """Benchmark every model in ``MODELS`` on one dataset and record a summary row.

    Args:
        dataset_info: Dict with ``dataset_name`` and, optionally, ``config_name``,
            ``text_col_override`` and ``label_col_override``.

    Returns:
        Dict mapping each model label to its number of correct answers.

    Raises:
        ValueError: If the text or label column is not given in ``dataset_info``.

    Side effects:
        Prints progress and per-model accuracy, and appends one summary dict to
        the module-level ``all_benchmark_results`` list.
    """
    current_dataset_name = dataset_info["dataset_name"]
    config_name = dataset_info.get("config_name")
    text_column = dataset_info.get("text_col_override")
    label_column = dataset_info.get("label_col_override")

    print(f"\n--- Benchmarking: {current_dataset_name} ({config_name if config_name else 'default'}) ---")

    print(f"Loading dataset: {current_dataset_name}")
    if config_name:
        dataset = load_dataset(current_dataset_name, config_name)
    else:
        dataset = load_dataset(current_dataset_name)

    # Split preference: validation, then test, then whichever split is listed first.
    split = "validation" if "validation" in dataset else "test" if "test" in dataset else list(dataset.keys())[0]
    data = list(dataset[split])

    if not text_column or not label_column:
        raise ValueError(
            f"Could not determine text or label columns for {current_dataset_name}. \n"
            f"Please specify TEXT_COLUMN and LABEL_COLUMN. Available features: {list(dataset[split].features.keys())}"
        )
    print(f"Detected TEXT_COLUMN: '{text_column}' and LABEL_COLUMN: '{label_column}'")

    # Re-seed per dataset so each dataset's sample is reproducible on its own.
    random.seed(SEED)
    samples = random.sample(data, min(SAMPLE_SIZE, len(data)))
    print(f"Using {len(samples)} samples from '{split}' split")

    current_dataset_results = {name: 0 for name in MODELS}

    tasks = []
    sample_infos = []  # (true_numeric_answer, true_normalized_answers, model_name), aligned with tasks

    for item in samples:
        question_text = item[text_column]
        true_answer_raw_candidates = _collect_gold_answers(item[label_column])

        # Normalize, then drop candidates that end up empty: an empty string is a
        # substring of every prediction and would mark every answer as correct.
        true_normalized_answers = [
            normalized
            for normalized in (normalize(cand) for cand in true_answer_raw_candidates if cand)
            if normalized
        ]

        # Numeric comparison is based on the first candidate only.
        true_numeric_answer = None
        if true_answer_raw_candidates:
            true_numeric_answer = _gold_numeric_answer(true_answer_raw_candidates[0])

        for name, model_id in MODELS.items():
            tasks.append(run_model(model_id, question_text))
            sample_infos.append((true_numeric_answer, true_normalized_answers, name))

    print(f"\nRunning models asynchronously with {CONCURRENCY_LIMIT} concurrent requests...")
    # The scoring loop pairs each prediction with the sample_infos entry at the same index.
    predictions = await tqdm_asyncio.gather(*tasks, desc=f"Evaluating {current_dataset_name} models")

    for pred_raw, info in zip(predictions, sample_infos):
        true_numeric_answer, true_normalized_answers_list, model_name = info
        if _is_prediction_correct(pred_raw, true_numeric_answer, true_normalized_answers_list):
            current_dataset_results[model_name] += 1

    # Accuracy per model, as a percentage of the sampled questions.
    accuracies = {
        name: (current_dataset_results[name] / len(samples)) * 100
        for name in MODELS
    }

    dataset_summary = {
        "Dataset": f"{current_dataset_name}{' (' + config_name + ')' if config_name else ''}",
        "Samples": len(samples)
    }
    for name in MODELS:
        dataset_summary[name] = f"{accuracies[name]:.2f}%"
    all_benchmark_results.append(dataset_summary)

    print("\n==============================")
    # Header emoji (U+1F4CA) and em dash (U+2014) written as explicit escapes.
    print(f"\U0001F4CA BENCHMARK RESULTS \u2014 {current_dataset_name} ({config_name if config_name else 'default'}) ")
    print("=============================")

    for name in MODELS:
        print(f"{name} accuracy: {accuracies[name]:.2f}%")

    return current_dataset_results

async def overall_main():
    """Run every benchmark in ``BENCHMARK_DATASETS`` in order and print a summary table.

    Side effects:
        Empties and refills the module-level ``all_benchmark_results`` list and
        prints the combined results.
    """
    # Drop rows left by an earlier call so running this again does not list
    # every dataset twice in the summary.
    all_benchmark_results.clear()

    for dataset_info in BENCHMARK_DATASETS:
        await run_benchmark_for_dataset(dataset_info)

    print("\n\n=============================================")
    # Sparkles emoji (U+2728) written as an explicit escape.
    print("\u2728 OVERALL BENCHMARK RESULTS SUMMARY \u2728")
    print("=============================================")

    results_df = pd.DataFrame(all_benchmark_results)
    try:
        print(results_df.to_markdown(index=False))
    except ImportError:
        # to_markdown depends on the optional 'tabulate' package; fall back to
        # a plain-text table when it is not installed.
        print(results_df.to_string(index=False))

# Run the overall asynchronous main function.
# The coroutine is awaited directly at cell level; no asyncio.run() wrapper is used.
await overall_main()


--- Benchmarking: openai/gsm8k (main) ---
Loading dataset: openai/gsm8k
Detected TEXT_COLUMN: 'question' and LABEL_COLUMN: 'answer'
Using 50 samples from 'test' split

Running models asynchronously with 5 concurrent requests...


Evaluating openai/gsm8k models: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 100/100 [00:56<00:00,  1.78it/s]



ðŸ“Š BENCHMARK RESULTS â€” openai/gsm8k (main) 
gpt-5.2 accuracy: 88.00%
gpt-5-mini accuracy: 84.00%

--- Benchmarking: nq_open (nq_open) ---
Loading dataset: nq_open


README.md: 0.00B [00:00, ?B/s]

nq_open/train-00000-of-00001.parquet:   0%|          | 0.00/4.46M [00:00<?, ?B/s]

nq_open/validation-00000-of-00001.parque(â€¦):   0%|          | 0.00/214k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87925 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3610 [00:00<?, ? examples/s]

Detected TEXT_COLUMN: 'question' and LABEL_COLUMN: 'answer'
Using 50 samples from 'validation' split

Running models asynchronously with 5 concurrent requests...


Evaluating nq_open models: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 100/100 [01:38<00:00,  1.02it/s]



ðŸ“Š BENCHMARK RESULTS â€” nq_open (nq_open) 
gpt-5.2 accuracy: 62.00%
gpt-5-mini accuracy: 52.00%

--- Benchmarking: web_questions (default) ---
Loading dataset: web_questions


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/260k [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/142k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3778 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2032 [00:00<?, ? examples/s]

Detected TEXT_COLUMN: 'question' and LABEL_COLUMN: 'answers'
Using 50 samples from 'test' split

Running models asynchronously with 5 concurrent requests...


Evaluating web_questions models: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 100/100 [02:19<00:00,  1.40s/it]



ðŸ“Š BENCHMARK RESULTS â€” web_questions (default) 
gpt-5.2 accuracy: 74.00%
gpt-5-mini accuracy: 66.00%

--- Benchmarking: wiki_qa (default) ---
Loading dataset: wiki_qa


README.md: 0.00B [00:00, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/594k [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/264k [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/2.00M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/6165 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2733 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/20360 [00:00<?, ? examples/s]

Detected TEXT_COLUMN: 'question' and LABEL_COLUMN: 'answer'
Using 50 samples from 'validation' split

Running models asynchronously with 5 concurrent requests...


Evaluating wiki_qa models: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 100/100 [01:59<00:00,  1.19s/it]



ðŸ“Š BENCHMARK RESULTS â€” wiki_qa (default) 
gpt-5.2 accuracy: 2.00%
gpt-5-mini accuracy: 4.00%

--- Benchmarking: trivia_qa (rc.nocontext) ---
Loading dataset: trivia_qa


README.md: 0.00B [00:00, ?B/s]

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

rc.nocontext/train-00000-of-00001.parque(â€¦):   0%|          | 0.00/55.4M [00:00<?, ?B/s]

rc.nocontext/validation-00000-of-00001.p(â€¦):   0%|          | 0.00/7.34M [00:00<?, ?B/s]

rc.nocontext/test-00000-of-00001.parquet:   0%|          | 0.00/1.20M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/138384 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/17944 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/17210 [00:00<?, ? examples/s]

Detected TEXT_COLUMN: 'question' and LABEL_COLUMN: 'answer'
Using 50 samples from 'validation' split

Running models asynchronously with 5 concurrent requests...


Evaluating trivia_qa models: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 100/100 [00:54<00:00,  1.83it/s]



ðŸ“Š BENCHMARK RESULTS â€” trivia_qa (rc.nocontext) 
gpt-5.2 accuracy: 80.00%
gpt-5-mini accuracy: 80.00%

--- Benchmarking: truthful_qa (generation) ---
Loading dataset: truthful_qa
Detected TEXT_COLUMN: 'question' and LABEL_COLUMN: 'best_answer'
Using 50 samples from 'validation' split

Running models asynchronously with 5 concurrent requests...


Evaluating truthful_qa models: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 100/100 [03:06<00:00,  1.87s/it]


ðŸ“Š BENCHMARK RESULTS â€” truthful_qa (generation) 
gpt-5.2 accuracy: 6.00%
gpt-5-mini accuracy: 10.00%


âœ¨ OVERALL BENCHMARK RESULTS SUMMARY âœ¨
| Dataset                  |   Samples | gpt-5.2   | gpt-5-mini   |
|:-------------------------|----------:|:----------|:-------------|
| openai/gsm8k (main)      |        50 | 88.00%    | 84.00%       |
| nq_open (nq_open)        |        50 | 62.00%    | 52.00%       |
| web_questions            |        50 | 74.00%    | 66.00%       |
| wiki_qa                  |        50 | 2.00%     | 4.00%        |
| trivia_qa (rc.nocontext) |        50 | 80.00%    | 80.00%       |
| truthful_qa (generation) |        50 | 6.00%     | 10.00%       |



