# ResearchBench Evaluation Analysis

This notebook demonstrates how to load and analyze Inspect AI evaluation logs.

In [2]:
import sys
import os
from pathlib import Path

import pandas as pd
import numpy as np

def find_repo_root(start, markers=("dataset", "experiments")):
    """Locate the project root by walking upward from ``start``.

    Args:
        start: Directory to begin the search from (str or Path).
        markers: Names of sub-directories that must all exist directly
            inside the root. The defaults are the two top-level packages
            this notebook imports (``dataset`` and ``experiments``).

    Returns:
        The nearest directory at or above ``start`` that contains every
        marker directory. If none is found, ``start.parent.parent`` is
        returned, which is the location the notebook originally assumed.
    """
    start = Path(start)
    for candidate in (start, *start.parents):
        if all((candidate / marker).is_dir() for marker in markers):
            return candidate
    # No marker match: keep the original "two levels up" assumption.
    return start.parent.parent


# Make the project packages importable regardless of which directory the
# kernel was started in (a fixed "../.." only works from one location).
src_path = find_repo_root(Path.cwd())
if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))

from dataset.dataloader import Dataloader
from experiments.utils.analysis import (
    load_eval_logs_to_dataframe,
    aggregate_runs,
    get_model_summary_stats,
    get_per_paper_stats
)

# Pandas display settings: no cap on columns or line width, at most 100 rows.
_display_options = {
    'display.max_columns': None,
    'display.max_rows': 100,
    'display.width': None,
}
for _option_name, _option_value in _display_options.items():
    pd.set_option(_option_name, _option_value)

# Show the kernel's working directory; relative paths below depend on it.
print(f"Working from: {Path.cwd()}")

ModuleNotFoundError: No module named 'dataset'

## 1. Configure Paths

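Update these paths to point to your evaluation log directories. Each path is looked up relative to the current working directory first and, if it is not found there, relative to the directory two levels up (`../../`). Paths that exist in neither location are skipped.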

In [None]:
# Candidate log directories; edit this list to match your own runs.
log_dirs = [
    "logs/model1-run-1",
    "logs/model1-run-2",
    "logs/model1-run-3",
    "logs/model2-run-1",
    "logs/model2-run-2",
    "logs/model2-run-3",
]

# Keep only directories that exist. A path found relative to the working
# directory is kept as written; otherwise it is tried two levels up and,
# if found there, stored as a resolved absolute path.
existing_dirs = []
for candidate in log_dirs:
    if Path(candidate).exists():
        existing_dirs.append(candidate)
        continue
    fallback = Path(f"../../{candidate}")
    if fallback.exists():
        existing_dirs.append(str(fallback.resolve()))

print(f"Found {len(existing_dirs)} log directories")
for found_dir in existing_dirs:
    print(f"  - {found_dir}")

## 2. Load Dataloader

Load papers and tasks for computing difficulty-weighted scores.

In [None]:
# Build the project Dataloader for "numeric" and "code" task types.
# NOTE(review): load_text=False presumably skips loading paper text - confirm
# against dataset.dataloader.
dataloader = Dataloader(load_text=False, task_types=["numeric", "code"])

# Report how many papers were loaded and the task count summed over them.
print(f"Loaded {len(dataloader.papers)} papers")
print(f"Total tasks: {sum(len(p.tasks) for p in dataloader.papers.values())}")

## 3. Load Evaluation Logs

Load all evaluation logs into a structured DataFrame.

In [None]:
# Load every discovered log directory into one DataFrame. When no directory
# was found, fall back to an empty frame so the later cells (which all check
# `df.empty`) still run.
if not existing_dirs:
    print("WARNING: No log directories found. Creating empty DataFrame.")
    df = pd.DataFrame()
else:
    df = load_eval_logs_to_dataframe(existing_dirs, dataloader)
    print(f"\nLoaded {len(df)} rows")
    # The index is read by level name below: 'model' and 'paper'.
    print(f"Models: {df.index.get_level_values('model').unique().tolist()}")
    print(f"Papers: {len(df.index.get_level_values('paper').unique())} unique papers")
    print("\nDataFrame structure:")
    # Rich display, consistent with the other cells in this notebook.
    display(df.head())

## 4. Model Summary Statistics

Overall performance across all papers and runs.

In [None]:
# Per-model summary, computed by the project helper get_model_summary_stats.
if df.empty:
    print("No data available")
else:
    model_stats = get_model_summary_stats(df)
    print("\n=== MODEL SUMMARY STATISTICS ===")
    # Rounded to 3 decimals for display only; model_stats keeps full precision.
    display(model_stats.round(3))

## 5. Per-Paper Statistics

Performance breakdown by paper across all models and runs.

In [None]:
# Per-paper summary, computed by the project helper get_per_paper_stats.
if df.empty:
    print("No data available")
else:
    paper_stats = get_per_paper_stats(df)
    print("\n=== PER-PAPER STATISTICS ===")
    # Rounded to 3 decimals for display only; paper_stats keeps full precision.
    display(paper_stats.round(3))

## 6. Accuracy by Model and Paper

Average accuracy for each model on each paper (aggregated across runs).

In [None]:
# Aggregate the "accuracy" metric with "mean" via the project helper
# aggregate_runs (per the heading below: mean across runs).
if df.empty:
    print("No data available")
else:
    accuracy_table = aggregate_runs(df, "accuracy", "mean")
    print("\n=== ACCURACY BY MODEL AND PAPER (Mean across runs) ===")
    # Rounding applies to the displayed copy only.
    display(accuracy_table.round(3))

## 7. Difficulty-Weighted Accuracy

Difficulty-weighted scores accounting for task complexity.

In [None]:
# Same aggregation as the plain accuracy table, applied to the
# "difficulty_weighted_accuracy" metric.
if df.empty:
    print("No data available")
else:
    difficulty_weighted_table = aggregate_runs(df, "difficulty_weighted_accuracy", "mean")
    print("\n=== DIFFICULTY-WEIGHTED ACCURACY BY MODEL AND PAPER ===")
    # Rounding applies to the displayed copy only.
    display(difficulty_weighted_table.round(3))

## 8. Best Run Performance

Best accuracy achieved by each model on each paper (max across runs).

In [None]:
# Aggregate the "accuracy" metric with "max" instead of "mean"
# (per the heading below: best value across runs).
if df.empty:
    print("No data available")
else:
    best_accuracy_table = aggregate_runs(df, "accuracy", "max")
    print("\n=== BEST ACCURACY BY MODEL AND PAPER (Max across runs) ===")
    # Rounding applies to the displayed copy only.
    display(best_accuracy_table.round(3))

## 9. Token Usage Statistics

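Average token usage (input, output, and reasoning tokens) and average runtime in minutes for each model, computed from the `_summary` rows.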

In [None]:
# Mean token counts and runtime per model, taken from the "_summary" rows.
if df.empty:
    print("No data available")
else:
    # Select rows whose "task" index level is the "_summary" marker.
    is_summary_row = df.index.get_level_values("task") == "_summary"
    summary_df = df[is_summary_row].copy()

    usage_columns = [
        "input_tokens",
        "output_tokens",
        "reasoning_tokens",
        "runtime_minutes",
    ]
    # Every listed column is averaged per model, then rounded to whole numbers.
    mean_per_column = dict.fromkeys(usage_columns, "mean")
    token_stats = summary_df.groupby("model").agg(mean_per_column).round(0)

    print("\n=== TOKEN USAGE BY MODEL ===")
    display(token_stats)

## 10. Task-Level Analysis

Performance on individual tasks across all models.

In [None]:
# Per-task score statistics, using every row except the "_summary" rows.
if df.empty:
    print("No data available")
else:
    is_task_row = df.index.get_level_values("task") != "_summary"
    task_df = df[is_task_row].copy()

    if task_df.empty:
        print("No task-level data available")
    else:
        # Per (paper, task): mean/std/count of the score, plus the first
        # recorded difficulty value in the group.
        task_aggregations = {
            "task_score": ["mean", "std", "count"],
            "task_difficulty": "first",
        }
        task_stats = task_df.groupby(["paper", "task"]).agg(task_aggregations).round(3)

        print("\n=== TASK-LEVEL STATISTICS (First 20 tasks) ===")
        display(task_stats.head(20))

## 11. Export Tables

Save tables to CSV files for further analysis.

In [None]:
# Write the summary tables built in the earlier sections to CSV files under
# ./table_outputs (relative to the working directory). The tables exist only
# if those earlier cells have been run with a non-empty `df`.
if df.empty:
    print("No data to export")
else:
    output_dir = Path("table_outputs")
    output_dir.mkdir(exist_ok=True)

    # One mapping drives both the writes and the printed file list, so the
    # two cannot drift apart.
    tables_to_export = {
        "model_summary.csv": model_stats,
        "paper_summary.csv": paper_stats,
        "accuracy_by_model_paper.csv": accuracy_table,
        "difficulty_weighted_accuracy.csv": difficulty_weighted_table,
    }
    for filename, table in tables_to_export.items():
        table.to_csv(output_dir / filename)

    print(f"\nTables exported to {output_dir}/")
    for filename in tables_to_export:
        print(f"  - {filename}")