# ResearchBench Evaluation Analysis

This notebook demonstrates how to load and analyze Inspect AI evaluation logs.

In [4]:
import sys
import os
from pathlib import Path

import pandas as pd
import numpy as np

# Put <working directory>/src at the front of the import path so the
# project-local packages imported below can be found. This relies on the
# notebook being run from the directory that contains src/.
src_path = Path.cwd().joinpath("src")
_src_entry = str(src_path)
if _src_entry not in sys.path:
    sys.path.insert(0, _src_entry)

from dataset.dataloader import Dataloader
from experiments.utils.analysis import (
    load_eval_logs_to_dataframe,
    aggregate_runs,
    get_model_summary_stats,
    get_per_paper_stats
)

# Notebook-wide pandas display settings: no cap on the number of columns,
# at most 100 rows, and no fixed display width.
_display_options = {
    "display.max_columns": None,
    "display.max_rows": 100,
    "display.width": None,
}
for _option_name, _option_value in _display_options.items():
    pd.set_option(_option_name, _option_value)

# Show the working directory, since the src/ path above is resolved against it.
print(f"Working from: {Path.cwd()}")

Working from: /Users/christineye/rb-release


## 1. Configure Paths

Update `base_dir` and `log_dirs` below to point to your evaluation log directories.

In [12]:
# Root used to locate log directories that are not found relative to the
# current working directory.
base_dir = Path.cwd()
log_dirs = [
    # Alternative "-syw" run set, currently disabled; uncomment to include it.
    # "logs/gemini-25-syw",
    # "logs/claude-37-syw",
    # "logs/claude-4-syw",
    # "logs/o3-syw",
    # "logs/o4-mini-syw",
    "logs/gemini-25-unmask",
    "logs/claude-37-unmask",
    "logs/claude-4-unmask",
    "logs/o3-unmask",
    "logs/o4-mini-unmask",
]

# Keep every configured directory that exists. A path that exists as written
# is kept verbatim; otherwise it is looked up under base_dir and stored as a
# resolved absolute path. Anything found in neither place is recorded in
# missing_dirs so that a typo does not silently shrink the set of models.
existing_dirs = []
missing_dirs = []
for log_dir in log_dirs:
    if Path(log_dir).exists():
        existing_dirs.append(log_dir)
        continue
    fallback_dir = base_dir / log_dir
    if fallback_dir.exists():
        existing_dirs.append(str(fallback_dir.resolve()))
    else:
        missing_dirs.append(log_dir)

print(f"Found {len(existing_dirs)} log directories")
for d in existing_dirs:
    print(f"  - {d}")

if missing_dirs:
    print(f"Skipped {len(missing_dirs)} missing log directories")
    for d in missing_dirs:
        print(f"  - {d}")

Found 5 log directories
  - logs/gemini-25-unmask
  - logs/claude-37-unmask
  - logs/claude-4-unmask
  - logs/o3-unmask
  - logs/o4-mini-unmask


## 2. Load Dataloader

Load papers and tasks for computing difficulty-weighted scores.

In [16]:
# Build the dataloader with task_types limited to "numeric", text loading
# switched off, and a source == "expert" filter. How these arguments are
# interpreted is defined by Dataloader itself.
dataloader = Dataloader(task_types=["numeric"], load_text=False, filters={"source": "expert"})

# Report how many papers were loaded and how many tasks they hold in total.
paper_count = len(dataloader.papers)
task_count = sum(len(paper.tasks) for paper in dataloader.papers.values())
print(f"Loaded {paper_count} papers")
print(f"Total tasks: {task_count}")

Loaded 19 papers
Total tasks: 107




## 3. Load Evaluation Logs

Load all evaluation logs into a structured DataFrame.

In [17]:
# Load every discovered log directory into one DataFrame. When no directory
# was found, fall back to an empty frame so the later cells, which all check
# df.empty, can still run.
if existing_dirs:
    df = load_eval_logs_to_dataframe(existing_dirs, dataloader)
    print(f"\nLoaded {len(df)} rows")

    # "model" and "paper" are levels of the frame's index.
    model_names = df.index.get_level_values("model").unique().tolist()
    print(f"Models: {model_names}")
    paper_names = df.index.get_level_values("paper").unique()
    print(f"Papers: {len(paper_names)} unique papers")

    print("\nDataFrame structure:")
    print(df.head())
else:
    print("WARNING: No log directories found. Creating empty DataFrame.")
    df = pd.DataFrame()


Loaded 630 rows
Models: ['Gemini 2.5', 'claude-37-unmask', 'claude-4-unmask', 'o3', 'o4-mini']
Papers: 19 unique papers

DataFrame structure:
                                                                              accuracy  \
model      run paper         task                                                        
Gemini 2.5 1   MUSE_outflows _summary                                              0.0   
                             dust_reddening                                        NaN   
                             electron_density                                      NaN   
                             narrow_and_broad_line_decomposition_for_J080427       NaN   
                             outflow_energetics                                    NaN   

                                                                              difficulty_weighted_accuracy  \
model      run paper         task                                                                            
Gemini

## 4. Model Summary Statistics

Overall performance across all papers and runs.

In [18]:
# Per-model summary table, displayed to three decimals. Nothing is computed
# when no logs were loaded.
if df.empty:
    print("No data available")
else:
    model_stats = get_model_summary_stats(df)
    print("\n=== MODEL SUMMARY STATISTICS ===")
    display(model_stats.round(3))


=== MODEL SUMMARY STATISTICS ===


Unnamed: 0_level_0,Avg Accuracy,Std Accuracy,Best Accuracy,Avg Difficulty-Weighted,Best Difficulty-Weighted,Avg Response Rate,Avg Output Tokens,Avg Reasoning Tokens,Avg Runtime (min)
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Gemini 2.5,0.062,,0.062,0.04,0.333,0.43,22270.368,2575.263,172.098
claude-37-unmask,0.277,,0.277,0.252,0.833,0.789,71882.842,0.0,184.481
claude-4-unmask,0.169,,0.169,0.142,0.585,0.916,36097.789,0.0,108.757
o3,0.091,,0.091,0.079,0.667,0.611,16758.053,11331.368,106.795
o4-mini,0.111,,0.111,0.111,1.0,0.415,6614.211,5318.737,46.503


## 5. Per-Paper Statistics

Performance breakdown by paper across all models and runs.

In [None]:
# Per-paper summary table, displayed to three decimals. Nothing is computed
# when no logs were loaded.
if df.empty:
    print("No data available")
else:
    paper_stats = get_per_paper_stats(df)
    print("\n=== PER-PAPER STATISTICS ===")
    display(paper_stats.round(3))

## 6. Accuracy by Model and Paper

Average accuracy for each model on each paper (aggregated across runs).

In [None]:
# Aggregate the "accuracy" column with "mean" and display the resulting
# table to three decimals.
if df.empty:
    print("No data available")
else:
    accuracy_table = aggregate_runs(df, "accuracy", "mean")
    print("\n=== ACCURACY BY MODEL AND PAPER (Mean across runs) ===")
    display(accuracy_table.round(3))

## 7. Difficulty-Weighted Accuracy

Difficulty-weighted scores accounting for task complexity.

In [None]:
# Aggregate the "difficulty_weighted_accuracy" column with "mean" and
# display the resulting table to three decimals.
if df.empty:
    print("No data available")
else:
    difficulty_weighted_table = aggregate_runs(df, "difficulty_weighted_accuracy", "mean")
    print("\n=== DIFFICULTY-WEIGHTED ACCURACY BY MODEL AND PAPER ===")
    display(difficulty_weighted_table.round(3))

## 8. Best Run Performance

Best accuracy achieved by each model on each paper (max across runs).

In [None]:
# Aggregate the "accuracy" column with "max" (best run) and display the
# resulting table to three decimals.
if df.empty:
    print("No data available")
else:
    best_accuracy_table = aggregate_runs(df, "accuracy", "max")
    print("\n=== BEST ACCURACY BY MODEL AND PAPER (Max across runs) ===")
    display(best_accuracy_table.round(3))

## 9. Token Usage Statistics

Average token usage and runtime by model.

In [None]:
# Mean token counts and runtime per model, computed only from the rows whose
# "task" index level is "_summary" and rounded to whole numbers.
if df.empty:
    print("No data available")
else:
    is_summary_row = df.index.get_level_values("task") == "_summary"
    summary_df = df[is_summary_row].copy()

    usage_columns = ["input_tokens", "output_tokens", "reasoning_tokens", "runtime_minutes"]
    mean_of_each = dict.fromkeys(usage_columns, "mean")
    token_stats = summary_df.groupby("model").agg(mean_of_each).round(0)

    print("\n=== TOKEN USAGE BY MODEL ===")
    display(token_stats)

## 10. Task-Level Analysis

Performance on individual tasks across all models.

In [None]:
# Per-task statistics, computed from every row except the "_summary" rows.
if df.empty:
    print("No data available")
else:
    is_task_row = df.index.get_level_values("task") != "_summary"
    task_df = df[is_task_row].copy()

    if task_df.empty:
        print("No task-level data available")
    else:
        # Mean, standard deviation and count of task_score per (paper, task),
        # plus the first task_difficulty value seen in each group.
        task_aggregations = {
            "task_score": ["mean", "std", "count"],
            "task_difficulty": "first",
        }
        task_stats = task_df.groupby(["paper", "task"]).agg(task_aggregations).round(3)

        print("\n=== TASK-LEVEL STATISTICS (First 20 tasks) ===")
        display(task_stats.head(20))

## 11. Export Tables

Save tables to CSV files for further analysis.

In [None]:
# Write the summary tables built in sections 4-7 to CSV files under
# table_outputs/ (created in the working directory if it does not exist).
if df.empty:
    print("No data to export")
else:
    # These tables are created by earlier cells. Check for all of them before
    # writing anything, so a skipped cell produces one clear error instead of
    # a partial export followed by a bare NameError.
    required_tables = (
        "model_stats",
        "paper_stats",
        "accuracy_table",
        "difficulty_weighted_table",
    )
    missing_tables = [name for name in required_tables if name not in globals()]
    if missing_tables:
        raise NameError(
            "Run the earlier analysis cells first; undefined tables: "
            + ", ".join(missing_tables)
        )

    output_dir = Path("table_outputs")
    output_dir.mkdir(exist_ok=True)

    # One mapping of file name -> table drives both the writes and the printed
    # listing, so the two cannot drift apart.
    # NOTE(review): best_accuracy_table (section 8) is not exported; confirm
    # that this is intentional.
    exports = {
        "model_summary.csv": model_stats,
        "paper_summary.csv": paper_stats,
        "accuracy_by_model_paper.csv": accuracy_table,
        "difficulty_weighted_accuracy.csv": difficulty_weighted_table,
    }
    for file_name, table in exports.items():
        table.to_csv(output_dir / file_name)

    print(f"\nTables exported to {output_dir}/")
    for file_name in exports:
        print(f"  - {file_name}")