# ReplicationBench (RB) Evaluation Analysis

This notebook demonstrates how to load and analyze Inspect AI evaluation logs.

In [15]:
import sys
import os
from pathlib import Path

import pandas as pd
import numpy as np

# Put <cwd>/src at the front of the import path (once) so the project
# packages imported below can be resolved.
src_path = Path.cwd().joinpath("src")
if not any(entry == str(src_path) for entry in sys.path):
    sys.path.insert(0, str(src_path))

from dataset.dataloader import Dataloader
from experiments.utils.analysis import (
    load_eval_logs_to_dataframe,
    aggregate_runs,
    get_model_summary_stats,
    get_per_paper_stats
)

# Notebook-wide pandas display settings: show every column, up to 100 rows,
# and let pandas pick the output width.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
    ('display.width', None),
):
    pd.set_option(option_name, option_value)

# Paths in this notebook are resolved against the current working directory.
print(f"Working from: {Path.cwd()}")

Working from: /Users/christineye/rb-release


## 1. Configure Paths

Update these paths to point to your evaluation logs and data directories.

In [16]:
import glob as glob_module

# All patterns are resolved relative to the current working directory.
base_dir = Path.cwd()

# Define patterns to search for (one per model family; `*` is the run suffix)
log_patterns = [
    "logs/gemini-25-base-*",
    "logs/gemini-3-base-*",
    "logs/claude-37-base-*",
    "logs/claude-4-base-*",
    "logs/o3-base-*",
    "logs/o4-mini-base-*",
    "logs/claude-45-base-*",
    "logs/gpt5-base-*",
]

# Find everything matching the patterns. glob returns matches in
# filesystem order, so sort within each pattern to make the listing
# deterministic while keeping the pattern order above.
log_dirs = []
for pattern in log_patterns:
    matching = sorted(glob_module.glob(str(base_dir / pattern)))
    log_dirs.extend(matching)

# Keep only real directories (a pattern can also match stray files such as
# archives), convert to absolute paths, and drop duplicates while
# preserving order.
existing_dirs = list(dict.fromkeys(
    str(Path(d).resolve()) for d in log_dirs if Path(d).is_dir()
))

print(f"Found {len(existing_dirs)} log directories")
for d in existing_dirs:
    print(f"  - {d}")

Found 24 log directories
  - /Users/christineye/rb-release/logs/gemini-25-base-1
  - /Users/christineye/rb-release/logs/gemini-25-base-2
  - /Users/christineye/rb-release/logs/gemini-25-base-3
  - /Users/christineye/rb-release/logs/gemini-3-base-1
  - /Users/christineye/rb-release/logs/gemini-3-base-3
  - /Users/christineye/rb-release/logs/gemini-3-base-2
  - /Users/christineye/rb-release/logs/claude-37-base-1
  - /Users/christineye/rb-release/logs/claude-37-base-2
  - /Users/christineye/rb-release/logs/claude-37-base-3
  - /Users/christineye/rb-release/logs/claude-4-base-1
  - /Users/christineye/rb-release/logs/claude-4-base-2
  - /Users/christineye/rb-release/logs/claude-4-base-3
  - /Users/christineye/rb-release/logs/o3-base-1
  - /Users/christineye/rb-release/logs/o3-base-2
  - /Users/christineye/rb-release/logs/o3-base-3
  - /Users/christineye/rb-release/logs/o4-mini-base-2
  - /Users/christineye/rb-release/logs/o4-mini-base-3
  - /Users/christineye/rb-release/logs/o4-mini-base-1


## 2. Load Dataloader

Load papers and tasks for computing difficulty-weighted scores.

In [17]:
# Build the project Dataloader restricted to numeric tasks from the
# "expert" source, without loading paper text.
dataloader = Dataloader(
    filters={"source": "expert"},
    load_text=False,
    task_types=["numeric"],
)

# Report how many papers were loaded and how many tasks they hold in total.
paper_count = len(dataloader.papers)
print(f"Loaded {paper_count} papers")
task_count = sum(len(paper.tasks) for paper in dataloader.papers.values())
print(f"Total tasks: {task_count}")

Loaded 20 papers
Total tasks: 111


## 3. Load Evaluation Logs

Load all evaluation logs into a structured DataFrame.

In [18]:
# Load every discovered log directory into a single DataFrame `df`.
# The frame is indexed by (at least) the levels "model", "paper" and "task",
# which the summary prints below and the later cells rely on.
if not existing_dirs:
    print("WARNING: No log directories found. Creating empty DataFrame.")
    df = pd.DataFrame()
else:
    df = load_eval_logs_to_dataframe(existing_dirs, dataloader)
    print(f"\nLoaded {len(df)} rows")
    if df.empty:
        # Nothing to summarise; the index-level lookups below would fail on
        # a frame without the expected index levels. Downstream cells
        # already check `df.empty`.
        print("WARNING: Log directories were found but no evaluation rows were loaded.")
    else:
        print(f"Models: {df.index.get_level_values('model').unique().tolist()}")
        print(f"Papers: {len(df.index.get_level_values('paper').unique())} unique papers")
        print("\nDataFrame structure:")
        # Rich display (as used by the other cells) instead of print(), which
        # wraps and truncates wide MultiIndex frames into hard-to-read text.
        display(df.head())

⚠️  Multiple evals found for Gemini 2.5 run 2 paper abacus: 2 files. Taking most recent: 2025-11-18T04-40-33-08-00_abacus_LP8ehp5zVBV7UxgcfNNwLg.eval


⚠️  Multiple evals found for Gemini 2.5 run 2 paper abacus: 2 files. Taking most recent: 2025-11-18T04-40-33-08-00_abacus_LP8ehp5zVBV7UxgcfNNwLg.eval


⚠️  Multiple evals found for Gemini 2.5 run 3 paper hubble_trails: 2 files. Taking most recent: 2025-11-18T11-30-02-08-00_hubble-trails_j5PVBSAQ3LFCCNKqVwoAU2.eval


⚠️  Multiple evals found for Gemini 2.5 run 3 paper hubble_trails: 2 files. Taking most recent: 2025-11-18T11-30-02-08-00_hubble-trails_j5PVBSAQ3LFCCNKqVwoAU2.eval


⚠️  Multiple evals found for Gemini 3 run 1 paper abacus: 2 files. Taking most recent: 2025-11-18T16-05-29-08-00_abacus_RjL2y6WdKdDZ7RLXAbupDC.eval
⚠️  Multiple evals found for Gemini 3 run 1 paper astm3: 2 files. Taking most recent: 2025-11-18T16-17-47-08-00_astm3_EHC7k5pwvWQgMCsbwFjQ7i.eval


⚠️  Multiple evals found for Gemini 3 run 1 paper abacus: 2 files. Taking most recent: 2025-11-18T16-05-29-08-00_abacus_RjL2y6WdKdDZ7RLXAbupDC.eval
⚠️  Multiple evals found for Gemini 3 run 1 paper astm3: 2 files. Taking most recent: 2025-11-18T16-17-47-08-00_astm3_EHC7k5pwvWQgMCsbwFjQ7i.eval


⚠️  Missing papers for o3 run 2: ['lensing_dr6_growth']


⚠️  Missing papers for o3 run 2: ['lensing_dr6_growth']


⚠️  Missing papers for o4-mini run 1: ['hubble_trails']


⚠️  Missing papers for o4-mini run 1: ['hubble_trails']


⚠️  Missing papers for Sonnet 4.5 run 3: ['mars_clouds']


⚠️  Missing papers for Sonnet 4.5 run 3: ['mars_clouds']

Loaded 3144 rows
Models: ['GPT-5', 'Gemini 2.5', 'Gemini 3', 'Sonnet 3.7', 'Sonnet 4', 'Sonnet 4.5', 'o3', 'o4-mini']
Papers: 20 unique papers

DataFrame structure:
                                                                         accuracy  \
model run paper         task                                                        
GPT-5 1   MUSE_outflows _summary                                              0.0   
                        dust_reddening                                        NaN   
                        electron_density                                      NaN   
                        narrow_and_broad_line_decomposition_for_J080427       NaN   
                        outflow_energetics                                    NaN   

                                                                         difficulty_weighted_accuracy  \
model run paper         task                                                

## 4. Model Summary Statistics

Overall performance across all papers and runs.

In [14]:
# Build the per-model summary table and show it rounded to three decimals;
# with no loaded data, just say so.
if df.empty:
    print("No data available")
else:
    model_stats = get_model_summary_stats(df)
    print("\n=== MODEL SUMMARY STATISTICS ===")
    display(model_stats.round(3))


=== MODEL SUMMARY STATISTICS ===


Unnamed: 0_level_0,Avg Accuracy,Std Accuracy,Best Accuracy,Avg Difficulty-Weighted,Best Difficulty-Weighted,Avg Response Rate,Avg Output Tokens,Avg Reasoning Tokens,Avg Runtime (min),Avg LLM Time (min),Avg Tool Time (min)
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
GPT-5,0.19,0.029,0.273,0.158,0.238,0.873,25178.25,14942.933,73.649,10.256,54.93
Gemini 2.5,0.136,0.027,0.254,0.103,0.214,0.647,27034.733,6162.917,115.188,7.846,93.733
Gemini 3,0.124,0.025,0.231,0.099,0.189,0.476,9121.267,10237.25,254.648,4.456,23.364
Sonnet 3.7,0.176,0.028,0.278,0.156,0.253,0.942,33500.5,0.0,99.968,8.408,86.064
Sonnet 4,0.173,0.026,0.27,0.15,0.254,0.926,25912.767,0.0,76.466,6.94,65.199
Sonnet 4.5,0.18,0.025,0.267,0.149,0.239,0.811,28499.817,0.0,102.965,9.063,79.909
o3,0.142,0.023,0.235,0.124,0.207,0.719,17823.467,10742.4,34.415,4.109,27.036
o4-mini,0.17,0.031,0.238,0.146,0.21,0.642,11895.0,9088.0,9.985,2.655,6.211


## 5. Per-Paper Statistics

Performance breakdown by paper across all models and runs.

In [None]:
# Build the per-paper summary table and show it rounded to three decimals;
# with no loaded data, just say so.
if df.empty:
    print("No data available")
else:
    paper_stats = get_per_paper_stats(df)
    print("\n=== PER-PAPER STATISTICS ===")
    display(paper_stats.round(3))

## 6. Accuracy by Model and Paper

Average accuracy for each model on each paper (aggregated across runs).

In [17]:
# Aggregate the "accuracy" column across runs with the mean and show the
# resulting table rounded to three decimals.
if df.empty:
    print("No data available")
else:
    accuracy_table = aggregate_runs(df, "accuracy", "mean")
    print("\n=== ACCURACY BY MODEL AND PAPER (Mean across runs) ===")
    display(accuracy_table.round(3))


=== ACCURACY BY MODEL AND PAPER (Mean across runs) ===


paper,MUSE_outflows,abacus,astm3,bayes_cal,disk_ridges,eht_resolve,galaxy_manifold,galaxy_soptics,gw_cosmo,gw_nsbh,hubble_trails,lensing_dr6_growth,ls_cal,mars_clouds,phangs_PAHs,tng_hod,trgb_std_candle,ver_waves
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
GPT-5,0.133,0.0,0.0,0.389,0.0,0.0,0.1,0.125,0.5,0.074,0.571,0.0,0.533,0.0,0.0,0.167,0.0,0.417
Gemini 2.5,0.0,0.0,0.0,0.222,0.0,0.083,0.067,0.167,0.083,0.185,0.19,0.0,0.467,0.0,0.067,0.125,0.0,0.333
Sonnet 3.7,0.0,0.0,0.0,0.556,0.2,0.083,0.1,0.25,0.083,0.222,0.619,0.167,0.4,0.333,0.0,0.125,0.0,0.333
Sonnet 4,0.2,0.0,0.095,0.556,0.133,0.083,0.0,0.125,0.083,0.148,0.524,0.0,0.267,0.0,0.067,0.208,0.0,0.25
Sonnet 4.5,0.067,0.0,0.095,0.611,0.2,0.167,0.0,0.188,0.0,0.167,0.667,0.0,0.467,0.5,0.067,0.292,0.0,0.25
o3,0.133,0.0,0.0,0.278,0.0,0.0,0.133,0.208,0.0,0.111,0.524,0.0,0.333,0.5,0.067,0.167,0.0,0.25
o4-mini,0.067,0.0,0.0,0.0,0.067,0.0,0.033,0.167,0.167,0.111,0.333,0.0,0.467,0.5,0.0,0.167,0.0,0.25


## 7. Difficulty-Weighted Accuracy

Difficulty-weighted scores accounting for task complexity.

In [18]:
# Aggregate the "difficulty_weighted_accuracy" column across runs with the
# mean and show the resulting table rounded to three decimals.
if df.empty:
    print("No data available")
else:
    difficulty_weighted_table = aggregate_runs(df, "difficulty_weighted_accuracy", "mean")
    print("\n=== DIFFICULTY-WEIGHTED ACCURACY BY MODEL AND PAPER ===")
    display(difficulty_weighted_table.round(3))


=== DIFFICULTY-WEIGHTED ACCURACY BY MODEL AND PAPER ===


paper,MUSE_outflows,abacus,astm3,bayes_cal,disk_ridges,eht_resolve,galaxy_manifold,galaxy_soptics,gw_cosmo,gw_nsbh,hubble_trails,lensing_dr6_growth,ls_cal,mars_clouds,phangs_PAHs,tng_hod,trgb_std_candle,ver_waves
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
GPT-5,0.121,0.0,0.0,0.222,0.0,0.0,0.094,0.038,0.435,0.02,0.536,0.0,0.432,0.0,0.0,0.16,0.0,0.231
Gemini 2.5,0.0,0.0,0.0,0.194,0.0,0.079,0.062,0.077,0.058,0.127,0.167,0.0,0.42,0.0,0.03,0.12,0.0,0.154
Sonnet 3.7,0.0,0.0,0.0,0.444,0.25,0.079,0.125,0.115,0.101,0.147,0.619,0.212,0.346,0.333,0.0,0.12,0.0,0.205
Sonnet 4,0.182,0.0,0.088,0.444,0.167,0.079,0.0,0.038,0.101,0.088,0.488,0.0,0.272,0.0,0.03,0.227,0.0,0.077
Sonnet 4.5,0.061,0.0,0.114,0.481,0.25,0.159,0.0,0.077,0.0,0.074,0.643,0.0,0.42,0.556,0.03,0.227,0.0,0.077
o3,0.121,0.0,0.0,0.167,0.0,0.0,0.135,0.103,0.0,0.029,0.488,0.0,0.284,0.667,0.03,0.16,0.0,0.128
o4-mini,0.061,0.0,0.0,0.0,0.083,0.0,0.031,0.09,0.145,0.029,0.31,0.0,0.42,0.667,0.0,0.16,0.0,0.128


## 8. Best Run Performance

Best accuracy achieved by each model on each paper (max across runs).

In [19]:
# Aggregate the "accuracy" column across runs with the max (best run) and
# show the resulting table rounded to three decimals.
if df.empty:
    print("No data available")
else:
    best_accuracy_table = aggregate_runs(df, "accuracy", "max")
    print("\n=== BEST ACCURACY BY MODEL AND PAPER (Max across runs) ===")
    display(best_accuracy_table.round(3))


=== BEST ACCURACY BY MODEL AND PAPER (Max across runs) ===


paper,MUSE_outflows,abacus,astm3,bayes_cal,disk_ridges,eht_resolve,galaxy_manifold,galaxy_soptics,gw_cosmo,gw_nsbh,hubble_trails,lensing_dr6_growth,ls_cal,mars_clouds,phangs_PAHs,tng_hod,trgb_std_candle,ver_waves
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
GPT-5,0.2,0.0,0.0,0.5,0.0,0.0,0.1,0.125,0.5,0.111,0.571,0.0,0.6,0.0,0.0,0.25,0.0,0.5
Gemini 2.5,0.0,0.0,0.0,0.5,0.0,0.25,0.2,0.25,0.25,0.333,0.429,0.0,0.6,0.0,0.2,0.375,0.0,0.5
Sonnet 3.7,0.0,0.0,0.0,0.667,0.2,0.25,0.2,0.25,0.25,0.222,0.714,0.5,0.4,1.0,0.0,0.25,0.0,0.5
Sonnet 4,0.2,0.0,0.143,0.667,0.2,0.25,0.0,0.125,0.25,0.222,0.571,0.0,0.4,0.0,0.2,0.375,0.0,0.25
Sonnet 4.5,0.2,0.0,0.143,0.833,0.2,0.25,0.0,0.25,0.0,0.222,0.714,0.0,0.6,1.0,0.2,0.375,0.0,0.25
o3,0.2,0.0,0.0,0.5,0.0,0.0,0.2,0.25,0.0,0.111,0.714,0.0,0.6,0.5,0.2,0.25,0.0,0.25
o4-mini,0.2,0.0,0.0,0.0,0.2,0.0,0.1,0.25,0.5,0.111,0.429,0.0,0.6,0.5,0.0,0.25,0.0,0.5


## 9. Token Usage and Timing Statistics

Token usage and timing breakdown by model.

In [None]:
# Per-model medians of token counts and timings, taken over the rows whose
# "task" index level is "_summary", then rounded to two significant figures.
if df.empty:
    print("No data available")
else:
    task_level = df.index.get_level_values("task")
    summary_df = df[task_level == "_summary"].copy()

    median_columns = [
        "input_tokens",
        "output_tokens",
        "reasoning_tokens",
        "llm_time_minutes",
        "tool_time_minutes",
    ]
    token_stats = summary_df.groupby("model").agg(dict.fromkeys(median_columns, "median"))

    def round_to_2sf(value):
        """Round a number to 2 significant figures; pass missing values through."""
        if pd.notna(value):
            return float(f'{value:.2g}')
        return value

    for column_name in token_stats.columns:
        token_stats[column_name] = token_stats[column_name].apply(round_to_2sf)

    print("\n=== TOKEN USAGE AND TIMING BY MODEL ===")
    display(token_stats)

## 10. Task-Level Analysis

Performance on individual tasks across all models.

In [None]:
# Per-(paper, task) statistics over every row that is not a "_summary" row:
# mean/std/count of task_score plus the first recorded task_difficulty.
if df.empty:
    print("No data available")
else:
    is_task_row = df.index.get_level_values("task") != "_summary"
    task_df = df[is_task_row].copy()

    if task_df.empty:
        print("No task-level data available")
    else:
        grouped_tasks = task_df.groupby(["paper", "task"])
        task_stats = grouped_tasks.agg({
            "task_score": ["mean", "std", "count"],
            "task_difficulty": "first"
        })
        task_stats = task_stats.round(3)

        print("\n=== TASK-LEVEL STATISTICS (First 20 tasks) ===")
        display(task_stats.head(20))

## 11. Export Tables

Save tables to CSV files for further analysis. This cell writes the tables computed in Sections 4–7, so run those cells first.

In [None]:
# Write the four summary tables built in earlier cells to CSV files under
# ./table_outputs (created if missing), then list what was written.
if df.empty:
    print("No data to export")
else:
    output_dir = Path("table_outputs")
    output_dir.mkdir(exist_ok=True)

    model_stats.to_csv(output_dir.joinpath("model_summary.csv"))
    paper_stats.to_csv(output_dir.joinpath("paper_summary.csv"))
    accuracy_table.to_csv(output_dir.joinpath("accuracy_by_model_paper.csv"))
    difficulty_weighted_table.to_csv(output_dir.joinpath("difficulty_weighted_accuracy.csv"))

    print("\n".join([
        f"\nTables exported to {output_dir}/",
        "  - model_summary.csv",
        "  - paper_summary.csv",
        "  - accuracy_by_model_paper.csv",
        "  - difficulty_weighted_accuracy.csv",
    ]))

## 12. LaTeX Tables for Paper

Generate publication-ready LaTeX tables.

In [None]:
def _bootstrap_std_of_mean(samples, rng, n_bootstrap):
    """Bootstrap estimate of the standard deviation of the sample mean.

    Args:
        samples: 1-D array-like of numeric values. NaN entries are dropped
            first, matching the NaN-skipping pandas means reported alongside.
        rng: a ``np.random.RandomState`` used for resampling; its state is
            advanced by ``n_bootstrap`` draws.
        n_bootstrap: number of resamples (each the same size as the input,
            drawn with replacement).

    Returns:
        Standard deviation of the resampled means, or NaN when there are no
        non-NaN samples to resample.
    """
    values = np.asarray(samples, dtype=float)
    values = values[~np.isnan(values)]
    if values.size == 0:
        return float("nan")
    resampled_means = [
        rng.choice(values, size=values.size, replace=True).mean()
        for _ in range(n_bootstrap)
    ]
    return np.std(resampled_means)


def _format_2sf(value):
    """Render ``value`` rounded to 2 significant figures in plain notation.

    Positional formatting avoids exponent output such as ``1.2e+05`` for
    large medians, which is not wanted inside a LaTeX table cell.
    """
    rounded = float(f"{value:.2g}")
    return np.format_float_positional(rounded, trim="-")


# Print two LaTeX tabulars (inside one table environment) summarising each
# model: accuracy metrics, then response rate and resource usage.
if not df.empty:
    summary_df = df[df.index.get_level_values("task") == "_summary"].copy()

    n_bootstrap = 10000

    # Prepare data for LaTeX tables
    latex_data = []

    for model in sorted(summary_df.index.get_level_values("model").unique()):
        model_data = summary_df.xs(model, level="model")

        # Reseeded for every model so each model's bootstrap is reproducible
        # regardless of which other models are present.
        rng = np.random.RandomState(42)

        # Accuracy metrics (best per paper, then average)
        best_per_paper = model_data.groupby('paper')['accuracy'].max()
        avg_per_paper = model_data.groupby('paper')['accuracy'].mean()
        best_accuracy = best_per_paper.mean()
        avg_accuracy = avg_per_paper.mean()

        # Bootstrap std over all paper-run combinations
        std_accuracy = _bootstrap_std_of_mean(model_data['accuracy'].values, rng, n_bootstrap)

        # Difficulty-weighted
        avg_dw_per_paper = model_data.groupby('paper')['difficulty_weighted_accuracy'].mean()
        avg_dw = avg_dw_per_paper.mean()

        # Response rate with bootstrap std (continues the same rng stream)
        avg_response_rate = model_data['response_rate'].mean()
        std_response_rate = _bootstrap_std_of_mean(model_data['response_rate'].values, rng, n_bootstrap)

        # Accuracy over completed tasks
        # This is total correct / total answered
        total_correct = (model_data['accuracy'] * model_data['total_tasks']).sum()
        total_answered = model_data['answered_tasks'].sum()
        acc_over_completed = total_correct / total_answered if total_answered > 0 else 0

        # Median tokens and timing
        median_output_tokens = model_data['output_tokens'].median()
        median_llm_time = model_data['llm_time_minutes'].median()
        median_tool_time = model_data['tool_time_minutes'].median()

        latex_data.append({
            'Model': model,
            'Avg Accuracy': avg_accuracy,
            'Std Accuracy': std_accuracy,
            'Best Accuracy': best_accuracy,
            'Avg Difficulty-Weighted': avg_dw,
            'Avg Response Rate': avg_response_rate,
            'Std Response Rate': std_response_rate,
            'Accuracy over Completed': acc_over_completed,
            'Median Output Tokens': median_output_tokens,
            'Median LLM Time': median_llm_time,
            'Median Tool Time': median_tool_time
        })

    latex_df = pd.DataFrame(latex_data)

    # Generate Table 1: Accuracy metrics with best-of-N
    print("\\begin{table}")
    print("\\centering")
    print("\\begin{tabular}{lcccc}")
    print("\\toprule")
    print("\\textbf{Model} & \\textbf{Unweighted score} & (standard dev.) & \\textbf{Best-of-N} & \\textbf{Difficulty-weighted score} \\\\")
    print("\\midrule")

    for _, row in latex_df.iterrows():
        print(f"{row['Model']} & {row['Avg Accuracy']:.3f} & {row['Std Accuracy']:.3f} & {row['Best Accuracy']:.3f} & {row['Avg Difficulty-Weighted']:.3f} \\\\")

    print("\\bottomrule")
    print("\\end{tabular}")
    print()
    print("\\vspace{1em}")
    print()

    # Generate Table 2: Response rate and resource usage
    print("\\begin{tabular}{lcccccc}")
    print("\\toprule")
    print("\\textbf{Model} & \\textbf{Tasks completed} & (standard dev.) & \\textbf{Accuracy over completed tasks} & \\textbf{Median tokens} & \\textbf{LLM time (min)} & \\textbf{Tool time (min)} \\\\")
    print("\\midrule")

    for _, row in latex_df.iterrows():
        # Token and timing medians are reported to 2 significant figures.
        tokens_2sf = _format_2sf(row["Median Output Tokens"])
        llm_time_2sf = _format_2sf(row["Median LLM Time"])
        tool_time_2sf = _format_2sf(row["Median Tool Time"])
        print(f"{row['Model']} & {row['Avg Response Rate']:.3f} & {row['Std Response Rate']:.3f} & {row['Accuracy over Completed']:.3f} & {tokens_2sf} & {llm_time_2sf} & {tool_time_2sf} \\\\")

    print("\\bottomrule")
    print("\\end{tabular}")
    print("\\caption{Average-of-N scores on ReplicationBench with simple agent scaffold.}")
    print("\\label{tab:model_performance}")
    print("\\end{table}")
else:
    print("No data available")