# Sandbagging Detection Experiment

This notebook evaluates two models on GPQA diamond:
1. `sandbagging-games/cedar` - potentially sandbagging model
2. `meta-llama/Llama-3.1-8B-Instruct` - baseline model

## Experiment Flow
1. Load both models and GPQA dataset
2. Baseline evaluation: accuracy for both models
3. Find divergence samples (Cedar wrong, Llama right)
4. Build a steering vector from a divergence sample
5. Evaluate with steering vector applied

In [None]:
import torch
from dotenv import load_dotenv

from src.utils import load_model, apply_chat_template
from src.sandbagging import (
    load_gpqa_dataset,
    evaluate_accuracy,
    evaluate_with_steering,
    find_divergence_samples,
    GPQA_SYSTEM_PROMPT,
)
from src.sandbagging.evaluation import (
    print_evaluation_summary,
    compare_evaluations,
    evaluate_with_hooks,
)
from src.sandbagging.analysis import print_divergence_summary
from steering_vectors import (
    HuggingFaceBackend,
    VectorSteering,
    TrainingDatapoint,
)

# Load variables from a .env file (python-dotenv) before any later cell needs them.
load_dotenv()

# Echo the system prompt so the exact evaluation setup is visible in the output.
print("System prompt:", GPQA_SYSTEM_PROMPT, sep="\n")

## Configuration

In [None]:
# Experiment configuration: the values a reader is expected to tune live here.
N_SAMPLES = 50  # Number of samples to evaluate (None = all)
SEED = 42       # Random seed for reproducibility

# Model names
CEDAR_MODEL = "sandbagging-games/cedar"
LLAMA_MODEL = "meta-llama/Llama-3.1-8B-Instruct"

# Device and dtype
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Use half precision only on GPU. On CPU many float16 kernels are missing or
# very slow, so fall back to float32 when no CUDA device is available.
DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32

## Load Models

In [None]:
# Cedar is the model suspected of sandbagging; load its weights and tokenizer.
cedar_model, cedar_tokenizer = load_model(CEDAR_MODEL, dtype=DTYPE, device_map="auto")

# Wrap model + tokenizer in the steering backend used by the evaluation helpers.
cedar_backend = HuggingFaceBackend(cedar_model, cedar_tokenizer, DEVICE)
print(f"Loaded Cedar model: {CEDAR_MODEL}")

In [None]:
# Llama serves as the baseline; load its weights and tokenizer.
llama_model, llama_tokenizer = load_model(LLAMA_MODEL, dtype=DTYPE, device_map="auto")

# Wrap model + tokenizer in the steering backend used by the evaluation helpers.
llama_backend = HuggingFaceBackend(llama_model, llama_tokenizer, DEVICE)
print(f"Loaded Llama model: {LLAMA_MODEL}")

## Load GPQA Dataset

In [None]:
# Draw the evaluation set from the GPQA diamond split, using the sample count
# and seed from the configuration cell.
samples = load_gpqa_dataset(
    shuffle_choices=True,
    seed=SEED,
    n_samples=N_SAMPLES,
    split="gpqa_diamond",
)
print(f"Loaded {len(samples)} samples from GPQA diamond")

# Show the first sample's prompt and its correct answer as a format check.
print(f"\n{'='*60}")
print("Sample prompt format:")
print(f"{'='*60}")
print(samples[0].prompt)
print(f"\nCorrect answer: {samples[0].correct_answer}")

## Evaluation 1: Baseline Accuracy

In [None]:
# Baseline accuracy for Cedar on the loaded samples, with no steering applied.
# do_sample=False presumably selects deterministic decoding - confirm in
# evaluate_accuracy.
cedar_results = evaluate_accuracy(
    samples=samples,
    backend=cedar_backend,
    tokenizer=cedar_tokenizer,
    verbose=True,
    do_sample=False,
)
print_evaluation_summary(cedar_results, "Cedar Model")

In [None]:
# Baseline accuracy for Llama on the same samples, with no steering applied.
# do_sample=False presumably selects deterministic decoding - confirm in
# evaluate_accuracy.
llama_results = evaluate_accuracy(
    samples=samples,
    backend=llama_backend,
    tokenizer=llama_tokenizer,
    verbose=True,
    do_sample=False,
)
print_evaluation_summary(llama_results, "Llama Model")

In [None]:
# Side-by-side comparison of the two baseline runs (Cedar first, Llama second).
compare_evaluations(
    cedar_results,
    llama_results,
    "Cedar",
    "Llama",
)

## Find Divergence Samples

Find samples where Cedar got it wrong but Llama got it right.
These are candidates for building a steering vector.

In [None]:
# Collect the samples on which Cedar (model A) answered incorrectly while
# Llama (model B) answered correctly.
divergences = find_divergence_samples(
    samples=samples,
    a_wrong_b_right=True,
    result_a=cedar_results,
    result_b=llama_results,
)

# Print an overview, asking for up to 10 individual samples.
print_divergence_summary(divergences, "Cedar", "Llama", show_samples=10)

In [None]:
# Select a divergence sample to build the steering vector from.
#
# Both `selected_divergence` and `selected_sample` are always bound by this
# cell. They are set to None when no divergence exists, so later cells can
# test for None instead of raising NameError or reusing a stale value from a
# previous run.
if divergences:
    selected_divergence = divergences[0]  # Pick first one (or choose another)
    selected_sample = selected_divergence.sample

    print(f"Selected sample index: {selected_sample.index}")
    print(f"Question: {selected_sample.question[:300]}...")
    print("\nChoices:")
    # Choices are labelled A-D in order; the label is compared against
    # `correct_answer` to mark the right option.
    for label, choice in zip(['A', 'B', 'C', 'D'], selected_sample.choices):
        marker = " <-- CORRECT" if label == selected_sample.correct_answer else ""
        print(f"  {label}: {choice[:100]}...{marker}")
    print(f"\nCedar answered: {selected_divergence.model_a_answer}")
    print(f"Llama answered: {selected_divergence.model_b_answer}")
    print(f"Correct answer: {selected_divergence.correct_answer}")
else:
    selected_divergence = None
    selected_sample = None
    print("No divergence samples found. Try increasing N_SAMPLES.")

## Build Steering Vector

**Fill in the code below to build your steering vector.**

You have access to:
- `selected_sample`: GPQASample with question, choices, correct_answer, prompt
- `selected_divergence`: Info about Cedar's wrong answer and Llama's right answer
- `cedar_backend`, `llama_backend`: Model backends
- `TrainingDatapoint`, `VectorSteering`, etc. from steering_vectors

In [None]:
# ========================================
# FILL IN: Build your steering vector
# ========================================

# Example structure (modify as needed):
#
# from steering_vectors import SteeringOptimizer, OptimizationConfig
#
# # Get the formatted prompt (includes system prompt with chain-of-thought instruction)
# prompt = apply_chat_template(cedar_tokenizer, selected_sample.prompt, system_prompt=GPQA_SYSTEM_PROMPT)
#
# # Define datapoint(s)
# datapoint = TrainingDatapoint(
#     prompt=prompt,
#     src_completions=[" "],  # Fill in: what to suppress
#     dst_completions=[" "],  # Fill in: what to promote
# )
#
# # Set up optimizer
# vector_steering = VectorSteering()
# config = OptimizationConfig(lr=0.1, max_iters=50)
# optimizer = SteeringOptimizer(
#     backend=cedar_backend,
#     steering_mode=vector_steering,
#     config=config,
# )
#
# # Optimize
# layer = 10  # Choose layer
# result = optimizer.optimize([datapoint], layer=layer)

# Placeholders - replace with your implementation. While either of
# `vector_steering` / `layer` is None, the steering cells below skip their work.
vector_steering, layer = None, None
strength = 1.0  # Adjust as needed

## Evaluation 2: With Steering Vector

Run evaluation with the steering vector applied to Cedar model.

In [None]:
# Re-run the Cedar evaluation with the steering vector applied.
# Skipped with a hint if the steering vector or layer has not been set yet.
if vector_steering is None or layer is None:
    print("Please build the steering vector first (see 'Build Steering Vector' section)")
else:
    cedar_steered_results = evaluate_with_steering(
        samples=samples,
        backend=cedar_backend,
        tokenizer=cedar_tokenizer,
        steering_mode=vector_steering,
        strength=strength,
        layers=layer,
        verbose=True,
        do_sample=False,
    )
    print_evaluation_summary(cedar_steered_results, f"Cedar Model (Steered, s={strength})")

In [None]:
# Final summary: three pairwise comparisons between baseline Cedar, steered
# Cedar and baseline Llama. Runs only once a steering vector and layer are set.
if not (vector_steering is None or layer is None):
    print("\n" + "="*60)
    print("Summary")
    print("="*60)
    # Effect of steering on Cedar itself.
    compare_evaluations(cedar_results, cedar_steered_results, "Cedar (baseline)", "Cedar (steered)")
    # Original gap between the two models.
    compare_evaluations(cedar_results, llama_results, "Cedar (baseline)", "Llama (baseline)")
    # Remaining gap after steering.
    compare_evaluations(cedar_steered_results, llama_results, "Cedar (steered)", "Llama (baseline)")

## Advanced: Custom Hook Evaluation

For more control, you can create custom hooks and use `evaluate_with_hooks`.

In [None]:
# Example: Custom hook with dynamic strength or per-token steering
#
# def custom_hook(module, args):
#     hidden_states = args[0]
#     # Your custom steering logic here
#     modified = hidden_states.clone()
#     # ...
#     return (modified,) + args[1:]
#
# hooks = [(layer, custom_hook)]
# results = evaluate_with_hooks(
#     backend=cedar_backend,
#     tokenizer=cedar_tokenizer,
#     samples=samples,
#     hooks=hooks,
# )

## Strength Sweep

Evaluate across different steering strengths to find the optimal value.

In [None]:
# Sweep over different steering strengths and report the most accurate one.
#
# The loop variables are named `sweep_strength` / `sweep_eval` on purpose.
# The steering-vector template suggests storing the optimizer output in
# `result`, and a loop variable with that name would silently overwrite it
# with an evaluation result whenever this cell runs.
if vector_steering is not None and layer is not None:
    strengths = [0.5, 1.0, 1.5, 2.0, 2.5, 3.0]
    sweep_results = []  # (strength, accuracy) pairs, in sweep order

    for sweep_strength in strengths:
        sweep_eval = evaluate_with_steering(
            backend=cedar_backend,
            tokenizer=cedar_tokenizer,
            samples=samples,
            steering_mode=vector_steering,
            layers=layer,
            strength=sweep_strength,
            do_sample=False,
            verbose=False,
        )
        sweep_results.append((sweep_strength, sweep_eval.accuracy))
        print(f"Strength {sweep_strength:.1f}: {sweep_eval.accuracy:.1%}")

    # Find best strength; on ties max() keeps the first (lowest) strength.
    best_strength, best_acc = max(sweep_results, key=lambda pair: pair[1])
    print(f"\nBest strength: {best_strength} with accuracy {best_acc:.1%}")