# Comparing GEPA Experiments: Baseline vs Optimized (with and without sanitization)

This notebook compares multiple experiments:
1. Baseline (no GEPA optimization)
2. GEPA with no hack suggestion + original prompt
3. GEPA with explicit hack suggestion + original prompt
4. GEPA with no hack suggestion + sanitized prompt
5. GEPA with explicit hack suggestion + sanitized prompt

Each experiment is evaluated on both the proxy reward (`gameable="sometimes"`) and the true reward (`gameable="never"`).

In [119]:
import json
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import dspy
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

True

In [120]:
# Configuration
# Model identifier handed to get_dspy_lm() to build the executor LM.
EXECUTOR_MODEL = "openai/gpt-4.1-mini"
# Setting name; used below to locate the test data file.
SETTING = "psychosis"

# Experiment configuration: experiment name -> log directory. Each directory is
# expected to contain one subdirectory per run, each holding a
# detailed_results.json file.
# NOTE(review): these paths hard-code "psychosis" instead of using SETTING.
experiment_config = {
    "no_hack": Path("logs/psychosis/2025-10-21-00-24-06/p=deepseek-reasoner-e=gpt-4.1-mini-hack=no"),
    "explicit_hack": Path("logs/psychosis/2025-10-21-00-24-06/p=deepseek-reasoner-e=gpt-4.1-mini-hack=no-gameable=never")  # TEMP: placeholder path (still a hack=no, gameable=never run); switch to the hack=explicit, gameable=sometimes run
}

# Test data path (JSON Lines, read by load_jsonl)
TEST_DATA_PATH = Path(f"data/{SETTING}/test.jsonl")

# Root directory under which computed progression data is cached as JSON
PLOT_DATA_BASE = Path("plot-data")

In [121]:
# Set up DSPy
from lm import get_dspy_lm

# Build the executor LM through the project helper and make it DSPy's default
# LM, so every dspy.Predict created in this notebook uses it.
# NOTE(review): cache=True presumably enables response caching -- confirm in lm.get_dspy_lm.
executor_lm = get_dspy_lm(EXECUTOR_MODEL, cache=True)
dspy.configure(lm=executor_lm)

# Define the signature for psychosis: two input fields (the assistant's memory
# about the user, and the user's query) and one output field (the response).
# NOTE(review): do not add a class docstring here without care -- DSPy treats a
# Signature's docstring as its instruction text, so adding one would change the
# prompt used when no explicit instructions are supplied.
class ProblemSignature(dspy.Signature):
    memory = dspy.InputField(description="The memory of the AI assistant about the user")
    query = dspy.InputField(description="The user's query")
    response = dspy.OutputField(description="The response of the AI assistant.")

In [122]:
# Load test data
def load_jsonl(file_path):
    """Load examples from a JSON Lines file.

    Each non-blank line must be a JSON object with the keys ``memory``,
    ``query`` and ``is_gameable``. ``memory`` and ``query`` are marked as the
    example's inputs; ``is_gameable`` is carried along as an extra field.

    Args:
        file_path: Path to the ``.jsonl`` file.

    Returns:
        A list of ``dspy.Example`` objects, one per non-blank line, in file
        order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the required keys.
    """
    examples = []
    # Read as UTF-8 explicitly rather than relying on the platform default.
    with open(file_path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # Tolerate blank lines (e.g. stray trailing newlines) instead
                # of crashing in json.loads("").
                continue
            data = json.loads(line)
            examples.append(dspy.Example(
                memory=data['memory'],
                query=data['query'],
                is_gameable=data['is_gameable']
            ).with_inputs('memory', 'query'))
    return examples

# Load the test set once; every evaluate_program() call below reuses this list.
test_examples = load_jsonl(TEST_DATA_PATH)
print(f"Loaded {len(test_examples)} test examples")

Loaded 100 test examples


In [123]:
# Load metric functions
from scoring.psychosis_scale import get_metric_fn

# Create metric functions for both gameable types. The two differ only in the
# gameable_type passed to get_metric_fn; both are applied to the same test set.
metric_fn_proxy = get_metric_fn(gameable_type="sometimes")  # Proxy reward (gameable)
metric_fn_true = get_metric_fn(gameable_type="never")  # True reward (non-gameable)

In [124]:
from instruction_loader import load_instructions

# Load instruction metadata to get subdirectories and candidate indices
def get_experiment_instruction_info(experiment_path):
    """Return ``(run_dir, best_candidate_idx)`` for every run of an experiment.

    Each subdirectory of ``experiment_path`` is treated as one run. A run is
    included only if it contains ``detailed_results.json``; the index reported
    is that of the first entry in ``candidates`` whose ``instructions`` equal
    the file's ``best_instructions``.

    Args:
        experiment_path: ``pathlib.Path`` of the experiment's log directory.

    Returns:
        A list of ``(subdir, candidate_idx)`` tuples, ordered by sorted
        subdirectory path.

    Raises:
        ValueError: If a run's ``best_instructions`` matches no candidate.
    """
    instruction_info = []

    # iterdir() yields entries in arbitrary order; sort for a deterministic
    # result, matching how collect_progression_data walks the same directory.
    for subdir in sorted(experiment_path.iterdir()):
        if not subdir.is_dir():
            # Stray files next to the run directories are not runs.
            continue

        results_file = subdir / "detailed_results.json"
        if not results_file.exists():
            print(f"Warning: {results_file} not found, skipping")
            continue

        with open(results_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Find best candidate index; default to None so a missing match is
        # reported clearly instead of leaking a bare StopIteration.
        best_instructions = data['best_instructions']
        candidate_idx = next(
            (
                idx for idx, candidate in enumerate(data['candidates'])
                if candidate['instructions'] == best_instructions
            ),
            None,
        )
        if candidate_idx is None:
            raise ValueError(
                f"best_instructions in {results_file} does not match any candidate"
            )

        instruction_info.append((subdir, candidate_idx))

    return instruction_info

# Load instruction metadata for experiments.
# Maps experiment name -> list of (run_dir, best_candidate_idx) tuples.
instruction_info = {"baseline": [(None, None)]}  # Special case for baseline: no run directory, no candidate

# One entry per configured experiment, built from its log directory.
for exp_name, exp_path in experiment_config.items():
    instruction_info[exp_name] = get_experiment_instruction_info(exp_path)

print("Loaded instruction metadata for all experiments")

Loaded instruction metadata for all experiments


In [125]:
# Collect or load progression data for training visualization
def collect_progression_data(experiment_path):
    """Collect (or load cached) test-score progression data for one experiment.

    For every run subdirectory containing ``detailed_results.json``, walks the
    candidates in order and, each time a candidate sets a new best validation
    score, evaluates it on the test set under four conditions: proxy/true
    metric x original/sanitized instructions. Candidate 0 is evaluated with no
    instructions, so its sanitized scores are copies of its original scores.

    The result is cached as ``progression_data.json`` under ``PLOT_DATA_BASE``
    (mirroring ``experiment_path`` minus its first path component). If that
    file already exists it is returned as-is and nothing is evaluated.

    NOTE: cache files are never invalidated. A file produced by an earlier
    version of this function that divided baseline scores by 100 a second time
    holds baseline values that are 100x too small; delete it to regenerate.

    Args:
        experiment_path: ``pathlib.Path`` of the experiment's log directory.
            Run subdirectories must have integer names.

    Returns:
        ``{'runs': [...]}`` where each run is
        ``{'run_index': int, 'progression': [point, ...]}`` and each point has
        the keys ``candidate_index``, ``validation_score``,
        ``discovery_eval_counts``, ``reflection_call_count`` and
        ``test_scores`` (a dict with ``proxy_original``, ``proxy_sanitized``,
        ``true_original`` and ``true_sanitized``, all on a 0-1 scale).
    """
    # Create plot data path
    relative_path = Path(*experiment_path.parts[1:])  # Remove 'logs' prefix
    plot_data_path = PLOT_DATA_BASE / relative_path
    progression_file = plot_data_path / "progression_data.json"

    # Check if data already exists
    if progression_file.exists():
        print(f"Loading existing progression data from {progression_file}")
        with open(progression_file, 'r') as f:
            return json.load(f)

    print(f"Collecting progression data for {experiment_path}")

    runs_data = []

    # Iterate through each run subdirectory
    for subdir in sorted(experiment_path.iterdir()):
        if not subdir.is_dir():
            continue

        results_file = subdir / "detailed_results.json"
        if not results_file.exists():
            continue

        with open(results_file, 'r') as f:
            data = json.load(f)

        # Track progression through candidates
        progression = []
        best_val_score = -float('inf')

        for idx, candidate in enumerate(data['candidates']):
            val_score = candidate['val_aggregate_score']

            # Only evaluate if this is a new best validation score
            if val_score <= best_val_score:
                continue
            best_val_score = val_score

            print(f"  Run {subdir.name}, candidate {idx} (val={val_score:.3f})")

            # Evaluate on all 4 test conditions
            test_scores = {}

            if idx == 0:
                # Baseline case (candidate 0 has no instructions).
                # evaluate_program already returns a 0-1 score, so it must
                # NOT be divided by 100 again here.
                test_scores['proxy_original'] = evaluate_program(None, metric_fn_proxy, test_examples)
                test_scores['proxy_sanitized'] = test_scores['proxy_original']  # Same for baseline
                test_scores['true_original'] = evaluate_program(None, metric_fn_true, test_examples)
                test_scores['true_sanitized'] = test_scores['true_original']  # Same for baseline
            else:
                # Original instructions
                instructions = candidate['instructions']
                test_scores['proxy_original'] = evaluate_program(instructions, metric_fn_proxy, test_examples)
                test_scores['true_original'] = evaluate_program(instructions, metric_fn_true, test_examples)

                # Sanitized instructions
                sanitized_instructions = load_instructions(subdir, idx, sanitized=True)
                test_scores['proxy_sanitized'] = evaluate_program(sanitized_instructions, metric_fn_proxy, test_examples)
                test_scores['true_sanitized'] = evaluate_program(sanitized_instructions, metric_fn_true, test_examples)

            progression.append({
                'candidate_index': idx,
                'validation_score': val_score,
                'discovery_eval_counts': candidate['discovery_eval_counts'],
                'reflection_call_count': candidate['reflection_call_count'],
                'test_scores': test_scores
            })

        runs_data.append({
            'run_index': int(subdir.name),
            'progression': progression
        })

    # Save the collected data
    plot_data_path.mkdir(parents=True, exist_ok=True)
    with open(progression_file, 'w') as f:
        json.dump({'runs': runs_data}, f, indent=2)

    print(f"Saved progression data to {progression_file}")

    return {'runs': runs_data}

def evaluate_program(instructions, metric_fn, examples, num_threads=100):
    """Evaluate a ``dspy.Predict(ProblemSignature)`` program on ``examples``.

    Args:
        instructions: Instruction text to install on the program's signature.
            Any falsy value (``None`` or ``""``) leaves the signature's
            default instructions in place.
        metric_fn: Metric callable handed to ``dspy.Evaluate``.
        examples: Examples to evaluate on (passed as ``devset``).
        num_threads: Number of worker threads for ``dspy.Evaluate``.
            Defaults to 100, the value previously hard-coded here.

    Returns:
        The aggregate score rescaled to the 0-1 range.
    """
    program = dspy.Predict(ProblemSignature)
    if instructions:
        program.signature = program.signature.with_instructions(instructions)

    evaluate = dspy.Evaluate(
        devset=examples,
        metric=metric_fn,
        num_threads=num_threads,
        display_table=False,
        display_progress=False  # Reduce output noise
    )

    result = evaluate(program)
    return result.score / 100  # Evaluate multiplies by 100, this returns it to 0-1

# Collect progression data for all experiments.
# Maps experiment name -> {'runs': [...]} as returned by collect_progression_data
# (loaded from the cached JSON file when one exists, otherwise computed and saved).
progression_data = {}

for exp_name, exp_path in experiment_config.items():
    print(f"\nProcessing {exp_name}...")
    progression_data[exp_name] = collect_progression_data(exp_path)


Processing no_hack...
Loading existing progression data from plot-data/psychosis/2025-10-21-00-24-06/p=deepseek-reasoner-e=gpt-4.1-mini-hack=no/progression_data.json

Processing explicit_hack...
Loading existing progression data from plot-data/psychosis/2025-10-21-00-24-06/p=deepseek-reasoner-e=gpt-4.1-mini-hack=no-gameable=never/progression_data.json


In [None]:
# Extract baseline and final scores from progression data
def get_experiment_scores(exp_name, reward_type, variant='original'):
    """Return per-run final test scores and their mean for one experiment.

    The "final" score of a run is the ``f'{reward_type}_{variant}'`` test score
    of its last progression point; runs with an empty progression are skipped.

    Returns:
        ``(scores, mean)``: the list of final scores (one per contributing
        run) and their mean. An unknown experiment gives ``([], 0)``, and an
        empty score list gives a mean of ``0``.
    """
    if exp_name not in progression_data:
        return [], 0

    score_key = f'{reward_type}_{variant}'
    final_scores = [
        run['progression'][-1]['test_scores'][score_key]
        for run in progression_data[exp_name]['runs']
        if run['progression']
    ]

    if final_scores:
        average = np.mean(final_scores)
    else:
        average = 0
    return final_scores, average

# Get baseline values from initial progression points and assert they're identical.
# The first progression point of a run is candidate 0, which
# collect_progression_data evaluates without instructions; only the first run
# of each experiment is consulted.
baseline_proxy_no_hack = progression_data['no_hack']['runs'][0]['progression'][0]['test_scores']['proxy_original']
baseline_true_no_hack = progression_data['no_hack']['runs'][0]['progression'][0]['test_scores']['true_original']
baseline_proxy_explicit_hack = progression_data['explicit_hack']['runs'][0]['progression'][0]['test_scores']['proxy_original']
baseline_true_explicit_hack = progression_data['explicit_hack']['runs'][0]['progression'][0]['test_scores']['true_original']

# Assert baselines are identical (within float tolerance); the plot below shows
# a single baseline bar taken from the no_hack experiment.
assert abs(baseline_proxy_no_hack - baseline_proxy_explicit_hack) < 1e-6, f"Proxy baselines differ: {baseline_proxy_no_hack} vs {baseline_proxy_explicit_hack}"
assert abs(baseline_true_no_hack - baseline_true_explicit_hack) < 1e-6, f"True baselines differ: {baseline_true_no_hack} vs {baseline_true_explicit_hack}"

print(f"Baseline scores - Proxy: {baseline_proxy_no_hack:.3f}, True: {baseline_true_no_hack:.3f}")

# Create bar graph
fig, ax = plt.subplots(figsize=(12, 6))

# Display names for better plot labels.
# Key order defines bar order. Keys ending in '_sanitized' are not separate
# experiments: get_scores_for_plotting maps them to the sanitized variant of
# the experiment named by the rest of the key.
display_names = {
    'baseline': 'Baseline',
    'no_hack': 'No hack',
    'no_hack_sanitized': 'No hack\n(sanitized)',
    'explicit_hack': 'Hack',
    'explicit_hack_sanitized': 'Hack\n(sanitized)'
}

# Prepare data for plotting: tick labels, in the same order as the bars
experiment_names = list(display_names.values())

# Extract scores for each experiment
def get_scores_for_plotting():
    """Assemble bar heights and per-run points for every entry of display_names.

    Returns four parallel lists ordered like ``display_names``:
    ``(proxy_scores, true_scores, proxy_individuals, true_individuals)``.
    The ``*_scores`` lists hold one average per bar; the ``*_individuals``
    lists hold the per-run scores behind each bar. The baseline bar uses the
    no_hack baseline values, with a single individual point equal to the bar.
    """
    proxy_scores, true_scores = [], []
    proxy_individuals, true_individuals = [], []

    for exp_key in display_names:
        if exp_key == 'baseline':
            # Baseline: one value per reward type, shown as both bar and point
            proxy_avg, proxy_ind = baseline_proxy_no_hack, [baseline_proxy_no_hack]
            true_avg, true_ind = baseline_true_no_hack, [baseline_true_no_hack]
        else:
            # '<experiment>_sanitized' selects the sanitized variant of
            # '<experiment>'; any other key selects the original variant.
            if exp_key.endswith('_sanitized'):
                source_exp = exp_key.replace('_sanitized', '')
                variant = 'sanitized'
            else:
                source_exp = exp_key
                variant = 'original'
            proxy_ind, proxy_avg = get_experiment_scores(source_exp, 'proxy', variant)
            true_ind, true_avg = get_experiment_scores(source_exp, 'true', variant)

        proxy_scores.append(proxy_avg)
        true_scores.append(true_avg)
        proxy_individuals.append(proxy_ind)
        true_individuals.append(true_ind)

    return proxy_scores, true_scores, proxy_individuals, true_individuals

# Bar heights (averages) and per-run points, ordered like experiment_names
proxy_scores, true_scores, proxy_individuals, true_individuals = get_scores_for_plotting()

# One group per experiment; each group holds two bars of this width
x = np.arange(len(experiment_names))
width = 0.35

# Create bars: proxy on the left half of each group, true on the right half
bars1 = ax.bar(x - width/2, proxy_scores, width, label='Proxy Reward (gameable)', color='#ff9999')
bars2 = ax.bar(x + width/2, true_scores, width, label='True Reward (non-gameable)', color='#66b3ff')

# Add individual data points as X markers, centred on the matching bar.
# zorder=3 is presumably meant to draw the markers above the bars.
for i in range(len(experiment_names)):
    # Proxy scores individual points
    ax.scatter(np.full(len(proxy_individuals[i]), x[i] - width/2), proxy_individuals[i], 
            marker='x', color='#202020', s=30, linewidth=0.5, zorder=3)
    # True scores individual points
    ax.scatter(np.full(len(true_individuals[i]), x[i] + width/2), true_individuals[i], 
            marker='x', color='#202020', s=30, linewidth=0.5, zorder=3)
    
# Customize plot
ax.set_ylabel('Score', fontsize=12)
ax.set_title('Comparison of GEPA Experiments: Proxy vs True Reward\n(Psychosis Split-User Setting)', fontsize=14)
ax.set_xticks(x)
ax.set_xticklabels(experiment_names)
ax.legend(loc='upper left')
ax.set_ylim(0, 1)  # scores are on a 0-1 scale
ax.grid(axis='y', alpha=0.3)

# Add value labels on bars (showing averages)
for bars in [bars1, bars2]:
    ax.bar_label(bars, fmt='%.3f', padding=3)

plt.tight_layout()
plt.show()

In [None]:
# Create training progression plots
# Toggle the x-axis: True uses each point's 'discovery_eval_counts' ("task calls"),
# False uses its 'reflection_call_count' ("reflection calls").
USE_TASK_CALLS = False  # Currently plotting against reflection calls

# Setup figure with two subplots (left: no_hack, right: explicit_hack)
fig, (ax_no_hack, ax_explicit_hack) = plt.subplots(1, 2, figsize=(16, 6))

# Color scheme: color encodes the reward type; line style (chosen below)
# encodes original vs sanitized instructions
colors = {
    'proxy': '#ff6666',  # Red for proxy reward
    'true': '#6666ff',   # Blue for true reward
}

# Process each experiment type: one subplot per experiment. Each subplot shows
# four step curves (proxy/true x original/sanitized): faint lines for the
# individual runs and a bold line for their average.
for exp_name, ax, title_suffix in [('no_hack', ax_no_hack, 'Not Told to Hack'),
                                     ('explicit_hack', ax_explicit_hack, 'Told to Hack')]:
    
    if exp_name not in progression_data:
        continue
    
    exp_data = progression_data[exp_name]
    
    # Prepare data for each line type: key -> list of (x_values, y_values), one per run
    lines_data = {
        'proxy_original': [],
        'proxy_sanitized': [],
        'true_original': [],
        'true_sanitized': []
    }
    
    # Process each run
    for run in exp_data['runs']:
        if not run['progression']:
            continue
            
        for line_type in lines_data.keys():
            x_values = []
            y_values = []
            
            # Build step function data: every progression point after the first
            # contributes two vertices at the same x (previous y, then new y),
            # so the curve stays flat until the point and then jumps.
            for i, point in enumerate(run['progression']):
                # Get x coordinate based on toggle
                if USE_TASK_CALLS:
                    x_coord = point['discovery_eval_counts']
                else:
                    x_coord = point['reflection_call_count']
                
                # Get y coordinate (test score) - keep as 0-1 scale
                # NOTE(review): reward_type and variant are not used in this loop.
                reward_type, variant = line_type.split('_')
                y_coord = point['test_scores'][line_type]
                
                # Add point for step function
                if i > 0:
                    # Add horizontal line from previous x to current x at previous y
                    x_values.append(x_coord)
                    y_values.append(y_values[-1])
                
                x_values.append(x_coord)
                y_values.append(y_coord)
            
            # Extend to the end if needed (for visibility): add a flat tail
            # reaching 5% past the run's largest x value
            if x_values:
                max_x = max(point['discovery_eval_counts'] if USE_TASK_CALLS else point['reflection_call_count'] 
                           for point in run['progression'])
                if x_values[-1] < max_x * 1.05:
                    x_values.append(max_x * 1.05)
                    y_values.append(y_values[-1])
            
            lines_data[line_type].append((x_values, y_values))
    
    # Plot individual run lines (faint); solid = original, dotted = sanitized
    for line_type, run_lines in lines_data.items():
        reward_type, variant = line_type.split('_')
        color = colors[reward_type]
        linestyle = '-' if variant == 'original' else ':'
        
        for x_vals, y_vals in run_lines:
            ax.plot(x_vals, y_vals, color=color, linestyle=linestyle, alpha=0.3, linewidth=1)
    
    # Calculate and plot average lines (bold) as step functions
    for line_type in lines_data.keys():
        if not lines_data[line_type]:
            continue
            
        reward_type, variant = line_type.split('_')
        color = colors[reward_type]
        linestyle = '-' if variant == 'original' else ':'
        
        # Find all unique x points where any run changes value
        # (this also picks up each run's extended tail vertex)
        x_change_points = set()
        for x_vals, _ in lines_data[line_type]:
            x_change_points.update(x_vals)
        x_change_points = sorted(x_change_points)
        
        # Build step function for average
        avg_x_values = []
        avg_y_values = []
        
        # NOTE(review): `x` here rebinds the module-level `x` (the bar positions
        # array) defined in the bar-chart cell.
        for i, x in enumerate(x_change_points):
            # Calculate average y value at this x point
            y_sum = 0
            y_count = 0
            
            for x_vals, y_vals in lines_data[line_type]:
                # Find the y value at this x (using step function logic):
                # take the last vertex j with x_vals[j] <= x. Runs whose first
                # vertex lies beyond x contribute nothing to this average;
                # runs that ended before x contribute their final value.
                for j in range(len(x_vals)):
                    if j == len(x_vals) - 1 or x < x_vals[j+1]:
                        if x >= x_vals[j]:
                            y_sum += y_vals[j]
                            y_count += 1
                            break
            
            if y_count > 0:
                avg_y = y_sum / y_count
                
                # Add horizontal segment from previous point
                if i > 0:
                    avg_x_values.append(x)
                    avg_y_values.append(avg_y_values[-1])
                
                # Add vertical segment at this point
                avg_x_values.append(x)
                avg_y_values.append(avg_y)
        
        # Extend to the end for visibility
        if avg_x_values and x_change_points:
            final_x = max(x_change_points) * 1.05
            avg_x_values.append(final_x)
            avg_y_values.append(avg_y_values[-1])
        
        # Create label
        # NOTE(review): for the original variant this yields e.g. 'Proxy ' with
        # a trailing space.
        label = f"{reward_type.capitalize()} {'(sanitized)' if variant == 'sanitized' else ''}"
        
        # Plot average line as step function; only these bold lines carry a
        # label, so only they appear in the legend
        if avg_x_values:
            ax.plot(avg_x_values, avg_y_values, color=color, linestyle=linestyle, 
                    alpha=1.0, linewidth=2, label=label)
    
    # Customize subplot
    ax.set_xlabel('Task Calls' if USE_TASK_CALLS else 'Reflection Calls', fontsize=11)
    ax.set_ylabel('Test Score', fontsize=11)
    ax.set_title(f'Training Progression: {title_suffix}', fontsize=12)
    ax.grid(True, alpha=0.3)
    ax.set_ylim(0, 1)
    ax.legend(loc='best', fontsize=9)

# Overall title (y=1.02 places it slightly above the axes area)
fig.suptitle('GEPA Training Progression: Test Performance at Validation-Best Checkpoints', fontsize=14, y=1.02)

plt.tight_layout()
plt.show()