# Inspect GEPA Reflection Prompt

This notebook shows the exact prompt that gets sent to the reflection LM during GEPA optimization, using real execution traces and feedback.

In [None]:
import os
from dotenv import load_dotenv

# Pull settings from a local .env file into the process environment
load_dotenv()

# Fail fast if the OpenAI key is absent or empty after loading
if not os.environ.get("OPENAI_API_KEY"):
    raise ValueError("OPENAI_API_KEY not found in .env file")

In [None]:
import dspy
import json
import random
import yaml

## Load the Dataset

In [None]:
# Create a simple dataset object
class CrosswordsDataset:
    """Plain container holding the three dataset splits.

    Attributes:
        train: Training split, stored as given.
        dev: Development/validation split, stored as given.
        test: Test split, stored as given.
    """

    def __init__(self, train, dev, test):
        # Store the splits unchanged; no copying or validation is performed
        self.train, self.dev, self.test = train, dev, test

def load_jsonl(file_path):
    """Read a JSONL file and convert every record into a ``dspy.Example``.

    Each non-blank line is parsed as one JSON object. ``data['input']`` becomes
    the ``puzzle`` field and ``data['scoring_data']['solution']`` becomes the
    ``solution`` field; ``puzzle`` is marked as the only input field.

    Args:
        file_path: Path of the JSONL file to read.

    Returns:
        A list of examples in file order. If the file does not exist, a warning
        is printed and an empty list is returned.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``input`` or ``scoring_data.solution``.
    """
    examples = []
    try:
        # JSON text is UTF-8; do not depend on the platform default encoding
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                record = line.strip()
                # Blank lines (e.g. a trailing newline at end of file) carry no
                # record; json.loads("") would raise, so skip them
                if not record:
                    continue
                data = json.loads(record)
                # Convert to DSPy format for crosswords
                examples.append(dspy.Example(
                    puzzle=data['input'],
                    solution=data['scoring_data']['solution']
                ).with_inputs('puzzle'))
    except FileNotFoundError:
        # Missing split files are tolerated: warn and return no examples
        print(f"Warning: {file_path} not found")
        return []
    return examples

def load_crosswords_data(data_dir, difficulty=None):
    """Load the crosswords train/dev/test splits from JSONL files.

    Reads ``train.jsonl``, ``valid.jsonl`` and ``test.jsonl`` via ``load_jsonl``.

    Args:
        data_dir: Root directory of the crosswords data.
        difficulty: Name of the sub-directory holding the split files. When
            ``None`` the files are read directly from ``data_dir``; previously
            ``None`` was formatted into the path as the literal ``"None"``.

    Returns:
        A ``CrosswordsDataset`` whose train and dev lists are shuffled with a
        fixed seed and whose test list is limited to its first 20 entries.
    """
    # Only append the difficulty segment when one was actually given
    difficulty_dir = data_dir if difficulty is None else f"{data_dir}/{difficulty}"
    train_data = load_jsonl(f"{difficulty_dir}/train.jsonl")
    dev_data = load_jsonl(f"{difficulty_dir}/valid.jsonl")
    test_data = load_jsonl(f"{difficulty_dir}/test.jsonl")

    # A fresh Random(0) per split keeps each shuffle reproducible and
    # independent of the other split and of the global random state
    random.Random(0).shuffle(train_data)
    random.Random(0).shuffle(dev_data)

    # The test split is not shuffled; only its first 20 examples are kept
    return CrosswordsDataset(train_data, dev_data, test_data[:20])

# Load the crosswords dataset
DIFFICULTY = "monday"
# The data location can be overridden with the CROSSWORDS_DATA_DIR environment
# variable so the notebook is not tied to one machine's home directory; the
# original absolute path remains the fallback.
DATA_DIR = os.environ.get("CROSSWORDS_DATA_DIR", "/home/caleb/gepa/data/crosswords")
dataset = load_crosswords_data(DATA_DIR, DIFFICULTY)

# Report split sizes; all zeros means the JSONL files were not found under DATA_DIR
print(f"Loaded {len(dataset.train)} train, {len(dataset.dev)} dev, {len(dataset.test)} test examples")

## Define the Seed Program

In [None]:
# Seed program, kept as source text rather than as live objects: later cells
# pass it to the adapter as {"program": program_src}.
# The text defines a single-predictor DSPy module (CrosswordSolver wrapping
# dspy.Predict(CrosswordSignature)) and binds an instance to `program`.
# NOTE: everything between the triple quotes is runtime data, so whitespace
# inside it is significant and must not be reformatted.
program_src = """import dspy

class CrosswordSignature(dspy.Signature):
    \"\"\"Solve a crossword puzzle by analyzing clues and filling in answers.\"\"\"
    puzzle = dspy.InputField(desc="Crossword puzzle with clues")
    solution = dspy.OutputField(desc="Solution to the crossword puzzle")

class CrosswordSolver(dspy.Module):
    def __init__(self):
        super().__init__()
        self.solve = dspy.Predict(CrosswordSignature)
    
    def forward(self, puzzle: str) -> dspy.Prediction:
        result = self.solve(puzzle=puzzle)
        return result

program = CrosswordSolver()"""

# Echo the seed source so the reader can see exactly what will be evaluated
print("Seed program with DSPy predictor:")
print(program_src)

## Set up the Metric and Adapter

In [None]:
from score_strict import score

def metric_fn(example, pred, trace=None):
    """Score a predicted crossword solution and build textual feedback.

    Args:
        example: Reference example; its ``solution`` attribute is the target.
        pred: Prediction to grade; expected to expose a ``solution`` attribute.
        trace: Accepted for interface compatibility; not used here.

    Returns:
        A ``dspy.Prediction`` with ``score`` and ``feedback`` fields. The score
        is 0.0 when the prediction has no ``solution`` attribute or when
        scoring raises; in those cases the feedback explains why.
    """
    # Guard: nothing to grade without a solution attribute
    if not hasattr(pred, 'solution'):
        return dspy.Prediction(score=0.0, feedback="No solution field found in prediction.")

    # Normalise surrounding whitespace; a falsy solution is graded as ""
    predicted_text = pred.solution.strip() if pred.solution else ""
    target_text = example.solution.strip()

    try:
        # Strict scorer yields (fraction, number correct, number total)
        score_value, n_correct, n_total = score(target_text, predicted_text)

        feedback_parts = [f"{n_correct}/{n_total} squares correct ({score_value:.1%})."]
        # Reveal the reference solution only when the answer is not perfect
        if score_value != 1.0:
            feedback_parts.append("\n\nExpected solution:\n" + target_text)
        feedback = "".join(feedback_parts)
    except Exception as exc:
        # Best effort: any scoring/formatting failure becomes a zero score
        score_value = 0.0
        feedback = f"Error in scoring: {str(exc)}"

    return dspy.Prediction(score=score_value, feedback=feedback)

In [None]:
from custom_full_program_adapter.full_program_adapter import DspyAdapter

# Model identifiers: SCAFFOLDER_NAME is handed to the adapter as reflection_lm,
# EXECUTOR_NAME is the model that actually runs the program
SCAFFOLDER_NAME = "openai/o3"
EXECUTOR_NAME = "deepinfra/Qwen/Qwen2.5-72B-Instruct"

# Settings shared by both executor LM instances created below
executor_lm_kwargs = dict(model=EXECUTOR_NAME, max_tokens=32000, temperature=1.0)

# DSPy needs a default LM configured before the adapter is built
dspy.settings.configure(lm=dspy.LM(**executor_lm_kwargs))

# A separate LM instance (same settings) is given to the adapter as task_lm
adapter = DspyAdapter(
    task_lm=dspy.LM(**executor_lm_kwargs),
    metric_fn=metric_fn,
    num_threads=4,
    reflection_lm=SCAFFOLDER_NAME,
)

## Generate Real Execution Traces and Feedback

Let's run the seed program on a few examples to get real feedback:

In [None]:
# Select a small batch of examples (what GEPA would do during optimization)
minibatch = dataset.train[:3]  # Use first 3 training examples

print("Running seed program on minibatch to collect execution traces...")

# Evaluate with traces (this is what GEPA does internally)
eval_result = adapter.evaluate(
    minibatch,
    {"program": program_src},
    capture_traces=True
)

print(f"\nScores: {eval_result.scores}")
# The training split is empty when the data files are missing; avoid a
# ZeroDivisionError in that case and say so instead
num_scores = len(eval_result.scores)
if num_scores:
    print(f"Average score: {sum(eval_result.scores) / num_scores:.2%}")
else:
    print("Average score: n/a (no examples were evaluated)")

# Debug: Check what's in the trajectories
print(f"\nTrajectories type: {type(eval_result.trajectories)}")
# Truthiness already implies at least one trajectory, so no extra length check
if eval_result.trajectories:
    first_trajectory = eval_result.trajectories[0]
    print(f"Number of trajectories: {len(eval_result.trajectories)}")
    print("\nFirst trajectory keys:", first_trajectory.keys())
    print("Trace in first trajectory:", first_trajectory.get('trace'))

## Build the Reflective Dataset

This is what GEPA does to prepare feedback for the reflection LM:

In [None]:
# Ask the adapter to turn the captured evaluation into feedback records for
# the "program" component (the only component being updated)
reflective_dataset = adapter.make_reflective_dataset(
    {"program": program_src}, eval_result, ["program"]
)

# Print the first feedback record field by field, with a rule after each field
print("Sample feedback entry:")
print("="*60)
sample = reflective_dataset["program"][0]
for key in sample:
    print(key)
    print(sample[key], "="*60, sep="\n")

In [None]:
from custom_full_program_adapter.dspy_program_proposal_signature import DSPyProgramProposalSignature

# Assemble the renderer input: current program source, the feedback records
# for the "program" component, and the hack-suggestion switch
prompt_input = dict(
    curr_program=program_src,
    dataset_with_feedback=reflective_dataset["program"],
    suggest_hack="no",  # Normal mode (not looking for exploits)
)

# Render the prompt text and print it under a banner
actual_prompt = DSPyProgramProposalSignature.prompt_renderer(prompt_input)
print("GEPA Reflection Prompt:", "="*60, actual_prompt, sep="\n")