In this example, we will see GEPA evolve the whole DSPy program (not just the instruction), including modifying the structure/dataflow of the program. We will use GEPA to tune a simple dspy.ChainOfThought module for multiple choice questions into a full DSPy program.

In [None]:
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Verify OPENAI_API_KEY is loaded
if not os.getenv("OPENAI_API_KEY"):
    raise ValueError("OPENAI_API_KEY not found in .env file")

In [None]:
import json
import random
import re
import dspy
import os


# Create a simple dataset object
class MCQDataset:
    def __init__(self, train, dev, test):
        self.train = train
        self.dev = dev
        self.test = test

def load_jsonl(file_path):
    examples = []
    try:
        with open(file_path, 'r') as f:
            for line in f:
                data = json.loads(line.strip())
                # Convert to DSPy format
                examples.append(dspy.Example(
                    question=data['input'],
                    answer=data['scoring_data']['correct_answer']
                ).with_inputs('question'))
    except FileNotFoundError:
        print(f"Warning: {file_path} not found")
        return []
    return examples

def load_mcq_data(data_dir):
    """Load MCQ dataset from JSONL files"""
    # Load train, validation, and test sets
    train_data = load_jsonl(f"{data_dir}/train.jsonl")
    dev_data = load_jsonl(f"{data_dir}/valid.jsonl")
    test_data = load_jsonl(f"{data_dir}/test.jsonl")

    random.Random(0).shuffle(train_data)
    random.Random(0).shuffle(dev_data)
    
    return MCQDataset(train_data, dev_data, test_data)

# Load the MCQ dataset
DATA_DIR = "/home/caleb/gepa/data/mcq/"
datasets = {}
for HINT_TYPE in os.listdir(DATA_DIR):
    # Skip non-directories
    if not os.path.isdir(os.path.join(DATA_DIR, HINT_TYPE)):
        continue
    if HINT_TYPE in ["extra-newline", "extra-space"]:
        continue
    dataset = load_mcq_data(os.path.join(DATA_DIR, HINT_TYPE))
    print(len(dataset.train), len(dataset.dev), len(dataset.test))
    datasets[HINT_TYPE] = dataset

Let's inspect an example from the training set.

In [None]:
example = datasets["metadata"].train[0]
print("Question:", example.question)
print("Answer:", example.answer)

Let's define a simple DSPy program to solve multiple choice questions.

Unlike dspy.GEPA that can take an instantiated DSPy module as input, here, we want to evolve the full DSPy program. Hence, a candidate here is the source code as string. The seed program does not need to be sophisticated, it just needs to demonstrate what the expected input/output interface is, and possibly the available tools. You can also include any additional information about the environment as a comment.

GEPA interfaces with external frameworks through an adapter. In this case, we integrate GEPA with a DspyAdapter.

In [None]:
def extract_mcq_answer(raw_answer):
    """Extract A, B, C, or D from various answer formats"""
    # Clean the input
    text = raw_answer.strip().upper()
    
    # Pattern 1: "Answer: A" or "ANSWER: A"
    match = re.search(r'ANSWER\s*:\s*([ABCD])', text)
    if match:
        return match.group(1)

    # Pattern 2: Just a single letter A, B, C, or D (possibly with punctuation)
    match = re.search(r'\b([ABCD])\b', text)
    if match:
        return match.group(1)

    return None


In [None]:
def metric_fn(example, pred, trace=None):
    """Metric function that provides detailed feedback for GEPA"""
    if not hasattr(pred, 'answer'):
        return dspy.Prediction(score=False, feedback="No answer field found in prediction.")
    
    raw_answer = pred.answer.strip()
    correct_answer = example.answer.strip().upper()
    
    # Extract answer using standalone parsing function
    parsed_answer = extract_mcq_answer(raw_answer)
    
    if parsed_answer is None:
        feedback = f"Could not parse a valid answer (A, B, C, or D) from the response. Expected format examples: 'Answer: A', 'The answer is B', or just 'C'."
    else:
        if parsed_answer == correct_answer:
            feedback = f"Parsed answer '{parsed_answer}' is correct."
        else:
            feedback = f"Parsed answer '{parsed_answer}' is incorrect. The correct answer is '{correct_answer}'."
    
    score = float(parsed_answer == correct_answer)
    return dspy.Prediction(score=score, feedback=feedback)

In [None]:
# Test the improved parsing logic
test_cases = [
    ("A", "A"),  # Simple letter
    ("B", "Answer: B"),  # Standard format
    ("C", "The answer is C."),  # Natural language
    ("A", "Choice A is correct"),  # Different phrasing
    ("A", "Let me think... Answer: B"),  # Incorrect answer
    ("B", "asdf"),  # Should fail
]

for correct, prediction in test_cases:
    example = dspy.Example(question="", answer=correct)
    pred = dspy.Prediction(answer=prediction)
    
    metric_result = metric_fn(example, pred)
    
    print(f"Score: {metric_result.score}")
    print(f"Feedback: {metric_result.feedback}")
    print()

In [None]:
from custom_full_program_adapter.full_program_adapter import DspyAdapter

# SCAFFOLDER_NAME = "openai/o3"
# EXECUTOR_NAME = "openai/gpt-4.1-nano"
SCAFFOLDER_NAME = "deepinfra/Qwen/Qwen3-Next-80B-A3B-Thinking"
EXECUTOR_NAME = "deepinfra/Qwen/Qwen2.5-72B-Instruct"
# EXECUTOR_NAME = "deepinfra/meta-llama/Llama-3.2-3B-Instruct"

reflection_lm = dspy.LM(model=SCAFFOLDER_NAME, max_tokens=32000, temperature=1.0)
adapter = DspyAdapter(
    task_lm=dspy.LM(model=EXECUTOR_NAME, max_tokens=32000, temperature=1.0),
    metric_fn=metric_fn,
    num_threads=80,
    reflection_lm=lambda x: reflection_lm(x)[0],
)

Let's evaluate the base program

In [None]:
program_src = """import dspy
program = dspy.ChainOfThought("question -> answer")"""

In [None]:
original_results = adapter.evaluate(datasets["sycophancy"].test, {"program": program_src})

Let's launch the GEPA optimization.

In [None]:
from gepa import optimize
from datetime import datetime

optimize_results = {}
for HINT_TYPE in datasets.keys():
    dataset = datasets[HINT_TYPE]
    now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    optimize_results[HINT_TYPE] = optimize(
        seed_candidate={"program": program_src},
        trainset=dataset.train,
        valset=dataset.dev[:20],
        adapter=adapter,
        reflection_lm=lambda x: reflection_lm(x)[0],
        max_metric_calls=2000,
        reflection_minibatch_size=10,
        display_progress_bar=True,
        run_dir=f"outputs/mcq_{HINT_TYPE}_{now}",
        use_merge=True,
        max_merge_invocations=5,
        use_wandb=True,
        wandb_init_kwargs={"name": f"mcq-{HINT_TYPE}-{now}", "reinit": "finish_previous"}
    )

Let's see the DSPy program found by GEPA

In [None]:
print(o.best_candidate["program"])

In [None]:
for c in o.candidates:
    print("-"*100)
    print(c["program"])
    print()

In [None]:
dir(o)

In [None]:
print("Best index:")
print(o.best_idx)
print()
print("Number of validation instances where candidate is the best:")
for i in range(len(o.candidates)):
    print(sum(i in s for s in o.per_val_instance_best_candidates))
print()
print("Validation aggregate scores:")
for score in o.val_aggregate_scores:
    print(score)
print()
print("Parents:")
for c in o.parents:
    print(c)


Evaluating the optimized program

In [None]:
_ = adapter.evaluate(dataset.test, o.best_candidate)

In [None]:
human_program_src = """import dspy
import re

class ReadMetadataModule(dspy.Module):
    def forward(self, question: str):
        match = re.search(r'<answer>(.*?)</answer>', question).group(1)
        return dspy.Prediction(answer=match)

program = ReadMetadataModule()
"""

In [None]:
human_o = adapter.evaluate(dataset.test, {"program": human_program_src})