In this example, we will see GEPA evolve the whole DSPy program (not just the instruction), including modifying the structure/dataflow of the program. We will use GEPA to tune a simple dspy.ChainOfThought module for crossword puzzles into a full DSPy program.

In [None]:
import os
from dotenv import load_dotenv

# Load environment variables from a .env file.
load_dotenv()

# Fail fast when OPENAI_API_KEY is unset or empty.
# NOTE(review): the models configured later in this notebook use the
# `deepinfra/` prefix -- confirm whether an OpenAI key is actually required.
if not os.getenv("OPENAI_API_KEY"):
    raise ValueError("OPENAI_API_KEY not found in .env file")

In [None]:
import dspy

In [None]:
import json
import random
import re
from score_strict import score


# Create a simple dataset object
class CrosswordsDataset:
    """Plain container bundling the train, dev and test splits."""

    def __init__(self, train, dev, test):
        # Keep each split under the attribute of the same name.
        self.train, self.dev, self.test = train, dev, test

def load_jsonl(file_path):
    """Load crossword examples from a JSONL file.

    Each non-blank line must be a JSON object with an ``input`` key and a
    ``scoring_data`` object holding a ``solution`` key. Every record becomes
    a ``dspy.Example`` with fields ``puzzle`` and ``solution``, where
    ``puzzle`` is marked as the only input field.

    Args:
        file_path: Path of the JSONL file to read.

    Returns:
        A list of ``dspy.Example`` objects in file order, or an empty list
        (after printing a warning) when the file does not exist.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the expected keys.
    """
    examples = []
    try:
        # Explicit encoding so parsing does not depend on the platform default.
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Blank lines (e.g. a trailing newline at end of file)
                    # are not records; json.loads("") would raise on them.
                    continue
                data = json.loads(line)
                # Convert to DSPy format for crosswords
                examples.append(dspy.Example(
                    puzzle=data['input'],
                    solution=data['scoring_data']['solution']
                ).with_inputs('puzzle'))
    except FileNotFoundError:
        print(f"Warning: {file_path} not found")
        return []
    return examples

def load_crosswords_data(data_dir, difficulty=None, test_size=20):
    """Load the crosswords dataset splits from JSONL files.

    Reads ``train.jsonl``, ``valid.jsonl`` and ``test.jsonl``. The train and
    dev splits are shuffled with a fixed seed so runs are reproducible.

    Args:
        data_dir: Base directory containing crosswords data.
        difficulty: Optional difficulty level ("monday", "tuesday", etc.)
            naming a subdirectory of ``data_dir``. When ``None`` (or empty),
            the files are read directly from ``data_dir``.
        test_size: Maximum number of test examples to keep. Crosswords can
            take a while to run, so the test set is truncated by default.
            Pass ``None`` to keep the whole test split.

    Returns:
        A ``CrosswordsDataset`` holding the train, dev and test splits.
    """
    # Only descend into a subdirectory when a difficulty was actually given;
    # formatting None into the path would point at a literal "None" folder.
    split_dir = f"{data_dir}/{difficulty}" if difficulty else data_dir
    train_data = load_jsonl(f"{split_dir}/train.jsonl")
    dev_data = load_jsonl(f"{split_dir}/valid.jsonl")
    test_data = load_jsonl(f"{split_dir}/test.jsonl")

    random.Random(0).shuffle(train_data)
    random.Random(0).shuffle(dev_data)

    # Truncate the held-out test split itself. Slicing the training data here
    # would evaluate on examples the optimizer has seen. A slice bound of
    # None keeps everything.
    test_data = test_data[:test_size]

    return CrosswordsDataset(train_data, dev_data, test_data)

# Load the crosswords dataset
DIFFICULTY = "monday"  # Difficulty subdirectory to load, e.g. "monday", "tuesday", "wednesday"
# NOTE: absolute path from the original author's machine; point this at your
# local copy of the crosswords data before running.
DATA_DIR = "/home/caleb/gepa/data/crosswords"
dataset = load_crosswords_data(DATA_DIR, DIFFICULTY)

# Sizes of the train / dev / test splits, in that order.
print(len(dataset.train), len(dataset.dev), len(dataset.test))

Let's inspect an example from the training set.

In [None]:
# Show the first training example: the puzzle text followed by its solution.
example = dataset.train[0]
print("Puzzle:")
print(example.puzzle)
print("\nSolution:")
print(example.solution)

Let's define a simple DSPy program to solve crossword puzzles.

Unlike dspy.GEPA, which takes an instantiated DSPy module as input, here we want to evolve the full DSPy program. Hence, a candidate here is the program's source code as a string. The seed program does not need to be sophisticated; it just needs to demonstrate the expected input/output interface and, possibly, the available tools. You can also include any additional information about the environment as a comment.

In [None]:
# Seed candidate: the DSPy program as source code. It binds `program` to a
# ChainOfThought module over the "puzzle -> solution" signature.
# NOTE(review): presumably the adapter looks for the module-level name
# `program` when it runs a candidate -- confirm against DspyAdapter.
program_src = """import dspy
program = dspy.ChainOfThought("puzzle -> solution")"""

GEPA interfaces with external frameworks through an adapter. In this case, we integrate GEPA with a DspyAdapter.

In [None]:
from gepa.adapters.dspy_full_program_adapter.full_program_adapter import DspyAdapter

In [None]:
def extract_crossword_solution(raw_solution):
    """Return the raw solution text with surrounding whitespace removed.

    No format-specific extraction happens here; parsing is left to the
    scoring function. A falsy input (``None`` or an empty string) yields
    an empty string.
    """
    if not raw_solution:
        return ""
    return raw_solution.strip()

In [None]:
def metric_fn(example, pred, trace=None):
    """Score a predicted crossword solution and describe the result for GEPA.

    Args:
        example: Object whose ``solution`` attribute is the expected solution.
        pred: Prediction object; its ``solution`` attribute is the answer.
        trace: Unused.

    Returns:
        A ``dspy.Prediction`` with ``score`` (the value from the strict
        ``score`` function, or 0.0 on failure) and a ``feedback`` string.
    """
    if not hasattr(pred, 'solution'):
        return dspy.Prediction(score=0.0, feedback="No solution field found in prediction.")

    # A falsy prediction (None / empty) is scored as an empty answer.
    predicted = (pred.solution or "").strip()
    expected = example.solution.strip()

    # Scoring and feedback formatting share one try block, so any failure in
    # either step becomes a zero score carrying the error text.
    try:
        score_value, n_correct, n_total = score(expected, predicted)

        if n_total == 0:
            feedback = "Invalid puzzle format - no fillable squares found."
        elif score_value == 0.0:
            feedback = f"No correct squares filled. Expected {n_total} squares to be filled."
        elif score_value == 1.0:
            feedback = f"Perfect solution! All {n_total} squares filled correctly."
        else:
            feedback = f"Partial solution: {n_correct}/{n_total} squares correct ({score_value:.1%})."
    except Exception as err:
        score_value, feedback = 0.0, f"Error in scoring: {str(err)}"

    return dspy.Prediction(score=score_value, feedback=feedback)

In [None]:
# Test the crosswords scoring logic with a simple example:
# a 3x3 grid followed by its Across and Down answers.
test_solution = """A B C
D E F
G H I

Across:
  1. ABC
  4. DEF
  7. GHI

Down:
  1. ADG
  2. BEH
  3. CFI"""

# Test with correct solution (prediction identical to the expected text).
# NOTE: this rebinds the global `example` set in the earlier inspection cell.
example = dspy.Example(puzzle="test", solution=test_solution)
pred = dspy.Prediction(solution=test_solution)
metric_result = metric_fn(example, pred)
print(f"Perfect match - Score: {metric_result.score}")
print(f"Feedback: {metric_result.feedback}")
print()

# Test with partial solution: the centre square "E" is replaced by "."
# in the grid and in the two answers that cross it.
partial_solution = """A B C
D . F
G H I

Across:
  1. ABC
  4. D.F
  7. GHI

Down:
  1. ADG
  2. B.H
  3. CFI"""
pred = dspy.Prediction(solution=partial_solution)
metric_result = metric_fn(example, pred)
print(f"Partial match - Score: {metric_result.score}")
print(f"Feedback: {metric_result.feedback}")
print()

# Test with empty solution (empty prediction string).
pred = dspy.Prediction(solution="")
metric_result = metric_fn(example, pred)
print(f"Empty solution - Score: {metric_result.score}")
print(f"Feedback: {metric_result.feedback}")

In [None]:
# Alternative model pair, kept for reference:
# SCAFFOLDER_NAME = "openai/o3"
# EXECUTOR_NAME = "openai/gpt-4.1-nano"
# The scaffolder name is used for the reflection LM, the executor name for
# the task LM passed to the adapter.
SCAFFOLDER_NAME = "deepinfra/Qwen/Qwen3-Next-80B-A3B-Thinking"
EXECUTOR_NAME = "deepinfra/Qwen/Qwen2.5-72B-Instruct"

reflection_lm = dspy.LM(model=SCAFFOLDER_NAME, max_tokens=32000, temperature=1.0)
adapter = DspyAdapter(
    task_lm=dspy.LM(model=EXECUTOR_NAME, max_tokens=32000, temperature=1.0),
    metric_fn=metric_fn,
    num_threads=80,
    # Wrap the LM so the callable yields only the first element of its result.
    reflection_lm=lambda x: reflection_lm(x)[0],
)

Let's evaluate the base program

In [None]:
# Evaluate the unoptimized seed program on the test split.
# NOTE: `o` is rebound to the optimization result in a later cell.
o = adapter.evaluate(dataset.test, {"program": program_src})

The base program's score is shown in the output above.

Let's launch the GEPA optimization.

In [None]:
from gepa import optimize

# Run GEPA starting from the seed program source. The same reflection
# callable that was given to the adapter is passed here as well.
o = optimize(
    seed_candidate={"program": program_src},
    trainset=dataset.train,
    valset=dataset.dev,
    adapter=adapter,
    reflection_lm=lambda x: reflection_lm(x)[0],
    max_metric_calls=500,
    reflection_minibatch_size=1,
    display_progress_bar=True,
)

Let's see the DSPy program found by GEPA

In [None]:
# Print the source code of the best candidate program from the optimization result.
print(o.best_candidate["program"])

In [None]:
# Display the full result object returned by `optimize`.
o

Evaluating the optimized program

In [None]:
# Evaluate the best candidate on the same test split used for the base
# program; the return value is discarded.
_ = adapter.evaluate(dataset.test, o.best_candidate)

We can see the optimization progress for crossword puzzle solving!