### Imports and loading 

In [None]:
import asyncio
import sys
import os
from pathlib import Path
import dspy
from dspy import GEPA
from dotenv import load_dotenv

# Add project root to path for prompt_optimization imports
import roma_dspy
project_root = Path(roma_dspy.__file__).parent.parent.parent
sys.path.insert(0, str(project_root))

# Now import from prompt_optimization (at project root)
from prompt_optimization.seed_prompts import (
    ATOMIZER_PROMPT,
    ATOMIZER_DEMOS,
    PLANNER_PROMPT,
    PLANNER_DEMOS,
    AGGREGATOR_PROMPT,
)
from prompt_optimization import (
    get_default_config,
    LMConfig,
    patch_romaconfig,
    load_aimo_datasets,
    ComponentJudge,
    MetricWithFeedback,
    create_optimizer,
)

# Import from roma_dspy package
from roma_dspy.config import ConfigManager
from roma_dspy.core.engine.solve import RecursiveSolver
from roma_dspy.core.modules.recursive_solver import RecursiveSolverModule
from roma_dspy.utils.async_executor import AsyncParallelExecutor

dspy.settings.provide_traceback = True

# Load config with absolute path
config_manager = ConfigManager(config_dir=project_root / "config")
opt_cfg = config_manager.load_config(profile="test")

# Load environment variables
load_dotenv(project_root / '.env')

In [19]:
opt_cfg

ROMAConfig(project='roma-dspy', version='0.1.0', environment='development', agents=AgentsConfig(atomizer=AgentConfig(llm=LLMConfig(model='openrouter/google/gemini-2.5-flash', temperature=0.7, max_tokens=4000, timeout=30, api_key=None, base_url=None, num_retries=3, cache=True, rollout_id=None), prediction_strategy='chain_of_thought', toolkits=[], enabled=True, type=None, task_type=None, signature=None, signature_instructions=None, agent_config={}, strategy_config={}), planner=AgentConfig(llm=LLMConfig(model='openrouter/google/gemini-2.5-flash', temperature=0.4, max_tokens=32000, timeout=30, api_key=None, base_url=None, num_retries=3, cache=True, rollout_id=None), prediction_strategy='chain_of_thought', toolkits=[], enabled=True, type=None, task_type=None, signature=None, signature_instructions=None, agent_config={'max_subtasks': 15}, strategy_config={}), executor=AgentConfig(llm=LLMConfig(model='cerebras/gpt-oss-120b', temperature=0.75, max_tokens=128000, timeout=30, api_key=None, base_

### Config LLMS

In [2]:
opt_cfg.atomizer_lm = LMConfig("cerebras/qwen-3-235b-a22b-instruct-2507", temperature=0.35, max_tokens=128_000)
opt_cfg.planner_lm = LMConfig("cerebras/qwen-3-235b-a22b-instruct-2507", temperature=0.3, max_tokens=128_000)
opt_cfg.executor_lm = LMConfig("cerebras/gpt-oss-120b", temperature=0.6, max_tokens=128_000)
opt_cfg.aggregator_lm = LMConfig("cerebras/gpt-oss-120b", temperature=0.4, max_tokens=64_000)
opt_cfg.judge_lm = LMConfig("openrouter/anthropic/claude-sonnet-4.5", temperature=0.75, max_tokens=128_000, cache=True)
opt_cfg.reflection_lm = LMConfig("openrouter/anthropic/claude-sonnet-4.5", temperature=0.9, max_tokens=64_000)

In [4]:
# Batch the knobs you used to tweak in the notebook.
opt_cfg.train_size = 32
opt_cfg.val_size = 8
opt_cfg.test_size = 8
opt_cfg.dataset_seed = 42
opt_cfg.max_metric_calls = 225
opt_cfg.num_threads = 8
opt_cfg.max_parallel = 4
opt_cfg.concurrency = 4
opt_cfg.max_depth = 1
opt_cfg.enable_logging = True

In [5]:
#Add few-shot examples + prompts
opt_cfg.agents.atomizer.signature_instructions = ATOMIZER_PROMPT
opt_cfg.agents.planner.signature_instructions = PLANNER_PROMPT
opt_cfg.agents.aggregator.signature_instructions = AGGREGATOR_PROMPT

### Init solvers and what not

In [7]:
solver = RecursiveSolver(
    config=opt_cfg,
    max_depth=opt_cfg.max_depth,
    enable_logging=opt_cfg.enable_logging,
    enable_checkpoints=False,
)
solver_module = RecursiveSolverModule(solver=solver)

[32m2025-10-17 13:28:09.973[0m | [34m[1mDEBUG   [0m | [36mroma_dspy.core.factory.agent_factory[0m:[36m_resolve_signature[0m:[36m119[0m - [34m[1mUsing default signature for atomizer[0m
[32m2025-10-17 13:28:09.975[0m | [1mINFO    [0m | [36mroma_dspy.core.factory.agent_factory[0m:[36mcreate_agent[0m:[36m91[0m - [1mCreated atomizer agent (task_type=default, signature=default)[0m
[32m2025-10-17 13:28:09.975[0m | [34m[1mDEBUG   [0m | [36mroma_dspy.core.registry.agent_registry[0m:[36mregister_agent[0m:[36m180[0m - [34m[1mRegistered atomizer instance #6 (task_type=default)[0m
[32m2025-10-17 13:28:09.976[0m | [34m[1mDEBUG   [0m | [36mroma_dspy.core.factory.agent_factory[0m:[36m_resolve_signature[0m:[36m119[0m - [34m[1mUsing default signature for planner[0m
[32m2025-10-17 13:28:09.978[0m | [1mINFO    [0m | [36mroma_dspy.core.factory.agent_factory[0m:[36mcreate_agent[0m:[36m91[0m - [1mCreated planner agent (task_type=default, signat

In [8]:
judge = ComponentJudge(lm_config=opt_cfg.judge_lm)  # keyword required after the refactor
metric = MetricWithFeedback(judge)

In [9]:
train_set, val_set, test_set = load_aimo_datasets(
    train_size=opt_cfg.train_size,
    val_size=opt_cfg.val_size,
    test_size=opt_cfg.test_size,
    seed=opt_cfg.dataset_seed,
)

In [11]:
train_set

[Example({'goal': 'Let $\\omega = \\cos\\frac{2\\pi}{7} + i \\cdot \\sin\\frac{2\\pi}{7},$ where $i = \\sqrt{-1}.$ Find the value of the product\\[\\prod_{k=0}^6 \\left(\\omega^{3k} + \\omega^k + 1\\right).\\]', 'solution': 'For any $k\\in Z$, we have,\n\\begin{align*} & \\left( \\omega^{3k} + \\omega^k + 1 \\right) \\left( \\omega^{3\\left( 7 - k \\right)} + \\omega^{\\left( 7 - k \\right)} + 1 \\right) \\\\ & = \\omega^{3 \\cdot 7} + \\omega^{2k + 7} + \\omega^{3k} + \\omega^{-2k + 3 \\cdot 7} + \\omega^7 + \\omega^k + \\omega^{3\\left( 7 - k \\right)} + \\omega^{\\left( 7 - k \\right)} + 1 \\\\ & = 1 + \\omega^{2k} + \\omega^{3k} + \\omega^{-2k} + 1 + \\omega^k + \\omega^{-3k} + \\omega^{-k} + 1 \\\\ & = 2 + \\omega^{-3k} \\sum_{j=0}^6 \\omega^{j k} \\\\ & = 2 + \\omega^{-3k} \\frac{1 - \\omega^{7 k}}{1 - \\omega^k} \\\\ & = 2 . \\end{align*}\nThe second and the fifth equalities follow from the property that $\\omega^7 = 1$.\nTherefore,\n\\begin{align*} \\Pi_{k=0}^6 \\left( \\omega^

### Perform an eval on the val set

In [None]:
# executor = AsyncParallelExecutor(max_concurrency=4)

# results = await executor.execute_batch(solver_module, test_set)

In [None]:
# print(results)

### Prompt tuning stuff

In [10]:
optimizer = GEPA(
    metric=metric,
    # auto="light",
    component_selector="round_robin",
    max_metric_calls=12,
    add_format_failure_as_feedback=True,
    num_threads=5,
    track_stats=True,
    log_dir="logs/aime_test",
    # use_wandb=True,
    # wandb_init_kwargs={"project": "aime_test"},
    reflection_minibatch_size=8,
    reflection_lm=dspy.LM(model="openrouter/anthropic/claude-sonnet-4.5", temperature=.75, max_tokens=128000)
)

In [None]:
optimized_program = optimizer.compile(
    solver_module,
    trainset=train_set,
    valset=val_set,
)

2025/10/17 13:38:54 INFO dspy.teleprompt.gepa.gepa: Running GEPA for approx 12 metric calls of the program. This amounts to 0.30 full evals on the train+val set.
2025/10/17 13:38:54 INFO dspy.teleprompt.gepa.gepa: Using 8 examples for tracking Pareto scores. You can consider using a smaller sample of the valset to allow GEPA to explore more diverse solutions within the same budget. GEPA requires you to provide the smallest valset that is just large enough to match your downstream task distribution, while providing as large trainset as possible.
[32m2025-10-17 13:38:54.896[0m | [34m[1mDEBUG   [0m | [36mroma_dspy.core.modules.recursive_solver[0m:[36mnamed_predictors[0m:[36m115[0m - [34m[1mRecursiveSolverModule.named_predictors exported 5 predictors: ['atomizer__default___predictor.predict', 'planner__default___predictor.predict', 'executor__default___predictor.predict', 'aggregator__default___predictor.predict', 'verifier__default___predictor.predict'][0m
[32m2025-10-17 13

KeyboardInterrupt: 

[32m2025-10-17 13:41:10.262[0m | [1mINFO    [0m | [36mroma_dspy.core.observability.callbacks[0m:[36mon_lm_start[0m:[36m96[0m - [1m[CALLBACK] on_lm_start called for planner (call_id=a7b5ab9214f14fe7accb14891222bad5)[0m
[32m2025-10-17 13:41:10.262[0m | [34m[1mDEBUG   [0m | [36mroma_dspy.core.observability.callbacks[0m:[36m_try_enrich_span[0m:[36m106[0m - [34m[1m[ENRICH] _try_enrich_span called in on_lm_start for planner (enriched=False, mlflow_available=True)[0m
[32m2025-10-17 13:41:10.263[0m | [34m[1mDEBUG   [0m | [36mroma_dspy.core.observability.callbacks[0m:[36m_try_enrich_span[0m:[36m114[0m - [34m[1m[ENRICH] mlflow.get_current_active_span() returned: None[0m
[32m2025-10-17 13:41:10.263[0m | [1mINFO    [0m | [36mroma_dspy.core.observability.callbacks[0m:[36m_try_enrich_span[0m:[36m146[0m - [1mNo active MLflow span in on_lm_start for planner[0m
[32m2025-10-17 13:41:15.523[0m | [1mINFO    [0m | [36mroma_dspy.resilience.decorators

In [None]:
optimized_program