In [1]:
from dotenv import load_dotenv
import dspy

from agentic_system.agents import CFEfficacyAgent
from agentic_system.litl_data.litl_utils import load_efficacy_devset

# Load environment variables from the .env file one directory above the notebook
# (presumably the LM provider credentials -- confirm against the repo's .env template).
load_dotenv("../.env")

# Language model used for every agent call in this notebook.
# NOTE(review): cache=False presumably ensures the repeated devset examples are
# re-sampled instead of being served from the DSPy cache -- confirm.
lm = dspy.LM(
    "gemini/gemini-2.5-pro",
    temperature=0.5,
    cache=False,
    max_tokens=25000,
)

# Register the LM globally and turn on usage tracking.
dspy.settings.configure(lm=lm, track_usage=True)

In [2]:
import random

# Number of times each base devset example is repeated in the evaluated set.
DEVSET_MULTIPLIER = 5
# Set to an int to evaluate a random subsample (quick iteration); None runs the full set.
N_DEVSET_SAMPLES = None
# Seed for the subsample so that a reduced run always picks the same examples.
DEVSET_SAMPLE_SEED = 0

efficacy_devset_base = load_efficacy_devset(uniform_efficacy=True)
# Sequence repetition: every base example appears DEVSET_MULTIPLIER times.
efficacy_devset = efficacy_devset_base * DEVSET_MULTIPLIER
if N_DEVSET_SAMPLES is not None:
    # A dedicated Random instance keeps the draw reproducible without touching
    # the global random state.
    efficacy_devset = random.Random(DEVSET_SAMPLE_SEED).sample(
        efficacy_devset, N_DEVSET_SAMPLES
    )

len(efficacy_devset)

125

In [3]:
from dspy.evaluate import Evaluate
import time

N_THREADS = 10  # passed to Evaluate(num_threads=...) below
MAX_ITERS = 10  # passed to CFEfficacyAgent(max_iters=...) below


def efficacy_accuracy(example, pred, trace=None):
    """Score a prediction as one minus its absolute efficacy error.

    Args:
        example: Reference example; its ``cf_efficacy`` attribute is the target value.
        pred: Agent output; its ``predicted_efficacy`` attribute is coerced with ``float``.
        trace: Not used by this metric; kept as part of the metric signature.

    Returns:
        ``1 - abs(example.cf_efficacy - float(pred.predicted_efficacy))``.
    """
    target = example.cf_efficacy
    predicted = float(pred.predicted_efficacy)
    return 1 - abs(target - predicted)


# Evaluator over the (repeated) devset; runs examples on N_THREADS worker threads,
# shows a progress bar and includes tracebacks when an example fails.
evaluator = Evaluate(
    devset=efficacy_devset,
    num_threads=N_THREADS,
    display_progress=True,
    provide_traceback=True,
)

efficacy_agent = CFEfficacyAgent(max_iters=MAX_ITERS)

# Time the run with a monotonic clock: time.time() follows the wall clock and can
# jump if the system time is adjusted mid-run. Only the difference
# end_time - start_time is meaningful; these are not timestamps.
start_time = time.perf_counter()
evaluation = evaluator(efficacy_agent, metric=efficacy_accuracy)
end_time = time.perf_counter()

Average Metric: 24.54 / 38 (64.6%):  30%|███       | 38/125 [03:58<04:06,  2.83s/it]



Average Metric: 52.09 / 81 (64.3%):  65%|██████▍   | 81/125 [08:19<04:13,  5.75s/it]



Average Metric: 78.04 / 125 (62.4%): 100%|██████████| 125/125 [12:56<00:00,  6.21s/it]

2025/09/17 17:24:10 INFO dspy.evaluate.evaluate: Average Metric: 78.03789642367497 / 125 (62.4%)





In [4]:
# Elapsed time of the evaluation run, in seconds.
runtime = end_time - start_time
# A history entry's cost can be None when no cost was reported for the call;
# count those as 0 instead of letting sum() raise TypeError.
cost = sum(entry["cost"] or 0 for entry in lm.history)

# NOTE(review): presumably the agent's trajectory holds four entries per step
# (thought / tool name / tool args / observation) -- confirm against CFEfficacyAgent.
TRAJECTORY_ENTRIES_PER_STEP = 4

errors = []
confidences = []
step_counts = []
for result in evaluation.results:
    example, pred = result[0], result[1]
    # Coerce with float() exactly as efficacy_accuracy does, so a prediction that
    # arrives as a numeric string is handled the same way in both places.
    errors.append(abs(example.cf_efficacy - float(pred.predicted_efficacy)))
    confidences.append(pred.confidence)
    step_counts.append(len(pred.trajectory) // TRAJECTORY_ENTRIES_PER_STEP)

print(f"Stats from {len(evaluation.results)} runs:")
print("------------------------")
print("Total Runtime (s):", runtime)
print("Total Cost (USD):", cost)
print("Mean Absolute Error:", sum(errors) / len(errors))
print("Mean Confidence:", sum(confidences) / len(confidences))
print("Mean Step Count:", sum(step_counts) / len(step_counts))

Stats from 125 runs:
------------------------
Total Runtime (s): 776.6659922599792
Total Cost (USD): 12.899648437500002
Mean Absolute Error: 0.37569682861060033
Mean Confidence: 0.8344
Mean Step Count: 5.328
