In [None]:
from dotenv import load_dotenv
import dspy

from agentic_system.agents import CFEfficacyAgent
from agentic_system.litl_data.litl_utils import load_efficacy_devset

# Tunable settings, named in the same UPPER_SNAKE_CASE style as the other
# configuration constants in this notebook.
ENV_PATH = "../.env"
LM_MODEL = "gemini/gemini-2.5-pro"
LM_TEMPERATURE = 0.5
LM_MAX_TOKENS = 25000

# Load environment variables from the .env file one directory up
# (presumably the model provider's API key lives there — confirm).
load_dotenv(ENV_PATH)

# cache=False turns off DSPy's response cache for this LM.
# NOTE(review): presumably so that repeated copies of the same devset example
# get independently sampled answers at a non-zero temperature — confirm.
lm = dspy.LM(
    LM_MODEL,
    temperature=LM_TEMPERATURE,
    cache=False,
    max_tokens=LM_MAX_TOKENS,
)

# Make `lm` the default language model and enable usage tracking.
dspy.settings.configure(
    lm=lm,
    track_usage=True,
)

In [None]:
import random

# How many times the loaded devset is repeated before evaluation.
DEVSET_MULTIPLIER = 5
# Set to an int to evaluate on a random subset (fast iteration);
# None keeps the full, multiplied devset.
N_DEVSET_SAMPLES = None
# Seed for the optional subsampling so a re-run picks the same subset.
DEVSET_SEED = 0

efficacy_devset = load_efficacy_devset(uniform_efficacy=True)
# Assumes the loader returns a list, so `*` repeats its items — TODO confirm.
efficacy_devset = efficacy_devset * DEVSET_MULTIPLIER

if N_DEVSET_SAMPLES is not None:
    # A dedicated Random instance keeps the global random state untouched;
    # min() avoids a ValueError when asking for more samples than exist.
    efficacy_devset = random.Random(DEVSET_SEED).sample(
        efficacy_devset, min(N_DEVSET_SAMPLES, len(efficacy_devset))
    )

len(efficacy_devset)

In [None]:
from dspy.evaluate import Evaluate
import time

# Passed as `num_threads` to the Evaluate harness below.
N_THREADS = 10
# Passed as `max_iters` to CFEfficacyAgent below.
MAX_ITERS = 10


def efficacy_accuracy(example, pred, trace=None):
    """Score a prediction as one minus its absolute efficacy error.

    Args:
        example: Reference item; its ``cf_efficacy`` attribute is the target.
        pred: Model output; its ``predicted_efficacy`` attribute is converted
            with ``float()`` before comparison.
        trace: Unused; accepted so the function matches the metric signature.

    Returns:
        ``1 - abs(example.cf_efficacy - float(pred.predicted_efficacy))``.
    """
    target = example.cf_efficacy
    guess = float(pred.predicted_efficacy)
    return 1 - abs(target - guess)


# Evaluation harness: runs a program over `efficacy_devset` on N_THREADS
# worker threads, showing a progress bar and tracebacks for failures.
evaluator = Evaluate(
    devset=efficacy_devset,
    num_threads=N_THREADS,
    display_progress=True,
    provide_traceback=True,
)

efficacy_agent = CFEfficacyAgent(max_iters=MAX_ITERS)

# perf_counter() is monotonic, so the measured interval cannot be skewed by
# system clock adjustments the way time.time() differences can.
start_time = time.perf_counter()
evaluation = evaluator(efficacy_agent, metric=efficacy_accuracy)
end_time = time.perf_counter()

In [None]:
# Elapsed seconds between the two timestamps taken around the evaluation call.
runtime = end_time - start_time
# History entries may carry cost=None (e.g. when the provider reports no
# cost); skip those so the sum does not raise a TypeError.
cost = sum(entry["cost"] for entry in lm.history if entry["cost"] is not None)

# Number of trajectory entries that make up one agent step.
# NOTE(review): presumably thought / tool name / tool args / observation per
# step — confirm against CFEfficacyAgent.
TRAJECTORY_KEYS_PER_STEP = 4


def _mean(values):
    """Return the arithmetic mean of `values`, or NaN when it is empty."""
    return sum(values) / len(values) if values else float("nan")


errors = []
confidences = []
step_counts = []
n_failed = 0
for result in evaluation.results:
    example, pred = result[0], result[1]
    # Runs that failed may carry a prediction without any output fields;
    # count them instead of crashing on the attribute lookup.
    raw_efficacy = getattr(pred, "predicted_efficacy", None)
    if raw_efficacy is None:
        n_failed += 1
        continue
    # float() mirrors the conversion done in the efficacy_accuracy metric.
    errors.append(abs(example.cf_efficacy - float(raw_efficacy)))
    confidences.append(pred.confidence)
    step_counts.append(len(pred.trajectory) // TRAJECTORY_KEYS_PER_STEP)

print(f"Stats from {len(evaluation.results)} runs:")
print("------------------------")
print("Total Runtime (s):", runtime)
print("Total Cost (USD):", cost)
if n_failed:
    print("Runs Without Prediction:", n_failed)
print("Mean Absolute Error:", _mean(errors))
print("Mean Confidence:", _mean(confidences))
print("Mean Step Count:", _mean(step_counts))

In [None]:
# Display the trajectory stored on the prediction (index 1) of the first evaluation result.
evaluation.results[0][1].trajectory