In [None]:
from dotenv import load_dotenv
import dspy
import pandas as pd

from agentic_system.agents import CFEfficacyAgent
from agentic_system.litl_data.litl_utils import load_efficacy_devset

# Read environment variables from the .env file one directory above the notebook.
load_dotenv("../.env")

# Build the language-model client first, then install it as DSPy's default LM.
lm = dspy.LM("gemini/gemini-2.5-flash-lite", temperature=0.5, cache=True)
dspy.configure(lm=lm)

In [None]:
# Number of times each dev example is repeated in the evaluation set.
DEVSET_REPEATS = 5

# Keep the freshly loaded examples under their own name so the repeated set
# below is always derived from the same starting point.
# (Presumably a list of examples, so `*` repeats it -- TODO confirm.)
efficacy_devset_base = load_efficacy_devset()

# NOTE(review): the LM is configured with cache=True in the setup cell; if
# identical requests are answered from that cache, the repeats may not add
# independent samples -- confirm this is intended.
efficacy_devset = efficacy_devset_base * DEVSET_REPEATS
len(efficacy_devset)

In [None]:
from dspy.evaluate import Evaluate


def efficacy_accuracy(example, pred, trace=None):
    """Score a prediction as ``1 - |expected - predicted|``, floored at 0.

    Args:
        example: Object exposing the reference value as ``cf_efficacy``.
        pred: Object exposing the model output as ``predicted_efficacy``
            (a number or a numeric string).
        trace: Unused here; accepted so the function fits the
            ``(example, pred, trace)`` metric signature.

    Returns:
        float: ``1.0`` for an exact match, decreasing linearly with the
        absolute error and never below ``0.0``. A missing or non-numeric
        ``predicted_efficacy`` scores ``0.0`` instead of raising, so one
        malformed output does not abort the whole evaluation run.
    """
    try:
        predicted = float(getattr(pred, "predicted_efficacy", None))
    except (TypeError, ValueError):
        # No usable number in the prediction: count it as a complete miss.
        return 0.0

    abs_error = abs(float(example.cf_efficacy) - predicted)
    # Floor at zero so a wildly out-of-range prediction cannot produce a
    # negative "accuracy" that skews the average.
    return max(0.0, 1.0 - abs_error)


# Evaluation settings gathered in one place; the resulting Evaluate instance
# is reusable across programs and metrics.
evaluator_options = {
    "devset": efficacy_devset,
    "num_threads": 10,
    "display_progress": True,
    "provide_traceback": True,
}
evaluator = Evaluate(**evaluator_options)

# Agent under evaluation; max_iters=5 is passed straight to CFEfficacyAgent.
efficacy_agent = CFEfficacyAgent(max_iters=5)

# Run the agent over the devset, scoring each prediction with efficacy_accuracy.
evaluation = evaluator(efficacy_agent, metric=efficacy_accuracy)
evaluation

In [None]:
# Summarise the run: mean absolute error of the predicted efficacy and the
# mean confidence reported alongside it.
errors = []
confidences = []
skipped = 0
for example, prediction, *_ in evaluation.results:
    try:
        # Coerce to float, as the metric does, so numeric strings are handled
        # the same way here as during scoring.
        predicted = float(prediction.predicted_efficacy)
        confidence = float(prediction.confidence)
    except (AttributeError, TypeError, ValueError):
        # Prediction lacks the fields or holds non-numeric values
        # (presumably a failed example -- TODO confirm); leave it out of the
        # averages rather than crashing the whole summary.
        skipped += 1
        continue
    errors.append(abs(example.cf_efficacy - predicted))
    confidences.append(confidence)

if errors:
    print("Mean Absolute Error:", sum(errors) / len(errors))
    print("Mean Confidence:", sum(confidences) / len(confidences))
else:
    # Avoid dividing by zero when nothing usable came back.
    print("No usable predictions to summarise.")
if skipped:
    print("Skipped predictions:", skipped)