In [4]:
from dotenv import load_dotenv
import dspy
import pandas as pd

from agentic_system.agents import CFEfficacyAgent
from agentic_system.litl_data.litl_utils import load_efficacy_devset

# Read environment variables from the .env file one directory above this
# notebook (presumably the LM provider credentials -- confirm against .env).
load_dotenv("../.env")

# Build the language model once, then register it as DSPy's default LM.
gemini_lm = dspy.LM(
    "gemini/gemini-2.5-pro",
    temperature=0.5,
    cache=False,
    max_tokens=25000,
)
dspy.configure(lm=gemini_lm)

In [5]:
# Load the dev set and repeat it five times in a single step; the trailing
# expression displays the resulting number of examples.
efficacy_devset = load_efficacy_devset(uniform_efficacy=True) * 5
len(efficacy_devset)

125

In [6]:
from dspy.evaluate import Evaluate


def efficacy_accuracy(example, pred, trace=None):
    """Score a prediction as 1 minus its absolute efficacy error, floored at 0.

    Args:
        example: Object exposing the reference value as ``cf_efficacy``.
        pred: Object exposing the model output as ``predicted_efficacy``;
            the value is passed through ``float()``, so numeric strings
            are accepted.
        trace: Unused; accepted so the function can be called with the
            ``(example, pred, trace)`` metric signature.

    Returns:
        ``1 - abs(cf_efficacy - predicted_efficacy)``, never below ``0.0``.

    Raises:
        TypeError, ValueError: If ``predicted_efficacy`` cannot be
            converted by ``float()``.
    """
    abs_error = abs(example.cf_efficacy - float(pred.predicted_efficacy))
    # Floor at zero: a prediction on the wrong scale (e.g. 85 instead of
    # 0.85) would otherwise yield a large negative score and distort the
    # averaged metric.
    return max(0.0, 1 - abs_error)


# Options shared by every run of this evaluator: thread count, progress bar,
# and tracebacks for failures.
evaluator_options = dict(
    num_threads=10,
    display_progress=True,
    provide_traceback=True,
)

# Reusable evaluator over only the first five dev examples.
evaluator = Evaluate(devset=efficacy_devset[:5], **evaluator_options)

# Instantiate the agent under test and score it with the accuracy metric.
efficacy_agent = CFEfficacyAgent()
evaluation = evaluator(efficacy_agent, metric=efficacy_accuracy)

Average Metric: 3.02 / 4 (75.5%):  80%|████████  | 4/5 [01:37<00:22, 22.56s/it]



Average Metric: 3.44 / 5 (68.8%): 100%|██████████| 5/5 [03:52<00:00, 46.55s/it]

2025/09/15 16:44:23 INFO dspy.evaluate.evaluate: Average Metric: 3.4379948035053847 / 5 (68.8%)





In [7]:
# Summarise the evaluation run: mean absolute efficacy error and mean
# confidence across all results. Each entry of evaluation.results is read
# positionally as (example, prediction, ...).
errors = []
confidences = []
for example, pred, *_ in evaluation.results:
    # Cast with float() exactly as efficacy_accuracy does, so a prediction
    # returned as a numeric string does not raise TypeError on subtraction.
    errors.append(abs(example.cf_efficacy - float(pred.predicted_efficacy)))
    confidences.append(float(pred.confidence))

if errors:
    print("Mean Absolute Error:", sum(errors) / len(errors))
    print("Mean Confidence:", sum(confidences) / len(confidences))
else:
    # Avoid ZeroDivisionError when the evaluator produced no results.
    print("No evaluation results to summarise.")

Mean Absolute Error: 0.31240103929892304
Mean Confidence: 0.6900000000000001
