In [None]:
from dotenv import load_dotenv
import dspy
import mlflow

from agentic_system.agents import CFEfficacyAgent
from agentic_system.litl_data.litl_utils import load_efficacy_devset

# --- MLflow tracking -------------------------------------------------------
# A tracking server has to be running before this cell is executed, e.g.:
#   mlflow server --backend-store-uri sqlite:///mydb.sqlite
# The URI below must match the address that server listens on.
MLFLOW_TRACKING_URI = "http://127.0.0.1:5000"
# Experiment under which runs from this notebook are recorded.
MLFLOW_EXPERIMENT_NAME = "Eval Agent"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)
mlflow.autolog()
mlflow.tracing.disable_notebook_display()

# --- Language model --------------------------------------------------------
# Read environment variables from the .env file one directory above the
# notebook (presumably the Gemini API credentials -- confirm).
load_dotenv("../.env")

# cache=False means every call goes to the model instead of a cached response.
LM_OPTIONS = dict(temperature=0.5, cache=False, max_tokens=25000)
lm = dspy.LM("gemini/gemini-2.5-pro", **LM_OPTIONS)

# track_usage=True is passed so usage is recorded alongside the calls.
dspy.settings.configure(lm=lm, track_usage=True)

In [None]:
# Number of copies of the dev set to concatenate before sampling.
DEVSET_REPEATS = 5

efficacy_devset_base = load_efficacy_devset(uniform_efficacy=True)
# Assumes the loader returns a list, so `*` repeats it rather than scaling
# values -- TODO confirm against load_efficacy_devset.
efficacy_devset = efficacy_devset_base * DEVSET_REPEATS
len(efficacy_devset)

In [None]:
from dspy.evaluate import Evaluate
import random


def efficacy_accuracy(example, pred, trace=None):
    """Score a prediction as one minus its absolute efficacy error.

    Args:
        example: Reference record; its ``cf_efficacy`` attribute is the target.
        pred: Model output; ``predicted_efficacy`` is coerced with ``float()``
            before comparison, so numeric strings are accepted.
        trace: Not used by this function.

    Returns:
        ``1 - abs(example.cf_efficacy - float(pred.predicted_efficacy))``.
        A perfect prediction scores 1; larger errors score lower.
    """
    target = example.cf_efficacy
    predicted = float(pred.predicted_efficacy)
    deviation = abs(target - predicted)
    return 1 - deviation


# Number of dev-set examples evaluated per run.
N_SAMPLES = 10
# Fixed seed so Restart & Run All evaluates the same subset every time.
SAMPLE_SEED = 0

# A dedicated Random instance keeps the draw reproducible without touching the
# global `random` state. min() avoids the ValueError random.sample raises when
# asked for more items than the dev set contains.
efficacy_devset_subset = random.Random(SAMPLE_SEED).sample(
    efficacy_devset,
    min(N_SAMPLES, len(efficacy_devset)),
)

evaluator = Evaluate(
    devset=efficacy_devset_subset,
    num_threads=10,
    display_progress=True,
    provide_traceback=True,
)

# Run the agent over the sampled subset, scoring each prediction with
# efficacy_accuracy; `evaluation` is consumed by the summary cell below.
efficacy_agent = CFEfficacyAgent()
evaluation = evaluator(efficacy_agent, metric=efficacy_accuracy)

In [None]:
# Total spend over every LM call recorded in lm.history. Entries may carry
# cost=None; those are skipped so sum() does not raise TypeError.
cost = sum(call["cost"] for call in lm.history if call["cost"] is not None)

errors = []
confidences = []
# Each result is indexed as (example, prediction, ...); anything after the
# first two fields is ignored here.
for example, prediction, *_ in evaluation.results:
    # Coerce with float() exactly as efficacy_accuracy does, so a prediction
    # that was scored successfully cannot fail here for arriving as text.
    errors.append(abs(example.cf_efficacy - float(prediction.predicted_efficacy)))
    confidences.append(float(prediction.confidence))

# Report NaN instead of raising ZeroDivisionError when there are no results.
mean_abs_error = sum(errors) / len(errors) if errors else float("nan")
mean_confidence = sum(confidences) / len(confidences) if confidences else float("nan")

print("Total Cost (USD):", cost)
print("Mean Absolute Error:", mean_abs_error)
print("Mean Confidence:", mean_confidence)