In [None]:
from dotenv import load_dotenv
import dspy
import pandas as pd

from agentic_system.agents import CFEfficacyAgent
from agentic_system.litl_data.litl_utils import load_efficacy_devset

# Load environment variables from the .env file one directory up
# (presumably the provider API key — confirm against the repo's .env template).
load_dotenv("../.env")

# Build the language model first, then register it as DSPy's default LM so
# every module created later in this notebook uses the same settings.
efficacy_lm = dspy.LM(
    "gemini/gemini-2.5-flash-lite",
    temperature=0.5,
    cache=True,
    max_tokens=25000,
)
dspy.configure(lm=efficacy_lm)

In [8]:
# Load the dev set and repeat it three times in a single step, so re-running
# this cell always yields the same size (assumes the loader returns a list).
# NOTE(review): the LM is configured with cache=True, so repeated identical
# requests may be served from cache — confirm the repeats give independent samples.
efficacy_devset = load_efficacy_devset() * 3
len(efficacy_devset)

126

In [9]:
from dspy.evaluate import Evaluate


def efficacy_accuracy(example, pred, trace=None):
    """Score a prediction as ``1 - |reference - predicted|``, floored at 0.

    Args:
        example: Reference example; its ``cf_efficacy`` attribute is the
            target value.
        pred: Model output; its ``predicted_efficacy`` attribute is coerced
            with ``float`` before comparison, so numeric strings are accepted.
        trace: Unused; accepted so the function can be passed as a metric
            that is called with a third argument.

    Returns:
        float: ``1 - absolute error``, never below ``0.0``. A perfect
        prediction scores ``1.0``.

    Raises:
        ValueError, TypeError: If ``predicted_efficacy`` cannot be converted
            to ``float``.
    """
    abs_error = abs(example.cf_efficacy - float(pred.predicted_efficacy))
    # Floor at 0: an out-of-range, infinite or NaN prediction would otherwise
    # produce a negative or NaN score and corrupt the averaged metric.
    # (max(0.0, nan) returns 0.0 because the NaN comparison is False.)
    return max(0.0, 1.0 - abs_error)


# Evaluator options are grouped in one place so they are easy to tune; the
# evaluator object itself can be reused for other programs or metrics.
evaluator_options = {
    "num_threads": 10,
    "display_progress": True,
    "provide_traceback": True,
}
evaluator = Evaluate(devset=efficacy_devset, **evaluator_options)

# Instantiate the agent under test and score it on the dev set with the
# accuracy metric defined in this notebook.
efficacy_agent = CFEfficacyAgent()
evaluation = evaluator(efficacy_agent, metric=efficacy_accuracy)

Average Metric: 11.86 / 22 (53.9%):  17%|█▋        | 22/126 [00:42<02:54,  1.68s/it]



Average Metric: 39.08 / 60 (65.1%):  48%|████▊     | 60/126 [01:36<01:22,  1.24s/it]



Average Metric: 67.65 / 100 (67.6%):  79%|███████▉  | 100/126 [02:41<00:27,  1.06s/it]



Average Metric: 90.84 / 126 (72.1%): 100%|██████████| 126/126 [03:27<00:00,  1.65s/it]

2025/09/15 15:36:19 INFO dspy.evaluate.evaluate: Average Metric: 90.84299587400342 / 126 (72.1%)





In [10]:
# Collect the per-example absolute error and the confidence reported on each
# prediction, then print their means.
errors = []
confidences = []
for example, pred, *_ in evaluation.results:
    # Coerce with float() exactly as efficacy_accuracy does, so a string-valued
    # prediction that the metric accepted does not raise TypeError here.
    errors.append(abs(example.cf_efficacy - float(pred.predicted_efficacy)))
    confidences.append(float(pred.confidence))

if errors:
    print("Mean Absolute Error:", sum(errors) / len(errors))
    print("Mean Confidence:", sum(confidences) / len(confidences))
else:
    # Avoid ZeroDivisionError when the evaluation produced no results.
    print("No evaluation results to summarize.")

Mean Absolute Error: 0.2790238422698142
Mean Confidence: 0.1119047619047619


In [12]:
# Show the recorded thought / tool call / observation steps for the first
# evaluated example (index 1 of each result entry is the prediction).
first_result = evaluation.results[0]
first_result[1].trajectory

{'thought_0': "The user wants to estimate the efficacy of Luminespib in reversing the activated cardiac fibroblast phenotype. The efficacy is defined as a metric between 0 and 1 based on a custom in vitro assay. To determine this, I need to find information about Luminespib's bioactivities, specifically any data related to its effect on fibroblasts or related cellular processes. I will start by searching for Luminespib in ChEMBL to get its ChEMBL ID, and then use that ID to retrieve its bioactivity summary.",
 'tool_name_0': 'CHEMBL__search_chembl_id',
 'tool_args_0': {'query': 'Luminespib'},
 'observation_0': "Found 2 compound(s) matching 'Luminespib': \n - CHEMBL3137334 (LUMINESPIB MESYLATE)\n - CHEMBL252164 (LUMINESPIB)",
 'thought_1': 'I have found two ChEMBL IDs for Luminespib: CHEMBL3137334 and CHEMBL252164. To get the bioactivity summary, I should use the ID that is most likely to represent the compound itself, which is CHEMBL252164 (LUMINESPIB). I will then use `CHEMBL__get_com