In [None]:
from dotenv import load_dotenv
import dspy
import pandas as pd

from agentic_system.agents import CFEfficacyAgent
from agentic_system.litl_data.litl_utils import load_efficacy_devset

# Read environment variables from the .env file one directory up
# (presumably where the Gemini API credentials live -- TODO confirm).
load_dotenv("../.env")

# Language model used for every DSPy call in this notebook. Caching is off
# and the temperature is non-zero, so repeated runs on the same input can
# produce different predictions.
efficacy_lm = dspy.LM(
    "gemini/gemini-2.5-flash-lite",
    temperature=0.5,
    cache=False,
    max_tokens=25000,
)
dspy.configure(lm=efficacy_lm)

In [3]:
# Load the dev set and repeat it five times so each example is evaluated
# five times (presumably to average over sampling noise from the
# non-deterministic LM -- TODO confirm intent).
efficacy_devset = load_efficacy_devset(uniform_efficacy=True) * 5
len(efficacy_devset)

125

In [3]:
from dspy.evaluate import Evaluate


def efficacy_accuracy(example, pred, trace=None):
    """Score a prediction as ``1 - |target - predicted|``, clamped to ``[0, 1]``.

    Args:
        example: Dev-set example; ``example.cf_efficacy`` is the target value.
        pred: Agent output; ``pred.predicted_efficacy`` is coerced with
            ``float`` before comparison, so numeric strings are accepted.
        trace: Unused. Accepted so the function can be passed as a metric
            to the evaluator.

    Returns:
        A float in ``[0, 1]``: 1.0 for an exact match, decreasing linearly
        with the absolute error, and 0.0 once the absolute error exceeds 1
        or is NaN.

    Raises:
        TypeError, ValueError: If ``pred.predicted_efficacy`` cannot be
            converted by ``float``.
    """
    abs_error = abs(example.cf_efficacy - float(pred.predicted_efficacy))
    # Written as `not (<=)` rather than `>` so that a NaN error also takes
    # this branch; otherwise NaN or a negative score would corrupt the
    # averaged metric.
    if not abs_error <= 1.0:
        return 0.0
    return 1 - abs_error


# Reusable evaluation harness: runs a program over the repeated dev set on
# 10 threads, shows a progress bar, and prints tracebacks for failures.
evaluator_settings = {
    "devset": efficacy_devset,
    "num_threads": 10,
    "display_progress": True,
    "provide_traceback": True,
}
evaluator = Evaluate(**evaluator_settings)

# Agent under test, scored with the accuracy metric defined above.
efficacy_agent = CFEfficacyAgent()
evaluation = evaluator(efficacy_agent, metric=efficacy_accuracy)

Average Metric: 10.20 / 19 (53.7%):  15%|█▌        | 19/126 [01:02<08:03,  4.52s/it]



Average Metric: 15.23 / 25 (60.9%):  20%|█▉        | 25/126 [01:19<05:21,  3.18s/it]



Average Metric: 16.19 / 26 (62.3%):  21%|██        | 26/126 [01:21<04:39,  2.80s/it]



Average Metric: 22.00 / 33 (66.7%):  26%|██▌       | 33/126 [01:40<04:05,  2.64s/it]



Average Metric: 29.51 / 47 (62.8%):  37%|███▋      | 47/126 [01:57<01:40,  1.27s/it]



Average Metric: 30.11 / 49 (61.4%):  39%|███▉      | 49/126 [02:01<02:10,  1.70s/it]



Average Metric: 31.26 / 51 (61.3%):  40%|████      | 51/126 [02:04<01:55,  1.54s/it]



Average Metric: 38.41 / 60 (64.0%):  48%|████▊     | 60/126 [02:16<00:47,  1.40it/s]



Average Metric: 39.26 / 61 (64.4%):  48%|████▊     | 61/126 [02:18<01:12,  1.11s/it]



Average Metric: 46.70 / 69 (67.7%):  55%|█████▍    | 69/126 [02:32<01:48,  1.91s/it]



Average Metric: 58.83 / 86 (68.4%):  68%|██████▊   | 86/126 [03:00<01:13,  1.84s/it]



Average Metric: 59.11 / 87 (67.9%):  69%|██████▉   | 87/126 [03:01<01:00,  1.55s/it]



Average Metric: 60.26 / 91 (66.2%):  72%|███████▏  | 91/126 [03:09<01:18,  2.23s/it]



Average Metric: 61.41 / 93 (66.0%):  74%|███████▍  | 93/126 [03:12<00:55,  1.69s/it]



Average Metric: 62.14 / 94 (66.1%):  75%|███████▍  | 94/126 [03:13<00:52,  1.64s/it]



Average Metric: 67.02 / 101 (66.4%):  80%|████████  | 101/126 [03:23<00:28,  1.16s/it]



Average Metric: 72.25 / 107 (67.5%):  85%|████████▍ | 107/126 [03:28<00:22,  1.20s/it]



Average Metric: 78.92 / 114 (69.2%):  90%|█████████ | 114/126 [03:39<00:12,  1.03s/it]



Average Metric: 83.74 / 119 (70.4%):  94%|█████████▍| 119/126 [03:48<00:10,  1.55s/it]



Average Metric: 88.65 / 124 (71.5%):  98%|█████████▊| 123/126 [03:56<00:04,  1.38s/it]



Average Metric: 89.63 / 125 (71.7%):  99%|█████████▉| 125/126 [04:00<00:01,  1.67s/it]



Average Metric: 90.61 / 126 (71.9%): 100%|██████████| 126/126 [04:37<00:00,  2.20s/it]

2025/09/15 16:24:20 INFO dspy.evaluate.evaluate: Average Metric: 90.61416666110323 / 126 (71.9%)





In [4]:
# Each entry of `evaluation.results` is indexed as (example, prediction, ...).
scored_pairs = [(result[0], result[1]) for result in evaluation.results]

# Absolute error per example. The prediction is coerced with float() so this
# summary agrees with `efficacy_accuracy`, which applies the same coercion.
errors = [
    abs(example.cf_efficacy - float(prediction.predicted_efficacy))
    for example, prediction in scored_pairs
]
# Confidence reported by the agent for each prediction.
confidences = [prediction.confidence for _, prediction in scored_pairs]

print("Mean Absolute Error:", sum(errors) / len(errors))
print("Mean Confidence:", sum(confidences) / len(confidences))

Mean Absolute Error: 0.2808399471341014
Mean Confidence: 0.06031746031746032


In [5]:
# Show the step-by-step trajectory (thoughts, tool calls, observations)
# recorded on the first evaluated prediction.
first_prediction = evaluation.results[0][1]
first_prediction.trajectory

{'thought_0': "The user wants to estimate the efficacy of Luminespib. The provided description details a custom in vitro assay with a specific readout and efficacy metric. To estimate efficacy, I need to find data related to Luminespib's activity in biological assays. I will start by searching for Luminespib in ChEMBL to get its ChEMBL ID, which can then be used to retrieve its bioactivities.",
 'tool_name_0': 'CHEMBL__search_chembl_id',
 'tool_args_0': {'query': 'Luminespib'},
 'observation_0': "Found 2 compound(s) matching 'Luminespib': \n - CHEMBL3137334 (LUMINESPIB MESYLATE)\n - CHEMBL252164 (LUMINESPIB)",
 'thought_1': 'I have found two ChEMBL IDs for Luminespib: CHEMBL3137334 and CHEMBL252164. To proceed with estimating efficacy, I need to retrieve bioactivity data for Luminespib. I will use the ChEMBL ID CHEMBL252164, as it appears to be the base compound. I will use the `get_compound_bioactivities` tool to fetch relevant data.',
 'tool_name_1': 'CHEMBL__get_compound_bioactiviti