From 947634de8455d75471cc1d6c5b277dc54b48cd0c Mon Sep 17 00:00:00 2001 From: Wenyueh Date: Sun, 22 Mar 2026 14:29:29 -0400 Subject: [PATCH 1/7] pareto graph --- examples/ag2_example.py | 1 + examples/crewai_example.py | 1 + examples/custom_agent_example.py | 1 + examples/langchain_example.py | 1 + examples/langgraph_example.py | 1 + examples/llamaindex_example.py | 1 + examples/openai_sdk_example.py | 1 + pyproject.toml | 1 + src/agentopt/model_selection/base.py | 157 +++++++++++++++++++++++++++ 9 files changed, 165 insertions(+) diff --git a/examples/ag2_example.py b/examples/ag2_example.py index 6477bd6..b673694 100644 --- a/examples/ag2_example.py +++ b/examples/ag2_example.py @@ -187,6 +187,7 @@ def main(): parallel=args.parallel, max_concurrent=args.max_concurrent ) results.print_summary() + results.plot_pareto() best = results.get_best_combo() if best: diff --git a/examples/crewai_example.py b/examples/crewai_example.py index 31e0e81..1841c0a 100644 --- a/examples/crewai_example.py +++ b/examples/crewai_example.py @@ -195,6 +195,7 @@ def main(): parallel=args.parallel, max_concurrent=args.max_concurrent ) results.print_summary() + results.plot_pareto() best = results.get_best_combo() if best: diff --git a/examples/custom_agent_example.py b/examples/custom_agent_example.py index 33c5c47..8191f41 100644 --- a/examples/custom_agent_example.py +++ b/examples/custom_agent_example.py @@ -160,6 +160,7 @@ def main(): parallel=args.parallel, max_concurrent=args.max_concurrent ) results.print_summary() + results.plot_pareto() # Export optimized config best = results.get_best_combo() diff --git a/examples/langchain_example.py b/examples/langchain_example.py index 1855c89..1323f7d 100644 --- a/examples/langchain_example.py +++ b/examples/langchain_example.py @@ -175,6 +175,7 @@ def main(): parallel=args.parallel, max_concurrent=args.max_concurrent ) results.print_summary() + results.plot_pareto() best = results.get_best_combo() if best: diff --git a/examples/langgraph_example.py b/examples/langgraph_example.py index 1c31265..0d10cf6 100644 --- a/examples/langgraph_example.py +++ b/examples/langgraph_example.py @@ -195,6 +195,7 @@ def main(): parallel=args.parallel, max_concurrent=args.max_concurrent ) results.print_summary() + results.plot_pareto() best = results.get_best_combo() if best: diff --git a/examples/llamaindex_example.py b/examples/llamaindex_example.py index 8d885f6..8a21d69 100644 --- a/examples/llamaindex_example.py +++ b/examples/llamaindex_example.py @@ -190,6 +190,7 @@ def main(): parallel=args.parallel, max_concurrent=args.max_concurrent ) results.print_summary() + results.plot_pareto() best = results.get_best_combo() if best: diff --git a/examples/openai_sdk_example.py b/examples/openai_sdk_example.py index 406f9c5..0691dfb 100644 --- a/examples/openai_sdk_example.py +++ b/examples/openai_sdk_example.py @@ -179,6 +179,7 @@ def main(): parallel=args.parallel, max_concurrent=args.max_concurrent ) results.print_summary() + results.plot_pareto() best = results.get_best_combo() if best: diff --git a/pyproject.toml b/pyproject.toml index 1104e5f..e496f4c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,6 +47,7 @@ examples = [ "openai-agents", "python-dotenv", ] +plot = ["matplotlib>=3.5"] dev = ["pytest>=8.0"] docs = ["mkdocs-material", "mkdocstrings[python]"] diff --git a/src/agentopt/model_selection/base.py b/src/agentopt/model_selection/base.py index 722554a..5d0329f 100644 --- a/src/agentopt/model_selection/base.py +++ b/src/agentopt/model_selection/base.py @@ -504,6 +504,163 @@ def print_summary(self) -> None: """Print the formatted summary table of all results.""" print(self) + # ------------------------------------------------------------------ + # Pareto frontier visualisation + # ------------------------------------------------------------------ + + @staticmethod + def _pareto_mask( + xs: List[float], ys: List[float], x_minimize: bool, y_minimize: bool, + ) -> List[bool]: + """Return a boolean mask marking Pareto-optimal points. + + A point is Pareto-optimal if no other point is strictly better on both + objectives. + """ + n = len(xs) + mask = [True] * n + for i in range(n): + if not mask[i]: + continue + for j in range(n): + if i == j or not mask[j]: + continue + xi, yi, xj, yj = xs[i], ys[i], xs[j], ys[j] + # Is j at least as good as i on both, and strictly better on one? + x_ok = (xj <= xi) if x_minimize else (xj >= xi) + y_ok = (yj <= yi) if y_minimize else (yj >= yi) + x_strict = (xj < xi) if x_minimize else (xj > xi) + y_strict = (yj < yi) if y_minimize else (yj > yi) + if x_ok and y_ok and (x_strict or y_strict): + mask[i] = False + break + return mask + + def plot_pareto(self, path: Optional[str] = None) -> None: + """Generate three pairwise Pareto frontier plots. + + Subplots: Accuracy vs Latency, Accuracy vs Price, Latency vs Price. + + Requires ``matplotlib`` (install with ``pip install agentopt[plot]``). + If *path* is given the figure is saved to that file, otherwise + ``plt.show()`` is called. + """ + try: + import matplotlib.pyplot as plt + except ImportError: + raise ImportError( + "matplotlib is required for plot_pareto. " + "Install it with: pip install agentopt[plot]" + ) + + # Deduplicate (same logic as __str__). + seen: Dict[str, "ModelResult"] = {} + for r in self.results: + if r.model_name not in seen or ( + r.is_best and not seen[r.model_name].is_best + ): + seen[r.model_name] = r + unique = [r for r in seen.values() if r.price is not None] + + if len(unique) < 2: + print("Not enough results with pricing data to plot.") + return + + names = [r.model_name for r in unique] + accs = [r.accuracy for r in unique] + lats = [r.latency_seconds for r in unique] + prices = [r.price for r in unique] # type: ignore[misc] + is_best = [r.is_best for r in unique] + + pairs = [ + (accs, lats, "Accuracy", "Latency (s)", False, True), + (accs, prices, "Accuracy", "Price ($)", False, True), + (lats, prices, "Latency (s)", "Price ($)", True, True), + ] + + fig, axes = plt.subplots(1, 3, figsize=(18, 5)) + fig.suptitle("Pareto Frontiers", fontsize=14, fontweight="bold") + + for ax, (xs, ys, xlabel, ylabel, x_min, y_min) in zip(axes, pairs): + mask = self._pareto_mask(xs, ys, x_min, y_min) + + # Non-Pareto points. + np_x = [x for x, m in zip(xs, mask) if not m] + np_y = [y for y, m in zip(ys, mask) if not m] + ax.scatter( + np_x, + np_y, + c="lightgray", + edgecolors="gray", + s=60, + zorder=2, + label="Dominated", + ) + + # Pareto-optimal points. + p_x = [x for x, m in zip(xs, mask) if m] + p_y = [y for y, m in zip(ys, mask) if m] + ax.scatter( + p_x, + p_y, + c="steelblue", + edgecolors="navy", + s=80, + zorder=3, + label="Pareto-optimal", + ) + + # Connect frontier with a line (sorted by x). + if p_x: + order = sorted(range(len(p_x)), key=lambda i: p_x[i]) + ax.plot( + [p_x[i] for i in order], + [p_y[i] for i in order], + c="steelblue", + linewidth=1.5, + alpha=0.6, + zorder=2, + ) + + # Highlight best combo. + for x, y, b, name in zip(xs, ys, is_best, names): + if b: + ax.scatter( + [x], + [y], + c="gold", + edgecolors="darkorange", + s=140, + zorder=4, + marker="*", + label="Best", + ) + + # Labels for all points. + for x, y, name in zip(xs, ys, names): + short = name if len(name) <= 30 else name[:27] + "..." + ax.annotate( + short, + (x, y), + textcoords="offset points", + xytext=(5, 5), + fontsize=6, + alpha=0.8, + ) + + ax.set_xlabel(xlabel) + ax.set_ylabel(ylabel) + ax.legend(fontsize=7, loc="best") + ax.grid(True, alpha=0.3) + + plt.tight_layout() + if path: + fig.savefig(path, dpi=150, bbox_inches="tight") + print(f"Pareto plot saved to {path}") + else: + plt.show() + plt.close(fig) + class BaseModelSelector(ABC): """Abstract base class for model selectors. From 7e3bbf5e458bcc057a02179dcf6beb47ab9f2ec2 Mon Sep 17 00:00:00 2001 From: Wenyueh Date: Sun, 22 Mar 2026 16:08:58 -0400 Subject: [PATCH 2/7] clean graph --- src/agentopt/model_selection/base.py | 46 ++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 9 deletions(-) diff --git a/src/agentopt/model_selection/base.py b/src/agentopt/model_selection/base.py index 5d0329f..676318d 100644 --- a/src/agentopt/model_selection/base.py +++ b/src/agentopt/model_selection/base.py @@ -572,13 +572,19 @@ def plot_pareto(self, path: Optional[str] = None) -> None: prices = [r.price for r in unique] # type: ignore[misc] is_best = [r.is_best for r in unique] + # Build numbered labels: (1), (2), ... + num_labels = [f"({i})" for i in range(1, len(unique) + 1)] + pairs = [ (accs, lats, "Accuracy", "Latency (s)", False, True), (accs, prices, "Accuracy", "Price ($)", False, True), (lats, prices, "Latency (s)", "Price ($)", True, True), ] - fig, axes = plt.subplots(1, 3, figsize=(18, 5)) + fig = plt.figure(figsize=(20, 5)) + # Reserve right margin for the legend. + gs = fig.add_gridspec(1, 3, left=0.04, right=0.75, wspace=0.3) + axes = [fig.add_subplot(gs[0, i]) for i in range(3)] fig.suptitle("Pareto Frontiers", fontsize=14, fontweight="bold") for ax, (xs, ys, xlabel, ylabel, x_min, y_min) in zip(axes, pairs): @@ -623,7 +629,7 @@ def plot_pareto(self, path: Optional[str] = None) -> None: ) # Highlight best combo. - for x, y, b, name in zip(xs, ys, is_best, names): + for x, y, b in zip(xs, ys, is_best): if b: ax.scatter( [x], @@ -636,24 +642,46 @@ def plot_pareto(self, path: Optional[str] = None) -> None: label="Best", ) - # Labels for all points. - for x, y, name in zip(xs, ys, names): - short = name if len(name) <= 30 else name[:27] + "..." + # Number labels on points. + for x, y, lbl in zip(xs, ys, num_labels): ax.annotate( - short, + lbl, (x, y), textcoords="offset points", xytext=(5, 5), - fontsize=6, - alpha=0.8, + fontsize=7, + fontweight="bold", ) + # Invert "lower is better" axes so better is always top-right, + # producing a concave frontier. + if x_min: + ax.invert_xaxis() + if y_min: + ax.invert_yaxis() + ax.set_xlabel(xlabel) ax.set_ylabel(ylabel) ax.legend(fontsize=7, loc="best") ax.grid(True, alpha=0.3) - plt.tight_layout() + # External legend mapping numbers to combo names. + legend_lines = [f"({i}) {name}" for i, name in enumerate(names, 1)] + fig.text( + 0.77, + 0.5, + "\n".join(legend_lines), + fontsize=8, + verticalalignment="center", + fontfamily="monospace", + bbox=dict( + boxstyle="round,pad=0.5", + facecolor="lightyellow", + edgecolor="gray", + alpha=0.9, + ), + ) + if path: fig.savefig(path, dpi=150, bbox_inches="tight") print(f"Pareto plot saved to {path}") From 754a55defd0c7e1bb807467f2265d5c3b48dd550 Mon Sep 17 00:00:00 2001 From: Wenyueh Date: Sun, 22 Mar 2026 16:21:07 -0400 Subject: [PATCH 3/7] clean graph for bandit --- src/agentopt/model_selection/base.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/agentopt/model_selection/base.py b/src/agentopt/model_selection/base.py index 676318d..177323f 100644 --- a/src/agentopt/model_selection/base.py +++ b/src/agentopt/model_selection/base.py @@ -560,7 +560,15 @@ def plot_pareto(self, path: Optional[str] = None) -> None: r.is_best and not seen[r.model_name].is_best ): seen[r.model_name] = r - unique = [r for r in seen.values() if r.price is not None] + all_unique = [r for r in seen.values() if r.price is not None] + + # For bandit algorithms, only plot the final layer (combos with the + # most datapoints) so all plotted combos are directly comparable. + if all_unique: + max_samples = max(r.num_samples for r in all_unique) + unique = [r for r in all_unique if r.num_samples == max_samples] + else: + unique = all_unique if len(unique) < 2: print("Not enough results with pricing data to plot.") From 9e4d114de21720d81d7153c873e386d5146b8707 Mon Sep 17 00:00:00 2001 From: Wenyueh Date: Sun, 22 Mar 2026 16:55:31 -0400 Subject: [PATCH 4/7] order --- src/agentopt/model_selection/base.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/agentopt/model_selection/base.py b/src/agentopt/model_selection/base.py index 177323f..d853dd0 100644 --- a/src/agentopt/model_selection/base.py +++ b/src/agentopt/model_selection/base.py @@ -570,6 +570,9 @@ def plot_pareto(self, path: Optional[str] = None) -> None: else: unique = all_unique + # Sort so numbering matches the final results table rank order. + unique.sort(key=lambda r: (-r.accuracy, r.latency_seconds)) + if len(unique) < 2: print("Not enough results with pricing data to plot.") return From 7913277f8fd2ed4668c943bc5e64f6db6f283b03 Mon Sep 17 00:00:00 2001 From: Wenyueh Date: Sun, 22 Mar 2026 22:03:12 -0400 Subject: [PATCH 5/7] merge main and remove latency vs. cost --- examples/advanced_selection_example.py | 84 ++++++++++++++++++-------- examples/ag2_example.py | 2 + examples/crewai_example.py | 2 + examples/custom_agent_example.py | 44 +++++++++----- examples/langchain_example.py | 11 ++-- examples/langgraph_example.py | 18 +++++- examples/llamaindex_example.py | 6 +- examples/openai_sdk_example.py | 2 + src/agentopt/__init__.py | 10 +-- src/agentopt/model_selection/base.py | 11 ++-- 10 files changed, 128 insertions(+), 62 deletions(-) diff --git a/examples/advanced_selection_example.py b/examples/advanced_selection_example.py index 799646d..95b842c 100644 --- a/examples/advanced_selection_example.py +++ b/examples/advanced_selection_example.py @@ -24,6 +24,7 @@ # Agent, dataset, and eval_fn (same as custom_agent_example.py) # --------------------------------------------------------------------------- + class MyAgent: def __init__(self, models): self.client = OpenAI() @@ -31,21 +32,35 @@ def __init__(self, models): self.solver_model = models["solver"] def run(self, input_data): - plan = self.client.chat.completions.create( - model=self.planner_model, - messages=[ - {"role": "system", "content": "Create a brief plan to answer the question."}, - {"role": "user", "content": input_data}, - ], - ).choices[0].message.content - - answer = self.client.chat.completions.create( - model=self.solver_model, - messages=[ - {"role": "system", "content": f"Follow this plan and answer concisely:\n{plan}"}, - {"role": "user", "content": input_data}, - ], - ).choices[0].message.content + plan = ( + self.client.chat.completions.create( + model=self.planner_model, + messages=[ + { + "role": "system", + "content": "Create a brief plan to answer the question.", + }, + {"role": "user", "content": input_data}, + ], + ) + .choices[0] + .message.content + ) + + answer = ( + self.client.chat.completions.create( + model=self.solver_model, + messages=[ + { + "role": "system", + "content": f"Follow this plan and answer concisely:\n{plan}", + }, + {"role": "user", "content": input_data}, + ], + ) + .choices[0] + .message.content + ) return answer @@ -71,11 +86,11 @@ def eval_fn(expected, actual): # Selection algorithms # --------------------------------------------------------------------------- + def run_auto(): """method="auto" — automatically picks the best algorithm (default).""" selector = ModelSelector( - agent=MyAgent, models=models, eval_fn=eval_fn, dataset=dataset, - method="auto", + agent=MyAgent, models=models, eval_fn=eval_fn, dataset=dataset, method="auto", ) return selector.select_best(parallel=True) @@ -83,7 +98,10 @@ def run_auto(): def run_random(): """method="random" — evaluate a random subset of combinations.""" selector = ModelSelector( - agent=MyAgent, models=models, eval_fn=eval_fn, dataset=dataset, + agent=MyAgent, + models=models, + eval_fn=eval_fn, + dataset=dataset, method="random", sample_fraction=0.5, # evaluate 50% of all combinations ) @@ -93,7 +111,10 @@ def run_random(): def run_hill_climbing(): """method="hill_climbing" — greedy search using model quality/speed rankings.""" selector = ModelSelector( - agent=MyAgent, models=models, eval_fn=eval_fn, dataset=dataset, + agent=MyAgent, + models=models, + eval_fn=eval_fn, + dataset=dataset, method="hill_climbing", batch_size=4, # number of neighbors to evaluate per step ) @@ -103,7 +124,10 @@ def run_hill_climbing(): def run_arm_elimination(): """method="arm_elimination" — eliminates statistically dominated combinations early.""" selector = ModelSelector( - agent=MyAgent, models=models, eval_fn=eval_fn, dataset=dataset, + agent=MyAgent, + models=models, + eval_fn=eval_fn, + dataset=dataset, method="arm_elimination", ) return selector.select_best(parallel=True) @@ -112,7 +136,10 @@ def run_arm_elimination(): def run_epsilon_lucb(): """method="epsilon_lucb" — stops when the best arm is identified within epsilon.""" selector = ModelSelector( - agent=MyAgent, models=models, eval_fn=eval_fn, dataset=dataset, + agent=MyAgent, + models=models, + eval_fn=eval_fn, + dataset=dataset, method="epsilon_lucb", epsilon=0.05, # acceptable gap from the true best ) @@ -122,7 +149,10 @@ def run_epsilon_lucb(): def run_threshold(): """method="threshold" — classify combinations as above/below a quality threshold.""" selector = ModelSelector( - agent=MyAgent, models=models, eval_fn=eval_fn, dataset=dataset, + agent=MyAgent, + models=models, + eval_fn=eval_fn, + dataset=dataset, method="threshold", threshold=0.8, # minimum acceptable accuracy ) @@ -132,7 +162,10 @@ def run_threshold(): def run_lm_proposal(): """method="lm_proposal" — use a proposer LLM to shortlist promising combinations.""" selector = ModelSelector( - agent=MyAgent, models=models, eval_fn=eval_fn, dataset=dataset, + agent=MyAgent, + models=models, + eval_fn=eval_fn, + dataset=dataset, method="lm_proposal", ) return selector.select_best(parallel=True) @@ -141,7 +174,10 @@ def run_lm_proposal(): def run_bayesian(): """method="bayesian" — GP-based Bayesian optimization (requires agentopt[bayesian]).""" selector = ModelSelector( - agent=MyAgent, models=models, eval_fn=eval_fn, dataset=dataset, + agent=MyAgent, + models=models, + eval_fn=eval_fn, + dataset=dataset, method="bayesian", batch_size=4, ) diff --git a/examples/ag2_example.py b/examples/ag2_example.py index 2878fa8..2f54ca6 100644 --- a/examples/ag2_example.py +++ b/examples/ag2_example.py @@ -25,6 +25,7 @@ # run(input_data) runs the agent on a single datapoint and returns the output. # --------------------------------------------------------------------------- + class MyAgent: """AG2 planner+solver agent pair.""" @@ -76,6 +77,7 @@ def run(self, input_data): # Step 3: Evaluation function — score agent output against expected answer. # --------------------------------------------------------------------------- + def eval_fn(expected, actual): return 1.0 if expected.lower() in str(actual).lower() else 0.0 diff --git a/examples/crewai_example.py b/examples/crewai_example.py index bac0892..5dcb50d 100644 --- a/examples/crewai_example.py +++ b/examples/crewai_example.py @@ -21,6 +21,7 @@ # run(input_data) runs the agent on a single datapoint and returns the output. # --------------------------------------------------------------------------- + class MyAgent: """CrewAI crew with researcher + writer agents.""" @@ -90,6 +91,7 @@ def run(self, input_data): # Step 3: Evaluation function — score agent output against expected answer. # --------------------------------------------------------------------------- + def eval_fn(expected, actual): return 1.0 if expected.lower() in str(actual).lower() else 0.0 diff --git a/examples/custom_agent_example.py b/examples/custom_agent_example.py index efb2c7b..0f5f8f3 100644 --- a/examples/custom_agent_example.py +++ b/examples/custom_agent_example.py @@ -26,6 +26,7 @@ # run() takes a single datapoint and returns the agent's output. # --------------------------------------------------------------------------- + class MyAgent: """A simple planner+solver agent using the OpenAI SDK.""" @@ -36,22 +37,36 @@ def __init__(self, models): def run(self, input_data): # Step 1: Planner generates a plan - plan = self.client.chat.completions.create( - model=self.planner_model, - messages=[ - {"role": "system", "content": "You are a planning assistant. Create a brief plan to answer the question."}, - {"role": "user", "content": input_data}, - ], - ).choices[0].message.content + plan = ( + self.client.chat.completions.create( + model=self.planner_model, + messages=[ + { + "role": "system", + "content": "You are a planning assistant. Create a brief plan to answer the question.", + }, + {"role": "user", "content": input_data}, + ], + ) + .choices[0] + .message.content + ) # Step 2: Solver executes the plan - answer = self.client.chat.completions.create( - model=self.solver_model, - messages=[ - {"role": "system", "content": f"Follow this plan and answer concisely:\n{plan}"}, - {"role": "user", "content": input_data}, - ], - ).choices[0].message.content + answer = ( + self.client.chat.completions.create( + model=self.solver_model, + messages=[ + { + "role": "system", + "content": f"Follow this plan and answer concisely:\n{plan}", + }, + {"role": "user", "content": input_data}, + ], + ) + .choices[0] + .message.content + ) return answer @@ -75,6 +90,7 @@ def run(self, input_data): # It compares agent output against expected output and returns a score. # --------------------------------------------------------------------------- + def eval_fn(expected, actual): return 1.0 if expected.lower() in str(actual).lower() else 0.0 diff --git a/examples/langchain_example.py b/examples/langchain_example.py index 2bbcc65..3979bf8 100644 --- a/examples/langchain_example.py +++ b/examples/langchain_example.py @@ -28,7 +28,10 @@ def search(query: str) -> str: PROMPT = ChatPromptTemplate.from_messages( [ - ("system", "You are a helpful assistant. Use tools when needed to answer questions concisely."), + ( + "system", + "You are a helpful assistant. Use tools when needed to answer questions concisely.", + ), ("human", "{input}"), ("placeholder", "{agent_scratchpad}"), ] @@ -41,6 +44,7 @@ def search(query: str) -> str: # run(input_data) runs the agent on a single datapoint and returns the output. # --------------------------------------------------------------------------- + class MyAgent: """LangChain tool-calling agent.""" @@ -71,6 +75,7 @@ def run(self, input_data): # Step 3: Evaluation function — score agent output against expected answer. # --------------------------------------------------------------------------- + def eval_fn(expected, actual): return 1.0 if expected.lower() in str(actual).lower() else 0.0 @@ -83,9 +88,7 @@ def eval_fn(expected, actual): if __name__ == "__main__": selector = ModelSelector( agent=MyAgent, - models={ - "agent": ["gpt-4o", "gpt-4o-mini", "gpt-4.1-nano"], - }, + models={"agent": ["gpt-4o", "gpt-4o-mini", "gpt-4.1-nano"],}, eval_fn=eval_fn, dataset=dataset, method="brute_force", # or "auto" for smarter selection algorithms diff --git a/examples/langgraph_example.py b/examples/langgraph_example.py index ff602f0..e99f08c 100644 --- a/examples/langgraph_example.py +++ b/examples/langgraph_example.py @@ -31,6 +31,7 @@ class AgentState(TypedDict): # run(input_data) runs the agent on a single datapoint and returns the output. # --------------------------------------------------------------------------- + class MyAgent: """LangGraph planner+solver agent.""" @@ -40,7 +41,12 @@ def __init__(self, models): def planner_node(state: AgentState) -> dict: response = planner_llm.invoke( - [{"role": "system", "content": "Create a brief plan to answer the question."}] + [ + { + "role": "system", + "content": "Create a brief plan to answer the question.", + } + ] + state["messages"] ) return {"plan": response.content} @@ -48,7 +54,10 @@ def planner_node(state: AgentState) -> dict: def solver_node(state: AgentState) -> dict: response = solver_llm.invoke( [ - {"role": "system", "content": f"Follow this plan and answer concisely:\n{state['plan']}"}, + { + "role": "system", + "content": f"Follow this plan and answer concisely:\n{state['plan']}", + }, state["messages"][-1], ] ) @@ -63,7 +72,9 @@ def solver_node(state: AgentState) -> dict: self._app = graph.compile() def run(self, input_data): - result = self._app.invoke({"messages": [{"role": "user", "content": input_data}]}) + result = self._app.invoke( + {"messages": [{"role": "user", "content": input_data}]} + ) return result["answer"] @@ -82,6 +93,7 @@ def run(self, input_data): # Step 3: Evaluation function — score agent output against expected answer. # --------------------------------------------------------------------------- + def eval_fn(expected, actual): return 1.0 if expected.lower() in str(actual).lower() else 0.0 diff --git a/examples/llamaindex_example.py b/examples/llamaindex_example.py index 5eed0bb..8093a59 100644 --- a/examples/llamaindex_example.py +++ b/examples/llamaindex_example.py @@ -46,6 +46,7 @@ def divide(a: float, b: float) -> float: # Note: run() can be async — AgentOpt detects this automatically. # --------------------------------------------------------------------------- + class MyAgent: """LlamaIndex math agent with calculator tools.""" @@ -85,6 +86,7 @@ async def run(self, input_data): # Step 3: Evaluation function — score agent output against expected answer. # --------------------------------------------------------------------------- + def eval_fn(expected, actual): return 1.0 if expected.lower() in str(actual).lower() else 0.0 @@ -97,9 +99,7 @@ def eval_fn(expected, actual): if __name__ == "__main__": selector = ModelSelector( agent=MyAgent, - models={ - "agent": ["gpt-4o", "gpt-4o-mini", "gpt-4.1-nano"], - }, + models={"agent": ["gpt-4o", "gpt-4o-mini", "gpt-4.1-nano"],}, eval_fn=eval_fn, dataset=dataset, method="brute_force", # or "auto" for smarter selection algorithms diff --git a/examples/openai_sdk_example.py b/examples/openai_sdk_example.py index 11bd4f0..ccb7808 100644 --- a/examples/openai_sdk_example.py +++ b/examples/openai_sdk_example.py @@ -27,6 +27,7 @@ def search(query: str) -> str: # run(input_data) runs the agent on a single datapoint and returns the output. # --------------------------------------------------------------------------- + class MyAgent: """OpenAI Agents SDK planner+solver agent pair.""" @@ -67,6 +68,7 @@ def run(self, input_data): # Step 3: Evaluation function — score agent output against expected answer. # --------------------------------------------------------------------------- + def eval_fn(expected, actual): return 1.0 if expected.lower() in str(actual).lower() else 0.0 diff --git a/src/agentopt/__init__.py b/src/agentopt/__init__.py index 5d7cb93..5b09b15 100644 --- a/src/agentopt/__init__.py +++ b/src/agentopt/__init__.py @@ -45,12 +45,7 @@ def ModelSelector( - agent=None, - models=None, - eval_fn=None, - dataset=None, - method="auto", - **kwargs, + agent=None, models=None, eval_fn=None, dataset=None, method="auto", **kwargs, ): """Create a model selector. @@ -82,8 +77,7 @@ def ModelSelector( 'install with `pip install "agentopt[bayesian]"`' ) raise ValueError( - f"Unknown method {method!r}. " - f"Choose from: {', '.join(_METHODS)}" + f"Unknown method {method!r}. " f"Choose from: {', '.join(_METHODS)}" ) return cls(agent=agent, models=models, eval_fn=eval_fn, dataset=dataset, **kwargs) diff --git a/src/agentopt/model_selection/base.py b/src/agentopt/model_selection/base.py index d08376a..2ed4256 100644 --- a/src/agentopt/model_selection/base.py +++ b/src/agentopt/model_selection/base.py @@ -537,9 +537,9 @@ def _pareto_mask( return mask def plot_pareto(self, path: Optional[str] = None) -> None: - """Generate three pairwise Pareto frontier plots. + """Generate two pairwise Pareto frontier plots. - Subplots: Accuracy vs Latency, Accuracy vs Price, Latency vs Price. + Subplots: Accuracy vs Latency, Accuracy vs Price. Requires ``matplotlib`` (install with ``pip install agentopt[plot]``). If *path* is given the figure is saved to that file, otherwise @@ -589,13 +589,12 @@ def plot_pareto(self, path: Optional[str] = None) -> None: pairs = [ (accs, lats, "Accuracy", "Latency (s)", False, True), (accs, prices, "Accuracy", "Price ($)", False, True), - (lats, prices, "Latency (s)", "Price ($)", True, True), ] - fig = plt.figure(figsize=(20, 5)) + fig = plt.figure(figsize=(14, 5)) # Reserve right margin for the legend. - gs = fig.add_gridspec(1, 3, left=0.04, right=0.75, wspace=0.3) - axes = [fig.add_subplot(gs[0, i]) for i in range(3)] + gs = fig.add_gridspec(1, 2, left=0.06, right=0.72, wspace=0.3) + axes = [fig.add_subplot(gs[0, i]) for i in range(2)] fig.suptitle("Pareto Frontiers", fontsize=14, fontweight="bold") for ax, (xs, ys, xlabel, ylabel, x_min, y_min) in zip(axes, pairs): From ce30e9286496af9b628cde54fa25630e6f309ec9 Mon Sep 17 00:00:00 2001 From: Wenyueh Date: Sun, 22 Mar 2026 22:23:59 -0400 Subject: [PATCH 6/7] update --- src/agentopt/model_selection/base.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/src/agentopt/model_selection/base.py b/src/agentopt/model_selection/base.py index 2ed4256..a477881 100644 --- a/src/agentopt/model_selection/base.py +++ b/src/agentopt/model_selection/base.py @@ -587,8 +587,8 @@ def plot_pareto(self, path: Optional[str] = None) -> None: num_labels = [f"({i})" for i in range(1, len(unique) + 1)] pairs = [ - (accs, lats, "Accuracy", "Latency (s)", False, True), - (accs, prices, "Accuracy", "Price ($)", False, True), + (lats, accs, "Latency (s)", "Accuracy", True, False), + (prices, accs, "Price ($)", "Accuracy", True, False), ] fig = plt.figure(figsize=(14, 5)) @@ -663,13 +663,6 @@ def plot_pareto(self, path: Optional[str] = None) -> None: fontweight="bold", ) - # Invert "lower is better" axes so better is always top-right, - # producing a concave frontier. - if x_min: - ax.invert_xaxis() - if y_min: - ax.invert_yaxis() - ax.set_xlabel(xlabel) ax.set_ylabel(ylabel) ax.legend(fontsize=7, loc="best") From aad735c940c76ec07ccc92f1bf436f3b4b18710b Mon Sep 17 00:00:00 2001 From: Wenyueh Date: Sun, 22 Mar 2026 22:26:51 -0400 Subject: [PATCH 7/7] update --- src/agentopt/model_selection/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/agentopt/model_selection/base.py b/src/agentopt/model_selection/base.py index a477881..85db0d2 100644 --- a/src/agentopt/model_selection/base.py +++ b/src/agentopt/model_selection/base.py @@ -593,7 +593,7 @@ def plot_pareto(self, path: Optional[str] = None) -> None: fig = plt.figure(figsize=(14, 5)) # Reserve right margin for the legend. - gs = fig.add_gridspec(1, 2, left=0.06, right=0.72, wspace=0.3) + gs = fig.add_gridspec(1, 2, left=0.06, right=0.68, wspace=0.3) axes = [fig.add_subplot(gs[0, i]) for i in range(2)] fig.suptitle("Pareto Frontiers", fontsize=14, fontweight="bold") @@ -671,7 +671,7 @@ def plot_pareto(self, path: Optional[str] = None) -> None: # External legend mapping numbers to combo names. legend_lines = [f"({i}) {name}" for i, name in enumerate(names, 1)] fig.text( - 0.77, + 0.72, 0.5, "\n".join(legend_lines), fontsize=8,