diff --git a/examples/advanced_selection_example.py b/examples/advanced_selection_example.py index 799646d..95b842c 100644 --- a/examples/advanced_selection_example.py +++ b/examples/advanced_selection_example.py @@ -24,6 +24,7 @@ # Agent, dataset, and eval_fn (same as custom_agent_example.py) # --------------------------------------------------------------------------- + class MyAgent: def __init__(self, models): self.client = OpenAI() @@ -31,21 +32,35 @@ def __init__(self, models): self.solver_model = models["solver"] def run(self, input_data): - plan = self.client.chat.completions.create( - model=self.planner_model, - messages=[ - {"role": "system", "content": "Create a brief plan to answer the question."}, - {"role": "user", "content": input_data}, - ], - ).choices[0].message.content - - answer = self.client.chat.completions.create( - model=self.solver_model, - messages=[ - {"role": "system", "content": f"Follow this plan and answer concisely:\n{plan}"}, - {"role": "user", "content": input_data}, - ], - ).choices[0].message.content + plan = ( + self.client.chat.completions.create( + model=self.planner_model, + messages=[ + { + "role": "system", + "content": "Create a brief plan to answer the question.", + }, + {"role": "user", "content": input_data}, + ], + ) + .choices[0] + .message.content + ) + + answer = ( + self.client.chat.completions.create( + model=self.solver_model, + messages=[ + { + "role": "system", + "content": f"Follow this plan and answer concisely:\n{plan}", + }, + {"role": "user", "content": input_data}, + ], + ) + .choices[0] + .message.content + ) return answer @@ -71,11 +86,11 @@ def eval_fn(expected, actual): # Selection algorithms # --------------------------------------------------------------------------- + def run_auto(): """method="auto" — automatically picks the best algorithm (default).""" selector = ModelSelector( - agent=MyAgent, models=models, eval_fn=eval_fn, dataset=dataset, - method="auto", + agent=MyAgent, models=models, eval_fn=eval_fn, dataset=dataset, method="auto", ) return selector.select_best(parallel=True) @@ -83,7 +98,10 @@ def run_auto(): def run_random(): """method="random" — evaluate a random subset of combinations.""" selector = ModelSelector( - agent=MyAgent, models=models, eval_fn=eval_fn, dataset=dataset, + agent=MyAgent, + models=models, + eval_fn=eval_fn, + dataset=dataset, method="random", sample_fraction=0.5, # evaluate 50% of all combinations ) @@ -93,7 +111,10 @@ def run_random(): def run_hill_climbing(): """method="hill_climbing" — greedy search using model quality/speed rankings.""" selector = ModelSelector( - agent=MyAgent, models=models, eval_fn=eval_fn, dataset=dataset, + agent=MyAgent, + models=models, + eval_fn=eval_fn, + dataset=dataset, method="hill_climbing", batch_size=4, # number of neighbors to evaluate per step ) @@ -103,7 +124,10 @@ def run_hill_climbing(): def run_arm_elimination(): """method="arm_elimination" — eliminates statistically dominated combinations early.""" selector = ModelSelector( - agent=MyAgent, models=models, eval_fn=eval_fn, dataset=dataset, + agent=MyAgent, + models=models, + eval_fn=eval_fn, + dataset=dataset, method="arm_elimination", ) return selector.select_best(parallel=True) @@ -112,7 +136,10 @@ def run_arm_elimination(): def run_epsilon_lucb(): """method="epsilon_lucb" — stops when the best arm is identified within epsilon.""" selector = ModelSelector( - agent=MyAgent, models=models, eval_fn=eval_fn, dataset=dataset, + agent=MyAgent, + models=models, + eval_fn=eval_fn, + dataset=dataset, method="epsilon_lucb", epsilon=0.05, # acceptable gap from the true best ) @@ -122,7 +149,10 @@ def run_epsilon_lucb(): def run_threshold(): """method="threshold" — classify combinations as above/below a quality threshold.""" selector = ModelSelector( - agent=MyAgent, models=models, eval_fn=eval_fn, dataset=dataset, + agent=MyAgent, + models=models, + eval_fn=eval_fn, + dataset=dataset, method="threshold", threshold=0.8, # minimum acceptable accuracy ) @@ -132,7 +162,10 @@ def run_threshold(): def run_lm_proposal(): """method="lm_proposal" — use a proposer LLM to shortlist promising combinations.""" selector = ModelSelector( - agent=MyAgent, models=models, eval_fn=eval_fn, dataset=dataset, + agent=MyAgent, + models=models, + eval_fn=eval_fn, + dataset=dataset, method="lm_proposal", ) return selector.select_best(parallel=True) @@ -141,7 +174,10 @@ def run_lm_proposal(): def run_bayesian(): """method="bayesian" — GP-based Bayesian optimization (requires agentopt[bayesian]).""" selector = ModelSelector( - agent=MyAgent, models=models, eval_fn=eval_fn, dataset=dataset, + agent=MyAgent, + models=models, + eval_fn=eval_fn, + dataset=dataset, method="bayesian", batch_size=4, ) diff --git a/examples/ag2_example.py b/examples/ag2_example.py index e0dab5e..2f54ca6 100644 --- a/examples/ag2_example.py +++ b/examples/ag2_example.py @@ -25,6 +25,7 @@ # run(input_data) runs the agent on a single datapoint and returns the output. # --------------------------------------------------------------------------- + class MyAgent: """AG2 planner+solver agent pair.""" @@ -76,6 +77,7 @@ def run(self, input_data): # Step 3: Evaluation function — score agent output against expected answer. # --------------------------------------------------------------------------- + def eval_fn(expected, actual): return 1.0 if expected.lower() in str(actual).lower() else 0.0 @@ -99,6 +101,7 @@ def eval_fn(expected, actual): results = selector.select_best(parallel=True) results.print_summary() + results.plot_pareto() best = results.get_best_combo() if best: diff --git a/examples/crewai_example.py b/examples/crewai_example.py index 1dbef80..5dcb50d 100644 --- a/examples/crewai_example.py +++ b/examples/crewai_example.py @@ -21,6 +21,7 @@ # run(input_data) runs the agent on a single datapoint and returns the output. # --------------------------------------------------------------------------- + class MyAgent: """CrewAI crew with researcher + writer agents.""" @@ -90,6 +91,7 @@ def run(self, input_data): # Step 3: Evaluation function — score agent output against expected answer. # --------------------------------------------------------------------------- + def eval_fn(expected, actual): return 1.0 if expected.lower() in str(actual).lower() else 0.0 @@ -113,6 +115,7 @@ def eval_fn(expected, actual): results = selector.select_best(parallel=True) results.print_summary() + results.plot_pareto() best = results.get_best_combo() if best: diff --git a/examples/custom_agent_example.py b/examples/custom_agent_example.py index 0ecd1b0..0f5f8f3 100644 --- a/examples/custom_agent_example.py +++ b/examples/custom_agent_example.py @@ -26,6 +26,7 @@ # run() takes a single datapoint and returns the agent's output. # --------------------------------------------------------------------------- + class MyAgent: """A simple planner+solver agent using the OpenAI SDK.""" @@ -36,22 +37,36 @@ def __init__(self, models): def run(self, input_data): # Step 1: Planner generates a plan - plan = self.client.chat.completions.create( - model=self.planner_model, - messages=[ - {"role": "system", "content": "You are a planning assistant. Create a brief plan to answer the question."}, - {"role": "user", "content": input_data}, - ], - ).choices[0].message.content + plan = ( + self.client.chat.completions.create( + model=self.planner_model, + messages=[ + { + "role": "system", + "content": "You are a planning assistant. Create a brief plan to answer the question.", + }, + {"role": "user", "content": input_data}, + ], + ) + .choices[0] + .message.content + ) # Step 2: Solver executes the plan - answer = self.client.chat.completions.create( - model=self.solver_model, - messages=[ - {"role": "system", "content": f"Follow this plan and answer concisely:\n{plan}"}, - {"role": "user", "content": input_data}, - ], - ).choices[0].message.content + answer = ( + self.client.chat.completions.create( + model=self.solver_model, + messages=[ + { + "role": "system", + "content": f"Follow this plan and answer concisely:\n{plan}", + }, + {"role": "user", "content": input_data}, + ], + ) + .choices[0] + .message.content + ) return answer @@ -75,6 +90,7 @@ def run(self, input_data): # It compares agent output against expected output and returns a score. # --------------------------------------------------------------------------- + def eval_fn(expected, actual): return 1.0 if expected.lower() in str(actual).lower() else 0.0 @@ -99,6 +115,7 @@ def eval_fn(expected, actual): results = selector.select_best(parallel=True) results.print_summary() + results.plot_pareto() best = results.get_best_combo() if best: diff --git a/examples/langchain_example.py b/examples/langchain_example.py index 9777cf3..3979bf8 100644 --- a/examples/langchain_example.py +++ b/examples/langchain_example.py @@ -28,7 +28,10 @@ def search(query: str) -> str: PROMPT = ChatPromptTemplate.from_messages( [ - ("system", "You are a helpful assistant. Use tools when needed to answer questions concisely."), + ( + "system", + "You are a helpful assistant. Use tools when needed to answer questions concisely.", + ), ("human", "{input}"), ("placeholder", "{agent_scratchpad}"), ] @@ -41,6 +44,7 @@ def search(query: str) -> str: # run(input_data) runs the agent on a single datapoint and returns the output. # --------------------------------------------------------------------------- + class MyAgent: """LangChain tool-calling agent.""" @@ -71,6 +75,7 @@ def run(self, input_data): # Step 3: Evaluation function — score agent output against expected answer. # --------------------------------------------------------------------------- + def eval_fn(expected, actual): return 1.0 if expected.lower() in str(actual).lower() else 0.0 @@ -83,9 +88,7 @@ def eval_fn(expected, actual): if __name__ == "__main__": selector = ModelSelector( agent=MyAgent, - models={ - "agent": ["gpt-4o", "gpt-4o-mini", "gpt-4.1-nano"], - }, + models={"agent": ["gpt-4o", "gpt-4o-mini", "gpt-4.1-nano"],}, eval_fn=eval_fn, dataset=dataset, method="brute_force", # or "auto" for smarter selection algorithms @@ -93,6 +96,7 @@ def eval_fn(expected, actual): results = selector.select_best(parallel=True) results.print_summary() + results.plot_pareto() best = results.get_best_combo() if best: diff --git a/examples/langgraph_example.py b/examples/langgraph_example.py index 4b93413..e99f08c 100644 --- a/examples/langgraph_example.py +++ b/examples/langgraph_example.py @@ -31,6 +31,7 @@ class AgentState(TypedDict): # run(input_data) runs the agent on a single datapoint and returns the output. # --------------------------------------------------------------------------- + class MyAgent: """LangGraph planner+solver agent.""" @@ -40,7 +41,12 @@ def __init__(self, models): def planner_node(state: AgentState) -> dict: response = planner_llm.invoke( - [{"role": "system", "content": "Create a brief plan to answer the question."}] + [ + { + "role": "system", + "content": "Create a brief plan to answer the question.", + } + ] + state["messages"] ) return {"plan": response.content} @@ -48,7 +54,10 @@ def planner_node(state: AgentState) -> dict: def solver_node(state: AgentState) -> dict: response = solver_llm.invoke( [ - {"role": "system", "content": f"Follow this plan and answer concisely:\n{state['plan']}"}, + { + "role": "system", + "content": f"Follow this plan and answer concisely:\n{state['plan']}", + }, state["messages"][-1], ] ) @@ -63,7 +72,9 @@ def solver_node(state: AgentState) -> dict: self._app = graph.compile() def run(self, input_data): - result = self._app.invoke({"messages": [{"role": "user", "content": input_data}]}) + result = self._app.invoke( + {"messages": [{"role": "user", "content": input_data}]} + ) return result["answer"] @@ -82,6 +93,7 @@ def run(self, input_data): # Step 3: Evaluation function — score agent output against expected answer. # --------------------------------------------------------------------------- + def eval_fn(expected, actual): return 1.0 if expected.lower() in str(actual).lower() else 0.0 @@ -105,6 +117,7 @@ def eval_fn(expected, actual): results = selector.select_best(parallel=True) results.print_summary() + results.plot_pareto() best = results.get_best_combo() if best: diff --git a/examples/llamaindex_example.py b/examples/llamaindex_example.py index 36eb5a0..8093a59 100644 --- a/examples/llamaindex_example.py +++ b/examples/llamaindex_example.py @@ -46,6 +46,7 @@ def divide(a: float, b: float) -> float: # Note: run() can be async — AgentOpt detects this automatically. # --------------------------------------------------------------------------- + class MyAgent: """LlamaIndex math agent with calculator tools.""" @@ -85,6 +86,7 @@ async def run(self, input_data): # Step 3: Evaluation function — score agent output against expected answer. # --------------------------------------------------------------------------- + def eval_fn(expected, actual): return 1.0 if expected.lower() in str(actual).lower() else 0.0 @@ -97,9 +99,7 @@ def eval_fn(expected, actual): if __name__ == "__main__": selector = ModelSelector( agent=MyAgent, - models={ - "agent": ["gpt-4o", "gpt-4o-mini", "gpt-4.1-nano"], - }, + models={"agent": ["gpt-4o", "gpt-4o-mini", "gpt-4.1-nano"],}, eval_fn=eval_fn, dataset=dataset, method="brute_force", # or "auto" for smarter selection algorithms @@ -107,6 +107,7 @@ def eval_fn(expected, actual): results = selector.select_best(parallel=True) results.print_summary() + results.plot_pareto() best = results.get_best_combo() if best: diff --git a/examples/openai_sdk_example.py b/examples/openai_sdk_example.py index cba9e91..ccb7808 100644 --- a/examples/openai_sdk_example.py +++ b/examples/openai_sdk_example.py @@ -27,6 +27,7 @@ def search(query: str) -> str: # run(input_data) runs the agent on a single datapoint and returns the output. # --------------------------------------------------------------------------- + class MyAgent: """OpenAI Agents SDK planner+solver agent pair.""" @@ -67,6 +68,7 @@ def run(self, input_data): # Step 3: Evaluation function — score agent output against expected answer. # --------------------------------------------------------------------------- + def eval_fn(expected, actual): return 1.0 if expected.lower() in str(actual).lower() else 0.0 @@ -90,6 +92,7 @@ def eval_fn(expected, actual): results = selector.select_best(parallel=True) results.print_summary() + results.plot_pareto() best = results.get_best_combo() if best: diff --git a/pyproject.toml b/pyproject.toml index 1104e5f..e496f4c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,6 +47,7 @@ examples = [ "openai-agents", "python-dotenv", ] +plot = ["matplotlib>=3.5"] dev = ["pytest>=8.0"] docs = ["mkdocs-material", "mkdocstrings[python]"] diff --git a/src/agentopt/__init__.py b/src/agentopt/__init__.py index 5d7cb93..5b09b15 100644 --- a/src/agentopt/__init__.py +++ b/src/agentopt/__init__.py @@ -45,12 +45,7 @@ def ModelSelector( - agent=None, - models=None, - eval_fn=None, - dataset=None, - method="auto", - **kwargs, + agent=None, models=None, eval_fn=None, dataset=None, method="auto", **kwargs, ): """Create a model selector. @@ -82,8 +77,7 @@ def ModelSelector( 'install with `pip install "agentopt[bayesian]"`' ) raise ValueError( - f"Unknown method {method!r}. " - f"Choose from: {', '.join(_METHODS)}" + f"Unknown method {method!r}. " f"Choose from: {', '.join(_METHODS)}" ) return cls(agent=agent, models=models, eval_fn=eval_fn, dataset=dataset, **kwargs) diff --git a/src/agentopt/model_selection/base.py b/src/agentopt/model_selection/base.py index 0ede109..85db0d2 100644 --- a/src/agentopt/model_selection/base.py +++ b/src/agentopt/model_selection/base.py @@ -504,6 +504,194 @@ def print_summary(self) -> None: """Print the formatted summary table of all results.""" print(self) + # ------------------------------------------------------------------ + # Pareto frontier visualisation + # ------------------------------------------------------------------ + + @staticmethod + def _pareto_mask( + xs: List[float], ys: List[float], x_minimize: bool, y_minimize: bool, + ) -> List[bool]: + """Return a boolean mask marking Pareto-optimal points. + + A point is Pareto-optimal if no other point is strictly better on both + objectives. + """ + n = len(xs) + mask = [True] * n + for i in range(n): + if not mask[i]: + continue + for j in range(n): + if i == j or not mask[j]: + continue + xi, yi, xj, yj = xs[i], ys[i], xs[j], ys[j] + # Is j at least as good as i on both, and strictly better on one? + x_ok = (xj <= xi) if x_minimize else (xj >= xi) + y_ok = (yj <= yi) if y_minimize else (yj >= yi) + x_strict = (xj < xi) if x_minimize else (xj > xi) + y_strict = (yj < yi) if y_minimize else (yj > yi) + if x_ok and y_ok and (x_strict or y_strict): + mask[i] = False + break + return mask + + def plot_pareto(self, path: Optional[str] = None) -> None: + """Generate two pairwise Pareto frontier plots. + + Subplots: Accuracy vs Latency, Accuracy vs Price. + + Requires ``matplotlib`` (install with ``pip install agentopt[plot]``). + If *path* is given the figure is saved to that file, otherwise + ``plt.show()`` is called. + """ + try: + import matplotlib.pyplot as plt + except ImportError: + raise ImportError( + "matplotlib is required for plot_pareto. " + "Install it with: pip install agentopt[plot]" + ) + + # Deduplicate (same logic as __str__). + seen: Dict[str, "ModelResult"] = {} + for r in self.results: + if r.model_name not in seen or ( + r.is_best and not seen[r.model_name].is_best + ): + seen[r.model_name] = r + all_unique = [r for r in seen.values() if r.price is not None] + + # For bandit algorithms, only plot the final layer (combos with the + # most datapoints) so all plotted combos are directly comparable. + if all_unique: + max_samples = max(r.num_samples for r in all_unique) + unique = [r for r in all_unique if r.num_samples == max_samples] + else: + unique = all_unique + + # Sort so numbering matches the final results table rank order. + unique.sort(key=lambda r: (-r.accuracy, r.latency_seconds)) + + if len(unique) < 2: + print("Not enough results with pricing data to plot.") + return + + names = [r.model_name for r in unique] + accs = [r.accuracy for r in unique] + lats = [r.latency_seconds for r in unique] + prices = [r.price for r in unique] # type: ignore[misc] + is_best = [r.is_best for r in unique] + + # Build numbered labels: (1), (2), ... + num_labels = [f"({i})" for i in range(1, len(unique) + 1)] + + pairs = [ + (lats, accs, "Latency (s)", "Accuracy", True, False), + (prices, accs, "Price ($)", "Accuracy", True, False), + ] + + fig = plt.figure(figsize=(14, 5)) + # Reserve right margin for the legend. + gs = fig.add_gridspec(1, 2, left=0.06, right=0.68, wspace=0.3) + axes = [fig.add_subplot(gs[0, i]) for i in range(2)] + fig.suptitle("Pareto Frontiers", fontsize=14, fontweight="bold") + + for ax, (xs, ys, xlabel, ylabel, x_min, y_min) in zip(axes, pairs): + mask = self._pareto_mask(xs, ys, x_min, y_min) + + # Non-Pareto points. + np_x = [x for x, m in zip(xs, mask) if not m] + np_y = [y for y, m in zip(ys, mask) if not m] + ax.scatter( + np_x, + np_y, + c="lightgray", + edgecolors="gray", + s=60, + zorder=2, + label="Dominated", + ) + + # Pareto-optimal points. + p_x = [x for x, m in zip(xs, mask) if m] + p_y = [y for y, m in zip(ys, mask) if m] + ax.scatter( + p_x, + p_y, + c="steelblue", + edgecolors="navy", + s=80, + zorder=3, + label="Pareto-optimal", + ) + + # Connect frontier with a line (sorted by x). + if p_x: + order = sorted(range(len(p_x)), key=lambda i: p_x[i]) + ax.plot( + [p_x[i] for i in order], + [p_y[i] for i in order], + c="steelblue", + linewidth=1.5, + alpha=0.6, + zorder=2, + ) + + # Highlight best combo. + for x, y, b in zip(xs, ys, is_best): + if b: + ax.scatter( + [x], + [y], + c="gold", + edgecolors="darkorange", + s=140, + zorder=4, + marker="*", + label="Best", + ) + + # Number labels on points. + for x, y, lbl in zip(xs, ys, num_labels): + ax.annotate( + lbl, + (x, y), + textcoords="offset points", + xytext=(5, 5), + fontsize=7, + fontweight="bold", + ) + + ax.set_xlabel(xlabel) + ax.set_ylabel(ylabel) + ax.legend(fontsize=7, loc="best") + ax.grid(True, alpha=0.3) + + # External legend mapping numbers to combo names. + legend_lines = [f"({i}) {name}" for i, name in enumerate(names, 1)] + fig.text( + 0.72, + 0.5, + "\n".join(legend_lines), + fontsize=8, + verticalalignment="center", + fontfamily="monospace", + bbox=dict( + boxstyle="round,pad=0.5", + facecolor="lightyellow", + edgecolor="gray", + alpha=0.9, + ), + ) + + if path: + fig.savefig(path, dpi=150, bbox_inches="tight") + print(f"Pareto plot saved to {path}") + else: + plt.show() + plt.close(fig) + class BaseModelSelector(ABC): """Abstract base class for model selectors.