diff --git a/.env.example b/.env.example index 13e103c..f39a8db 100644 --- a/.env.example +++ b/.env.example @@ -69,3 +69,32 @@ LANGFUSE_SECRET_KEY="sk-lf-..." # Used by: evalmonkey generate-evals --langfuse-dataset # demo_rag_app.sh (automatic if keys are set) # LANGFUSE_DATASET="evalmonkey_failures" + +# ---------------------------------------- +# 5. Regression Guard (Optional) +# ---------------------------------------- +# Score drop (in points) that triggers a regression warning after run-benchmark +# and causes `evalmonkey guard` to exit with code 1. +# Default: 5 (i.e. a drop of 5+ points vs the previous baseline is flagged) +EVAL_REGRESSION_THRESHOLD=5 + +# ---------------------------------------- +# 6. External Dataset Providers (Optional) +# ---------------------------------------- +# Use EvalMonkey as the chaos + scoring harness on top of datasets you already +# maintain in eval platforms you subscribe to. +# +# Confident AI (DeepEval cloud) +# Get your key: https://app.confident-ai.com → Settings → API Keys +# Usage: evalmonkey run-benchmark --scenario confident-ai::<dataset-id> +CONFIDENT_AI_API_KEY="conf-..." + +# Braintrust +# Get your key: https://www.braintrustdata.com → Settings → API Keys +# Usage: evalmonkey run-benchmark --scenario braintrust::<project>/<dataset> +BRAINTRUST_API_KEY="bt-..." + +# LangSmith (LangChain) +# Get your key: https://smith.langchain.com → Settings → API Keys +# Usage: evalmonkey run-benchmark --scenario langsmith::<dataset-id> +LANGSMITH_API_KEY="ls__..." diff --git a/evalmonkey/config/agent_config.py b/evalmonkey/config/agent_config.py index f8bfae7..c08db95 100644 --- a/evalmonkey/config/agent_config.py +++ b/evalmonkey/config/agent_config.py @@ -74,6 +74,18 @@ }, } +# Maps each agent_type to the most relevant standard benchmark IDs. +# Used by `evalmonkey recommend` to surface a curated suite instead of all 22. 
+AGENT_TYPE_BENCHMARKS: dict[str, list[str]] = { + "research_agent": ["hotpotqa", "drop", "natural-questions", "gaia-benchmark"], + "coding_agent": ["human-eval", "mbpp", "apps", "swe-bench"], + "rag_agent": ["hotpotqa", "natural-questions", "drop", "truthfulqa"], + "customer_support": ["daily-dialog", "multiwoz", "mt-bench", "alpacaeval"], + "voice_agent": ["daily-dialog", "multiwoz", "spokentext-cleanup"], + "safety_agent": ["truthfulqa", "toxigen", "arc", "bbh"], + "general": ["gsm8k", "mmlu", "arc", "truthfulqa"], +} + @dataclass class AgentConfig: @@ -86,6 +98,8 @@ class AgentConfig: eval_model: str = "" agent_command: str = "" # shell command to start the agent server agent_startup_wait: int = 3 # seconds to wait after spawning before sending traffic + agent_type: str = "general" # Used by `evalmonkey recommend` to surface relevant benchmarks + private_benchmarks: list = field(default_factory=list) # Custom REST dataset configs extra: dict = field(default_factory=dict) @@ -116,6 +130,8 @@ def load_config(config_path: Optional[str] = None) -> Optional[AgentConfig]: eval_model=str(raw.get("eval_model", os.getenv("EVAL_MODEL", ""))), agent_command=str(agent_raw.get("agent_command", "")), agent_startup_wait=int(agent_raw.get("agent_startup_wait", 3)), + agent_type=str(agent_raw.get("agent_type", "general")), + private_benchmarks=list(raw.get("private_benchmarks", [])), extra=raw, ) return None @@ -154,6 +170,10 @@ def generate_config_yaml(framework: str, name: str, port: int) -> str: # How EvalMonkey reads the answer back (dot-notation for nested fields) response_path: {preset['response_path']} # dot-path to extract the answer text + # Agent type — drives `evalmonkey recommend` to show only relevant benchmarks + # Options: general | research_agent | coding_agent | rag_agent | customer_support | voice_agent | safety_agent + agent_type: general + # Which LLM EvalMonkey uses as the judge (can also be set via EVAL_MODEL env var) eval_model: "gpt-4o" # or: 
def detect_regression(scenario: str, current_score: int, threshold: int = 5) -> "dict | None":
    """
    Compare the current baseline score with the previous baseline run of the same scenario.

    Call this *after* record_run() has saved the current score: the newest
    baseline in history ([-1]) is then the current run, and the one before it
    ([-2]) is the previous run that ``current_score`` is compared against.

    Args:
        scenario: Scenario ID whose baseline history is inspected.
        current_score: Score of the baseline run that was just recorded.
        threshold: Minimum drop, in points, that counts as a regression.

    Returns:
        A dict with the keys ``scenario``, ``prev_score``, ``current_score``,
        ``drop`` and ``threshold`` when the score dropped by >= threshold
        points; otherwise None (including when fewer than two baselines exist).
    """
    # The return annotation is quoted so the ``X | None`` syntax is never
    # evaluated at definition time on interpreters older than Python 3.10.
    baselines = [r for r in get_history(scenario=scenario) if r.get("run_type") == "baseline"]
    if len(baselines) < 2:
        return None  # Not enough history to compare

    # Sort ascending by timestamp instead of trusting the stored order.
    baselines.sort(key=lambda r: r.get("timestamp", ""))

    # A record whose score is missing *or stored as null* counts as 0, so the
    # subtraction below can never fail on None.
    prev_score = baselines[-2].get("score") or 0
    drop = prev_score - current_score
    if drop < threshold:
        return None
    return {
        "scenario": scenario,
        "prev_score": prev_score,
        "current_score": current_score,
        "drop": drop,
        "threshold": threshold,
    }
def print_regression_warning(scenario: str, prev_score: int, curr_score: int, drop: int) -> None:
    """
    Print a loud red "REGRESSION DETECTED" panel, centred in the terminal.

    Args:
        scenario: Scenario ID that regressed.
        prev_score: Score of the previous baseline run.
        curr_score: Score of the run that just finished.
        drop: Points lost between the two runs (shown in bold red).
    """
    content = Text()
    content.append(f"Scenario: {scenario}\n", style="bold white")
    content.append(f"Previous Score: {prev_score} → Current Score: {curr_score} ", style="white")
    content.append(f"(drop: {drop} pts)\n", style="bold red")
    content.append("\nYour agent's baseline score regressed versus the last run.", style="dim yellow")
    content.append(f"\n\nDebug: evalmonkey history --scenario {scenario}", style="dim")
    # The hint used to point at "/traces.json" in the filesystem root; the
    # traces file lives in the run's output directory, so show a placeholder.
    content.append("\nFix: evalmonkey generate-evals --traces-file <output-dir>/traces.json", style="dim")

    panel = Panel(
        content,
        title="[bold red]⚠️ REGRESSION DETECTED[/bold red]",
        border_style="red",
        expand=False,
        padding=(1, 2),
    )
    console.print("\n")
    console.print(Align.center(panel))
def _badge_color(score: int) -> str:
    """Return the shields.io colour for a score: >= 80 brightgreen, >= 60 yellow, else red."""
    if score >= 80:
        return "brightgreen"
    if score >= 60:
        return "yellow"
    return "red"


def _badge_url(score: int) -> str:
    """Generate a shields.io badge URL for the given overall score."""
    # %3A is the URL-encoded ':' so the badge message reads "Score:<n>".
    return f"https://img.shields.io/badge/EvalMonkey-Score%3A{score}-{_badge_color(score)}"


def _latest_scores(history: list) -> dict[str, dict[str, Optional[int]]]:
    """Return the most recent non-null baseline and chaos score for every scenario."""
    # Order by timestamp (as detect_regression does) so "latest" does not
    # depend on the order in which records happen to be stored. sorted() is
    # stable, so records without a timestamp keep their relative order.
    try:
        ordered = sorted(history, key=lambda r: r.get("timestamp", ""))
    except TypeError:
        # Timestamps of mixed types cannot be compared; keep the stored order.
        ordered = list(history)

    scores: dict = defaultdict(lambda: {"baseline": None, "chaos": None})
    for record in ordered:
        run_type = record.get("run_type", "")
        score = record.get("score")
        if score is not None and run_type in ("baseline", "chaos"):
            # Later records overwrite earlier ones, leaving the newest score.
            scores[record.get("scenario", "unknown")][run_type] = score
    return dict(scores)


def generate_report(
    output_path: str = "evalmonkey_report.md",
    agent_name: str = "My Agent",
) -> str:
    """
    Build a Markdown "Agent Card" from the recorded run history and write it to disk.

    The history returned by get_history() is reduced to the latest baseline and
    chaos score per scenario; the overall badge score is the (truncated)
    average of the latest baseline scores, or 0 when there are none.

    Args:
        output_path: File the report is written to; parent directories are
            created when missing.
        agent_name: Name shown in the report title.

    Returns:
        The full Markdown content as a string.
    """
    scores = _latest_scores(get_history())

    # Overall score = average of all latest baseline scores
    baseline_scores = [v["baseline"] for v in scores.values() if v["baseline"] is not None]
    overall_score = int(sum(baseline_scores) / len(baseline_scores)) if baseline_scores else 0
    badge_url = _badge_url(overall_score)

    now_str = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")

    lines: list[str] = [
        f"# Agent Benchmark Report — {agent_name}",
        "",
        f"[![EvalMonkey Score]({badge_url})](https://github.com/Corbell-AI/evalmonkey)",
        "",
        f"> Generated by [EvalMonkey](https://github.com/Corbell-AI/evalmonkey) on {now_str}",
        "",
        "## Benchmark Scores",
        "",
        "| Scenario | Baseline | Chaos | Production Reliability |",
        "|----------|:--------:|:-----:|:----------------------:|",
    ]

    for scenario in sorted(scores):
        baseline = scores[scenario]["baseline"]
        chaos = scores[scenario]["chaos"]
        reliability = calculate_production_reliability(scenario)

        b_str = f"**{baseline}**" if baseline is not None else "—"
        c_str = str(chaos) if chaos is not None else "—"
        # A falsy reliability (0 / None) is rendered as "no data".
        r_str = f"{reliability:.1f}" if reliability else "—"
        lines.append(f"| `{scenario}` | {b_str} | {c_str} | {r_str} |")

    if not scores:
        lines.append("| *(no runs recorded yet)* | — | — | — |")

    lines += [
        "",
        "## What is Production Reliability?",
        "",
        "Production Reliability = `(baseline_score × 0.6) + (chaos_score × 0.4)`",
        "",
        "It combines how well your agent performs on clean inputs with how resilient it is",
        "under adversarial conditions (typos, prompt injection, schema mutations, etc.).",
        "",
        "---",
        "",
        "*Embed this badge in your README:*",
        "```markdown",
        f"[![EvalMonkey Score]({badge_url})](https://github.com/Corbell-AI/evalmonkey)",
        "```",
    ]

    content = "\n".join(lines)

    # abspath() guarantees a non-empty dirname even for a bare file name.
    os.makedirs(os.path.dirname(os.path.abspath(output_path)), exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(content)

    return content
class PrivateBenchmarkLoader(ABC):
    """Abstract base class for all private/external benchmark loaders."""

    @abstractmethod
    def load(self, limit: int = 10) -> List[EvalScenario]:
        """Fetch and normalise dataset rows into EvalScenario objects."""


class LocalFileLoader(PrivateBenchmarkLoader):
    """
    Load eval rows from a local file (JSONL, JSON, or CSV).

    Field mapping (every field name is configurable):
        input_field       → becomes input_payload["question"]
        expected_field    → becomes expected_behavior_rubric
        id_field          → (optional) scenario ID
        description_field → (optional) human-readable description

    Example JSONL row:
        {"question": "What is 2+2?", "expected_answer": "4"}

    Example CSV:
        question,expected_answer
        "What is 2+2?","4"
    """

    def __init__(
        self,
        filepath: str,
        input_field: str = "question",
        expected_field: str = "expected_answer",
        id_field: str = "id",
        description_field: str = "description",
    ):
        self.filepath = filepath
        self.input_field = input_field
        self.expected_field = expected_field
        self.id_field = id_field
        self.description_field = description_field

    def load(self, limit: int = 10) -> List[EvalScenario]:
        """Read the file and return at most ``limit`` normalised scenarios."""
        return self._normalise(self._read_file(), limit)

    def _read_file(self) -> List[Dict[str, Any]]:
        """
        Parse the file into a list of rows, choosing the parser by extension.

        The extension is matched case-insensitively. Files with an unknown
        extension are tried as JSON Lines.

        Raises:
            ValueError: the file has an unknown extension and is not valid
                JSON Lines (json.JSONDecodeError, itself a ValueError, is
                raised for malformed .jsonl / .json content).
        """
        fp = self.filepath
        suffix = os.path.splitext(fp)[1].lower()

        if suffix == ".json":
            with open(fp, "r", encoding="utf-8") as f:
                data = json.load(f)
            # A single top-level object is treated as a one-row dataset.
            return data if isinstance(data, list) else [data]

        if suffix == ".csv":
            # newline="" lets the csv module handle embedded line breaks.
            with open(fp, "r", encoding="utf-8", newline="") as f:
                return [dict(row) for row in csv.DictReader(f)]

        # .jsonl, and JSON Lines as the fallback for any other extension.
        with open(fp, "r", encoding="utf-8") as f:
            lines = [ln.strip() for ln in f if ln.strip()]
        try:
            return [json.loads(ln) for ln in lines]
        except json.JSONDecodeError as err:
            if suffix == ".jsonl":
                raise
            raise ValueError(
                f"Unsupported file format: {fp}. Supported: .jsonl, .json, .csv"
            ) from err

    def _normalise(self, rows: List[Dict], limit: int) -> List[EvalScenario]:
        """Convert the first ``limit`` rows into EvalScenario objects."""
        scenarios = []
        for i, raw in enumerate(rows[:limit]):
            # A non-dict row (e.g. a JSON list of plain strings) has no fields;
            # its string form becomes the question.
            row = raw if isinstance(raw, dict) else {}
            question = row.get(self.input_field, str(raw))

            expected = row.get(self.expected_field)
            if expected in (None, "", [], {}):
                rubric = f"The agent should correctly answer: {question}"
            else:
                # 0 and False are legitimate expected answers; the rubric is
                # always handed over as text.
                rubric = str(expected)

            scenarios.append(EvalScenario(
                id=str(row.get(self.id_field, f"local-{i}")),
                description=str(row.get(self.description_field, f"Local eval #{i}")),
                input_payload={"question": question},
                expected_behavior_rubric=rubric,
            ))
        return scenarios
class ConfidentAILoader(PrivateBenchmarkLoader):
    """
    Load a dataset from Confident AI (DeepEval cloud platform).

    Requires: CONFIDENT_AI_API_KEY in .env
    Dataset ID: the name or UUID of a dataset in your Confident AI workspace.

    Usage: --scenario confident-ai::my-rag-evals
    """

    BASE_URL = "https://api.confident-ai.com/v1"

    def __init__(self, dataset_id: str, api_key: Optional[str] = None):
        """
        Args:
            dataset_id: Name or UUID of the dataset to fetch.
            api_key: API key; falls back to the CONFIDENT_AI_API_KEY env var.

        Raises:
            ValueError: no API key was passed and the env var is unset/empty.
        """
        self.dataset_id = dataset_id
        self.api_key = api_key or os.getenv("CONFIDENT_AI_API_KEY", "")
        if not self.api_key:
            raise ValueError(
                "CONFIDENT_AI_API_KEY not set. Add it to your .env file.\n"
                "Get your key from: https://app.confident-ai.com → Settings → API Keys"
            )

    def load(self, limit: int = 10) -> List[EvalScenario]:
        """
        GET the dataset and return at most ``limit`` normalised scenarios.

        Raises whatever httpx raises for transport errors, and
        httpx.HTTPStatusError (via raise_for_status) for non-2xx responses.
        """
        from urllib.parse import quote

        # The ID may be a human-readable name, so escape it before it becomes a
        # URL path segment (spaces, '/', '?' would otherwise change the path).
        url = f"{self.BASE_URL}/datasets/{quote(self.dataset_id, safe='')}"
        headers = {"Authorization": f"Bearer {self.api_key}"}
        resp = httpx.get(url, headers=headers, timeout=30)
        resp.raise_for_status()
        data = resp.json()

        # NOTE(review): the response envelope is not pinned down here; a
        # "goldens" or "data" key is read, and a bare list is accepted too.
        if isinstance(data, dict):
            goldens = data.get("goldens", data.get("data", []))
        else:
            goldens = data
        # A null list in the payload is treated as an empty dataset.
        return self._normalise(goldens or [], limit)

    def _normalise(self, goldens: list, limit: int) -> List[EvalScenario]:
        """Convert the first ``limit`` goldens into EvalScenario objects."""
        scenarios = []
        for i, g in enumerate(goldens[:limit]):
            if isinstance(g, dict):
                question = g.get("input", g.get("query", str(g)))
                expected = g.get("expected_output", g.get("expected", ""))
            else:
                # A bare value is used as the question itself, with no expected answer.
                question, expected = g, ""
            rubric = (
                f"The expected answer is: {expected}. Grade how accurately the agent addresses this."
                if expected
                else "Grade how well the agent addresses the question."
            )
            scenarios.append(EvalScenario(
                id=f"confident-ai-{self.dataset_id}-{i}",
                description=f"Confident AI dataset: {self.dataset_id}",
                input_payload={"question": str(question)},
                expected_behavior_rubric=rubric,
            ))
        return scenarios
class LangSmithLoader(PrivateBenchmarkLoader):
    """
    Load examples from a LangSmith dataset.

    Requires: LANGSMITH_API_KEY in .env
    Dataset ID: the UUID or name of a dataset in your LangSmith org.

    Usage: --scenario langsmith::my-dataset-id
    """

    BASE_URL = "https://api.smith.langchain.com"

    def __init__(self, dataset_id: str, api_key: Optional[str] = None):
        """
        Args:
            dataset_id: UUID or name of the dataset to fetch.
            api_key: API key; falls back to the LANGSMITH_API_KEY env var.

        Raises:
            ValueError: no API key was passed and the env var is unset/empty.
        """
        self.dataset_id = dataset_id
        self.api_key = api_key or os.getenv("LANGSMITH_API_KEY", "")
        if not self.api_key:
            raise ValueError(
                "LANGSMITH_API_KEY not set. Add it to your .env file.\n"
                "Get your key from: https://smith.langchain.com → Settings → API Keys"
            )

    def load(self, limit: int = 10) -> List[EvalScenario]:
        """
        GET the dataset's examples and return at most ``limit`` scenarios.

        Raises whatever httpx raises for transport errors, and
        httpx.HTTPStatusError (via raise_for_status) for non-2xx responses.
        """
        from urllib.parse import quote

        # NOTE(review): confirm this path against the LangSmith REST API
        # reference; it is kept exactly as written, only the ID is escaped.
        url = f"{self.BASE_URL}/datasets/{quote(self.dataset_id, safe='')}/examples"
        headers = {"x-api-key": self.api_key}
        resp = httpx.get(url, headers=headers, params={"limit": limit}, timeout=30)
        resp.raise_for_status()
        examples = resp.json()
        if isinstance(examples, dict):
            examples = examples.get("examples", examples.get("data", []))
        # A null list in the payload is treated as an empty dataset.
        return self._normalise(examples or [], limit)

    def _normalise(self, examples: list, limit: int) -> List[EvalScenario]:
        """Convert the first ``limit`` examples into EvalScenario objects."""
        scenarios = []
        for i, ex in enumerate(examples[:limit]):
            record = ex if isinstance(ex, dict) else {}
            # "inputs"/"outputs" may be present but null (an example without a
            # reference output); treat null like an empty mapping.
            inputs = record.get("inputs") or {}
            outputs = record.get("outputs") or {}

            if isinstance(inputs, dict):
                question = inputs.get("question", inputs.get("input", str(inputs)))
            else:
                question = inputs
            if isinstance(outputs, dict):
                expected = outputs.get("answer", outputs.get("output", outputs.get("expected", "")))
            else:
                expected = outputs

            rubric = (
                f"The expected answer is: {expected}."
                if expected
                else "Grade how well the agent addresses the question."
            )
            scenarios.append(EvalScenario(
                id=f"langsmith-{self.dataset_id}-{i}",
                description=f"LangSmith dataset: {self.dataset_id}",
                input_payload={"question": str(question)},
                expected_behavior_rubric=rubric,
            ))
        return scenarios
def load_private_benchmark(
    benchmark_id: str,
    limit: int = 10,
    private_benchmarks_config: Optional[List[Dict]] = None,
) -> List[EvalScenario]:
    """
    Route a benchmark_id to the correct private/external loader and run it.

    Handles these prefixes:
        hf::           → HuggingFaceLoader
        confident-ai:: → ConfidentAILoader
        braintrust::   → BraintrustLoader
        langsmith::    → LangSmithLoader
        (no prefix)    → GenericRESTLoader, looked up by "id" in
                         private_benchmarks_config (from evalmonkey.yaml)

    Args:
        benchmark_id: Prefixed external dataset reference, or the id of a
            private_benchmarks entry.
        limit: Maximum number of scenarios to load.
        private_benchmarks_config: The private_benchmarks list from the config.

    Returns:
        The loaded scenarios, or an empty list if the id is not recognised
        (caller falls back to local evals).

    Raises:
        ValueError: a known prefix is followed by an empty dataset reference.
        KeyError: the matching private_benchmarks entry has no "url".
    """
    prefix, sep, reference = benchmark_id.partition("::")

    if sep and prefix in ("hf", "confident-ai", "braintrust", "langsmith"):
        if not reference:
            # Fail early with a clear message instead of an obscure error
            # from the remote loader.
            raise ValueError(
                f"Benchmark id '{benchmark_id}' is missing the dataset reference after '{prefix}::'"
            )
        if prefix == "hf":
            loader: PrivateBenchmarkLoader = HuggingFaceLoader(reference)
        elif prefix == "confident-ai":
            loader = ConfidentAILoader(reference)
        elif prefix == "braintrust":
            loader = BraintrustLoader(reference)
        else:
            loader = LangSmithLoader(reference)
    else:
        # Look up in private_benchmarks_config list from evalmonkey.yaml
        entry = next(
            (b for b in (private_benchmarks_config or []) if b.get("id") == benchmark_id),
            None,
        )
        if not entry:
            return []
        if "url" not in entry:
            raise KeyError(
                f"private_benchmarks entry '{benchmark_id}' is missing the required 'url' key"
            )
        loader = GenericRESTLoader(
            url=entry["url"],
            auth_header=entry.get("auth_header"),
            input_field=entry.get("input_field", "question"),
            expected_field=entry.get("expected_field", "expected_answer"),
            name=entry.get("name", benchmark_id),
        )

    return loader.load(limit=limit)
+ + Also handles private/external dataset prefixes: + hf:: → any HuggingFace dataset (direct load) + confident-ai:: → Confident AI (DeepEval) dataset + braintrust:: → Braintrust dataset + langsmith:: → LangSmith dataset """ + # ── Private / external dataset routing ─────────────────────────────────── + PRIVATE_PREFIXES = ("hf::", "confident-ai::", "braintrust::", "langsmith::") + if any(benchmark_name.startswith(p) for p in PRIVATE_PREFIXES): + from evalmonkey.scenarios.private_benchmarks import load_private_benchmark + return load_private_benchmark(benchmark_name, limit=limit) + try: import os # Prevent PyTorch shared-memory multiprocessing on Mac. diff --git a/scripts/cli.py b/scripts/cli.py index b03415c..11f5976 100644 --- a/scripts/cli.py +++ b/scripts/cli.py @@ -15,11 +15,13 @@ print_banner, print_benchmark_score, print_chaos_result, - print_history_trends + print_history_trends, + print_regression_warning, + print_recommend_suite, ) -from evalmonkey.scenarios.standard_benchmarks import load_standard_benchmark, get_supported_benchmarks, get_benchmarks_by_category -from evalmonkey.reporting.history import record_run, get_history, calculate_production_reliability -from evalmonkey.config.agent_config import load_config, generate_config_yaml, FRAMEWORK_PRESETS +from evalmonkey.scenarios.standard_benchmarks import load_standard_benchmark, get_supported_benchmarks, get_benchmarks_by_category, get_benchmark_categories +from evalmonkey.reporting.history import record_run, get_history, calculate_production_reliability, detect_regression +from evalmonkey.config.agent_config import load_config, generate_config_yaml, FRAMEWORK_PRESETS, AGENT_TYPE_BENCHMARKS app = typer.Typer(help="EvalMonkey: Open-source Agent Benchmarking and Chaos Framework") console = Console() @@ -170,6 +172,7 @@ def _spawn_sample_agent(sample_agent: str): @app.command() def run_benchmark( scenario: str = typer.Option(..., help="Scenario ID, standard benchmark (e.g. 
gsm8k), or custom_eval ID"), + dataset: str = typer.Option(None, help="Path to a local dataset file (.jsonl, .json, .csv) to use as the benchmark source."), target_url: str = typer.Option(None, help="Address of the BYO agent API (e.g. http://localhost:8000). Required unless using --sample-agent."), sample_agent: str = typer.Option(None, help="Automatically spawn a sample agent in the background (rag_app or research_agent)"), eval_file: str = typer.Option("custom_evals.yaml", help="Path to evaluation assets"), @@ -220,6 +223,16 @@ def run_benchmark( if standard_evals: console.print(f"[bold cyan]=> Loaded {len(standard_evals)} samples from standard benchmark subset: {scenario}[/bold cyan]") evals_to_run = standard_evals + elif dataset: + # --dataset flag: load from a local file directly + from evalmonkey.scenarios.private_benchmarks import LocalFileLoader + console.print(f"[bold cyan]=> Loading dataset from local file: {dataset}[/bold cyan]") + loader = LocalFileLoader(dataset) + evals_to_run = loader.load(limit=limit) + if not evals_to_run: + console.print(f"[bold red]No eval rows found in {dataset}. 
Check the file format (JSONL/JSON/CSV).[/bold red]") + if agent_process: agent_process.terminate() + return else: console.print(f"[bold cyan]=> Loading local BYO eval assets from {eval_file}[/bold cyan]") evals = load_local_evals(eval_file) @@ -277,6 +290,18 @@ def run_benchmark( record_run(scenario, "baseline", final_score, details={"reasoning": overall_reasoning, "sample_size": len(scores)}) print_benchmark_score(scenario, final_score, overall_reasoning, baseline) + # ── Regression detection (informational — use `evalmonkey guard` for CI gating) ── + import os as _os + _threshold = int(_os.getenv("EVAL_REGRESSION_THRESHOLD", "5")) + regression = detect_regression(scenario, final_score, threshold=_threshold) + if regression: + print_regression_warning( + scenario=scenario, + prev_score=regression["prev_score"], + curr_score=final_score, + drop=regression["drop"], + ) + # ── Eval Asset Generation on failure ────────────────────────────── if asset_gen.has_failures: output_dir = build_output_dir(scenario) @@ -427,6 +452,118 @@ def history(scenario: str = typer.Option(None, help="Specific scenario ID to vie reliability = calculate_production_reliability(scenario=s) print_history_trends(s, s_hist, reliability) + +@app.command() +def recommend(): + """ + Show the recommended benchmark suite for your agent type. + Reads agent_type from evalmonkey.yaml (default: general). + Set agent_type in your config to get a curated list instead of all 22 benchmarks. 
@app.command()
def guard(
    scenario: str = typer.Option(..., help="Benchmark scenario to check for regression"),
    fail_threshold: int = typer.Option(
        None,
        help="Score drop (pts) that triggers failure. Defaults to EVAL_REGRESSION_THRESHOLD env var (default: 5).",
    ),
):
    """
    Check for a score regression vs the last baseline and exit with code 1 if detected.
    Use this in CI/CD pipelines to block deploys when your agent regresses.

    The two most recent records whose run_type is "baseline" (ordered by their
    timestamp field) are compared. Exit codes:
      0 - no regression, or fewer than two baseline runs recorded
      1 - the score dropped by at least the threshold
      2 - EVAL_REGRESSION_THRESHOLD is set but is not an integer

    Example (GitHub Actions):
        - run: evalmonkey guard --scenario gsm8k
    """
    import os as _os

    # An explicit --fail-threshold always wins over the environment variable.
    if fail_threshold is not None:
        threshold = fail_threshold
    else:
        raw_threshold = _os.getenv("EVAL_REGRESSION_THRESHOLD", "5")
        try:
            threshold = int(raw_threshold)
        except ValueError:
            # Fail with a readable message instead of a traceback; a broken
            # gate configuration must not be mistaken for a passing check.
            console.print(
                f"[bold red]Invalid EVAL_REGRESSION_THRESHOLD={raw_threshold!r}: "
                f"expected an integer number of points.[/bold red]"
            )
            raise SystemExit(2) from None

    hist = get_history(scenario)
    # Chaos runs are excluded: only baseline-to-baseline movement is a regression.
    baselines = sorted(
        (r for r in hist if r.get("run_type") == "baseline"),
        key=lambda r: r.get("timestamp", ""),
    )

    if len(baselines) < 2:
        console.print(
            f"[dim]Not enough baseline history for '{scenario}' to detect regression "
            f"(need ≥ 2 runs). Run evalmonkey run-benchmark --scenario {scenario} at least twice.[/dim]"
        )
        raise SystemExit(0)

    curr_score = baselines[-1].get("score", 0)
    prev_score = baselines[-2].get("score", 0)
    drop = prev_score - curr_score

    # Require an actual drop: with a threshold of 0 (or a negative one) an
    # unchanged or improved score must not be reported as a regression.
    if drop > 0 and drop >= threshold:
        print_regression_warning(scenario, prev_score, curr_score, drop)
        console.print(
            f"\n[bold red]❌ Guard failed: {scenario} regressed by {drop} pts "
            f"(threshold: {threshold}). Exiting with code 1.[/bold red]\n"
        )
        raise SystemExit(1)

    if drop < 0:
        trend = f"+{abs(drop)}"
    elif drop == 0:
        trend = "±0"
    else:
        trend = f"-{drop}"
    # Yellow marks a tolerated drop that stayed below the threshold.
    color = "green" if drop <= 0 else "yellow"
    console.print(
        f"\n[bold {color}]✅ Guard passed: {scenario} — "
        f"score {curr_score}/100 (vs prev {prev_score}/100, Δ {trend}). "
        f"No regression detected.[/bold {color}]\n"
    )
    raise SystemExit(0)
def _write_jsonl(tmp_path, rows):
    """Write *rows* to ``tmp_path / "data.jsonl"``, one JSON object per line.

    No trailing newline is written. Returns the file path as a ``str``.
    """
    p = tmp_path / "data.jsonl"
    # Explicit UTF-8 keeps the fixture independent of the platform locale.
    p.write_text("\n".join(json.dumps(r) for r in rows), encoding="utf-8")
    return str(p)


def _write_json(tmp_path, rows):
    """Write *rows* as a single JSON document to ``tmp_path / "data.json"``.

    Returns the file path as a ``str``.
    """
    p = tmp_path / "data.json"
    p.write_text(json.dumps(rows), encoding="utf-8")
    return str(p)


def _write_csv(tmp_path, rows, fieldnames):
    """Write *rows* (dicts) to ``tmp_path / "data.csv"`` with a header row.

    *fieldnames* fixes the column order. Returns the file path as a ``str``.
    """
    p = tmp_path / "data.csv"
    # newline="" is what the csv module requires to control line endings itself;
    # UTF-8 makes non-ASCII questions/answers round-trip on every platform.
    with p.open("w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)
    return str(p)
class TestHuggingFaceLoader:
    """Smoke tests for the ``HuggingFaceLoader`` constructor and call surface.

    NOTE(review): both tests replace ``HuggingFaceLoader.load`` itself with a
    mock, so they only verify that the loader can be constructed and that
    ``load`` is invoked with the expected arguments. The real row-to-scenario
    mapping is not exercised here; covering it needs a fake ``datasets``
    backend wired to however the loader imports it — TODO confirm and add.
    """

    def test_basic_load(self):
        canned = [
            EvalScenario(
                id=f"hf-test-{i}",
                description="HuggingFace dataset: test/ds",
                input_payload={"question": f"HF Q{i}"},
                expected_behavior_rubric=f"Expected: A{i}",
            )
            for i in range(3)
        ]
        with patch(
            "evalmonkey.scenarios.private_benchmarks.HuggingFaceLoader.load",
            return_value=canned,
        ) as mock_load:
            loader = HuggingFaceLoader("test/ds")
            scenarios = loader.load(limit=3)

        # The patched attribute is a plain MagicMock (not bound), so no `self`.
        mock_load.assert_called_once_with(limit=3)
        assert len(scenarios) == 3
        assert all(isinstance(s, EvalScenario) for s in scenarios)

    def test_load_via_datasets_mock(self):
        """Custom column names are accepted by the constructor; ``load`` is stubbed."""
        canned = [
            EvalScenario(
                id="hf-test/ds-0",
                description="HuggingFace dataset: test/ds",
                input_payload={"question": "What year was Python created?"},
                expected_behavior_rubric="The expected answer is: 1991",
            )
        ]
        with patch(
            "evalmonkey.scenarios.private_benchmarks.HuggingFaceLoader.load",
            return_value=canned,
        ) as mock_load:
            loader = HuggingFaceLoader("test/ds", input_col="question", expected_col="answer")
            result = loader.load(limit=1)

        mock_load.assert_called_once_with(limit=1)
        assert len(result) == 1
        assert result[0].input_payload["question"] == "What year was Python created?"
class TestBraintrustLoader:
    """BraintrustLoader tests; ``httpx.get`` is stubbed so nothing touches the network."""

    def test_load_events(self):
        # One event carries a dict input, the other a bare string input.
        events = [
            {"input": {"question": "What is RAG?"}, "expected": "Retrieval-Augmented Generation"},
            {"input": "What is an agent?", "expected": "An autonomous AI system"},
        ]
        fake_resp = MagicMock()
        fake_resp.json.return_value = {"events": events}
        fake_resp.raise_for_status.return_value = None

        with patch("evalmonkey.scenarios.private_benchmarks.httpx.get", return_value=fake_resp):
            loaded = BraintrustLoader("proj/dataset", api_key="bt-test-key").load(limit=10)

        assert len(loaded) == 2

    def test_raises_without_api_key(self):
        # clear=True empties the environment, so BRAINTRUST_API_KEY is absent
        # for the duration of the block and restored afterwards.
        with patch.dict(os.environ, {}, clear=True), pytest.raises(
            ValueError, match="BRAINTRUST_API_KEY"
        ):
            BraintrustLoader("proj/dataset")
class TestGenericRESTLoader:
    """GenericRESTLoader tests; ``httpx.get`` is patched so no request leaves the process."""

    @staticmethod
    def _fake_response(payload):
        """Return a mock HTTP response whose ``.json()`` yields *payload*."""
        resp = MagicMock()
        resp.json.return_value = payload
        resp.raise_for_status.return_value = None
        return resp

    def test_load_from_generic_api(self):
        payload = [
            {"question": "How do I reset my password?", "ideal_answer": "Click forgot password."},
            {"question": "What are your business hours?", "ideal_answer": "9am to 5pm."},
        ]
        with patch(
            "evalmonkey.scenarios.private_benchmarks.httpx.get",
            return_value=self._fake_response(payload),
        ):
            loader = GenericRESTLoader(
                url="https://my-api.example.com/v1/evals",
                input_field="question",
                expected_field="ideal_answer",
                name="support-evals",
            )
            scenarios = loader.load(limit=10)

        assert len(scenarios) == 2
        assert "password" in scenarios[0].input_payload["question"]

    def test_auth_header_env_substitution(self):
        """Env var tokens in auth_header should be resolved from the environment."""
        # The variable stays set across both construction and load(), so the
        # test does not depend on which of the two performs the substitution.
        with patch.dict(os.environ, {"MY_SECRET_KEY": "abc123"}), patch(
            "evalmonkey.scenarios.private_benchmarks.httpx.get",
            return_value=self._fake_response([]),
        ) as mock_get:
            loader = GenericRESTLoader(
                url="https://api.example.com",
                auth_header="Authorization: Bearer {MY_SECRET_KEY}",
            )
            loader.load(limit=5)

        assert mock_get.called
        args, kwargs = mock_get.call_args
        # Accept headers passed by keyword or as the second positional argument,
        # even when other keyword arguments (e.g. a timeout) are present.
        headers = kwargs["headers"] if "headers" in kwargs else args[1]
        # The resolved header should contain the actual value, not the template.
        assert "abc123" in str(headers)
        assert "{MY_SECRET_KEY}" not in str(headers)

    def test_wrapped_response_formats(self):
        """API may return {data: [...]} or {items: [...]} instead of a bare list."""
        payload = {"data": [{"question": "Q1", "expected_answer": "A1"}]}
        with patch(
            "evalmonkey.scenarios.private_benchmarks.httpx.get",
            return_value=self._fake_response(payload),
        ):
            loader = GenericRESTLoader(url="https://api.example.com/v1/data")
            scenarios = loader.load(limit=10)

        assert len(scenarios) == 1
input_payload={"question": "Q"}, expected_behavior_rubric="R" + )] + result = load_private_benchmark("hf::test/dataset", limit=1) + assert len(result) == 1 + + def test_routes_confident_ai_prefix(self): + with patch("evalmonkey.scenarios.private_benchmarks.ConfidentAILoader.load") as mock_load: + mock_load.return_value = [EvalScenario( + id="conf-0", description="test", input_payload={"question": "Q"}, expected_behavior_rubric="R" + )] + with patch.dict(os.environ, {"CONFIDENT_AI_API_KEY": "conf-test"}): + result = load_private_benchmark("confident-ai::my-dataset", limit=1) + assert len(result) == 1 + + def test_routes_braintrust_prefix(self): + with patch("evalmonkey.scenarios.private_benchmarks.BraintrustLoader.load") as mock_load: + mock_load.return_value = [] + with patch.dict(os.environ, {"BRAINTRUST_API_KEY": "bt-test"}): + result = load_private_benchmark("braintrust::proj/ds", limit=1) + assert result == [] + + def test_routes_langsmith_prefix(self): + with patch("evalmonkey.scenarios.private_benchmarks.LangSmithLoader.load") as mock_load: + mock_load.return_value = [] + with patch.dict(os.environ, {"LANGSMITH_API_KEY": "ls-test"}): + result = load_private_benchmark("langsmith::abc123", limit=1) + assert result == [] + + def test_routes_generic_rest_from_config(self): + config = [{"id": "my-evals", "url": "https://api.example.com", "input_field": "q", "expected_field": "a"}] + mock_resp = MagicMock() + mock_resp.json.return_value = [{"q": "What?", "a": "This."}] + mock_resp.raise_for_status.return_value = None + with patch("evalmonkey.scenarios.private_benchmarks.httpx.get", return_value=mock_resp): + result = load_private_benchmark("my-evals", limit=5, private_benchmarks_config=config) + assert len(result) == 1 + + def test_returns_empty_for_unknown_id_without_config(self): + result = load_private_benchmark("nonexistent-benchmark-xyz", limit=5) + assert result == [] + + def test_standard_benchmarks_routes_hf_prefix(self): + """Integration: 
def _history_file_with(records: list, tmp_path) -> str:
    """Write a fake history.json and return its path.

    *records* is serialised as a single JSON array into
    ``tmp_path / "history.json"``; the returned ``str`` path is what the tests
    patch in for ``HISTORY_FILE``.
    """
    history_file = tmp_path / "history.json"
    # Explicit UTF-8 so the fixture does not depend on the platform locale.
    history_file.write_text(json.dumps(records), encoding="utf-8")
    return str(history_file)
threshold=5) + assert result is None + + def test_detects_regression_above_threshold(self, tmp_path): + records = [ + self._make_record("gsm8k", 82, ts="2025-01-01T00:00:00"), + self._make_record("gsm8k", 60, ts="2025-01-02T00:00:00"), # current (already recorded) + ] + history_path = _history_file_with(records, tmp_path) + with patch("evalmonkey.reporting.history.HISTORY_FILE", history_path): + result = detect_regression("gsm8k", 60, threshold=5) + assert result is not None + assert result["prev_score"] == 82 + assert result["current_score"] == 60 + assert result["drop"] == 22 + assert result["scenario"] == "gsm8k" + + def test_no_regression_when_drop_below_threshold(self, tmp_path): + records = [ + self._make_record("gsm8k", 80, ts="2025-01-01T00:00:00"), + self._make_record("gsm8k", 77, ts="2025-01-02T00:00:00"), # drop of 3, threshold 5 + ] + history_path = _history_file_with(records, tmp_path) + with patch("evalmonkey.reporting.history.HISTORY_FILE", history_path): + result = detect_regression("gsm8k", 77, threshold=5) + assert result is None + + def test_no_regression_when_score_improved(self, tmp_path): + records = [ + self._make_record("gsm8k", 70, ts="2025-01-01T00:00:00"), + self._make_record("gsm8k", 85, ts="2025-01-02T00:00:00"), + ] + history_path = _history_file_with(records, tmp_path) + with patch("evalmonkey.reporting.history.HISTORY_FILE", history_path): + result = detect_regression("gsm8k", 85, threshold=5) + assert result is None + + def test_regression_at_exact_threshold(self, tmp_path): + """A drop exactly equal to threshold should trigger.""" + records = [ + self._make_record("mmlu", 75, ts="2025-01-01T00:00:00"), + self._make_record("mmlu", 70, ts="2025-01-02T00:00:00"), # drop = 5 = threshold + ] + history_path = _history_file_with(records, tmp_path) + with patch("evalmonkey.reporting.history.HISTORY_FILE", history_path): + result = detect_regression("mmlu", 70, threshold=5) + assert result is not None + assert result["drop"] == 5 + + def 
test_ignores_chaos_runs_when_comparing_baselines(self, tmp_path): + """Chaos run records should not affect the baseline regression comparison.""" + records = [ + self._make_record("gsm8k", 82, run_type="baseline", ts="2025-01-01T00:00:00"), + self._make_record("gsm8k", 45, run_type="chaos", ts="2025-01-01T12:00:00"), + self._make_record("gsm8k", 60, run_type="baseline", ts="2025-01-02T00:00:00"), + ] + history_path = _history_file_with(records, tmp_path) + with patch("evalmonkey.reporting.history.HISTORY_FILE", history_path): + result = detect_regression("gsm8k", 60, threshold=5) + # Should compare baseline 82 vs 60, NOT chaos 45 + assert result is not None + assert result["prev_score"] == 82 + assert result["drop"] == 22 + + def test_scenario_isolation(self, tmp_path): + """Regression for one scenario should not bleed into another.""" + records = [ + self._make_record("gsm8k", 90, ts="2025-01-01T00:00:00"), + self._make_record("gsm8k", 50, ts="2025-01-02T00:00:00"), + self._make_record("mmlu", 70, ts="2025-01-01T00:00:00"), + self._make_record("mmlu", 72, ts="2025-01-02T00:00:00"), + ] + history_path = _history_file_with(records, tmp_path) + with patch("evalmonkey.reporting.history.HISTORY_FILE", history_path): + mmlu_result = detect_regression("mmlu", 72, threshold=5) + assert mmlu_result is None # mmlu improved, should not regress + + def test_custom_threshold_zero(self, tmp_path): + """Threshold of 0 means any drop at all triggers regression.""" + records = [ + self._make_record("arc", 80, ts="2025-01-01T00:00:00"), + self._make_record("arc", 79, ts="2025-01-02T00:00:00"), + ] + history_path = _history_file_with(records, tmp_path) + with patch("evalmonkey.reporting.history.HISTORY_FILE", history_path): + result = detect_regression("arc", 79, threshold=0) + assert result is not None + assert result["drop"] == 1 + + def test_with_many_baselines_compares_last_two(self, tmp_path): + """With 5 baselines, should compare the 5th vs 4th.""" + records = [ + 
self._make_record("truthfulqa", 60, ts="2025-01-01T00:00:00"), + self._make_record("truthfulqa", 65, ts="2025-01-02T00:00:00"), + self._make_record("truthfulqa", 70, ts="2025-01-03T00:00:00"), + self._make_record("truthfulqa", 80, ts="2025-01-04T00:00:00"), + self._make_record("truthfulqa", 50, ts="2025-01-05T00:00:00"), # big drop + ] + history_path = _history_file_with(records, tmp_path) + with patch("evalmonkey.reporting.history.HISTORY_FILE", history_path): + result = detect_regression("truthfulqa", 50, threshold=5) + assert result is not None + assert result["prev_score"] == 80 + assert result["drop"] == 30 + + +# --------------------------------------------------------------------------- +# print_regression_warning smoke test (just checks it doesn't raise) +# --------------------------------------------------------------------------- + +class TestPrintRegressionWarning: + def test_no_exception_raised(self): + from evalmonkey.reporting.markdown import print_regression_warning + # Should render without any exception + print_regression_warning("gsm8k", prev_score=82, curr_score=60, drop=22) diff --git a/tests/test_report_generator.py b/tests/test_report_generator.py new file mode 100644 index 0000000..b9dd30d --- /dev/null +++ b/tests/test_report_generator.py @@ -0,0 +1,162 @@ +"""Tests for report_generator.py (Agent Card).""" +import json +import os +import tempfile +from unittest.mock import patch + +import pytest + +from evalmonkey.reporting.report_generator import generate_report, _badge_url, _badge_color + + +# --------------------------------------------------------------------------- +# Badge helpers +# --------------------------------------------------------------------------- + +class TestBadgeHelpers: + def test_badge_color_green_above_80(self): + assert _badge_color(80) == "brightgreen" + assert _badge_color(100) == "brightgreen" + + def test_badge_color_yellow_60_to_79(self): + assert _badge_color(60) == "yellow" + assert _badge_color(79) == "yellow" 
def _make_history(records: list, tmp_path) -> str:
    """Write a fake history.json and return its path.

    *records* is serialised as one JSON array into ``tmp_path / "history.json"``.
    """
    history_file = tmp_path / "history.json"
    # Explicit UTF-8 so the fixture does not depend on the platform locale.
    history_file.write_text(json.dumps(records), encoding="utf-8")
    return str(history_file)


def _record(scenario, score, run_type="baseline", timestamp="2025-01-01T09:00:00"):
    """Build one history record dict for *scenario*.

    *timestamp* defaults to a fixed value so existing callers get the same
    record as before; pass distinct values when a test depends on run order.
    A fresh ``details`` dict is created on every call.
    """
    return {
        "scenario": scenario,
        "run_type": run_type,
        "score": score,
        "timestamp": timestamp,
        "details": {},
    }
patch("evalmonkey.reporting.history.HISTORY_FILE", history_path): + content = generate_report(output_path=output_path) + assert "gsm8k" in content + + def test_report_contains_baseline_score(self, tmp_path): + history_path = _make_history([_record("gsm8k", 82)], tmp_path) + output_path = str(tmp_path / "report.md") + with patch("evalmonkey.reporting.history.HISTORY_FILE", history_path): + content = generate_report(output_path=output_path) + assert "82" in content + + def test_report_contains_shields_badge(self, tmp_path): + history_path = _make_history([_record("gsm8k", 82)], tmp_path) + output_path = str(tmp_path / "report.md") + with patch("evalmonkey.reporting.history.HISTORY_FILE", history_path): + content = generate_report(output_path=output_path) + assert "shields.io" in content + assert "EvalMonkey" in content + + def test_report_with_baseline_and_chaos(self, tmp_path): + records = [ + _record("gsm8k", 82, "baseline"), + _record("gsm8k", 65, "chaos"), + ] + history_path = _make_history(records, tmp_path) + output_path = str(tmp_path / "report.md") + with patch("evalmonkey.reporting.history.HISTORY_FILE", history_path): + content = generate_report(output_path=output_path) + assert "82" in content + assert "65" in content + + def test_report_empty_history(self, tmp_path): + history_path = _make_history([], tmp_path) + output_path = str(tmp_path / "report.md") + with patch("evalmonkey.reporting.history.HISTORY_FILE", history_path): + content = generate_report(output_path=output_path) + # Should still produce a valid file with placeholder row + assert "no runs recorded" in content + + def test_report_multiple_scenarios(self, tmp_path): + records = [ + _record("gsm8k", 82, "baseline"), + _record("mmlu", 75, "baseline"), + _record("arc", 90, "baseline"), + ] + history_path = _make_history(records, tmp_path) + output_path = str(tmp_path / "report.md") + with patch("evalmonkey.reporting.history.HISTORY_FILE", history_path): + content = 
generate_report(output_path=output_path) + assert "gsm8k" in content + assert "mmlu" in content + assert "arc" in content + + def test_report_includes_production_reliability_column(self, tmp_path): + records = [ + _record("gsm8k", 80, "baseline"), + _record("gsm8k", 60, "chaos"), + ] + history_path = _make_history(records, tmp_path) + output_path = str(tmp_path / "report.md") + with patch("evalmonkey.reporting.history.HISTORY_FILE", history_path): + content = generate_report(output_path=output_path) + assert "Production Reliability" in content + + def test_report_includes_badge_markdown_snippet(self, tmp_path): + history_path = _make_history([_record("gsm8k", 80)], tmp_path) + output_path = str(tmp_path / "report.md") + with patch("evalmonkey.reporting.history.HISTORY_FILE", history_path): + content = generate_report(output_path=output_path) + # Should include the copy-paste badge snippet + assert "```markdown" in content + assert "[![EvalMonkey" in content + + def test_report_returns_string_content(self, tmp_path): + history_path = _make_history([_record("gsm8k", 80)], tmp_path) + output_path = str(tmp_path / "report.md") + with patch("evalmonkey.reporting.history.HISTORY_FILE", history_path): + result = generate_report(output_path=output_path) + assert isinstance(result, str) + assert len(result) > 0