Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -69,3 +69,32 @@ LANGFUSE_SECRET_KEY="sk-lf-..."
# Used by: evalmonkey generate-evals --langfuse-dataset <name>
# demo_rag_app.sh (automatic if keys are set)
# LANGFUSE_DATASET="evalmonkey_failures"

# ----------------------------------------
# 5. Regression Guard (Optional)
# ----------------------------------------
# Score drop (in points) that triggers a regression warning after run-benchmark
# and causes `evalmonkey guard` to exit with code 1.
# Default: 5 (i.e. a drop of 5+ points vs the previous baseline is flagged)
EVAL_REGRESSION_THRESHOLD=5

# ----------------------------------------
# 6. External Dataset Providers (Optional)
# ----------------------------------------
# Use EvalMonkey as the chaos + scoring harness on top of datasets you already
# maintain in eval platforms you subscribe to.
#
# Confident AI (DeepEval cloud)
# Get your key: https://app.confident-ai.com → Settings → API Keys
# Usage: evalmonkey run-benchmark --scenario confident-ai::<dataset_id>
CONFIDENT_AI_API_KEY="conf-..."

# Braintrust
# Get your key: https://www.braintrustdata.com → Settings → API Keys
# Usage: evalmonkey run-benchmark --scenario braintrust::<project>/<dataset>
BRAINTRUST_API_KEY="bt-..."

# LangSmith (LangChain)
# Get your key: https://smith.langchain.com → Settings → API Keys
# Usage: evalmonkey run-benchmark --scenario langsmith::<dataset_id>
LANGSMITH_API_KEY="ls__..."
20 changes: 20 additions & 0 deletions evalmonkey/config/agent_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,18 @@
},
}

# Maps each agent_type to the most relevant standard benchmark IDs.
# Used by `evalmonkey recommend` to surface a curated suite instead of all 22.
#
# Keys are the agent_type values offered in the generated evalmonkey.yaml
# template; "general" is also the AgentConfig default. Values are lists of
# benchmark scenario IDs.
# NOTE(review): presumably every ID listed here must exist in the standard
# benchmark registry -- confirm when adding or renaming benchmarks.
AGENT_TYPE_BENCHMARKS: dict[str, list[str]] = {
    "research_agent": ["hotpotqa", "drop", "natural-questions", "gaia-benchmark"],
    "coding_agent": ["human-eval", "mbpp", "apps", "swe-bench"],
    "rag_agent": ["hotpotqa", "natural-questions", "drop", "truthfulqa"],
    "customer_support": ["daily-dialog", "multiwoz", "mt-bench", "alpacaeval"],
    "voice_agent": ["daily-dialog", "multiwoz", "spokentext-cleanup"],
    "safety_agent": ["truthfulqa", "toxigen", "arc", "bbh"],
    "general": ["gsm8k", "mmlu", "arc", "truthfulqa"],
}


@dataclass
class AgentConfig:
Expand All @@ -86,6 +98,8 @@ class AgentConfig:
eval_model: str = ""
agent_command: str = "" # shell command to start the agent server
agent_startup_wait: int = 3 # seconds to wait after spawning before sending traffic
agent_type: str = "general" # Used by `evalmonkey recommend` to surface relevant benchmarks
private_benchmarks: list = field(default_factory=list) # Custom REST dataset configs
extra: dict = field(default_factory=dict)


Expand Down Expand Up @@ -116,6 +130,8 @@ def load_config(config_path: Optional[str] = None) -> Optional[AgentConfig]:
eval_model=str(raw.get("eval_model", os.getenv("EVAL_MODEL", ""))),
agent_command=str(agent_raw.get("agent_command", "")),
agent_startup_wait=int(agent_raw.get("agent_startup_wait", 3)),
agent_type=str(agent_raw.get("agent_type", "general")),
private_benchmarks=list(raw.get("private_benchmarks", [])),
extra=raw,
)
return None
Expand Down Expand Up @@ -154,6 +170,10 @@ def generate_config_yaml(framework: str, name: str, port: int) -> str:
# How EvalMonkey reads the answer back (dot-notation for nested fields)
response_path: {preset['response_path']} # dot-path to extract the answer text

# Agent type — drives `evalmonkey recommend` to show only relevant benchmarks
# Options: general | research_agent | coding_agent | rag_agent | customer_support | voice_agent | safety_agent
agent_type: general

# Which LLM EvalMonkey uses as the judge (can also be set via EVAL_MODEL env var)
eval_model: "gpt-4o" # or: anthropic.claude-3-haiku-20240307-v1:0, ollama/llama3, etc.
"""
27 changes: 27 additions & 0 deletions evalmonkey/reporting/history.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,33 @@ def get_history(scenario: str = None) -> list:
history = [h for h in history if h.get("scenario") == scenario]
return history

def detect_regression(scenario: str, current_score: int, threshold: int = 5) -> dict | None:
    """
    Compare the current baseline score with the previous baseline run of a scenario.

    Call this *after* record_run() has saved the current score: the history then
    holds at least two baselines, the newest one ([-1]) being the current run and
    the one before it ([-2]) being the previous run that is compared against.

    Args:
        scenario: Scenario ID whose baseline history is inspected.
        current_score: Score of the run that was just recorded.
        threshold: Minimum drop (in points) that counts as a regression.

    Returns:
        A dict with the keys "scenario", "prev_score", "current_score", "drop"
        and "threshold" when the score dropped by at least ``threshold`` points;
        otherwise None. None is also returned when fewer than two baselines are
        recorded or the previous baseline carries no usable score.
    """
    baselines = [
        record
        for record in get_history(scenario=scenario)
        if record.get("run_type") == "baseline"
    ]
    if len(baselines) < 2:
        return None  # Not enough history to compare

    # Oldest first. `or ""` covers a timestamp that is missing *and* one stored
    # as an explicit null, which would otherwise break the str comparison.
    baselines.sort(key=lambda record: record.get("timestamp") or "")

    prev_score = baselines[-2].get("score", 0)
    if prev_score is None:
        # dict.get() only falls back to the default for a *missing* key; a
        # stored null score cannot be compared, so report "no regression".
        return None

    drop = prev_score - current_score
    if drop < threshold:
        return None
    return {
        "scenario": scenario,
        "prev_score": prev_score,
        "current_score": current_score,
        "drop": drop,
        "threshold": threshold,
    }


def calculate_production_reliability(scenario: str = None) -> float:
"""
Calculates the 'Production Reliability' metric.
Expand Down
40 changes: 40 additions & 0 deletions evalmonkey/reporting/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,3 +101,43 @@ def print_history_trends(scenario_name: str, history: list, production_reliabili
rel_color = "green" if production_reliability > 80 else "yellow" if production_reliability > 60 else "red"
console.print(f"\n🚀 [bold white]Production Reliability Metric:[/bold white] [bold {rel_color}]{production_reliability:.1f} / 100.0[/bold {rel_color}]")
console.print("[dim](Calculated as 60% of most recent baseline capability + 40% most recent chaos resilience)[/dim]\n")


def print_regression_warning(scenario: str, prev_score: int, curr_score: int, drop: int) -> None:
    """Render a centered, red-bordered "regression detected" panel on the console.

    Args:
        scenario: Scenario ID shown in the panel and in the debug hint.
        prev_score: Score of the previous baseline run.
        curr_score: Score of the current baseline run.
        drop: Number of points lost between the two runs.
    """
    # (text, style) pairs, appended in order to build the panel body.
    segments = (
        (f"Scenario: {scenario}\n", "bold white"),
        (f"Previous Score: {prev_score} → Current Score: {curr_score} ", "white"),
        (f"(drop: {drop} pts)\n", "bold red"),
        ("\nYour agent's baseline score regressed versus the last run.", "dim yellow"),
        (f"\n\nDebug: evalmonkey history --scenario {scenario}", "dim"),
        ("\nFix: evalmonkey generate-evals --traces-file <output-dir>/traces.json", "dim"),
    )
    body = Text()
    for fragment, fragment_style in segments:
        body.append(fragment, style=fragment_style)

    warning_panel = Panel(
        body,
        title="[bold red]⚠️ REGRESSION DETECTED[/bold red]",
        border_style="red",
        expand=False,
        padding=(1, 2),
    )
    # Blank spacer first so the panel stands apart from preceding output.
    console.print("\n")
    console.print(Align.center(warning_panel))


def print_recommend_suite(agent_type: str, benchmarks: dict, categories: dict) -> None:
    """Print the curated benchmark table recommended for ``agent_type``.

    Args:
        agent_type: Agent type shown in the heading.
        benchmarks: Mapping of scenario ID -> description, one table row each.
        categories: Mapping of scenario ID -> category label; IDs without an
            entry get an empty category cell.
    """
    heading = (
        f"\n[bold cyan]🐵 EvalMonkey — Recommended Benchmarks for: "
        f"[bold white]{agent_type}[/bold white][/bold cyan]"
    )
    console.print(heading)
    console.print("[dim]Based on agent_type in your evalmonkey.yaml. Run 'evalmonkey list-benchmarks' to see all.[/dim]\n")

    suite = Table(box=box.SIMPLE, show_header=True, header_style="bold magenta")
    for header, column_style in (("Scenario ID", "bold white"), ("Category", "cyan")):
        suite.add_column(header, style=column_style)
    suite.add_column("Description")

    for scenario_id in benchmarks:
        suite.add_row(scenario_id, categories.get(scenario_id, ""), benchmarks[scenario_id])

    console.print(suite)

    usage_hint = "\n[dim]Run: evalmonkey run-benchmark --scenario <id> --target-url <url>[/dim]"
    usage_hint += "\n[dim]Run all: evalmonkey run-benchmark --scenario <id> for each scenario above[/dim]\n"
    console.print(usage_hint)
116 changes: 116 additions & 0 deletions evalmonkey/reporting/report_generator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
"""
Report Generator
================
Generates a shareable Markdown Agent Card from local EvalMonkey run history.

Usage:
evalmonkey report [--output evalmonkey_report.md] [--agent-name "My Agent"]
"""
from __future__ import annotations

import os
from collections import defaultdict
from datetime import datetime, timezone
from typing import Optional

from evalmonkey.reporting.history import get_history, calculate_production_reliability


def _badge_color(score: int) -> str:
if score >= 80:
return "brightgreen"
elif score >= 60:
return "yellow"
else:
return "red"


def _badge_url(score: int) -> str:
    """Build the shields.io badge URL that displays the overall score.

    The badge color is chosen by _badge_color(score).
    """
    badge_color = _badge_color(score)
    # "%3A" is a percent-encoded colon, so the badge text reads "Score:<score>".
    message = "Score%3A" + f"{score}"
    return "https://img.shields.io/badge/EvalMonkey-" + message + "-" + badge_color


def generate_report(
    output_path: str = "evalmonkey_report.md",
    agent_name: str = "My Agent",
) -> str:
    """
    Build the Markdown Agent Card from recorded run history and write it to disk.

    The history returned by get_history() is replayed in timestamp order, keeping
    the latest non-null baseline and chaos score per scenario. The overall badge
    score is the (truncated) mean of the latest baseline scores, or 0 when no
    baseline has been recorded.

    Args:
        output_path: File the report is written to; missing parent directories
            are created.
        agent_name: Display name used in the report heading.

    Returns:
        The full Markdown content as a string (the same text that was written).
    """
    # Replay oldest-to-newest so the last value kept per scenario really is the
    # latest run, independent of storage order. sorted() is stable, so records
    # without a timestamp keep their relative order; str() avoids a TypeError
    # should timestamps of different types ever be mixed.
    history = sorted(
        get_history(),
        key=lambda record: str(record.get("timestamp") or ""),
    )

    # Group: scenario -> { baseline: int|None, chaos: int|None }
    scores: dict[str, dict[str, Optional[int]]] = defaultdict(lambda: {"baseline": None, "chaos": None})
    for record in history:
        run_type = record.get("run_type", "")
        score = record.get("score")
        if score is None or run_type not in ("baseline", "chaos"):
            continue  # other run types and score-less records do not appear in the report
        scores[record.get("scenario", "unknown")][run_type] = score

    # Overall score = average of all latest baseline scores
    baseline_scores = [entry["baseline"] for entry in scores.values() if entry["baseline"] is not None]
    overall_score = int(sum(baseline_scores) / len(baseline_scores)) if baseline_scores else 0
    badge_url = _badge_url(overall_score)
    badge_markdown = f"[![EvalMonkey Score]({badge_url})](https://github.com/Corbell-AI/evalmonkey)"

    now_str = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")

    lines: list[str] = [
        f"# Agent Benchmark Report — {agent_name}",
        "",
        badge_markdown,
        "",
        f"> Generated by [EvalMonkey](https://github.com/Corbell-AI/evalmonkey) on {now_str}",
        "",
        "## Benchmark Scores",
        "",
        "| Scenario | Baseline | Chaos | Production Reliability |",
        "|----------|:--------:|:-----:|:----------------------:|",
    ]

    for scenario in sorted(scores):
        entry = scores[scenario]
        baseline = entry["baseline"]
        chaos = entry["chaos"]
        reliability = calculate_production_reliability(scenario)

        # A raw "|" in the scenario name would be read as a column separator.
        scenario_cell = str(scenario).replace("|", "\\|")
        b_str = f"**{baseline}**" if baseline is not None else "—"
        c_str = str(chaos) if chaos is not None else "—"
        # A falsy reliability (0.0 / None) is rendered as "no data".
        r_str = f"{reliability:.1f}" if reliability else "—"
        lines.append(f"| `{scenario_cell}` | {b_str} | {c_str} | {r_str} |")

    if not scores:
        lines.append("| *(no runs recorded yet)* | — | — | — |")

    lines += [
        "",
        "## What is Production Reliability?",
        "",
        "Production Reliability = `(baseline_score × 0.6) + (chaos_score × 0.4)`",
        "",
        "It combines how well your agent performs on clean inputs with how resilient it is",
        "under adversarial conditions (typos, prompt injection, schema mutations, etc.).",
        "",
        "---",
        "",
        "*Embed this badge in your README:*",
        "```markdown",
        badge_markdown,
        "```",
    ]

    content = "\n".join(lines)

    # abspath() guarantees a non-empty dirname even for a bare file name.
    os.makedirs(os.path.dirname(os.path.abspath(output_path)), exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(content)

    return content
Loading
Loading