Corbell-AI · himmi-01 · May 28, 2026 · May 28, 2026
diff --git a/.env.example b/.env.example
@@ -69,3 +69,32 @@ LANGFUSE_SECRET_KEY="sk-lf-..."
 # Used by: evalmonkey generate-evals --langfuse-dataset <name>
 #          demo_rag_app.sh (automatic if keys are set)
 # LANGFUSE_DATASET="evalmonkey_failures"
+
+# ----------------------------------------
+# 5. Regression Guard (Optional)
+# ----------------------------------------
+# Score drop (in points) that triggers a regression warning after run-benchmark
+# and causes `evalmonkey guard` to exit with code 1.
+# Default: 5  (i.e. a drop of 5+ points vs the previous baseline is flagged)
+EVAL_REGRESSION_THRESHOLD=5
+
+# ----------------------------------------
+# 6. External Dataset Providers (Optional)
+# ----------------------------------------
+# Use EvalMonkey as the chaos + scoring harness on top of datasets you already
+# maintain in eval platforms you subscribe to.
+#
+# Confident AI (DeepEval cloud)
+# Get your key: https://app.confident-ai.com → Settings → API Keys
+# Usage: evalmonkey run-benchmark --scenario confident-ai::<dataset_id>
+CONFIDENT_AI_API_KEY="conf-..."
+
+# Braintrust
+# Get your key: https://www.braintrustdata.com → Settings → API Keys
+# Usage: evalmonkey run-benchmark --scenario braintrust::<project>/<dataset>
+BRAINTRUST_API_KEY="bt-..."
+
+# LangSmith (LangChain)
+# Get your key: https://smith.langchain.com → Settings → API Keys
+# Usage: evalmonkey run-benchmark --scenario langsmith::<dataset_id>
+LANGSMITH_API_KEY="ls__..."
diff --git a/evalmonkey/config/agent_config.py b/evalmonkey/config/agent_config.py
@@ -74,6 +74,18 @@
     },
 }
 
+# Curated standard benchmark IDs per agent_type; the keys are the agent_type options
+# offered in evalmonkey.yaml. `evalmonkey recommend` shows this suite instead of all 22.
+AGENT_TYPE_BENCHMARKS: dict[str, list[str]] = {
+    "research_agent": ["hotpotqa", "drop", "natural-questions", "gaia-benchmark"],
+    "coding_agent": ["human-eval", "mbpp", "apps", "swe-bench"],
+    "rag_agent": ["hotpotqa", "natural-questions", "drop", "truthfulqa"],
+    "customer_support": ["daily-dialog", "multiwoz", "mt-bench", "alpacaeval"],
+    "voice_agent": ["daily-dialog", "multiwoz", "spokentext-cleanup"],
+    "safety_agent": ["truthfulqa", "toxigen", "arc", "bbh"],
+    "general": ["gsm8k", "mmlu", "arc", "truthfulqa"],
+}
+
 
 @dataclass
 class AgentConfig:
@@ -86,6 +98,8 @@ class AgentConfig:
     eval_model: str = ""
     agent_command: str = ""         # shell command to start the agent server
     agent_startup_wait: int = 3     # seconds to wait after spawning before sending traffic
+    agent_type: str = "general"     # Used by `evalmonkey recommend` to surface relevant benchmarks
+    private_benchmarks: list = field(default_factory=list)  # Custom REST dataset configs
     extra: dict = field(default_factory=dict)
 
 
@@ -116,6 +130,8 @@ def load_config(config_path: Optional[str] = None) -> Optional[AgentConfig]:
                 eval_model=str(raw.get("eval_model", os.getenv("EVAL_MODEL", ""))),
                 agent_command=str(agent_raw.get("agent_command", "")),
                 agent_startup_wait=int(agent_raw.get("agent_startup_wait", 3)),
+                agent_type=str(agent_raw.get("agent_type", "general")),
+                private_benchmarks=list(raw.get("private_benchmarks", [])),
                 extra=raw,
             )
     return None
@@ -154,6 +170,10 @@ def generate_config_yaml(framework: str, name: str, port: int) -> str:
   # How EvalMonkey reads the answer back (dot-notation for nested fields)
   response_path: {preset['response_path']}   # dot-path to extract the answer text
 
+  # Agent type — drives `evalmonkey recommend` to show only relevant benchmarks
+  # Options: general | research_agent | coding_agent | rag_agent | customer_support | voice_agent | safety_agent
+  agent_type: general
+
 # Which LLM EvalMonkey uses as the judge (can also be set via EVAL_MODEL env var)
 eval_model: "gpt-4o"   # or: anthropic.claude-3-haiku-20240307-v1:0, ollama/llama3, etc.
 """
diff --git a/evalmonkey/reporting/history.py b/evalmonkey/reporting/history.py
@@ -39,6 +39,33 @@ def get_history(scenario: str = None) -> list:
         history = [h for h in history if h.get("scenario") == scenario]
     return history
 
+def detect_regression(scenario: str, current_score: int, threshold: int = 5) -> "dict | None":
+    """
+    Report a baseline regression for `scenario`, or None when there is none.
+
+    `current_score` is compared with the second-newest scored baseline (ordered by
+    timestamp), so call this *after* record_run() has saved the current run. Returns a
+    dict with scenario, prev_score, current_score, drop and threshold when the score fell
+    by >= threshold points; None otherwise or with fewer than two scored baselines.
+    """
+    # NOTE: the return annotation is quoted so importing this module does not need Python 3.10+.
+    # Baselines with a missing/None score are skipped: they cannot be meaningfully compared.
+    scored = [r for r in get_history(scenario=scenario)
+              if r.get("run_type") == "baseline" and r.get("score") is not None]
+    # Sort ascending by timestamp to ensure correct ordering
+    baselines = sorted(scored, key=lambda r: r.get("timestamp", ""))
+    if len(baselines) < 2:
+        return None  # Not enough history to compare
+    prev_score = baselines[-2]["score"]
+    drop = prev_score - current_score  # positive when the score went down
+    if drop >= threshold:
+        return {
+            "scenario": scenario, "prev_score": prev_score, "current_score": current_score,
+            "drop": drop, "threshold": threshold,
+        }
+    return None
+
+
 def calculate_production_reliability(scenario: str = None) -> float:
     """
     Calculates the 'Production Reliability' metric.

diff --git a/evalmonkey/reporting/markdown.py b/evalmonkey/reporting/markdown.py
@@ -101,3 +101,43 @@ def print_history_trends(scenario_name: str, history: list, production_reliabili
     rel_color = "green" if production_reliability > 80 else "yellow" if production_reliability > 60 else "red"
     console.print(f"\n🚀 [bold white]Production Reliability Metric:[/bold white] [bold {rel_color}]{production_reliability:.1f} / 100.0[/bold {rel_color}]")
     console.print("[dim](Calculated as 60% of most recent baseline capability + 40% most recent chaos resilience)[/dim]\n")
+
+
+def print_regression_warning(scenario: str, prev_score: int, curr_score: int, drop: int) -> None:
+    """Render a centered, red-bordered panel announcing a baseline score regression."""
+    # (text, style) segments, appended to the panel body in display order.
+    segments = [
+        (f"Scenario: {scenario}\n", "bold white"),
+        (f"Previous Score: {prev_score}  →  Current Score: {curr_score}  ", "white"),
+        (f"(drop: {drop} pts)\n", "bold red"),
+        ("\nYour agent's baseline score regressed versus the last run.", "dim yellow"),
+        (f"\n\nDebug:  evalmonkey history --scenario {scenario}", "dim"),
+        ("\nFix:    evalmonkey generate-evals --traces-file <output-dir>/traces.json", "dim"),
+    ]
+    body = Text()
+    for segment, segment_style in segments:
+        body.append(segment, style=segment_style)
+    banner = Panel(body, title="[bold red]⚠️  REGRESSION DETECTED[/bold red]",
+                   border_style="red", expand=False, padding=(1, 2))
+    console.print("\n")
+    console.print(Align.center(banner))
+
+
+def print_recommend_suite(agent_type: str, benchmarks: dict, categories: dict) -> None:
+    """Show the benchmarks recommended for `agent_type` as a Scenario ID / Category / Description table."""
+    headline = f"\n[bold cyan]🐵 EvalMonkey — Recommended Benchmarks for: [bold white]{agent_type}[/bold white][/bold cyan]"
+    hint = "[dim]Based on agent_type in your evalmonkey.yaml. Run 'evalmonkey list-benchmarks' to see all.[/dim]\n"
+    console.print(headline)
+    console.print(hint)
+
+    suite = Table(box=box.SIMPLE, show_header=True, header_style="bold magenta")
+    suite.add_column("Scenario ID", style="bold white")
+    suite.add_column("Category", style="cyan")
+    suite.add_column("Description")
+    # `benchmarks` maps scenario id -> description; ids absent from `categories` get a blank cell.
+    for scenario_id, description in benchmarks.items():
+        suite.add_row(scenario_id, categories.get(scenario_id, ""), description)
+    console.print(suite)
+    usage = "\n[dim]Run: evalmonkey run-benchmark --scenario <id> --target-url <url>[/dim]"
+    usage += "\n[dim]Run all: evalmonkey run-benchmark --scenario <id> for each scenario above[/dim]\n"
+    console.print(usage)
diff --git a/evalmonkey/reporting/report_generator.py b/evalmonkey/reporting/report_generator.py
@@ -0,0 +1,116 @@
+"""
+Report Generator
+================
+Generates a shareable Markdown Agent Card from local EvalMonkey run history.
+
+Usage:
+    evalmonkey report [--output evalmonkey_report.md] [--agent-name "My Agent"]
+"""
+from __future__ import annotations
+
+import os
+from collections import defaultdict
+from datetime import datetime, timezone
+from typing import Optional
+
+from evalmonkey.reporting.history import get_history, calculate_production_reliability
+
+
+def _badge_color(score: int) -> str:
+    """Map a score to a badge color: >= 80 brightgreen, >= 60 yellow, otherwise red."""
+    # Tiers are checked from the highest cutoff down; the first one reached wins.
+    for cutoff, color in ((80, "brightgreen"), (60, "yellow")):
+        if score >= cutoff:
+            return color
+    return "red"
+
+
+def _badge_url(score: int) -> str:
+    """Build the shields.io badge URL showing `Score:<score>` in the color tier for `score`."""
+    # "%3A" is a URL-encoded ":"; the badge path is EvalMonkey-<message>-<color>.
+    tier = _badge_color(score)
+    return f"https://img.shields.io/badge/EvalMonkey-Score%3A{score}-{tier}"
+
+
+def generate_report(
+    output_path: str = "evalmonkey_report.md",
+    agent_name: str = "My Agent",
+) -> str:
+    """
+    Build the Markdown Agent Card from the recorded run history and save it.
+
+    For each scenario, the last baseline and last chaos score met while walking
+    get_history() in order are kept; the badge shows the truncated mean of those
+    baseline scores (0 when there are none). The report is written to output_path,
+    creating its parent directory if needed, and is also returned as a string.
+    """
+    # scenario → {"baseline": ..., "chaos": ...}; a later record overwrites an earlier one
+    latest: dict[str, dict[str, Optional[int]]] = defaultdict(lambda: {"baseline": None, "chaos": None})
+    for entry in get_history():
+        kind, value = entry.get("run_type", ""), entry.get("score")
+        if value is not None and kind in ("baseline", "chaos"):
+            latest[entry.get("scenario", "unknown")][kind] = value
+
+    # Overall score = mean of the latest baseline scores, truncated to an int
+    baseline_values = [pair["baseline"] for pair in latest.values() if pair["baseline"] is not None]
+    overall = int(sum(baseline_values) / len(baseline_values)) if baseline_values else 0
+    badge = _badge_url(overall)
+    badge_md = f"[![EvalMonkey Score]({badge})](https://github.com/Corbell-AI/evalmonkey)"
+    generated_at = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
+
+    # Title, badge, generation stamp, and the head of the scores table
+    doc: list[str] = [
+        f"# Agent Benchmark Report — {agent_name}",
+        "",
+        badge_md,
+        "",
+        f"> Generated by [EvalMonkey](https://github.com/Corbell-AI/evalmonkey) on {generated_at}",
+        "",
+        "## Benchmark Scores",
+        "",
+        "| Scenario | Baseline | Chaos | Production Reliability |",
+        "|----------|:--------:|:-----:|:----------------------:|",
+    ]
+
+    # One table row per scenario, in sorted order
+    for name in sorted(latest):
+        base, chaos = latest[name]["baseline"], latest[name]["chaos"]
+        reliability = calculate_production_reliability(name)
+        cells = [
+            f"`{name}`",
+            "—" if base is None else f"**{base}**",
+            "—" if chaos is None else str(chaos),
+            f"{reliability:.1f}" if reliability else "—",  # a falsy (e.g. 0) reliability renders as a dash
+        ]
+        doc.append("| " + " | ".join(cells) + " |")
+
+    if not latest:
+        doc.append("| *(no runs recorded yet)* | — | — | — |")
+
+    # Explanation of the reliability metric plus a copy-paste badge snippet
+    doc.extend([
+        "",
+        "## What is Production Reliability?",
+        "",
+        "Production Reliability = `(baseline_score × 0.6) + (chaos_score × 0.4)`",
+        "",
+        "It combines how well your agent performs on clean inputs with how resilient it is",
+        "under adversarial conditions (typos, prompt injection, schema mutations, etc.).",
+        "",
+        "---",
+        "",
+        "*Embed this badge in your README:*",
+        "```markdown",
+        badge_md,
+        "```",
+    ])
+
+    markdown = "\n".join(doc)
+
+    # Make sure the destination directory exists before writing
+    parent_dir = os.path.dirname(os.path.abspath(output_path))
+    os.makedirs(parent_dir, exist_ok=True)
+    with open(output_path, "w", encoding="utf-8") as f:
+        f.write(markdown)
+
+    return markdown