diff --git a/.env.example b/.env.example
index 13e103c..f39a8db 100644
--- a/.env.example
+++ b/.env.example
@@ -69,3 +69,32 @@ LANGFUSE_SECRET_KEY="sk-lf-..."
 # Used by: evalmonkey generate-evals --langfuse-dataset <name>
 #          demo_rag_app.sh (automatic if keys are set)
 # LANGFUSE_DATASET="evalmonkey_failures"
+
+# ----------------------------------------
+# 5. Regression Guard (Optional)
+# ----------------------------------------
+# Score drop (in points) that triggers a regression warning after run-benchmark
+# and causes `evalmonkey guard` to exit with code 1.
+# Default: 5  (i.e. a drop of 5+ points vs the previous baseline is flagged)
+EVAL_REGRESSION_THRESHOLD=5
+
+# ----------------------------------------
+# 6. External Dataset Providers (Optional)
+# ----------------------------------------
+# Use EvalMonkey as the chaos + scoring harness on top of datasets you already
+# maintain in eval platforms you subscribe to.
+#
+# Confident AI (DeepEval cloud)
+# Get your key: https://app.confident-ai.com → Settings → API Keys
+# Usage: evalmonkey run-benchmark --scenario confident-ai::<dataset_id>
+CONFIDENT_AI_API_KEY="conf-..."
+
+# Braintrust
+# Get your key: https://www.braintrustdata.com → Settings → API Keys
+# Usage: evalmonkey run-benchmark --scenario braintrust::<project>/<dataset>
+BRAINTRUST_API_KEY="bt-..."
+
+# LangSmith (LangChain)
+# Get your key: https://smith.langchain.com → Settings → API Keys
+# Usage: evalmonkey run-benchmark --scenario langsmith::<dataset_id>
+LANGSMITH_API_KEY="ls__..."
diff --git a/evalmonkey/config/agent_config.py b/evalmonkey/config/agent_config.py
index f8bfae7..c08db95 100644
--- a/evalmonkey/config/agent_config.py
+++ b/evalmonkey/config/agent_config.py
@@ -74,6 +74,18 @@
     },
 }
 
+# Maps each agent_type to the most relevant standard benchmark IDs.
+# Used by `evalmonkey recommend` to surface a curated suite instead of all 22.
+AGENT_TYPE_BENCHMARKS: dict[str, list[str]] = {
+    "research_agent":    ["hotpotqa", "drop", "natural-questions", "gaia-benchmark"],
+    "coding_agent":      ["human-eval", "mbpp", "apps", "swe-bench"],
+    "rag_agent":         ["hotpotqa", "natural-questions", "drop", "truthfulqa"],
+    "customer_support":  ["daily-dialog", "multiwoz", "mt-bench", "alpacaeval"],
+    "voice_agent":       ["daily-dialog", "multiwoz", "spokentext-cleanup"],
+    "safety_agent":      ["truthfulqa", "toxigen", "arc", "bbh"],
+    "general":           ["gsm8k", "mmlu", "arc", "truthfulqa"],
+}
+
 
 @dataclass
 class AgentConfig:
@@ -86,6 +98,8 @@ class AgentConfig:
     eval_model: str = ""
     agent_command: str = ""         # shell command to start the agent server
     agent_startup_wait: int = 3     # seconds to wait after spawning before sending traffic
+    agent_type: str = "general"     # Used by `evalmonkey recommend` to surface relevant benchmarks
+    private_benchmarks: list = field(default_factory=list)  # Custom REST dataset configs
     extra: dict = field(default_factory=dict)
 
 
@@ -116,6 +130,8 @@ def load_config(config_path: Optional[str] = None) -> Optional[AgentConfig]:
                 eval_model=str(raw.get("eval_model", os.getenv("EVAL_MODEL", ""))),
                 agent_command=str(agent_raw.get("agent_command", "")),
                 agent_startup_wait=int(agent_raw.get("agent_startup_wait", 3)),
+                agent_type=str(agent_raw.get("agent_type", "general")),
+                private_benchmarks=list(raw.get("private_benchmarks", [])),
                 extra=raw,
             )
     return None
@@ -154,6 +170,10 @@ def generate_config_yaml(framework: str, name: str, port: int) -> str:
   # How EvalMonkey reads the answer back (dot-notation for nested fields)
   response_path: {preset['response_path']}   # dot-path to extract the answer text
 
+  # Agent type — drives `evalmonkey recommend` to show only relevant benchmarks
+  # Options: general | research_agent | coding_agent | rag_agent | customer_support | voice_agent | safety_agent
+  agent_type: general
+
 # Which LLM EvalMonkey uses as the judge (can also be set via EVAL_MODEL env var)
 eval_model: "gpt-4o"   # or: anthropic.claude-3-haiku-20240307-v1:0, ollama/llama3, etc.
 """
diff --git a/evalmonkey/reporting/history.py b/evalmonkey/reporting/history.py
index f011b25..e08c661 100644
--- a/evalmonkey/reporting/history.py
+++ b/evalmonkey/reporting/history.py
@@ -39,6 +39,33 @@ def get_history(scenario: str = None) -> list:
         history = [h for h in history if h.get("scenario") == scenario]
     return history
 
+def detect_regression(scenario: str, current_score: int, threshold: int = 5) -> dict | None:
+    """
+    Check whether `current_score` regressed against the previous baseline of `scenario`.
+
+    The caller is expected to have saved the current run via record_run() first, so the
+    newest baseline in history is the current run and the second-newest is the reference.
+
+    Returns a dict (scenario, prev_score, current_score, drop, threshold) when the score
+    fell by at least `threshold` points; returns None otherwise or with < 2 baselines.
+    """
+    # Keep only baseline runs, oldest first (records without a timestamp sort first).
+    ordered = sorted(
+        (run for run in get_history(scenario=scenario) if run.get("run_type") == "baseline"),
+        key=lambda run: run.get("timestamp", ""),
+    )
+    if len(ordered) < 2:
+        return None  # Not enough history to compare
+    reference = ordered[-2].get("score", 0)
+    delta = reference - current_score
+    if delta >= threshold:
+        return {
+            "scenario": scenario, "prev_score": reference, "current_score": current_score,
+            "drop": delta, "threshold": threshold,
+        }
+    return None
+
+
 def calculate_production_reliability(scenario: str = None) -> float:
     """
     Calculates the 'Production Reliability' metric.
diff --git a/evalmonkey/reporting/markdown.py b/evalmonkey/reporting/markdown.py
index e5f7511..42cb2fd 100644
--- a/evalmonkey/reporting/markdown.py
+++ b/evalmonkey/reporting/markdown.py
@@ -101,3 +101,43 @@ def print_history_trends(scenario_name: str, history: list, production_reliabili
     rel_color = "green" if production_reliability > 80 else "yellow" if production_reliability > 60 else "red"
     console.print(f"\n🚀 [bold white]Production Reliability Metric:[/bold white] [bold {rel_color}]{production_reliability:.1f} / 100.0[/bold {rel_color}]")
     console.print("[dim](Calculated as 60% of most recent baseline capability + 40% most recent chaos resilience)[/dim]\n")
+
+
+def print_regression_warning(scenario: str, prev_score: int, curr_score: int, drop: int) -> None:
+    """Prints a loud red regression-detected panel to the terminal."""
+    content = Text()
+    content.append(f"Scenario: {scenario}\n", style="bold white")
+    content.append(f"Previous Score: {prev_score}  →  Current Score: {curr_score}  ", style="white")
+    content.append(f"(drop: {drop} pts)\n", style="bold red")
+    content.append("\nYour agent's baseline score regressed versus the last run.", style="dim yellow")
+    content.append(f"\n\nDebug:  evalmonkey history --scenario {scenario}", style="dim")
+    content.append(f"\nFix:    evalmonkey generate-evals --traces-file <output-dir>/traces.json", style="dim")
+    panel = Panel(
+        content,
+        title="[bold red]⚠️  REGRESSION DETECTED[/bold red]",
+        border_style="red",
+        expand=False,
+        padding=(1, 2),
+    )
+    console.print("\n")
+    console.print(Align.center(panel))
+
+
+def print_recommend_suite(agent_type: str, benchmarks: dict, categories: dict) -> None:
+    """Prints a curated benchmark recommendation table for the given agent_type."""
+    console.print(f"\n[bold cyan]🐵 EvalMonkey — Recommended Benchmarks for: [bold white]{agent_type}[/bold white][/bold cyan]")
+    console.print(f"[dim]Based on agent_type in your evalmonkey.yaml. Run 'evalmonkey list-benchmarks' to see all.[/dim]\n")
+
+    table = Table(box=box.SIMPLE, show_header=True, header_style="bold magenta")
+    table.add_column("Scenario ID", style="bold white")
+    table.add_column("Category", style="cyan")
+    table.add_column("Description")
+
+    for b_id, desc in benchmarks.items():
+        table.add_row(b_id, categories.get(b_id, ""), desc)
+
+    console.print(table)
+    console.print(
+        "\n[dim]Run: evalmonkey run-benchmark --scenario <id> --target-url <url>[/dim]"
+        "\n[dim]Run all: evalmonkey run-benchmark --scenario <id> for each scenario above[/dim]\n"
+    )
diff --git a/evalmonkey/reporting/report_generator.py b/evalmonkey/reporting/report_generator.py
new file mode 100644
index 0000000..e0526d0
--- /dev/null
+++ b/evalmonkey/reporting/report_generator.py
@@ -0,0 +1,116 @@
+"""
+Report Generator
+================
+Generates a shareable Markdown Agent Card from local EvalMonkey run history.
+
+Usage:
+    evalmonkey report [--output evalmonkey_report.md] [--agent-name "My Agent"]
+"""
+from __future__ import annotations
+
+import os
+from collections import defaultdict
+from datetime import datetime, timezone
+from typing import Optional
+
+from evalmonkey.reporting.history import get_history, calculate_production_reliability
+
+
+def _badge_color(score: int) -> str:
+    """Map a score to a badge colour name: >= 80 brightgreen, >= 60 yellow, otherwise red."""
+    # Thresholds are checked from highest to lowest; the first one reached wins.
+    for floor, colour in ((80, "brightgreen"), (60, "yellow")):
+        if score >= floor:
+            return colour
+    return "red"
+
+
+def _badge_url(score: int) -> str:
+    """Return the img.shields.io badge URL labelled ``Score:<score>`` for the given overall score."""
+    # "%3A" is a URL-encoded ":"; the last path segment is the colour from _badge_color().
+    message, shade = f"Score%3A{score}", _badge_color(score)
+    return f"https://img.shields.io/badge/EvalMonkey-{message}-{shade}"
+
+
+def generate_report(
+    output_path: str = "evalmonkey_report.md",
+    agent_name: str = "My Agent",
+) -> str:
+    """
+    Builds a Markdown agent card from the run history returned by get_history():
+    the last-seen baseline and chaos score per scenario plus an overall badge.
+    Writes it to output_path, creating missing parent directories.
+    Returns the full Markdown content as a string.
+    """
+    history = get_history()
+
+    # Group: scenario → { baseline: int|None, chaos: int|None }
+    scores: dict[str, dict[str, Optional[int]]] = defaultdict(lambda: {"baseline": None, "chaos": None})
+
+    for record in history:  # later records overwrite earlier ones, so the last one seen wins
+        scenario = record.get("scenario", "unknown")
+        run_type = record.get("run_type", "")
+        score = record.get("score")
+        if run_type == "baseline" and score is not None:
+            scores[scenario]["baseline"] = score
+        elif run_type == "chaos" and score is not None:
+            scores[scenario]["chaos"] = score
+
+    # Overall score = average of all latest baseline scores (truncated to int; 0 with no baselines)
+    baseline_scores = [v["baseline"] for v in scores.values() if v["baseline"] is not None]
+    overall_score = int(sum(baseline_scores) / len(baseline_scores)) if baseline_scores else 0
+    badge_url = _badge_url(overall_score)
+
+    now_str = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
+
+    lines: list[str] = [
+        f"# Agent Benchmark Report — {agent_name}",
+        "",
+        f"[![EvalMonkey Score]({badge_url})](https://github.com/Corbell-AI/evalmonkey)",
+        "",
+        f"> Generated by [EvalMonkey](https://github.com/Corbell-AI/evalmonkey) on {now_str}",
+        "",
+        "## Benchmark Scores",
+        "",
+        "| Scenario | Baseline | Chaos | Production Reliability |",
+        "|----------|:--------:|:-----:|:----------------------:|",
+    ]
+
+    for scenario in sorted(scores):
+        s = scores[scenario]
+        baseline = s["baseline"]
+        chaos = s["chaos"]
+        reliability = calculate_production_reliability(scenario)
+
+        b_str = f"**{baseline}**" if baseline is not None else "—"
+        c_str = str(chaos) if chaos is not None else "—"
+        r_str = f"{reliability:.1f}" if reliability else "—"  # a falsy (0.0) reliability renders as "—"
+        lines.append(f"| `{scenario}` | {b_str} | {c_str} | {r_str} |")
+
+    if not scores:
+        lines.append("| *(no runs recorded yet)* | — | — | — |")
+
+    lines += [
+        "",
+        "## What is Production Reliability?",
+        "",
+        "Production Reliability = `(baseline_score × 0.6) + (chaos_score × 0.4)`",
+        "",
+        "It combines how well your agent performs on clean inputs with how resilient it is",
+        "under adversarial conditions (typos, prompt injection, schema mutations, etc.).",
+        "",
+        "---",
+        "",
+        "*Embed this badge in your README:*",
+        "```markdown",
+        f"[![EvalMonkey Score]({badge_url})](https://github.com/Corbell-AI/evalmonkey)",
+        "```",
+    ]
+
+    content = "\n".join(lines)
+
+    os.makedirs(os.path.dirname(os.path.abspath(output_path)), exist_ok=True)
+    with open(output_path, "w", encoding="utf-8") as f:
+        f.write(content)
+
+    return content
diff --git a/evalmonkey/scenarios/private_benchmarks.py b/evalmonkey/scenarios/private_benchmarks.py
new file mode 100644
index 0000000..fe8f744
--- /dev/null
+++ b/evalmonkey/scenarios/private_benchmarks.py
@@ -0,0 +1,494 @@
+"""
+External and Private Dataset Support
+=====================================
+EvalMonkey supports four ways to bring your own evaluation data:
+
+1. Local files  — `evalmonkey run-benchmark --dataset my_cases.jsonl`
+2. HuggingFace  — `--scenario hf::org/dataset-name` (any public or gated HF dataset)
+3. Generic REST — configure a URL in evalmonkey.yaml under `private_benchmarks`
+4. Eval platforms you already use (Confident AI, Braintrust, LangSmith) — see below
+
+EvalMonkey acts as a harness: it fetches your data, normalizes it to EvalScenario,
+then runs chaos injection + LLM scoring. Your data stays on your machine.
+
+Benchmark ID convention:
+  - hf::<org>/<dataset>            → any HuggingFace dataset
+  - confident-ai::<dataset_id>     → Confident AI (DeepEval) dataset
+  - braintrust::<project>/<dataset>→ Braintrust dataset
+  - langsmith::<dataset_id>        → LangSmith dataset
+  - <configured-id>                → private_benchmarks entry in evalmonkey.yaml
+"""
+from __future__ import annotations
+
+import os
+import json
+import csv
+import io
+import re
+from abc import ABC, abstractmethod
+from typing import List, Dict, Optional, Any
+
+import httpx
+
+from evalmonkey.evals.local_assets import EvalScenario
+
+
+# ---------------------------------------------------------------------------
+# Base class
+# ---------------------------------------------------------------------------
+
+class PrivateBenchmarkLoader(ABC):
+    """Abstract base class for all private/external benchmark loaders."""
+
+    @abstractmethod
+    def load(self, limit: int = 10) -> List[EvalScenario]:
+        """Fetch and normalise dataset rows into EvalScenario objects."""
+
+
+# ---------------------------------------------------------------------------
+# Local-file loader (JSONL / JSON / CSV)
+# ---------------------------------------------------------------------------
+
+class LocalFileLoader(PrivateBenchmarkLoader):
+    """
+    Load from a local file (JSONL, JSON, or CSV).
+
+    Expected field names (flexible — any name works for input_field):
+        input_field        → becomes input_payload["question"]
+        expected_field     → becomes expected_behavior_rubric
+        id_field           → (optional) scenario ID
+        description_field  → (optional) human-readable description
+
+    Example JSONL row:
+        {"question": "What is 2+2?", "expected_answer": "4"}
+
+    Example CSV:
+        question,expected_answer
+        "What is 2+2?","4"
+    """
+
+    def __init__(
+        self,
+        filepath: str,
+        input_field: str = "question",
+        expected_field: str = "expected_answer",
+        id_field: str = "id",
+        description_field: str = "description",
+    ):
+        self.filepath = filepath
+        self.input_field = input_field
+        self.expected_field = expected_field
+        self.id_field = id_field
+        self.description_field = description_field
+
+    def load(self, limit: int = 10) -> List[EvalScenario]:
+        rows = self._read_file()
+        return self._normalise(rows, limit)
+
+    def _read_file(self) -> List[Dict[str, Any]]:
+        """Parse self.filepath into a list of row dicts; raises ValueError if no parser fits."""
+        fp = self.filepath
+        lowered = fp.lower()  # match extensions case-insensitively (".CSV", ".JSONL")
+        if lowered.endswith(".jsonl"):
+            with open(fp, "r", encoding="utf-8") as f:
+                return [json.loads(line) for line in f if line.strip()]
+        elif lowered.endswith(".json"):
+            with open(fp, "r", encoding="utf-8") as f:
+                data = json.load(f)
+                return data if isinstance(data, list) else [data]  # a single object becomes one row
+        elif lowered.endswith(".csv"):
+            with open(fp, "r", encoding="utf-8", newline="") as f:
+                return [dict(row) for row in csv.DictReader(f)]
+        # Unknown extension: try JSONL as a fallback before giving up.
+        with open(fp, "r", encoding="utf-8") as f:
+            lines = [line.strip() for line in f if line.strip()]
+        try:
+            return [json.loads(line) for line in lines]
+        except json.JSONDecodeError as exc:
+            raise ValueError(f"Unsupported file format: {fp}. Supported: .jsonl, .json, .csv") from exc
+
+    def _normalise(self, rows: List[Dict], limit: int) -> List[EvalScenario]:
+        """Turn the first `limit` rows into EvalScenario objects with string question and rubric."""
+        scenarios = []
+        for i, row in enumerate(rows[:limit]):
+            # JSON rows may carry non-string values (numbers, dicts); stringify like the other loaders.
+            question = str(row.get(self.input_field, row))
+            rubric = str(row.get(self.expected_field) or "")
+            if not rubric:
+                rubric = f"The agent should correctly answer: {question}"
+            scenarios.append(EvalScenario(
+                id=str(row.get(self.id_field, f"local-{i}")),
+                description=str(row.get(self.description_field, f"Local eval #{i}")),
+                input_payload={"question": question},
+                expected_behavior_rubric=rubric,
+            ))
+        return scenarios
+
+
+# ---------------------------------------------------------------------------
+# HuggingFace direct loader  (hf::<org>/<dataset>)
+# ---------------------------------------------------------------------------
+
+class HuggingFaceLoader(PrivateBenchmarkLoader):
+    """
+    Load any HuggingFace dataset by its repository ID.
+
+    Usage:  --scenario hf::org/dataset-name
+    Options configurable via loader kwargs:
+        split         default "train"
+        input_col     column name for the question/input
+        expected_col  column name for the expected answer (optional)
+        config_name   HF dataset config name (optional)
+    """
+
+    def __init__(
+        self,
+        dataset_id: str,
+        split: str = "train",
+        input_col: str = "question",
+        expected_col: Optional[str] = None,
+        config_name: Optional[str] = None,
+    ):
+        self.dataset_id = dataset_id
+        self.split = split
+        self.input_col = input_col
+        self.expected_col = expected_col
+        self.config_name = config_name
+
+    def load(self, limit: int = 10) -> List[EvalScenario]:
+        try:
+            from datasets import load_dataset  # type: ignore
+        except ImportError:
+            raise ImportError("HuggingFace 'datasets' package required. Run: pip install datasets")
+
+        kwargs: Dict[str, Any] = {
+            "split": self.split,
+            "streaming": True,
+            "trust_remote_code": False,
+        }
+        if self.config_name:
+            kwargs["name"] = self.config_name
+
+        ds = load_dataset(self.dataset_id, **kwargs)
+
+        scenarios: List[EvalScenario] = []
+        for i, row in enumerate(ds):
+            if i >= limit:
+                break
+            # Try to find a sensible input column
+            question = self._get_col(row, self.input_col) or self._first_string_col(row)
+            rubric_val = self._get_col(row, self.expected_col) if self.expected_col else None
+            rubric = (
+                f"The expected answer is: {rubric_val}"
+                if rubric_val
+                else f"The agent should correctly answer the following question: {question}"
+            )
+            scenarios.append(EvalScenario(
+                id=f"hf-{self.dataset_id.replace('/', '-')}-{i}",
+                description=f"HuggingFace dataset: {self.dataset_id} (row {i})",
+                input_payload={"question": str(question)},
+                expected_behavior_rubric=rubric,
+            ))
+        return scenarios
+
+    @staticmethod
+    def _get_col(row: dict, col: Optional[str]) -> Optional[str]:
+        if col and col in row:
+            return str(row[col])
+        return None
+
+    @staticmethod
+    def _first_string_col(row: dict) -> str:
+        """Return the first string value longer than 5 chars, else str() of the first value ("" if empty)."""
+        candidates = (v for v in row.values() if isinstance(v, str) and len(v) > 5)
+        fallback = str(next(iter(row.values()), ""))  # an empty row no longer raises IndexError
+        return next(candidates, fallback)
+
+
+# ---------------------------------------------------------------------------
+# Confident AI (DeepEval) loader   (confident-ai::<dataset_id>)
+# ---------------------------------------------------------------------------
+
+class ConfidentAILoader(PrivateBenchmarkLoader):
+    """
+    Load a dataset from Confident AI (DeepEval cloud platform).
+
+    Requires: CONFIDENT_AI_API_KEY in .env
+    Dataset ID: the name or UUID of a dataset in your Confident AI workspace.
+
+    Usage:  --scenario confident-ai::my-rag-evals
+    """
+
+    BASE_URL = "https://api.confident-ai.com/v1"
+
+    def __init__(self, dataset_id: str, api_key: Optional[str] = None):
+        self.dataset_id = dataset_id
+        self.api_key = api_key or os.getenv("CONFIDENT_AI_API_KEY", "")
+        if not self.api_key:
+            raise ValueError(
+                "CONFIDENT_AI_API_KEY not set. Add it to your .env file.\n"
+                "Get your key from: https://app.confident-ai.com → Settings → API Keys"
+            )
+
+    def load(self, limit: int = 10) -> List[EvalScenario]:
+        url = f"{self.BASE_URL}/datasets/{self.dataset_id}"
+        headers = {"Authorization": f"Bearer {self.api_key}"}
+        resp = httpx.get(url, headers=headers, timeout=30)
+        resp.raise_for_status()
+        data = resp.json()
+        goldens = data.get("goldens", data.get("data", []))
+        return self._normalise(goldens, limit)
+
+    def _normalise(self, goldens: list, limit: int) -> List[EvalScenario]:
+        scenarios = []
+        for i, g in enumerate(goldens[:limit]):
+            question = g.get("input", g.get("query", str(g)))
+            expected = g.get("expected_output", g.get("expected", ""))
+            rubric = (
+                f"The expected answer is: {expected}. Grade how accurately the agent addresses this."
+                if expected
+                else "Grade how well the agent addresses the question."
+            )
+            scenarios.append(EvalScenario(
+                id=f"confident-ai-{self.dataset_id}-{i}",
+                description=f"Confident AI dataset: {self.dataset_id}",
+                input_payload={"question": str(question)},
+                expected_behavior_rubric=rubric,
+            ))
+        return scenarios
+
+
+# ---------------------------------------------------------------------------
+# Braintrust loader   (braintrust::<project>/<dataset>)
+# ---------------------------------------------------------------------------
+
+class BraintrustLoader(PrivateBenchmarkLoader):
+    """
+    Load a dataset from Braintrust.
+
+    Requires: BRAINTRUST_API_KEY in .env
+    Dataset ref: "<project_id_or_name>/<dataset_name>"  (slash-separated)
+
+    Usage:  --scenario braintrust::my-project/golden-set
+    """
+
+    BASE_URL = "https://api.braintrustdata.com/v1"
+
+    def __init__(self, dataset_ref: str, api_key: Optional[str] = None):
+        self.dataset_ref = dataset_ref
+        self.api_key = api_key or os.getenv("BRAINTRUST_API_KEY", "")
+        if not self.api_key:
+            raise ValueError(
+                "BRAINTRUST_API_KEY not set. Add it to your .env file.\n"
+                "Get your key from: https://www.braintrustdata.com → Settings"
+            )
+
+    def load(self, limit: int = 10) -> List[EvalScenario]:
+        # NOTE(review): dataset_ref is interpolated into the path as-is — a "<project>/<dataset>" ref is not resolved to a dataset id here; confirm the endpoint accepts it.
+        url = f"{self.BASE_URL}/dataset/{self.dataset_ref}/fetch"
+        headers = {"Authorization": f"Bearer {self.api_key}"}
+        resp = httpx.get(url, headers=headers, params={"limit": limit}, timeout=30)
+        resp.raise_for_status()
+        events = resp.json().get("events", [])
+        return self._normalise(events, limit)
+
+    def _normalise(self, events: list, limit: int) -> List[EvalScenario]:
+        """Convert up to `limit` Braintrust events into EvalScenario objects."""
+        scenarios = []
+        for i, event in enumerate(events[:limit]):
+            inp = event["input"] if event.get("input") is not None else {}  # JSON null is treated like a missing input
+            question = inp.get("question", str(inp)) if isinstance(inp, dict) else inp  # non-dict inputs are used whole
+            expected = event.get("expected", "")
+            rubric = (
+                f"The expected answer is: {expected}." if expected
+                else "Grade how well the agent addresses the question."
+            )
+            scenarios.append(EvalScenario(
+                id=f"braintrust-{i}",
+                description=f"Braintrust dataset: {self.dataset_ref}",
+                input_payload={"question": str(question)},
+                expected_behavior_rubric=rubric,
+            ))
+        return scenarios
+
+
+# ---------------------------------------------------------------------------
+# LangSmith loader   (langsmith::<dataset_id>)
+# ---------------------------------------------------------------------------
+
+class LangSmithLoader(PrivateBenchmarkLoader):
+    """
+    Load examples from a LangSmith dataset.
+
+    Requires: LANGSMITH_API_KEY in .env
+    Dataset ID: the UUID or name of a dataset in your LangSmith org.
+
+    Usage:  --scenario langsmith::my-dataset-id
+    """
+
+    BASE_URL = "https://api.smith.langchain.com"
+
+    def __init__(self, dataset_id: str, api_key: Optional[str] = None):
+        self.dataset_id = dataset_id
+        self.api_key = api_key or os.getenv("LANGSMITH_API_KEY", "")
+        if not self.api_key:
+            raise ValueError(
+                "LANGSMITH_API_KEY not set. Add it to your .env file.\n"
+                "Get your key from: https://smith.langchain.com → Settings → API Keys"
+            )
+
+    def load(self, limit: int = 10) -> List[EvalScenario]:
+        url = f"{self.BASE_URL}/datasets/{self.dataset_id}/examples"
+        headers = {"x-api-key": self.api_key}
+        resp = httpx.get(url, headers=headers, params={"limit": limit}, timeout=30)
+        resp.raise_for_status()
+        examples = resp.json()
+        if isinstance(examples, dict):
+            examples = examples.get("examples", examples.get("data", []))
+        return self._normalise(examples, limit)
+
+    def _normalise(self, examples: list, limit: int) -> List[EvalScenario]:
+        """Convert up to `limit` LangSmith examples into EvalScenario objects."""
+        scenarios = []
+        for i, ex in enumerate(examples[:limit]):
+            inputs = ex.get("inputs") or {}    # `or {}`: a null field would crash the .get() calls below
+            outputs = ex.get("outputs") or {}  # presumably null when no reference output is stored — confirm
+            question = inputs.get("question", inputs.get("input", str(inputs)))
+            expected = outputs.get("answer", outputs.get("output", outputs.get("expected", "")))
+            rubric = (
+                f"The expected answer is: {expected}." if expected
+                else "Grade how well the agent addresses the question."
+            )
+            scenarios.append(EvalScenario(
+                id=f"langsmith-{self.dataset_id}-{i}",
+                description=f"LangSmith dataset: {self.dataset_id}",
+                input_payload={"question": str(question)},
+                expected_behavior_rubric=rubric,
+            ))
+        return scenarios
+
+
+# ---------------------------------------------------------------------------
+# Generic REST loader   (configured in evalmonkey.yaml private_benchmarks)
+# ---------------------------------------------------------------------------
+
+class GenericRESTLoader(PrivateBenchmarkLoader):
+    """
+    Load from any REST endpoint that returns a JSON array of eval rows.
+
+    Configuration in evalmonkey.yaml:
+        private_benchmarks:
+          - id: "my-support-evals"
+            name: "Customer Support Golden Set"
+            url: "https://my-api.company.com/v1/eval-dataset"
+            auth_header: "Authorization: Bearer {MY_API_KEY}"
+            input_field: "question"
+            expected_field: "ideal_answer"
+            category: "Customer Support"
+
+    Any {VAR_NAME} tokens in auth_header are resolved from environment variables.
+    """
+
+    def __init__(
+        self,
+        url: str,
+        auth_header: Optional[str] = None,
+        input_field: str = "question",
+        expected_field: str = "expected_answer",
+        name: str = "custom",
+    ):
+        self.url = url
+        self.auth_header = self._resolve_env(auth_header) if auth_header else None
+        self.input_field = input_field
+        self.expected_field = expected_field
+        self.name = name
+
+    @staticmethod
+    def _resolve_env(template: str) -> str:
+        """Expand {VAR_NAME} tokens from the environment; tokens naming unset variables are kept verbatim."""
+        return re.sub(
+            r"\{([A-Z0-9_]+)\}", lambda tok: os.getenv(tok.group(1), tok.group(0)), template
+        )
+
+    def load(self, limit: int = 10) -> List[EvalScenario]:
+        headers = {}
+        if self.auth_header:
+            key, _, val = self.auth_header.partition(":")
+            headers[key.strip()] = val.strip()
+
+        resp = httpx.get(self.url, headers=headers, timeout=30)
+        resp.raise_for_status()
+        rows = resp.json()
+        if isinstance(rows, dict):
+            rows = rows.get("data", rows.get("items", rows.get("results", [])))
+
+        scenarios = []
+        for i, row in enumerate(rows[:limit]):
+            question = row.get(self.input_field, str(row))
+            expected = row.get(self.expected_field, "")
+            rubric = (
+                f"The expected answer is: {expected}."
+                if expected
+                else f"Grade how well the agent addresses: {question}"
+            )
+            scenarios.append(EvalScenario(
+                id=f"{self.name}-{i}",
+                description=f"Private dataset: {self.name}",
+                input_payload={"question": str(question)},
+                expected_behavior_rubric=rubric,
+            ))
+        return scenarios
+
+
+# ---------------------------------------------------------------------------
+# Top-level factory function
+# ---------------------------------------------------------------------------
+
+def load_private_benchmark(
+    benchmark_id: str,
+    limit: int = 10,
+    private_benchmarks_config: Optional[List[Dict]] = None,
+) -> List[EvalScenario]:
+    """
+    Resolve `benchmark_id` to a loader and return up to `limit` scenarios from it.
+
+    Prefixed ids are dispatched on their "<name>::" prefix and the remainder is handed
+    to the loader unchanged:
+        hf::<org/dataset>           → HuggingFaceLoader
+        confident-ai::<dataset_id>  → ConfidentAILoader
+        braintrust::<ref>           → BraintrustLoader
+        langsmith::<dataset_id>     → LangSmithLoader
+
+    Any other id is looked up by its "id" key in `private_benchmarks_config` (the
+    private_benchmarks list from evalmonkey.yaml) and served by GenericRESTLoader.
+    An id with no matching entry yields an empty list so the caller can fall back.
+    """
+    # Prefix → loader factory; lambdas defer the class lookup until a prefix matches.
+    factories = {
+        "hf::": lambda ref: HuggingFaceLoader(ref),
+        "confident-ai::": lambda ref: ConfidentAILoader(ref),
+        "braintrust::": lambda ref: BraintrustLoader(ref),
+        "langsmith::": lambda ref: LangSmithLoader(ref),
+    }
+
+    for prefix, build in factories.items():
+        if benchmark_id.startswith(prefix):
+            remainder = benchmark_id[len(prefix):]
+            return build(remainder).load(limit=limit)
+
+    # No known prefix: fall back to the entries configured in evalmonkey.yaml.
+    entry = next(
+        (cfg for cfg in (private_benchmarks_config or []) if cfg.get("id") == benchmark_id),
+        None,
+    )
+    if not entry:
+        return []
+
+    rest_loader = GenericRESTLoader(
+        url=entry["url"],
+        auth_header=entry.get("auth_header"),
+        input_field=entry.get("input_field", "question"),
+        expected_field=entry.get("expected_field", "expected_answer"),
+        name=entry.get("name", benchmark_id),
+    )
+    return rest_loader.load(limit=limit)
diff --git a/evalmonkey/scenarios/standard_benchmarks.py b/evalmonkey/scenarios/standard_benchmarks.py
index 87b1db9..acb28a3 100644
--- a/evalmonkey/scenarios/standard_benchmarks.py
+++ b/evalmonkey/scenarios/standard_benchmarks.py
@@ -128,7 +128,19 @@ def load_standard_benchmark(benchmark_name: str, limit: int = 5) -> List[EvalSce
     """
     Adapter for well-known standard agent benchmarks from HuggingFace Datasets.
     Automatically downloads datasets and converts them to standard HTTP scenarios!
+
+    Also handles private/external dataset prefixes:
+        hf::<org/dataset>           → any HuggingFace dataset (direct load)
+        confident-ai::<dataset_id>  → Confident AI (DeepEval) dataset
+        braintrust::<ref>           → Braintrust dataset
+        langsmith::<dataset_id>     → LangSmith dataset
     """
+    # ── Private / external dataset routing ───────────────────────────────────
+    PRIVATE_PREFIXES = ("hf::", "confident-ai::", "braintrust::", "langsmith::")
+    if any(benchmark_name.startswith(p) for p in PRIVATE_PREFIXES):
+        from evalmonkey.scenarios.private_benchmarks import load_private_benchmark
+        return load_private_benchmark(benchmark_name, limit=limit)
+
     try:
         import os
         # Prevent PyTorch shared-memory multiprocessing on Mac.
diff --git a/scripts/cli.py b/scripts/cli.py
index b03415c..11f5976 100644
--- a/scripts/cli.py
+++ b/scripts/cli.py
@@ -15,11 +15,13 @@
     print_banner, 
     print_benchmark_score, 
     print_chaos_result,
-    print_history_trends
+    print_history_trends,
+    print_regression_warning,
+    print_recommend_suite,
 )
-from evalmonkey.scenarios.standard_benchmarks import load_standard_benchmark, get_supported_benchmarks, get_benchmarks_by_category
-from evalmonkey.reporting.history import record_run, get_history, calculate_production_reliability
-from evalmonkey.config.agent_config import load_config, generate_config_yaml, FRAMEWORK_PRESETS
+from evalmonkey.scenarios.standard_benchmarks import load_standard_benchmark, get_supported_benchmarks, get_benchmarks_by_category, get_benchmark_categories
+from evalmonkey.reporting.history import record_run, get_history, calculate_production_reliability, detect_regression
+from evalmonkey.config.agent_config import load_config, generate_config_yaml, FRAMEWORK_PRESETS, AGENT_TYPE_BENCHMARKS
 
 app = typer.Typer(help="EvalMonkey: Open-source Agent Benchmarking and Chaos Framework")
 console = Console()
@@ -170,6 +172,7 @@ def _spawn_sample_agent(sample_agent: str):
 @app.command()
 def run_benchmark(
     scenario: str = typer.Option(..., help="Scenario ID, standard benchmark (e.g. gsm8k), or custom_eval ID"),
+    dataset: str = typer.Option(None, help="Path to a local dataset file (.jsonl, .json, .csv) to use as the benchmark source."),
     target_url: str = typer.Option(None, help="Address of the BYO agent API (e.g. http://localhost:8000). Required unless using --sample-agent."),
     sample_agent: str = typer.Option(None, help="Automatically spawn a sample agent in the background (rag_app or research_agent)"),
     eval_file: str = typer.Option("custom_evals.yaml", help="Path to evaluation assets"),
@@ -220,6 +223,16 @@ def run_benchmark(
     if standard_evals:
         console.print(f"[bold cyan]=> Loaded {len(standard_evals)} samples from standard benchmark subset: {scenario}[/bold cyan]")
         evals_to_run = standard_evals
+    elif dataset:
+        # --dataset flag: load from a local file directly
+        from evalmonkey.scenarios.private_benchmarks import LocalFileLoader
+        console.print(f"[bold cyan]=> Loading dataset from local file: {dataset}[/bold cyan]")
+        loader = LocalFileLoader(dataset)
+        evals_to_run = loader.load(limit=limit)
+        if not evals_to_run:
+            console.print(f"[bold red]No eval rows found in {dataset}. Check the file format (JSONL/JSON/CSV).[/bold red]")
+            if agent_process: agent_process.terminate()
+            return
     else:
         console.print(f"[bold cyan]=> Loading local BYO eval assets from {eval_file}[/bold cyan]")
         evals = load_local_evals(eval_file)
@@ -277,6 +290,18 @@ def run_benchmark(
         record_run(scenario, "baseline", final_score, details={"reasoning": overall_reasoning, "sample_size": len(scores)})
         print_benchmark_score(scenario, final_score, overall_reasoning, baseline)
 
+        # ── Regression detection (informational — use `evalmonkey guard` for CI gating) ──
+        import os as _os
+        _threshold = int(_os.getenv("EVAL_REGRESSION_THRESHOLD", "5"))
+        regression = detect_regression(scenario, final_score, threshold=_threshold)
+        if regression:
+            print_regression_warning(
+                scenario=scenario,
+                prev_score=regression["prev_score"],
+                curr_score=final_score,
+                drop=regression["drop"],
+            )
+
         # ── Eval Asset Generation on failure ──────────────────────────────
         if asset_gen.has_failures:
             output_dir = build_output_dir(scenario)
@@ -427,6 +452,118 @@ def history(scenario: str = typer.Option(None, help="Specific scenario ID to vie
         reliability = calculate_production_reliability(scenario=s)
         print_history_trends(s, s_hist, reliability)
 
+
+@app.command()
+def recommend():
+    """
+    Show the recommended benchmark suite for your agent type.
+    Reads agent_type from evalmonkey.yaml (default: general).
+    Set agent_type in your config to get a curated list instead of all 22 benchmarks.
+    """
+    print_banner()
+    config = load_config()
+    agent_type = "general" if not config else getattr(config, "agent_type", "general")
+    suite_ids = AGENT_TYPE_BENCHMARKS.get(agent_type, AGENT_TYPE_BENCHMARKS["general"])
+    catalogue = get_supported_benchmarks()
+    categories = get_benchmark_categories()
+
+    # Drop IDs missing from the catalogue so a stale mapping entry cannot raise KeyError.
+    relevant = {bid: catalogue[bid] for bid in suite_ids if bid in catalogue}
+    if relevant:
+        print_recommend_suite(agent_type, relevant, categories)
+        return
+
+    # Nothing matched: list the agent types the mapping knows about.
+    known_types = ', '.join(AGENT_TYPE_BENCHMARKS.keys())
+    console.print(
+        f"[bold yellow]No benchmarks found for agent_type '{agent_type}'. "
+        f"Available types: {known_types}[/bold yellow]"
+    )
+
+
+@app.command()
+def guard(
+    scenario: str = typer.Option(..., help="Benchmark scenario to check for regression"),
+    fail_threshold: int = typer.Option(
+        None,
+        help="Score drop (pts) that triggers failure. Defaults to EVAL_REGRESSION_THRESHOLD env var (default: 5).",
+    ),
+):
+    """
+    Check for a score regression vs the last baseline and exit with code 1 if detected.
+    Use this in CI/CD pipelines to block deploys when your agent regresses.
+
+    Example (GitHub Actions):
+      - run: evalmonkey guard --scenario gsm8k
+    """
+    import os as _os
+    threshold = fail_threshold if fail_threshold is not None else int(_os.getenv("EVAL_REGRESSION_THRESHOLD", "5"))
+
+    # Only baseline runs take part in the comparison; sort them oldest -> newest by timestamp.
+    baselines = sorted(
+        [r for r in get_history(scenario) if r.get("run_type") == "baseline"],
+        key=lambda r: r.get("timestamp", ""),
+    )
+
+    if len(baselines) < 2:
+        console.print(
+            f"[dim]Not enough baseline history for '{scenario}' to detect regression "
+            f"(need ≥ 2 runs). Run evalmonkey run-benchmark --scenario {scenario} at least twice.[/dim]"
+        )
+        raise SystemExit(0)
+
+    curr_score = baselines[-1].get("score", 0)
+    prev_score = baselines[-2].get("score", 0)
+    drop = prev_score - curr_score
+    # Require a real drop: with threshold <= 0, `drop >= threshold` alone fails unchanged/improved runs.
+    if drop > 0 and drop >= threshold:
+        print_regression_warning(scenario, prev_score, curr_score, drop)
+        console.print(
+            f"\n[bold red]❌ Guard failed: {scenario} regressed by {drop} pts "
+            f"(threshold: {threshold}). Exiting with code 1.[/bold red]\n"
+        )
+        raise SystemExit(1)
+    else:
+        trend = f"+{abs(drop)}" if drop < 0 else "±0" if drop == 0 else f"-{drop}"
+        color = "green" if drop <= 0 else "yellow"
+        console.print(
+            f"\n[bold {color}]✅ Guard passed: {scenario} — "
+            f"score {curr_score}/100 (vs prev {prev_score}/100, Δ {trend}). "
+            f"No regression detected.[/bold {color}]\n"
+        )
+        raise SystemExit(0)
+
+
+@app.command()
+def report(
+    output: str = typer.Option("evalmonkey_report.md", help="Output Markdown file path"),
+    agent_name: str = typer.Option(
+        None,
+        help="Agent display name for the report title. Defaults to name in evalmonkey.yaml.",
+    ),
+):
+    """
+    Generate a shareable Markdown Agent Card from your local benchmark history.
+    Includes a shields.io badge, per-scenario score table, and production reliability.
+
+    Paste the badge into your README to show your agent's benchmark scores!
+    """
+    from evalmonkey.reporting.report_generator import generate_report
+
+    config = load_config()
+    # --agent-name wins; otherwise the config's name, or a placeholder when no config was loaded.
+    display_name = agent_name if agent_name else (config.name if config else "My Agent")
+
+    if not get_history():
+        console.print("[bold yellow]No benchmark history found. Run evalmonkey run-benchmark first![/bold yellow]")
+        return
+
+    generate_report(output_path=output, agent_name=display_name)
+    console.print(f"\n[bold green]✅ Agent card generated: {output}[/bold green]")
+    console.print("[dim]Embed the badge in your README, share the file, or commit it to your repo.[/dim]")
+    console.print(f"[bold cyan]   cat {output}[/bold cyan]\n")
+
+
 @app.command()
 def run_chaos_suite(
     scenario: str = typer.Option(..., help="Scenario ID to test all chaos profiles against"),
diff --git a/tests/test_private_benchmarks.py b/tests/test_private_benchmarks.py
new file mode 100644
index 0000000..098cf79
--- /dev/null
+++ b/tests/test_private_benchmarks.py
@@ -0,0 +1,386 @@
+"""
+Tests for private_benchmarks.py — all external network calls are mocked.
+
+We test:
+  - LocalFileLoader (JSONL, JSON, CSV)
+  - HuggingFaceLoader (hf:: prefix) via a patched HuggingFaceLoader.load
+  - ConfidentAILoader — mocked httpx
+  - BraintrustLoader  — mocked httpx
+  - LangSmithLoader   — mocked httpx
+  - GenericRESTLoader — mocked httpx
+  - load_private_benchmark() routing function
+"""
+import csv
+import io
+import json
+import os
+import tempfile
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from evalmonkey.scenarios.private_benchmarks import (
+    LocalFileLoader,
+    HuggingFaceLoader,
+    ConfidentAILoader,
+    BraintrustLoader,
+    LangSmithLoader,
+    GenericRESTLoader,
+    load_private_benchmark,
+)
+from evalmonkey.evals.local_assets import EvalScenario
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _write_jsonl(tmp_path, rows):
+    target = tmp_path / "data.jsonl"  # one JSON object per line, no trailing newline
+    target.write_text("\n".join(map(json.dumps, rows)))
+    return str(target)
+
+
+def _write_json(tmp_path, rows):
+    target = tmp_path / "data.json"  # the whole payload as a single JSON document
+    target.write_text(json.dumps(rows))
+    return str(target)
+
+
+def _write_csv(tmp_path, rows, fieldnames):
+    csv_path = str(tmp_path / "data.csv")
+    with open(csv_path, "w", newline="") as handle:  # csv module manages line endings
+        sheet = csv.DictWriter(handle, fieldnames=fieldnames)
+        sheet.writeheader()
+        sheet.writerows(rows)
+    return csv_path
+
+
+# ---------------------------------------------------------------------------
+# LocalFileLoader
+# ---------------------------------------------------------------------------
+
+class TestLocalFileLoader:
+    def test_load_jsonl(self, tmp_path):
+        """A five-row JSONL file yields five EvalScenario objects, first row first."""
+        records = [{"question": f"Q{i}", "expected_answer": f"A{i}"} for i in range(5)]
+        source = _write_jsonl(tmp_path, records)
+        loaded = LocalFileLoader(source).load(limit=5)
+        assert len(loaded) == 5
+        assert all(isinstance(item, EvalScenario) for item in loaded)
+        assert loaded[0].input_payload["question"] == "Q0"
+
+    def test_load_json(self, tmp_path):
+        """A JSON file holding a one-element array yields exactly one scenario."""
+        records = [{"question": "What is 2+2?", "expected_answer": "4"}]
+        source = _write_json(tmp_path, records)
+        loaded = LocalFileLoader(source).load(limit=10)
+        assert len(loaded) == 1
+        assert "2+2" in loaded[0].input_payload["question"]
+
+    def test_load_csv(self, tmp_path):
+        """A CSV file with a header row and one data row yields exactly one scenario."""
+        records = [{"question": "Hello?", "expected_answer": "Hi!"}]
+        source = _write_csv(tmp_path, records, ["question", "expected_answer"])
+        loaded = LocalFileLoader(source).load(limit=10)
+        assert len(loaded) == 1
+        assert loaded[0].input_payload["question"] == "Hello?"
+
+    def test_limit_respected(self, tmp_path):
+        """With 20 rows on disk and limit=5, only five scenarios come back."""
+        records = [{"question": f"Q{i}", "expected_answer": f"A{i}"} for i in range(20)]
+        source = _write_jsonl(tmp_path, records)
+        loaded = LocalFileLoader(source).load(limit=5)
+        assert len(loaded) == 5
+
+    def test_custom_field_names(self, tmp_path):
+        """Custom input/expected field names still feed the question payload and the rubric."""
+        records = [{"prompt": "What is AI?", "ideal": "Artificial Intelligence."}]
+        source = _write_json(tmp_path, records)
+        loaded = LocalFileLoader(source, input_field="prompt", expected_field="ideal").load(limit=10)
+        assert "What is AI?" in loaded[0].input_payload["question"]
+        assert "Artificial Intelligence" in loaded[0].expected_behavior_rubric
+
+    def test_rubric_fallback_when_no_expected_field(self, tmp_path):
+        """A row without an expected answer still loads and receives a fallback rubric."""
+        records = [{"question": "Explain gravity."}]
+        source = _write_json(tmp_path, records)
+        loaded = LocalFileLoader(source).load(limit=5)
+        assert len(loaded) == 1
+        # Accept either fallback wording: one echoing the question, or the generic lead-in.
+        rubric = loaded[0].expected_behavior_rubric
+        assert "gravity" in rubric.lower() or rubric.startswith("The agent should")
+
+    def test_empty_file(self, tmp_path):
+        """A zero-byte JSONL file loads as an empty list."""
+        empty = tmp_path / "empty.jsonl"
+        empty.write_text("")
+        loaded = LocalFileLoader(str(empty)).load(limit=5)
+        assert loaded == []
+
+
+# ---------------------------------------------------------------------------
+# HuggingFaceLoader (hf:: prefix) — HuggingFaceLoader.load is patched
+# ---------------------------------------------------------------------------
+
+class TestHuggingFaceLoader:
+    def _mock_ds(self, rows):
+        """Return an iterable mock that behaves like a streaming HF dataset."""
+        return iter(rows)
+
+    def test_basic_load(self):
+        """Smoke test of construction + call plumbing; load() is patched, so no dataset is read."""
+        with patch("evalmonkey.scenarios.private_benchmarks.HuggingFaceLoader.load") as mock_load:
+            mock_load.return_value = [
+                EvalScenario(
+                    id=f"hf-test-{i}",
+                    description="HuggingFace dataset: test/ds",
+                    input_payload={"question": f"HF Q{i}"},
+                    expected_behavior_rubric=f"Expected: A{i}",
+                )
+                for i in range(3)
+            ]
+            scenarios = HuggingFaceLoader("test/ds").load(limit=3)
+        mock_load.assert_called_once_with(limit=3)
+        assert len(scenarios) == 3
+        assert all(isinstance(s, EvalScenario) for s in scenarios)
+
+    def test_load_via_datasets_mock(self):
+        """Column-name kwargs are accepted by the constructor; load() is patched, not exercised."""
+        # TODO: patch the underlying datasets call instead of load() to cover the real row parsing.
+        with patch("evalmonkey.scenarios.private_benchmarks.HuggingFaceLoader.load") as mock_load:
+            mock_load.return_value = [EvalScenario(
+                id="hf-test/ds-0",
+                description="HuggingFace dataset: test/ds",
+                input_payload={"question": "What year was Python created?"},
+                expected_behavior_rubric="The expected answer is: 1991",
+            )]
+            loader = HuggingFaceLoader("test/ds", input_col="question", expected_col="answer")
+            result = loader.load(limit=1)
+        mock_load.assert_called_once_with(limit=1)
+        assert len(result) == 1
+        assert result[0].input_payload["question"] == "What year was Python created?"
+
+
+# ---------------------------------------------------------------------------
+# ConfidentAILoader — mocked httpx
+# ---------------------------------------------------------------------------
+
+class TestConfidentAILoader:
+    def test_load_goldens(self):
+        """Each entry under "goldens" becomes a scenario: input -> question, expected_output -> rubric."""
+        payload = {
+            "goldens": [
+                {"input": "What is ML?", "expected_output": "Machine Learning"},
+                {"input": "What is AI?", "expected_output": "Artificial Intelligence"},
+            ]
+        }
+        fake_resp = MagicMock()
+        fake_resp.json.return_value = payload
+        fake_resp.raise_for_status.return_value = None
+
+        with patch("evalmonkey.scenarios.private_benchmarks.httpx.get", return_value=fake_resp):
+            loaded = ConfidentAILoader("my-rag-evals", api_key="conf-test-key").load(limit=10)
+
+        assert len(loaded) == 2
+        assert loaded[0].input_payload["question"] == "What is ML?"
+        assert "Machine Learning" in loaded[0].expected_behavior_rubric
+
+    def test_raises_without_api_key(self):
+        """With no api_key argument and an emptied environment, construction raises ValueError."""
+        with patch.dict(os.environ, {}, clear=True):
+            with pytest.raises(ValueError, match="CONFIDENT_AI_API_KEY"):
+                ConfidentAILoader("dataset-id")
+
+    def test_limit_respected(self):
+        """Ten goldens in the response and limit=3 yield three scenarios."""
+        payload = {
+            "goldens": [{"input": f"Q{i}", "expected_output": f"A{i}"} for i in range(10)]
+        }
+        fake_resp = MagicMock()
+        fake_resp.json.return_value = payload
+        fake_resp.raise_for_status.return_value = None
+
+        with patch("evalmonkey.scenarios.private_benchmarks.httpx.get", return_value=fake_resp):
+            loaded = ConfidentAILoader("my-dataset", api_key="conf-key").load(limit=3)
+
+        assert len(loaded) == 3
+
+
+# ---------------------------------------------------------------------------
+# BraintrustLoader — mocked httpx
+# ---------------------------------------------------------------------------
+
+class TestBraintrustLoader:
+    def test_load_events(self):
+        """Both dict-shaped and plain-string "input" values under "events" load as scenarios."""
+        payload = {
+            "events": [
+                {"input": {"question": "What is RAG?"}, "expected": "Retrieval-Augmented Generation"},
+                {"input": "What is an agent?", "expected": "An autonomous AI system"},
+            ]
+        }
+        fake_resp = MagicMock()
+        fake_resp.json.return_value = payload
+        fake_resp.raise_for_status.return_value = None
+
+        with patch("evalmonkey.scenarios.private_benchmarks.httpx.get", return_value=fake_resp):
+            loaded = BraintrustLoader("proj/dataset", api_key="bt-test-key").load(limit=10)
+
+        assert len(loaded) == 2
+
+    def test_raises_without_api_key(self):
+        """With no api_key argument and an emptied environment, construction raises ValueError."""
+        with patch.dict(os.environ, {}, clear=True):
+            with pytest.raises(ValueError, match="BRAINTRUST_API_KEY"):
+                BraintrustLoader("proj/dataset")
+
+
+# ---------------------------------------------------------------------------
+# LangSmithLoader — mocked httpx
+# ---------------------------------------------------------------------------
+
+class TestLangSmithLoader:
+    def test_load_examples(self):
+        """A bare list of examples loads: inputs.question -> payload, outputs.answer -> rubric."""
+        payload = [
+            {"inputs": {"question": "What is LangChain?"}, "outputs": {"answer": "A framework for LLMs"}},
+            {"inputs": {"question": "What is LangSmith?"}, "outputs": {"answer": "An observability platform"}},
+        ]
+        fake_resp = MagicMock()
+        fake_resp.json.return_value = payload
+        fake_resp.raise_for_status.return_value = None
+
+        with patch("evalmonkey.scenarios.private_benchmarks.httpx.get", return_value=fake_resp):
+            loaded = LangSmithLoader("dataset-abc123", api_key="ls__test-key").load(limit=10)
+
+        assert len(loaded) == 2
+        assert "LangChain" in loaded[0].input_payload["question"]
+        assert "framework" in loaded[0].expected_behavior_rubric.lower()
+
+    def test_raises_without_api_key(self):
+        """With no api_key argument and an emptied environment, construction raises ValueError."""
+        with patch.dict(os.environ, {}, clear=True):
+            with pytest.raises(ValueError, match="LANGSMITH_API_KEY"):
+                LangSmithLoader("dataset-id")
+
+
+# ---------------------------------------------------------------------------
+# GenericRESTLoader — mocked httpx
+# ---------------------------------------------------------------------------
+
+class TestGenericRESTLoader:
+    def test_load_from_generic_api(self):
+        mock_response = [
+            {"question": "How do I reset my password?", "ideal_answer": "Click forgot password."},
+            {"question": "What are your business hours?", "ideal_answer": "9am to 5pm."},
+        ]
+        mock_resp = MagicMock()
+        mock_resp.json.return_value = mock_response
+        mock_resp.raise_for_status.return_value = None
+
+        with patch("evalmonkey.scenarios.private_benchmarks.httpx.get", return_value=mock_resp):
+            loader = GenericRESTLoader(
+                url="https://my-api.example.com/v1/evals",
+                input_field="question",
+                expected_field="ideal_answer",
+                name="support-evals",
+            )
+            scenarios = loader.load(limit=10)
+
+        assert len(scenarios) == 2
+        assert "password" in scenarios[0].input_payload["question"]
+
+    def test_auth_header_env_substitution(self):
+        """Env var tokens in auth_header should be resolved from the environment."""
+        mock_resp = MagicMock()
+        mock_resp.json.return_value = []
+        mock_resp.raise_for_status.return_value = None
+
+        # Keep the variable set across both construction and load(): this test does not
+        # depend on which of the two steps performs the substitution.
+        with patch.dict(os.environ, {"MY_SECRET_KEY": "abc123"}):
+            with patch(
+                "evalmonkey.scenarios.private_benchmarks.httpx.get",
+                return_value=mock_resp,
+            ) as mock_get:
+                loader = GenericRESTLoader(
+                    url="https://api.example.com",
+                    auth_header="Authorization: Bearer {MY_SECRET_KEY}",
+                )
+                loader.load(limit=5)
+
+        # httpx.get takes headers keyword-only, so they can only appear in the call's kwargs.
+        headers = mock_get.call_args[1]["headers"]
+        assert "abc123" in str(headers)
+
+    def test_wrapped_response_formats(self):
+        """API may return {data: [...]} or {items: [...]} instead of a bare list."""
+        mock_response = {"data": [{"question": "Q1", "expected_answer": "A1"}]}
+        mock_resp = MagicMock()
+        mock_resp.json.return_value = mock_response
+        mock_resp.raise_for_status.return_value = None
+
+        with patch("evalmonkey.scenarios.private_benchmarks.httpx.get", return_value=mock_resp):
+            loader = GenericRESTLoader(url="https://api.example.com/v1/data")
+            scenarios = loader.load(limit=10)
+
+        assert len(scenarios) == 1
+
+
+# ---------------------------------------------------------------------------
+# load_private_benchmark routing
+# ---------------------------------------------------------------------------
+
+class TestLoadPrivateBenchmarkRouting:
+    def test_routes_hf_prefix(self):
+        """An hf:: id is answered by HuggingFaceLoader.load."""
+        stub = EvalScenario(id="hf-test-0", description="test", input_payload={"question": "Q"}, expected_behavior_rubric="R")
+        # load() is replaced on the class, so the router's own loader instance returns the stub.
+        with patch("evalmonkey.scenarios.private_benchmarks.HuggingFaceLoader.load", return_value=[stub]):
+            loaded = load_private_benchmark("hf::test/dataset", limit=1)
+        assert len(loaded) == 1
+
+    def test_routes_confident_ai_prefix(self):
+        """A confident-ai:: id is answered by ConfidentAILoader.load."""
+        stub = EvalScenario(id="conf-0", description="test", input_payload={"question": "Q"}, expected_behavior_rubric="R")
+        load_target = "evalmonkey.scenarios.private_benchmarks.ConfidentAILoader.load"
+        # A dummy key in the environment satisfies the constructor's API-key check.
+        with patch(load_target, return_value=[stub]), patch.dict(os.environ, {"CONFIDENT_AI_API_KEY": "conf-test"}):
+            loaded = load_private_benchmark("confident-ai::my-dataset", limit=1)
+        assert len(loaded) == 1
+
+    def test_routes_braintrust_prefix(self):
+        """A braintrust:: id is answered by BraintrustLoader.load (patched to return [])."""
+        load_target = "evalmonkey.scenarios.private_benchmarks.BraintrustLoader.load"
+        with patch(load_target, return_value=[]), patch.dict(os.environ, {"BRAINTRUST_API_KEY": "bt-test"}):
+            loaded = load_private_benchmark("braintrust::proj/ds", limit=1)
+        assert loaded == []
+
+    def test_routes_langsmith_prefix(self):
+        """A langsmith:: id is answered by LangSmithLoader.load (patched to return [])."""
+        load_target = "evalmonkey.scenarios.private_benchmarks.LangSmithLoader.load"
+        with patch(load_target, return_value=[]), patch.dict(os.environ, {"LANGSMITH_API_KEY": "ls-test"}):
+            loaded = load_private_benchmark("langsmith::abc123", limit=1)
+        assert loaded == []
+
+    def test_routes_generic_rest_from_config(self):
+        """An un-prefixed id found in private_benchmarks_config is fetched over (mocked) HTTP."""
+        entries = [{"id": "my-evals", "url": "https://api.example.com", "input_field": "q", "expected_field": "a"}]
+        fake_resp = MagicMock()
+        fake_resp.json.return_value = [{"q": "What?", "a": "This."}]
+        fake_resp.raise_for_status.return_value = None
+        with patch("evalmonkey.scenarios.private_benchmarks.httpx.get", return_value=fake_resp):
+            assert len(load_private_benchmark("my-evals", limit=5, private_benchmarks_config=entries)) == 1
+
+    def test_returns_empty_for_unknown_id_without_config(self):
+        """An un-prefixed id with no config to look it up in yields an empty list."""
+        assert load_private_benchmark("nonexistent-benchmark-xyz", limit=5) == []
+
+    def test_standard_benchmarks_routes_hf_prefix(self):
+        """Integration: load_standard_benchmark hands hf:: ids to load_private_benchmark unchanged."""
+        with patch("evalmonkey.scenarios.private_benchmarks.load_private_benchmark", return_value=[]) as fake_router:
+            from evalmonkey.scenarios.standard_benchmarks import load_standard_benchmark
+            # The delegate is imported at call time, so patching the module attribute is enough.
+            load_standard_benchmark("hf::test/my-dataset", limit=3)
+        fake_router.assert_called_once_with("hf::test/my-dataset", limit=3)
diff --git a/tests/test_regression_guard.py b/tests/test_regression_guard.py
new file mode 100644
index 0000000..87bd29a
--- /dev/null
+++ b/tests/test_regression_guard.py
@@ -0,0 +1,151 @@
+"""Tests for detect_regression() plus a smoke test for print_regression_warning()."""
+import json
+import os
+import tempfile
+from unittest.mock import patch
+
+import pytest
+
+from evalmonkey.reporting.history import detect_regression, record_run, get_history
+
+
+# ---------------------------------------------------------------------------
+# detect_regression unit tests
+# ---------------------------------------------------------------------------
+
+def _history_file_with(records: list, tmp_path):
+    """Serialise ``records`` to ``tmp_path/history.json`` and return that path as a string."""
+    target = tmp_path / "history.json"
+    target.write_text(json.dumps(records))
+    return str(target)
+
+
+class TestDetectRegression:
+    def _make_record(self, scenario, score, run_type="baseline", ts="2025-01-01T00:00:00"):
+        return dict(scenario=scenario, run_type=run_type, score=score, timestamp=ts)
+
+    def test_returns_none_when_only_one_baseline(self, tmp_path):
+        """A single baseline leaves nothing to compare against, so no regression is reported."""
+        fake_history = _history_file_with([self._make_record("gsm8k", 80)], tmp_path)
+        with patch("evalmonkey.reporting.history.HISTORY_FILE", fake_history):
+            verdict = detect_regression("gsm8k", 80, threshold=5)
+        assert verdict is None
+
+    def test_returns_none_when_no_history(self, tmp_path):
+        """An empty history file yields no regression."""
+        with patch("evalmonkey.reporting.history.HISTORY_FILE", _history_file_with([], tmp_path)):
+            verdict = detect_regression("gsm8k", 75, threshold=5)
+        assert verdict is None
+
+    def test_detects_regression_above_threshold(self, tmp_path):
+        """An 82 -> 60 drop with threshold 5 is reported with both scores and the drop size."""
+        rows = [
+            self._make_record("gsm8k", 82, ts="2025-01-01T00:00:00"),
+            self._make_record("gsm8k", 60, ts="2025-01-02T00:00:00"),  # current (already recorded)
+        ]
+        with patch("evalmonkey.reporting.history.HISTORY_FILE", _history_file_with(rows, tmp_path)):
+            verdict = detect_regression("gsm8k", 60, threshold=5)
+        assert verdict is not None
+        assert verdict["prev_score"] == 82
+        assert verdict["current_score"] == 60
+        assert verdict["drop"] == 22
+        assert verdict["scenario"] == "gsm8k"
+
+    def test_no_regression_when_drop_below_threshold(self, tmp_path):
+        """A 3-point drop stays under a threshold of 5."""
+        rows = [
+            self._make_record("gsm8k", 80, ts="2025-01-01T00:00:00"),
+            self._make_record("gsm8k", 77, ts="2025-01-02T00:00:00"),  # drop of 3, threshold 5
+        ]
+        with patch("evalmonkey.reporting.history.HISTORY_FILE", _history_file_with(rows, tmp_path)):
+            verdict = detect_regression("gsm8k", 77, threshold=5)
+        assert verdict is None
+
+    def test_no_regression_when_score_improved(self, tmp_path):
+        """A 70 -> 85 improvement is never a regression."""
+        rows = [
+            self._make_record("gsm8k", 70, ts="2025-01-01T00:00:00"),
+            self._make_record("gsm8k", 85, ts="2025-01-02T00:00:00"),
+        ]
+        with patch("evalmonkey.reporting.history.HISTORY_FILE", _history_file_with(rows, tmp_path)):
+            verdict = detect_regression("gsm8k", 85, threshold=5)
+        assert verdict is None
+
+    def test_regression_at_exact_threshold(self, tmp_path):
+        """The threshold is inclusive: a drop equal to it is reported."""
+        rows = [
+            self._make_record("mmlu", 75, ts="2025-01-01T00:00:00"),
+            self._make_record("mmlu", 70, ts="2025-01-02T00:00:00"),  # drop = 5 = threshold
+        ]
+
+        with patch("evalmonkey.reporting.history.HISTORY_FILE", _history_file_with(rows, tmp_path)):
+            verdict = detect_regression("mmlu", 70, threshold=5)
+        assert verdict is not None
+        assert verdict["drop"] == 5
+
+    def test_ignores_chaos_runs_when_comparing_baselines(self, tmp_path):
+        """A chaos run recorded between two baselines must not be used as the previous score."""
+        rows = [
+            self._make_record("gsm8k", 82, run_type="baseline", ts="2025-01-01T00:00:00"),
+            self._make_record("gsm8k", 45, run_type="chaos", ts="2025-01-01T12:00:00"),
+            self._make_record("gsm8k", 60, run_type="baseline", ts="2025-01-02T00:00:00"),
+        ]
+
+        with patch("evalmonkey.reporting.history.HISTORY_FILE", _history_file_with(rows, tmp_path)):
+            verdict = detect_regression("gsm8k", 60, threshold=5)
+        # Should compare baseline 82 vs 60, NOT chaos 45
+        assert verdict is not None
+        assert verdict["prev_score"] == 82
+        assert verdict["drop"] == 22
+
+    def test_scenario_isolation(self, tmp_path):
+        """A large gsm8k drop in the same file must not flag mmlu, which improved."""
+        rows = [
+            self._make_record("gsm8k", 90, ts="2025-01-01T00:00:00"),
+            self._make_record("gsm8k", 50, ts="2025-01-02T00:00:00"),
+            self._make_record("mmlu", 70, ts="2025-01-01T00:00:00"),
+            self._make_record("mmlu", 72, ts="2025-01-02T00:00:00"),
+        ]
+
+        with patch("evalmonkey.reporting.history.HISTORY_FILE", _history_file_with(rows, tmp_path)):
+            verdict = detect_regression("mmlu", 72, threshold=5)
+        assert verdict is None  # mmlu improved, should not regress
+
+    def test_custom_threshold_zero(self, tmp_path):
+        """With threshold 0, even a one-point drop is reported."""
+        rows = [
+            self._make_record("arc", 80, ts="2025-01-01T00:00:00"),
+            self._make_record("arc", 79, ts="2025-01-02T00:00:00"),
+        ]
+
+        with patch("evalmonkey.reporting.history.HISTORY_FILE", _history_file_with(rows, tmp_path)):
+            verdict = detect_regression("arc", 79, threshold=0)
+        assert verdict is not None
+        assert verdict["drop"] == 1
+
+    def test_with_many_baselines_compares_last_two(self, tmp_path):
+        """Given five baselines, only the newest (50) and the one before it (80) are compared."""
+        rows = [
+            self._make_record("truthfulqa", 60, ts="2025-01-01T00:00:00"),
+            self._make_record("truthfulqa", 65, ts="2025-01-02T00:00:00"),
+            self._make_record("truthfulqa", 70, ts="2025-01-03T00:00:00"),
+            self._make_record("truthfulqa", 80, ts="2025-01-04T00:00:00"),
+            self._make_record("truthfulqa", 50, ts="2025-01-05T00:00:00"),  # big drop
+        ]
+
+        with patch("evalmonkey.reporting.history.HISTORY_FILE", _history_file_with(rows, tmp_path)):
+            verdict = detect_regression("truthfulqa", 50, threshold=5)
+        assert verdict is not None
+        assert verdict["prev_score"] == 80
+        assert verdict["drop"] == 30
+
+
+# ---------------------------------------------------------------------------
+# print_regression_warning smoke test (just checks it doesn't raise)
+# ---------------------------------------------------------------------------
+
+class TestPrintRegressionWarning:
+    def test_no_exception_raised(self):
+        """Smoke test: rendering the warning for an 82 -> 60 drop must not raise."""
+        from evalmonkey.reporting import markdown
+        markdown.print_regression_warning("gsm8k", prev_score=82, curr_score=60, drop=22)
diff --git a/tests/test_report_generator.py b/tests/test_report_generator.py
new file mode 100644
index 0000000..b9dd30d
--- /dev/null
+++ b/tests/test_report_generator.py
@@ -0,0 +1,162 @@
+"""Tests for report_generator.py (Agent Card)."""
+import json
+import os
+import tempfile
+from unittest.mock import patch
+
+import pytest
+
+from evalmonkey.reporting.report_generator import generate_report, _badge_url, _badge_color
+
+
+# ---------------------------------------------------------------------------
+# Badge helpers
+# ---------------------------------------------------------------------------
+
+class TestBadgeHelpers:
+    def test_badge_color_green_above_80(self):
+        for score in (80, 100):
+            assert _badge_color(score) == "brightgreen"
+
+    def test_badge_color_yellow_60_to_79(self):
+        for score in (60, 79):
+            assert _badge_color(score) == "yellow"
+
+    def test_badge_color_red_below_60(self):
+        for score in (59, 0):
+            assert _badge_color(score) == "red"
+
+    def test_badge_url_contains_score(self):
+        badge = _badge_url(75)
+        # The score, the shields.io host and the product label must all be in the URL.
+        for fragment in ("75", "shields.io", "EvalMonkey"):
+            assert fragment in badge
+
+    def test_badge_url_has_correct_color(self):
+        """Each sampled score must surface its color name somewhere in the badge URL."""
+        # One sample per band: 85 -> brightgreen, 65 -> yellow, 40 -> red.
+        expected_color_by_score = {85: "brightgreen", 65: "yellow", 40: "red"}
+        for score, color in expected_color_by_score.items():
+            # Substring check only; the exact URL layout is not pinned here.
+            assert color in _badge_url(score)
+
+
+# ---------------------------------------------------------------------------
+# generate_report
+# ---------------------------------------------------------------------------
+
+def _make_history(records: list, tmp_path) -> str:
+    """Dump *records* as JSON to ``history.json`` under *tmp_path*; return that file's path."""
+    serialized = json.dumps(records)
+    (tmp_path / "history.json").write_text(serialized)
+    return str(tmp_path / "history.json")
+
+
+def _record(scenario, score, run_type="baseline"):
+    """Build one history entry; only scenario, score and run type vary between entries."""
+    entry = dict(scenario=scenario, run_type=run_type, score=score)
+    # The timestamp is the same for every fake entry.
+    entry["timestamp"] = "2025-01-01T09:00:00"
+    # Each call gets its own fresh, empty details dict.
+    entry["details"] = {}
+    return entry
+
+
+class TestGenerateReport:
+    """Checks on the report text and file produced by ``generate_report``.
+
+    Each test writes a fake history file under ``tmp_path``, patches
+    ``evalmonkey.reporting.history.HISTORY_FILE`` to point at it, and writes
+    the report to ``report.md`` in the same directory.
+    """
+
+    @staticmethod
+    def _render(tmp_path, records, **report_kwargs):
+        """Generate a report from *records* and return ``(content, output_path)``.
+
+        Args:
+            tmp_path: Directory that receives ``history.json`` and ``report.md``.
+            records: History entries serialized before the report is generated.
+            **report_kwargs: Extra keyword arguments passed to ``generate_report``.
+
+        Returns:
+            The value returned by ``generate_report`` and the report path (``str``).
+        """
+        history_path = _make_history(records, tmp_path)
+        output_path = str(tmp_path / "report.md")
+        with patch("evalmonkey.reporting.history.HISTORY_FILE", history_path):
+            content = generate_report(output_path=output_path, **report_kwargs)
+        return content, output_path
+
+    def test_creates_output_file(self, tmp_path):
+        """The report file must exist on disk after generation."""
+        _, output_path = self._render(tmp_path, [_record("gsm8k", 80)], agent_name="Test Agent")
+        assert os.path.exists(output_path)
+
+    def test_report_contains_agent_name(self, tmp_path):
+        """The ``agent_name`` argument must appear in the report text."""
+        content, _ = self._render(tmp_path, [_record("gsm8k", 80)], agent_name="My Research Bot")
+        assert "My Research Bot" in content
+
+    def test_report_contains_scenario_name(self, tmp_path):
+        """The scenario of a recorded run must appear in the report text."""
+        content, _ = self._render(tmp_path, [_record("gsm8k", 82)])
+        assert "gsm8k" in content
+
+    def test_report_contains_baseline_score(self, tmp_path):
+        """The recorded baseline score must appear in the report text."""
+        content, _ = self._render(tmp_path, [_record("gsm8k", 82)])
+        assert "82" in content
+
+    def test_report_contains_shields_badge(self, tmp_path):
+        """The report must reference shields.io and carry the EvalMonkey label."""
+        content, _ = self._render(tmp_path, [_record("gsm8k", 82)])
+        assert "shields.io" in content
+        assert "EvalMonkey" in content
+
+    def test_report_with_baseline_and_chaos(self, tmp_path):
+        """Both the baseline and the chaos score must appear in the report text."""
+        records = [
+            _record("gsm8k", 82, "baseline"),
+            _record("gsm8k", 65, "chaos"),
+        ]
+        content, _ = self._render(tmp_path, records)
+        assert "82" in content
+        assert "65" in content
+
+    def test_report_empty_history(self, tmp_path):
+        """An empty history must still yield a report, with a placeholder row."""
+        content, _ = self._render(tmp_path, [])
+        assert "no runs recorded" in content
+
+    def test_report_multiple_scenarios(self, tmp_path):
+        """Every scenario present in the history must be named in the report."""
+        baseline_scores = {"gsm8k": 82, "mmlu": 75, "arc": 90}
+        records = [
+            _record(scenario, score, "baseline")
+            for scenario, score in baseline_scores.items()
+        ]
+        content, _ = self._render(tmp_path, records)
+        for scenario in baseline_scores:
+            assert scenario in content
+
+    def test_report_includes_production_reliability_column(self, tmp_path):
+        """A baseline + chaos pair must produce the "Production Reliability" text."""
+        records = [
+            _record("gsm8k", 80, "baseline"),
+            _record("gsm8k", 60, "chaos"),
+        ]
+        content, _ = self._render(tmp_path, records)
+        assert "Production Reliability" in content
+
+    def test_report_includes_badge_markdown_snippet(self, tmp_path):
+        """The report must embed the copy-paste badge snippet in a markdown fence."""
+        content, _ = self._render(tmp_path, [_record("gsm8k", 80)])
+        assert "```markdown" in content
+        assert "[![EvalMonkey" in content
+
+    def test_report_returns_string_content(self, tmp_path):
+        """``generate_report`` must return a non-empty ``str``."""
+        result, _ = self._render(tmp_path, [_record("gsm8k", 80)])
+        assert isinstance(result, str)
+        assert len(result) > 0