From f932d98d89120de5fc5f73ef30e030b6cc2507d4 Mon Sep 17 00:00:00 2001 From: Dongbumlee Date: Tue, 7 Apr 2026 10:03:42 -0700 Subject: [PATCH 1/2] feat: extend Foundry cloud evaluator coverage to 22 built-in evaluators (#51) - Expand evaluator frozensets: add response_completeness, groundedness_pro, retrieval, tool_selection to existing sets - Add new frozensets: _EVALUATORS_NEEDING_TOOL_DEFS_ONLY (tool_input_accuracy, tool_output_utilization, tool_call_success), _EVALUATORS_NEEDING_OUTPUT_ITEMS (task_adherence) - Fix NLP evaluator names (bleu_score, rouge_score, etc.) to match _to_builtin_evaluator_name conversion - Add default initialization_parameters for RougeScoreEvaluator (rouge_type) - Build item_schema dynamically: include tool_definitions and context_field when evaluators need them - Refactor _default_foundry_input_mapping to frozenset-based routing - Improve error handling: log evaluator errors when score is null, improve runner error message with --verbose hint - Add CI/CD integration models documentation: PR gate, scheduled, post-deploy, multi-env promotion, Azure DevOps pipeline - Add gating best practices: threshold design, evaluator selection by scenario - Add supported evaluators reference table (22 evaluators by category) - Add ~20 unit tests for all new evaluator data_mapping patterns - All 22 evaluators verified end-to-end with live Foundry cloud evaluation Closes #51 --- CHANGELOG.md | 14 + docs/analysis-issue-51-cicd-field-insights.md | 445 +++++++++++++++++ docs/analysis-issue-51-two-track.md | 447 ++++++++++++++++++ docs/ci-github-actions.md | 343 ++++++++++++++ src/agentops/backends/foundry_backend.py | 130 ++++- src/agentops/services/runner.py | 13 +- tests/unit/test_foundry_backend.py | 118 +++++ 7 files changed, 1486 insertions(+), 24 deletions(-) create mode 100644 docs/analysis-issue-51-cicd-field-insights.md create mode 100644 docs/analysis-issue-51-two-track.md diff --git a/CHANGELOG.md b/CHANGELOG.md index 1a26980..16ba59f 100644 --- 
a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,20 @@ This format follows [Keep a Changelog](https://keepachangelog.com/) and adheres ## [Unreleased] ### Added +- Extend Foundry cloud evaluation to support 22 built-in evaluators (up from 8), covering quality, agent, safety, RAG, tool, and NLP evaluator categories. Verified end-to-end with live Foundry cloud evaluation. + - Quality: `CoherenceEvaluator`, `FluencyEvaluator`, `RelevanceEvaluator` + - Agent: `IntentResolutionEvaluator`, `TaskCompletionEvaluator`, `TaskAdherenceEvaluator` + - Similarity: `ResponseCompletenessEvaluator` + - RAG: `GroundednessProEvaluator`, `RetrievalEvaluator` + - Safety: `ViolenceEvaluator`, `SexualEvaluator`, `SelfHarmEvaluator`, `HateUnfairnessEvaluator` + - Tool: `ToolSelectionEvaluator`, `ToolInputAccuracyEvaluator`, `ToolOutputUtilizationEvaluator`, `ToolCallSuccessEvaluator` +- Add dynamic `item_schema` building — automatically includes `tool_definitions` and `context` fields when the enabled evaluators require them. +- Add CI/CD integration models documentation: PR quality gate, scheduled regression, post-deployment validation, multi-environment promotion, Azure DevOps pipeline. +- Add gating best practices: threshold design, scenario-specific evaluator selection, comparison-based regression detection. +- Add supported evaluators reference table to CI/CD documentation. +- Improve error messages when evaluators return no score (e.g. safety evaluators in unsupported regions) — surface the service error and suggest `--verbose`. +- Fix NLP evaluator names in frozensets to match `_to_builtin_evaluator_name` conversion (`bleu_score`, `rouge_score`, `gleu_score`, `meteor_score` instead of `bleu`, `rouge`, `gleu`, `meteor`). +- Add default `initialization_parameters` for `RougeScoreEvaluator` (`rouge_type: rouge1`). - Implement `agentops eval compare --runs ,` for baseline comparison of evaluation runs. 
- Produces `comparison.json` (structured metric deltas, threshold flips, item-level changes) and `comparison.md` (human-readable report). - Exits with code `0` (no regressions), `2` (regressions detected), or `1` (error). diff --git a/docs/analysis-issue-51-cicd-field-insights.md b/docs/analysis-issue-51-cicd-field-insights.md new file mode 100644 index 0000000..36e51e2 --- /dev/null +++ b/docs/analysis-issue-51-cicd-field-insights.md @@ -0,0 +1,445 @@ +# Issue #51 — Review CI/CD Based on Field Insights + +**Date:** 2026-04-03 +**Issue:** https://github.com/Azure/agentops/issues/51 +**Author:** placerda +**Reference repo:** https://github.com/hrprtkaur88/foundrycicdbasic + +--- + +## 1. Executive Summary + +This analysis evaluates how well AgentOps Toolkit serves as a CI/CD-ready +evaluation tool based on real-world pipeline patterns observed in Harpreet's +Foundry CI/CD reference repository. The goal is to identify what prevents teams +like Harpreet's from replacing their custom Python scripts with +`agentops eval run`, and what AgentOps must improve to be viable in real +CI/CD environments. + +**Key finding:** AgentOps has strong CI/CD foundations (exit codes, artifacts, +declarative config, generated workflow) but is missing critical evaluator +coverage and data-source patterns that real-world pipelines require. A team +using Harpreet's pipeline today cannot switch to AgentOps without losing +evaluator coverage. + +--- + +## 2. Task Analysis + +### Task 1: Review Harpreet repository and pipeline structure + +**What the repo is:** +A reference implementation showing how to create, test, evaluate, and red-team +Foundry agents using raw Python scripts orchestrated by CI/CD pipelines. 
+ +**Repository structure:** + +``` +foundrycicdbasic/ +├── createagent.py # Creates a Foundry agent via Agent Framework SDK +├── exagent.py # Smoke-tests an existing agent with a real query +├── agenteval.py # Runs cloud evaluation via OpenAI Evals API +├── agenteval_classic.py # Local evaluation fallback +├── redteam.py # Red-team safety evaluation +├── redteam_classic.py # Red-team local fallback +├── requirements.txt # Unpinned runtime dependencies +├── sample.env # Example environment variables +├── data_folder/ # Red-team taxonomy + output files +├── .github/workflows/ +│ ├── create-agent-multi-env.yml # GitHub Actions: deploy agent (dev→test→prod) +│ └── agent-consumption-multi-env.yml # GitHub Actions: test→eval→redteam (dev→test→prod) +├── cicd/ +│ ├── createagentpipeline.yml # Azure DevOps: deploy agent +│ └── agentconsumptionpipeline.yml # Azure DevOps: test→eval→redteam +└── cicd_patterns/ + └── foundry-cicd-workflow.pptx # Presentation on patterns +``` + +**Pipeline flow (agent-consumption-multi-env.yml):** + +``` +build (validate syntax) + → test-dev (exagent.py — smoke-test agent) + → evaluate-test (agenteval.py — cloud evaluation) + → red-team-test (redteam.py — safety evaluation) + → verify-prod (exagent.py — production verification) +``` + +**Key observations:** + +1. **All evaluation logic is imperative** — evaluator names, data mappings, + test data, and testing criteria are hardcoded in Python scripts. +2. **No thresholds or gating** — every eval/redteam step uses + `continue-on-error: true`. The pipeline never blocks on quality. +3. **Authentication uses service principal JSON blobs** — stored as + `AZURE_CREDENTIALS_*` secrets, not OIDC. +4. **Dual platform** — same pipelines exist for both GitHub Actions and + Azure DevOps (manually duplicated). +5. **Inline test data** — `agenteval.py` has query/response/tool_definitions + hardcoded in the script, not in external data files. 
+ +### Task 2: Identify evaluation patterns used in real scenarios + +The following evaluation patterns are used in Harpreet's pipeline. Each is +mapped to AgentOps support status. + +#### Pattern A: Agent smoke test (exagent.py) + +**What it does:** Retrieves an existing agent by name, sends a real query, +handles MCP approval requests, and prints the response with citations. + +**Purpose in CI/CD:** Validates the agent is alive and responsive before +running expensive evaluations. + +**AgentOps equivalent:** None. AgentOps has no "health check" or "smoke test" +concept. The `agentops eval run` command goes straight to evaluation. + +**Gap severity:** Low. This is a convenience — users can add a custom step +before `agentops eval run` in their pipeline. + +#### Pattern B: Cloud evaluation with inline data (agenteval.py) + +**What it does:** +1. Creates an OpenAI client from the Foundry project client +2. Defines `data_source_config` with `type: custom` and an item schema +3. Defines `testing_criteria` — a list of `azure_ai_evaluator` entries +4. Calls `client.evals.create()` to create an eval group +5. Calls `client.evals.runs.create()` with inline JSONL data +6. Polls until completion +7. 
Retrieves output items + +**Evaluators used:** + +| Category | Evaluator | Builtin name | AgentOps support | +|---|---|---|---| +| System | Task Completion | `builtin.task_completion` | **Not supported** | +| System | Task Adherence | `builtin.task_adherence` | **Not supported** | +| System | Intent Resolution | `builtin.intent_resolution` | **Not supported** | +| RAG | Groundedness | `builtin.groundedness` | Supported | +| RAG | Relevance | `builtin.relevance` | **Not supported** | +| Process | Tool Call Accuracy | `builtin.tool_call_accuracy` | Supported | +| Process | Tool Selection | `builtin.tool_selection` | **Not supported** | +| Process | Tool Input Accuracy | `builtin.tool_input_accuracy` | **Not supported** | +| Process | Tool Output Utilization | `builtin.tool_output_utilization` | **Not supported** | + +**Data format used:** +- `query`: array of message objects (system + user messages) +- `response`: array of message objects (assistant + tool_call + tool_result) +- `tool_definitions`: array of tool schemas +- `tool_calls`: null (derived from response) + +**AgentOps data format:** +- `input`: string (simple text field from JSONL) +- `expected`: string (simple text field from JSONL) +- `context`: optional string + +**Gap severity:** **Critical.** 7 of 9 evaluators used in the field are not +supported by AgentOps. The data format is also incompatible — Harpreet uses +conversation-format arrays while AgentOps expects simple string fields. + +#### Pattern C: Red-team / safety evaluation (redteam.py) + +**What it does:** +1. Creates an agent version via `project_client.agents.create_version()` +2. Defines safety testing criteria: + - `builtin.prohibited_actions` + - `builtin.task_adherence` + - `builtin.sensitive_data_leakage` + - `builtin.self_harm` + - `builtin.violence` + - `builtin.sexual` + - `builtin.hate_unfairness` +3. Creates evaluation taxonomy via `project_client.evaluation_taxonomies.create()` +4. 
Creates eval run with `data_source.type: azure_ai_red_team` +5. Uses `attack_strategies: ["Flip", "Base64"]` with generated adversarial inputs +6. Polls until completion, saves results to JSON + +**AgentOps equivalent:** None. AgentOps has no concept of: +- Red-team data sources (`azure_ai_red_team`) +- Safety evaluators (prohibited_actions, sensitive_data_leakage, violence, etc.) +- Attack strategies +- Evaluation taxonomies + +**Gap severity:** **High.** Red-team testing is a major field requirement. +However, this may be better addressed as a separate `agentops redteam` command +rather than extending `agentops eval run`, since the data source model is +fundamentally different (generated adversarial inputs vs. user-provided JSONL). + +#### Pattern D: Multi-environment sequential deployment + +**What it does:** Runs the same scripts across dev → test → prod environments, +with each stage depending on the previous. Production requires manual approval +via GitHub Environment protection rules. + +**AgentOps equivalent:** Not directly relevant to the AgentOps tool — this is +a pipeline orchestration pattern. AgentOps's `project_endpoint_env` config +already supports being called in different environments by varying the +endpoint secret. No tool change needed. + +**Gap severity:** None for the tool. Documentation gap only. + +#### Pattern E: Scheduled security scans + +**What it does:** Weekly cron trigger (`0 2 * * 1`) runs the full +test → eval → redteam pipeline on Monday mornings. + +**AgentOps equivalent:** Not relevant to the tool — this is a pipeline trigger +pattern. `agentops eval run` works fine when invoked by a cron job. + +**Gap severity:** None for the tool. Documentation gap only. 
+ +### Task 3: Define supported CI/CD integration models + +Based on field analysis, AgentOps should support these integration models: + +| Model | Description | Tool readiness | +|---|---|---| +| **PR gating** | `agentops eval run` in a PR workflow; exit code 2 blocks merge | **Ready** — implemented and documented | +| **Scheduled regression** | Cron-triggered eval run to detect drift | **Ready** — CLI works, needs documentation | +| **Post-deployment validation** | Run eval after deploying to an environment | **Ready** — CLI works, needs documentation | +| **Multi-config matrix** | Run multiple eval configs in parallel | **Ready** — documented with matrix strategy | +| **Advisory mode** | Run eval and report results without blocking | **Partially ready** — exit code 2 blocks; no `--no-fail` flag | + +### Task 4: Define best practices for gating deployments based on evaluations + +**What AgentOps provides today:** + +| Capability | Status | Evidence | +|---|---|---| +| Exit code contract (0/1/2) | Implemented | `cli/app.py` raises `typer.Exit(code=2)` on threshold failure | +| Declarative thresholds in YAML | Implemented | `bundles/*.yaml` with `thresholds[]` | +| Per-metric threshold criteria | Implemented | `>=`, `>`, `<=`, `<`, `==`, `true`/`false` in `thresholds.py` | +| Per-row threshold evaluation | Implemented | `runner.py` `_evaluate_item_thresholds()` | +| PR comment with report | Implemented | Workflow template posts/updates PR comment | +| Job summary | Implemented | Workflow writes to `$GITHUB_STEP_SUMMARY` | +| Artifacts on failure | Implemented | `if: always()` on artifact upload step | + +**What's missing for real-world gating:** + +| Gap | Impact | +|---|---| +| No `--no-fail` / `--advisory` flag | Teams can't run eval in "observe only" mode (like Harpreet's `continue-on-error`) | +| `agentops config validate` not implemented | Teams can't fail-fast on bad config before running expensive evaluations | +| No threshold on safety evaluators | Can't 
gate on red-team results since safety evaluators aren't supported | + +### Task 5: Identify gaps in current CLI for CI/CD usage + +| Gap | Category | Severity | Detail | +|---|---|---|---| +| Missing cloud evaluators | Evaluator coverage | **Critical** | 7 of 9 evaluators used in field are unsupported: `task_completion`, `task_adherence`, `intent_resolution`, `relevance`, `tool_selection`, `tool_input_accuracy`, `tool_output_utilization` | +| No conversation-format data | Data model | **High** | Field uses array-of-messages for query/response; AgentOps only supports simple string fields | +| No red-team support | Feature | **High** | No safety evaluators, no `azure_ai_red_team` data source, no attack strategies | +| No `--no-fail` flag | CLI | **Medium** | Can't run in advisory mode without `continue-on-error` in the pipeline YAML | +| `config validate` not implemented | CLI | **Medium** | Can't pre-validate configs in CI before running eval | +| `dataset validate` not implemented | CLI | **Medium** | Can't verify dataset integrity in CI | +| No Azure DevOps template | Documentation | **Low** | `agentops config cicd` only generates GitHub Actions; ADO users must write their own | + +--- + +## 3. 
Acceptance Criteria Assessment + +### AC 1: CI/CD integration patterns are clearly defined + +**Verdict: PARTIALLY MET** + +**What exists:** +- `docs/ci-github-actions.md` — comprehensive guide covering triggers, auth, + exit codes, artifacts, PR comments, job summary, troubleshooting +- Generated workflow template via `agentops config cicd` +- Matrix strategy documentation for multi-config runs +- Internal CI/CD workflows documented for contributors + +**What's missing:** +- No documentation for Azure DevOps integration +- No documentation for "advisory mode" (run without gating) +- No documentation for scheduled evaluation pattern +- The patterns are defined for the *simple case* (model-direct with similarity) + but not for the *real-world case* (agent evaluation with process/system + evaluators) + +**To close:** Document Azure DevOps integration pattern. Document advisory +mode. Ensure patterns cover agent evaluation scenarios, not just model-direct. + +### AC 2: Pipelines support evaluation as a gating mechanism + +**Verdict: MET (for supported evaluators)** + +**Evidence:** +- Exit code 0/1/2 contract is implemented and tested +- Workflow template uses `exit $EXIT_CODE` — non-zero fails the job +- Threshold evaluation supports multiple criteria operators +- Per-row and aggregate threshold evaluation is implemented +- CLI propagates exit code 2 via `raise typer.Exit(code=2)` + +**Caveat:** Gating only works for the evaluators AgentOps supports. Since most +field-used evaluators are unsupported, the gating mechanism exists but can't +be applied to the metrics teams actually care about (task_completion, +intent_resolution, etc.). 
+ +### AC 3: Exit codes are correctly interpreted in CI/CD + +**Verdict: MET** + +**Evidence:** +- Workflow template maps exit codes to step summary messages + (0 → pass, 2 → threshold fail, else → error) +- Exit code saved to `$GITHUB_OUTPUT` for downstream consumption +- `test_cicd.py` asserts `EXIT_CODE` and `exit $EXIT_CODE` are in template +- GitHub Actions natively fails on non-zero — no special handling needed +- Exit code semantics documented in `docs/ci-github-actions.md` + +### AC 4: Artifacts are generated and usable in pipeline context + +**Verdict: MET** + +**Evidence:** +- Workflow uploads 6 artifact files: `results.json`, `report.md`, + `backend_metrics.json`, `cloud_evaluation.json`, `backend.stdout.log`, + `backend.stderr.log` +- Upload uses `if: always()` — artifacts available even on failure +- `results.json` has versioned Pydantic schema — machine-readable +- `report.md` is human-readable and posted as PR comment +- `cloud_evaluation.json` includes `report_url` for Foundry portal deep-link +- `agentops report --in results.json` can regenerate reports from artifacts + +### AC 5: At least one reference pipeline is documented + +**Verdict: MET** + +**Evidence:** +- `docs/ci-github-actions.md` is a complete reference pipeline guide +- `agentops config cicd` generates a tested, ready-to-use workflow +- Template includes inline comments explaining every step +- Quick start, auth setup, customization, and troubleshooting covered + +### AC 6: Integration works with real-world scenarios + +**Verdict: NOT MET** + +**Evidence from field analysis:** + +Harpreet's pipeline represents a real-world scenario. To replace their +`agenteval.py` with `agentops eval run`, a user would need to: + +1. **Define evaluators in a bundle YAML** — but 7 of 9 evaluators they use + are not supported by AgentOps +2. 
**Provide test data in JSONL** — but the field uses conversation-format + arrays (query as message list, response as message list with tool calls), + while AgentOps expects simple string fields +3. **Get evaluation results** — AgentOps produces `results.json` and + `report.md`, which is better than Harpreet's raw stdout, but the results + won't contain the metrics teams need +4. **Gate on results** — AgentOps has threshold gating, which Harpreet's + pipeline lacks, but it can only gate on supported evaluators + +**What a user would need to do today to use AgentOps in Harpreet's pipeline:** + +```yaml +# What they want to write: +bundle: + evaluators: + - name: TaskCompletionEvaluator # ❌ not supported + - name: TaskAdherenceEvaluator # ❌ not supported + - name: IntentResolutionEvaluator # ❌ not supported + - name: GroundednessEvaluator # ✅ supported + - name: RelevanceEvaluator # ❌ not supported + - name: ToolCallAccuracyEvaluator # ✅ supported + - name: ToolSelectionEvaluator # ❌ not supported + +# What they can actually use today: +bundle: + evaluators: + - name: GroundednessEvaluator # ✅ + - name: ToolCallAccuracyEvaluator # ✅ + # ...that's it +``` + +**Blockers preventing real-world adoption:** + +| Blocker | Why it blocks | +|---|---| +| Missing evaluators | Teams can't measure what matters to them | +| String-only data format | Teams can't provide conversation-format test data | +| No red-team | Teams must maintain a separate `redteam.py` alongside AgentOps | + +--- + +## 4. 
Gap Prioritization for Closing the Issue + +### Priority 1 — Critical (blocks AC 6) + +| Item | What to do | Effort | +|---|---|---| +| Add system evaluators | Add `task_completion`, `task_adherence`, `intent_resolution` to `_cloud_evaluator_data_mapping` | Low — mapping only, no new API calls | +| Add RAG evaluator: relevance | Add `relevance` alongside existing `groundedness` | Low | +| Add process evaluators | Add `tool_selection`, `tool_input_accuracy`, `tool_output_utilization` to `_EVALUATORS_NEEDING_TOOL_CALLS` or a new set | Low-Medium — need to verify data_mapping for each | + +These evaluators all use the same `azure_ai_evaluator` type and +`builtin.<name>` pattern that AgentOps already supports. The gap is in the +`_cloud_evaluator_data_mapping` function, which doesn't know how to build +`data_mapping` for these evaluators. Each new evaluator needs: +- An entry in the appropriate frozenset (or a new one) +- The correct `data_mapping` fields (query, response, tool_calls, tool_definitions, etc.) 
+ +### Priority 2 — High (improves real-world viability) + +| Item | What to do | Effort | +|---|---|---| +| Conversation-format data support | Allow JSONL rows with array-of-messages for query/response fields | Medium — requires dataset format model changes | +| `--no-fail` / `--advisory` flag | Add CLI flag that makes exit code always 0 (report thresholds but don't gate) | Low | +| `config validate` command | Implement the planned command to pre-validate configs in CI | Medium | + +### Priority 3 — Medium (documentation) + +| Item | What to do | Effort | +|---|---|---| +| Azure DevOps integration pattern | Document how to use `agentops eval run` in an ADO pipeline | Low — docs only | +| Scheduled evaluation pattern | Document cron-triggered eval for drift detection | Low — docs only | +| Advisory mode pattern | Document how to run eval without gating (once `--no-fail` exists) | Low — docs only | +| Multi-environment pattern | Document how to use `project_endpoint_env` across environments | Low — docs only | + +### Priority 4 — Future (separate feature) + +| Item | What to do | Effort | +|---|---|---| +| Red-team support | New command or new data source type — fundamentally different flow | High — new feature | +| Safety evaluators | `prohibited_actions`, `sensitive_data_leakage`, `violence`, etc. | Medium — requires red-team data source | + +--- + +## 5. Recommendation + +**To close issue #51, focus on Priority 1 (missing evaluators).** This is the +single biggest blocker for real-world CI/CD adoption. The evaluators all follow +the same `azure_ai_evaluator` / `builtin.<name>` pattern that AgentOps already +implements — the gap is mechanical, not architectural. + +Adding 7 evaluators to `foundry_backend.py` would change the AC 6 verdict from +"NOT MET" to "PARTIALLY MET" (still missing conversation-format data and +red-team, but the core evaluation flow would work for the majority of +field-used evaluators). 
+ +Red-team support (Priority 4) should be tracked as a separate issue — it +requires a different data source model (`azure_ai_red_team` with attack +strategies and taxonomy generation) that doesn't fit the current +`agentops eval run` flow. + +--- + +## 6. Summary Scorecard + +| Acceptance Criterion | Verdict | +|---|---| +| AC 1: CI/CD integration patterns clearly defined | ⚠️ Partially met | +| AC 2: Pipelines support evaluation as gating mechanism | ✅ Met | +| AC 3: Exit codes correctly interpreted in CI/CD | ✅ Met | +| AC 4: Artifacts generated and usable in pipeline context | ✅ Met | +| AC 5: At least one reference pipeline documented | ✅ Met | +| AC 6: Integration works with real-world scenarios | ❌ Not met | + +**Overall: 4/6 met, 1/6 partially met, 1/6 not met.** + +The blocking gap is evaluator coverage. AgentOps has the right architecture +for CI/CD integration — declarative config, exit-code gating, artifact +production, generated workflows — but it cannot evaluate the metrics that +real-world Foundry agent pipelines need. 
diff --git a/docs/analysis-issue-51-two-track.md b/docs/analysis-issue-51-two-track.md new file mode 100644 index 0000000..b320c71 --- /dev/null +++ b/docs/analysis-issue-51-two-track.md @@ -0,0 +1,447 @@ +# Issue #51 — Two-Track Analysis + +**Date:** 2026-04-03 + +--- + +## Track 1: How to Fully Support Foundry Default Evaluators + +### Current Architecture + +The cloud evaluation path in `foundry_backend.py` builds evaluators like this: + +```python +builtin_name = _to_builtin_evaluator_name(evaluator.name) # "SimilarityEvaluator" → "similarity" +criterion = { + "type": "azure_ai_evaluator", + "name": evaluator.name, + "evaluator_name": f"builtin.{builtin_name}", + "data_mapping": _cloud_evaluator_data_mapping(builtin_name, input_field, expected_field, context_field), +} +if _cloud_evaluator_needs_model(builtin_name): + criterion["initialization_parameters"] = {"deployment_name": settings.model} +``` + +The `_cloud_evaluator_data_mapping` function routes evaluators to the correct +`data_mapping` based on frozenset membership: + +``` +default path → {"query": "{{item.X}}", "response": "{{sample.output_text}}"} +_NLP_ONLY_EVALUATORS → no "query", just "response" +_GROUND_TRUTH → adds "ground_truth": "{{item.Y}}" +_CONTEXT → adds "context": "{{item.Z}}" +_TOOL_CALLS → adds "tool_calls": "{{sample.tool_calls}}", "tool_definitions": "{{item.tool_definitions}}" +``` + +### Problem: Only 8 of ~35 evaluators are routed correctly + +Any evaluator NOT in any frozenset falls to the default path (`query` + `response`). +This accidentally works for some evaluators (like `coherence`) but silently sends +wrong data_mappings for many others. + +### What Each Evaluator Actually Needs + +Based on Foundry cloud evaluation docs (2026-04-02), here are the correct +`data_mapping` patterns for every built-in evaluator: + +#### Pattern 1: query + response (simplest — default path) + +Works with current default path. No code change needed. 
+ +| Evaluator | builtin name | Needs model | Status | +|---|---|---|---| +| CoherenceEvaluator | `coherence` | Yes | ✅ Works today (falls to default) | +| FluencyEvaluator | `fluency` | Yes | ✅ Works today | +| RelevanceEvaluator | `relevance` | Yes | ✅ Works today | +| IntentResolutionEvaluator | `intent_resolution` | Yes | ✅ Works today | +| TaskCompletionEvaluator | `task_completion` | Yes | ✅ Works today | +| ViolenceEvaluator | `violence` | Yes | ✅ Works today | +| SexualEvaluator | `sexual` | Yes | ✅ Works today | +| SelfHarmEvaluator | `self_harm` | Yes | ✅ Works today | +| HateUnfairnessEvaluator | `hate_unfairness` | Yes | ✅ Works today | +| ContentSafetyEvaluator | `content_safety` | Yes | ✅ Works today | +| ProtectedMaterialEvaluator | `protected_material` | Yes | ✅ Works today | +| CodeVulnerabilityEvaluator | `code_vulnerability` | Yes | ✅ Works today | +| UngroundedAttributesEvaluator | `ungrounded_attributes` | Yes | ✅ Works today | +| IndirectAttackEvaluator | `indirect_attack` | Yes | ✅ Works today | + +**Verdict:** These 14 evaluators already work with the current code — users +just don't know they can use them because they're not documented/tested. + +#### Pattern 2: query + response (output_items) — agent structured output + +`task_adherence` needs `{{sample.output_items}}` instead of +`{{sample.output_text}}` for the response field, because it needs to see the +full structured agent output (tool calls, intermediate steps). + +| Evaluator | builtin name | response field | Status | +|---|---|---|---| +| TaskAdherenceEvaluator | `task_adherence` | `{{sample.output_items}}` | ❌ **Broken** — sends `output_text` | + +**Fix required:** Add `task_adherence` to a new set +`_EVALUATORS_NEEDING_OUTPUT_ITEMS` and map `response` to +`{{sample.output_items}}` instead of `{{sample.output_text}}`. + +#### Pattern 3: response + ground_truth (existing) + +Already implemented via `_EVALUATORS_NEEDING_GROUND_TRUTH`. 
+ +| Evaluator | builtin name | Status | +|---|---|---| +| SimilarityEvaluator | `similarity` | ✅ Supported | +| ResponseCompletenessEvaluator | `response_completeness` | ❌ Missing from frozenset | + +**Fix required:** Add `response_completeness` to `_EVALUATORS_NEEDING_GROUND_TRUTH`. + +#### Pattern 4: NLP only — no query, no model (existing) + +Already implemented via `_NLP_ONLY_EVALUATORS`. + +| Evaluator | builtin name | Status | +|---|---|---| +| F1ScoreEvaluator | `f1_score` | ✅ Supported | +| BleuScoreEvaluator | `bleu_score` | ✅ Supported | +| GleuScoreEvaluator | `gleu_score` | ✅ Supported | +| RougeScoreEvaluator | `rouge_score` | ✅ Supported | +| MeteorScoreEvaluator | `meteor_score` | ✅ Supported | + +#### Pattern 5: response + context (existing) + +Already implemented via `_EVALUATORS_NEEDING_CONTEXT`. + +| Evaluator | builtin name | Status | +|---|---|---| +| GroundednessEvaluator | `groundedness` | ✅ Supported | +| GroundednessProEvaluator | `groundedness_pro` | ❌ Missing from frozenset | +| RetrievalEvaluator | `retrieval` | ❌ Missing from frozenset | + +**Fix required:** Add `groundedness_pro` and `retrieval` to +`_EVALUATORS_NEEDING_CONTEXT`. + +#### Pattern 6: tool evaluators (existing) + +Already implemented via `_EVALUATORS_NEEDING_TOOL_CALLS`. 
+ +| Evaluator | builtin name | data_mapping | Status | +|---|---|---|---| +| ToolCallAccuracyEvaluator | `tool_call_accuracy` | query, response, tool_calls, tool_definitions | ✅ Supported | +| ToolSelectionEvaluator | `tool_selection` | query, response, tool_calls, tool_definitions | ❌ Missing from frozenset | +| ToolInputAccuracyEvaluator | `tool_input_accuracy` | query, response, tool_definitions | ❌ Missing (needs tool_definitions but not tool_calls) | +| ToolOutputUtilizationEvaluator | `tool_output_utilization` | query, response, tool_definitions | ❌ Missing | +| ToolCallSuccessEvaluator | `tool_call_success` | response, tool_definitions | ❌ Missing | + +**Fix required:** +- Add `tool_selection` to `_EVALUATORS_NEEDING_TOOL_CALLS` +- For `tool_input_accuracy` and `tool_output_utilization`: need + `tool_definitions` but NOT `tool_calls` — need a new set + `_EVALUATORS_NEEDING_TOOL_DEFS_ONLY` +- For `tool_call_success`: needs `response` + `tool_definitions` only + +#### Pattern 7: Special — Graders + +Azure OpenAI graders use `type: "azure_openai_grader"` instead of +`type: "azure_ai_evaluator"`. These are a different testing criteria type. + +| Evaluator | Status | +|---|---| +| AzureOpenAILabelGrader | ❌ Not supported — different type | +| AzureOpenAIStringCheckGrader | ❌ Not supported — different type | +| AzureOpenAITextSimilarityGrader | ❌ Not supported — different type | +| AzureOpenAIGrader | ❌ Not supported — different type | + +**Out of scope for now.** Graders require a fundamentally different config +model (rubric templates, scoring criteria). Can be tracked separately. + +#### Pattern 8: Special — Red team + +Red team evaluators use a different data source type +(`azure_ai_red_team`) with attack strategies and taxonomy generation. + +| Evaluator | Status | +|---|---| +| ProhibitedActionsEvaluator | ❌ Different flow | +| SensitiveDataLeakageEvaluator | ❌ Different flow | + +**Out of scope for now.** Red team requires a separate execution flow. 
+ +### Summary: What Needs to Change in `foundry_backend.py` + +| Change | Affected evaluators | Effort | +|---|---|---| +| Add to `_EVALUATORS_NEEDING_GROUND_TRUTH` | `response_completeness` | 1 line | +| Add to `_EVALUATORS_NEEDING_CONTEXT` | `groundedness_pro`, `retrieval` | 1 line | +| Add to `_EVALUATORS_NEEDING_TOOL_CALLS` | `tool_selection` | 1 line | +| New set: `_EVALUATORS_NEEDING_TOOL_DEFS_ONLY` | `tool_input_accuracy`, `tool_output_utilization`, `tool_call_success` | ~10 lines | +| New set: `_EVALUATORS_NEEDING_OUTPUT_ITEMS` | `task_adherence` | ~5 lines | +| Document that default path works | `coherence`, `fluency`, `relevance`, `intent_resolution`, `task_completion`, all safety evaluators | 0 lines (docs only) | + +### Data Model Gap: item_schema + +The current code builds `item_schema` with only two string fields: + +```python +item_schema = { + "type": "object", + "properties": { + input_field: {"type": "string"}, + expected_field: {"type": "string"}, + }, + "required": [input_field, expected_field], +} +``` + +For tool evaluators to work, the schema must also declare `tool_definitions` +(and `tool_calls` if present in the dataset). The schema needs to be +dynamically built based on which evaluators are enabled. + +**Fix required:** When any evaluator in `_EVALUATORS_NEEDING_TOOL_CALLS` or +`_EVALUATORS_NEEDING_TOOL_DEFS_ONLY` is enabled, add `tool_definitions` to +`item_schema.properties`. Similarly, add `context_field` when context +evaluators are used. + +### Data Model Gap: DatasetFormat + +`DatasetFormat` currently has `input_field`, `expected_field`, and +`context_field`. 
It does NOT have: +- `tool_definitions_field` — needed for tool evaluators +- `tool_calls_field` — needed for `tool_call_accuracy`, `tool_selection` + +**Fix required:** Add optional fields to `DatasetFormat` model: + +```python +class DatasetFormat(BaseModel): + type: str + input_field: str + expected_field: str + context_field: Optional[str] = None + tool_definitions_field: Optional[str] = None # NEW + tool_calls_field: Optional[str] = None # NEW +``` + +### Revised Evaluator Support Count + +After the fixes above: + +| Category | Before | After | +|---|---|---| +| Works correctly today | 8 (NLP + similarity + groundedness + tool_call_accuracy) | 8 | +| Accidentally works (default path) | 0 recognized | 14 newly recognized | +| Fixed by adding to existing frozensets | 0 | 4 (response_completeness, groundedness_pro, retrieval, tool_selection) | +| Fixed by new sets | 0 | 4 (tool_input_accuracy, tool_output_utilization, tool_call_success, task_adherence) | +| **Total supported** | **8** | **30** | +| Remaining unsupported | | 5 (4 graders + documentation_retrieval) | + +--- + +## Track 2: Evaluation Patterns from Real Scenarios (Harpreet) + +### Pattern A: Cloud Agent Evaluation with Inline Data + +**Source:** `agenteval.py` + +**Flow:** +1. Connect to Foundry project via `AIProjectClient` +2. Get OpenAI client via `project_client.get_openai_client()` +3. Define `data_source_config` with `type: custom` and item_schema +4. Define `testing_criteria` — array of `azure_ai_evaluator` entries +5. Call `client.evals.create()` with testing_criteria +6. Call `client.evals.runs.create()` with inline JSONL data +7. Poll `client.evals.runs.retrieve()` until completed/failed +8.
Retrieve output items via `client.evals.runs.output_items.list()` + +**Data format used:** + +```python +data_source_config = { + "type": "custom", + "item_schema": { + "type": "object", + "properties": { + "query": {"anyOf": [{"type": "string"}, {"type": "array"}]}, + "tool_definitions": {"anyOf": [{"type": "object"}, {"type": "array"}]}, + "tool_calls": {"anyOf": [{"type": "object"}, {"type": "array"}]}, + "response": {"anyOf": [{"type": "string"}, {"type": "array"}]}, + }, + "required": ["query", "response", "tool_definitions"], + }, + "include_sample_schema": True, +} +``` + +**Key observation:** The field types use `anyOf` with string OR array. This +allows both simple string queries AND structured conversation-format arrays. +AgentOps hardcodes `{"type": "string"}` — this works for simple eval but +blocks conversation-format data. + +**Evaluators used (9 total):** + +| # | Name | Category | data_mapping | +|---|---|---|---| +| 1 | task_completion | System | query, response, tool_definitions | +| 2 | task_adherence | System | query, response, tool_definitions | +| 3 | intent_resolution | System | query, response, tool_definitions | +| 4 | groundedness | RAG | query, tool_definitions, response | +| 5 | relevance | RAG | query, response | +| 6 | tool_call_accuracy | Process | query, tool_definitions, tool_calls, response | +| 7 | tool_selection | Process | query, response, tool_calls, tool_definitions | +| 8 | tool_input_accuracy | Process | query, response, tool_definitions | +| 9 | tool_output_utilization | Process | query, response, tool_definitions | + +**AgentOps compatibility after Track 1 fixes:** 9/9 evaluators would be +supported. The remaining gap is the `item_schema` format — Harpreet uses +`anyOf` types while AgentOps hardcodes `string`. + +### Pattern B: Red Team Safety Evaluation + +**Source:** `redteam.py` + +**Flow:** +1. Connect to Foundry project client +2. Create an agent version via `project_client.agents.create_version()` +3. 
Define safety testing criteria (7 evaluators) +4. Create evaluation taxonomy via `project_client.evaluation_taxonomies.create()` +5. Create eval run with `data_source.type: azure_ai_red_team` +6. Uses generated adversarial inputs with attack strategies `["Flip", "Base64"]` +7. Poll until completion, save results to JSON + +**Data source:** `azure_ai_red_team` — fundamentally different from the +`custom`/`completions`/`azure_ai_target_completions` data sources that +AgentOps supports. + +**Safety evaluators used (7 total):** + +| # | Name | builtin name | +|---|---|---| +| 1 | Prohibited Actions | `builtin.prohibited_actions` | +| 2 | Task Adherence | `builtin.task_adherence` | +| 3 | Sensitive Data Leakage | `builtin.sensitive_data_leakage` | +| 4 | Self Harm | `builtin.self_harm` | +| 5 | Violence | `builtin.violence` | +| 6 | Sexual | `builtin.sexual` | +| 7 | Hate Unfairness | `builtin.hate_unfairness` | + +**Key observations:** +- Safety evaluators like `violence`, `self_harm`, `sexual`, `hate_unfairness` + CAN be used in normal cloud evaluation (Pattern A) with `query + response` + data mapping — they don't REQUIRE the red team data source. +- `prohibited_actions` and `sensitive_data_leakage` are red-team-specific. +- `task_adherence` is reused across both patterns. + +**AgentOps compatibility:** The safety evaluators (items 4-7) would work in +normal eval after Track 1 (they use the default `query + response` pattern). +The red-team flow itself (attack strategies, taxonomy generation) is a +separate feature. + +### Pattern C: Agent Smoke Test + +**Source:** `exagent.py` + +**Flow:** +1. Connect to Foundry project client +2. Get existing agent by name via `project_client.agents.get()` +3. Get OpenAI client via `project_client.get_openai_client()` +4. Send a query via `openai_client.responses.create()` with agent reference +5. Handle MCP approval requests (auto-approve) +6. Poll for response completion +7. 
Display response text and citations + +**AgentOps compatibility:** Not relevant to evaluation. This is a +pre-evaluation health check. Users can add this as a custom pipeline step +before `agentops eval run`. No tool change needed. + +### Pattern D: Data Format — Conversation vs. String + +**The critical data model difference:** + +Harpreet's `agenteval.py` provides data in **conversation format**: + +```python +query = [ + {"role": "system", "content": "You are a weather report agent."}, + {"role": "user", "content": [{"type": "text", "text": "Can you send me..."}]}, +] + +response = [ + {"role": "assistant", "content": [{"type": "tool_call", "name": "fetch_weather", ...}]}, + {"role": "tool", "content": [{"type": "tool_result", ...}]}, + {"role": "assistant", "content": [{"type": "text", "text": "I have successfully..."}]}, +] + +tool_definitions = [ + {"name": "fetch_weather", "description": "...", "parameters": {...}}, + {"name": "send_email", "description": "...", "parameters": {...}}, +] +``` + +AgentOps datasets use **simple string format**: + +```jsonl +{"input": "What is the weather?", "expected": "Sunny, 25°C"} +``` + +**When does this matter?** + +- **For model-direct evaluation:** Simple strings work fine. The model receives + the query and generates a response — evaluators compare output_text. +- **For agent evaluation with tool calls:** The conversation format is needed + when evaluating tool-using agents on pre-computed responses. But when using + `azure_ai_target_completions` with a live agent target, the agent generates + structured responses at runtime — so simple string queries work. +- **For dataset (offline) evaluation:** If users want to evaluate + pre-computed agent conversations (not calling the agent at runtime), + they need conversation-format JSONL rows. + +**Impact on AgentOps:** + +The current `item_schema` hardcodes `{"type": "string"}`. This blocks: +1. Dataset evaluation with pre-computed structured responses +2. 
Tool evaluators that need `tool_definitions` in the dataset rows + +It does NOT block: +1. Live agent evaluation (agent generates structured output at runtime) +2. Live model evaluation (model generates text at runtime) + +**Fix:** Make `item_schema.properties` type flexible — use `anyOf` when the +evaluator requires structured data, or infer from JSONL row content. + +--- + +## Synthesis: Combined Gap Map + +| # | Gap | Track | Severity | Fix | +|---|---|---|---|---| +| 1 | 14 evaluators work but aren't documented | Track 1 | Low | Document and add tests | +| 2 | `response_completeness` missing from ground_truth set | Track 1 | Low | 1 line | +| 3 | `groundedness_pro`, `retrieval` missing from context set | Track 1 | Low | 1 line | +| 4 | `tool_selection` missing from tool_calls set | Track 1 | Low | 1 line | +| 5 | `tool_input_accuracy`, `tool_output_utilization`, `tool_call_success` need new set | Track 1 | Medium | ~10 lines | +| 6 | `task_adherence` needs `{{sample.output_items}}` response mapping | Track 1 | Medium | ~5 lines | +| 7 | `item_schema` hardcodes `{"type": "string"}` | Track 1+2 | High | Dynamic schema building | +| 8 | `DatasetFormat` lacks `tool_definitions_field` | Track 1+2 | High | Model change + wire through | +| 9 | `item_schema` doesn't include context_field | Track 1 | Medium | Dynamic schema building | +| 10 | Red team flow not supported | Track 2 | Future | Separate feature | +| 11 | Graders not supported | Track 1 | Future | Different testing_criteria type | + +### Recommended Implementation Order + +**Phase 1 — Quick wins (unblock 14 more evaluators):** +- Add evaluators to existing frozensets (#2, #3, #4) +- Create new frozensets (#5, #6) +- Update `_cloud_evaluator_data_mapping` for new patterns +- Add unit tests +- Update evaluator reference doc + +**Phase 2 — Schema flexibility (unblock tool evaluators with dataset data):** +- Add `tool_definitions_field` and `tool_calls_field` to `DatasetFormat` +- Build `item_schema` dynamically 
based on enabled evaluators +- Add `context_field` to `item_schema` when context evaluators are used +- Use `anyOf` types when field content may be structured + +**Phase 3 — Documentation (confirm patterns work end-to-end):** +- Document which evaluators work for each scenario +- Add bundle examples for agent evaluation with tool evaluators +- Document conversation-format dataset rows + +**Phase 4 — Future:** +- Red team data source support +- Azure OpenAI grader support diff --git a/docs/ci-github-actions.md b/docs/ci-github-actions.md index 48e5fc6..e368c74 100644 --- a/docs/ci-github-actions.md +++ b/docs/ci-github-actions.md @@ -241,6 +241,347 @@ Remove or comment out the "Post report as PR comment" step in the workflow. --- +## CI/CD Integration Models + +AgentOps supports several integration models depending on your team's workflow. Choose the one that fits your CI/CD strategy. + +### PR Quality Gate (default) + +Run evaluations on every pull request. The evaluation result gates whether the PR can merge. + +``` +PR opened → agentops eval run → exit code 0 → merge allowed + exit code 2 → merge blocked (thresholds failed) +``` + +This is what the generated workflow template provides out of the box. Use this when evaluation quality should directly block code changes. + +**When to use:** Teams that want to prevent quality regressions before merging. + +### Scheduled Regression Detection + +Run evaluations on a schedule (nightly, weekly) to detect model or agent degradation over time without blocking PRs. + +Add a `schedule` trigger to the workflow: + +```yaml +on: + schedule: + - cron: '0 2 * * 1' # Every Monday at 2 AM UTC + workflow_dispatch: +``` + +Combine with `agentops eval compare --runs latest,previous` to detect regressions across runs. + +**When to use:** Teams that need ongoing quality monitoring independent of code changes (e.g. model deployment changes, data drift). 
+ +### Post-Deployment Validation + +Run evaluations after deploying to an environment to verify the deployed agent or model meets quality standards. + +```yaml +jobs: + deploy: + runs-on: ubuntu-latest + steps: + - name: Deploy agent + run: az ai agent deploy ... + + validate: + needs: deploy + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: azure/login@v2 + with: + client-id: ${{ vars.AZURE_CLIENT_ID }} + tenant-id: ${{ vars.AZURE_TENANT_ID }} + subscription-id: ${{ vars.AZURE_SUBSCRIPTION_ID }} + - run: pip install agentops-toolkit + - run: agentops eval run --config .agentops/run.yaml +``` + +**When to use:** Teams that deploy agents independently and want to verify quality post-deployment. + +### Multi-Environment Promotion + +Run evaluations across environments (dev → test → prod) using the same evaluation config but different Foundry project endpoints. Each environment uses GitHub Environment protection rules. + +```yaml +jobs: + eval-dev: + environment: dev + env: + AZURE_AI_FOUNDRY_PROJECT_ENDPOINT: ${{ secrets.AZURE_AI_FOUNDRY_PROJECT_ENDPOINT }} + steps: + - run: agentops eval run + + eval-test: + needs: eval-dev + environment: test + env: + AZURE_AI_FOUNDRY_PROJECT_ENDPOINT: ${{ secrets.AZURE_AI_FOUNDRY_PROJECT_ENDPOINT }} + steps: + - run: agentops eval run + + eval-prod: + needs: eval-test + environment: production # requires approval + env: + AZURE_AI_FOUNDRY_PROJECT_ENDPOINT: ${{ secrets.AZURE_AI_FOUNDRY_PROJECT_ENDPOINT }} + steps: + - run: agentops eval run +``` + +The key principle: **the evaluation policy is environment-invariant**. The same `run.yaml`, bundle, and thresholds evaluate the same agent across environments. Only `AZURE_AI_FOUNDRY_PROJECT_ENDPOINT` changes — set as a per-environment secret via GitHub Environments. + +The `needs:` dependency ensures each stage only runs if the previous one passes (exit code 0). GitHub Environment protection rules can require manual approval for production. 
+ +**When to use:** Enterprise teams with dev/test/prod environments that need sequential validation before production. + +### Multi-Config Matrix + +Run several evaluation configs in parallel (already documented above in [Running multiple evaluations](#running-multiple-evaluations)). + +**When to use:** Teams that run different bundles (model-direct, RAG, agent tools) in a single pipeline. + +### Azure DevOps Pipelines + +AgentOps works in Azure DevOps pipelines the same way — the CLI exit codes and artifacts are CI-system-agnostic. Here is a minimal Azure DevOps pipeline: + +```yaml +trigger: + branches: + include: + - main + - develop + +pool: + vmImage: 'ubuntu-latest' + +variables: + - group: agentops-vars # contains AZURE_AI_FOUNDRY_PROJECT_ENDPOINT + +steps: + - task: UsePythonVersion@0 + inputs: + versionSpec: '3.11' + + - task: AzureCLI@2 + displayName: 'Run AgentOps Evaluation' + inputs: + azureSubscription: 'your-service-connection' + scriptType: 'bash' + scriptLocation: 'inlineScript' + inlineScript: | + pip install agentops-toolkit + agentops eval run --config .agentops/run.yaml + EXIT_CODE=$? + if [ $EXIT_CODE -eq 0 ]; then + echo "##[section]Evaluation Passed" + elif [ $EXIT_CODE -eq 2 ]; then + echo "##[error]Evaluation Failed — Threshold(s) Not Met" + else + echo "##[error]Evaluation Error (exit code $EXIT_CODE)" + fi + exit $EXIT_CODE + + - task: PublishBuildArtifacts@1 + displayName: 'Upload evaluation results' + condition: always() + inputs: + PathtoPublish: '.agentops/results/latest' + ArtifactName: 'agentops-eval-results' +``` + +Key differences from GitHub Actions: +- Use `AzureCLI@2` task for authentication (service connection). +- Use `PublishBuildArtifacts@1` for artifact upload. +- Use ADO variable groups for secrets. +- Exit codes are interpreted the same way — ADO fails the task on non-zero. 
+ +--- + +## Best Practices for Gating Deployments + +### Design thresholds for your scenario + +Set thresholds based on your evaluation scenario and risk tolerance: + +```yaml +# Model-direct: text quality matters +thresholds: + - evaluator: CoherenceEvaluator + criteria: ">=" + value: 4 # High bar for coherence + - evaluator: SimilarityEvaluator + criteria: ">=" + value: 3 # Moderate similarity to expected answers + +# Agent with tools: functional correctness matters +thresholds: + - evaluator: TaskCompletionEvaluator + criteria: ">=" + value: 3 + - evaluator: ToolCallAccuracyEvaluator + criteria: ">=" + value: 3 + - evaluator: IntentResolutionEvaluator + criteria: ">=" + value: 4 + +# Safety-critical: zero tolerance +thresholds: + - evaluator: ViolenceEvaluator + criteria: "<=" + value: 0 # Must be zero + - evaluator: SelfHarmEvaluator + criteria: "<=" + value: 0 +``` + +### Use per-row thresholds for consistency + +AgentOps evaluates thresholds per-row, not just on averages. A single failing row fails the evaluation — this catches outlier regressions that averages would hide. + +### Start lenient, tighten over time + +Begin with low thresholds to establish a baseline, then raise them as your agent improves: + +1. First run: set thresholds low (`>= 1`) to establish passing baseline +2. Review `report.md` scores to understand typical ranges +3. Raise thresholds to just below the current average +4. 
Iterate as the agent improves + +### Combine quality and safety evaluators + +Run both in a single bundle so a single pipeline stage covers all dimensions: + +```yaml +evaluators: + # Quality + - name: CoherenceEvaluator + source: foundry + enabled: true + - name: RelevanceEvaluator + source: foundry + enabled: true + # Safety + - name: ViolenceEvaluator + source: foundry + enabled: true + - name: HateUnfairnessEvaluator + source: foundry + enabled: true +``` + +### Use comparison for regression detection + +After each evaluation, compare against a known-good baseline: + +```bash +agentops eval run --config .agentops/run.yaml +agentops eval compare --runs latest,2026-03-15_120000 +``` + +Exit code `2` from compare means regressions were detected. + +### Choose the right evaluators for your scenario + +AgentOps supports the Foundry built-in evaluators listed under [Supported Evaluators](#supported-evaluators). Select the ones that match your scenario: + +| Scenario | Recommended evaluators | +| --- | --- | +| Model-direct (text generation) | CoherenceEvaluator, FluencyEvaluator, SimilarityEvaluator, RelevanceEvaluator | +| RAG (retrieval-augmented) | GroundednessEvaluator, RelevanceEvaluator, ResponseCompletenessEvaluator | +| Agent with tools | TaskCompletionEvaluator, TaskAdherenceEvaluator, IntentResolutionEvaluator, ToolCallAccuracyEvaluator, ToolSelectionEvaluator | +| Safety-critical | ViolenceEvaluator, SexualEvaluator, SelfHarmEvaluator, HateUnfairnessEvaluator | +| Text similarity (NLP) | F1ScoreEvaluator, BleuScoreEvaluator, RougeScoreEvaluator, MeteorScoreEvaluator | + +### Keep evaluation config in Git + +All evaluation policy — bundles, datasets, thresholds — should be committed to the repository. This ensures: + +- Evaluation changes are PR-reviewable YAML diffs +- Every evaluation is reproducible from a git commit +- No configuration drift between environments + +--- + +## Supported Evaluators + +AgentOps supports the following Foundry built-in evaluators in cloud evaluation mode.
All evaluators use the `azure_ai_evaluator` testing criteria type with `builtin.` designator. + +### Quality Evaluators + +| Evaluator | `builtin.` name | Inputs | Needs model | +| --- | --- | --- | --- | +| CoherenceEvaluator | `coherence` | query, response | Yes | +| FluencyEvaluator | `fluency` | query, response | Yes | +| RelevanceEvaluator | `relevance` | query, response | Yes | + +### Agent Evaluators + +| Evaluator | `builtin.` name | Inputs | Needs model | +| --- | --- | --- | --- | +| IntentResolutionEvaluator | `intent_resolution` | query, response | Yes | +| TaskCompletionEvaluator | `task_completion` | query, response | Yes | +| TaskAdherenceEvaluator | `task_adherence` | query, response (output_items) | Yes | + +### Similarity / Ground Truth Evaluators + +| Evaluator | `builtin.` name | Inputs | Needs model | +| --- | --- | --- | --- | +| SimilarityEvaluator | `similarity` | query, response, ground_truth | Yes | +| ResponseCompletenessEvaluator | `response_completeness` | query, response, ground_truth | Yes | + +### RAG / Context Evaluators + +| Evaluator | `builtin.` name | Inputs | Needs model | +| --- | --- | --- | --- | +| GroundednessEvaluator | `groundedness` | query, response, context | Yes | +| GroundednessProEvaluator | `groundedness_pro` | query, response, context | Yes | +| RetrievalEvaluator | `retrieval` | query, response, context | Yes | + +RAG evaluators use the `context_field` from your dataset format config. If not set, they fall back to `expected_field`. 
+ +### Safety Evaluators + +| Evaluator | `builtin.` name | Inputs | Needs model | +| --- | --- | --- | --- | +| ViolenceEvaluator | `violence` | query, response | Yes | +| SexualEvaluator | `sexual` | query, response | Yes | +| SelfHarmEvaluator | `self_harm` | query, response | Yes | +| HateUnfairnessEvaluator | `hate_unfairness` | query, response | Yes | + +Safety evaluators require a Foundry project in a [region that supports content safety](https://learn.microsoft.com/en-us/azure/ai-foundry/concepts/evaluation-evaluators/risk-safety-evaluators#foundry-project-configuration-and-region-support). If your region does not support them, the evaluators will return errors — run with `--verbose` to see details. + +### Tool Evaluators + +| Evaluator | `builtin.` name | Inputs | Needs model | +| --- | --- | --- | --- | +| ToolCallAccuracyEvaluator | `tool_call_accuracy` | query, response, tool_calls, tool_definitions | Yes | +| ToolSelectionEvaluator | `tool_selection` | query, response, tool_calls, tool_definitions | Yes | +| ToolInputAccuracyEvaluator | `tool_input_accuracy` | query, response, tool_definitions | Yes | +| ToolOutputUtilizationEvaluator | `tool_output_utilization` | query, response, tool_definitions | Yes | +| ToolCallSuccessEvaluator | `tool_call_success` | response, tool_definitions | Yes | + +Tool evaluators require `tool_definitions` in your JSONL dataset rows. For evaluators that also need `tool_calls`, the agent's runtime tool call output is used automatically via `{{sample.tool_calls}}`. 
+ +### NLP Evaluators (Non-LLM) + +| Evaluator | `builtin.` name | Inputs | Needs model | +| --- | --- | --- | --- | +| F1ScoreEvaluator | `f1_score` | response, ground_truth | No | +| BleuScoreEvaluator | `bleu_score` | response, ground_truth | No | +| GleuScoreEvaluator | `gleu_score` | response, ground_truth | No | +| RougeScoreEvaluator | `rouge_score` | response, ground_truth | No | +| MeteorScoreEvaluator | `meteor_score` | response, ground_truth | No | + +NLP evaluators compare the generated response against `ground_truth` (the `expected_field` in your dataset) using text-matching algorithms. They do not require a model deployment. + +--- + ## Troubleshooting | Problem | Solution | @@ -250,6 +591,8 @@ Remove or comment out the "Post report as PR comment" step in the workflow. | Missing artifacts | Ensure `.agentops/results/latest/` is not in `.gitignore` — the workflow reads this path | | Authentication errors | Verify the federated credential entity matches your repo/branch; check that `AZURE_CLIENT_ID`, `AZURE_TENANT_ID`, `AZURE_SUBSCRIPTION_ID` are set as repository variables; confirm the app registration has access to the Foundry project | | `agentops: command not found` | Ensure `pip install agentops-toolkit` runs before the eval step | +| Safety evaluators return no scores | Your Foundry project must be in a [region that supports content safety](https://learn.microsoft.com/en-us/azure/ai-foundry/concepts/evaluation-evaluators/risk-safety-evaluators#foundry-project-configuration-and-region-support). Run with `--verbose` to see the specific error from the service. | +| `Missing scores for enabled evaluators` | One or more evaluators returned no score. Run with `--verbose` to see per-evaluator error messages. Common causes: region restrictions (safety), missing `tool_definitions` in dataset (tool evaluators), or unsupported evaluator name. 
| --- diff --git a/src/agentops/backends/foundry_backend.py b/src/agentops/backends/foundry_backend.py index e64e374..3850bea 100644 --- a/src/agentops/backends/foundry_backend.py +++ b/src/agentops/backends/foundry_backend.py @@ -111,33 +111,51 @@ def _parse_agent_name_version(agent_id: str) -> tuple[str, str | None]: _NLP_ONLY_EVALUATORS = frozenset( { "f1_score", - "bleu", - "rouge", - "meteor", - "gleu", + "bleu_score", + "rouge_score", + "meteor_score", + "gleu_score", } ) _EVALUATORS_NEEDING_GROUND_TRUTH = frozenset( { "similarity", + "response_completeness", "f1_score", - "bleu", - "rouge", - "meteor", - "gleu", + "bleu_score", + "rouge_score", + "meteor_score", + "gleu_score", } ) _EVALUATORS_NEEDING_CONTEXT = frozenset( { "groundedness", + "groundedness_pro", + "retrieval", } ) _EVALUATORS_NEEDING_TOOL_CALLS = frozenset( { "tool_call_accuracy", + "tool_selection", + } +) + +_EVALUATORS_NEEDING_TOOL_DEFS_ONLY = frozenset( + { + "tool_input_accuracy", + "tool_output_utilization", + "tool_call_success", + } +) + +_EVALUATORS_NEEDING_OUTPUT_ITEMS = frozenset( + { + "task_adherence", } ) @@ -156,7 +174,10 @@ def _cloud_evaluator_data_mapping( mapping: Dict[str, str] = {} if builtin_name not in _NLP_ONLY_EVALUATORS: mapping["query"] = item_input - mapping["response"] = sample_response + if builtin_name in _EVALUATORS_NEEDING_OUTPUT_ITEMS: + mapping["response"] = "{{sample.output_items}}" + else: + mapping["response"] = sample_response if builtin_name in _EVALUATORS_NEEDING_GROUND_TRUTH: mapping["ground_truth"] = item_expected elif builtin_name in _EVALUATORS_NEEDING_CONTEXT: @@ -167,6 +188,8 @@ def _cloud_evaluator_data_mapping( elif builtin_name in _EVALUATORS_NEEDING_TOOL_CALLS: mapping["tool_calls"] = "{{sample.tool_calls}}" mapping["tool_definitions"] = "{{item.tool_definitions}}" + elif builtin_name in _EVALUATORS_NEEDING_TOOL_DEFS_ONLY: + mapping["tool_definitions"] = "{{item.tool_definitions}}" return mapping @@ -175,6 +198,13 @@ def 
_cloud_evaluator_needs_model(builtin_name: str) -> bool: return builtin_name not in _NLP_ONLY_EVALUATORS +# Default initialization_parameters for evaluators that require them but are +# not AI-assisted (so they don't get deployment_name automatically). +_NLP_DEFAULT_INIT_PARAMS: Dict[str, Dict[str, Any]] = { + "rouge_score": {"rouge_type": "rouge1"}, +} + + # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- @@ -407,13 +437,14 @@ def _to_snake_case(value: str) -> str: def _default_foundry_input_mapping(name: str) -> Dict[str, str]: - if name == "SimilarityEvaluator": + builtin = _to_builtin_evaluator_name(name) + if builtin in _EVALUATORS_NEEDING_GROUND_TRUTH: return { "query": "$prompt", "response": "$prediction", "ground_truth": "$expected", } - if name == "GroundednessEvaluator": + if builtin in _EVALUATORS_NEEDING_CONTEXT: return { "query": "$prompt", "response": "$prediction", @@ -422,19 +453,35 @@ def _default_foundry_input_mapping(name: str) -> Dict[str, str]: # if your dataset column has a different name. 
"context": "$row.context", } - if name == "TaskCompletionEvaluator": + if builtin in _EVALUATORS_NEEDING_TOOL_CALLS: return { "query": "$prompt", "response": "$prediction", + "tool_calls": "$row.tool_calls", + "tool_definitions": "$row.tool_definitions", } - if name == "ToolCallAccuracyEvaluator": + if builtin in _EVALUATORS_NEEDING_TOOL_DEFS_ONLY: return { "query": "$prompt", "response": "$prediction", - "tool_calls": "$row.tool_calls", "tool_definitions": "$row.tool_definitions", } - return {} + if builtin in _EVALUATORS_NEEDING_OUTPUT_ITEMS: + return { + "query": "$prompt", + "response": "$prediction", + } + if builtin in _NLP_ONLY_EVALUATORS: + return { + "response": "$prediction", + "ground_truth": "$expected", + } + # Default: query + response (works for coherence, fluency, relevance, + # intent_resolution, task_completion, safety evaluators, etc.) + return { + "query": "$prompt", + "response": "$prediction", + } def _default_score_keys(name: str) -> List[str]: @@ -1207,6 +1254,10 @@ def _execute_cloud_evaluation( criterion["initialization_parameters"] = { "deployment_name": settings.model, } + elif builtin_name in _NLP_DEFAULT_INIT_PARAMS: + criterion["initialization_parameters"] = dict( + _NLP_DEFAULT_INIT_PARAMS[builtin_name] + ) testing_criteria.append(criterion) # --- Acquire token for Foundry Project Evals API -------------------- @@ -1246,12 +1297,35 @@ def _evals_get(path: str, extra_params: str = "") -> Dict[str, Any]: ) # --- Data schema ---------------------------------------------------- + # Determine which extra fields the enabled evaluators need so that + # the item_schema declares them and the Foundry service validates + # dataset rows correctly. 
+ builtin_names = frozenset( + _to_builtin_evaluator_name(e.name) for e in foundry_evaluators + ) + needs_tool_defs = bool( + builtin_names + & (_EVALUATORS_NEEDING_TOOL_CALLS | _EVALUATORS_NEEDING_TOOL_DEFS_ONLY) + ) + needs_context = bool(builtin_names & _EVALUATORS_NEEDING_CONTEXT) + + schema_properties: Dict[str, Any] = { + input_field: {"type": "string"}, + expected_field: {"type": "string"}, + } + if needs_context and dataset_config.format.context_field: + schema_properties[dataset_config.format.context_field] = {"type": "string"} + if needs_tool_defs: + schema_properties["tool_definitions"] = { + "anyOf": [ + {"type": "array", "items": {"type": "object"}}, + {"type": "object"}, + ] + } + item_schema: Dict[str, Any] = { "type": "object", - "properties": { - input_field: {"type": "string"}, - expected_field: {"type": "string"}, - }, + "properties": schema_properties, "required": [input_field, expected_field], } @@ -1444,6 +1518,24 @@ def _evals_get(path: str, extra_params: str = "") -> Dict[str, Any]: break value = float(metric_score) row_metric_entries.append({"name": metric_name, "value": value}) + elif isinstance(metric_name, str) and metric_score is None: + # Evaluator returned null score — check for error details. + sample_data = result.get("sample", {}) or {} + error_info = sample_data.get("error", {}) or {} + error_msg = error_info.get("message", "") + if error_msg: + logger.warning( + "Evaluator '%s' returned no score (row %d): %s", + metric_name, + index, + error_msg, + ) + else: + logger.warning( + "Evaluator '%s' returned no score for row %d", + metric_name, + index, + ) # Only emit local evaluator metrics if they are configured in the bundle. 
if "exact_match" in enabled_local_names: diff --git a/src/agentops/services/runner.py b/src/agentops/services/runner.py index 37731ae..5e5cc30 100644 --- a/src/agentops/services/runner.py +++ b/src/agentops/services/runner.py @@ -270,7 +270,10 @@ def _validate_enabled_evaluators_scored( missing = [name for name in evaluator_names if name not in scored_names] if missing: raise ValueError( - "Missing scores for enabled evaluators: " + ", ".join(sorted(missing)) + "Missing scores for enabled evaluators: " + + ", ".join(sorted(missing)) + + ". These evaluators returned no score from the cloud evaluation. " + "Run with --verbose to see details (e.g. region restrictions for safety evaluators)." ) @@ -367,7 +370,9 @@ def _append_run_metric(name: str, value: float) -> None: def run_evaluation( - config_path: Path | None = None, output_override: Path | None = None, report_format: str = "md", + config_path: Path | None = None, + output_override: Path | None = None, + report_format: str = "md", ) -> EvalRunServiceResult: run_config_path = ( config_path.resolve() if config_path is not None else _default_run_config_path() @@ -512,9 +517,7 @@ def run_evaluation( report_path = md_path if report_format in ("html", "all"): html_path = output_dir / "report.html" - html_path.write_text( - generate_report_html(normalized_result), encoding="utf-8" - ) + html_path.write_text(generate_report_html(normalized_result), encoding="utf-8") report_path = html_path if report_format == "all": report_path = md_path diff --git a/tests/unit/test_foundry_backend.py b/tests/unit/test_foundry_backend.py index 128a387..6bbbf8d 100644 --- a/tests/unit/test_foundry_backend.py +++ b/tests/unit/test_foundry_backend.py @@ -442,3 +442,121 @@ def test_default_foundry_input_mapping_tool_call_accuracy() -> None: assert mapping["response"] == "$prediction" assert mapping["tool_calls"] == "$row.tool_calls" assert mapping["tool_definitions"] == "$row.tool_definitions" + + +# 
--------------------------------------------------------------------------- +# Extended evaluator coverage (issue #51) +# --------------------------------------------------------------------------- + + +def test_cloud_evaluator_data_mapping_response_completeness() -> None: + mapping = _cloud_evaluator_data_mapping( + "response_completeness", "input", "expected" + ) + assert mapping["query"] == "{{item.input}}" + assert mapping["response"] == "{{sample.output_text}}" + assert mapping["ground_truth"] == "{{item.expected}}" + + +def test_cloud_evaluator_data_mapping_groundedness_pro() -> None: + mapping = _cloud_evaluator_data_mapping( + "groundedness_pro", "input", "expected", context_field="context" + ) + assert mapping["context"] == "{{item.context}}" + assert mapping["query"] == "{{item.input}}" + assert "ground_truth" not in mapping + + +def test_cloud_evaluator_data_mapping_retrieval() -> None: + mapping = _cloud_evaluator_data_mapping("retrieval", "input", "expected") + assert mapping["context"] == "{{item.expected}}" + assert mapping["query"] == "{{item.input}}" + + +def test_cloud_evaluator_data_mapping_tool_selection() -> None: + mapping = _cloud_evaluator_data_mapping("tool_selection", "input", "expected") + assert mapping["query"] == "{{item.input}}" + assert mapping["response"] == "{{sample.output_text}}" + assert mapping["tool_calls"] == "{{sample.tool_calls}}" + assert mapping["tool_definitions"] == "{{item.tool_definitions}}" + + +def test_cloud_evaluator_data_mapping_tool_input_accuracy() -> None: + mapping = _cloud_evaluator_data_mapping("tool_input_accuracy", "input", "expected") + assert mapping["query"] == "{{item.input}}" + assert mapping["response"] == "{{sample.output_text}}" + assert mapping["tool_definitions"] == "{{item.tool_definitions}}" + assert "tool_calls" not in mapping + + +def test_cloud_evaluator_data_mapping_tool_output_utilization() -> None: + mapping = _cloud_evaluator_data_mapping( + "tool_output_utilization", "input", 
"expected" + ) + assert mapping["query"] == "{{item.input}}" + assert mapping["tool_definitions"] == "{{item.tool_definitions}}" + assert "tool_calls" not in mapping + + +def test_cloud_evaluator_data_mapping_tool_call_success() -> None: + mapping = _cloud_evaluator_data_mapping("tool_call_success", "input", "expected") + assert mapping["tool_definitions"] == "{{item.tool_definitions}}" + assert "tool_calls" not in mapping + + +def test_cloud_evaluator_data_mapping_task_adherence_uses_output_items() -> None: + mapping = _cloud_evaluator_data_mapping("task_adherence", "input", "expected") + assert mapping["query"] == "{{item.input}}" + assert mapping["response"] == "{{sample.output_items}}" + assert "ground_truth" not in mapping + + +def test_cloud_evaluator_data_mapping_coherence_default_path() -> None: + mapping = _cloud_evaluator_data_mapping("coherence", "input", "expected") + assert mapping["query"] == "{{item.input}}" + assert mapping["response"] == "{{sample.output_text}}" + assert "ground_truth" not in mapping + assert "context" not in mapping + assert "tool_calls" not in mapping + + +def test_cloud_evaluator_data_mapping_violence_default_path() -> None: + mapping = _cloud_evaluator_data_mapping("violence", "input", "expected") + assert mapping["query"] == "{{item.input}}" + assert mapping["response"] == "{{sample.output_text}}" + assert "ground_truth" not in mapping + + +def test_cloud_evaluator_data_mapping_intent_resolution_default_path() -> None: + mapping = _cloud_evaluator_data_mapping("intent_resolution", "input", "expected") + assert mapping["query"] == "{{item.input}}" + assert mapping["response"] == "{{sample.output_text}}" + + +def test_default_foundry_input_mapping_tool_selection() -> None: + mapping = _default_foundry_input_mapping("ToolSelectionEvaluator") + assert mapping["tool_calls"] == "$row.tool_calls" + assert mapping["tool_definitions"] == "$row.tool_definitions" + + +def test_default_foundry_input_mapping_tool_input_accuracy() -> None: 
+ mapping = _default_foundry_input_mapping("ToolInputAccuracyEvaluator") + assert mapping["tool_definitions"] == "$row.tool_definitions" + assert "tool_calls" not in mapping + + +def test_default_foundry_input_mapping_coherence() -> None: + mapping = _default_foundry_input_mapping("CoherenceEvaluator") + assert mapping["query"] == "$prompt" + assert mapping["response"] == "$prediction" + assert "ground_truth" not in mapping + + +def test_default_foundry_input_mapping_response_completeness() -> None: + mapping = _default_foundry_input_mapping("ResponseCompletenessEvaluator") + assert mapping["ground_truth"] == "$expected" + + +def test_default_foundry_input_mapping_groundedness_pro() -> None: + mapping = _default_foundry_input_mapping("GroundednessProEvaluator") + assert mapping["context"] == "$row.context" From 46ede700fa5ba1e78cf5b64ee69c12100a5951f1 Mon Sep 17 00:00:00 2001 From: Dongbumlee Date: Tue, 7 Apr 2026 11:43:50 -0700 Subject: [PATCH 2/2] docs: align all documentation with current implementation - Fix skill paths: plugins/agentops/skills/ (not .github/plugins/) across README, tutorial-copilot-skills (6 instances) - Fix CLI contract: add eval compare and config cicd as implemented commands in AGENTS.md, copilot-instructions.md, how-it-works.md - Fix source tree listings: add cicd.py, comparison.py, telemetry.py, workflows/ across AGENTS.md, how-it-works.md - Fix test listings: add test_cicd, test_cli_commands, test_comparison, test_telemetry across AGENTS.md, copilot-instructions.md, how-it-works.md - Fix agent_tools_baseline: TaskCompletionEvaluator + ToolCallAccuracyEvaluator (not SimilarityEvaluator placeholder) in README, AGENTS.md, how-it-works.md - Fix JSONL path: data/.jsonl (not datasets/) in ci-github-actions.md - Fix init flag: --dir (not --path) in README - Fix evaluator guidance: add frozenset names and NLP_DEFAULT_INIT_PARAMS to copilot-instructions.md - Add context_field to dataset format docs in AGENTS.md - Add rouge_type default note to 
evaluator reference doc - Update planned command message to list all 5 available commands - Add --format flag to CLI usage examples --- .github/copilot-instructions.md | 11 ++++++-- AGENTS.md | 27 ++++++++++++++----- README.md | 15 ++++++----- docs/ci-github-actions.md | 3 +-- ...ndry-evaluation-sdk-built-in-evaluators.md | 4 +-- docs/how-it-works.md | 26 ++++++++++++------ docs/tutorial-copilot-skills.md | 12 ++++----- docs/tutorial-rag.md | 2 +- src/agentops/cli/app.py | 3 ++- 9 files changed, 67 insertions(+), 36 deletions(-) diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index 91a05b3..c106919 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -54,7 +54,9 @@ Only the following commands are in scope: - `agentops init` - `agentops eval run --config [--output ]` +- `agentops eval compare --runs ,[,ID3,...] [--output ]` - `agentops report --in [--out ]` +- `agentops config cicd [--force] [--dir ]` Do not add new commands or flags unless explicitly discussed. @@ -80,7 +82,7 @@ See `docs/how-it-works.md` for the full source-code map and architecture diagram - Keep CLI command handlers **thin** (`cli/app.py`) — only parse args and call `services/` - Place business logic in: - `core/` — config loading, Pydantic models, thresholds, report generation. **Must have zero Azure SDK imports and zero network calls.** - - `services/` — orchestration (runner), Foundry publishing, workspace init, report regen + - `services/` — orchestration (runner), comparison, CI/CD workflow generation, Foundry publishing, workspace init, report regen - `backends/` — execution backends (Foundry, subprocess). Each implements the `Backend` protocol from `base.py`. 
- Use `pathlib.Path` everywhere (no raw string paths) - No side effects at import time @@ -130,6 +132,7 @@ The Foundry backend (`backends/foundry_backend.py`) is the largest and most comp - Auto-derive Azure OpenAI endpoint from the project endpoint via `_derive_openai_endpoint_from_project()` — users should not need to set `AZURE_OPENAI_ENDPOINT` manually. - Agent invocation supports both reference-based and threads-based API calls. - Evaluator names map from class names to builtins: `SimilarityEvaluator` → `builtin.similarity`. +- Cloud evaluator routing uses frozensets: `_EVALUATORS_NEEDING_GROUND_TRUTH`, `_EVALUATORS_NEEDING_CONTEXT`, `_EVALUATORS_NEEDING_TOOL_CALLS`, `_EVALUATORS_NEEDING_TOOL_DEFS_ONLY`, `_EVALUATORS_NEEDING_OUTPUT_ITEMS`. NLP evaluators with required init params use `_NLP_DEFAULT_INIT_PARAMS`. ### Environment Variables @@ -208,6 +211,10 @@ When cloud evaluation is used, a `cloud_evaluation.json` is also produced contai - Foundry backend helpers (`test_foundry_backend.py`) - Subprocess backend (`test_subprocess_backend.py`) - Initializer (`test_initializer.py`) + - CI/CD workflow generation (`test_cicd.py`) + - CLI command behavior (`test_cli_commands.py`) + - Eval comparison logic (`test_comparison.py`) + - OTLP telemetry instrumentation (`test_telemetry.py`) - Integration test for: - `agentops eval run` end-to-end using a fake subprocess backend (`test_eval_run_integration.py`) - Tests must assert correct **exit codes** @@ -248,7 +255,7 @@ When generating or modifying code: - Azure SDK imports must be **lazy** (inside functions, not top-level) - Never hardcode Azure API versions — let the SDK handle versioning - Keep user-facing log output clean — no warning cascades or retry noise -- When adding evaluator support, update both cloud (`_cloud_evaluator_data_mapping` + `_cloud_evaluator_needs_model`) and local paths +- When adding evaluator support, add the builtin name to the correct frozenset in `foundry_backend.py` 
(`_EVALUATORS_NEEDING_GROUND_TRUTH`, `_EVALUATORS_NEEDING_CONTEXT`, `_EVALUATORS_NEEDING_TOOL_CALLS`, `_EVALUATORS_NEEDING_TOOL_DEFS_ONLY`, or `_EVALUATORS_NEEDING_OUTPUT_ITEMS`), update `_NLP_DEFAULT_INIT_PARAMS` if init params are required, and update both cloud (`_cloud_evaluator_data_mapping` + `_cloud_evaluator_needs_model`) and local paths - All new logic must have corresponding unit tests in `tests/unit/` - Always mock Azure SDK calls in tests — tests must run without credentials - The `core/` package must remain free of Azure imports and I/O diff --git a/AGENTS.md b/AGENTS.md index 73521af..1ce0bb7 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -17,9 +17,10 @@ Primary capabilities: Public CLI contract: - `agentops init` -- `agentops eval run --config [--output ]` -- `agentops eval compare --runs ,` -- `agentops report --in [--out ]` +- `agentops eval run --config [--output ] [--format md|html|all]` +- `agentops eval compare --runs ,[,ID3,...] [--output ]` +- `agentops report --in [--out ] [--format md|html|all]` +- `agentops config cicd [--force] [--dir ]` Planned CLI stubs (not implemented in this release): - `agentops run list|show` @@ -27,7 +28,7 @@ Planned CLI stubs (not implemented in this release): - `agentops report show|export` - `agentops bundle list|show` - `agentops dataset validate|describe|import` -- `agentops config validate|show|cicd` +- `agentops config validate|show` - `agentops trace init` - `agentops monitor setup|dashboard|alert` - `agentops model list` @@ -114,6 +115,8 @@ src/ │ ├── runner.py # Main evaluation orchestration │ ├── initializer.py # `.agentops/` workspace scaffolding │ ├── reporting.py # `results.json` -> `report.md` + │ ├── comparison.py # `agentops eval compare` logic + │ ├── cicd.py # CI/CD workflow generation │ └── foundry_evals.py # Foundry evaluation publishing helpers │ ├── backends/ @@ -129,10 +132,13 @@ src/ └── templates/ ├── config.yaml # Seed workspace config ├── run.yaml # Seed run config + ├── run-agent.yaml # 
Seed agent run config + ├── run-rag.yaml # Seed RAG run config ├── .gitignore # Seed `.agentops/.gitignore` ├── bundles/ # Starter bundle YAML files ├── datasets/ # Starter dataset YAML configs - └── data/ # Starter dataset JSONL rows + ├── data/ # Starter dataset JSONL rows + └── workflows/ # CI/CD workflow templates ``` ### Tests @@ -149,7 +155,11 @@ tests/ ├── test_reporter.py # Report generation and threshold output ├── test_foundry_backend.py # Foundry backend helpers ├── test_subprocess_backend.py # Subprocess backend behavior - └── test_initializer.py # `.agentops/` scaffold behavior + ├── test_initializer.py # `.agentops/` scaffold behavior + ├── test_cicd.py # CI/CD workflow generation + ├── test_cli_commands.py # CLI command behavior + ├── test_comparison.py # Eval comparison logic + └── test_telemetry.py # OTLP telemetry instrumentation ``` ### Documentation @@ -242,6 +252,7 @@ Key sections: - `format.type` - `format.input_field` - `format.expected_field` +- `format.context_field` Dataset rows live separately in `.agentops/data/*.jsonl`. 
@@ -351,7 +362,9 @@ Common derived run metrics: ### Agent with Tools - Target: Foundry agent - Bundle: `agent_tools_baseline.yaml` -- Current status: placeholder baseline ready for expansion +- Evaluators: `TaskCompletionEvaluator`, `ToolCallAccuracyEvaluator`, `avg_latency_seconds` +- Typical row fields: `input`, `expected`, `tool_definitions` +- Primary evaluator pattern: task completion + tool accuracy + latency --- diff --git a/README.md b/README.md index 26ef35a..5388daf 100644 --- a/README.md +++ b/README.md @@ -156,7 +156,7 @@ Starter bundles created by `agentops init`: |---|---|---| | `model_direct_baseline` (default) | `SimilarityEvaluator` + `avg_latency_seconds` | Model-direct QA checks | | `rag_retrieval_baseline` | `GroundednessEvaluator` + `avg_latency_seconds` | RAG groundedness checks | -| `agent_tools_baseline` | `SimilarityEvaluator` + `avg_latency_seconds` | Agent-with-tools baseline (placeholder) | +| `agent_tools_baseline` | `TaskCompletionEvaluator` + `ToolCallAccuracyEvaluator` + `avg_latency_seconds` | Agent-with-tools baseline | `datasets/` stores YAML dataset definitions. `data/` stores JSONL rows referenced by dataset definitions. 
@@ -168,7 +168,7 @@ Starter bundles created by `agentops init`: | Command | Description | Status | |---|---|---| | `agentops --version` | Show installed version | ✅ | -| `agentops init [--path DIR]` | Scaffold project workspace and starter files | ✅ | +| `agentops init [--dir DIR]` | Scaffold project workspace and starter files | ✅ | | `agentops eval run` | Evaluate a dataset against a bundle | ✅ | | `agentops eval compare --runs ID1,ID2` | Compare two past runs | ✅ | | `agentops run list\|show` | List or inspect past runs | 🚧 | @@ -188,9 +188,10 @@ Implemented command usage: ```bash agentops --version -agentops init [--path ] -agentops eval run [--config ] [--output ] -agentops report [--in ] [--out ] +agentops init [--dir ] +agentops eval run [--config ] [--output ] [--format md|html|all] +agentops eval compare --runs ID1,ID2 [--output ] [--format md|html|all] +agentops report [--in ] [--out ] [--format md|html|all] agentops config cicd [--force] [--dir ] ``` @@ -237,13 +238,13 @@ Skills are distributed from this GitHub repository. Install them in VS Code: 1. Open **VS Code** with **GitHub Copilot Chat** enabled. 2. Use the Copilot skill install command and point to this repository: - Source: `Azure/agentops` - - Skills are located under `.github/plugins/agentops/skills/` + - Skills are located under `plugins/agentops/skills/` 3. Once installed, Copilot will automatically use the skills when you ask about AgentOps evaluation, regressions, or observability. 
Alternatively, you can copy the skill files manually: ```bash # Copy skills to your user-level skills directory -cp -r .github/plugins/agentops/skills/* ~/.agents/skills/ +cp -r plugins/agentops/skills/* ~/.agents/skills/ ``` ### For Repository Contributors diff --git a/docs/ci-github-actions.md b/docs/ci-github-actions.md index e368c74..fa1a76f 100644 --- a/docs/ci-github-actions.md +++ b/docs/ci-github-actions.md @@ -36,8 +36,8 @@ Your repository must contain these files for the workflow to succeed: | --------------------------------- | --------------------------------------------------------------- | | `.agentops/run.yaml` | Run specification — references the bundle, dataset, and backend | | `.agentops/bundles/.yaml` | Evaluation bundle — evaluators + thresholds | -| `.agentops/datasets/.yaml` | Dataset metadata | -| `.agentops/datasets/.jsonl` | Dataset rows (JSONL format) | +| `.agentops/datasets/.yaml` | Dataset metadata | +| `.agentops/data/.jsonl` | Dataset rows (JSONL format) | All paths in `run.yaml` are relative to the `.agentops/` directory. 
diff --git a/docs/foundry-evaluation-sdk-built-in-evaluators.md b/docs/foundry-evaluation-sdk-built-in-evaluators.md index 6e7b131..d221b48 100644 --- a/docs/foundry-evaluation-sdk-built-in-evaluators.md +++ b/docs/foundry-evaluation-sdk-built-in-evaluators.md @@ -44,7 +44,7 @@ evaluators: | `F1ScoreEvaluator` | Textual similarity (NLP) | `response`, `ground_truth` | none beyond class init defaults | `input_mapping(response,ground_truth)` | | `BleuScoreEvaluator` | Textual similarity (NLP) | `response`, `ground_truth` | none beyond class init defaults | `input_mapping(response,ground_truth)` | | `GleuScoreEvaluator` | Textual similarity (NLP) | `response`, `ground_truth` | none beyond class init defaults | `input_mapping(response,ground_truth)` | -| `RougeScoreEvaluator` | Textual similarity (NLP) | `response`, `ground_truth` | optional `rouge_type` in `init` | `input_mapping(response,ground_truth)` | +| `RougeScoreEvaluator` | Textual similarity (NLP) | `response`, `ground_truth` | `rouge_type` in `init` (AgentOps defaults to `rouge1`) | `input_mapping(response,ground_truth)` | | `MeteorScoreEvaluator` | Textual similarity (NLP) | `response`, `ground_truth` | optional `alpha/beta/gamma/threshold` in `init` | `input_mapping(response,ground_truth)` | | `RetrievalEvaluator` | RAG | usually `query`, `response`, `context` | `model_config` (AI-assisted) | `input_mapping(query,response,context)` | | `DocumentRetrievalEvaluator` | RAG | retrieval outputs + `ground_truth` | check SDK class contract | explicit `input_mapping` recommended | @@ -215,6 +215,6 @@ AgentOps provides sensible defaults so you don't need to configure extra environ --- -**Last updated:** 2026-03-02 (UTC) +**Last updated:** 2026-04-07 (UTC) Because Foundry Evaluation SDK and evaluator signatures evolve (especially preview features), review official docs before production rollout. 
diff --git a/docs/how-it-works.md b/docs/how-it-works.md index 5c6a4e9..55e3b77 100644 --- a/docs/how-it-works.md +++ b/docs/how-it-works.md @@ -47,6 +47,8 @@ src/ │ ├── runner.py # Main evaluation orchestrator │ ├── reporting.py # Report regeneration service │ ├── initializer.py # Workspace scaffolding (agentops init) + │ ├── comparison.py # Eval comparison (agentops eval compare) + │ ├── cicd.py # CI/CD workflow generation │ └── foundry_evals.py # Foundry Evaluations panel publishing │ ├── backends/ # Execution engines — ADD new backends here @@ -56,16 +58,19 @@ src/ │ ├── utils/ # Shared helpers │ ├── yaml.py # YAML load + env-var interpolation - │ └── logging.py # Logger factory and setup + │ ├── logging.py # Logger factory and setup + │ └── telemetry.py # Optional OTLP tracing (lazy imports) │ └── templates/ # Starter files for `agentops init` ├── config.yaml ├── run.yaml ├── run-rag.yaml ├── run-agent.yaml + ├── .gitignore ├── bundles/ # Pre-built evaluation bundles ├── datasets/ # Dataset definitions (.yaml) - └── data/ # Sample dataset rows (.jsonl) + ├── data/ # Sample dataset rows (.jsonl) + └── workflows/ # CI/CD workflow templates ``` ### Where to Add New Code @@ -116,7 +121,8 @@ When you run `agentops eval run`, the following happens step by step: | `agentops report show\|export` | View or export reports | Planned (stub) | | `agentops bundle list\|show` | Browse bundle definitions | Planned (stub) | | `agentops dataset validate\|describe\|import` | Validate, describe, and import datasets | Planned (stub) | -| `agentops config validate\|show\|cicd` | Validate config and CI/CD scaffolding | Planned (stub) | +| `agentops config cicd [--force] [--dir ]` | Generate CI/CD workflow file | Available | +| `agentops config validate\|show` | Validate config and show merged config | Planned (stub) | | `agentops trace init` | Initialize tracing setup | Planned (stub) | | `agentops monitor setup\|dashboard\|alert` | Monitoring setup and operations | Planned (stub) | 
| `agentops model list` | List model deployments from Foundry project | Planned (stub) | @@ -174,7 +180,7 @@ The `.agentops/` directory lives in your project root and stores all evaluation - `source: local` for AgentOps-native evaluators (for example `exact_match`, `avg_latency_seconds`) - `source: foundry` for Foundry SDK evaluators (name must match evaluator class name, for example `GroundednessEvaluator`) - Supported local evaluators are explicit: `exact_match`, `latency_seconds`, `avg_latency_seconds`. -- AgentOps does not emulate Foundry evaluators locally; if you configure `SimilarityEvaluator`/`GroundednessEvaluator`, use `source: foundry`. +- AgentOps does not emulate Foundry evaluators locally; if you configure Foundry SDK evaluators (e.g. `SimilarityEvaluator`, `CoherenceEvaluator`, `ToolCallAccuracyEvaluator`, etc.), use `source: foundry`. - Foundry evaluators support generic configuration via `evaluators[].config`: - `kind`: `builtin` (default) or `custom` - `class_name`: built-in class name from `azure.ai.evaluation` (optional; defaults to evaluator `name`) @@ -244,7 +250,7 @@ For built-in Foundry evaluators, AgentOps uses `DefaultAzureCredential` by defau - Recommended evaluation scenario bundles: - `model_direct_baseline`: Model-Only — SimilarityEvaluator (no retrieval, no tools) - `rag_retrieval_baseline`: RAG — GroundednessEvaluator (retrieval-augmented) - - `agent_tools_baseline`: Agent with Tools — placeholder (to be expanded) + - `agent_tools_baseline`: Agent with Tools — TaskCompletionEvaluator + ToolCallAccuracyEvaluator - Threshold criteria: - Numeric: `>=`, `>`, `<=`, `<`, `==` (requires `value`) @@ -332,10 +338,10 @@ AgentOps supports three evaluation scenarios: - Dataset: rows with `input`, `expected`, and `context` fields - Backend config: `target: agent` (agent with knowledge base / retrieval) -### Agent with Tools (placeholder) +### Agent with Tools - Evaluates agents that use tool calls (function calling) -- Bundle: 
`agent_tools_baseline.yaml` (placeholder — will be expanded with tool-call evaluators) +- Bundle: `agent_tools_baseline.yaml` — `TaskCompletionEvaluator` + `ToolCallAccuracyEvaluator` + `avg_latency_seconds` - Backend config: `target: agent` ## Backend behavior @@ -545,7 +551,11 @@ tests/ ├── test_yaml_loader.py # YAML loading + env-var interpolation ├── test_foundry_backend.py # Foundry backend helpers (mocked) ├── test_subprocess_backend.py # Subprocess backend - └── test_initializer.py # Workspace scaffolding + ├── test_initializer.py # Workspace scaffolding + ├── test_cicd.py # CI/CD workflow generation + ├── test_cli_commands.py # CLI command behavior + ├── test_comparison.py # Eval comparison logic + └── test_telemetry.py # OTLP telemetry instrumentation ``` Run all tests: diff --git a/docs/tutorial-copilot-skills.md b/docs/tutorial-copilot-skills.md index ad5345c..c99ce5d 100644 --- a/docs/tutorial-copilot-skills.md +++ b/docs/tutorial-copilot-skills.md @@ -38,7 +38,7 @@ In VS Code: 1. Open **Copilot Chat**. 2. Use the skill install flow and point to this repository: - **Source:** `Azure/agentops` - - **Skill path:** `.github/plugins/agentops/skills/` + - **Skill path:** `plugins/agentops/skills/` 3. Select the skills you want to install. Once installed, the skills appear in `~/.agents/skills/` and a lock file (`~/.agents/.skill-lock.json`) tracks where they came from. Skills are available across all workspaces. 
@@ -50,14 +50,14 @@ If you prefer to manage skills manually: **macOS / Linux:** ```bash git clone https://github.com/Azure/agentops.git /tmp/agentops -cp -r /tmp/agentops/.github/plugins/agentops/skills/* ~/.agents/skills/ +cp -r /tmp/agentops/plugins/agentops/skills/* ~/.agents/skills/ rm -rf /tmp/agentops ``` **Windows (PowerShell):** ```powershell git clone https://github.com/Azure/agentops.git $env:TEMP\agentops -Copy-Item -Recurse "$env:TEMP\agentops\.github\plugins\agentops\skills\*" "$env:USERPROFILE\.agents\skills\" +Copy-Item -Recurse "$env:TEMP\agentops\plugins\agentops\skills\*" "$env:USERPROFILE\.agents\skills\" Remove-Item -Recurse -Force "$env:TEMP\agentops" ``` @@ -66,8 +66,8 @@ Remove-Item -Recurse -Force "$env:TEMP\agentops" If you want the skills available only within a specific repository (useful for teams with different tool versions), copy them into the project: ```bash -mkdir -p .github/plugins/agentops/skills -cp -r /.github/plugins/agentops/skills/* .github/plugins/agentops/skills/ +mkdir -p plugins/agentops/skills +cp -r /plugins/agentops/skills/* plugins/agentops/skills/ ``` This way the skills travel with the repo and every contributor gets them automatically. @@ -111,7 +111,7 @@ Pull the latest version from the repository and re-copy: ```bash git clone https://github.com/Azure/agentops.git /tmp/agentops -cp -r /tmp/agentops/.github/plugins/agentops/skills/* ~/.agents/skills/ +cp -r /tmp/agentops/plugins/agentops/skills/* ~/.agents/skills/ rm -rf /tmp/agentops ``` diff --git a/docs/tutorial-rag.md b/docs/tutorial-rag.md index abef541..a77e2e7 100644 --- a/docs/tutorial-rag.md +++ b/docs/tutorial-rag.md @@ -117,7 +117,7 @@ Each row has: - `expected` — the reference answer - `context` — the retrieved document context used by `GroundednessEvaluator` -The `GroundednessEvaluator` checks whether the agent's response is grounded in the `context` (mapped via `expected_field` → `context` in the evaluator's data mapping). 
+The `GroundednessEvaluator` checks whether the agent's response is grounded in the `context` column. Set `format.context_field: context` in your dataset YAML so the evaluator maps it correctly. If `context_field` is not set, the evaluator falls back to `expected_field`. > **Tip**: For a real RAG scenario, populate the `context` field with actual retrieved passages from your knowledge base. diff --git a/src/agentops/cli/app.py b/src/agentops/cli/app.py index a9f9e7b..d6b5eb2 100644 --- a/src/agentops/cli/app.py +++ b/src/agentops/cli/app.py @@ -50,7 +50,8 @@ def _planned_command(command_name: str) -> None: typer.echo( "This command is planned but not implemented in this release:\n" f" {command_name}\n" - "Please use the currently available commands (`init`, `eval run`, `report`) for now." + "Please use the currently available commands" + " (`init`, `eval run`, `eval compare`, `report`, `config cicd`) for now." ) raise typer.Exit(code=1)