17 changes: 17 additions & 0 deletions apps/web/src/content/docs/docs/evaluation/rubrics.mdx
@@ -122,6 +122,23 @@ score = sum(criterion_score / 10 * weight) / sum(total_weights)
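The aggregation formula carried in the hunk context can be sanity-checked with a short sketch; the criterion scores and weights below are made-up illustration values, not part of the change:

```typescript
// score = sum(criterion_score / 10 * weight) / sum(total_weights)
// Scores and weights here are invented for illustration.
const criteria = [
  { score: 8, weight: 2 },  // criterion scored 8/10, weight 2
  { score: 10, weight: 1 }, // criterion scored 10/10, weight 1
];
const totalWeight = criteria.reduce((sum, c) => sum + c.weight, 0);
const score =
  criteria.reduce((sum, c) => sum + (c.score / 10) * c.weight, 0) / totalWeight;
console.log(score.toFixed(4)); // 0.8667
```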

Write rubric criteria directly in `assertions`. If you want help choosing between plain assertions, deterministic graders, and rubric or LLM-based grading, use the `agentv-eval-writer` skill. Keep the grader choice driven by the criteria rather than one fixed recipe.

## Context Available to Rubric Graders

Rubric assertions automatically receive the full evaluation context, not just the agent's text answer. When present, the following are appended to the grader prompt:

- **`file_changes`** — unified diff of workspace file changes (when `workspace` is configured)
- **`tool_calls`** — formatted summary of tool calls from agent execution (tool name + key inputs)

This means rubric criteria can reason about *what the agent did*, not only what it said. For example, you can check whether an agent invoked a specific skill:

```yaml
assertions:
- The agent invoked the acme-deploy skill
- The agent used Read to inspect the config file before editing
```

This is a lightweight alternative to the `skill-trigger` evaluator when you want to check tool usage with natural-language criteria.
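When present, the appended context looks roughly like this (tool names, paths, and commands here are illustrative):

```text
[[ ## tool_calls ## ]]
- Skill: acme-deploy
- Read: /workspace/config.yaml
- Bash: trident status --env prod --service payments-api
```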

## Combining with Other Graders

Rubrics work alongside code and LLM graders:
2 changes: 2 additions & 0 deletions apps/web/src/content/docs/docs/graders/llm-graders.mdx
@@ -73,6 +73,7 @@ Score the response from 0.0 to 1.0 based on:
| `expected_output` | Full resolved expected array, JSON-serialized |
| `output` | Full provider output array, JSON-serialized |
| `file_changes` | Unified diff of workspace file changes (populated when `workspace` is configured) |
| `tool_calls` | Formatted summary of tool calls from agent execution (tool name + key inputs per call) |

## Per-Grader Target

@@ -228,6 +229,7 @@ Derived strings injected into grader prompts:
| `expected_output` | Full resolved expected array, JSON-serialized |
| `output` | Full provider output array, JSON-serialized |
| `file_changes` | Unified diff of workspace file changes (populated when `workspace` is configured) |
| `tool_calls` | Formatted summary of tool calls from agent execution (tool name + key inputs per call) |

**Example flow:**

39 changes: 39 additions & 0 deletions examples/features/tool-calls-template/evals/eval.yaml
@@ -0,0 +1,39 @@
# Tool Calls Template Variable Demo
#
# Demonstrates using {{ tool_calls }} with rubric assertions to check
# whether an agent invoked the right skills — without needing the
# skill-trigger evaluator.
#
# Skills live in workspace/.agents/skills/. The before_all hook copies
# them to .claude/skills/ so copilot and other providers can discover them.
#
# Run:
# bun agentv eval examples/features/tool-calls-template/evals/eval.yaml --target copilot

name: tool-calls-template
description: Rubric assertions with {{ tool_calls }} for skill verification

workspace:
template: ../workspace/
hooks:
before_all:
command:
- bash
- -c
- 'WS=$(python3 -c "import json,sys;print(json.load(sys.stdin)[\"workspace_path\"])") && mkdir -p "$WS/.claude" && cp -r "$WS/.agents/skills" "$WS/.claude/skills"'

tests:
- id: deploy-skill-triggered
input: How do I deploy payments-api to production?
assertions:
- The agent invoked the acme-deploy skill

- id: rollback-skill-triggered
input: I need to roll back user-service in staging, what's the procedure?
assertions:
- The agent invoked the acme-deploy skill

- id: no-skill-for-unrelated
input: Write a Python function that parses JSON logs and extracts error messages.
assertions:
- The tool_calls section does not contain any entry starting with "Skill:" (file creation, Read, Edit, and Bash are fine)
@@ -0,0 +1,49 @@
---
name: acme-deploy
description: Use when the user asks about deploying services, checking deployment status, rollback procedures, or release management at Acme Corp
---

# Acme Corp Deployment Procedures

## Overview

Internal deployment runbook for Acme Corp services. All deployments follow the Trident release pipeline.

## Deployment Commands

### Deploy to staging
```bash
trident push --env staging --service <service-name> --tag <git-sha>
```

### Promote to production
```bash
trident promote --from staging --to prod --service <service-name> --approval-ticket <JIRA-ID>
```
Production deploys require a JIRA approval ticket (prefix: DEPLOY-).

### Rollback
```bash
trident rollback --env <env> --service <service-name> --to-version <previous-tag>
```
Rollbacks auto-notify #ops-alerts in Slack.

### Check deployment status
```bash
trident status --env <env> --service <service-name>
```

## Service Registry

| Service | Owner Team | Staging URL | Prod URL |
|---------|-----------|-------------|----------|
| payments-api | Platform | payments.staging.acme.internal | payments.acme.internal |
| user-service | Identity | users.staging.acme.internal | users.acme.internal |
| notifications | Engagement | notify.staging.acme.internal | notify.acme.internal |

## Rules

- All prod deploys require a DEPLOY- JIRA ticket
- Staging deploys are auto-approved during business hours (9am-5pm PT)
- Rollbacks bypass approval but require post-mortem within 48h
- Deploy freezes are announced in #engineering-announcements
79 changes: 79 additions & 0 deletions packages/core/src/evaluation/graders/format-tool-calls.ts
@@ -0,0 +1,79 @@
/**
* Formats tool calls from agent output messages into a human-readable summary.
*
* Used by `{{ tool_calls }}` template variable in LLM grader prompts.
* Extracts key input fields per tool to keep the summary compact:
* - Skill: `skill` arg
* - Read/Write/Edit: `file_path`
* - Bash: `command`
* - Grep/Glob: `pattern`
* - Other tools: first string-valued input field (if any)
*
* Returns empty string when there are no tool calls (template variable resolves to '').
*/

import type { Message } from '../providers/types.js';

/**
* Key input fields to extract per tool name.
* Order matters — first matching field wins.
*/
const KEY_INPUT_FIELDS: ReadonlyMap<string, readonly string[]> = new Map([
['Skill', ['skill']],
['Read', ['file_path']],
['Write', ['file_path']],
['Edit', ['file_path']],
['Bash', ['command']],
['Grep', ['pattern']],
['Glob', ['pattern']],
]);

/** Fallback: pick the first short string-valued field from input. */
const MAX_FALLBACK_LENGTH = 120;

export function formatToolCalls(output: readonly Message[] | undefined): string {
if (!output) return '';

const lines: string[] = [];

for (const message of output) {
if (!message.toolCalls) continue;
for (const call of message.toolCalls) {
const toolName = call.tool ?? 'unknown';
const detail = extractKeyDetail(toolName, call.input);
lines.push(detail ? `- ${toolName}: ${detail}` : `- ${toolName}`);
}
}

return lines.length > 0 ? lines.join('\n') : '';
}

function extractKeyDetail(toolName: string, input: unknown): string {
if (!input || typeof input !== 'object') return '';
const record = input as Record<string, unknown>;

// Try known key fields for this tool
const knownFields = KEY_INPUT_FIELDS.get(toolName);
if (knownFields) {
for (const field of knownFields) {
const value = record[field];
if (typeof value === 'string' && value.length > 0) {
return truncate(value);
}
}
}

// Fallback: first short string-valued field
for (const value of Object.values(record)) {
if (typeof value === 'string' && value.length > 0 && value.length <= MAX_FALLBACK_LENGTH) {
return truncate(value);
}
}

return '';
}

function truncate(value: string, maxLen = 120): string {
if (value.length <= maxLen) return value;
return `${value.slice(0, maxLen)}…`;
}
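A standalone sketch of the extraction rules above shows the summary format the grader receives; the `ToolCall`/`Msg` shapes below are simplified assumptions for illustration, not the real `Message` type:

```typescript
// Simplified sketch of the extraction rules in format-tool-calls.ts.
// ToolCall/Msg are assumed shapes, not the real provider types.
type ToolCall = { tool?: string; input?: unknown };
type Msg = { toolCalls?: readonly ToolCall[] };

const KEY_FIELDS: Record<string, readonly string[]> = {
  Skill: ['skill'],
  Read: ['file_path'],
  Bash: ['command'],
  Grep: ['pattern'],
};

function summarize(output: readonly Msg[]): string {
  const lines: string[] = [];
  for (const m of output) {
    for (const call of m.toolCalls ?? []) {
      const name = call.tool ?? 'unknown';
      const rec = (call.input ?? {}) as Record<string, unknown>;
      let detail = '';
      // First matching known field wins, mirroring KEY_INPUT_FIELDS above.
      for (const field of KEY_FIELDS[name] ?? []) {
        const v = rec[field];
        if (typeof v === 'string' && v.length > 0) {
          detail = v;
          break;
        }
      }
      lines.push(detail ? `- ${name}: ${detail}` : `- ${name}`);
    }
  }
  return lines.join('\n');
}

console.log(summarize([
  { toolCalls: [{ tool: 'Skill', input: { skill: 'acme-deploy' } }] },
  { toolCalls: [{ tool: 'Bash', input: { command: 'trident status --env prod' } }] },
]));
// - Skill: acme-deploy
// - Bash: trident status --env prod
```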
2 changes: 2 additions & 0 deletions packages/core/src/evaluation/graders/index.ts
@@ -55,6 +55,8 @@ export {
} from './llm-grader.js';
export type { LlmGraderOptions } from './llm-grader.js';

export { formatToolCalls } from './format-tool-calls.js';

export { SkillTriggerGrader } from './skill-trigger.js';

export { assembleLlmGraderPrompt } from './llm-grader-prompt.js';
32 changes: 28 additions & 4 deletions packages/core/src/evaluation/graders/llm-grader-prompt.ts
@@ -24,6 +24,7 @@ export function assembleLlmGraderPrompt(input: {
evaluatorConfig?: LlmGraderConfig;
output?: readonly Message[];
fileChanges?: string;
toolCalls?: string;
graderTemplateOverride?: string;
}): LlmGraderPromptAssembly {
const {
@@ -32,6 +33,7 @@
promptInputs,
evaluatorConfig,
fileChanges,
toolCalls,
graderTemplateOverride,
} = input;

@@ -41,19 +43,27 @@
if (rubrics && rubrics.length > 0) {
const hasScoreRanges = rubrics.some((r) => r.score_ranges && r.score_ranges.length > 0);
if (hasScoreRanges) {
return assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChanges);
return assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChanges, toolCalls);
}
return assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChanges);
return assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChanges, toolCalls);
}

return assembleFreeform(evalCase, candidate, promptInputs, fileChanges, graderTemplateOverride);
return assembleFreeform(
evalCase,
candidate,
promptInputs,
fileChanges,
toolCalls,
graderTemplateOverride,
);
}

function assembleFreeform(
evalCase: EvalTest,
candidate: string,
promptInputs: PromptInputs,
fileChanges?: string,
toolCalls?: string,
graderTemplateOverride?: string,
): LlmGraderPromptAssembly {
const formattedQuestion =
@@ -67,6 +77,7 @@
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (evalCase.reference_answer ?? '').trim(),
[TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(),
[TEMPLATE_VARIABLES.FILE_CHANGES]: fileChanges ?? '',
[TEMPLATE_VARIABLES.TOOL_CALLS]: toolCalls ?? '',
// Deprecated aliases
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: candidate.trim(),
@@ -77,10 +88,13 @@
const template = graderTemplateOverride ?? DEFAULT_GRADER_TEMPLATE;
let userPrompt = substituteVariables(template, variables);

// Append file_changes section to default template only when present
// Append file_changes and tool_calls sections to default template only when present
if (fileChanges && !graderTemplateOverride) {
userPrompt += `\n\n[[ ## file_changes ## ]]\n${fileChanges}`;
}
if (toolCalls && !graderTemplateOverride) {
userPrompt += `\n\n[[ ## tool_calls ## ]]\n${toolCalls}`;
}

return {
systemPrompt,
@@ -96,6 +110,7 @@
promptInputs: PromptInputs,
rubrics: readonly RubricItem[],
fileChanges?: string,
toolCalls?: string,
): LlmGraderPromptAssembly {
const formattedQuestion =
promptInputs.question && promptInputs.question.trim().length > 0
@@ -123,6 +138,10 @@
parts.push('[[ ## file_changes ## ]]', fileChanges, '');
}

if (toolCalls) {
parts.push('[[ ## tool_calls ## ]]', toolCalls, '');
}

parts.push('[[ ## rubrics ## ]]');

for (const rubric of rubrics) {
@@ -150,6 +169,7 @@
promptInputs: PromptInputs,
rubrics: readonly RubricItem[],
fileChanges?: string,
toolCalls?: string,
): LlmGraderPromptAssembly {
const formattedQuestion =
promptInputs.question && promptInputs.question.trim().length > 0
@@ -178,6 +198,10 @@
parts.push('[[ ## file_changes ## ]]', fileChanges, '');
}

if (toolCalls) {
parts.push('[[ ## tool_calls ## ]]', toolCalls, '');
}

parts.push('[[ ## scoring_criteria ## ]]');

for (const rubric of rubrics) {