EntityProcess · christso · Apr 12, 2026 · Apr 12, 2026
diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts
@@ -1,7 +1,12 @@
 import { mkdir, readFile, writeFile } from 'node:fs/promises';
 import path from 'node:path';
 
-import { DEFAULT_THRESHOLD, type EvaluationResult, type EvaluatorResult } from '@agentv/core';
+import {
+  DEFAULT_THRESHOLD,
+  type EvaluationResult,
+  type EvaluatorResult,
+  type TranscriptJsonLine,
+} from '@agentv/core';
 import { toSnakeCaseDeep } from '../../utils/case-conversion.js';
 import { RESULT_INDEX_FILENAME } from './result-layout.js';
 
@@ -766,5 +771,41 @@ export async function writeArtifactsFromResults(
 
   await writeJsonlFile(indexPath, indexRecords);
 
+  // Write transcript JSONL (auto-generated on every eval run)
+  const transcriptPath = path.join(outputDir, 'transcript.jsonl');
+  const transcriptLines: TranscriptJsonLine[] = results.map((result) => {
+    let inputText = '';
+    if (typeof result.input === 'string') {
+      inputText = result.input;
+    } else if (Array.isArray(result.input)) {
+      const firstUserMsg = result.input.find((m) => m.role === 'user');
+      inputText = typeof firstUserMsg?.content === 'string' ? firstUserMsg.content : '';
+    }
+    return {
+      input: inputText,
+      output: result.output,
+      token_usage: result.tokenUsage
+        ? {
+            input: result.tokenUsage.input,
+            output: result.tokenUsage.output,
+            cached: result.tokenUsage.cached,
+          }
+        : undefined,
+      duration_ms: result.durationMs,
+      cost_usd: result.costUsd,
+      source: {
+        provider: result.target,
+        session_id: result.conversationId ?? result.testId,
+        timestamp: result.timestamp,
+      },
+    };
+  });
+  await writeFile(
+    transcriptPath,
+    transcriptLines.map((line) => JSON.stringify(line)).join('\n') +
+      (transcriptLines.length ? '\n' : ''),
+    'utf8',
+  );
+
   return { testArtifactDir, timingPath, benchmarkPath, indexPath };
 }
diff --git a/apps/cli/test/commands/eval/artifact-writer.test.ts b/apps/cli/test/commands/eval/artifact-writer.test.ts
@@ -586,6 +586,7 @@ describe('writeArtifactsFromResults', () => {
       'beta',
       'index.jsonl',
       'timing.json',
+      'transcript.jsonl',
     ]);
 
     const alphaEntries = await readdir(path.join(paths.testArtifactDir, 'alpha'));
@@ -624,7 +625,12 @@ describe('writeArtifactsFromResults', () => {
     const paths = await writeArtifactsFromResults([], testDir);
 
     const artifactEntries = await readdir(paths.testArtifactDir);
-    expect(artifactEntries.sort()).toEqual(['benchmark.json', 'index.jsonl', 'timing.json']);
+    expect(artifactEntries.sort()).toEqual([
+      'benchmark.json',
+      'index.jsonl',
+      'timing.json',
+      'transcript.jsonl',
+    ]);
 
     const timing: TimingArtifact = JSON.parse(await readFile(paths.timingPath, 'utf8'));
     expect(timing.total_tokens).toBe(0);

diff --git a/packages/core/src/evaluation/evaluators/skill-trigger.ts b/packages/core/src/evaluation/evaluators/skill-trigger.ts
@@ -1,120 +1,27 @@
 /**
  * Built-in skill-trigger evaluator.
  *
- * Detects whether the agent invoked a named skill as its first tool call.
- * Supports multiple provider kinds via static tool-name mappings.
- * For providers not covered here, use a code-grader instead.
+ * Detects whether the agent invoked a named skill during a session.
+ * Works with canonical tool names produced by normalizeToolCall() — no
+ * provider-specific matching logic needed.
  *
  * Detection logic:
- *   - Only the FIRST tool call matters.
- *   - Skill tool: checks input.[skillInputField] contains the skill name (case-sensitive substring).
- *   - Read tool: checks input.[readInputField] contains the skill name (case-sensitive substring).
- *   - Any other tool as first call means the skill was not triggered.
+ *   - Scans ALL tool calls (not just the first) for skill invocation evidence.
+ *   - Skill tool: checks `tool === 'Skill'` and `input.skill` contains the skill name.
+ *   - Read tool: checks `tool === 'Read'` and `input.file_path` contains `skills/<skill-name>/`.
+ *   - Fallback: checks tool output for skill file path references.
  *   - Supports negative cases via should_trigger: false.
  *
- * To add a new provider:
- *   1. Create a ToolMatcher with the provider's tool names and input fields.
- *   2. Add entries to PROVIDER_TOOL_SEMANTICS mapping the provider kind(s) to the matcher.
- *   3. If the provider's tool-call format doesn't fit the ToolMatcher model, use a code-grader instead.
+ * Prerequisites:
+ *   All providers and import parsers must call normalizeToolCall() when
+ *   constructing ToolCall objects. This ensures canonical tool names
+ *   ("Skill", "Read", "Write", "Edit", "Bash") and canonical input field
+ *   names (input.skill, input.file_path) regardless of provider.
  */
 
-import type { ProviderKind } from '../providers/types.js';
 import type { SkillTriggerEvaluatorConfig } from '../types.js';
 import type { EvaluationContext, EvaluationScore, Evaluator } from './types.js';
 
-/** Tool-name semantics for different provider kinds. */
-interface ToolMatcher {
-  /** Tool names that indicate skill invocation. */
-  readonly skillTools: readonly string[];
-  /** Input field that contains the skill name for skill tools. */
-  readonly skillInputField: string;
-  /** Tool names that indicate file read. */
-  readonly readTools: readonly string[];
-  /** Input field that contains the skill name for read tools. */
-  readonly readInputField: string;
-  /** Tool-name prefixes that encode the skill directly in the tool name. */
-  readonly skillToolPrefixes?: readonly string[];
-  /** Tool-name prefixes that encode the file path directly in the tool name. */
-  readonly readToolPrefixes?: readonly string[];
-  /** Alternate input field names that may contain the file path. */
-  readonly readInputFields?: readonly string[];
-}
-
-const CLAUDE_MATCHER: ToolMatcher = {
-  skillTools: ['Skill'],
-  skillInputField: 'skill',
-  readTools: ['Read'],
-  readInputField: 'file_path',
-};
-
-/** Copilot uses ACP protocol — tool names vary by version and context. */
-const COPILOT_MATCHER: ToolMatcher = {
-  skillTools: ['Skill', 'skill'],
-  skillInputField: 'skill',
-  readTools: ['Read File', 'readFile', 'Read', 'readTextFile'],
-  readInputField: 'file_path',
-  skillToolPrefixes: ['Using skill: '],
-  readToolPrefixes: ['Viewing '],
-  readInputFields: ['file_path', 'path'],
-};
-
-/**
- * Pi CLI reads skill files using the lowercase `read` tool with a `path` argument.
- * Skills are auto-discovered from `.agents/skills/` relative to the working directory.
- *
- * Skill lookup order (workspace-scoped first):
- *   1. .agents/skills/<skill-name>/SKILL.md  (workspace-relative, auto-discovered)
- *   2. ~/.agents/skills/<skill-name>/SKILL.md (global fallback)
- */
-const PI_CODING_AGENT_MATCHER: ToolMatcher = {
-  skillTools: [],
-  skillInputField: 'skill',
-  readTools: ['read'],
-  readInputField: 'path',
-  readInputFields: ['path', 'file_path', 'filePath'],
-};
-
-/**
- * Codex reads skill files via command_execution using a bash sed command containing
- * the skill file path. The skill name appears in the command string, so we match
- * any command_execution whose command field includes the skill name.
- *
- * Skill lookup order (workspace-scoped first):
- *   1. .agents/skills/<skill-name>/SKILL.md  (workspace-relative)
- *   2. .codex/skills/<skill-name>/SKILL.md   (fallback)
- *   3. ~/.agents/skills/<skill-name>/SKILL.md (global fallback)
- *
- * MCP-based skill invocation (`mcp:<server>/<skill-name>`) is also supported for
- * Codex configurations that surface skills as MCP tools.
- */
-const CODEX_MATCHER: ToolMatcher = {
-  skillTools: [],
-  skillInputField: 'skill',
-  readTools: ['command_execution'],
-  readInputField: 'command',
-  skillToolPrefixes: ['mcp:'],
-  readToolPrefixes: ['mcp:'],
-  readInputFields: ['command', 'path', 'file_path', 'filePath'],
-};
-
-/**
- * Static mapping of provider kinds to their tool-name semantics.
- * Providers not listed here fall back to CLAUDE_MATCHER.
- */
-const PROVIDER_TOOL_SEMANTICS: Partial<Record<ProviderKind, ToolMatcher>> = {
-  claude: CLAUDE_MATCHER,
-  'claude-cli': CLAUDE_MATCHER,
-  'claude-sdk': CLAUDE_MATCHER,
-  codex: CODEX_MATCHER,
-  'pi-coding-agent': PI_CODING_AGENT_MATCHER,
-  'pi-cli': PI_CODING_AGENT_MATCHER,
-  'copilot-cli': COPILOT_MATCHER,
-  'copilot-log': COPILOT_MATCHER,
-  'copilot-sdk': COPILOT_MATCHER,
-  vscode: COPILOT_MATCHER,
-  'vscode-insiders': COPILOT_MATCHER,
-};
-
 export class SkillTriggerEvaluator implements Evaluator {
   readonly kind = 'skill-trigger';
 
@@ -124,19 +31,9 @@ export class SkillTriggerEvaluator implements Evaluator {
     this.config = config;
   }
 
-  private resolveMatcher(providerKind: ProviderKind | undefined): ToolMatcher {
-    if (providerKind) {
-      const match = PROVIDER_TOOL_SEMANTICS[providerKind];
-      if (match) return match;
-    }
-    return CLAUDE_MATCHER;
-  }
-
   evaluate(context: EvaluationContext): EvaluationScore {
     const skillName = this.config.skill;
     const shouldTrigger = this.config.should_trigger !== false;
-    const providerKind = context.provider?.kind as ProviderKind | undefined;
-    const matcher = this.resolveMatcher(providerKind);
 
     const allToolCalls = (context.output ?? []).flatMap((msg) => msg.toolCalls ?? []);
 
@@ -147,42 +44,23 @@ export class SkillTriggerEvaluator implements Evaluator {
       const toolName = toolCall.tool ?? '';
       const input = (toolCall.input ?? {}) as Record<string, unknown>;
 
-      if (matcher.skillTools.includes(toolName)) {
-        const skillArg = String(input[matcher.skillInputField] ?? '');
+      if (toolName === 'Skill') {
+        const skillArg = String(input.skill ?? '');
         if (skillArg.includes(skillName)) {
           triggered = true;
-          evidence = `Skill tool invoked with ${matcher.skillInputField}="${skillArg}"`;
+          evidence = `Skill tool invoked with skill="${skillArg}"`;
           break;
         }
-      } else if (
-        matcher.skillToolPrefixes?.some(
-          (prefix) => toolName.startsWith(prefix) && toolName.includes(skillName),
-        )
-      ) {
-        triggered = true;
-        evidence = `Skill tool invoked via tool name "${toolName}"`;
-        break;
-      } else if (matcher.readTools.includes(toolName)) {
-        const filePath = this.readPathFromInput(input, matcher);
-        if (filePath.includes(skillName)) {
+      } else if (toolName === 'Read') {
+        const filePath = String(input.file_path ?? '');
+        if (filePath.includes(`skills/${skillName}/`)) {
           triggered = true;
           evidence = `Read tool loaded skill file: ${filePath}`;
           break;
         }
-      } else if (
-        matcher.readToolPrefixes?.some(
-          (prefix) => toolName.startsWith(prefix) && toolName.includes(skillName),
-        )
-      ) {
-        triggered = true;
-        evidence = `Read tool loaded skill file via tool name "${toolName}"`;
-        break;
       }
 
       // Fallback: check if a tool's output contains a skill file path.
-      // Some providers (e.g., copilot-sdk) discover skill content via search
-      // tools (grep/glob) whose inputs don't reference the skill name, but
-      // whose outputs include skill file paths like ".agents/skills/<name>/SKILL.md".
       if (!triggered && toolCall.output != null) {
         const outputStr =
           typeof toolCall.output === 'string' ? toolCall.output : JSON.stringify(toolCall.output);
@@ -228,15 +106,4 @@ export class SkillTriggerEvaluator implements Evaluator {
       expectedAspectCount: 1,
     };
   }
-
-  private readPathFromInput(input: Record<string, unknown>, matcher: ToolMatcher): string {
-    const fields = matcher.readInputFields ?? [matcher.readInputField];
-    for (const field of fields) {
-      const value = input[field];
-      if (value !== undefined && value !== null) {
-        return String(value);
-      }
-    }
-    return '';
-  }
 }
diff --git a/packages/core/src/evaluation/providers/claude-cli.ts b/packages/core/src/evaluation/providers/claude-cli.ts
@@ -7,6 +7,7 @@ import path from 'node:path';
 
 import { extractTextContent, toContentArray } from './claude-content.js';
 import { recordClaudeLogEntry } from './claude-log-tracker.js';
+import { normalizeToolCall } from './normalize-tool-call.js';
 import { buildPromptDocument, normalizeInputFiles } from './preread.js';
 import type { ClaudeResolvedConfig } from './targets.js';
 import type {
@@ -493,11 +494,13 @@ function extractToolCalls(content: unknown): readonly ToolCall[] {
     }
     const p = part as Record<string, unknown>;
     if (p.type === 'tool_use' && typeof p.name === 'string') {
-      toolCalls.push({
-        tool: p.name,
-        input: p.input,
-        id: typeof p.id === 'string' ? p.id : undefined,
-      });
+      toolCalls.push(
+        normalizeToolCall('claude-cli', {
+          tool: p.name,
+          input: p.input,
+          id: typeof p.id === 'string' ? p.id : undefined,
+        }),
+      );
     }
   }
   return toolCalls;

diff --git a/packages/core/src/evaluation/providers/claude-sdk.ts b/packages/core/src/evaluation/providers/claude-sdk.ts
@@ -6,6 +6,7 @@ import path from 'node:path';
 
 import { extractTextContent, toContentArray } from './claude-content.js';
 import { recordClaudeLogEntry } from './claude-log-tracker.js';
+import { normalizeToolCall } from './normalize-tool-call.js';
 import { buildPromptDocument, normalizeInputFiles } from './preread.js';
 import type { ClaudeResolvedConfig } from './targets.js';
 import type {
@@ -297,11 +298,13 @@ function extractToolCalls(content: unknown): readonly ToolCall[] {
     }
     const p = part as Record<string, unknown>;
     if (p.type === 'tool_use' && typeof p.name === 'string') {
-      toolCalls.push({
-        tool: p.name,
-        input: p.input,
-        id: typeof p.id === 'string' ? p.id : undefined,
-      });
+      toolCalls.push(
+        normalizeToolCall('claude-sdk', {
+          tool: p.name,
+          input: p.input,
+          id: typeof p.id === 'string' ? p.id : undefined,
+        }),
+      );
     }
   }
   return toolCalls;