diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts index 31a3c794f..e2e026bf2 100644 --- a/apps/cli/src/commands/eval/artifact-writer.ts +++ b/apps/cli/src/commands/eval/artifact-writer.ts @@ -1,7 +1,12 @@ import { mkdir, readFile, writeFile } from 'node:fs/promises'; import path from 'node:path'; -import { DEFAULT_THRESHOLD, type EvaluationResult, type EvaluatorResult } from '@agentv/core'; +import { + DEFAULT_THRESHOLD, + type EvaluationResult, + type EvaluatorResult, + type TranscriptJsonLine, +} from '@agentv/core'; import { toSnakeCaseDeep } from '../../utils/case-conversion.js'; import { RESULT_INDEX_FILENAME } from './result-layout.js'; @@ -766,5 +771,41 @@ export async function writeArtifactsFromResults( await writeJsonlFile(indexPath, indexRecords); + // Write transcript JSONL (auto-generated on every eval run) + const transcriptPath = path.join(outputDir, 'transcript.jsonl'); + const transcriptLines: TranscriptJsonLine[] = results.map((result) => { + let inputText = ''; + if (typeof result.input === 'string') { + inputText = result.input; + } else if (Array.isArray(result.input)) { + const firstUserMsg = result.input.find((m) => m.role === 'user'); + inputText = typeof firstUserMsg?.content === 'string' ? firstUserMsg.content : ''; + } + return { + input: inputText, + output: result.output, + token_usage: result.tokenUsage + ? { + input: result.tokenUsage.input, + output: result.tokenUsage.output, + cached: result.tokenUsage.cached, + } + : undefined, + duration_ms: result.durationMs, + cost_usd: result.costUsd, + source: { + provider: result.target, + session_id: result.conversationId ?? result.testId, + timestamp: result.timestamp, + }, + }; + }); + await writeFile( + transcriptPath, + transcriptLines.map((line) => JSON.stringify(line)).join('\n') + + (transcriptLines.length ? 
'\n' : ''), + 'utf8', + ); + return { testArtifactDir, timingPath, benchmarkPath, indexPath }; } diff --git a/apps/cli/test/commands/eval/artifact-writer.test.ts b/apps/cli/test/commands/eval/artifact-writer.test.ts index a601415c3..7aee68e62 100644 --- a/apps/cli/test/commands/eval/artifact-writer.test.ts +++ b/apps/cli/test/commands/eval/artifact-writer.test.ts @@ -586,6 +586,7 @@ describe('writeArtifactsFromResults', () => { 'beta', 'index.jsonl', 'timing.json', + 'transcript.jsonl', ]); const alphaEntries = await readdir(path.join(paths.testArtifactDir, 'alpha')); @@ -624,7 +625,12 @@ describe('writeArtifactsFromResults', () => { const paths = await writeArtifactsFromResults([], testDir); const artifactEntries = await readdir(paths.testArtifactDir); - expect(artifactEntries.sort()).toEqual(['benchmark.json', 'index.jsonl', 'timing.json']); + expect(artifactEntries.sort()).toEqual([ + 'benchmark.json', + 'index.jsonl', + 'timing.json', + 'transcript.jsonl', + ]); const timing: TimingArtifact = JSON.parse(await readFile(paths.timingPath, 'utf8')); expect(timing.total_tokens).toBe(0); diff --git a/packages/core/src/evaluation/evaluators/skill-trigger.ts b/packages/core/src/evaluation/evaluators/skill-trigger.ts index 19a3908e8..7466c393d 100644 --- a/packages/core/src/evaluation/evaluators/skill-trigger.ts +++ b/packages/core/src/evaluation/evaluators/skill-trigger.ts @@ -1,120 +1,27 @@ /** * Built-in skill-trigger evaluator. * - * Detects whether the agent invoked a named skill as its first tool call. - * Supports multiple provider kinds via static tool-name mappings. - * For providers not covered here, use a code-grader instead. + * Detects whether the agent invoked a named skill during a session. + * Works with canonical tool names produced by normalizeToolCall() — no + * provider-specific matching logic needed. * * Detection logic: - * - Only the FIRST tool call matters. 
- * - Skill tool: checks input.[skillInputField] contains the skill name (case-sensitive substring). - * - Read tool: checks input.[readInputField] contains the skill name (case-sensitive substring). - * - Any other tool as first call means the skill was not triggered. + * - Scans ALL tool calls (not just the first) for skill invocation evidence. + * - Skill tool: checks `tool === 'Skill'` and `input.skill` contains the skill name. + * - Read tool: checks `tool === 'Read'` and `input.file_path` contains a skills/ path. + * - Fallback: checks tool output for skill file path references. * - Supports negative cases via should_trigger: false. * - * To add a new provider: - * 1. Create a ToolMatcher with the provider's tool names and input fields. - * 2. Add entries to PROVIDER_TOOL_SEMANTICS mapping the provider kind(s) to the matcher. - * 3. If the provider's tool-call format doesn't fit the ToolMatcher model, use a code-grader instead. + * Prerequisites: + * All providers and import parsers must call normalizeToolCall() when + * constructing ToolCall objects. This ensures canonical tool names + * ("Skill", "Read", "Write", "Edit", "Bash") and canonical input field + * names (input.skill, input.file_path) regardless of provider. */ -import type { ProviderKind } from '../providers/types.js'; import type { SkillTriggerEvaluatorConfig } from '../types.js'; import type { EvaluationContext, EvaluationScore, Evaluator } from './types.js'; -/** Tool-name semantics for different provider kinds. */ -interface ToolMatcher { - /** Tool names that indicate skill invocation. */ - readonly skillTools: readonly string[]; - /** Input field that contains the skill name for skill tools. */ - readonly skillInputField: string; - /** Tool names that indicate file read. */ - readonly readTools: readonly string[]; - /** Input field that contains the skill name for read tools. */ - readonly readInputField: string; - /** Tool-name prefixes that encode the skill directly in the tool name. 
*/ - readonly skillToolPrefixes?: readonly string[]; - /** Tool-name prefixes that encode the file path directly in the tool name. */ - readonly readToolPrefixes?: readonly string[]; - /** Alternate input field names that may contain the file path. */ - readonly readInputFields?: readonly string[]; -} - -const CLAUDE_MATCHER: ToolMatcher = { - skillTools: ['Skill'], - skillInputField: 'skill', - readTools: ['Read'], - readInputField: 'file_path', -}; - -/** Copilot uses ACP protocol — tool names vary by version and context. */ -const COPILOT_MATCHER: ToolMatcher = { - skillTools: ['Skill', 'skill'], - skillInputField: 'skill', - readTools: ['Read File', 'readFile', 'Read', 'readTextFile'], - readInputField: 'file_path', - skillToolPrefixes: ['Using skill: '], - readToolPrefixes: ['Viewing '], - readInputFields: ['file_path', 'path'], -}; - -/** - * Pi CLI reads skill files using the lowercase `read` tool with a `path` argument. - * Skills are auto-discovered from `.agents/skills/` relative to the working directory. - * - * Skill lookup order (workspace-scoped first): - * 1. .agents/skills//SKILL.md (workspace-relative, auto-discovered) - * 2. ~/.agents/skills//SKILL.md (global fallback) - */ -const PI_CODING_AGENT_MATCHER: ToolMatcher = { - skillTools: [], - skillInputField: 'skill', - readTools: ['read'], - readInputField: 'path', - readInputFields: ['path', 'file_path', 'filePath'], -}; - -/** - * Codex reads skill files via command_execution using a bash sed command containing - * the skill file path. The skill name appears in the command string, so we match - * any command_execution whose command field includes the skill name. - * - * Skill lookup order (workspace-scoped first): - * 1. .agents/skills//SKILL.md (workspace-relative) - * 2. .codex/skills//SKILL.md (fallback) - * 3. ~/.agents/skills//SKILL.md (global fallback) - * - * MCP-based skill invocation (`mcp:/`) is also supported for - * Codex configurations that surface skills as MCP tools. 
- */ -const CODEX_MATCHER: ToolMatcher = { - skillTools: [], - skillInputField: 'skill', - readTools: ['command_execution'], - readInputField: 'command', - skillToolPrefixes: ['mcp:'], - readToolPrefixes: ['mcp:'], - readInputFields: ['command', 'path', 'file_path', 'filePath'], -}; - -/** - * Static mapping of provider kinds to their tool-name semantics. - * Providers not listed here fall back to CLAUDE_MATCHER. - */ -const PROVIDER_TOOL_SEMANTICS: Partial> = { - claude: CLAUDE_MATCHER, - 'claude-cli': CLAUDE_MATCHER, - 'claude-sdk': CLAUDE_MATCHER, - codex: CODEX_MATCHER, - 'pi-coding-agent': PI_CODING_AGENT_MATCHER, - 'pi-cli': PI_CODING_AGENT_MATCHER, - 'copilot-cli': COPILOT_MATCHER, - 'copilot-log': COPILOT_MATCHER, - 'copilot-sdk': COPILOT_MATCHER, - vscode: COPILOT_MATCHER, - 'vscode-insiders': COPILOT_MATCHER, -}; - export class SkillTriggerEvaluator implements Evaluator { readonly kind = 'skill-trigger'; @@ -124,19 +31,9 @@ export class SkillTriggerEvaluator implements Evaluator { this.config = config; } - private resolveMatcher(providerKind: ProviderKind | undefined): ToolMatcher { - if (providerKind) { - const match = PROVIDER_TOOL_SEMANTICS[providerKind]; - if (match) return match; - } - return CLAUDE_MATCHER; - } - evaluate(context: EvaluationContext): EvaluationScore { const skillName = this.config.skill; const shouldTrigger = this.config.should_trigger !== false; - const providerKind = context.provider?.kind as ProviderKind | undefined; - const matcher = this.resolveMatcher(providerKind); const allToolCalls = (context.output ?? []).flatMap((msg) => msg.toolCalls ?? []); @@ -147,42 +44,23 @@ export class SkillTriggerEvaluator implements Evaluator { const toolName = toolCall.tool ?? ''; const input = (toolCall.input ?? {}) as Record; - if (matcher.skillTools.includes(toolName)) { - const skillArg = String(input[matcher.skillInputField] ?? ''); + if (toolName === 'Skill') { + const skillArg = String(input.skill ?? 
''); if (skillArg.includes(skillName)) { triggered = true; - evidence = `Skill tool invoked with ${matcher.skillInputField}="${skillArg}"`; + evidence = `Skill tool invoked with skill="${skillArg}"`; break; } - } else if ( - matcher.skillToolPrefixes?.some( - (prefix) => toolName.startsWith(prefix) && toolName.includes(skillName), - ) - ) { - triggered = true; - evidence = `Skill tool invoked via tool name "${toolName}"`; - break; - } else if (matcher.readTools.includes(toolName)) { - const filePath = this.readPathFromInput(input, matcher); - if (filePath.includes(skillName)) { + } else if (toolName === 'Read') { + const filePath = String(input.file_path ?? ''); + if (filePath.includes(`skills/${skillName}/`)) { triggered = true; evidence = `Read tool loaded skill file: ${filePath}`; break; } - } else if ( - matcher.readToolPrefixes?.some( - (prefix) => toolName.startsWith(prefix) && toolName.includes(skillName), - ) - ) { - triggered = true; - evidence = `Read tool loaded skill file via tool name "${toolName}"`; - break; } // Fallback: check if a tool's output contains a skill file path. - // Some providers (e.g., copilot-sdk) discover skill content via search - // tools (grep/glob) whose inputs don't reference the skill name, but - // whose outputs include skill file paths like ".agents/skills//SKILL.md". if (!triggered && toolCall.output != null) { const outputStr = typeof toolCall.output === 'string' ? toolCall.output : JSON.stringify(toolCall.output); @@ -228,15 +106,4 @@ export class SkillTriggerEvaluator implements Evaluator { expectedAspectCount: 1, }; } - - private readPathFromInput(input: Record, matcher: ToolMatcher): string { - const fields = matcher.readInputFields ?? 
[matcher.readInputField]; - for (const field of fields) { - const value = input[field]; - if (value !== undefined && value !== null) { - return String(value); - } - } - return ''; - } } diff --git a/packages/core/src/evaluation/providers/claude-cli.ts b/packages/core/src/evaluation/providers/claude-cli.ts index 1699810dd..adf7ddd98 100644 --- a/packages/core/src/evaluation/providers/claude-cli.ts +++ b/packages/core/src/evaluation/providers/claude-cli.ts @@ -7,6 +7,7 @@ import path from 'node:path'; import { extractTextContent, toContentArray } from './claude-content.js'; import { recordClaudeLogEntry } from './claude-log-tracker.js'; +import { normalizeToolCall } from './normalize-tool-call.js'; import { buildPromptDocument, normalizeInputFiles } from './preread.js'; import type { ClaudeResolvedConfig } from './targets.js'; import type { @@ -493,11 +494,13 @@ function extractToolCalls(content: unknown): readonly ToolCall[] { } const p = part as Record; if (p.type === 'tool_use' && typeof p.name === 'string') { - toolCalls.push({ - tool: p.name, - input: p.input, - id: typeof p.id === 'string' ? p.id : undefined, - }); + toolCalls.push( + normalizeToolCall('claude-cli', { + tool: p.name, + input: p.input, + id: typeof p.id === 'string' ? 
p.id : undefined, + }), + ); } } return toolCalls; diff --git a/packages/core/src/evaluation/providers/claude-sdk.ts b/packages/core/src/evaluation/providers/claude-sdk.ts index 6e8985fa4..ac29bfd47 100644 --- a/packages/core/src/evaluation/providers/claude-sdk.ts +++ b/packages/core/src/evaluation/providers/claude-sdk.ts @@ -6,6 +6,7 @@ import path from 'node:path'; import { extractTextContent, toContentArray } from './claude-content.js'; import { recordClaudeLogEntry } from './claude-log-tracker.js'; +import { normalizeToolCall } from './normalize-tool-call.js'; import { buildPromptDocument, normalizeInputFiles } from './preread.js'; import type { ClaudeResolvedConfig } from './targets.js'; import type { @@ -297,11 +298,13 @@ function extractToolCalls(content: unknown): readonly ToolCall[] { } const p = part as Record; if (p.type === 'tool_use' && typeof p.name === 'string') { - toolCalls.push({ - tool: p.name, - input: p.input, - id: typeof p.id === 'string' ? p.id : undefined, - }); + toolCalls.push( + normalizeToolCall('claude-sdk', { + tool: p.name, + input: p.input, + id: typeof p.id === 'string' ? 
p.id : undefined, + }), + ); } } return toolCalls; diff --git a/packages/core/src/evaluation/providers/codex.ts b/packages/core/src/evaluation/providers/codex.ts index 9f2ca0ce7..4318ec5c8 100644 --- a/packages/core/src/evaluation/providers/codex.ts +++ b/packages/core/src/evaluation/providers/codex.ts @@ -5,6 +5,7 @@ import { mkdir } from 'node:fs/promises'; import path from 'node:path'; import { recordCodexLogEntry } from './codex-log-tracker.js'; +import { normalizeToolCall } from './normalize-tool-call.js'; import { buildPromptDocument, normalizeInputFiles } from './preread.js'; import type { CodexResolvedConfig } from './targets.js'; import type { @@ -233,29 +234,35 @@ export class CodexProvider implements Provider { } if (itemType === 'command_execution') { - completedToolCalls.push({ - tool: 'command_execution', - input: { command: item.command }, - output: item.aggregated_output, - id: item.id, - }); + completedToolCalls.push( + normalizeToolCall('codex', { + tool: 'command_execution', + input: { command: item.command }, + output: item.aggregated_output, + id: item.id, + }), + ); } if (itemType === 'file_change') { - completedToolCalls.push({ - tool: 'file_change', - input: item.changes, - id: item.id, - }); + completedToolCalls.push( + normalizeToolCall('codex', { + tool: 'file_change', + input: item.changes, + id: item.id, + }), + ); } if (itemType === 'mcp_tool_call') { - completedToolCalls.push({ - tool: `mcp:${item.server}/${item.tool}`, - input: item.arguments, - output: item.result ?? item.error, - id: item.id, - }); + completedToolCalls.push( + normalizeToolCall('codex', { + tool: `mcp:${item.server}/${item.tool}`, + input: item.arguments, + output: item.result ?? 
item.error, + id: item.id, + }), + ); } } diff --git a/packages/core/src/evaluation/providers/copilot-cli.ts b/packages/core/src/evaluation/providers/copilot-cli.ts index 5a9c6f2eb..61b32a6c4 100644 --- a/packages/core/src/evaluation/providers/copilot-cli.ts +++ b/packages/core/src/evaluation/providers/copilot-cli.ts @@ -18,6 +18,7 @@ import { killProcess, resolvePlatformCliPath, } from './copilot-utils.js'; +import { normalizeToolCall } from './normalize-tool-call.js'; import { buildPromptDocument, normalizeInputFiles } from './preread.js'; import type { CopilotCliResolvedConfig } from './targets.js'; import type { @@ -128,15 +129,17 @@ export class CopilotCliProvider implements Provider { // Tool call arrived already completed if (update.status === 'completed' || update.status === 'failed') { const toolName = update.title ?? update.kind ?? 'unknown'; - completedToolCalls.push({ - tool: toolName, - input: update.rawInput, - output: update.rawOutput, - id: callId, - startTime: new Date().toISOString(), - endTime: new Date().toISOString(), - durationMs: 0, - }); + completedToolCalls.push( + normalizeToolCall('copilot-cli', { + tool: toolName, + input: update.rawInput, + output: update.rawOutput, + id: callId, + startTime: new Date().toISOString(), + endTime: new Date().toISOString(), + durationMs: 0, + }), + ); request.streamCallbacks?.onToolCallEnd?.( toolName, update.rawInput, @@ -154,15 +157,17 @@ export class CopilotCliProvider implements Provider { if (inProgress) { toolCallsInProgress.delete(callId); const duration = Date.now() - inProgress.startMs; - completedToolCalls.push({ - tool: inProgress.tool, - input: inProgress.input, - output: update.rawOutput, - id: inProgress.id, - startTime: inProgress.startTime, - endTime: new Date().toISOString(), - durationMs: duration, - }); + completedToolCalls.push( + normalizeToolCall('copilot-cli', { + tool: inProgress.tool, + input: inProgress.input, + output: update.rawOutput, + id: inProgress.id, + startTime: 
inProgress.startTime, + endTime: new Date().toISOString(), + durationMs: duration, + }), + ); request.streamCallbacks?.onToolCallEnd?.( inProgress.tool, inProgress.input, diff --git a/packages/core/src/evaluation/providers/copilot-log-parser.ts b/packages/core/src/evaluation/providers/copilot-log-parser.ts index c2fb54b0e..e1b41414d 100644 --- a/packages/core/src/evaluation/providers/copilot-log-parser.ts +++ b/packages/core/src/evaluation/providers/copilot-log-parser.ts @@ -22,6 +22,7 @@ * 3. Add a test in copilot-log-parser.test.ts */ +import { normalizeToolCall } from './normalize-tool-call.js'; import type { Message, ProviderTokenUsage, ToolCall } from './types.js'; export interface CopilotSessionMeta { @@ -106,11 +107,13 @@ export function parseCopilotEvents(eventsJsonl: string): ParsedCopilotSession { case 'assistant.message': { const toolRequests = data.toolRequests as readonly Record[] | undefined; - const toolCalls: ToolCall[] = (toolRequests ?? []).map((req) => ({ - tool: String(req.name ?? req.toolName ?? ''), - input: req.arguments, - id: req.toolCallId ? String(req.toolCallId) : undefined, - })); + const toolCalls: ToolCall[] = (toolRequests ?? []).map((req) => + normalizeToolCall('copilot-log', { + tool: String(req.name ?? req.toolName ?? ''), + input: req.arguments, + id: req.toolCallId ? 
String(req.toolCallId) : undefined, + }), + ); messages.push({ role: 'assistant', @@ -157,12 +160,12 @@ export function parseCopilotEvents(eventsJsonl: string): ParsedCopilotSession { messages.push({ role: 'assistant', toolCalls: [ - { + normalizeToolCall('copilot-log', { tool: started.toolName, input: started.input, output: data.result, id: toolCallId, - }, + }), ], }); } diff --git a/packages/core/src/evaluation/providers/copilot-sdk.ts b/packages/core/src/evaluation/providers/copilot-sdk.ts index 6f64dbdde..80d72537e 100644 --- a/packages/core/src/evaluation/providers/copilot-sdk.ts +++ b/packages/core/src/evaluation/providers/copilot-sdk.ts @@ -11,6 +11,7 @@ import { isLogStreamingDisabled, resolvePlatformCliPath, } from './copilot-utils.js'; +import { normalizeToolCall } from './normalize-tool-call.js'; import { buildPromptDocument, normalizeInputFiles } from './preread.js'; import type { CopilotSdkResolvedConfig } from './targets.js'; import type { @@ -182,15 +183,17 @@ export class CopilotSdkProvider implements Provider { if (inProgress) { toolCallsInProgress.delete(callId); const endMs = Date.now(); - completedToolCalls.push({ - tool: inProgress.tool, - input: inProgress.input, - output: data?.output ?? data?.result, - id: inProgress.id, - startTime: inProgress.startTime, - endTime: new Date().toISOString(), - durationMs: endMs - inProgress.startMs, - }); + completedToolCalls.push( + normalizeToolCall('copilot-sdk', { + tool: inProgress.tool, + input: inProgress.input, + output: data?.output ?? 
/**
 * Canonical ToolCall name normalization.
 *
 * Maps provider-native tool names and input fields to canonical values so that
 * downstream consumers (evaluators, analytics, transcript writers) never need
 * provider-specific matching logic.
 *
 * Canonical tool names (Claude's naming is the canonical set):
 *   - "Skill" — skill invocation
 *   - "Read"  — file read
 *   - "Write" — file write
 *   - "Edit"  — file edit
 *   - "Bash"  — shell command execution
 *
 * Tools not in the mapping tables pass through unchanged.
 *
 * To add a new provider:
 *   1. Add (or reuse) a tool table and list the provider kind in PROVIDER_FAMILIES.
 *   2. If the provider encodes info in tool-name prefixes (e.g. "Using skill: X"),
 *      give the family a PrefixRule list.
 *   3. Add an entry to INPUT_NORMALIZERS if the provider uses non-canonical
 *      field names (e.g. `path` instead of `file_path`).
 */

import type { ProviderKind, ToolCall } from './types.js';

// ---------------------------------------------------------------------------
// Canonical tool names
// ---------------------------------------------------------------------------

type CanonicalTool = 'Skill' | 'Read' | 'Write' | 'Edit' | 'Bash';

/** Pairs of [provider-native tool name, canonical tool name]. */
type ToolTable = readonly (readonly [string, CanonicalTool])[];

/**
 * Rule for providers that encode information in the tool name itself.
 * Only consulted when the tool name has no exact entry in TOOL_NAME_MAP.
 */
interface PrefixRule {
  readonly prefix: string;
  readonly canonical: CanonicalTool;
  /** If true, the text after the prefix is stored as `input.skill`. */
  readonly extractSkillFromName?: boolean;
  /**
   * If true, the text after the prefix is stored as `input.file_path`, but only
   * when the input does not already carry a path field of its own.
   */
  readonly extractPathFromName?: boolean;
}

/** A group of provider kinds that share tool names and prefix rules. */
interface ProviderFamily {
  readonly kinds: readonly ProviderKind[];
  readonly tools: ToolTable;
  readonly prefixes: readonly PrefixRule[];
}

// ---------------------------------------------------------------------------
// Static tables: native name → canonical name, per provider family
// ---------------------------------------------------------------------------

/** Claude names already are canonical; listed for explicitness and forward safety. */
const CLAUDE_TOOLS: ToolTable = [
  ['Skill', 'Skill'],
  ['Read', 'Read'],
  ['Write', 'Write'],
  ['Edit', 'Edit'],
  ['Bash', 'Bash'],
];

const COPILOT_TOOLS: ToolTable = [
  ['Skill', 'Skill'],
  ['skill', 'Skill'],
  ['Read File', 'Read'],
  ['readFile', 'Read'],
  ['Read', 'Read'],
  ['readTextFile', 'Read'],
  ['writeTextFile', 'Write'],
  ['Write File', 'Write'],
  ['editFile', 'Edit'],
  ['Edit File', 'Edit'],
  ['runTerminalCommand', 'Bash'],
];

const CODEX_TOOLS: ToolTable = [
  ['command_execution', 'Bash'],
  ['file_change', 'Edit'],
];

const PI_TOOLS: ToolTable = [
  ['read', 'Read'],
  ['bash', 'Bash'],
];

const COPILOT_PREFIXES: readonly PrefixRule[] = [
  { prefix: 'Using skill: ', canonical: 'Skill', extractSkillFromName: true },
  // The viewed path lives in the tool name; keep it so consumers that only look
  // at `input.file_path` can still see which file was read.
  { prefix: 'Viewing ', canonical: 'Read', extractPathFromName: true },
];

// NOTE(review): this relabels EVERY Codex MCP call ("mcp:<server>/<tool>") as
// "Skill", including MCP tools that have nothing to do with skills. It is kept
// because the skill-trigger evaluator only matches `tool === 'Skill'`; narrowing
// it needs a coordinated change there — confirm before relying on "Skill"
// counts in analytics or transcripts.
const CODEX_PREFIXES: readonly PrefixRule[] = [
  { prefix: 'mcp:', canonical: 'Skill', extractSkillFromName: true },
];

const PROVIDER_FAMILIES: readonly ProviderFamily[] = [
  { kinds: ['claude', 'claude-cli', 'claude-sdk'], tools: CLAUDE_TOOLS, prefixes: [] },
  {
    kinds: ['copilot-cli', 'copilot-sdk', 'copilot-log', 'vscode', 'vscode-insiders'],
    tools: COPILOT_TOOLS,
    prefixes: COPILOT_PREFIXES,
  },
  { kinds: ['codex'], tools: CODEX_TOOLS, prefixes: CODEX_PREFIXES },
  { kinds: ['pi-coding-agent', 'pi-cli'], tools: PI_TOOLS, prefixes: [] },
];

/** Composite lookup key for TOOL_NAME_MAP. */
function toolKey(providerKind: ProviderKind, nativeName: string): string {
  return `${providerKind}::${nativeName}`;
}

/** Expands the family tables into one exact-match lookup keyed by toolKey(). */
function buildToolNameMap(families: readonly ProviderFamily[]): Map<string, CanonicalTool> {
  const map = new Map<string, CanonicalTool>();
  for (const { kinds, tools } of families) {
    for (const kind of kinds) {
      for (const [nativeName, canonical] of tools) {
        map.set(toolKey(kind, nativeName), canonical);
      }
    }
  }
  return map;
}

/** Collects the prefix rules of every provider kind that has any. */
function buildPrefixMap(
  families: readonly ProviderFamily[],
): Map<ProviderKind, readonly PrefixRule[]> {
  const map = new Map<ProviderKind, readonly PrefixRule[]>();
  for (const { kinds, prefixes } of families) {
    if (prefixes.length === 0) continue;
    for (const kind of kinds) {
      map.set(kind, prefixes);
    }
  }
  return map;
}

/** Exact tool-name mapping: `${providerKind}::${nativeToolName}` → canonical name. */
const TOOL_NAME_MAP: ReadonlyMap<string, CanonicalTool> = buildToolNameMap(PROVIDER_FAMILIES);

/** Prefix rules per provider kind; checked when TOOL_NAME_MAP has no exact match. */
const TOOL_PREFIX_MAP: ReadonlyMap<ProviderKind, readonly PrefixRule[]> =
  buildPrefixMap(PROVIDER_FAMILIES);

// ---------------------------------------------------------------------------
// Input field normalization
// ---------------------------------------------------------------------------

/**
 * After tool-name normalization, ensure canonical input field names exist.
 * A normalizer returns the SAME object when nothing needs to change and a
 * shallow copy otherwise; it never mutates its argument.
 */
type InputNormalizer = (input: Record<string, unknown>) => Record<string, unknown>;

/** Non-canonical field names that may hold a read path, in precedence order. */
const READ_PATH_ALIASES: readonly string[] = ['path', 'filePath'];

/** Copies the first non-nullish path alias to `file_path` when that is unset. */
const normalizeReadInput: InputNormalizer = (input) => {
  // `!= null` on purpose: a null `file_path` must not hide a usable alias.
  if (input.file_path != null) return input;
  for (const alias of READ_PATH_ALIASES) {
    const value = input[alias];
    if (value != null) return { ...input, file_path: value };
  }
  return input;
};

/**
 * Per-canonical-tool input normalizers. "Skill" has no entry: every mapping in
 * this module already reads or writes the canonical `skill` field, so there is
 * nothing to alias.
 */
const INPUT_NORMALIZERS: ReadonlyMap<CanonicalTool, InputNormalizer> = new Map<
  CanonicalTool,
  InputNormalizer
>([['Read', normalizeReadInput]]);

/** True for non-null, non-array objects — the only inputs that can take named fields. */
function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null && !Array.isArray(value);
}

/**
 * Folds the information encoded after a tool-name prefix into the input.
 * Returns the original `input` untouched when the rule extracts nothing, the
 * suffix is empty, or the input is a primitive/array that cannot take a field.
 */
function inputFromToolName(rule: PrefixRule, suffix: string, input: unknown): unknown {
  if (!suffix) return input;
  if (input != null && !isRecord(input)) return input;
  const base: Record<string, unknown> = input ?? {};

  if (rule.extractSkillFromName) {
    return { ...base, skill: suffix };
  }
  if (rule.extractPathFromName) {
    const hasOwnPath =
      base.file_path != null || READ_PATH_ALIASES.some((alias) => base[alias] != null);
    // A path supplied in the input wins over the one shown in the tool name.
    return hasOwnPath ? input : { ...base, file_path: suffix };
  }
  return input;
}

/** Applies the canonical tool's input normalizer, if it has one and the input is an object. */
function applyInputNormalization(canonical: CanonicalTool, tc: ToolCall): ToolCall {
  const normalizer = INPUT_NORMALIZERS.get(canonical);
  if (!normalizer || !isRecord(tc.input)) return tc;

  const normalized = normalizer(tc.input);
  return normalized === tc.input ? tc : { ...tc, input: normalized };
}

// ---------------------------------------------------------------------------
// Public API
// ---------------------------------------------------------------------------

/**
 * Normalize a ToolCall's tool name and input fields to canonical values.
 *
 * Pure function: neither `tc` nor `tc.input` is mutated.
 *
 * @param providerKind - Provider that produced the tool call; selects the mapping tables.
 * @param tc - Tool call as reported by the provider.
 * @returns A copy with the canonical tool name (and canonical input fields where
 *   applicable), or `tc` itself when the tool name is not recognised.
 */
export function normalizeToolCall(providerKind: ProviderKind, tc: ToolCall): ToolCall {
  const nativeName: unknown = tc.tool;
  // Nothing to look up without a name; hand the call back as-is.
  if (typeof nativeName !== 'string') return tc;

  // 1. Exact match
  const exact = TOOL_NAME_MAP.get(toolKey(providerKind, nativeName));
  if (exact !== undefined) {
    return applyInputNormalization(exact, { ...tc, tool: exact });
  }

  // 2. Prefix match (first matching rule wins)
  const rule = TOOL_PREFIX_MAP.get(providerKind)?.find((candidate) =>
    nativeName.startsWith(candidate.prefix),
  );
  if (rule !== undefined) {
    const suffix = nativeName.slice(rule.prefix.length);
    const renamed: ToolCall = {
      ...tc,
      tool: rule.canonical,
      input: inputFromToolName(rule, suffix, tc.input),
    };
    return applyInputNormalization(rule.canonical, renamed);
  }

  // 3. No match — pass through unchanged
  return tc;
}
p.id : undefined, + }), + ); } else if (p.type === 'tool_result' && typeof p.tool_use_id === 'string') { const existing = toolCalls.find((tc) => tc.id === p.tool_use_id); if (existing) { diff --git a/packages/core/src/evaluation/providers/targets.ts b/packages/core/src/evaluation/providers/targets.ts index a33183ce0..d7f67156b 100644 --- a/packages/core/src/evaluation/providers/targets.ts +++ b/packages/core/src/evaluation/providers/targets.ts @@ -455,6 +455,8 @@ export interface CodexResolvedConfig { readonly timeoutMs?: number; readonly logDir?: string; readonly logFormat?: 'summary' | 'json'; + /** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */ + readonly streamLog?: false | 'raw' | 'summary'; readonly systemPrompt?: string; } @@ -467,6 +469,8 @@ export interface CopilotCliResolvedConfig { readonly timeoutMs?: number; readonly logDir?: string; readonly logFormat?: 'summary' | 'json'; + /** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */ + readonly streamLog?: false | 'raw' | 'summary'; readonly systemPrompt?: string; } @@ -480,6 +484,8 @@ export interface CopilotSdkResolvedConfig { readonly timeoutMs?: number; readonly logDir?: string; readonly logFormat?: 'summary' | 'json'; + /** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */ + readonly streamLog?: false | 'raw' | 'summary'; readonly systemPrompt?: string; /** BYOK provider type: "azure", "openai", or "anthropic". */ readonly byokType?: string; @@ -520,6 +526,8 @@ export interface PiCodingAgentResolvedConfig { readonly timeoutMs?: number; readonly logDir?: string; readonly logFormat?: 'summary' | 'json'; + /** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. 
*/ + readonly streamLog?: false | 'raw' | 'summary'; readonly systemPrompt?: string; } @@ -537,6 +545,8 @@ export interface PiCliResolvedConfig { readonly timeoutMs?: number; readonly logDir?: string; readonly logFormat?: 'summary' | 'json'; + /** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */ + readonly streamLog?: false | 'raw' | 'summary'; readonly systemPrompt?: string; } @@ -550,6 +560,8 @@ export interface ClaudeResolvedConfig { readonly maxBudgetUsd?: number; readonly logDir?: string; readonly logFormat?: 'summary' | 'json'; + /** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */ + readonly streamLog?: false | 'raw' | 'summary'; } export interface MockResolvedConfig { @@ -1273,6 +1285,11 @@ function resolveCodexConfig( target.log_format ?? target.log_output_format ?? env.AGENTV_CODEX_LOG_FORMAT; const systemPromptSource = target.system_prompt; + const streamLogResult = resolveStreamLog(target, env.AGENTV_CODEX_LOG_FORMAT); + if (streamLogResult.deprecationWarning) { + process.stderr.write(`[agentv] ⚠ ${streamLogResult.deprecationWarning}\n`); + } + const model = resolveOptionalString(modelSource, env, `${target.name} codex model`, { allowLiteral: true, optionalEnv: true, @@ -1334,6 +1351,7 @@ function resolveCodexConfig( timeoutMs, logDir, logFormat, + streamLog: streamLogResult.streamLog, systemPrompt, }; } @@ -1352,6 +1370,63 @@ function normalizeCodexLogFormat(value: unknown): 'summary' | 'json' | undefined throw new Error("codex log format must be 'summary' or 'json'"); } +/** + * Resolve the stream_log config field, falling back to log_format with a + * deprecation warning. + * + * Resolution order: + * 1. stream_log (new canonical field) + * 2. log_format / log_output_format (deprecated, mapped to stream_log equivalent) + * 3. environment variable fallback (optional) + * + * Mapping: log_format 'json' → 'raw', log_format 'summary' → 'summary'. 
+ */ +function resolveStreamLog( + target: { stream_log?: unknown; log_format?: unknown; log_output_format?: unknown; name: string }, + envFallback?: unknown, +): { + streamLog: false | 'raw' | 'summary' | undefined; + logFormat: 'summary' | 'json' | undefined; + deprecationWarning?: string; +} { + // 1. New stream_log field takes precedence + if (target.stream_log !== undefined && target.stream_log !== null) { + const val = target.stream_log; + if (val === false || val === 'false') { + return { streamLog: false, logFormat: undefined }; + } + if (val === 'raw') { + return { streamLog: 'raw', logFormat: 'json' }; + } + if (val === 'summary') { + return { streamLog: 'summary', logFormat: 'summary' }; + } + throw new Error(`${target.name}: stream_log must be false, 'raw', or 'summary'`); + } + + // 2. Fall back to log_format (deprecated) + const logFormatRaw = target.log_format ?? target.log_output_format ?? envFallback; + if (logFormatRaw === undefined || logFormatRaw === null) { + return { streamLog: undefined, logFormat: undefined }; + } + + if (typeof logFormatRaw !== 'string') { + throw new Error(`${target.name}: log_format must be 'summary' or 'json'`); + } + + const normalized = logFormatRaw.trim().toLowerCase(); + if (normalized !== 'json' && normalized !== 'summary') { + throw new Error(`${target.name}: log_format must be 'summary' or 'json'`); + } + + const streamLogEquivalent = normalized === 'json' ? 'raw' : 'summary'; + return { + streamLog: streamLogEquivalent, + logFormat: normalized as 'json' | 'summary', + deprecationWarning: `${target.name}: 'log_format' is deprecated and will be removed in v4.16. 
Use 'stream_log: ${streamLogEquivalent}' instead (log_format: '${normalized}' → stream_log: '${streamLogEquivalent}').`, + }; +} + function resolveCopilotSdkConfig( target: z.infer, env: EnvLookup, @@ -1368,6 +1443,11 @@ function resolveCopilotSdkConfig( const logFormatSource = target.log_format; const systemPromptSource = target.system_prompt; + const streamLogResult = resolveStreamLog(target); + if (streamLogResult.deprecationWarning) { + process.stderr.write(`[agentv] ⚠ ${streamLogResult.deprecationWarning}\n`); + } + const cliUrl = resolveOptionalString(cliUrlSource, env, `${target.name} copilot-sdk cli URL`, { allowLiteral: true, optionalEnv: true, @@ -1507,6 +1587,7 @@ function resolveCopilotSdkConfig( timeoutMs, logDir, logFormat, + streamLog: streamLogResult.streamLog, systemPrompt, byokType, byokBaseUrl, @@ -1532,6 +1613,11 @@ function resolveCopilotCliConfig( const logFormatSource = target.log_format; const systemPromptSource = target.system_prompt; + const streamLogResult = resolveStreamLog(target); + if (streamLogResult.deprecationWarning) { + process.stderr.write(`[agentv] ⚠ ${streamLogResult.deprecationWarning}\n`); + } + const executable = resolveOptionalString(executableSource, env, `${target.name} copilot-cli executable`, { allowLiteral: true, @@ -1600,6 +1686,7 @@ function resolveCopilotCliConfig( timeoutMs, logDir, logFormat, + streamLog: streamLogResult.streamLog, systemPrompt, }; } @@ -1629,6 +1716,11 @@ function resolvePiCodingAgentConfig( const logFormatSource = target.log_format; const systemPromptSource = target.system_prompt; + const streamLogResult = resolveStreamLog(target); + if (streamLogResult.deprecationWarning) { + process.stderr.write(`[agentv] ⚠ ${streamLogResult.deprecationWarning}\n`); + } + const subprovider = resolveOptionalString( subproviderSource, env, @@ -1719,6 +1811,7 @@ function resolvePiCodingAgentConfig( timeoutMs, logDir, logFormat, + streamLog: streamLogResult.streamLog, systemPrompt, }; } @@ -1741,6 +1834,11 @@ 
function resolvePiCliConfig( const logFormatSource = target.log_format; const systemPromptSource = target.system_prompt; + const streamLogResult = resolveStreamLog(target); + if (streamLogResult.deprecationWarning) { + process.stderr.write(`[agentv] ⚠ ${streamLogResult.deprecationWarning}\n`); + } + const executable = resolveOptionalString(executableSource, env, `${target.name} pi-cli executable`, { allowLiteral: true, @@ -1832,6 +1930,7 @@ function resolvePiCliConfig( timeoutMs, logDir, logFormat, + streamLog: streamLogResult.streamLog, systemPrompt, }; } @@ -1850,6 +1949,11 @@ function resolveClaudeConfig( target.log_format ?? target.log_output_format ?? env.AGENTV_CLAUDE_LOG_FORMAT; const systemPromptSource = target.system_prompt; + const streamLogResult = resolveStreamLog(target); + if (streamLogResult.deprecationWarning) { + process.stderr.write(`[agentv] ⚠ ${streamLogResult.deprecationWarning}\n`); + } + const model = resolveOptionalString(modelSource, env, `${target.name} claude model`, { allowLiteral: true, optionalEnv: true, @@ -1911,6 +2015,7 @@ function resolveClaudeConfig( maxBudgetUsd, logDir, logFormat, + streamLog: streamLogResult.streamLog, }; } diff --git a/packages/core/src/evaluation/providers/types.ts b/packages/core/src/evaluation/providers/types.ts index 80222455c..57c392c97 100644 --- a/packages/core/src/evaluation/providers/types.ts +++ b/packages/core/src/evaluation/providers/types.ts @@ -350,6 +350,8 @@ export interface TargetDefinition { readonly log_directory?: string | unknown | undefined; readonly log_format?: string | unknown | undefined; readonly log_output_format?: string | unknown | undefined; + /** New stream_log field — replaces log_format. false=no stream log, 'raw'=per-event, 'summary'=consolidated. 
*/ + readonly stream_log?: string | boolean | unknown | undefined; // System prompt (codex, copilot, claude, pi-coding-agent) readonly system_prompt?: string | unknown | undefined; // Claude Agent SDK fields diff --git a/packages/core/src/import/claude-parser.ts b/packages/core/src/import/claude-parser.ts index 03d3d2d19..baec302ab 100644 --- a/packages/core/src/import/claude-parser.ts +++ b/packages/core/src/import/claude-parser.ts @@ -23,6 +23,7 @@ * - cost_usd is null (Claude Code does not report per-session cost) */ +import { normalizeToolCall } from '../evaluation/providers/normalize-tool-call.js'; import type { Message, ToolCall } from '../evaluation/providers/types.js'; import type { TranscriptEntry, TranscriptSource } from './types.js'; @@ -286,11 +287,13 @@ function extractAssistantContent(content: string | readonly ClaudeContentBlock[] case 'tool_use': if (block.name) { - toolCalls.push({ - tool: block.name, - input: block.input, - id: block.id, - }); + toolCalls.push( + normalizeToolCall('claude', { + tool: block.name, + input: block.input, + id: block.id, + }), + ); } break; diff --git a/packages/core/src/import/codex-parser.ts b/packages/core/src/import/codex-parser.ts index 368452847..6d4527db5 100644 --- a/packages/core/src/import/codex-parser.ts +++ b/packages/core/src/import/codex-parser.ts @@ -30,6 +30,7 @@ * To add a new response_item type: add a case to the switch in parseCodexSession(). 
*/ +import { normalizeToolCall } from '../evaluation/providers/normalize-tool-call.js'; import type { Message, ToolCall } from '../evaluation/providers/types.js'; import type { TranscriptEntry, TranscriptSource } from './types.js'; @@ -124,7 +125,11 @@ export function parseCodexSession(jsonl: string): TranscriptEntry { input = payload.arguments; } - const toolCall: ToolCall = { tool: toolName, input, id: callId }; + const toolCall: ToolCall = normalizeToolCall('codex', { + tool: toolName, + input, + id: callId, + }); const msgIdx = messages.length; messages.push({ role: 'assistant', @@ -151,7 +156,11 @@ export function parseCodexSession(jsonl: string): TranscriptEntry { input = payload.arguments; } - const toolCall: ToolCall = { tool: toolName, input, id: callId }; + const toolCall: ToolCall = normalizeToolCall('codex', { + tool: toolName, + input, + id: callId, + }); const msgIdx = messages.length; messages.push({ role: 'assistant', diff --git a/packages/core/test/evaluation/evaluators/skill-trigger.test.ts b/packages/core/test/evaluation/evaluators/skill-trigger.test.ts index dcecf7788..0532204fe 100644 --- a/packages/core/test/evaluation/evaluators/skill-trigger.test.ts +++ b/packages/core/test/evaluation/evaluators/skill-trigger.test.ts @@ -30,11 +30,10 @@ function makeConfig( } describe('SkillTriggerEvaluator', () => { - describe('provider tool resolution', () => { - it('should resolve claude-cli to Claude tool names', () => { + describe('canonical tool names (provider-agnostic)', () => { + it('should detect Skill tool with matching skill name', () => { const evaluator = new SkillTriggerEvaluator(makeConfig()); const context = makeContext({ - provider: { kind: 'claude-cli', targetName: 'test' }, output: [ { role: 'assistant', @@ -48,18 +47,17 @@ describe('SkillTriggerEvaluator', () => { expect(result.score).toBe(1); }); - it('should resolve copilot-cli to Copilot tool names', () => { + it('should detect Read tool loading skill file via file_path', () => { 
const evaluator = new SkillTriggerEvaluator(makeConfig()); const context = makeContext({ - provider: { kind: 'copilot-cli', targetName: 'test' }, output: [ { role: 'assistant', content: '', toolCalls: [ { - tool: 'Read File', - input: { file_path: '/path/to/csv-analyzer/SKILL.md' }, + tool: 'Read', + input: { file_path: '/path/to/skills/csv-analyzer/SKILL.md' }, }, ], }, @@ -70,83 +68,18 @@ describe('SkillTriggerEvaluator', () => { expect(result.score).toBe(1); }); - it('should resolve copilot-log to Copilot tool names', () => { - const evaluator = new SkillTriggerEvaluator(makeConfig()); - const context = makeContext({ - provider: { kind: 'copilot-log', targetName: 'test' }, - output: [ - { - role: 'assistant', - toolCalls: [{ tool: 'Skill', input: { skill: 'csv-analyzer' } }], - }, - ], - }); - const result = evaluator.evaluate(context); - expect(result.verdict).toBe('pass'); - expect(result.score).toBe(1); - }); - - it('should fall back to Claude defaults for unknown provider', () => { - const evaluator = new SkillTriggerEvaluator(makeConfig()); - const context = makeContext({ - provider: { kind: 'openai', targetName: 'test' }, - output: [ - { - role: 'assistant', - content: '', - toolCalls: [{ tool: 'Skill', input: { skill: 'csv-analyzer' } }], - }, - ], - }); - const result = evaluator.evaluate(context); - expect(result.verdict).toBe('pass'); - }); - - it('should detect codex mcp skill tool (skill name in tool name)', () => { - const evaluator = new SkillTriggerEvaluator(makeConfig()); - const context = makeContext({ - provider: { kind: 'codex', targetName: 'test' }, - output: [ - { - role: 'assistant', - content: '', - toolCalls: [{ tool: 'mcp:claude-code/csv-analyzer', input: {} }], - }, - ], - }); - const result = evaluator.evaluate(context); - expect(result.verdict).toBe('pass'); - expect(result.score).toBe(1); - }); - - it('should detect codex mcp skill tool with arbitrary server name', () => { - const evaluator = new SkillTriggerEvaluator(makeConfig()); 
- const context = makeContext({ - provider: { kind: 'codex', targetName: 'test' }, - output: [ - { - role: 'assistant', - content: '', - toolCalls: [{ tool: 'mcp:skills/csv-analyzer', input: {} }], - }, - ], - }); - const result = evaluator.evaluate(context); - expect(result.verdict).toBe('pass'); - }); - - it('should detect pi-coding-agent read tool loading skill file', () => { + it('should detect skill via tool output reference', () => { const evaluator = new SkillTriggerEvaluator(makeConfig()); const context = makeContext({ - provider: { kind: 'pi-coding-agent', targetName: 'test' }, output: [ { role: 'assistant', content: '', toolCalls: [ { - tool: 'read', - input: { path: '/workspace/.agents/skills/csv-analyzer/SKILL.md' }, + tool: 'Bash', + input: { command: 'grep -r skill' }, + output: 'Found: .agents/skills/csv-analyzer/SKILL.md', }, ], }, @@ -154,18 +87,16 @@ describe('SkillTriggerEvaluator', () => { }); const result = evaluator.evaluate(context); expect(result.verdict).toBe('pass'); - expect(result.score).toBe(1); }); - it('should fail for pi-coding-agent with non-matching read call', () => { + it('should fail when skill name does not match', () => { const evaluator = new SkillTriggerEvaluator(makeConfig()); const context = makeContext({ - provider: { kind: 'pi-coding-agent', targetName: 'test' }, output: [ { role: 'assistant', - content: 'some response', - toolCalls: [{ tool: 'read', input: { path: '/workspace/README.md' } }], + content: '', + toolCalls: [{ tool: 'Skill', input: { skill: 'other-skill' } }], }, ], }); @@ -173,94 +104,74 @@ describe('SkillTriggerEvaluator', () => { expect(result.verdict).toBe('fail'); }); - it('should detect codex bash command_execution reading skill file', () => { + it('should fail when Read loads non-skill file', () => { const evaluator = new SkillTriggerEvaluator(makeConfig()); const context = makeContext({ - provider: { kind: 'codex', targetName: 'test' }, output: [ { role: 'assistant', content: '', - toolCalls: [ - { 
- tool: 'command_execution', - input: { - command: - '/bin/bash -lc "sed -n \'1,220p\' /home/user/.agents/skills/csv-analyzer/SKILL.md"', - }, - }, - ], + toolCalls: [{ tool: 'Read', input: { file_path: '/workspace/README.md' } }], }, ], }); const result = evaluator.evaluate(context); - expect(result.verdict).toBe('pass'); - expect(result.score).toBe(1); + expect(result.verdict).toBe('fail'); }); - it('should fail for codex with non-matching tool calls', () => { + it('should fail when only unrelated tools are called', () => { const evaluator = new SkillTriggerEvaluator(makeConfig()); const context = makeContext({ - provider: { kind: 'codex', targetName: 'test' }, output: [ { role: 'assistant', - content: 'some response', - toolCalls: [{ tool: 'command_execution', input: { command: 'ls -la' } }], + content: '', + toolCalls: [{ tool: 'Bash', input: { command: 'ls' } }], }, ], }); const result = evaluator.evaluate(context); expect(result.verdict).toBe('fail'); - expect(result.assertions.filter((a) => !a.passed)[0].text).toContain('csv-analyzer'); }); - it('should pass for codex with should_trigger: false and unrelated tool', () => { - const evaluator = new SkillTriggerEvaluator(makeConfig({ should_trigger: false })); + it('should handle no tool calls', () => { + const evaluator = new SkillTriggerEvaluator(makeConfig()); const context = makeContext({ - provider: { kind: 'codex', targetName: 'test' }, - output: [ - { - role: 'assistant', - content: 'some response', - toolCalls: [{ tool: 'command_execution', input: { command: 'ls -la' } }], - }, - ], + output: [{ role: 'assistant', content: 'no tools used' }], }); const result = evaluator.evaluate(context); - expect(result.verdict).toBe('pass'); + expect(result.verdict).toBe('fail'); + expect(result.assertions.filter((a) => !a.passed)[0].text).toBe('No tool calls recorded'); }); - }); - describe('backward compatibility', () => { - it('should work with existing Claude Skill tool calls', () => { - const evaluator = new 
SkillTriggerEvaluator(makeConfig()); - const context = makeContext({ - output: [ - { - role: 'assistant', - content: '', - toolCalls: [{ tool: 'Skill', input: { skill: 'csv-analyzer' } }], - }, - ], - }); - const result = evaluator.evaluate(context); - expect(result.verdict).toBe('pass'); + it('should work with any provider kind (provider-agnostic)', () => { + for (const kind of ['claude-cli', 'copilot-cli', 'codex', 'pi-cli', 'openai']) { + const evaluator = new SkillTriggerEvaluator(makeConfig()); + const context = makeContext({ + provider: { kind, targetName: 'test' }, + output: [ + { + role: 'assistant', + content: '', + toolCalls: [{ tool: 'Skill', input: { skill: 'csv-analyzer' } }], + }, + ], + }); + const result = evaluator.evaluate(context); + expect(result.verdict).toBe('pass'); + } }); + }); - it('should work with existing Claude Read tool calls', () => { - const evaluator = new SkillTriggerEvaluator(makeConfig()); + describe('should_trigger: false', () => { + it('should pass when skill is not triggered', () => { + const evaluator = new SkillTriggerEvaluator(makeConfig({ should_trigger: false })); const context = makeContext({ output: [ { role: 'assistant', content: '', - toolCalls: [ - { - tool: 'Read', - input: { file_path: '/skills/csv-analyzer/SKILL.md' }, - }, - ], + toolCalls: [{ tool: 'Bash', input: { command: 'ls' } }], }, ], }); @@ -268,14 +179,14 @@ describe('SkillTriggerEvaluator', () => { expect(result.verdict).toBe('pass'); }); - it('should fail when first tool is unrelated', () => { - const evaluator = new SkillTriggerEvaluator(makeConfig()); + it('should fail when skill is triggered unexpectedly', () => { + const evaluator = new SkillTriggerEvaluator(makeConfig({ should_trigger: false })); const context = makeContext({ output: [ { role: 'assistant', content: '', - toolCalls: [{ tool: 'Bash', input: { command: 'ls' } }], + toolCalls: [{ tool: 'Skill', input: { skill: 'csv-analyzer' } }], }, ], }); @@ -283,26 +194,10 @@ 
describe('SkillTriggerEvaluator', () => { expect(result.verdict).toBe('fail'); }); - it('should handle no tool calls', () => { - const evaluator = new SkillTriggerEvaluator(makeConfig()); - const context = makeContext({ - output: [{ role: 'assistant', content: 'no tools used' }], - }); - const result = evaluator.evaluate(context); - expect(result.verdict).toBe('fail'); - expect(result.assertions.filter((a) => !a.passed)[0].text).toBe('No tool calls recorded'); - }); - - it('should support should_trigger: false', () => { + it('should pass with no tool calls', () => { const evaluator = new SkillTriggerEvaluator(makeConfig({ should_trigger: false })); const context = makeContext({ - output: [ - { - role: 'assistant', - content: '', - toolCalls: [{ tool: 'Bash', input: { command: 'ls' } }], - }, - ], + output: [{ role: 'assistant', content: 'no tools used' }], }); const result = evaluator.evaluate(context); expect(result.verdict).toBe('pass'); @@ -310,17 +205,16 @@ describe('SkillTriggerEvaluator', () => { }); describe('full transcript scanning', () => { - it('should pass when skill triggers after a preamble meta-skill', () => { + it('should pass when skill triggers after a preamble skill', () => { const evaluator = new SkillTriggerEvaluator(makeConfig()); const context = makeContext({ - provider: { kind: 'copilot-cli', targetName: 'test' }, output: [ { role: 'assistant', content: '', toolCalls: [ - { tool: 'Using skill: using-superpowers', input: {} }, - { tool: 'Using skill: csv-analyzer', input: {} }, + { tool: 'Skill', input: { skill: 'using-superpowers' } }, + { tool: 'Skill', input: { skill: 'csv-analyzer' } }, ], }, ], @@ -357,7 +251,7 @@ describe('SkillTriggerEvaluator', () => { role: 'assistant', content: '', toolCalls: [ - { tool: 'Using skill: using-superpowers', input: {} }, + { tool: 'Skill', input: { skill: 'using-superpowers' } }, { tool: 'Bash', input: { command: 'ls' } }, ], }, @@ -367,14 +261,14 @@ describe('SkillTriggerEvaluator', () => { 
expect(result.verdict).toBe('fail'); }); - it('should pass for should_trigger:false when skill never appears in transcript', () => { + it('should pass for should_trigger:false when skill never appears', () => { const evaluator = new SkillTriggerEvaluator(makeConfig({ should_trigger: false })); const context = makeContext({ output: [ { role: 'assistant', content: '', - toolCalls: [{ tool: 'Using skill: using-superpowers', input: {} }], + toolCalls: [{ tool: 'Skill', input: { skill: 'using-superpowers' } }], }, ], }); @@ -382,7 +276,7 @@ describe('SkillTriggerEvaluator', () => { expect(result.verdict).toBe('pass'); }); - it('should fail for should_trigger:false when skill appears later in transcript', () => { + it('should fail for should_trigger:false when skill appears later', () => { const evaluator = new SkillTriggerEvaluator(makeConfig({ should_trigger: false })); const context = makeContext({ output: [ @@ -399,82 +293,18 @@ describe('SkillTriggerEvaluator', () => { const result = evaluator.evaluate(context); expect(result.verdict).toBe('fail'); }); - }); - - describe('pi-coding-agent tools', () => { - it('should detect pi-coding-agent read tool loading skill from .agents/skills', () => { - const evaluator = new SkillTriggerEvaluator(makeConfig()); - const context = makeContext({ - provider: { kind: 'pi-coding-agent', targetName: 'test' }, - output: [ - { - role: 'assistant', - content: '', - toolCalls: [ - { - tool: 'read', - input: { path: '.agents/skills/csv-analyzer/SKILL.md' }, - }, - ], - }, - ], - }); - const result = evaluator.evaluate(context); - expect(result.verdict).toBe('pass'); - expect(result.score).toBe(1); - }); - it('should detect pi-coding-agent read tool loading skill from global path', () => { + it('should detect skill loaded via Read in .agents/skills path', () => { const evaluator = new SkillTriggerEvaluator(makeConfig()); const context = makeContext({ - provider: { kind: 'pi-coding-agent', targetName: 'test' }, output: [ { role: 
'assistant', content: '', toolCalls: [ { - tool: 'read', - input: { path: '/home/user/.agents/skills/csv-analyzer/SKILL.md' }, - }, - ], - }, - ], - }); - const result = evaluator.evaluate(context); - expect(result.verdict).toBe('pass'); - }); - - it('should pass for pi-coding-agent with should_trigger: false and unrelated tool', () => { - const evaluator = new SkillTriggerEvaluator(makeConfig({ should_trigger: false })); - const context = makeContext({ - provider: { kind: 'pi-coding-agent', targetName: 'test' }, - output: [ - { - role: 'assistant', - content: 'some response', - toolCalls: [{ tool: 'bash', input: { command: 'ls' } }], - }, - ], - }); - const result = evaluator.evaluate(context); - expect(result.verdict).toBe('pass'); - }); - }); - - describe('copilot-specific tools', () => { - it('should recognize readFile tool for copilot', () => { - const evaluator = new SkillTriggerEvaluator(makeConfig()); - const context = makeContext({ - provider: { kind: 'copilot-cli', targetName: 'test' }, - output: [ - { - role: 'assistant', - content: '', - toolCalls: [ - { - tool: 'readFile', - input: { file_path: '/csv-analyzer/SKILL.md' }, + tool: 'Read', + input: { file_path: '.agents/skills/csv-analyzer/SKILL.md' }, }, ], }, @@ -484,18 +314,17 @@ describe('SkillTriggerEvaluator', () => { expect(result.verdict).toBe('pass'); }); - it('should recognize readTextFile tool for copilot', () => { + it('should detect skill loaded via Read in global path', () => { const evaluator = new SkillTriggerEvaluator(makeConfig()); const context = makeContext({ - provider: { kind: 'copilot-cli', targetName: 'test' }, output: [ { role: 'assistant', content: '', toolCalls: [ { - tool: 'readTextFile', - input: { file_path: '/csv-analyzer/SKILL.md' }, + tool: 'Read', + input: { file_path: '/home/user/.agents/skills/csv-analyzer/SKILL.md' }, }, ], }, @@ -504,21 +333,5 @@ describe('SkillTriggerEvaluator', () => { const result = evaluator.evaluate(context); expect(result.verdict).toBe('pass'); 
}); - - it('should recognize lowercase skill tool for copilot', () => { - const evaluator = new SkillTriggerEvaluator(makeConfig()); - const context = makeContext({ - provider: { kind: 'copilot-cli', targetName: 'test' }, - output: [ - { - role: 'assistant', - content: '', - toolCalls: [{ tool: 'skill', input: { skill: 'csv-analyzer' } }], - }, - ], - }); - const result = evaluator.evaluate(context); - expect(result.verdict).toBe('pass'); - }); }); }); diff --git a/packages/core/test/evaluation/providers/codex-sdk.test.ts b/packages/core/test/evaluation/providers/codex-sdk.test.ts index 0e76d586e..ea3c903dd 100644 --- a/packages/core/test/evaluation/providers/codex-sdk.test.ts +++ b/packages/core/test/evaluation/providers/codex-sdk.test.ts @@ -288,7 +288,7 @@ describe('CodexProvider (SDK)', () => { const msg = response.output?.[0]; expect(msg?.toolCalls).toBeDefined(); expect(msg?.toolCalls?.length).toBe(1); - expect(msg?.toolCalls?.[0]?.tool).toBe('command_execution'); + expect(msg?.toolCalls?.[0]?.tool).toBe('Bash'); expect(msg?.toolCalls?.[0]?.input).toEqual({ command: 'ls -la' }); expect(msg?.toolCalls?.[0]?.output).toBe('file1.ts\nfile2.ts'); expect(msg?.toolCalls?.[0]?.id).toBe('cmd-1'); @@ -328,7 +328,7 @@ describe('CodexProvider (SDK)', () => { const msg = response.output?.[0]; expect(msg?.toolCalls?.length).toBe(1); - expect(msg?.toolCalls?.[0]?.tool).toBe('file_change'); + expect(msg?.toolCalls?.[0]?.tool).toBe('Edit'); expect(msg?.toolCalls?.[0]?.input).toEqual([{ path: 'src/index.ts', kind: 'update' }]); }); diff --git a/packages/core/test/evaluation/providers/copilot-log-parser.test.ts b/packages/core/test/evaluation/providers/copilot-log-parser.test.ts index 491763f7b..181023b70 100644 --- a/packages/core/test/evaluation/providers/copilot-log-parser.test.ts +++ b/packages/core/test/evaluation/providers/copilot-log-parser.test.ts @@ -57,7 +57,7 @@ describe('parseCopilotEvents', () => { expect(result.messages[0].role).toBe('assistant'); 
expect(result.messages[0].content).toBe('I will help you'); expect(result.messages[0].toolCalls).toHaveLength(1); - expect(result.messages[0].toolCalls?.[0].tool).toBe('Read File'); + expect(result.messages[0].toolCalls?.[0].tool).toBe('Read'); expect(result.messages[0].toolCalls?.[0].input).toEqual({ file_path: '/src/index.ts' }); }); @@ -96,7 +96,7 @@ describe('parseCopilotEvents', () => { const assistantMsg = result.messages.find((m) => m.role === 'assistant'); expect(assistantMsg).toBeDefined(); expect(assistantMsg?.toolCalls).toHaveLength(1); - expect(assistantMsg?.toolCalls?.[0].tool).toBe('Read File'); + expect(assistantMsg?.toolCalls?.[0].tool).toBe('Read'); expect(assistantMsg?.toolCalls?.[0].output).toBe('file contents'); }); diff --git a/packages/core/test/evaluation/providers/copilot-sdk.test.ts b/packages/core/test/evaluation/providers/copilot-sdk.test.ts index 59eb3be13..810b7ee9f 100644 --- a/packages/core/test/evaluation/providers/copilot-sdk.test.ts +++ b/packages/core/test/evaluation/providers/copilot-sdk.test.ts @@ -295,7 +295,7 @@ describe('CopilotSdkProvider', () => { expect(msg?.toolCalls).toBeDefined(); expect(msg?.toolCalls?.length).toBe(1); expect(msg?.toolCalls?.[0]?.tool).toBe('Read'); - expect(msg?.toolCalls?.[0]?.input).toEqual({ path: '/foo.ts' }); + expect(msg?.toolCalls?.[0]?.input).toEqual({ path: '/foo.ts', file_path: '/foo.ts' }); expect(msg?.toolCalls?.[0]?.output).toBe('file content'); expect(msg?.toolCalls?.[0]?.id).toBe('tc-1'); expect(msg?.toolCalls?.[0]?.durationMs).toBeDefined(); diff --git a/packages/core/test/evaluation/providers/normalize-tool-call.test.ts b/packages/core/test/evaluation/providers/normalize-tool-call.test.ts new file mode 100644 index 000000000..7ec7322eb --- /dev/null +++ b/packages/core/test/evaluation/providers/normalize-tool-call.test.ts @@ -0,0 +1,217 @@ +import { describe, expect, it } from 'vitest'; +import { normalizeToolCall } from '../../../src/evaluation/providers/normalize-tool-call.js'; 
+import type { ProviderKind, ToolCall } from '../../../src/evaluation/providers/types.js';
+
+/** Builds a minimal ToolCall fixture from a tool name and an optional input payload. */
+function tc(tool: string, input?: Record<string, unknown>): ToolCall {
+  return { tool, input };
+}
+
+describe('normalizeToolCall', () => {
+  // -------------------------------------------------------------------------
+  // Claude providers (already canonical — should be identity)
+  // -------------------------------------------------------------------------
+  describe('claude providers (identity)', () => {
+    for (const provider of ['claude', 'claude-cli', 'claude-sdk'] as ProviderKind[]) {
+      it(`${provider}: Skill → Skill`, () => {
+        const result = normalizeToolCall(provider, tc('Skill', { skill: 'my-skill' }));
+        expect(result.tool).toBe('Skill');
+        expect((result.input as Record<string, unknown>).skill).toBe('my-skill');
+      });
+
+      it(`${provider}: Read → Read`, () => {
+        const result = normalizeToolCall(provider, tc('Read', { file_path: '/foo.ts' }));
+        expect(result.tool).toBe('Read');
+        expect((result.input as Record<string, unknown>).file_path).toBe('/foo.ts');
+      });
+
+      it(`${provider}: Write → Write`, () => {
+        const result = normalizeToolCall(provider, tc('Write', { file_path: '/foo.ts' }));
+        expect(result.tool).toBe('Write');
+      });
+
+      it(`${provider}: Edit → Edit`, () => {
+        const result = normalizeToolCall(provider, tc('Edit', { file_path: '/foo.ts' }));
+        expect(result.tool).toBe('Edit');
+      });
+
+      it(`${provider}: Bash → Bash`, () => {
+        const result = normalizeToolCall(provider, tc('Bash', { command: 'ls' }));
+        expect(result.tool).toBe('Bash');
+      });
+    }
+  });
+
+  // -------------------------------------------------------------------------
+  // Copilot providers
+  // -------------------------------------------------------------------------
+  describe('copilot providers', () => {
+    for (const provider of [
+      'copilot-cli',
+      'copilot-sdk',
+      'copilot-log',
+      'vscode',
+      'vscode-insiders',
+    ] as ProviderKind[]) {
+      it(`${provider}: skill (lowercase) → Skill`, () => {
+        const result = normalizeToolCall(provider, tc('skill', { skill: 'my-skill' }));
+        expect(result.tool).toBe('Skill');
+      });
+
+      it(`${provider}: Read File → Read`, () => {
+        const result = normalizeToolCall(provider, tc('Read File', { file_path: '/foo.ts' }));
+        expect(result.tool).toBe('Read');
+      });
+
+      it(`${provider}: readFile → Read`, () => {
+        const result = normalizeToolCall(provider, tc('readFile', { file_path: '/foo.ts' }));
+        expect(result.tool).toBe('Read');
+      });
+
+      it(`${provider}: readTextFile → Read`, () => {
+        const result = normalizeToolCall(provider, tc('readTextFile', { file_path: '/foo.ts' }));
+        expect(result.tool).toBe('Read');
+      });
+
+      it(`${provider}: writeTextFile → Write`, () => {
+        const result = normalizeToolCall(provider, tc('writeTextFile', { file_path: '/foo.ts' }));
+        expect(result.tool).toBe('Write');
+      });
+
+      it(`${provider}: Write File → Write`, () => {
+        const result = normalizeToolCall(provider, tc('Write File', { file_path: '/foo.ts' }));
+        expect(result.tool).toBe('Write');
+      });
+
+      it(`${provider}: editFile → Edit`, () => {
+        const result = normalizeToolCall(provider, tc('editFile', { file_path: '/foo.ts' }));
+        expect(result.tool).toBe('Edit');
+      });
+
+      it(`${provider}: Edit File → Edit`, () => {
+        const result = normalizeToolCall(provider, tc('Edit File', { file_path: '/foo.ts' }));
+        expect(result.tool).toBe('Edit');
+      });
+
+      it(`${provider}: runTerminalCommand → Bash`, () => {
+        const result = normalizeToolCall(provider, tc('runTerminalCommand', { command: 'ls' }));
+        expect(result.tool).toBe('Bash');
+      });
+
+      it(`${provider}: "Using skill: X" prefix → Skill with extracted name`, () => {
+        const result = normalizeToolCall(provider, tc('Using skill: my-skill', {}));
+        expect(result.tool).toBe('Skill');
+        expect((result.input as Record<string, unknown>).skill).toBe('my-skill');
+      });
+
+      it(`${provider}: "Viewing X" prefix → Read`, () => {
+        const result = normalizeToolCall(provider, tc('Viewing /foo/bar.ts', {}));
+        expect(result.tool).toBe('Read');
+      });
+    }
+  });
+
+  // -------------------------------------------------------------------------
+  // Codex
+  // -------------------------------------------------------------------------
+  describe('codex', () => {
+    it('command_execution → Bash', () => {
+      const result = normalizeToolCall('codex', tc('command_execution', { command: 'cat file' }));
+      expect(result.tool).toBe('Bash');
+    });
+
+    it('file_change → Edit', () => {
+      const result = normalizeToolCall('codex', tc('file_change', { changes: [] }));
+      expect(result.tool).toBe('Edit');
+    });
+
+    it('"mcp:server/skill-name" prefix → Skill with extracted name', () => {
+      const result = normalizeToolCall('codex', tc('mcp:my-server/my-skill', {}));
+      expect(result.tool).toBe('Skill');
+      expect((result.input as Record<string, unknown>).skill).toBe('my-server/my-skill');
+    });
+  });
+
+  // -------------------------------------------------------------------------
+  // Pi
+  // -------------------------------------------------------------------------
+  describe('pi providers', () => {
+    for (const provider of ['pi-coding-agent', 'pi-cli'] as ProviderKind[]) {
+      it(`${provider}: read → Read`, () => {
+        const result = normalizeToolCall(provider, tc('read', { path: '/foo.ts' }));
+        expect(result.tool).toBe('Read');
+      });
+
+      it(`${provider}: read normalizes path → file_path`, () => {
+        const result = normalizeToolCall(provider, tc('read', { path: '/foo.ts' }));
+        expect((result.input as Record<string, unknown>).file_path).toBe('/foo.ts');
+      });
+    }
+  });
+
+  // -------------------------------------------------------------------------
+  // Input field normalization
+  // -------------------------------------------------------------------------
+  describe('input field normalization', () => {
+    it('Read: copies path → file_path when file_path missing', () => {
+      const result = normalizeToolCall('claude', tc('Read', { path: '/foo.ts' }));
+      expect((result.input as Record<string, unknown>).file_path).toBe('/foo.ts');
+      expect((result.input as Record<string, unknown>).path).toBe('/foo.ts');
+    });
+
+    it('Read: copies filePath → file_path when file_path missing', () => {
+      const result = normalizeToolCall('copilot-cli', tc('Read', { filePath: '/bar.ts' }));
+      expect((result.input as Record<string, unknown>).file_path).toBe('/bar.ts');
+    });
+
+    it('Read: does not overwrite existing file_path', () => {
+      const result = normalizeToolCall(
+        'claude',
+        tc('Read', { file_path: '/original.ts', path: '/other.ts' }),
+      );
+      expect((result.input as Record<string, unknown>).file_path).toBe('/original.ts');
+    });
+  });
+
+  // -------------------------------------------------------------------------
+  // Pass-through for unknown tools
+  // -------------------------------------------------------------------------
+  describe('pass-through', () => {
+    it('unknown tool name passes through unchanged', () => {
+      const original = tc('custom_search', { query: 'foo' });
+      const result = normalizeToolCall('copilot-cli', original);
+      expect(result.tool).toBe('custom_search');
+      expect(result.input).toEqual({ query: 'foo' });
+    });
+
+    it('unknown provider passes through unchanged', () => {
+      const original = tc('Read File', { file_path: '/foo.ts' });
+      const result = normalizeToolCall('openai' as ProviderKind, original);
+      expect(result.tool).toBe('Read File');
+    });
+  });
+
+  // -------------------------------------------------------------------------
+  // Preserves other ToolCall fields
+  // -------------------------------------------------------------------------
+  describe('preserves ToolCall metadata', () => {
+    it('preserves id, startTime, endTime, durationMs, output', () => {
+      const original: ToolCall = {
+        tool: 'readFile',
+        input: { file_path: '/foo.ts' },
+        output: 'file contents',
+        id: 'tc-123',
+        startTime: '2024-01-01T00:00:00Z',
+        endTime: '2024-01-01T00:00:01Z',
+        durationMs: 1000,
+      };
+      const result = normalizeToolCall('copilot-cli', original);
+      expect(result.tool).toBe('Read');
+      expect(result.output).toBe('file contents');
+      expect(result.id).toBe('tc-123');
+      expect(result.startTime).toBe('2024-01-01T00:00:00Z');
+      expect(result.endTime).toBe('2024-01-01T00:00:01Z');
+      expect(result.durationMs).toBe(1000);
+    });
+  });
+});
diff --git a/packages/core/test/evaluation/providers/pi-cli-tool-extraction.test.ts b/packages/core/test/evaluation/providers/pi-cli-tool-extraction.test.ts index 84e5d0d6a..55bf24352 100644 --- a/packages/core/test/evaluation/providers/pi-cli-tool-extraction.test.ts +++ b/packages/core/test/evaluation/providers/pi-cli-tool-extraction.test.ts @@ -35,9 +35,12 @@ describe('pi-cli tool call extraction from events', () => { const toolCalls = extractToolCallsFromEvents(events); expect(toolCalls).toHaveLength(1); - expect(toolCalls[0].tool).toBe('read'); + expect(toolCalls[0].tool).toBe('Read'); expect(toolCalls[0].id).toBe('tc-1'); - expect(toolCalls[0].input).toEqual({ path: '.agents/skills/csv-analyzer/SKILL.md' }); + expect(toolCalls[0].input).toEqual({ + path: '.agents/skills/csv-analyzer/SKILL.md', + file_path: '.agents/skills/csv-analyzer/SKILL.md', + }); expect(toolCalls[0].output).toBe('skill content here'); }); @@ -66,9 +69,10 @@ describe('pi-cli tool call extraction from events', () => { expect(messages[0].role).toBe('assistant'); expect(messages[0].toolCalls).toBeDefined(); expect(messages[0].toolCalls).toHaveLength(1); - expect(messages[0].toolCalls?.[0].tool).toBe('read'); + expect(messages[0].toolCalls?.[0].tool).toBe('Read'); expect(messages[0].toolCalls?.[0].input).toEqual({ path: '.agents/skills/csv-analyzer/SKILL.md', + file_path: '.agents/skills/csv-analyzer/SKILL.md', }); }); @@ -143,8 +147,8 @@ describe('pi-cli tool call extraction from events', () => { const messages = extractMessages(events); expect(messages[0].toolCalls).toHaveLength(2); - expect(messages[0].toolCalls?.[0].tool).toBe('read'); - expect(messages[0].toolCalls?.[1].tool).toBe('bash'); + expect(messages[0].toolCalls?.[0].tool).toBe('Read'); + 
expect(messages[0].toolCalls?.[1].tool).toBe('Bash'); }); it('should create synthetic assistant message when no assistant message exists', () => { @@ -171,7 +175,7 @@ describe('pi-cli tool call extraction from events', () => { expect(messages).toHaveLength(2); expect(messages[1].role).toBe('assistant'); expect(messages[1].toolCalls).toHaveLength(1); - expect(messages[1].toolCalls?.[0].tool).toBe('read'); + expect(messages[1].toolCalls?.[0].tool).toBe('Read'); }); it('should fall back to turn_end events and still inject tool calls', () => { @@ -197,7 +201,7 @@ describe('pi-cli tool call extraction from events', () => { expect(messages).toHaveLength(1); expect(messages[0].toolCalls).toHaveLength(1); - expect(messages[0].toolCalls?.[0].tool).toBe('read'); + expect(messages[0].toolCalls?.[0].tool).toBe('Read'); }); it('should handle tool_call type in message content', () => { @@ -223,9 +227,10 @@ describe('pi-cli tool call extraction from events', () => { const messages = extractMessages(events); expect(messages[0].toolCalls).toHaveLength(1); - expect(messages[0].toolCalls?.[0].tool).toBe('read'); + expect(messages[0].toolCalls?.[0].tool).toBe('Read'); expect(messages[0].toolCalls?.[0].input).toEqual({ path: '.agents/skills/csv-analyzer/SKILL.md', + file_path: '.agents/skills/csv-analyzer/SKILL.md', }); }); });