Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 42 additions & 1 deletion apps/cli/src/commands/eval/artifact-writer.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
import { mkdir, readFile, writeFile } from 'node:fs/promises';
import path from 'node:path';

import { DEFAULT_THRESHOLD, type EvaluationResult, type EvaluatorResult } from '@agentv/core';
import {
DEFAULT_THRESHOLD,
type EvaluationResult,
type EvaluatorResult,
type TranscriptJsonLine,
} from '@agentv/core';
import { toSnakeCaseDeep } from '../../utils/case-conversion.js';
import { RESULT_INDEX_FILENAME } from './result-layout.js';

Expand Down Expand Up @@ -766,5 +771,41 @@ export async function writeArtifactsFromResults(

await writeJsonlFile(indexPath, indexRecords);

// Write transcript JSONL (auto-generated on every eval run)
const transcriptPath = path.join(outputDir, 'transcript.jsonl');
const transcriptLines: TranscriptJsonLine[] = results.map((result) => {
let inputText = '';
if (typeof result.input === 'string') {
inputText = result.input;
} else if (Array.isArray(result.input)) {
const firstUserMsg = result.input.find((m) => m.role === 'user');
inputText = typeof firstUserMsg?.content === 'string' ? firstUserMsg.content : '';
}
return {
input: inputText,
output: result.output,
token_usage: result.tokenUsage
? {
input: result.tokenUsage.input,
output: result.tokenUsage.output,
cached: result.tokenUsage.cached,
}
: undefined,
duration_ms: result.durationMs,
cost_usd: result.costUsd,
source: {
provider: result.target,
session_id: result.conversationId ?? result.testId,
timestamp: result.timestamp,
},
};
});
await writeFile(
transcriptPath,
transcriptLines.map((line) => JSON.stringify(line)).join('\n') +
(transcriptLines.length ? '\n' : ''),
'utf8',
);

return { testArtifactDir, timingPath, benchmarkPath, indexPath };
}
8 changes: 7 additions & 1 deletion apps/cli/test/commands/eval/artifact-writer.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -586,6 +586,7 @@ describe('writeArtifactsFromResults', () => {
'beta',
'index.jsonl',
'timing.json',
'transcript.jsonl',
]);

const alphaEntries = await readdir(path.join(paths.testArtifactDir, 'alpha'));
Expand Down Expand Up @@ -624,7 +625,12 @@ describe('writeArtifactsFromResults', () => {
const paths = await writeArtifactsFromResults([], testDir);

const artifactEntries = await readdir(paths.testArtifactDir);
expect(artifactEntries.sort()).toEqual(['benchmark.json', 'index.jsonl', 'timing.json']);
expect(artifactEntries.sort()).toEqual([
'benchmark.json',
'index.jsonl',
'timing.json',
'transcript.jsonl',
]);

const timing: TimingArtifact = JSON.parse(await readFile(paths.timingPath, 'utf8'));
expect(timing.total_tokens).toBe(0);
Expand Down
169 changes: 18 additions & 151 deletions packages/core/src/evaluation/evaluators/skill-trigger.ts
Original file line number Diff line number Diff line change
@@ -1,120 +1,27 @@
/**
* Built-in skill-trigger evaluator.
*
* Detects whether the agent invoked a named skill as its first tool call.
* Supports multiple provider kinds via static tool-name mappings.
* For providers not covered here, use a code-grader instead.
* Detects whether the agent invoked a named skill during a session.
* Works with canonical tool names produced by normalizeToolCall() — no
* provider-specific matching logic needed.
*
* Detection logic:
* - Only the FIRST tool call matters.
* - Skill tool: checks input.[skillInputField] contains the skill name (case-sensitive substring).
* - Read tool: checks input.[readInputField] contains the skill name (case-sensitive substring).
* - Any other tool as first call means the skill was not triggered.
* - Scans ALL tool calls (not just the first) for skill invocation evidence.
* - Skill tool: checks `tool === 'Skill'` and `input.skill` contains the skill name.
* - Read tool: checks `tool === 'Read'` and `input.file_path` contains a skills/ path.
* - Fallback: checks tool output for skill file path references.
* - Supports negative cases via should_trigger: false.
*
* To add a new provider:
* 1. Create a ToolMatcher with the provider's tool names and input fields.
* 2. Add entries to PROVIDER_TOOL_SEMANTICS mapping the provider kind(s) to the matcher.
* 3. If the provider's tool-call format doesn't fit the ToolMatcher model, use a code-grader instead.
* Prerequisites:
* All providers and import parsers must call normalizeToolCall() when
* constructing ToolCall objects. This ensures canonical tool names
* ("Skill", "Read", "Write", "Edit", "Bash") and canonical input field
* names (input.skill, input.file_path) regardless of provider.
*/

import type { ProviderKind } from '../providers/types.js';
import type { SkillTriggerEvaluatorConfig } from '../types.js';
import type { EvaluationContext, EvaluationScore, Evaluator } from './types.js';

/** Tool-name semantics for different provider kinds. */
interface ToolMatcher {
/** Tool names that indicate skill invocation. */
readonly skillTools: readonly string[];
/** Input field that contains the skill name for skill tools. */
readonly skillInputField: string;
/** Tool names that indicate file read. */
readonly readTools: readonly string[];
/** Input field that contains the skill name for read tools. */
readonly readInputField: string;
/** Tool-name prefixes that encode the skill directly in the tool name. */
readonly skillToolPrefixes?: readonly string[];
/** Tool-name prefixes that encode the file path directly in the tool name. */
readonly readToolPrefixes?: readonly string[];
/** Alternate input field names that may contain the file path. */
readonly readInputFields?: readonly string[];
}

const CLAUDE_MATCHER: ToolMatcher = {
skillTools: ['Skill'],
skillInputField: 'skill',
readTools: ['Read'],
readInputField: 'file_path',
};

/** Copilot uses ACP protocol — tool names vary by version and context. */
const COPILOT_MATCHER: ToolMatcher = {
skillTools: ['Skill', 'skill'],
skillInputField: 'skill',
readTools: ['Read File', 'readFile', 'Read', 'readTextFile'],
readInputField: 'file_path',
skillToolPrefixes: ['Using skill: '],
readToolPrefixes: ['Viewing '],
readInputFields: ['file_path', 'path'],
};

/**
* Pi CLI reads skill files using the lowercase `read` tool with a `path` argument.
* Skills are auto-discovered from `.agents/skills/` relative to the working directory.
*
* Skill lookup order (workspace-scoped first):
* 1. .agents/skills/<skill-name>/SKILL.md (workspace-relative, auto-discovered)
* 2. ~/.agents/skills/<skill-name>/SKILL.md (global fallback)
*/
const PI_CODING_AGENT_MATCHER: ToolMatcher = {
skillTools: [],
skillInputField: 'skill',
readTools: ['read'],
readInputField: 'path',
readInputFields: ['path', 'file_path', 'filePath'],
};

/**
* Codex reads skill files via command_execution using a bash sed command containing
* the skill file path. The skill name appears in the command string, so we match
* any command_execution whose command field includes the skill name.
*
* Skill lookup order (workspace-scoped first):
* 1. .agents/skills/<skill-name>/SKILL.md (workspace-relative)
* 2. .codex/skills/<skill-name>/SKILL.md (fallback)
* 3. ~/.agents/skills/<skill-name>/SKILL.md (global fallback)
*
* MCP-based skill invocation (`mcp:<server>/<skill-name>`) is also supported for
* Codex configurations that surface skills as MCP tools.
*/
const CODEX_MATCHER: ToolMatcher = {
skillTools: [],
skillInputField: 'skill',
readTools: ['command_execution'],
readInputField: 'command',
skillToolPrefixes: ['mcp:'],
readToolPrefixes: ['mcp:'],
readInputFields: ['command', 'path', 'file_path', 'filePath'],
};

/**
* Static mapping of provider kinds to their tool-name semantics.
* Providers not listed here fall back to CLAUDE_MATCHER.
*/
const PROVIDER_TOOL_SEMANTICS: Partial<Record<ProviderKind, ToolMatcher>> = {
claude: CLAUDE_MATCHER,
'claude-cli': CLAUDE_MATCHER,
'claude-sdk': CLAUDE_MATCHER,
codex: CODEX_MATCHER,
'pi-coding-agent': PI_CODING_AGENT_MATCHER,
'pi-cli': PI_CODING_AGENT_MATCHER,
'copilot-cli': COPILOT_MATCHER,
'copilot-log': COPILOT_MATCHER,
'copilot-sdk': COPILOT_MATCHER,
vscode: COPILOT_MATCHER,
'vscode-insiders': COPILOT_MATCHER,
};

export class SkillTriggerEvaluator implements Evaluator {
readonly kind = 'skill-trigger';

Expand All @@ -124,19 +31,9 @@ export class SkillTriggerEvaluator implements Evaluator {
this.config = config;
}

private resolveMatcher(providerKind: ProviderKind | undefined): ToolMatcher {
if (providerKind) {
const match = PROVIDER_TOOL_SEMANTICS[providerKind];
if (match) return match;
}
return CLAUDE_MATCHER;
}

evaluate(context: EvaluationContext): EvaluationScore {
const skillName = this.config.skill;
const shouldTrigger = this.config.should_trigger !== false;
const providerKind = context.provider?.kind as ProviderKind | undefined;
const matcher = this.resolveMatcher(providerKind);

const allToolCalls = (context.output ?? []).flatMap((msg) => msg.toolCalls ?? []);

Expand All @@ -147,42 +44,23 @@ export class SkillTriggerEvaluator implements Evaluator {
const toolName = toolCall.tool ?? '';
const input = (toolCall.input ?? {}) as Record<string, unknown>;

if (matcher.skillTools.includes(toolName)) {
const skillArg = String(input[matcher.skillInputField] ?? '');
if (toolName === 'Skill') {
const skillArg = String(input.skill ?? '');
if (skillArg.includes(skillName)) {
triggered = true;
evidence = `Skill tool invoked with ${matcher.skillInputField}="${skillArg}"`;
evidence = `Skill tool invoked with skill="${skillArg}"`;
break;
}
} else if (
matcher.skillToolPrefixes?.some(
(prefix) => toolName.startsWith(prefix) && toolName.includes(skillName),
)
) {
triggered = true;
evidence = `Skill tool invoked via tool name "${toolName}"`;
break;
} else if (matcher.readTools.includes(toolName)) {
const filePath = this.readPathFromInput(input, matcher);
if (filePath.includes(skillName)) {
} else if (toolName === 'Read') {
const filePath = String(input.file_path ?? '');
if (filePath.includes(`skills/${skillName}/`)) {
triggered = true;
evidence = `Read tool loaded skill file: ${filePath}`;
break;
}
} else if (
matcher.readToolPrefixes?.some(
(prefix) => toolName.startsWith(prefix) && toolName.includes(skillName),
)
) {
triggered = true;
evidence = `Read tool loaded skill file via tool name "${toolName}"`;
break;
}

// Fallback: check if a tool's output contains a skill file path.
// Some providers (e.g., copilot-sdk) discover skill content via search
// tools (grep/glob) whose inputs don't reference the skill name, but
// whose outputs include skill file paths like ".agents/skills/<name>/SKILL.md".
if (!triggered && toolCall.output != null) {
const outputStr =
typeof toolCall.output === 'string' ? toolCall.output : JSON.stringify(toolCall.output);
Expand Down Expand Up @@ -228,15 +106,4 @@ export class SkillTriggerEvaluator implements Evaluator {
expectedAspectCount: 1,
};
}

private readPathFromInput(input: Record<string, unknown>, matcher: ToolMatcher): string {
const fields = matcher.readInputFields ?? [matcher.readInputField];
for (const field of fields) {
const value = input[field];
if (value !== undefined && value !== null) {
return String(value);
}
}
return '';
}
}
13 changes: 8 additions & 5 deletions packages/core/src/evaluation/providers/claude-cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import path from 'node:path';

import { extractTextContent, toContentArray } from './claude-content.js';
import { recordClaudeLogEntry } from './claude-log-tracker.js';
import { normalizeToolCall } from './normalize-tool-call.js';
import { buildPromptDocument, normalizeInputFiles } from './preread.js';
import type { ClaudeResolvedConfig } from './targets.js';
import type {
Expand Down Expand Up @@ -493,11 +494,13 @@ function extractToolCalls(content: unknown): readonly ToolCall[] {
}
const p = part as Record<string, unknown>;
if (p.type === 'tool_use' && typeof p.name === 'string') {
toolCalls.push({
tool: p.name,
input: p.input,
id: typeof p.id === 'string' ? p.id : undefined,
});
toolCalls.push(
normalizeToolCall('claude-cli', {
tool: p.name,
input: p.input,
id: typeof p.id === 'string' ? p.id : undefined,
}),
);
}
}
return toolCalls;
Expand Down
13 changes: 8 additions & 5 deletions packages/core/src/evaluation/providers/claude-sdk.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import path from 'node:path';

import { extractTextContent, toContentArray } from './claude-content.js';
import { recordClaudeLogEntry } from './claude-log-tracker.js';
import { normalizeToolCall } from './normalize-tool-call.js';
import { buildPromptDocument, normalizeInputFiles } from './preread.js';
import type { ClaudeResolvedConfig } from './targets.js';
import type {
Expand Down Expand Up @@ -297,11 +298,13 @@ function extractToolCalls(content: unknown): readonly ToolCall[] {
}
const p = part as Record<string, unknown>;
if (p.type === 'tool_use' && typeof p.name === 'string') {
toolCalls.push({
tool: p.name,
input: p.input,
id: typeof p.id === 'string' ? p.id : undefined,
});
toolCalls.push(
normalizeToolCall('claude-sdk', {
tool: p.name,
input: p.input,
id: typeof p.id === 'string' ? p.id : undefined,
}),
);
}
}
return toolCalls;
Expand Down
Loading
Loading