Merged
2 changes: 1 addition & 1 deletion apps/cli/package.json
@@ -28,7 +28,7 @@
     "test:watch": "bun test --watch"
   },
   "dependencies": {
-    "@ai-sdk/openai": "^2.0.0",
+    "@ai-sdk/openai": "^3.0.0",
     "@anthropic-ai/claude-agent-sdk": "^0.2.49",
     "@github/copilot-sdk": "^0.1.25",
     "@inquirer/prompts": "^8.2.1",
8 changes: 7 additions & 1 deletion apps/cli/src/commands/eval/artifact-writer.ts
@@ -49,6 +49,7 @@ export interface TimingArtifact {
   readonly token_usage: {
     readonly input: number;
     readonly output: number;
+    readonly reasoning: number;
   };
 }
 
@@ -273,13 +274,17 @@ export function buildGradingArtifact(result: EvaluationResult): GradingArtifact
 export function buildTimingArtifact(results: readonly EvaluationResult[]): TimingArtifact {
   let totalInput = 0;
   let totalOutput = 0;
+  let totalReasoning = 0;
   let totalDurationMs = 0;
 
   for (const result of results) {
-    const usage = result.tokenUsage as { input?: number; output?: number } | undefined;
+    const usage = result.tokenUsage as
+      | { input?: number; output?: number; reasoning?: number }
+      | undefined;
     if (usage) {
       totalInput += usage.input ?? 0;
       totalOutput += usage.output ?? 0;
+      totalReasoning += usage.reasoning ?? 0;
     }
     if (result.durationMs != null) {
       totalDurationMs += result.durationMs;
@@ -293,6 +298,7 @@ export function buildTimingArtifact(results: readonly EvaluationResult[]): TimingArtifact {
     token_usage: {
       input: totalInput,
       output: totalOutput,
+      reasoning: totalReasoning,
     },
   };
 }
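A quick sketch of how the widened cast above plays out, using hypothetical EvaluationResult values (only the fields buildTimingArtifact actually reads are shown): a result that reports no reasoning tokens contributes 0, so `token_usage.reasoning` is always a number.

```ts
// Hypothetical results; only the fields buildTimingArtifact reads are shown.
const results = [
  { tokenUsage: { input: 2000, output: 1000, reasoning: 300 }, durationMs: 60_000 },
  { tokenUsage: { input: 1000, output: 500 }, durationMs: 30_000 }, // no reasoning reported
] as unknown as EvaluationResult[];

const timing = buildTimingArtifact(results);
// timing.token_usage => { input: 3000, output: 1500, reasoning: 300 }
// A missing reasoning field counts as 0 via `usage.reasoning ?? 0`.
```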
6 changes: 3 additions & 3 deletions apps/cli/test/commands/eval/artifact-writer.test.ts
@@ -202,7 +202,7 @@ describe('buildTimingArtifact', () => {
     expect(timing.total_tokens).toBe(4500);
     expect(timing.duration_ms).toBe(90000);
     expect(timing.total_duration_seconds).toBe(90);
-    expect(timing.token_usage).toEqual({ input: 3000, output: 1500 });
+    expect(timing.token_usage).toEqual({ input: 3000, output: 1500, reasoning: 0 });
   });
 
   it('handles results with no timing data', () => {
@@ -212,7 +212,7 @@
     expect(timing.total_tokens).toBe(0);
     expect(timing.duration_ms).toBe(0);
     expect(timing.total_duration_seconds).toBe(0);
-    expect(timing.token_usage).toEqual({ input: 0, output: 0 });
+    expect(timing.token_usage).toEqual({ input: 0, output: 0, reasoning: 0 });
   });
 
   it('handles empty results array', () => {
@@ -232,7 +232,7 @@
 
     const timing = buildTimingArtifact(results);
     expect(timing.total_tokens).toBe(500);
-    expect(timing.token_usage).toEqual({ input: 500, output: 0 });
+    expect(timing.token_usage).toEqual({ input: 500, output: 0, reasoning: 0 });
   });
 });
 
32 changes: 16 additions & 16 deletions bun.lock

Some generated files are not rendered by default.

10 changes: 5 additions & 5 deletions packages/core/package.json
@@ -41,17 +41,17 @@
   "dependencies": {
     "@agentclientprotocol/sdk": "^0.14.1",
     "@agentv/eval": "workspace:*",
-    "@ai-sdk/anthropic": "^2.0.53",
-    "@ai-sdk/azure": "^2.0.78",
-    "@ai-sdk/google": "^2.0.44",
-    "@ai-sdk/openai": "^2.0.0",
+    "@ai-sdk/anthropic": "^3.0.0",
+    "@ai-sdk/azure": "^3.0.0",
+    "@ai-sdk/google": "^3.0.0",
+    "@ai-sdk/openai": "^3.0.0",
     "@anthropic-ai/claude-agent-sdk": "^0.2.49",
     "@github/copilot-sdk": "^0.1.25",
     "@mariozechner/pi-agent-core": "^0.54.2",
     "@mariozechner/pi-ai": "^0.54.2",
     "@openai/codex-sdk": "^0.104.0",
     "@openrouter/ai-sdk-provider": "^2.3.1",
-    "ai": "^5.0.106",
+    "ai": "^6.0.0",
     "fast-glob": "^3.3.3",
     "json5": "^2.2.3",
     "micromatch": "^4.0.8",
79 changes: 76 additions & 3 deletions packages/core/src/evaluation/orchestrator.ts
@@ -1527,6 +1527,7 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationResult> {
     }
   }
 
+  const caseStartMs = Date.now();
   const attemptBudget = (maxRetries ?? 0) + 1;
   let attempt = 0;
   let providerResponse: ProviderResponse | undefined = cachedResponse;
@@ -1713,13 +1714,37 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationResult> {
     workspacePath,
   });
 
+  const totalDurationMs = Date.now() - caseStartMs;
+
+  // Aggregate grader token usage from individual evaluator results
+  const graderTokens = aggregateEvaluatorTokenUsage(result.scores);
+  const evalRunTokenUsage =
+    tokenUsage || graderTokens
+      ? {
+          input: (tokenUsage?.input ?? 0) + (graderTokens?.input ?? 0),
+          output: (tokenUsage?.output ?? 0) + (graderTokens?.output ?? 0),
+          ...(tokenUsage?.reasoning != null || graderTokens?.reasoning != null
+            ? { reasoning: (tokenUsage?.reasoning ?? 0) + (graderTokens?.reasoning ?? 0) }
+            : {}),
+          ...(tokenUsage?.cached != null || graderTokens?.cached != null
+            ? { cached: (tokenUsage?.cached ?? 0) + (graderTokens?.cached ?? 0) }
+            : {}),
+        }
+      : undefined;
+
+  const evalRun = {
+    durationMs: totalDurationMs,
+    ...(evalRunTokenUsage ? { tokenUsage: evalRunTokenUsage } : {}),
+  };
+
   const executionStatus: ExecutionStatus = providerError
     ? 'execution_error'
     : classifyQualityStatus(result.score);
 
   const finalResult = providerError
     ? {
         ...result,
+        evalRun,
         error: providerError,
         executionStatus,
         failureStage: 'agent' as const,
@@ -1729,7 +1754,7 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationResult> {
         beforeEachOutput,
         afterEachOutput,
       }
-    : { ...result, executionStatus, beforeAllOutput, beforeEachOutput, afterEachOutput };
+    : { ...result, evalRun, executionStatus, beforeAllOutput, beforeEachOutput, afterEachOutput };
 
   // Determine if this is a failure (has error or low score)
   const isFailure = !!finalResult.error || finalResult.score < 0.5;
@@ -1751,6 +1776,7 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationResult> {
 
     return finalResult;
   } catch (error) {
+    const evalRun = { durationMs: Date.now() - caseStartMs };
     const errorResult = buildErrorResult(
       evalCase,
       target.name,
@@ -1766,10 +1792,10 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationResult> {
       if (forceCleanup || (retainOnFailure ?? 'keep') === 'cleanup') {
         await cleanupWorkspace(workspacePath).catch(() => {});
       } else {
-        return { ...errorResult, workspacePath, beforeEachOutput, afterEachOutput };
+        return { ...errorResult, evalRun, workspacePath, beforeEachOutput, afterEachOutput };
       }
     }
-    return { ...errorResult, beforeEachOutput, afterEachOutput };
+    return { ...errorResult, evalRun, beforeEachOutput, afterEachOutput };
   }
 }
 
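To make the evalRunTokenUsage merge rule above concrete, here is a small sketch with hypothetical agent and grader usage values: `input`/`output` are always summed, while `reasoning` and `cached` survive into the merged object only when at least one side reported them.

```ts
// Hypothetical values; mirrors the evalRunTokenUsage construction above.
const agentUsage = { input: 1200, output: 400, reasoning: 150 } as ProviderTokenUsage;
const graderUsage = { input: 300, output: 80 } as ProviderTokenUsage; // no reasoning/cached

const merged = {
  input: (agentUsage?.input ?? 0) + (graderUsage?.input ?? 0),
  output: (agentUsage?.output ?? 0) + (graderUsage?.output ?? 0),
  // reasoning is kept because the agent reported it; cached is omitted
  // because neither side did.
  ...(agentUsage?.reasoning != null || graderUsage?.reasoning != null
    ? { reasoning: (agentUsage?.reasoning ?? 0) + (graderUsage?.reasoning ?? 0) }
    : {}),
};
// => { input: 1500, output: 480, reasoning: 150 }
```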
@@ -2565,6 +2591,53 @@ function buildResultInput(promptInputs: PromptInputs): EvaluationResult['input'] {
   return promptInputs.question;
 }
 
+/**
+ * Sum token usage across all evaluator results (including nested children).
+ * Returns undefined when no evaluator reported token usage.
+ */
+function aggregateEvaluatorTokenUsage(scores?: readonly EvaluatorResult[]): TokenUsage | undefined {
+  if (!scores || scores.length === 0) return undefined;
+
+  let hasAny = false;
+  let input = 0;
+  let output = 0;
+  let reasoning = 0;
+  let cached = 0;
+  let hasReasoning = false;
+  let hasCached = false;
+
+  const visit = (items: readonly EvaluatorResult[]): void => {
+    for (const item of items) {
+      if (item.tokenUsage) {
+        hasAny = true;
+        input += item.tokenUsage.input;
+        output += item.tokenUsage.output;
+        if (item.tokenUsage.reasoning != null) {
+          hasReasoning = true;
+          reasoning += item.tokenUsage.reasoning;
+        }
+        if (item.tokenUsage.cached != null) {
+          hasCached = true;
+          cached += item.tokenUsage.cached;
+        }
+      }
+      if (item.scores) {
+        visit(item.scores);
+      }
+    }
+  };
+
+  visit(scores);
+  if (!hasAny) return undefined;
+
+  return {
+    input,
+    output,
+    ...(hasReasoning ? { reasoning } : {}),
+    ...(hasCached ? { cached } : {}),
+  };
+}
+
 function isTimeoutLike(error: unknown): boolean {
   if (!error) {
     return false;
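For context, a sketch of the new helper on a nested evaluator tree (the EvaluatorResult literals are hypothetical): usage is summed depth-first, and the optional fields appear only when some evaluator reported them.

```ts
// Hypothetical evaluator tree: a composite grader with two children.
const scores = [
  {
    tokenUsage: { input: 100, output: 20 },
    scores: [
      { tokenUsage: { input: 50, output: 10, cached: 25 } },
      {}, // evaluator that reported no usage at all
    ],
  },
] as unknown as EvaluatorResult[];

aggregateEvaluatorTokenUsage(scores);
// => { input: 150, output: 30, cached: 25 }
// reasoning is omitted (no evaluator reported it); a tree with no usage
// at all yields undefined rather than zeros.
```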
4 changes: 1 addition & 3 deletions packages/core/src/evaluation/providers/agentv-provider.ts
@@ -31,9 +31,7 @@ function createLanguageModel(modelString: string): LanguageModel {
 
   switch (provider) {
     case 'openai':
-      // Cast: @ai-sdk/openai may return LanguageModelV3 while the rest of the
-      // codebase uses LanguageModelV2. The runtime API is compatible.
-      return createOpenAI()(modelName) as unknown as LanguageModel;
+      return createOpenAI()(modelName);
     case 'anthropic':
       return createAnthropic()(modelName);
     case 'azure':
13 changes: 9 additions & 4 deletions packages/core/src/evaluation/providers/ai-sdk.ts
@@ -133,9 +133,7 @@ export class OpenRouterProvider implements Provider {
     const openrouter = createOpenRouter({
       apiKey: config.apiKey,
     });
-    // Cast: OpenRouter may return LanguageModelV3 while the rest of the
-    // codebase uses LanguageModelV2. The runtime API is compatible.
-    this.model = openrouter(config.model) as unknown as LanguageModel;
+    this.model = openrouter(config.model);
   }
 
   async invoke(request: ProviderRequest): Promise<ProviderResponse> {
@@ -392,9 +390,16 @@ async function invokeModel(options: {
 function mapResponse(result: TextResult): ProviderResponse {
   const content = result.text ?? '';
   const rawUsage = result.totalUsage ?? result.usage;
+  const reasoning = rawUsage?.outputTokenDetails?.reasoningTokens ?? undefined;
+  const cached = rawUsage?.inputTokenDetails?.cacheReadTokens ?? undefined;
   const tokenUsage =
     rawUsage?.inputTokens != null && rawUsage?.outputTokens != null
-      ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens }
+      ? {
+          input: rawUsage.inputTokens,
+          output: rawUsage.outputTokens,
+          ...(reasoning != null ? { reasoning } : {}),
+          ...(cached != null ? { cached } : {}),
+        }
       : undefined;
 
   return {
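The mapping above leans on the AI SDK's token-detail fields. A sketch of the usage shape it expects, using the field names the diff itself reads; whether the detail objects are populated varies by provider, so both `reasoning` and `cached` may legitimately be absent:

```ts
// Hypothetical usage object as mapResponse would consume it.
const rawUsage = {
  inputTokens: 900,
  outputTokens: 350,
  inputTokenDetails: { cacheReadTokens: 600 },   // optional, provider-specific
  outputTokenDetails: { reasoningTokens: 120 },  // optional, provider-specific
};
// mapResponse would yield:
// tokenUsage = { input: 900, output: 350, reasoning: 120, cached: 600 }
```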
2 changes: 2 additions & 0 deletions packages/core/src/evaluation/providers/claude-cli.ts
@@ -119,10 +119,12 @@ export class ClaudeCliProvider implements Provider {
             ((usage.cache_read_input_tokens as number) ?? 0) +
             ((usage.cache_creation_input_tokens as number) ?? 0);
           const outputTokens = (usage.output_tokens as number) ?? 0;
+          const reasoningTokens = (usage.reasoning_tokens as number) ?? undefined;
           tokenUsage = {
             input: inputTokens,
             output: outputTokens,
             cached: (usage.cache_read_input_tokens as number) ?? undefined,
+            reasoning: reasoningTokens,
           };
 
           // Stream callback for LLM usage
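Worth noting on the claude-cli change: `reasoning_tokens` is read defensively, so a usage payload without it leaves `reasoning` undefined rather than coercing it to 0. A minimal sketch, assuming a hypothetical stream payload:

```ts
// Hypothetical usage payload from the Claude CLI stream.
const usage: Record<string, unknown> = {
  output_tokens: 250,
  cache_read_input_tokens: 200,
  // reasoning_tokens absent
};
const reasoningTokens = (usage.reasoning_tokens as number) ?? undefined; // undefined
// tokenUsage.reasoning stays undefined, matching the optional field on
// ProviderTokenUsage instead of reporting a fake 0.
```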
2 changes: 2 additions & 0 deletions packages/core/src/evaluation/providers/types.ts
@@ -193,6 +193,8 @@ export interface ProviderTokenUsage {
   readonly output: number;
   /** Cached tokens (optional, provider-specific) */
   readonly cached?: number;
+  /** Reasoning/thinking tokens (optional, provider-specific) */
+  readonly reasoning?: number;
 }
 
 export interface ProviderResponse {
2 changes: 2 additions & 0 deletions packages/core/src/evaluation/trace.ts
@@ -13,6 +13,8 @@ export interface TokenUsage {
   readonly output: number;
   /** Cached tokens (optional, provider-specific) */
   readonly cached?: number;
+  /** Reasoning/thinking tokens (optional, provider-specific) */
+  readonly reasoning?: number;
 }
 
 /**