Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ agentv eval evals/my-eval.yaml

**5. Compare results across targets:**
```bash
agentv compare .agentv/results/runs/eval_<timestamp>/index.jsonl
agentv compare .agentv/results/runs/<timestamp>/index.jsonl
```

## Output formats
Expand Down
45 changes: 32 additions & 13 deletions apps/cli/src/commands/compare/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import {
restPositionals,
string,
} from 'cmd-ts';

import { toSnakeCaseDeep } from '../../utils/case-conversion.js';
import { loadLightweightResults, resolveResultSourcePath } from '../results/manifest.js';

Expand Down Expand Up @@ -62,23 +63,40 @@ interface MatrixRow {
scores: Record<string, number>;
}

/**
 * Record shape produced by loadCompareResults: an EvalResult plus the target
 * name carried on the source record.
 */
interface CompareInputRecord extends EvalResult {
/** Optional here; loadCombinedResults throws when it is not a string. */
target?: string;
}

/**
 * Load the records of a result source for comparison, keeping only the
 * test id, score and target of each record.
 *
 * Every record is validated up front so that a malformed source fails with a
 * clear message instead of feeding bad values into the comparison.
 *
 * @param filePath - Path passed to resolveResultSourcePath to locate the results.
 * @returns One CompareInputRecord per loaded record, in source order.
 * @throws Error when a record's test id is missing, empty, or the literal
 *   'unknown', or when its score is not a finite number.
 */
function loadCompareResults(filePath: string): CompareInputRecord[] {
  return loadLightweightResults(resolveResultSourcePath(filePath)).map((record) => {
    if (!record.testId || record.testId === 'unknown') {
      throw new Error(`Missing test_id in result source: ${filePath}`);
    }
    // Number.isFinite rejects NaN and also +/-Infinity. An overflowing JSON
    // literal such as 1e999 parses to Infinity, which a NaN-only check lets through.
    if (typeof record.score !== 'number' || !Number.isFinite(record.score)) {
      throw new Error(`Missing or invalid score in result source: ${filePath}`);
    }
    return {
      testId: record.testId,
      score: record.score,
      target: record.target,
    };
  });
}

/**
 * Output shape consumed by formatMatrix: matrix rows, pairwise comparisons,
 * and a list of target names.
 * NOTE(review): the producer is outside this view — confirm how `targets`
 * relates to the keys of each row's `scores`.
 */
export interface MatrixOutput {
matrix: MatrixRow[];
pairwise: ComparisonOutput[];
targets: string[];
}

export function loadJsonlResults(filePath: string): EvalResult[] {
return loadLightweightResults(resolveResultSourcePath(filePath)).map((record) => ({
testId: record.testId,
score: record.score,
}));
return loadCompareResults(filePath).map(({ testId, score }) => ({ testId, score }));
}

export function loadCombinedResults(filePath: string): Map<string, EvalResult[]> {
const groups = new Map<string, EvalResult[]>();

for (const record of loadLightweightResults(resolveResultSourcePath(filePath))) {
for (const record of loadCompareResults(filePath)) {
if (typeof record.target !== 'string') {
throw new Error(`Missing target field in combined result source: ${filePath}`);
}
Expand Down Expand Up @@ -413,12 +431,13 @@ export function formatMatrix(matrixOutput: MatrixOutput, baselineTarget?: string
export const compareCommand = command({
name: 'compare',
description:
'Compare evaluation result files: two-file pairwise, combined JSONL pairwise, or N-way matrix',
'Compare evaluation run manifests: two-run pairwise, single-run pairwise, or N-way matrix',
args: {
results: restPositionals({
type: string,
displayName: 'results',
description: 'JSONL result file path(s). One file: combined mode. Two files: pairwise mode.',
description:
'Run workspace or index.jsonl manifest path(s). One source: single-run mode. Two sources: pairwise mode.',
}),
threshold: option({
type: optional(number),
Expand All @@ -430,13 +449,13 @@ export const compareCommand = command({
type: optional(string),
long: 'baseline',
short: 'b',
description: 'Target name to use as baseline (filters combined JSONL)',
description: 'Target name to use as baseline (filters a single run manifest)',
}),
candidate: option({
type: optional(string),
long: 'candidate',
short: 'c',
description: 'Target name to use as candidate (filters combined JSONL)',
description: 'Target name to use as candidate (filters a single run manifest)',
}),
targets: multioption({
type: array(string),
Expand All @@ -460,7 +479,7 @@ export const compareCommand = command({

try {
if (results.length === 0) {
throw new Error('At least one JSONL result file is required');
throw new Error('At least one run workspace or index.jsonl manifest is required');
}

if (results.length === 2) {
Expand All @@ -478,7 +497,7 @@ export const compareCommand = command({
const exitCode = determineExitCode(comparison.summary.meanDelta);
process.exit(exitCode);
} else if (results.length === 1) {
// Combined JSONL mode
// Single-run manifest mode
let groups = loadCombinedResults(results[0]);

// Filter by --targets if specified
Expand Down Expand Up @@ -514,7 +533,7 @@ export const compareCommand = command({
}

if (baseline && candidate) {
// Pairwise mode from combined JSONL
// Pairwise mode from a single run manifest
const baselineResults = groups.get(baseline);
const candidateResults = groups.get(candidate);
if (!baselineResults) {
Expand Down Expand Up @@ -548,7 +567,7 @@ export const compareCommand = command({
process.exit(exitCode);
}
} else {
throw new Error('Expected 1 or 2 JSONL result files');
throw new Error('Expected 1 or 2 run workspaces or index.jsonl manifests');
}
} catch (error) {
console.error(`Error: ${(error as Error).message}`);
Expand Down
68 changes: 67 additions & 1 deletion apps/cli/src/commands/eval/artifact-writer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -594,6 +594,69 @@ function toCamelCaseDeep(obj: unknown): unknown {
return obj;
}

/**
 * Shape returned by normalizeParsedResult: an open record (unrecognised keys
 * are kept) in which the fields listed below are guaranteed to be present
 * with the stated types.
 */
type ParsedEvaluationResult = Record<string, unknown> & {
timestamp: string;
testId: string;
score: number;
assertions: EvaluationResult['assertions'];
target: string;
output: EvaluationResult['output'];
executionStatus: EvaluationResult['executionStatus'];
};

/** Execution status strings accepted by isExecutionStatus. */
const EXECUTION_STATUSES = new Set<EvaluationResult['executionStatus']>([
'ok',
'quality_failure',
'execution_error',
]);

/**
 * Type guard for a single assertion entry.
 *
 * Accepts only non-null, non-array objects whose `text` is a string, whose
 * `passed` is a boolean, and whose `evidence` is either absent or a string.
 *
 * @param value - Arbitrary value to inspect.
 * @returns True when `value` has the assertion-entry shape described above.
 */
function isAssertionEntry(value: unknown): value is EvaluationResult['assertions'][number] {
  const isPlainObject = typeof value === 'object' && value !== null && !Array.isArray(value);
  if (!isPlainObject) {
    return false;
  }

  const entry = value as Record<string, unknown>;
  if (typeof entry.text !== 'string') {
    return false;
  }
  if (typeof entry.passed !== 'boolean') {
    return false;
  }
  // `evidence` is optional, but when present it must be a string.
  return entry.evidence === undefined || typeof entry.evidence === 'string';
}

/**
 * Type guard for a single output message.
 *
 * Only the `role` field is checked: the value must be a non-null, non-array
 * object whose `role` is a string. Other fields are not inspected.
 *
 * @param value - Arbitrary value to inspect.
 * @returns True when `value` is an object with a string `role`.
 */
function isOutputMessage(value: unknown): value is EvaluationResult['output'][number] {
  if (typeof value === 'object' && value !== null && !Array.isArray(value)) {
    const message = value as Record<string, unknown>;
    return typeof message.role === 'string';
  }
  return false;
}

/**
 * Type guard for an execution status.
 *
 * @param value - Arbitrary value to inspect.
 * @returns True when `value` is a string contained in EXECUTION_STATUSES.
 */
function isExecutionStatus(value: unknown): value is EvaluationResult['executionStatus'] {
  if (typeof value !== 'string') {
    return false;
  }
  return EXECUTION_STATUSES.has(value as EvaluationResult['executionStatus']);
}

/**
 * Coerce a parsed value into a ParsedEvaluationResult.
 *
 * Non-object values (including null and arrays) yield `undefined`. For
 * objects, every original key is kept and the core fields are then
 * overwritten with validated values, falling back to a default when the
 * field is missing or has the wrong type:
 * - `timestamp`: the ISO string for epoch 0
 * - `testId` / `target`: 'unknown'
 * - `score`: 0
 * - `assertions` / `output`: [] (when an array is present, entries failing
 *   their guard are dropped)
 * - `executionStatus`: 'ok'
 *
 * @param value - Value to normalize.
 * @returns The normalized record, or `undefined` when `value` is not a plain object.
 */
function normalizeParsedResult(value: unknown): ParsedEvaluationResult | undefined {
  const isRecord = typeof value === 'object' && value !== null && !Array.isArray(value);
  if (!isRecord) {
    return undefined;
  }

  const raw = value as Record<string, unknown>;
  const stringOr = (candidate: unknown, fallback: string): string =>
    typeof candidate === 'string' ? candidate : fallback;

  return {
    // Spread first so the validated fields below win over whatever was parsed.
    ...raw,
    timestamp: stringOr(raw.timestamp, new Date(0).toISOString()),
    testId: stringOr(raw.testId, 'unknown'),
    score: typeof raw.score === 'number' ? raw.score : 0,
    assertions: Array.isArray(raw.assertions) ? raw.assertions.filter(isAssertionEntry) : [],
    target: stringOr(raw.target, 'unknown'),
    output: Array.isArray(raw.output) ? raw.output.filter(isOutputMessage) : [],
    executionStatus: isExecutionStatus(raw.executionStatus) ? raw.executionStatus : 'ok',
  };
}

// ---------------------------------------------------------------------------
// JSONL parsing
// ---------------------------------------------------------------------------
Expand All @@ -610,7 +673,10 @@ export function parseJsonlResults(content: string): EvaluationResult[] {
const parsed = JSON.parse(trimmed);
// JSONL files from AgentV use snake_case; convert back to camelCase
const camelCased = toCamelCaseDeep(parsed);
results.push(camelCased as EvaluationResult);
const normalized = normalizeParsedResult(camelCased);
if (normalized) {
results.push(normalized);
}
} catch {
// Skip malformed lines
}
Expand Down
3 changes: 2 additions & 1 deletion apps/cli/src/commands/eval/commands/run.ts
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,8 @@ export const evalRunCommand = command({
retryErrors: option({
type: optional(string),
long: 'retry-errors',
description: 'Path to previous output JSONL — re-run only execution_error test cases',
description:
'Path to a previous run workspace or index.jsonl manifest — re-run only execution_error test cases',
}),
strict: flag({
long: 'strict',
Expand Down
18 changes: 18 additions & 0 deletions apps/cli/src/commands/eval/result-layout.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@ export function resolveRunIndexPath(runDir: string): string {
return path.join(runDir, RESULT_INDEX_FILENAME);
}

/**
 * Check whether a path names a run manifest.
 *
 * Only the final path segment is compared against RESULT_INDEX_FILENAME; the
 * file system is not consulted.
 *
 * @param filePath - Path to inspect.
 * @returns True when the basename equals RESULT_INDEX_FILENAME.
 */
export function isRunManifestPath(filePath: string): boolean {
  const fileName = path.basename(filePath);
  return fileName === RESULT_INDEX_FILENAME;
}

export function resolveExistingRunPrimaryPath(runDir: string): string | undefined {
const indexPath = resolveRunIndexPath(runDir);
if (existsSync(indexPath)) {
Expand Down Expand Up @@ -49,3 +53,17 @@ export function resolveWorkspaceOrFilePath(filePath: string): string {

return existing;
}

/**
 * Resolve a user-supplied path to a run manifest path.
 *
 * Directories are delegated to resolveWorkspaceOrFilePath. Any other path is
 * returned unchanged when its basename is the manifest filename; this
 * function does not itself check that such a file exists.
 *
 * @param filePath - Run workspace directory or manifest file path.
 * @returns The resolved manifest path.
 * @throws Error when `filePath` is neither a directory nor named like a manifest.
 */
export function resolveRunManifestPath(filePath: string): string {
  if (isDirectoryPath(filePath)) {
    return resolveWorkspaceOrFilePath(filePath);
  }

  if (isRunManifestPath(filePath)) {
    return filePath;
  }

  throw new Error(
    `Expected a run workspace directory or ${RESULT_INDEX_FILENAME} manifest: ${filePath}`,
  );
}
16 changes: 7 additions & 9 deletions apps/cli/src/commands/eval/retry-errors.ts
Original file line number Diff line number Diff line change
@@ -1,17 +1,16 @@
import type { EvaluationResult } from '@agentv/core';

import {
loadLightweightResults,
loadManifestResults,
resolveResultSourcePath,
} from '../results/manifest.js';
import { loadManifestResults, resolveResultSourcePath } from '../results/manifest.js';

/**
 * Load the results of a previous run for the retry helpers.
 *
 * @param jsonlPath - Path handed to resolveResultSourcePath to locate the source.
 * @returns The results loaded by loadManifestResults from the resolved path.
 */
async function loadRetrySourceResults(jsonlPath: string): Promise<readonly EvaluationResult[]> {
  const sourcePath = resolveResultSourcePath(jsonlPath);
  return loadManifestResults(sourcePath);
}

/**
* Load test IDs from an index/results source that have executionStatus === 'execution_error'.
*/
export async function loadErrorTestIds(jsonlPath: string): Promise<readonly string[]> {
const resolvedPath = resolveResultSourcePath(jsonlPath);
const ids = loadLightweightResults(resolvedPath)
const ids = (await loadRetrySourceResults(jsonlPath))
.filter((result) => result.executionStatus === 'execution_error')
.map((result) => result.testId);

Expand All @@ -23,8 +22,7 @@ export async function loadErrorTestIds(jsonlPath: string): Promise<readonly stri
* These are the "good" results that should be preserved when merging retry output.
*/
export async function loadNonErrorResults(jsonlPath: string): Promise<readonly EvaluationResult[]> {
const resolvedPath = resolveResultSourcePath(jsonlPath);
return loadManifestResults(resolvedPath).filter(
return (await loadRetrySourceResults(jsonlPath)).filter(
(result) => result.testId && result.executionStatus !== 'execution_error',
);
}
25 changes: 10 additions & 15 deletions apps/cli/src/commands/eval/run-cache.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,21 +16,19 @@ const CACHE_FILENAME = 'cache.json';
export interface RunCache {
/** Directory path for new per-run directory format (e.g. .agentv/results/runs/<ts>/) */
readonly lastRunDir?: string;
/** JSONL file path for legacy flat-file format. Kept for backward compat. */
/** @deprecated Legacy flat-file pointer from old cache files. Ignored on read. */
readonly lastResultFile?: string;
readonly timestamp: string;
}

/**
* Resolve the primary result manifest path from a RunCache entry.
* New format: lastRunDir/index.jsonl
* Legacy format: lastResultFile (flat JSONL path)
*/
export function resolveRunCacheFile(cache: RunCache): string {
if (cache.lastRunDir) {
return resolveExistingRunPrimaryPath(cache.lastRunDir) ?? resolveRunIndexPath(cache.lastRunDir);
}
return cache.lastResultFile ?? '';
return '';
}

function cachePath(cwd: string): string {
Expand All @@ -47,18 +45,15 @@ export async function loadRunCache(cwd: string): Promise<RunCache | undefined> {
}

export async function saveRunCache(cwd: string, resultPath: string): Promise<void> {
if (path.basename(resultPath) !== RESULT_INDEX_FILENAME) {
return;
}

const dir = path.join(cwd, '.agentv');
await mkdir(dir, { recursive: true });
const basename = path.basename(resultPath);
const cache: RunCache =
basename === RESULT_INDEX_FILENAME
? {
lastRunDir: path.dirname(resultPath),
timestamp: new Date().toISOString(),
}
: {
lastResultFile: resultPath,
timestamp: new Date().toISOString(),
};
const cache: RunCache = {
lastRunDir: path.dirname(resultPath),
timestamp: new Date().toISOString(),
};
await writeFile(cachePath(cwd), `${JSON.stringify(cache, null, 2)}\n`, 'utf-8');
}
Loading
Loading