From 6e7d8f9d7b947da2205733526ea5228a8cd4653f Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 5 Apr 2026 05:24:28 +0000 Subject: [PATCH 1/4] refactor(results): remove flat manifest loading Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- README.md | 2 +- apps/cli/src/commands/compare/index.ts | 67 +++++++++- apps/cli/src/commands/eval/artifact-writer.ts | 73 +++++++++- apps/cli/src/commands/eval/result-layout.ts | 18 +++ apps/cli/src/commands/eval/retry-errors.ts | 25 ++-- apps/cli/src/commands/eval/run-cache.ts | 25 ++-- apps/cli/src/commands/eval/run-eval.ts | 125 +++++++++--------- apps/cli/src/commands/pipeline/bench.ts | 8 +- apps/cli/src/commands/pipeline/grade.ts | 8 +- apps/cli/src/commands/pipeline/input.ts | 14 +- apps/cli/src/commands/pipeline/run.ts | 16 +-- apps/cli/src/commands/results/export.ts | 74 ++++++++--- apps/cli/src/commands/results/manifest.ts | 97 +++----------- apps/cli/src/commands/results/serve.ts | 50 ++++--- apps/cli/src/commands/results/shared.ts | 26 ++-- apps/cli/src/commands/trace/list.ts | 10 +- apps/cli/src/commands/trace/score.ts | 6 +- apps/cli/src/commands/trace/utils.ts | 53 ++------ .../cli/test/commands/compare/compare.test.ts | 15 +++ apps/cli/test/commands/eval/run-cache.test.ts | 14 +- apps/cli/test/commands/results/export.test.ts | 27 +++- apps/cli/test/commands/results/serve.test.ts | 18 ++- apps/cli/test/commands/results/shared.test.ts | 82 +++++++----- apps/cli/test/commands/trace/trace.test.ts | 119 ++++------------- .../docs/docs/evaluation/running-evals.mdx | 6 +- .../docs/docs/evaluators/structured-data.mdx | 2 +- .../docs/docs/getting-started/quickstart.mdx | 2 +- .../web/src/content/docs/docs/tools/trace.mdx | 8 +- examples/features/benchmark-tooling/README.md | 14 +- .../document-extraction/.agentv/targets.yaml | 4 +- .../features/document-extraction/README.md | 16 +-- .../evals/confusion-metrics.eval.yaml | 2 +- .../evals/field-accuracy.eval.yaml | 5 +- 
.../scripts/aggregate_metrics.ts | 8 +- examples/features/trace-analysis/README.md | 12 +- examples/showcase/export-screening/README.md | 4 +- .../src/evaluation/loaders/jsonl-parser.ts | 49 ++++--- packages/core/src/evaluation/yaml-parser.ts | 79 ++++++----- 38 files changed, 648 insertions(+), 535 deletions(-) diff --git a/README.md b/README.md index e51776e2b..23415f112 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ agentv eval evals/my-eval.yaml **5. Compare results across targets:** ```bash -agentv compare .agentv/results/runs/eval_/index.jsonl +agentv compare .agentv/results/runs//index.jsonl ``` ## Output formats diff --git a/apps/cli/src/commands/compare/index.ts b/apps/cli/src/commands/compare/index.ts index 927eb6454..5dadaaf06 100644 --- a/apps/cli/src/commands/compare/index.ts +++ b/apps/cli/src/commands/compare/index.ts @@ -1,3 +1,6 @@ +import { readFileSync } from 'node:fs'; +import path from 'node:path'; + import { array, command, @@ -62,6 +65,66 @@ interface MatrixRow { scores: Record; } +interface ParsedCompareResult { + testId: string; + score: number; + target?: string; +} + +function loadFlatCompareResults(filePath: string): ParsedCompareResult[] { + const content = readFileSync(filePath, 'utf8'); + const results: ParsedCompareResult[] = []; + + for (const rawLine of content.split('\n')) { + const line = rawLine.trim(); + if (!line) continue; + + const parsed = JSON.parse(line) as Record; + const testId = + typeof parsed.test_id === 'string' + ? parsed.test_id + : typeof parsed.testId === 'string' + ? parsed.testId + : typeof parsed.eval_id === 'string' + ? parsed.eval_id + : typeof parsed.evalId === 'string' + ? 
parsed.evalId + : undefined; + if (!testId) { + throw new Error(`Missing test_id in result source: ${filePath}`); + } + + if (typeof parsed.score !== 'number' || Number.isNaN(parsed.score)) { + throw new Error(`Missing or invalid score in result source: ${filePath}`); + } + + results.push({ + testId, + score: parsed.score, + target: typeof parsed.target === 'string' ? parsed.target : undefined, + }); + } + + return results; +} + +function loadCompareResults(filePath: string): ParsedCompareResult[] { + try { + const resolvedPath = resolveResultSourcePath(filePath); + if (path.basename(resolvedPath) === 'index.jsonl') { + return loadLightweightResults(resolvedPath).map((record) => ({ + testId: record.testId, + score: record.score, + target: record.target, + })); + } + } catch { + // Fall back to direct JSONL parsing for explicit flat result files. + } + + return loadFlatCompareResults(filePath); +} + export interface MatrixOutput { matrix: MatrixRow[]; pairwise: ComparisonOutput[]; @@ -69,7 +132,7 @@ export interface MatrixOutput { } export function loadJsonlResults(filePath: string): EvalResult[] { - return loadLightweightResults(resolveResultSourcePath(filePath)).map((record) => ({ + return loadCompareResults(filePath).map((record) => ({ testId: record.testId, score: record.score, })); @@ -78,7 +141,7 @@ export function loadJsonlResults(filePath: string): EvalResult[] { export function loadCombinedResults(filePath: string): Map { const groups = new Map(); - for (const record of loadLightweightResults(resolveResultSourcePath(filePath))) { + for (const record of loadCompareResults(filePath)) { if (typeof record.target !== 'string') { throw new Error(`Missing target field in combined result source: ${filePath}`); } diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts index 2111453be..14035f20b 100644 --- a/apps/cli/src/commands/eval/artifact-writer.ts +++ b/apps/cli/src/commands/eval/artifact-writer.ts @@ -594,6 
+594,74 @@ function toCamelCaseDeep(obj: unknown): unknown { return obj; } +type ParsedEvaluationResult = Record & { + timestamp: string; + testId: string; + score: number; + assertions: EvaluationResult['assertions']; + target: string; + output: EvaluationResult['output']; + executionStatus: EvaluationResult['executionStatus']; +}; + +const EXECUTION_STATUSES = new Set([ + 'ok', + 'quality_failure', + 'execution_error', +]); + +function isAssertionEntry(value: unknown): value is EvaluationResult['assertions'][number] { + if (!value || typeof value !== 'object' || Array.isArray(value)) { + return false; + } + + const candidate = value as { text?: unknown; passed?: unknown; evidence?: unknown }; + return ( + typeof candidate.text === 'string' && + typeof candidate.passed === 'boolean' && + (candidate.evidence === undefined || typeof candidate.evidence === 'string') + ); +} + +function isOutputMessage(value: unknown): value is EvaluationResult['output'][number] { + if (!value || typeof value !== 'object' || Array.isArray(value)) { + return false; + } + + const candidate = value as { role?: unknown }; + return typeof candidate.role === 'string'; +} + +function isExecutionStatus(value: unknown): value is EvaluationResult['executionStatus'] { + return ( + typeof value === 'string' && + EXECUTION_STATUSES.has(value as EvaluationResult['executionStatus']) + ); +} + +function normalizeParsedResult(value: unknown): ParsedEvaluationResult | undefined { + if (!value || typeof value !== 'object' || Array.isArray(value)) { + return undefined; + } + + const result = value as Record; + return { + ...result, + timestamp: typeof result.timestamp === 'string' ? result.timestamp : new Date(0).toISOString(), + testId: + typeof result.testId === 'string' + ? result.testId + : typeof result.evalId === 'string' + ? result.evalId + : 'unknown', + score: typeof result.score === 'number' ? result.score : 0, + assertions: Array.isArray(result.assertions) ? 
result.assertions.filter(isAssertionEntry) : [], + target: typeof result.target === 'string' ? result.target : 'unknown', + output: Array.isArray(result.output) ? result.output.filter(isOutputMessage) : [], + executionStatus: isExecutionStatus(result.executionStatus) ? result.executionStatus : 'ok', + }; +} + // --------------------------------------------------------------------------- // JSONL parsing // --------------------------------------------------------------------------- @@ -610,7 +678,10 @@ export function parseJsonlResults(content: string): EvaluationResult[] { const parsed = JSON.parse(trimmed); // JSONL files from AgentV use snake_case; convert back to camelCase const camelCased = toCamelCaseDeep(parsed); - results.push(camelCased as EvaluationResult); + const normalized = normalizeParsedResult(camelCased); + if (normalized) { + results.push(normalized); + } } catch { // Skip malformed lines } diff --git a/apps/cli/src/commands/eval/result-layout.ts b/apps/cli/src/commands/eval/result-layout.ts index 800a62584..b6e6c57b7 100644 --- a/apps/cli/src/commands/eval/result-layout.ts +++ b/apps/cli/src/commands/eval/result-layout.ts @@ -20,6 +20,10 @@ export function resolveRunIndexPath(runDir: string): string { return path.join(runDir, RESULT_INDEX_FILENAME); } +export function isRunManifestPath(filePath: string): boolean { + return path.basename(filePath) === RESULT_INDEX_FILENAME; +} + export function resolveExistingRunPrimaryPath(runDir: string): string | undefined { const indexPath = resolveRunIndexPath(runDir); if (existsSync(indexPath)) { @@ -49,3 +53,17 @@ export function resolveWorkspaceOrFilePath(filePath: string): string { return existing; } + +export function resolveRunManifestPath(filePath: string): string { + if (isDirectoryPath(filePath)) { + return resolveWorkspaceOrFilePath(filePath); + } + + if (!isRunManifestPath(filePath)) { + throw new Error( + `Expected a run workspace directory or ${RESULT_INDEX_FILENAME} manifest: ${filePath}`, + ); + 
} + + return filePath; +} diff --git a/apps/cli/src/commands/eval/retry-errors.ts b/apps/cli/src/commands/eval/retry-errors.ts index a1760ffa6..8a39bc3bf 100644 --- a/apps/cli/src/commands/eval/retry-errors.ts +++ b/apps/cli/src/commands/eval/retry-errors.ts @@ -1,17 +1,25 @@ +import { readFile } from 'node:fs/promises'; + import type { EvaluationResult } from '@agentv/core'; -import { - loadLightweightResults, - loadManifestResults, - resolveResultSourcePath, -} from '../results/manifest.js'; +import { loadManifestResults, resolveResultSourcePath } from '../results/manifest.js'; +import { parseJsonlResults } from './artifact-writer.js'; + +async function loadRetrySourceResults(jsonlPath: string): Promise { + try { + const resolvedPath = resolveResultSourcePath(jsonlPath); + return loadManifestResults(resolvedPath); + } catch { + const content = await readFile(jsonlPath, 'utf8'); + return parseJsonlResults(content); + } +} /** * Load test IDs from an index/results source that have executionStatus === 'execution_error'. 
*/ export async function loadErrorTestIds(jsonlPath: string): Promise { - const resolvedPath = resolveResultSourcePath(jsonlPath); - const ids = loadLightweightResults(resolvedPath) + const ids = (await loadRetrySourceResults(jsonlPath)) .filter((result) => result.executionStatus === 'execution_error') .map((result) => result.testId); @@ -23,8 +31,7 @@ export async function loadErrorTestIds(jsonlPath: string): Promise { - const resolvedPath = resolveResultSourcePath(jsonlPath); - return loadManifestResults(resolvedPath).filter( + return (await loadRetrySourceResults(jsonlPath)).filter( (result) => result.testId && result.executionStatus !== 'execution_error', ); } diff --git a/apps/cli/src/commands/eval/run-cache.ts b/apps/cli/src/commands/eval/run-cache.ts index 80c523a26..50c9e7824 100644 --- a/apps/cli/src/commands/eval/run-cache.ts +++ b/apps/cli/src/commands/eval/run-cache.ts @@ -16,21 +16,19 @@ const CACHE_FILENAME = 'cache.json'; export interface RunCache { /** Directory path for new per-run directory format (e.g. .agentv/results/runs//) */ readonly lastRunDir?: string; - /** JSONL file path for legacy flat-file format. Kept for backward compat. */ + /** @deprecated Legacy flat-file pointer from old cache files. Ignored on read. */ readonly lastResultFile?: string; readonly timestamp: string; } /** * Resolve the primary result manifest path from a RunCache entry. - * New format: lastRunDir/index.jsonl - * Legacy format: lastResultFile (flat JSONL path) */ export function resolveRunCacheFile(cache: RunCache): string { if (cache.lastRunDir) { return resolveExistingRunPrimaryPath(cache.lastRunDir) ?? resolveRunIndexPath(cache.lastRunDir); } - return cache.lastResultFile ?? 
''; + return ''; } function cachePath(cwd: string): string { @@ -47,18 +45,15 @@ export async function loadRunCache(cwd: string): Promise { } export async function saveRunCache(cwd: string, resultPath: string): Promise { + if (path.basename(resultPath) !== RESULT_INDEX_FILENAME) { + return; + } + const dir = path.join(cwd, '.agentv'); await mkdir(dir, { recursive: true }); - const basename = path.basename(resultPath); - const cache: RunCache = - basename === RESULT_INDEX_FILENAME - ? { - lastRunDir: path.dirname(resultPath), - timestamp: new Date().toISOString(), - } - : { - lastResultFile: resultPath, - timestamp: new Date().toISOString(), - }; + const cache: RunCache = { + lastRunDir: path.dirname(resultPath), + timestamp: new Date().toISOString(), + }; await writeFile(cachePath(cwd), `${JSON.stringify(cache, null, 2)}\n`, 'utf-8'); } diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index 1a26fff4b..febbec3cc 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -400,21 +400,21 @@ function createProgressReporter( }; } -function makeEvalKey(testFilePath: string, evalId: string): string { - return `${path.resolve(testFilePath)}::${evalId}`; +function makeTestCaseKey(testFilePath: string, testId: string): string { + return `${path.resolve(testFilePath)}::${testId}`; } -function createDisplayIdTracker(): { getOrAssign(evalKey: string): number } { +function createDisplayIdTracker(): { getOrAssign(testCaseKey: string): number } { const map = new Map(); let nextId = 1; return { - getOrAssign(evalKey: string): number { - const existing = map.get(evalKey); + getOrAssign(testCaseKey: string): number { + const existing = map.get(testCaseKey); if (existing !== undefined) { return existing; } const assigned = nextId++; - map.set(evalKey, assigned); + map.set(testCaseKey, assigned); return assigned; }, }; @@ -476,11 +476,11 @@ async function prepareFileMetadata(params: { readonly cwd: 
string; readonly options: NormalizedOptions; }): Promise<{ - readonly evalIds: readonly string[]; - readonly evalCases: readonly EvalTest[]; + readonly testIds: readonly string[]; + readonly testCases: readonly EvalTest[]; readonly selections: readonly { selection: TargetSelection; inlineTargetLabel: string }[]; readonly trialsConfig?: TrialsConfig; - readonly suiteTargets?: readonly string[]; + readonly datasetTargets?: readonly string[]; readonly yamlWorkers?: number; readonly yamlCache?: boolean; readonly yamlCachePath?: string; @@ -501,23 +501,23 @@ async function prepareFileMetadata(params: { const relativePath = path.relative(cwd, testFilePath); const category = deriveCategory(relativePath); - const suite = await loadTestSuite(testFilePath, repoRoot, { + const dataset = await loadTestSuite(testFilePath, repoRoot, { verbose: options.verbose, filter: options.filter, category, }); - const filteredIds = suite.tests.map((value) => value.id); + const testIds = dataset.tests.map((value) => value.id); // Determine target names: CLI --target flags override YAML const cliTargets = options.cliTargets; - const suiteTargets = suite.targets; + const datasetTargets = dataset.targets; - // Resolve which target names to use (precedence: CLI > YAML targets > YAML target > default) + // Resolve which target names to use (precedence: CLI > dataset YAML targets > default) let targetNames: readonly string[]; if (cliTargets.length > 0) { targetNames = cliTargets; - } else if (suiteTargets && suiteTargets.length > 0) { - targetNames = suiteTargets; + } else if (datasetTargets && datasetTargets.length > 0) { + targetNames = datasetTargets; } else { targetNames = []; } @@ -567,18 +567,18 @@ async function prepareFileMetadata(params: { } return { - evalIds: filteredIds, - evalCases: suite.tests, + testIds, + testCases: dataset.tests, selections, - trialsConfig: suite.trials, - suiteTargets, - yamlWorkers: suite.workers, - yamlCache: suite.cacheConfig?.enabled, - yamlCachePath: 
suite.cacheConfig?.cachePath, - totalBudgetUsd: suite.totalBudgetUsd, - failOnError: suite.failOnError, - threshold: suite.threshold, - tags: suite.metadata?.tags, + trialsConfig: dataset.trials, + datasetTargets, + yamlWorkers: dataset.workers, + yamlCache: dataset.cacheConfig?.enabled, + yamlCachePath: dataset.cacheConfig?.cachePath, + totalBudgetUsd: dataset.totalBudgetUsd, + failOnError: dataset.failOnError, + threshold: dataset.threshold, + tags: dataset.metadata?.tags, }; } @@ -613,11 +613,11 @@ async function runSingleEvalFile(params: { readonly workersOverride?: number; readonly yamlWorkers?: number; readonly progressReporter: ProgressReporter; - readonly seenEvalCases: Set; - readonly displayIdTracker: { getOrAssign(evalKey: string): number }; + readonly seenTestCases: Set; + readonly displayIdTracker: { getOrAssign(testCaseKey: string): number }; readonly selection: TargetSelection; readonly inlineTargetLabel: string; - readonly evalCases: readonly EvalTest[]; + readonly testCases: readonly EvalTest[]; readonly trialsConfig?: TrialsConfig; readonly matrixMode?: boolean; readonly totalBudgetUsd?: number; @@ -636,11 +636,11 @@ async function runSingleEvalFile(params: { workersOverride, yamlWorkers, progressReporter, - seenEvalCases, + seenTestCases, displayIdTracker, selection, inlineTargetLabel, - evalCases, + testCases, trialsConfig, matrixMode, totalBudgetUsd, @@ -731,7 +731,7 @@ async function runSingleEvalFile(params: { return true; })(), filter: options.filter, - evalCases, + evalCases: testCases, verbose: options.verbose, maxConcurrency: resolvedWorkers, workspaceMode: options.workspaceMode, @@ -747,7 +747,7 @@ async function runSingleEvalFile(params: { ( streamingObserver as { completeFromResult?: (result: EvaluationResult) => void } | null )?.completeFromResult?.(result); - // Finalize streaming observer span with score + // Finalize the streaming observer span with score. 
streamingObserver?.finalizeEvalCase(result.score, result.error); // Trim output messages for results JSONL based on --output-messages. @@ -775,13 +775,13 @@ async function runSingleEvalFile(params: { } }, onProgress: async (event) => { - const evalKeyId = matrixMode ? `${event.testId}@${targetName}` : event.testId; - const evalKey = makeEvalKey(testFilePath, evalKeyId); - if (event.status === 'pending' && !seenEvalCases.has(evalKey)) { - seenEvalCases.add(evalKey); - progressReporter.setTotal(seenEvalCases.size); + const testCaseKeyId = matrixMode ? `${event.testId}@${targetName}` : event.testId; + const testCaseKey = makeTestCaseKey(testFilePath, testCaseKeyId); + if (event.status === 'pending' && !seenTestCases.has(testCaseKey)) { + seenTestCases.add(testCaseKey); + progressReporter.setTotal(seenTestCases.size); } - const displayId = displayIdTracker.getOrAssign(evalKey); + const displayId = displayIdTracker.getOrAssign(testCaseKey); // Start streaming observer when eval case begins execution if (event.status === 'running' && streamingObserver) { @@ -997,7 +997,7 @@ export async function runEvalCommand( // We defer cache creation until after file metadata is loaded const evaluationRunner = await resolveEvaluationRunner(); const allResults: EvaluationResult[] = []; - const seenEvalCases = new Set(); + const seenTestCases = new Set(); const displayIdTracker = createDisplayIdTracker(); // Derive file-level concurrency from worker count (global) when provided @@ -1012,14 +1012,14 @@ export async function runEvalCommand( const fileMetadata = new Map< string, { - readonly evalIds: readonly string[]; - readonly evalCases: readonly EvalTest[]; + readonly testIds: readonly string[]; + readonly testCases: readonly EvalTest[]; readonly selections: readonly { selection: TargetSelection; inlineTargetLabel: string; }[]; readonly trialsConfig?: TrialsConfig; - readonly suiteTargets?: readonly string[]; + readonly datasetTargets?: readonly string[]; readonly yamlWorkers?: 
number; readonly yamlCache?: boolean; readonly yamlCachePath?: string; @@ -1097,13 +1097,12 @@ export async function runEvalCommand( const cache = cacheEnabled ? new ResponseCache(yamlCachePath ? path.resolve(yamlCachePath) : undefined) : undefined; - const useCache = cacheEnabled; if (cacheEnabled) { console.log(`Response cache: enabled${yamlCachePath ? ` (${yamlCachePath})` : ''}`); } - // Resolve suite-level threshold: CLI --threshold takes precedence over YAML execution.threshold + // Resolve dataset-level threshold: CLI --threshold takes precedence over YAML execution.threshold. const yamlThreshold = firstMeta?.threshold; const resolvedThreshold = options.threshold ?? yamlThreshold; if (resolvedThreshold !== undefined && (resolvedThreshold < 0 || resolvedThreshold > 1)) { @@ -1127,13 +1126,13 @@ export async function runEvalCommand( // In matrix mode, total eval count is tests × targets (accounting for per-test target overrides) let totalEvalCount = 0; for (const meta of fileMetadata.values()) { - const suiteTargetNames = meta.selections.map((s) => s.selection.targetName); - for (const test of meta.evalCases) { - // Per-test targets override suite-level targets + const datasetTargetNames = meta.selections.map((s) => s.selection.targetName); + for (const test of meta.testCases) { + // Per-test targets override dataset-level targets. const testTargetNames = test.targets && test.targets.length > 0 - ? test.targets.filter((t) => suiteTargetNames.includes(t)) - : suiteTargetNames; + ? test.targets.filter((t) => datasetTargetNames.includes(t)) + : datasetTargetNames; totalEvalCount += testTargetNames.length > 0 ? 
testTargetNames.length : 1; } } @@ -1177,13 +1176,13 @@ export async function runEvalCommand( }); for (const [testFilePath, meta] of fileMetadata.entries()) { for (const { selection, inlineTargetLabel } of meta.selections) { - for (const testId of meta.evalIds) { - const evalKey = makeEvalKey( + for (const testId of meta.testIds) { + const testCaseKey = makeTestCaseKey( testFilePath, meta.selections.length > 1 ? `${testId}@${selection.targetName}` : testId, ); - seenEvalCases.add(evalKey); - const displayId = displayIdTracker.getOrAssign(evalKey); + seenTestCases.add(testCaseKey); + const displayId = displayIdTracker.getOrAssign(testCaseKey); progressReporter.update(displayId, { workerId: displayId, testId: meta.selections.length > 1 ? `${testId}@${selection.targetName}` : testId, @@ -1207,19 +1206,19 @@ export async function runEvalCommand( // Run all targets concurrently (each target has its own worker limit) const targetResults = await Promise.all( targetPrep.selections.map(async ({ selection, inlineTargetLabel }) => { - // Filter eval cases to those applicable to this target + // Filter test cases to those applicable to this target. const targetName = selection.targetName; - const applicableEvalCases = + const applicableTestCases = targetPrep.selections.length > 1 - ? targetPrep.evalCases.filter((test) => { + ? 
targetPrep.testCases.filter((test) => { if (test.targets && test.targets.length > 0) { return test.targets.includes(targetName); } return true; }) - : targetPrep.evalCases; + : targetPrep.testCases; - if (applicableEvalCases.length === 0) { + if (applicableTestCases.length === 0) { return []; } @@ -1236,11 +1235,11 @@ export async function runEvalCommand( workersOverride: perFileWorkers, yamlWorkers: targetPrep.yamlWorkers, progressReporter, - seenEvalCases, + seenTestCases, displayIdTracker, selection, inlineTargetLabel, - evalCases: applicableEvalCases, + testCases: applicableTestCases, trialsConfig: targetPrep.trialsConfig, matrixMode: targetPrep.selections.length > 1, totalBudgetUsd: targetPrep.totalBudgetUsd, @@ -1254,9 +1253,9 @@ export async function runEvalCommand( // Mark all tests in this file as errors and continue with other files. const message = fileError instanceof Error ? fileError.message : String(fileError); console.error(`\n⚠ Eval file failed: ${path.basename(testFilePath)} — ${message}\n`); - const errorResults: EvaluationResult[] = applicableEvalCases.map((evalCase) => ({ + const errorResults: EvaluationResult[] = applicableTestCases.map((testCase) => ({ timestamp: new Date().toISOString(), - testId: evalCase.id, + testId: testCase.id, score: 0, assertions: [], output: [], diff --git a/apps/cli/src/commands/pipeline/bench.ts b/apps/cli/src/commands/pipeline/bench.ts index 58a86c271..ee355c5b2 100644 --- a/apps/cli/src/commands/pipeline/bench.ts +++ b/apps/cli/src/commands/pipeline/bench.ts @@ -37,15 +37,15 @@ export const evalBenchCommand = command({ const manifest = JSON.parse(await readFile(join(exportDir, 'manifest.json'), 'utf8')); const testIds: string[] = manifest.test_ids; const targetName: string = manifest.target?.name ?? 'unknown'; - const evalSet: string = manifest.dataset ?? ''; + const datasetName: string = manifest.dataset ?? ''; const experiment: string | undefined = manifest.experiment; - const safeEvalSet = evalSet ? 
evalSet.replace(/[\/\\:*?"<>|]/g, '_') : ''; + const safeDatasetName = datasetName ? datasetName.replace(/[\/\\:*?"<>|]/g, '_') : ''; const indexLines: string[] = []; const allPassRates: number[] = []; for (const testId of testIds) { - const subpath = safeEvalSet ? [safeEvalSet, testId] : [testId]; + const subpath = safeDatasetName ? [safeDatasetName, testId] : [testId]; const testDir = join(exportDir, ...subpath); const artifactSubdir = subpath.join('/'); const evaluators: EvaluatorScore[] = []; @@ -177,7 +177,7 @@ export const evalBenchCommand = command({ JSON.stringify({ timestamp: manifest.timestamp, test_id: testId, - dataset: evalSet || undefined, + dataset: datasetName || undefined, experiment: experiment || undefined, score: Math.round(weightedScore * 1000) / 1000, target: targetName, diff --git a/apps/cli/src/commands/pipeline/grade.ts b/apps/cli/src/commands/pipeline/grade.ts index 80729f0b7..45faa8608 100644 --- a/apps/cli/src/commands/pipeline/grade.ts +++ b/apps/cli/src/commands/pipeline/grade.ts @@ -10,7 +10,7 @@ * Progress is printed to stderr so users see real-time feedback. * * Export directory additions: - * ///code_grader_results/.json + * ///code_grader_results/.json */ import { mkdir, readFile, readdir, writeFile } from 'node:fs/promises'; import { join } from 'node:path'; @@ -196,14 +196,14 @@ export const evalGradeCommand = command({ const manifestPath = join(exportDir, 'manifest.json'); const manifest = JSON.parse(await readFile(manifestPath, 'utf8')); const testIds: string[] = manifest.test_ids; - const evalSet: string = manifest.dataset ?? ''; - const safeEvalSet = evalSet ? evalSet.replace(/[\/\\:*?"<>|]/g, '_') : ''; + const datasetName: string = manifest.dataset ?? ''; + const safeDatasetName = datasetName ? datasetName.replace(/[\/\\:*?"<>|]/g, '_') : ''; // Collect all grader tasks upfront so we know the total count const tasks: GraderTask[] = []; for (const testId of testIds) { - const subpath = safeEvalSet ? 
[safeEvalSet, testId] : [testId]; + const subpath = safeDatasetName ? [safeDatasetName, testId] : [testId]; const testDir = join(exportDir, ...subpath); const codeGradersDir = join(testDir, 'code_graders'); const resultsDir = join(testDir, 'code_grader_results'); diff --git a/apps/cli/src/commands/pipeline/input.ts b/apps/cli/src/commands/pipeline/input.ts index ef53fe5e0..28b43b391 100644 --- a/apps/cli/src/commands/pipeline/input.ts +++ b/apps/cli/src/commands/pipeline/input.ts @@ -9,7 +9,7 @@ * Export directory layout: * / * ├── manifest.json - * └── / (omitted if eval.yaml has no name) + * └── / (omitted if eval.yaml has no name) * └── / * ├── input.json * ├── invoke.json @@ -58,8 +58,8 @@ export const evalInputCommand = command({ const evalDir = dirname(resolvedEvalPath); const category = deriveCategory(relative(process.cwd(), resolvedEvalPath)); - const suite = await loadTestSuite(resolvedEvalPath, repoRoot, { category }); - const tests = suite.tests; + const dataset = await loadTestSuite(resolvedEvalPath, repoRoot, { category }); + const tests = dataset.tests; if (tests.length === 0) { console.error('No tests found in eval file.'); @@ -107,13 +107,13 @@ export const evalInputCommand = command({ // No targets file found — subagent-as-target mode } - const evalSetName = suite.metadata?.name?.trim() ?? ''; - const safeEvalSet = evalSetName ? evalSetName.replace(/[\/\\:*?"<>|]/g, '_') : ''; + const datasetName = dataset.metadata?.name?.trim() ?? ''; + const safeDatasetName = datasetName ? datasetName.replace(/[\/\\:*?"<>|]/g, '_') : ''; const testIds: string[] = []; for (const test of tests) { - const subpath = safeEvalSet ? [safeEvalSet, test.id] : [test.id]; + const subpath = safeDatasetName ? 
[safeDatasetName, test.id] : [test.id]; const testDir = join(outDir, ...subpath); await mkdir(testDir, { recursive: true }); testIds.push(test.id); @@ -168,7 +168,7 @@ export const evalInputCommand = command({ // manifest.json await writeJson(join(outDir, 'manifest.json'), { eval_file: resolvedEvalPath, - dataset: evalSetName || undefined, + dataset: datasetName || undefined, experiment: experiment || undefined, timestamp: new Date().toISOString(), target: { diff --git a/apps/cli/src/commands/pipeline/run.ts b/apps/cli/src/commands/pipeline/run.ts index be062a4c7..372bfd04f 100644 --- a/apps/cli/src/commands/pipeline/run.ts +++ b/apps/cli/src/commands/pipeline/run.ts @@ -100,8 +100,8 @@ export const evalRunCommand = command({ // ── Step 1: Extract inputs (same as pipeline input) ────────────── const category = deriveCategory(relative(process.cwd(), resolvedEvalPath)); - const suite = await loadTestSuite(resolvedEvalPath, repoRoot, { category }); - const tests = suite.tests; + const dataset = await loadTestSuite(resolvedEvalPath, repoRoot, { category }); + const tests = dataset.tests; if (tests.length === 0) { console.error('No tests found in eval file.'); @@ -145,13 +145,13 @@ export const evalRunCommand = command({ // No targets file — subagent-as-target mode } - const evalSetName = suite.metadata?.name?.trim() ?? ''; - const safeEvalSet = evalSetName ? evalSetName.replace(/[\/\\:*?"<>|]/g, '_') : ''; + const datasetName = dataset.metadata?.name?.trim() ?? ''; + const safeDatasetName = datasetName ? datasetName.replace(/[\/\\:*?"<>|]/g, '_') : ''; const testIds: string[] = []; for (const test of tests) { - const subpath = safeEvalSet ? [safeEvalSet, test.id] : [test.id]; + const subpath = safeDatasetName ? 
[safeDatasetName, test.id] : [test.id]; const testDir = join(outDir, ...subpath); await mkdir(testDir, { recursive: true }); testIds.push(test.id); @@ -198,7 +198,7 @@ export const evalRunCommand = command({ await writeJson(join(outDir, 'manifest.json'), { eval_file: resolvedEvalPath, - dataset: evalSetName || undefined, + dataset: datasetName || undefined, experiment: experiment || undefined, timestamp: new Date().toISOString(), target: { name: targetName, kind: targetKind }, @@ -230,7 +230,7 @@ export const evalRunCommand = command({ writeInvProgress(); const invokeTarget = async (testId: string): Promise => { - const subpath = safeEvalSet ? [safeEvalSet, testId] : [testId]; + const subpath = safeDatasetName ? [safeDatasetName, testId] : [testId]; const testDir = join(outDir, ...subpath); const invoke = JSON.parse(await readFile(join(testDir, 'invoke.json'), 'utf8')); if (invoke.kind !== 'cli') return; @@ -341,7 +341,7 @@ export const evalRunCommand = command({ const graderTasks: GraderTask[] = []; for (const testId of testIds) { - const subpath = safeEvalSet ? [safeEvalSet, testId] : [testId]; + const subpath = safeDatasetName ? [safeDatasetName, testId] : [testId]; const testDir = join(outDir, ...subpath); const codeGradersDir = join(testDir, 'code_graders'); const resultsDir = join(testDir, 'code_grader_results'); diff --git a/apps/cli/src/commands/results/export.ts b/apps/cli/src/commands/results/export.ts index 8a73eabef..b03a649af 100644 --- a/apps/cli/src/commands/results/export.ts +++ b/apps/cli/src/commands/results/export.ts @@ -21,11 +21,16 @@ * - To add new per-test workspace files, add them under each test directory. 
*/ +import { existsSync } from 'node:fs'; +import { readFile } from 'node:fs/promises'; import path from 'node:path'; + import { command, option, optional, positional, string } from 'cmd-ts'; +import type { EvaluationResult } from '@agentv/core'; + import { parseJsonlResults, writeArtifactsFromResults } from '../eval/artifact-writer.js'; -import { loadResults as loadSharedResults, patchTestIds, resolveSourceFile } from './shared.js'; +import { loadResults as loadSharedResults, resolveSourceFile } from './shared.js'; // ── Export logic ───────────────────────────────────────────────────────── @@ -40,7 +45,7 @@ export async function exportResults( throw new Error(`No results found in ${sourceFile}`); } - await writeArtifactsFromResults(patchTestIds(results), outputDir, { + await writeArtifactsFromResults(results, outputDir, { evalFile: sourceFile, }); } @@ -48,23 +53,54 @@ export async function exportResults( // ── Helpers ────────────────────────────────────────────────────────────── /** - * Derive the default output directory from a JSONL source path. - * Handles both directory-per-run manifests (/index.jsonl) and legacy flat files. + * Derive the default output directory from a run manifest path. */ -function deriveOutputDir(cwd: string, sourceFile: string): string { - const parentDir = path.basename(path.dirname(sourceFile)); - // Directory-per-run: parent is the timestamp dir (or legacy eval_ dir) - if (/^\d{4}-\d{2}-\d{2}T/.test(parentDir)) { - return path.join(cwd, '.agentv', 'results', 'export', parentDir); +export function deriveOutputDir(cwd: string, sourceFile: string): string { + const baseName = path.basename(sourceFile); + if (baseName !== 'index.jsonl') { + const stem = path.basename(sourceFile, path.extname(sourceFile)); + return path.join( + cwd, + '.agentv', + 'results', + 'export', + stem.startsWith('eval_') ? 
stem.slice(5) : stem, + ); } + + const parentDir = path.basename(path.dirname(sourceFile)); if (parentDir.startsWith('eval_')) { - // Legacy eval_ prefix: strip it return path.join(cwd, '.agentv', 'results', 'export', parentDir.slice(5)); } - // Legacy flat file: extract timestamp from filename - const basename = path.basename(sourceFile, '.jsonl'); - const dirName = basename.startsWith('eval_') ? basename.slice(5) : basename; - return path.join(cwd, '.agentv', 'results', 'export', dirName); + return path.join(cwd, '.agentv', 'results', 'export', parentDir); +} + +export async function loadExportSource( + source: string | undefined, + cwd: string, +): Promise<{ sourceFile: string; results: readonly EvaluationResult[] }> { + try { + const { sourceFile } = await resolveSourceFile(source, cwd); + const { results } = await loadSharedResults(source, cwd); + return { sourceFile, results }; + } catch (error) { + if (!source) { + throw error; + } + + const explicitSource = path.isAbsolute(source) ? source : path.resolve(cwd, source); + if (!existsSync(explicitSource) || path.extname(explicitSource) !== '.jsonl') { + throw error; + } + + const content = await readFile(explicitSource, 'utf8'); + const results = parseJsonlResults(content); + if (results.length === 0) { + throw new Error(`No results found in ${explicitSource}`); + } + + return { sourceFile: explicitSource, results }; + } } // ── CLI command ────────────────────────────────────────────────────────── @@ -76,7 +112,8 @@ export const resultsExportCommand = command({ source: positional({ type: optional(string), displayName: 'source', - description: 'JSONL result file to export (defaults to most recent in .agentv/results/)', + description: + 'Run workspace directory or index.jsonl manifest to export (defaults to most recent in .agentv/results/runs/)', }), out: option({ type: optional(string), @@ -95,8 +132,7 @@ export const resultsExportCommand = command({ const cwd = dir ?? 
process.cwd(); try { - const { sourceFile } = await resolveSourceFile(source, cwd); - const { results } = await loadSharedResults(source, cwd); + const { sourceFile, results } = await loadExportSource(source, cwd); const outputDir = out ? path.isAbsolute(out) @@ -111,9 +147,7 @@ export const resultsExportCommand = command({ // Report exported test IDs console.log(`Exported ${results.length} test(s) to ${outputDir}`); for (const result of results) { - const id = - result.testId ?? (result as unknown as Record).evalId ?? 'unknown'; - console.log(` ${id}`); + console.log(` ${result.testId ?? 'unknown'}`); } } catch (error) { console.error(`Error: ${(error as Error).message}`); diff --git a/apps/cli/src/commands/results/manifest.ts b/apps/cli/src/commands/results/manifest.ts index fb3b4e7a4..cffb4760a 100644 --- a/apps/cli/src/commands/results/manifest.ts +++ b/apps/cli/src/commands/results/manifest.ts @@ -4,13 +4,15 @@ import path from 'node:path'; import type { EvaluationResult } from '@agentv/core'; import type { GradingArtifact, TimingArtifact } from '../eval/artifact-writer.js'; -import { parseJsonlResults } from '../eval/artifact-writer.js'; -import { RESULT_INDEX_FILENAME, resolveWorkspaceOrFilePath } from '../eval/result-layout.js'; +import { + RESULT_INDEX_FILENAME, + isDirectoryPath, + resolveRunManifestPath, +} from '../eval/result-layout.js'; export interface ResultManifestRecord { readonly timestamp?: string; readonly test_id?: string; - readonly eval_id?: string; readonly dataset?: string; readonly category?: string; readonly experiment?: string; @@ -41,10 +43,6 @@ function parseJsonlLines(content: string): T[] { .map((line) => JSON.parse(line) as T); } -function isIndexManifestPath(sourceFile: string): boolean { - return path.basename(sourceFile) === RESULT_INDEX_FILENAME; -} - function parseMarkdownMessages(content: string): { role: string; content: string }[] { const trimmed = content.trim(); if (!trimmed.startsWith('@[')) { @@ -120,7 +118,7 @@ 
function hydrateOutput( function hydrateManifestRecord(baseDir: string, record: ResultManifestRecord): EvaluationResult { const grading = readOptionalJson(baseDir, record.grading_path); const timing = readOptionalJson(baseDir, record.timing_path); - const testId = record.test_id ?? record.eval_id ?? 'unknown'; + const testId = record.test_id ?? 'unknown'; return { timestamp: record.timestamp, @@ -175,16 +173,14 @@ export function parseResultManifest(content: string): ResultManifestRecord[] { export function resolveResultSourcePath(source: string, cwd?: string): string { const resolved = path.isAbsolute(source) ? source : path.resolve(cwd ?? process.cwd(), source); - return resolveWorkspaceOrFilePath(resolved); + if (isDirectoryPath(resolved) || path.basename(resolved) === RESULT_INDEX_FILENAME) { + return resolveRunManifestPath(resolved); + } + return resolved; } export function loadManifestResults(sourceFile: string): EvaluationResult[] { - const resolvedSourceFile = resolveWorkspaceOrFilePath(sourceFile); - - if (!isIndexManifestPath(resolvedSourceFile)) { - return parseJsonlResults(readFileSync(resolvedSourceFile, 'utf8')); - } - + const resolvedSourceFile = resolveRunManifestPath(sourceFile); const content = readFileSync(resolvedSourceFile, 'utf8'); const records = parseResultManifest(content); const baseDir = path.dirname(resolvedSourceFile); @@ -193,7 +189,6 @@ export function loadManifestResults(sourceFile: string): EvaluationResult[] { export interface LightweightResultRecord { readonly testId: string; - readonly dataset?: string; readonly target?: string; readonly experiment?: string; readonly score: number; @@ -204,64 +199,16 @@ export interface LightweightResultRecord { } export function loadLightweightResults(sourceFile: string): LightweightResultRecord[] { - const resolvedSourceFile = resolveWorkspaceOrFilePath(sourceFile); + const resolvedSourceFile = resolveRunManifestPath(sourceFile); const content = readFileSync(resolvedSourceFile, 'utf8'); - - if 
(isIndexManifestPath(resolvedSourceFile)) { - return parseResultManifest(content).map((record) => ({ - testId: record.test_id ?? record.eval_id ?? 'unknown', - dataset: record.dataset, - target: record.target, - experiment: record.experiment, - score: record.score, - scores: record.scores, - executionStatus: record.execution_status, - error: record.error, - timestamp: record.timestamp, - })); - } - - const records: LightweightResultRecord[] = []; - for (const line of content.split(/\r?\n/)) { - const trimmed = line.trim(); - if (!trimmed) { - continue; - } - - let record: Record; - try { - record = JSON.parse(trimmed) as Record; - } catch { - continue; - } - - const rawTestId = record.test_id ?? record.eval_id ?? record.testId ?? record.evalId; - if (typeof rawTestId !== 'string') { - throw new Error(`Missing test_id in result: ${trimmed}`); - } - - if (typeof record.score !== 'number') { - throw new Error(`Missing or invalid score in result: ${trimmed}`); - } - - records.push({ - testId: rawTestId, - dataset: typeof record.dataset === 'string' ? record.dataset : undefined, - target: typeof record.target === 'string' ? record.target : undefined, - score: record.score, - scores: Array.isArray(record.scores) - ? (record.scores as readonly Record[]) - : undefined, - executionStatus: - typeof record.execution_status === 'string' - ? record.execution_status - : typeof record.executionStatus === 'string' - ? record.executionStatus - : undefined, - error: typeof record.error === 'string' ? record.error : undefined, - timestamp: typeof record.timestamp === 'string' ? record.timestamp : undefined, - }); - } - - return records; + return parseResultManifest(content).map((record) => ({ + testId: record.test_id ?? 
'unknown', + target: record.target, + experiment: record.experiment, + score: record.score, + scores: record.scores, + executionStatus: record.execution_status, + error: record.error, + timestamp: record.timestamp, + })); } diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index 555a3a440..671a5aaa6 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -7,8 +7,8 @@ * * API endpoints: * - GET / — Studio SPA (React app) - * - GET /api/runs — list available result files with metadata - * - GET /api/runs/:filename — load results from a specific run file + * - GET /api/runs — list available run workspaces with metadata + * - GET /api/runs/:filename — load results from a specific run workspace * - GET /api/feedback — read feedback reviews * - POST /api/feedback — write feedback reviews * - GET /api/projects — list registered projects @@ -20,7 +20,7 @@ * how searchDir is resolved. * * Exported functions (for testing): - * - resolveSourceFile(source, cwd) — resolves JSONL path + * - resolveSourceFile(source, cwd) — resolves a run manifest path * - loadResults(content) — parses JSONL into EvaluationResult[] * - createApp(results, cwd) — Hono app factory */ @@ -43,6 +43,7 @@ import type { Context } from 'hono'; import { Hono } from 'hono'; import { parseJsonlResults } from '../eval/artifact-writer.js'; +import { resolveRunManifestPath } from '../eval/result-layout.js'; import { loadRunCache, resolveRunCacheFile } from '../eval/run-cache.js'; import { listResultFiles } from '../trace/utils.js'; import { @@ -51,21 +52,21 @@ import { parseResultManifest, resolveResultSourcePath, } from './manifest.js'; -import { patchTestIds } from './shared.js'; import { type StudioConfig, loadStudioConfig, saveStudioConfig } from './studio-config.js'; // ── Source resolution ──────────────────────────────────────────────────── /** - * Resolve the JSONL result file path from an explicit source, run cache, 
- * or directory scan. Throws if no file can be found. + * Resolve a run manifest path from an explicit source, run cache, + * or directory scan. Throws if no run workspace can be found. */ export async function resolveSourceFile(source: string | undefined, cwd: string): Promise { if (source) { - const resolved = resolveResultSourcePath(source, cwd); + let resolved = resolveResultSourcePath(source, cwd); if (!existsSync(resolved)) { throw new Error(`Source file not found: ${resolved}`); } + resolved = resolveRunManifestPath(resolved); return resolved; } @@ -79,11 +80,11 @@ export async function resolveSourceFile(source: string | undefined, cwd: string) const metas = listResultFiles(cwd, 10); if (metas.length === 0) { throw new Error( - 'No result files found in .agentv/results/\nRun an evaluation first: agentv eval ', + 'No run workspaces found in .agentv/results/runs/\nRun an evaluation first: agentv eval ', ); } if (metas.length > 1) { - console.log('Available result files:'); + console.log('Available run workspaces:'); for (const m of metas) { console.log(` ${m.path}`); } @@ -95,8 +96,7 @@ export async function resolveSourceFile(source: string | undefined, cwd: string) // ── JSONL parsing ──────────────────────────────────────────────────────── /** - * Parse JSONL content into EvaluationResult[], with backward-compat - * patching of eval_id → testId. + * Parse JSONL content into EvaluationResult[]. 
*/ export function loadResults(content: string): EvaluationResult[] { const results = parseJsonlResults(content); @@ -104,12 +104,7 @@ export function loadResults(content: string): EvaluationResult[] { throw new Error('No valid results found in JSONL content'); } - return results.map((r) => { - if (!r.testId && (r as unknown as Record).evalId) { - return { ...r, testId: String((r as unknown as Record).evalId) }; - } - return r; - }); + return results; } // ── Feedback persistence ───────────────────────────────────────────────── @@ -273,7 +268,7 @@ function handleRunDetail(c: C, { searchDir }: DataContext) { const meta = listResultFiles(searchDir).find((m) => m.filename === filename); if (!meta) return c.json({ error: 'Run not found' }, 404); try { - const loaded = patchTestIds(loadManifestResults(meta.path)); + const loaded = loadManifestResults(meta.path); return c.json({ results: stripHeavyFields(loaded), source: meta.filename }); } catch { return c.json({ error: 'Failed to load run' }, 500); @@ -285,7 +280,7 @@ function handleRunDatasets(c: C, { searchDir, agentvDir }: DataContext) { const meta = listResultFiles(searchDir).find((m) => m.filename === filename); if (!meta) return c.json({ error: 'Run not found' }, 404); try { - const loaded = patchTestIds(loadManifestResults(meta.path)); + const loaded = loadManifestResults(meta.path); const { pass_threshold } = loadStudioConfig(agentvDir); const datasetMap = new Map(); for (const r of loaded) { @@ -314,7 +309,7 @@ function handleRunCategories(c: C, { searchDir, agentvDir }: DataContext) { const meta = listResultFiles(searchDir).find((m) => m.filename === filename); if (!meta) return c.json({ error: 'Run not found' }, 404); try { - const loaded = patchTestIds(loadManifestResults(meta.path)); + const loaded = loadManifestResults(meta.path); const { pass_threshold } = loadStudioConfig(agentvDir); const categoryMap = new Map< string, @@ -354,7 +349,7 @@ function handleCategoryDatasets(c: C, { searchDir, agentvDir }: 
DataContext) { const meta = listResultFiles(searchDir).find((m) => m.filename === filename); if (!meta) return c.json({ error: 'Run not found' }, 404); try { - const loaded = patchTestIds(loadManifestResults(meta.path)); + const loaded = loadManifestResults(meta.path); const { pass_threshold } = loadStudioConfig(agentvDir); const filtered = loaded.filter((r) => (r.category ?? DEFAULT_CATEGORY) === category); const datasetMap = new Map(); @@ -385,7 +380,7 @@ function handleEvalDetail(c: C, { searchDir }: DataContext) { const meta = listResultFiles(searchDir).find((m) => m.filename === filename); if (!meta) return c.json({ error: 'Run not found' }, 404); try { - const loaded = patchTestIds(loadManifestResults(meta.path)); + const loaded = loadManifestResults(meta.path); const result = loaded.find((r) => r.testId === evalId); if (!result) return c.json({ error: 'Eval not found' }, 404); return c.json({ eval: result }); @@ -854,7 +849,7 @@ export function createApp( const entries = metas.map((m) => { let totalCostUsd = 0; try { - const loaded = patchTestIds(loadManifestResults(m.path)); + const loaded = loadManifestResults(m.path); totalCostUsd = loaded.reduce((sum, r) => sum + (r.costUsd ?? 
0), 0); } catch { // ignore load errors for aggregate @@ -986,7 +981,8 @@ export const resultsServeCommand = command({ source: positional({ type: optional(string), displayName: 'source', - description: 'JSONL result file to serve (defaults to most recent in .agentv/results/)', + description: + 'Run workspace directory or index.jsonl manifest to serve (defaults to most recent in .agentv/results/runs/)', }), port: option({ type: optional(number), @@ -1078,19 +1074,19 @@ export const resultsServeCommand = command({ process.exit(1); } sourceFile = resolved; - results = patchTestIds(loadManifestResults(resolved)); + results = loadManifestResults(resolved); } else { // Auto-discover: run cache -> directory scan -> empty state const cache = await loadRunCache(cwd); const cachedFile = cache ? resolveRunCacheFile(cache) : ''; if (cachedFile && existsSync(cachedFile)) { sourceFile = cachedFile; - results = patchTestIds(loadManifestResults(cachedFile)); + results = loadManifestResults(cachedFile); } else { const metas = listResultFiles(cwd, 1); if (metas.length > 0) { sourceFile = metas[0].path; - results = patchTestIds(loadManifestResults(metas[0].path)); + results = loadManifestResults(metas[0].path); } // If no metas, results stays empty — dashboard shows welcome state } diff --git a/apps/cli/src/commands/results/shared.ts b/apps/cli/src/commands/results/shared.ts index c70267318..874982266 100644 --- a/apps/cli/src/commands/results/shared.ts +++ b/apps/cli/src/commands/results/shared.ts @@ -2,8 +2,7 @@ * Shared utilities for `agentv results` subcommands. 
* * Provides: - * - resolveSourceFile() — find an index/results manifest from explicit path or auto-discover latest - * - patchTestIds() — backward-compat eval_id -> test_id patching + * - resolveSourceFile() — find an index manifest from explicit path or auto-discover latest * - sourceArg — cmd-ts positional for optional result source path * * How to extend: @@ -14,6 +13,7 @@ import { existsSync } from 'node:fs'; import { optional, positional, string } from 'cmd-ts'; import type { EvaluationResult } from '@agentv/core'; +import { resolveRunManifestPath } from '../eval/result-layout.js'; import { loadRunCache, resolveRunCacheFile } from '../eval/run-cache.js'; import { listResultFiles } from '../trace/utils.js'; import { loadManifestResults, resolveResultSourcePath } from './manifest.js'; @@ -22,7 +22,8 @@ import { loadManifestResults, resolveResultSourcePath } from './manifest.js'; export const sourceArg = positional({ type: optional(string), displayName: 'source', - description: 'Result file or workspace directory (defaults to most recent in .agentv/results/)', + description: + 'Run workspace directory or index.jsonl manifest (defaults to most recent in .agentv/results/runs/)', }); /** @@ -40,6 +41,7 @@ export async function resolveSourceFile( console.error(`Error: File not found: ${sourceFile}`); process.exit(1); } + sourceFile = resolveRunManifestPath(sourceFile); } else { const cache = await loadRunCache(cwd); const cachedFile = cache ? 
resolveRunCacheFile(cache) : ''; @@ -48,7 +50,7 @@ export async function resolveSourceFile( } else { const metas = listResultFiles(cwd, 1); if (metas.length === 0) { - console.error('Error: No result files found in .agentv/results/'); + console.error('Error: No run workspaces found in .agentv/results/runs/'); console.error('Run an evaluation first: agentv eval '); process.exit(1); } @@ -60,7 +62,7 @@ export async function resolveSourceFile( } /** - * Load and parse eval results from an index/results source file, with backward-compat patching. + * Load and parse eval results from a run workspace or index manifest. */ export async function loadResults( source: string | undefined, @@ -74,17 +76,5 @@ export async function loadResults( process.exit(1); } - return { results: patchTestIds(results), sourceFile }; -} - -/** - * Patch older JSONL records that used eval_id instead of test_id. - */ -export function patchTestIds(results: EvaluationResult[]): EvaluationResult[] { - return results.map((r) => { - if (!r.testId && (r as unknown as Record).evalId) { - return { ...r, testId: String((r as unknown as Record).evalId) }; - } - return r; - }); + return { results, sourceFile }; } diff --git a/apps/cli/src/commands/trace/list.ts b/apps/cli/src/commands/trace/list.ts index a923013bf..42bea2b72 100644 --- a/apps/cli/src/commands/trace/list.ts +++ b/apps/cli/src/commands/trace/list.ts @@ -14,13 +14,13 @@ function formatListTable(metas: ResultFileMeta[]): string { const lines: string[] = []; if (metas.length === 0) { - lines.push(`${c.yellow}No result files found in .agentv/results/${c.reset}`); + lines.push(`${c.yellow}No run workspaces found in .agentv/results/runs/${c.reset}`); lines.push(`${c.dim}Run an evaluation first: agentv run ${c.reset}`); return lines.join('\n'); } lines.push(''); - lines.push(`${c.bold}Evaluation Results${c.reset} ${c.dim}(.agentv/results/)${c.reset}`); + lines.push(`${c.bold}Evaluation Runs${c.reset} ${c.dim}(.agentv/results/runs/)${c.reset}`); 
lines.push(''); // Column widths @@ -42,7 +42,9 @@ function formatListTable(metas: ResultFileMeta[]): string { } lines.push(''); - lines.push(`${c.dim}${metas.length} result file${metas.length !== 1 ? 's' : ''} found${c.reset}`); + lines.push( + `${c.dim}${metas.length} run workspace${metas.length !== 1 ? 's' : ''} found${c.reset}`, + ); lines.push(''); return lines.join('\n'); @@ -50,7 +52,7 @@ function formatListTable(metas: ResultFileMeta[]): string { export const traceListCommand = command({ name: 'list', - description: 'List recent evaluation result files from .agentv/results/', + description: 'List recent evaluation run workspaces from .agentv/results/runs/', args: { limit: option({ type: optional(number), diff --git a/apps/cli/src/commands/trace/score.ts b/apps/cli/src/commands/trace/score.ts index cf425f3a1..da986096c 100644 --- a/apps/cli/src/commands/trace/score.ts +++ b/apps/cli/src/commands/trace/score.ts @@ -144,7 +144,7 @@ function extractCandidate(raw: RawResult): string { * Only used to satisfy the EvaluationContext interface — deterministic and * trace-based evaluators don't access these fields. */ -function buildEvalTest(raw: RawResult): EvalTest { +function buildTestCase(raw: RawResult): EvalTest { return { id: raw.test_id ?? 'unknown', question: '', @@ -210,7 +210,7 @@ async function runScore( const output = raw.output as readonly Message[] | undefined; const evalContext: EvaluationContext = { - evalCase: buildEvalTest(raw), + evalCase: buildTestCase(raw), candidate, target: { kind: 'custom' as const, name: raw.target ?? 
'unknown', config: {} } as never, provider: stubProvider, @@ -295,7 +295,7 @@ function renderTable(scored: ScoreResult[], assertSpec: string): string { export const traceScoreCommand = command({ name: 'score', - description: 'Run evaluators against existing result files post-hoc', + description: 'Run evaluators against existing trace sources post-hoc', args: { file: positional({ type: string, diff --git a/apps/cli/src/commands/trace/utils.ts b/apps/cli/src/commands/trace/utils.ts index 7baec2dc5..45d865ed6 100644 --- a/apps/cli/src/commands/trace/utils.ts +++ b/apps/cli/src/commands/trace/utils.ts @@ -104,7 +104,7 @@ export interface RawTraceSpan { * * Supported sources: * - Run workspace directories / index.jsonl manifests - * - Legacy simple trace JSONL files + * - Standalone trace JSONL files for trace-only workflows * - OTLP JSON trace files written via --otel-file */ export function loadResultFile(filePath: string): RawResult[] { @@ -518,7 +518,7 @@ export function toTraceSummary(result: RawResult): TraceSummary | undefined { } /** - * Metadata about a result file for listing. + * Metadata about a discovered run manifest for listing. */ export interface ResultFileMeta { path: string; @@ -531,62 +531,33 @@ export interface ResultFileMeta { } /** - * Enumerate result files in the .agentv/results/ directory. - * Scans runs/ for both directory-per-run layouts (index.jsonl preferred inside subdirs) - * and legacy flat .jsonl files. Also scans the base directory for pre-runs/ files. + * Enumerate canonical run manifests in `.agentv/results/runs/`. */ export function listResultFiles(cwd: string, limit?: number): ResultFileMeta[] { - const baseDir = path.join(cwd, '.agentv', 'results'); - const runsDir = path.join(baseDir, RESULT_RUNS_DIRNAME); + const runsDir = path.join(cwd, '.agentv', 'results', RESULT_RUNS_DIRNAME); const files: { filePath: string; displayName: string }[] = []; - // Scan runs/ for both directory-based runs and flat JSONL files. 
- // Process directories first so they take priority in dedup over flat files. try { const entries = readdirSync(runsDir, { withFileTypes: true }); for (const entry of entries) { - if (entry.isDirectory()) { - const primaryPath = resolveExistingRunPrimaryPath(path.join(runsDir, entry.name)); - if (primaryPath) { - files.push({ filePath: primaryPath, displayName: entry.name }); - } + if (!entry.isDirectory()) { + continue; } - } - for (const entry of entries) { - if (!entry.isDirectory() && entry.name.endsWith('.jsonl')) { - files.push({ filePath: path.join(runsDir, entry.name), displayName: entry.name }); + + const primaryPath = resolveExistingRunPrimaryPath(path.join(runsDir, entry.name)); + if (primaryPath) { + files.push({ filePath: primaryPath, displayName: entry.name }); } } } catch { // runs/ doesn't exist yet } - // Also scan base directory for legacy files (backward compat) - try { - const entries = readdirSync(baseDir).filter((f) => f.endsWith('.jsonl')); - for (const entry of entries) { - files.push({ filePath: path.join(baseDir, entry), displayName: entry }); - } - } catch { - // Base directory doesn't exist yet - } - - // Deduplicate by normalized name (strip .jsonl so dir "eval_X" matches file "eval_X.jsonl") - const seen = new Set(); - const uniqueFiles: { filePath: string; displayName: string }[] = []; - for (const file of files) { - const key = file.displayName.replace(/\.jsonl$/, ''); - if (!seen.has(key)) { - seen.add(key); - uniqueFiles.push(file); - } - } - // Sort by display name descending (most recent first) - uniqueFiles.sort((a, b) => b.displayName.localeCompare(a.displayName)); + files.sort((a, b) => b.displayName.localeCompare(a.displayName)); - const limited = limit !== undefined && limit > 0 ? uniqueFiles.slice(0, limit) : uniqueFiles; + const limited = limit !== undefined && limit > 0 ? 
files.slice(0, limit) : files; const metas: ResultFileMeta[] = []; diff --git a/apps/cli/test/commands/compare/compare.test.ts b/apps/cli/test/commands/compare/compare.test.ts index 2ffdef178..e548d300d 100644 --- a/apps/cli/test/commands/compare/compare.test.ts +++ b/apps/cli/test/commands/compare/compare.test.ts @@ -56,6 +56,21 @@ describe('compare command', () => { ]); }); + it('should load flat JSONL files with camelCase testId results', () => { + const filePath = path.join(tempDir, 'results.jsonl'); + writeFileSync( + filePath, + '{"testId": "case-1", "score": 0.8}\n{"testId": "case-2", "score": 0.9}\n', + ); + + const results = loadJsonlResults(filePath); + + expect(results).toEqual([ + { testId: 'case-1', score: 0.8 }, + { testId: 'case-2', score: 0.9 }, + ]); + }); + it('should handle empty lines in JSONL', () => { const filePath = path.join(tempDir, 'results.jsonl'); writeFileSync( diff --git a/apps/cli/test/commands/eval/run-cache.test.ts b/apps/cli/test/commands/eval/run-cache.test.ts index ff2c852c6..c2ee4f7f6 100644 --- a/apps/cli/test/commands/eval/run-cache.test.ts +++ b/apps/cli/test/commands/eval/run-cache.test.ts @@ -5,27 +5,29 @@ import { type RunCache, resolveRunCacheFile } from '../../../src/commands/eval/r describe('resolveRunCacheFile', () => { it('should resolve new directory-based cache to index.jsonl inside dir', () => { - const cache: RunCache = { lastRunDir: '/results/runs/eval_2026-03-24', timestamp: '' }; + const cache: RunCache = { lastRunDir: '/results/runs/2026-03-24T00-00-00-000Z', timestamp: '' }; expect(resolveRunCacheFile(cache)).toBe( - path.join('/results/runs/eval_2026-03-24', 'index.jsonl'), + path.join('/results/runs/2026-03-24T00-00-00-000Z', 'index.jsonl'), ); }); - it('should resolve legacy file-based cache to lastResultFile', () => { + it('ignores legacy file-based cache entries', () => { const cache: RunCache = { lastResultFile: '/results/runs/eval_2026-03-24.jsonl', timestamp: '', }; - 
expect(resolveRunCacheFile(cache)).toBe('/results/runs/eval_2026-03-24.jsonl'); + expect(resolveRunCacheFile(cache)).toBe(''); }); it('should prefer lastRunDir over lastResultFile when both present', () => { const cache: RunCache = { - lastRunDir: '/results/runs/eval_dir', + lastRunDir: '/results/runs/2026-03-24T00-00-00-000Z', lastResultFile: '/results/runs/eval_old.jsonl', timestamp: '', }; - expect(resolveRunCacheFile(cache)).toBe(path.join('/results/runs/eval_dir', 'index.jsonl')); + expect(resolveRunCacheFile(cache)).toBe( + path.join('/results/runs/2026-03-24T00-00-00-000Z', 'index.jsonl'), + ); }); it('should return empty string when neither field is set', () => { diff --git a/apps/cli/test/commands/results/export.test.ts b/apps/cli/test/commands/results/export.test.ts index 806ad6ae1..f6f8645ff 100644 --- a/apps/cli/test/commands/results/export.test.ts +++ b/apps/cli/test/commands/results/export.test.ts @@ -1,5 +1,5 @@ import { afterEach, beforeEach, describe, expect, it } from 'bun:test'; -import { existsSync, mkdtempSync, readFileSync, rmSync } from 'node:fs'; +import { existsSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs'; import { tmpdir } from 'node:os'; import path from 'node:path'; @@ -9,7 +9,11 @@ import type { IndexArtifactEntry, TimingArtifact, } from '../../../src/commands/eval/artifact-writer.js'; -import { exportResults } from '../../../src/commands/results/export.js'; +import { + deriveOutputDir, + exportResults, + loadExportSource, +} from '../../../src/commands/results/export.js'; // ── Sample JSONL content (snake_case, matching on-disk format) ────────── @@ -114,6 +118,25 @@ describe('results export', () => { rmSync(tempDir, { recursive: true, force: true }); }); + it('loadExportSource accepts explicit legacy flat JSONL files', async () => { + const sourceFile = path.join(tempDir, 'eval_2026-03-18.jsonl'); + writeFileSync( + sourceFile, + toJsonl({ ...RESULT_FULL, eval_id: 'legacy-id', test_id: undefined }), + ); + + 
const { sourceFile: loadedSource, results } = await loadExportSource(sourceFile, tempDir); + + expect(loadedSource).toBe(sourceFile); + expect(results).toHaveLength(1); + expect(results[0].testId).toBe('legacy-id'); + }); + + it('deriveOutputDir uses the source filename for flat JSONL inputs', () => { + const outputDir = deriveOutputDir(tempDir, path.join(tempDir, 'eval_2026-03-18.jsonl')); + expect(outputDir).toBe(path.join(tempDir, '.agentv', 'results', 'export', '2026-03-18')); + }); + it('should create benchmark.json matching artifact-writer schema', async () => { const outputDir = path.join(tempDir, 'output'); const content = toJsonl(RESULT_FULL, RESULT_PARTIAL); diff --git a/apps/cli/test/commands/results/serve.test.ts b/apps/cli/test/commands/results/serve.test.ts index 7bee162cd..2d7766622 100644 --- a/apps/cli/test/commands/results/serve.test.ts +++ b/apps/cli/test/commands/results/serve.test.ts @@ -58,6 +58,18 @@ describe('resolveSourceFile', () => { 'Source file not found', ); }); + + it('rejects legacy flat result files', async () => { + const tempDir = mkdtempSync(path.join(tmpdir(), 'agentv-serve-source-')); + const flatFile = path.join(tempDir, 'results.jsonl'); + writeFileSync(flatFile, toJsonl(RESULT_A)); + + await expect(resolveSourceFile(flatFile, tempDir)).rejects.toThrow( + 'Expected a run workspace directory or index.jsonl manifest', + ); + + rmSync(tempDir, { recursive: true, force: true }); + }); }); // ── loadResults ────────────────────────────────────────────────────────── @@ -327,8 +339,10 @@ describe('serve app', () => { it('loads results from an existing run file', async () => { const runsDir = path.join(tempDir, '.agentv', 'results', 'runs'); mkdirSync(runsDir, { recursive: true }); - const filename = 'eval_2026-03-25T10-00-00-000Z.jsonl'; - writeFileSync(path.join(runsDir, filename), toJsonl(RESULT_A, RESULT_B)); + const filename = '2026-03-25T10-00-00-000Z'; + const runDir = path.join(runsDir, filename); + mkdirSync(runDir, { 
recursive: true }); + writeFileSync(path.join(runDir, 'index.jsonl'), toJsonl(RESULT_A, RESULT_B)); const app = createApp([], tempDir, tempDir, undefined, { studioDir }); const res = await app.request(`/api/runs/${filename}`); diff --git a/apps/cli/test/commands/results/shared.test.ts b/apps/cli/test/commands/results/shared.test.ts index 6df36ebba..2cd110fc3 100644 --- a/apps/cli/test/commands/results/shared.test.ts +++ b/apps/cli/test/commands/results/shared.test.ts @@ -1,43 +1,63 @@ -import { describe, expect, it } from 'bun:test'; -import type { EvaluationResult } from '@agentv/core'; +import { afterEach, beforeEach, describe, expect, it } from 'bun:test'; +import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import path from 'node:path'; -import { patchTestIds } from '../../../src/commands/results/shared.js'; +import { resolveRunManifestPath } from '../../../src/commands/eval/result-layout.js'; +import { resolveSourceFile } from '../../../src/commands/results/shared.js'; -describe('patchTestIds', () => { - it('passes through results with testId', () => { - const results = [{ testId: 'test-1', score: 1 }] as unknown as EvaluationResult[]; - expect(patchTestIds(results)).toEqual(results); +describe('results shared source resolution', () => { + let tempDir: string; + + beforeEach(() => { + tempDir = mkdtempSync(path.join(tmpdir(), 'agentv-results-shared-')); }); - it('patches evalId to testId for backward compatibility', () => { - const results = [{ evalId: 'old-1', score: 1 }] as unknown as EvaluationResult[]; - const patched = patchTestIds(results); - expect(patched[0].testId).toBe('old-1'); + afterEach(() => { + rmSync(tempDir, { recursive: true, force: true }); }); - it('preserves all other fields when patching evalId', () => { - const results = [ - { evalId: 'old-1', score: 0.8, target: 'gpt-4o', timestamp: '2026-01-01' }, - ] as unknown as EvaluationResult[]; - const patched = patchTestIds(results); - 
expect(patched[0]).toEqual({ - evalId: 'old-1', - score: 0.8, - target: 'gpt-4o', - timestamp: '2026-01-01', - testId: 'old-1', - }); + it('resolves an explicit run workspace directory to index.jsonl', async () => { + const runDir = path.join(tempDir, '.agentv', 'results', 'runs', '2026-03-25T10-00-00-000Z'); + mkdirSync(runDir, { recursive: true }); + writeFileSync(path.join(runDir, 'index.jsonl'), '{"test_id":"t1","score":1}\n'); + + const resolved = await resolveSourceFile(runDir, tempDir); + + expect(resolved.sourceFile).toBe(path.join(runDir, 'index.jsonl')); }); - it('does not overwrite existing testId with evalId', () => { - const results = [ - { testId: 'test-1', evalId: 'old-1', score: 1 }, - ] as unknown as EvaluationResult[]; - const patched = patchTestIds(results); - expect(patched[0].testId).toBe('test-1'); + it('auto-discovers the most recent canonical run workspace', async () => { + const olderRunDir = path.join( + tempDir, + '.agentv', + 'results', + 'runs', + '2026-03-24T10-00-00-000Z', + ); + const newerRunDir = path.join( + tempDir, + '.agentv', + 'results', + 'runs', + '2026-03-25T10-00-00-000Z', + ); + mkdirSync(olderRunDir, { recursive: true }); + mkdirSync(newerRunDir, { recursive: true }); + writeFileSync(path.join(olderRunDir, 'index.jsonl'), '{"test_id":"old","score":1}\n'); + writeFileSync(path.join(newerRunDir, 'index.jsonl'), '{"test_id":"new","score":1}\n'); + + const resolved = await resolveSourceFile(undefined, tempDir); + + expect(resolved.sourceFile).toBe(path.join(newerRunDir, 'index.jsonl')); }); - it('handles empty array', () => { - expect(patchTestIds([])).toEqual([]); + it('rejects legacy flat result files as result sources', () => { + const flatFile = path.join(tempDir, 'results.jsonl'); + writeFileSync(flatFile, '{"test_id":"t1","score":1}\n'); + + expect(() => resolveRunManifestPath(flatFile)).toThrow( + 'Expected a run workspace directory or index.jsonl manifest', + ); }); }); diff --git 
a/apps/cli/test/commands/trace/trace.test.ts b/apps/cli/test/commands/trace/trace.test.ts index 9a7d3c939..b813711d4 100644 --- a/apps/cli/test/commands/trace/trace.test.ts +++ b/apps/cli/test/commands/trace/trace.test.ts @@ -256,102 +256,78 @@ describe('trace utils', () => { expect(metas).toEqual([]); }); - it('should enumerate JSONL files in .agentv/results/runs/', () => { + it('should enumerate run workspaces in .agentv/results/runs/', () => { const runsDir = path.join(tempDir, '.agentv', 'results', 'runs'); mkdirSync(runsDir, { recursive: true }); + const olderRunDir = path.join(runsDir, '2026-02-20T21-38-05-833Z'); + const newerRunDir = path.join(runsDir, '2026-02-21T10-00-00-000Z'); + mkdirSync(olderRunDir, { recursive: true }); + mkdirSync(newerRunDir, { recursive: true }); writeFileSync( - path.join(runsDir, 'eval_2026-02-20T21-38-05-833Z.jsonl'), + path.join(olderRunDir, 'index.jsonl'), `${RESULT_WITH_TRACE}\n${RESULT_WITHOUT_TRACE}\n`, ); - writeFileSync( - path.join(runsDir, 'eval_2026-02-21T10-00-00-000Z.jsonl'), - `${RESULT_FAILING}\n`, - ); + writeFileSync(path.join(newerRunDir, 'index.jsonl'), `${RESULT_FAILING}\n`); const metas = listResultFiles(tempDir); expect(metas).toHaveLength(2); // Most recent first - expect(metas[0].filename).toBe('eval_2026-02-21T10-00-00-000Z.jsonl'); + expect(metas[0].filename).toBe('2026-02-21T10-00-00-000Z'); expect(metas[0].testCount).toBe(1); expect(metas[0].passRate).toBe(0); - expect(metas[1].filename).toBe('eval_2026-02-20T21-38-05-833Z.jsonl'); + expect(metas[1].filename).toBe('2026-02-20T21-38-05-833Z'); expect(metas[1].testCount).toBe(2); expect(metas[1].passRate).toBe(0.5); }); - it('should find legacy files in .agentv/results/ (backward compat)', () => { - const resultsDir = path.join(tempDir, '.agentv', 'results'); - mkdirSync(resultsDir, { recursive: true }); - - writeFileSync( - path.join(resultsDir, 'eval_2026-02-20T21-38-05-833Z.jsonl'), - `${RESULT_WITH_TRACE}\n`, - ); - - const metas = 
listResultFiles(tempDir); - expect(metas).toHaveLength(1); - expect(metas[0].filename).toBe('eval_2026-02-20T21-38-05-833Z.jsonl'); - }); - - it('should deduplicate files preferring runs/ over legacy root', () => { + it('should ignore legacy flat result files in results roots', () => { const resultsDir = path.join(tempDir, '.agentv', 'results'); const runsDir = path.join(resultsDir, 'runs'); mkdirSync(runsDir, { recursive: true }); - - // Same filename in both locations - writeFileSync( - path.join(runsDir, 'eval_2026-02-20T21-38-05-833Z.jsonl'), - `${RESULT_WITH_TRACE}\n`, - ); writeFileSync( path.join(resultsDir, 'eval_2026-02-20T21-38-05-833Z.jsonl'), `${RESULT_WITH_TRACE}\n`, ); + writeFileSync(path.join(runsDir, '2026-02-21T10-00-00-000Z.jsonl'), `${RESULT_FAILING}\n`); const metas = listResultFiles(tempDir); - expect(metas).toHaveLength(1); - // Should prefer the runs/ version - expect(metas[0].path).toContain(path.join('runs', 'eval_2026-02-20T21-38-05-833Z.jsonl')); + + expect(metas).toEqual([]); }); it('should respect limit', () => { const runsDir = path.join(tempDir, '.agentv', 'results', 'runs'); mkdirSync(runsDir, { recursive: true }); - writeFileSync( - path.join(runsDir, 'eval_2026-02-20T21-38-05-833Z.jsonl'), - `${RESULT_WITH_TRACE}\n`, - ); - writeFileSync( - path.join(runsDir, 'eval_2026-02-21T10-00-00-000Z.jsonl'), - `${RESULT_FAILING}\n`, - ); + const olderRunDir = path.join(runsDir, '2026-02-20T21-38-05-833Z'); + const newerRunDir = path.join(runsDir, '2026-02-21T10-00-00-000Z'); + mkdirSync(olderRunDir, { recursive: true }); + mkdirSync(newerRunDir, { recursive: true }); + writeFileSync(path.join(olderRunDir, 'index.jsonl'), `${RESULT_WITH_TRACE}\n`); + writeFileSync(path.join(newerRunDir, 'index.jsonl'), `${RESULT_FAILING}\n`); const metas = listResultFiles(tempDir, 1); expect(metas).toHaveLength(1); - expect(metas[0].filename).toBe('eval_2026-02-21T10-00-00-000Z.jsonl'); + expect(metas[0].filename).toBe('2026-02-21T10-00-00-000Z'); }); - 
it('should ignore non-JSONL files', () => { + it('should ignore non-directory entries in runs/', () => { const runsDir = path.join(tempDir, '.agentv', 'results', 'runs'); mkdirSync(runsDir, { recursive: true }); writeFileSync(path.join(runsDir, 'notes.txt'), 'not a result file'); - writeFileSync( - path.join(runsDir, 'eval_2026-02-20T21-38-05-833Z.jsonl'), - `${RESULT_WITH_TRACE}\n`, - ); + writeFileSync(path.join(runsDir, '2026-02-20T21-38-05-833Z.jsonl'), `${RESULT_WITH_TRACE}\n`); const metas = listResultFiles(tempDir); - expect(metas).toHaveLength(1); + expect(metas).toHaveLength(0); }); it('should discover index.jsonl inside run directories in runs/', () => { const runsDir = path.join(tempDir, '.agentv', 'results', 'runs'); - const runDir = path.join(runsDir, 'eval_2026-02-20T21-38-05-833Z'); + const runDir = path.join(runsDir, '2026-02-20T21-38-05-833Z'); mkdirSync(runDir, { recursive: true }); writeFileSync( @@ -364,55 +340,12 @@ describe('trace utils', () => { expect(metas).toHaveLength(1); expect(metas[0].testCount).toBe(2); expect(metas[0].passRate).toBe(0.5); - expect(metas[0].filename).toBe('eval_2026-02-20T21-38-05-833Z'); - }); - - it('should list both directory-based and flat-file results together', () => { - const runsDir = path.join(tempDir, '.agentv', 'results', 'runs'); - mkdirSync(runsDir, { recursive: true }); - - // New directory-based run - const runDir = path.join(runsDir, 'eval_2026-02-21T10-00-00-000Z'); - mkdirSync(runDir, { recursive: true }); - writeFileSync(path.join(runDir, 'index.jsonl'), `${RESULT_FAILING}\n`); - - // Legacy flat file - writeFileSync( - path.join(runsDir, 'eval_2026-02-20T21-38-05-833Z.jsonl'), - `${RESULT_WITH_TRACE}\n`, - ); - - const metas = listResultFiles(tempDir); - expect(metas).toHaveLength(2); - // Most recent first - expect(metas[0].filename).toBe('eval_2026-02-21T10-00-00-000Z'); - expect(metas[1].filename).toBe('eval_2026-02-20T21-38-05-833Z.jsonl'); - }); - - it('should deduplicate directory and flat 
file with same timestamp', () => { - const runsDir = path.join(tempDir, '.agentv', 'results', 'runs'); - mkdirSync(runsDir, { recursive: true }); - - // Directory-based (preferred) - const runDir = path.join(runsDir, 'eval_2026-02-20T21-38-05-833Z'); - mkdirSync(runDir, { recursive: true }); - writeFileSync(path.join(runDir, 'index.jsonl'), `${RESULT_WITH_TRACE}\n`); - - // Flat file with same timestamp - writeFileSync( - path.join(runsDir, 'eval_2026-02-20T21-38-05-833Z.jsonl'), - `${RESULT_WITH_TRACE}\n`, - ); - - const metas = listResultFiles(tempDir); - expect(metas).toHaveLength(1); - // Prefer directory-based (scanned first) - expect(metas[0].filename).toBe('eval_2026-02-20T21-38-05-833Z'); + expect(metas[0].filename).toBe('2026-02-20T21-38-05-833Z'); }); it('should skip directories without index.jsonl', () => { const runsDir = path.join(tempDir, '.agentv', 'results', 'runs'); - const emptyDir = path.join(runsDir, 'eval_2026-02-20T21-38-05-833Z'); + const emptyDir = path.join(runsDir, '2026-02-20T21-38-05-833Z'); mkdirSync(emptyDir, { recursive: true }); // Directory exists but no manifest/result file inside diff --git a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx index 769b18641..5d94ed245 100644 --- a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx +++ b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx @@ -11,7 +11,7 @@ sidebar: agentv eval evals/my-eval.yaml ``` -Results are written to `.agentv/results/.jsonl`. Each line is a JSON object with one result per test case. +Results are written to `.agentv/results/runs//index.jsonl`. Each line is a JSON object with one result per test case, and the run workspace also stores the manifest and related artifacts. 
Each `scores[]` entry includes per-grader timing: @@ -218,10 +218,10 @@ Notes: Re-run only the tests that had infrastructure/execution errors from a previous output: ```bash -agentv eval evals/my-eval.yaml --retry-errors .agentv/results/eval_previous.jsonl +agentv eval evals/my-eval.yaml --retry-errors .agentv/results/runs/<timestamp>/index.jsonl ``` -This reads the previous JSONL, filters for `executionStatus === 'execution_error'`, and re-runs only those test cases. Non-error results from the previous run are preserved and merged into the new output. +This reads the previous run manifest, filters for `executionStatus === 'execution_error'`, and re-runs only those test cases. Non-error results from the previous run are preserved and merged into the new output. ### Execution Error Tolerance diff --git a/apps/web/src/content/docs/docs/evaluators/structured-data.mdx b/apps/web/src/content/docs/docs/evaluators/structured-data.mdx index bf5b06b36..b6d8e0f23 100644 --- a/apps/web/src/content/docs/docs/evaluators/structured-data.mdx +++ b/apps/web/src/content/docs/docs/evaluators/structured-data.mdx @@ -14,7 +14,7 @@ Built-in evaluators for grading structured outputs and gating on execution metri ## Ground Truth -Put the expected structured output in the evalcase `expected_output` (as an object or message array). Evaluators read expected values from there. +Put the expected structured output in the test case `expected_output` (as an object or message array). Evaluators read expected values from there. ```yaml tests: diff --git a/apps/web/src/content/docs/docs/getting-started/quickstart.mdx b/apps/web/src/content/docs/docs/getting-started/quickstart.mdx index f4e4bc9f8..ec1fcd2ec 100644 --- a/apps/web/src/content/docs/docs/getting-started/quickstart.mdx +++ b/apps/web/src/content/docs/docs/getting-started/quickstart.mdx @@ -66,7 +66,7 @@ tests: agentv eval ./evals/example.yaml ``` -Results appear in `.agentv/results/eval_<timestamp>.jsonl` with scores, reasoning, and execution traces.
+Results appear in `.agentv/results/runs/<timestamp>/index.jsonl` with scores, reasoning, and execution traces. ## Next Steps diff --git a/apps/web/src/content/docs/docs/tools/trace.mdx b/apps/web/src/content/docs/docs/tools/trace.mdx index f1d400622..f3ce06021 100644 --- a/apps/web/src/content/docs/docs/tools/trace.mdx +++ b/apps/web/src/content/docs/docs/tools/trace.mdx @@ -19,13 +19,13 @@ For full tool-call inspection, prefer OTLP JSON exports over eval manifests. ### `trace list` -Enumerate evaluation result files from `.agentv/results/`. +Enumerate canonical evaluation run workspaces from `.agentv/results/runs/`. ```bash agentv trace list [--limit N] [--format json|table] ``` -Shows filename, test count, pass rate, average score, file size, and timestamp for each result file. +Shows filename, test count, pass rate, average score, file size, and timestamp for each run workspace. ### `trace show` @@ -56,7 +56,7 @@ research-question, 15.1s, 10,167 tok, $0.105 Scores: response_quality 75% | routing_accuracy 100% ``` -Falls back to a flat summary when output messages are not present in the result file. +Falls back to a flat summary when output messages are not present in the run workspace.
### `trace stats` @@ -94,7 +94,7 @@ agentv trace show trace.otlp.json --format json \ | jq '[.[] | select(.cost_usd > 0.10) | {test_id, score, cost: .cost_usd}]' # Compare providers -agentv trace stats .agentv/results/runs/eval_/index.jsonl --group-by target --format json \ +agentv trace stats .agentv/results/runs//index.jsonl --group-by target --format json \ | jq '.groups[] | {label, score_mean: .metrics.score.mean}' ``` diff --git a/examples/features/benchmark-tooling/README.md b/examples/features/benchmark-tooling/README.md index 68fe45e0c..0af46584c 100644 --- a/examples/features/benchmark-tooling/README.md +++ b/examples/features/benchmark-tooling/README.md @@ -34,19 +34,19 @@ Pairwise Summary: ```bash # N-way matrix (all targets) -agentv compare .agentv/results/runs/eval_/index.jsonl +agentv compare .agentv/results/runs//index.jsonl # With baseline regression check (exits 1 if any target regresses) -agentv compare .agentv/results/runs/eval_/index.jsonl --baseline gpt-4.1 +agentv compare .agentv/results/runs//index.jsonl --baseline gpt-4.1 # Pairwise from combined file -agentv compare .agentv/results/runs/eval_/index.jsonl --baseline gpt-4.1 --candidate gpt-5-mini +agentv compare .agentv/results/runs//index.jsonl --baseline gpt-4.1 --candidate gpt-5-mini # Filter to specific targets -agentv compare .agentv/results/runs/eval_/index.jsonl --targets gpt-4.1 --targets gpt-5-mini +agentv compare .agentv/results/runs//index.jsonl --targets gpt-4.1 --targets gpt-5-mini # JSON output -agentv compare .agentv/results/runs/eval_/index.jsonl --json +agentv compare .agentv/results/runs//index.jsonl --json ``` ### Pairwise Mode @@ -54,7 +54,7 @@ agentv compare .agentv/results/runs/eval_/index.jsonl --json Extract a head-to-head comparison between two specific targets: ```bash -agentv compare .agentv/results/runs/eval_/index.jsonl --baseline gpt-4.1 --candidate gpt-5-mini +agentv compare .agentv/results/runs//index.jsonl --baseline gpt-4.1 --candidate gpt-5-mini ``` ``` @@ 
-247,7 +247,7 @@ Generates a consolidated benchmark summary across models and metrics from result ### Usage ```bash -# Summarize all result files in a directory +# Summarize all run workspaces in a directory bun examples/features/benchmark-tooling/scripts/benchmark-report.ts ./by-target/ # Summarize specific files diff --git a/examples/features/document-extraction/.agentv/targets.yaml b/examples/features/document-extraction/.agentv/targets.yaml index 0c3ae6e01..afd7d5ef8 100644 --- a/examples/features/document-extraction/.agentv/targets.yaml +++ b/examples/features/document-extraction/.agentv/targets.yaml @@ -4,8 +4,8 @@ targets: provider_batching: false verbose: true - # Runs the mock invoice extractor for each evalcase individually - # {FILES} is replaced with the input file paths from the evalcase + # Runs the mock invoice extractor for each test case individually + # {FILES} is replaced with the input file paths from the test case # {OUTPUT_FILE} is the temporary file path where output should be written command: bun run ../mock_extractor.ts {FILES} {OUTPUT_FILE} diff --git a/examples/features/document-extraction/README.md b/examples/features/document-extraction/README.md index dd5265de4..4d9416676 100644 --- a/examples/features/document-extraction/README.md +++ b/examples/features/document-extraction/README.md @@ -2,14 +2,14 @@ This folder demonstrates two evaluation patterns for document extraction: -1. **`field_accuracy`** (built-in) - Per-evalcase scoring with pass/fail per field +1. **`field_accuracy`** (built-in) - Per-test-case scoring with pass/fail per field 2. 
**`code_grader`** (custom) - TP/TN/FP/FN metrics for cross-document aggregation ## When to Use Each Pattern | Pattern | Use Case | Output | |---------|----------|--------| -| `field_accuracy` | Simple pass/fail scoring per evalcase | Score (0-1) per evalcase | +| `field_accuracy` | Simple pass/fail scoring per test case | Score (0-1) per test case | | `code_grader` with `details.metrics` | Aggregate precision/recall across documents | TP/TN/FP/FN per field | ## Quick Start @@ -17,7 +17,7 @@ This folder demonstrates two evaluation patterns for document extraction: From repo root: ```bash -# Pattern 1: Field accuracy (per-evalcase scoring) +# Pattern 1: Field accuracy (per-test-case scoring) bun agentv eval examples/features/document-extraction/evals/field-accuracy.eval.yaml # Pattern 2: Confusion metrics (cross-document aggregation) @@ -25,12 +25,12 @@ bun agentv eval examples/features/document-extraction/evals/confusion-metrics.ev # Aggregate TP/TN/FP/FN into a table (only works with confusion-metrics.eval.yaml) bun run examples/features/document-extraction/scripts/aggregate_metrics.ts \ - .agentv/results/eval_.jsonl + .agentv/results/runs//index.jsonl ``` ## Pattern 1: Field Accuracy (`field-accuracy.eval.yaml`) -Uses the built-in `field_accuracy` evaluator for per-evalcase scoring: +Uses the built-in `field_accuracy` evaluator for per-test-case scoring: ```yaml evaluators: @@ -47,7 +47,7 @@ evaluators: tolerance: 1.0 ``` -**Output**: A score (0-1) per evalcase based on weighted field matches. +**Output**: A score (0-1) per test case based on weighted field matches. **Best for**: Quick validation, CI/CD gates, simple pass/fail checks. 
@@ -71,7 +71,7 @@ evaluators: **Output**: Aggregate metrics table with fractional precision/recall: ``` -Processed 5 evaluation results from .agentv/results/eval_.jsonl +Processed 5 evaluation results from .agentv/results/runs//index.jsonl Field | TP | TN | FP | FN | Precision | Recall | F1 | Count ---------------+----+----+----+----+-----------+--------+-------+------ @@ -96,7 +96,7 @@ Macro-F1: 0.759 The `aggregate_metrics.ts` script only works with evaluators that emit `details.metrics`: ```bash -bun run scripts/aggregate_metrics.ts .agentv/results/runs/eval_/index.jsonl [options] +bun run scripts/aggregate_metrics.ts .agentv/results/runs//index.jsonl [options] Options: --evaluator Filter to a specific evaluator diff --git a/examples/features/document-extraction/evals/confusion-metrics.eval.yaml b/examples/features/document-extraction/evals/confusion-metrics.eval.yaml index cacb52a5f..b1f5be5e3 100644 --- a/examples/features/document-extraction/evals/confusion-metrics.eval.yaml +++ b/examples/features/document-extraction/evals/confusion-metrics.eval.yaml @@ -22,7 +22,7 @@ # # Aggregate: # bun run examples/features/document-extraction/scripts/aggregate_metrics.ts \ -# .agentv/results/eval_.jsonl +# .agentv/results/runs//index.jsonl # description: Header field confusion metrics (TP/TN/FP/FN aggregation) diff --git a/examples/features/document-extraction/evals/field-accuracy.eval.yaml b/examples/features/document-extraction/evals/field-accuracy.eval.yaml index a16da674a..1e4a52710 100644 --- a/examples/features/document-extraction/evals/field-accuracy.eval.yaml +++ b/examples/features/document-extraction/evals/field-accuracy.eval.yaml @@ -1,6 +1,6 @@ # Field Accuracy Evaluation Dataset # -# This dataset demonstrates the built-in `field_accuracy` evaluator for per-evalcase scoring. +# This dataset demonstrates the built-in `field_accuracy` evaluator for per-test-case scoring. # Use this pattern when you need simple pass/fail scoring per field. 
# # For aggregatable TP/TN/FP/FN metrics across documents, see confusion-metrics.yaml instead. @@ -22,7 +22,7 @@ # invoice-005: ~1.000 (line items extracted correctly) # invoice-006: ~1.000 (greedy matching handles reordered line items) # -description: Field accuracy evaluator patterns (per-evalcase scoring) +description: Field accuracy evaluator patterns (per-test-case scoring) execution: target: mock_extractor @@ -416,4 +416,3 @@ tests: value: ../fixtures/invoice-006.json - type: text value: "Extract line items from invoice (may be reordered)." - diff --git a/examples/features/document-extraction/scripts/aggregate_metrics.ts b/examples/features/document-extraction/scripts/aggregate_metrics.ts index 90bcd6684..c20e52a44 100644 --- a/examples/features/document-extraction/scripts/aggregate_metrics.ts +++ b/examples/features/document-extraction/scripts/aggregate_metrics.ts @@ -6,9 +6,9 @@ * per attribute across the whole dataset. * * Usage: - * bun run scripts/aggregate_metrics.ts .agentv/results/runs/eval_/index.jsonl - * bun run scripts/aggregate_metrics.ts .agentv/results/runs/eval_/index.jsonl --evaluator header_confusion - * bun run scripts/aggregate_metrics.ts .agentv/results/runs/eval_/index.jsonl --format csv + * bun run scripts/aggregate_metrics.ts .agentv/results/runs//index.jsonl + * bun run scripts/aggregate_metrics.ts .agentv/results/runs//index.jsonl --evaluator header_confusion + * bun run scripts/aggregate_metrics.ts .agentv/results/runs//index.jsonl --format csv */ import * as fs from 'node:fs'; @@ -241,7 +241,7 @@ Options: Example: bun run scripts/aggregate_metrics.ts .agentv/results/eval-001.jsonl - bun run scripts/aggregate_metrics.ts .agentv/results/runs/eval_/index.jsonl --evaluator header_confusion --format csv + bun run scripts/aggregate_metrics.ts .agentv/results/runs//index.jsonl --evaluator header_confusion --format csv `); process.exit(0); } diff --git a/examples/features/trace-analysis/README.md 
b/examples/features/trace-analysis/README.md index 52c3e86bb..c9f167d39 100644 --- a/examples/features/trace-analysis/README.md +++ b/examples/features/trace-analysis/README.md @@ -5,11 +5,11 @@ Demonstrates `agentv trace` subcommands for headless trace inspection and analys ## Quick Start ```bash -# List result files +# List run workspaces bun agentv trace list # Show summary trace details from the run manifest -bun agentv trace show .agentv/results/runs/eval_/index.jsonl +bun agentv trace show .agentv/results/runs//index.jsonl # Show hierarchical trace tree from an OTLP export bun agentv trace show traces/eval.otlp.json --tree @@ -18,13 +18,13 @@ bun agentv trace show traces/eval.otlp.json --tree bun agentv trace show traces/eval.otlp.json --test-id research-question --tree # Compute percentile statistics -bun agentv trace stats .agentv/results/runs/eval_/index.jsonl +bun agentv trace stats .agentv/results/runs//index.jsonl # Group stats by target provider -bun agentv trace stats .agentv/results/runs/eval_/index.jsonl --group-by target +bun agentv trace stats .agentv/results/runs//index.jsonl --group-by target # JSON output for piping to jq -bun agentv trace stats .agentv/results/runs/eval_/index.jsonl --format json | jq '.groups[].metrics' +bun agentv trace stats .agentv/results/runs//index.jsonl --format json | jq '.groups[].metrics' ``` ## What's in the Example Data @@ -53,6 +53,6 @@ bun agentv trace show traces/eval.otlp.json --format json \ | jq '[.[] | select(.cost_usd > 0.10) | {test_id, score, cost: .cost_usd}]' # Compare scores by target provider -bun agentv trace stats .agentv/results/runs/eval_/index.jsonl --group-by target --format json \ +bun agentv trace stats .agentv/results/runs//index.jsonl --group-by target --format json \ | jq '.groups[] | {label, score_mean: .metrics.score.mean}' ``` diff --git a/examples/showcase/export-screening/README.md b/examples/showcase/export-screening/README.md index 0975b0503..059a8b7de 100644 --- 
a/examples/showcase/export-screening/README.md +++ b/examples/showcase/export-screening/README.md @@ -51,7 +51,7 @@ Use the wrapper script to compute a confusion matrix and policy-weighted overall structured CI result JSON file (defaults to `results.ci_check.json`): ```bash -bun run ./evals/ci_check.ts .agentv/results/runs/eval_/index.jsonl --threshold 0.95 --check-class High +bun run ./evals/ci_check.ts .agentv/results/runs//index.jsonl --threshold 0.95 --check-class High ``` ### Multi-Sample CI Gating @@ -153,7 +153,7 @@ bun run ./evals/ci_check.ts --eval ./evals/dataset.eval.yaml --threshold 0.95 bun run ./evals/ci_check.ts --eval ./evals/dataset.eval.yaml --samples 5 --threshold 0.90 # Or check an existing run manifest -bun run ./evals/ci_check.ts .agentv/results/runs/eval_/index.jsonl --threshold 0.95 +bun run ./evals/ci_check.ts .agentv/results/runs//index.jsonl --threshold 0.95 ``` ### Options diff --git a/packages/core/src/evaluation/loaders/jsonl-parser.ts b/packages/core/src/evaluation/loaders/jsonl-parser.ts index 28b3efa3f..6d5e08a5e 100644 --- a/packages/core/src/evaluation/loaders/jsonl-parser.ts +++ b/packages/core/src/evaluation/loaders/jsonl-parser.ts @@ -158,10 +158,10 @@ export async function loadTestsFromJsonl( const rawFile = await readFile(absoluteTestPath, 'utf8'); const rawCases = parseJsonlContent(rawFile, evalFilePath); - // Derive eval set name: sidecar > filename - const fallbackEvalSet = path.basename(absoluteTestPath, '.jsonl') || 'eval'; - const evalSetName = - sidecar.name && sidecar.name.trim().length > 0 ? sidecar.name : fallbackEvalSet; + // Derive dataset name: sidecar > filename + const fallbackDatasetName = path.basename(absoluteTestPath, '.jsonl') || 'eval'; + const datasetName = + sidecar.name && sidecar.name.trim().length > 0 ? sidecar.name : fallbackDatasetName; // Global defaults from sidecar const globalEvaluator = coerceEvaluator(sidecar.evaluator, 'sidecar') ?? 
'llm-grader'; @@ -170,7 +170,7 @@ export async function loadTestsFromJsonl( if (verbose) { console.log(`\n[JSONL Dataset: ${evalFilePath}]`); console.log(` Cases: ${rawCases.length}`); - console.log(` Eval set: ${evalSetName}`); + console.log(` Dataset: ${datasetName}`); if (sidecar.description) { console.log(` Description: ${sidecar.description}`); } @@ -179,34 +179,34 @@ export async function loadTestsFromJsonl( const results: EvalTest[] = []; for (let lineIndex = 0; lineIndex < rawCases.length; lineIndex++) { - const evalcase = rawCases[lineIndex]; + const testCaseConfig = rawCases[lineIndex]; const lineNumber = lineIndex + 1; // 1-based for user-facing messages - const id = asString(evalcase.id); + const id = asString(testCaseConfig.id); // Skip eval cases that don't match the filter pattern (glob supported) if (filterPattern && (!id || !matchesFilter(id, filterPattern))) { continue; } - const conversationId = asString(evalcase.conversation_id); - let outcome = asString(evalcase.criteria); - if (!outcome && evalcase.expected_outcome !== undefined) { - outcome = asString(evalcase.expected_outcome); + const conversationId = asString(testCaseConfig.conversation_id); + let outcome = asString(testCaseConfig.criteria); + if (!outcome && testCaseConfig.expected_outcome !== undefined) { + outcome = asString(testCaseConfig.expected_outcome); if (outcome) { logWarning( - `Test '${asString(evalcase.id) ?? 'unknown'}': 'expected_outcome' is deprecated. Use 'criteria' instead.`, + `Test '${asString(testCaseConfig.id) ?? 'unknown'}': 'expected_outcome' is deprecated. Use 'criteria' instead.`, ); } } // Resolve input with shorthand support - const rawInputMessages = resolveInputMessages(evalcase); + const rawInputMessages = resolveInputMessages(testCaseConfig); // Resolve expected_output with shorthand support - const expectedMessages = resolveExpectedMessages(evalcase) ?? []; + const expectedMessages = resolveExpectedMessages(testCaseConfig) ?? 
[]; // A test is complete when it has id, input, and at least one of: criteria, expected_output, or assert const hasEvaluationSpec = - !!outcome || expectedMessages.length > 0 || evalcase.assert !== undefined; + !!outcome || expectedMessages.length > 0 || testCaseConfig.assert !== undefined; if (!id || !hasEvaluationSpec || !rawInputMessages || rawInputMessages.length === 0) { logError( `Skipping incomplete test at line ${lineNumber}: ${id ?? 'unknown'}. Missing required fields: id, input, and at least one of criteria/expected_output/assert`, @@ -265,13 +265,20 @@ export async function loadTestsFromJsonl( .join(' '); // Merge execution config: per-case overrides sidecar - const caseExecution = isJsonObject(evalcase.execution) ? evalcase.execution : undefined; + const caseExecution = isJsonObject(testCaseConfig.execution) + ? testCaseConfig.execution + : undefined; const mergedExecution = caseExecution ?? globalExecution; - const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator; + const testCaseEvaluatorKind = coerceEvaluator(testCaseConfig.evaluator, id) ?? globalEvaluator; let evaluators: Awaited>; try { - evaluators = await parseEvaluators(evalcase, mergedExecution, searchRoots, id ?? 'unknown'); + evaluators = await parseEvaluators( + testCaseConfig, + mergedExecution, + searchRoots, + id ?? 'unknown', + ); } catch (error) { // Skip entire test if evaluator validation fails const message = error instanceof Error ? 
error.message : String(error); @@ -280,7 +287,7 @@ export async function loadTestsFromJsonl( } // Handle inline rubrics field (deprecated: use assertions: [{type: rubrics, criteria: [...]}] instead) - const inlineRubrics = evalcase.rubrics; + const inlineRubrics = testCaseConfig.rubrics; if (inlineRubrics !== undefined && Array.isArray(inlineRubrics)) { const rubricEvaluator = parseInlineRubrics(inlineRubrics); if (rubricEvaluator) { @@ -295,7 +302,7 @@ export async function loadTestsFromJsonl( const testCase: EvalTest = { id, - dataset: evalSetName, + dataset: datasetName, conversation_id: conversationId, question: question, input: inputMessages, @@ -303,7 +310,7 @@ export async function loadTestsFromJsonl( reference_answer: referenceAnswer, file_paths: userFilePaths, criteria: outcome ?? '', - evaluator: evalCaseEvaluatorKind, + evaluator: testCaseEvaluatorKind, assertions: evaluators, }; diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts index 0e0cc962f..1117dc7ed 100644 --- a/packages/core/src/evaluation/yaml-parser.ts +++ b/packages/core/src/evaluation/yaml-parser.ts @@ -273,18 +273,18 @@ async function loadTestsFromYaml( } const suite = interpolated as RawTestSuite; - const evalSetNameFromSuite = asString(suite.name)?.trim(); - const fallbackEvalSet = + const datasetNameFromSuite = asString(suite.name)?.trim(); + const fallbackDatasetName = path .basename(absoluteTestPath) .replace(/\.eval\.ya?ml$/i, '') .replace(/\.ya?ml$/i, '') || 'eval'; - const evalSetName = - evalSetNameFromSuite && evalSetNameFromSuite.length > 0 - ? evalSetNameFromSuite - : fallbackEvalSet; + const datasetName = + datasetNameFromSuite && datasetNameFromSuite.length > 0 + ? datasetNameFromSuite + : fallbackDatasetName; - const rawTestcases = resolveTests(suite); + const rawTestCases = resolveTests(suite); const globalEvaluator = coerceEvaluator(suite.evaluator, 'global') ?? 
'llm-grader'; @@ -292,14 +292,14 @@ async function loadTestsFromYaml( const evalFileDir = path.dirname(absoluteTestPath); // Resolve tests: string path to external file, inline array, or error - let expandedTestcases: readonly JsonValue[]; - if (typeof rawTestcases === 'string') { + let expandedTestCases: readonly JsonValue[]; + if (typeof rawTestCases === 'string') { // String path: load tests from external file (YAML, JSONL) - const externalPath = path.resolve(evalFileDir, rawTestcases); - expandedTestcases = await loadCasesFromFile(externalPath); - } else if (Array.isArray(rawTestcases)) { + const externalPath = path.resolve(evalFileDir, rawTestCases); + expandedTestCases = await loadCasesFromFile(externalPath); + } else if (Array.isArray(rawTestCases)) { // Inline array: expand any file:// references - expandedTestcases = await expandFileReferences(rawTestcases, evalFileDir); + expandedTestCases = await expandFileReferences(rawTestCases, evalFileDir); } else { throw new Error(`Invalid test file format: ${evalFilePath} - missing 'tests' field`); } @@ -329,47 +329,49 @@ async function loadTestsFromYaml( const results: EvalTest[] = []; - for (const rawEvalcase of expandedTestcases) { - if (!isJsonObject(rawEvalcase)) { + for (const rawTestCase of expandedTestCases) { + if (!isJsonObject(rawTestCase)) { logWarning('Skipping invalid test entry (expected object)'); continue; } - const evalcase = rawEvalcase as RawEvalCase; - const id = asString(evalcase.id); + const testCaseConfig = rawTestCase as RawEvalCase; + const id = asString(testCaseConfig.id); // Skip tests that don't match the filter pattern (glob supported) if (filterPattern && (!id || !matchesFilter(id, filterPattern))) { continue; } - const conversationId = asString(evalcase.conversation_id); - let outcome = asString(evalcase.criteria); - if (!outcome && evalcase.expected_outcome !== undefined) { - outcome = asString(evalcase.expected_outcome); + const conversationId = 
asString(testCaseConfig.conversation_id); + let outcome = asString(testCaseConfig.criteria); + if (!outcome && testCaseConfig.expected_outcome !== undefined) { + outcome = asString(testCaseConfig.expected_outcome); if (outcome) { logWarning( - `Test '${asString(evalcase.id) ?? 'unknown'}': 'expected_outcome' is deprecated. Use 'criteria' instead.`, + `Test '${asString(testCaseConfig.id) ?? 'unknown'}': 'expected_outcome' is deprecated. Use 'criteria' instead.`, ); } } // Extract per-case execution config early (reused below for skip_defaults) - const caseExecution = isJsonObject(evalcase.execution) ? evalcase.execution : undefined; + const caseExecution = isJsonObject(testCaseConfig.execution) + ? testCaseConfig.execution + : undefined; const skipDefaults = caseExecution?.skip_defaults === true; // Resolve input with shorthand support (pass suite-level input_files for merge) const effectiveSuiteInputFiles = suiteInputFiles && !skipDefaults ? suiteInputFiles : undefined; - const testInputMessages = resolveInputMessages(evalcase, effectiveSuiteInputFiles); + const testInputMessages = resolveInputMessages(testCaseConfig, effectiveSuiteInputFiles); // Resolve expected_output with shorthand support - const expectedMessages = resolveExpectedMessages(evalcase) ?? []; + const expectedMessages = resolveExpectedMessages(testCaseConfig) ?? []; // A test is complete when it has id, input, and at least one of: criteria, expected_output, or assertions const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || - evalcase.assertions !== undefined || - evalcase.assert !== undefined; + testCaseConfig.assertions !== undefined || + testCaseConfig.assert !== undefined; if (!id || !hasEvaluationSpec || !testInputMessages || testInputMessages.length === 0) { logError( `Skipping incomplete test: ${id ?? 'unknown'}. 
Missing required fields: id, input, and at least one of criteria/expected_output/assertions`, @@ -444,10 +446,15 @@ async function loadTestsFromYaml( .filter((part) => part.length > 0) .join(' '); - const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator; + const testCaseEvaluatorKind = coerceEvaluator(testCaseConfig.evaluator, id) ?? globalEvaluator; let evaluators: Awaited>; try { - evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? 'unknown'); + evaluators = await parseEvaluators( + testCaseConfig, + globalExecution, + searchRoots, + id ?? 'unknown', + ); } catch (error) { // Skip entire test if evaluator validation fails const message = error instanceof Error ? error.message : String(error); @@ -456,7 +463,7 @@ async function loadTestsFromYaml( } // Handle inline rubrics field (deprecated: use assertions: [{type: rubrics, criteria: [...]}] instead) - const inlineRubrics = evalcase.rubrics; + const inlineRubrics = testCaseConfig.rubrics; if (inlineRubrics !== undefined && Array.isArray(inlineRubrics)) { const rubricEvaluator = parseInlineRubrics(inlineRubrics); if (rubricEvaluator) { @@ -470,20 +477,20 @@ async function loadTestsFromYaml( const userFilePaths = collectResolvedInputFilePaths(inputMessages); // Parse per-case workspace config and merge with suite-level - const caseWorkspace = await resolveWorkspaceConfig(evalcase.workspace, evalFileDir); + const caseWorkspace = await resolveWorkspaceConfig(testCaseConfig.workspace, evalFileDir); const mergedWorkspace = mergeWorkspaceConfigs(suiteWorkspace, caseWorkspace); // Parse per-case metadata - const metadata = isJsonObject(evalcase.metadata) - ? (evalcase.metadata as Record) + const metadata = isJsonObject(testCaseConfig.metadata) + ? 
(testCaseConfig.metadata as Record) : undefined; // Extract per-test targets override (matrix evaluation) - const caseTargets = extractTargetsFromTestCase(evalcase as JsonObject); + const caseTargets = extractTargetsFromTestCase(testCaseConfig as JsonObject); const testCase: EvalTest = { id, - dataset: evalSetName, + dataset: datasetName, category: options?.category, conversation_id: conversationId, question: question, @@ -492,7 +499,7 @@ async function loadTestsFromYaml( reference_answer: referenceAnswer, file_paths: userFilePaths, criteria: outcome ?? '', - evaluator: evalCaseEvaluatorKind, + evaluator: testCaseEvaluatorKind, assertions: evaluators, workspace: mergedWorkspace, metadata, From 24a18dd8e649b342f91fbb000fffaf2873cc3f27 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 5 Apr 2026 05:51:01 +0000 Subject: [PATCH 2/4] fix(trace): preserve canonical run timestamps Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- apps/cli/src/commands/compare/index.ts | 4 +++- apps/cli/src/commands/results/export.ts | 3 ++- apps/cli/src/commands/results/serve.ts | 9 ++------- apps/cli/src/commands/trace/utils.ts | 4 +++- apps/cli/test/commands/trace/trace.test.ts | 7 +++++++ apps/web/src/content/docs/docs/tools/studio.mdx | 6 ++++-- 6 files changed, 21 insertions(+), 12 deletions(-) diff --git a/apps/cli/src/commands/compare/index.ts b/apps/cli/src/commands/compare/index.ts index 5dadaaf06..beab6d2a0 100644 --- a/apps/cli/src/commands/compare/index.ts +++ b/apps/cli/src/commands/compare/index.ts @@ -13,7 +13,9 @@ import { restPositionals, string, } from 'cmd-ts'; + import { toSnakeCaseDeep } from '../../utils/case-conversion.js'; +import { RESULT_INDEX_FILENAME } from '../eval/result-layout.js'; import { loadLightweightResults, resolveResultSourcePath } from '../results/manifest.js'; // ANSI color codes (no dependency needed) @@ -111,7 +113,7 @@ function loadFlatCompareResults(filePath: string): ParsedCompareResult[] { function 
loadCompareResults(filePath: string): ParsedCompareResult[] { try { const resolvedPath = resolveResultSourcePath(filePath); - if (path.basename(resolvedPath) === 'index.jsonl') { + if (path.basename(resolvedPath) === RESULT_INDEX_FILENAME) { return loadLightweightResults(resolvedPath).map((record) => ({ testId: record.testId, score: record.score, diff --git a/apps/cli/src/commands/results/export.ts b/apps/cli/src/commands/results/export.ts index b03a649af..5f11fd489 100644 --- a/apps/cli/src/commands/results/export.ts +++ b/apps/cli/src/commands/results/export.ts @@ -30,6 +30,7 @@ import { command, option, optional, positional, string } from 'cmd-ts'; import type { EvaluationResult } from '@agentv/core'; import { parseJsonlResults, writeArtifactsFromResults } from '../eval/artifact-writer.js'; +import { RESULT_INDEX_FILENAME } from '../eval/result-layout.js'; import { loadResults as loadSharedResults, resolveSourceFile } from './shared.js'; // ── Export logic ───────────────────────────────────────────────────────── @@ -57,7 +58,7 @@ export async function exportResults( */ export function deriveOutputDir(cwd: string, sourceFile: string): string { const baseName = path.basename(sourceFile); - if (baseName !== 'index.jsonl') { + if (baseName !== RESULT_INDEX_FILENAME) { const stem = path.basename(sourceFile, path.extname(sourceFile)); return path.join( cwd, diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index 671a5aaa6..2ea30f357 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -1068,13 +1068,8 @@ export const resultsServeCommand = command({ // When a source is explicitly provided, it must exist. // Otherwise, try to auto-discover results; start empty if none found. 
if (source) { - const resolved = resolveResultSourcePath(source, cwd); - if (!existsSync(resolved)) { - console.error(`Error: Source file not found: ${resolved}`); - process.exit(1); - } - sourceFile = resolved; - results = loadManifestResults(resolved); + sourceFile = await resolveSourceFile(source, cwd); + results = loadManifestResults(sourceFile); } else { // Auto-discover: run cache -> directory scan -> empty state const cache = await loadRunCache(cwd); diff --git a/apps/cli/src/commands/trace/utils.ts b/apps/cli/src/commands/trace/utils.ts index 45d865ed6..443a1466f 100644 --- a/apps/cli/src/commands/trace/utils.ts +++ b/apps/cli/src/commands/trace/utils.ts @@ -595,7 +595,9 @@ export function listResultFiles(cwd: string, limit?: number): ResultFileMeta[] { * Extract ISO timestamp from eval filename like eval_2026-02-20T21-38-05-833Z.jsonl */ export function extractTimestampFromFilename(filename: string): string | undefined { - const match = filename.match(/eval_(\d{4}-\d{2}-\d{2}T[\d-]+Z)/); + const match = filename.match( + /(?:^|eval_)(\d{4}-\d{2}-\d{2}T\d{2}-\d{2}-\d{2}-\d{3}Z)(?:\.jsonl)?$/, + ); if (!match) return undefined; // Re-convert dashes back to colons/dots for display return match[1].replace(/-(\d{2})-(\d{2})-(\d{3})Z$/, ':$1:$2.$3Z'); diff --git a/apps/cli/test/commands/trace/trace.test.ts b/apps/cli/test/commands/trace/trace.test.ts index b813711d4..32ea668cb 100644 --- a/apps/cli/test/commands/trace/trace.test.ts +++ b/apps/cli/test/commands/trace/trace.test.ts @@ -275,10 +275,12 @@ describe('trace utils', () => { expect(metas).toHaveLength(2); // Most recent first expect(metas[0].filename).toBe('2026-02-21T10-00-00-000Z'); + expect(metas[0].timestamp).toBe('2026-02-21T10:00:00.000Z'); expect(metas[0].testCount).toBe(1); expect(metas[0].passRate).toBe(0); expect(metas[1].filename).toBe('2026-02-20T21-38-05-833Z'); + expect(metas[1].timestamp).toBe('2026-02-20T21:38:05.833Z'); expect(metas[1].testCount).toBe(2); 
expect(metas[1].passRate).toBe(0.5); }); @@ -371,6 +373,11 @@ describe('trace utils', () => { const result = extractTimestampFromFilename('eval_2026-01-01T00-00-00-000Z.jsonl'); expect(result).toBe('2026-01-01T00:00:00.000Z'); }); + + it('should extract and format timestamp from bare run directory names', () => { + const result = extractTimestampFromFilename('2026-02-20T21-38-05-833Z'); + expect(result).toBe('2026-02-20T21:38:05.833Z'); + }); }); describe('formatDuration', () => { diff --git a/apps/web/src/content/docs/docs/tools/studio.mdx b/apps/web/src/content/docs/docs/tools/studio.mdx index 6f4bc3925..2365b11ef 100644 --- a/apps/web/src/content/docs/docs/tools/studio.mdx +++ b/apps/web/src/content/docs/docs/tools/studio.mdx @@ -19,12 +19,14 @@ The `studio` command launches a web-based dashboard for browsing evaluation runs agentv studio ``` -Studio auto-discovers results from `.agentv/results/` in the current directory and opens at `http://localhost:3117`. +Studio auto-discovers run workspaces from `.agentv/results/runs/` in the current directory and opens at `http://localhost:3117`. 
-You can also point it at a specific results file: +You can also point it at a specific run workspace or `index.jsonl` manifest: ```bash agentv studio .agentv/results/runs/2026-03-30T11-45-56-989Z/index.jsonl +# or +agentv studio .agentv/results/runs/2026-03-30T11-45-56-989Z ``` ## Options From f7210663efa8fc7af8087f971be7279818e03554 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 5 Apr 2026 06:04:00 +0000 Subject: [PATCH 3/4] fix(compare): require test_id in flat inputs Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- apps/cli/src/commands/compare/index.ts | 11 +----- .../cli/test/commands/compare/compare.test.ts | 38 ------------------- 2 files changed, 1 insertion(+), 48 deletions(-) diff --git a/apps/cli/src/commands/compare/index.ts b/apps/cli/src/commands/compare/index.ts index beab6d2a0..a8cf5f5f8 100644 --- a/apps/cli/src/commands/compare/index.ts +++ b/apps/cli/src/commands/compare/index.ts @@ -82,16 +82,7 @@ function loadFlatCompareResults(filePath: string): ParsedCompareResult[] { if (!line) continue; const parsed = JSON.parse(line) as Record; - const testId = - typeof parsed.test_id === 'string' - ? parsed.test_id - : typeof parsed.testId === 'string' - ? parsed.testId - : typeof parsed.eval_id === 'string' - ? parsed.eval_id - : typeof parsed.evalId === 'string' - ? parsed.evalId - : undefined; + const testId = typeof parsed.test_id === 'string' ? 
parsed.test_id : undefined; if (!testId) { throw new Error(`Missing test_id in result source: ${filePath}`); } diff --git a/apps/cli/test/commands/compare/compare.test.ts b/apps/cli/test/commands/compare/compare.test.ts index e548d300d..4847d99ba 100644 --- a/apps/cli/test/commands/compare/compare.test.ts +++ b/apps/cli/test/commands/compare/compare.test.ts @@ -41,36 +41,6 @@ describe('compare command', () => { ]); }); - it('should load valid JSONL file with legacy eval_id results', () => { - const filePath = path.join(tempDir, 'results.jsonl'); - writeFileSync( - filePath, - '{"eval_id": "case-1", "score": 0.8}\n{"eval_id": "case-2", "score": 0.9}\n', - ); - - const results = loadJsonlResults(filePath); - - expect(results).toEqual([ - { testId: 'case-1', score: 0.8 }, - { testId: 'case-2', score: 0.9 }, - ]); - }); - - it('should load flat JSONL files with camelCase testId results', () => { - const filePath = path.join(tempDir, 'results.jsonl'); - writeFileSync( - filePath, - '{"testId": "case-1", "score": 0.8}\n{"testId": "case-2", "score": 0.9}\n', - ); - - const results = loadJsonlResults(filePath); - - expect(results).toEqual([ - { testId: 'case-1', score: 0.8 }, - { testId: 'case-2', score: 0.9 }, - ]); - }); - it('should handle empty lines in JSONL', () => { const filePath = path.join(tempDir, 'results.jsonl'); writeFileSync( @@ -192,14 +162,6 @@ describe('compare command', () => { expect(groups.get('a')).toHaveLength(2); }); - it('should support legacy eval_id field', () => { - const filePath = path.join(tempDir, 'combined.jsonl'); - writeFileSync(filePath, '{"eval_id": "t1", "score": 0.8, "target": "a"}\n'); - - const groups = loadCombinedResults(filePath); - expect(groups.get('a')).toEqual([{ testId: 't1', score: 0.8 }]); - }); - it('should group records from index.jsonl manifests', () => { const runDir = path.join(tempDir, 'eval_2026-03-24T00-00-00-000Z'); mkdirSync(runDir, { recursive: true }); From 4c8e22dbf5fb98ce597d6d11fe1b4071e9d3f7b1 Mon Sep 17 
00:00:00 2001 From: Christopher Tso Date: Sun, 5 Apr 2026 07:25:21 +0000 Subject: [PATCH 4/4] refactor(results): drop flat jsonl compatibility Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- apps/cli/src/commands/compare/index.ts | 79 ++++-------- apps/cli/src/commands/eval/artifact-writer.ts | 7 +- apps/cli/src/commands/eval/commands/run.ts | 3 +- apps/cli/src/commands/eval/retry-errors.ts | 11 +- apps/cli/src/commands/results/export.ts | 45 ++----- apps/cli/src/commands/results/serve.ts | 2 +- .../cli/test/commands/compare/compare.test.ts | 112 ++++++++++++------ .../results/export-e2e-providers.test.ts | 32 +---- apps/cli/test/commands/results/export.test.ts | 41 ++++--- apps/cli/test/unit/retry-errors.test.ts | 83 +++++++------ examples/features/benchmark-tooling/README.md | 46 +++---- examples/features/compare/README.md | 34 +++--- examples/features/compare/evals/README.md | 38 +++--- .../features/compare/evals/dataset.eval.yaml | 4 +- .../showcase/multi-model-benchmark/README.md | 16 +-- .../skills/agentv-eval-writer/SKILL.md | 14 +-- 16 files changed, 258 insertions(+), 309 deletions(-) diff --git a/apps/cli/src/commands/compare/index.ts b/apps/cli/src/commands/compare/index.ts index a8cf5f5f8..53b0c7651 100644 --- a/apps/cli/src/commands/compare/index.ts +++ b/apps/cli/src/commands/compare/index.ts @@ -1,6 +1,3 @@ -import { readFileSync } from 'node:fs'; -import path from 'node:path'; - import { array, command, @@ -15,7 +12,6 @@ import { } from 'cmd-ts'; import { toSnakeCaseDeep } from '../../utils/case-conversion.js'; -import { RESULT_INDEX_FILENAME } from '../eval/result-layout.js'; import { loadLightweightResults, resolveResultSourcePath } from '../results/manifest.js'; // ANSI color codes (no dependency needed) @@ -67,55 +63,24 @@ interface MatrixRow { scores: Record; } -interface ParsedCompareResult { - testId: string; - score: number; +interface CompareInputRecord extends EvalResult { target?: string; } -function 
loadFlatCompareResults(filePath: string): ParsedCompareResult[] { - const content = readFileSync(filePath, 'utf8'); - const results: ParsedCompareResult[] = []; - - for (const rawLine of content.split('\n')) { - const line = rawLine.trim(); - if (!line) continue; - - const parsed = JSON.parse(line) as Record; - const testId = typeof parsed.test_id === 'string' ? parsed.test_id : undefined; - if (!testId) { +function loadCompareResults(filePath: string): CompareInputRecord[] { + return loadLightweightResults(resolveResultSourcePath(filePath)).map((record) => { + if (!record.testId || record.testId === 'unknown') { throw new Error(`Missing test_id in result source: ${filePath}`); } - - if (typeof parsed.score !== 'number' || Number.isNaN(parsed.score)) { + if (typeof record.score !== 'number' || Number.isNaN(record.score)) { throw new Error(`Missing or invalid score in result source: ${filePath}`); } - - results.push({ - testId, - score: parsed.score, - target: typeof parsed.target === 'string' ? parsed.target : undefined, - }); - } - - return results; -} - -function loadCompareResults(filePath: string): ParsedCompareResult[] { - try { - const resolvedPath = resolveResultSourcePath(filePath); - if (path.basename(resolvedPath) === RESULT_INDEX_FILENAME) { - return loadLightweightResults(resolvedPath).map((record) => ({ - testId: record.testId, - score: record.score, - target: record.target, - })); - } - } catch { - // Fall back to direct JSONL parsing for explicit flat result files. 
- } - - return loadFlatCompareResults(filePath); + return { + testId: record.testId, + score: record.score, + target: record.target, + }; + }); } export interface MatrixOutput { @@ -125,10 +90,7 @@ export interface MatrixOutput { } export function loadJsonlResults(filePath: string): EvalResult[] { - return loadCompareResults(filePath).map((record) => ({ - testId: record.testId, - score: record.score, - })); + return loadCompareResults(filePath).map(({ testId, score }) => ({ testId, score })); } export function loadCombinedResults(filePath: string): Map { @@ -469,12 +431,13 @@ export function formatMatrix(matrixOutput: MatrixOutput, baselineTarget?: string export const compareCommand = command({ name: 'compare', description: - 'Compare evaluation result files: two-file pairwise, combined JSONL pairwise, or N-way matrix', + 'Compare evaluation run manifests: two-run pairwise, single-run pairwise, or N-way matrix', args: { results: restPositionals({ type: string, displayName: 'results', - description: 'JSONL result file path(s). One file: combined mode. Two files: pairwise mode.', + description: + 'Run workspace or index.jsonl manifest path(s). One source: single-run mode. 
Two sources: pairwise mode.', }), threshold: option({ type: optional(number), @@ -486,13 +449,13 @@ export const compareCommand = command({ type: optional(string), long: 'baseline', short: 'b', - description: 'Target name to use as baseline (filters combined JSONL)', + description: 'Target name to use as baseline (filters a single run manifest)', }), candidate: option({ type: optional(string), long: 'candidate', short: 'c', - description: 'Target name to use as candidate (filters combined JSONL)', + description: 'Target name to use as candidate (filters a single run manifest)', }), targets: multioption({ type: array(string), @@ -516,7 +479,7 @@ export const compareCommand = command({ try { if (results.length === 0) { - throw new Error('At least one JSONL result file is required'); + throw new Error('At least one run workspace or index.jsonl manifest is required'); } if (results.length === 2) { @@ -534,7 +497,7 @@ export const compareCommand = command({ const exitCode = determineExitCode(comparison.summary.meanDelta); process.exit(exitCode); } else if (results.length === 1) { - // Combined JSONL mode + // Single-run manifest mode let groups = loadCombinedResults(results[0]); // Filter by --targets if specified @@ -570,7 +533,7 @@ export const compareCommand = command({ } if (baseline && candidate) { - // Pairwise mode from combined JSONL + // Pairwise mode from a single run manifest const baselineResults = groups.get(baseline); const candidateResults = groups.get(candidate); if (!baselineResults) { @@ -604,7 +567,7 @@ export const compareCommand = command({ process.exit(exitCode); } } else { - throw new Error('Expected 1 or 2 JSONL result files'); + throw new Error('Expected 1 or 2 run workspaces or index.jsonl manifests'); } } catch (error) { console.error(`Error: ${(error as Error).message}`); diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts index 14035f20b..4c072d661 100644 --- 
a/apps/cli/src/commands/eval/artifact-writer.ts +++ b/apps/cli/src/commands/eval/artifact-writer.ts @@ -648,12 +648,7 @@ function normalizeParsedResult(value: unknown): ParsedEvaluationResult | undefin return { ...result, timestamp: typeof result.timestamp === 'string' ? result.timestamp : new Date(0).toISOString(), - testId: - typeof result.testId === 'string' - ? result.testId - : typeof result.evalId === 'string' - ? result.evalId - : 'unknown', + testId: typeof result.testId === 'string' ? result.testId : 'unknown', score: typeof result.score === 'number' ? result.score : 0, assertions: Array.isArray(result.assertions) ? result.assertions.filter(isAssertionEntry) : [], target: typeof result.target === 'string' ? result.target : 'unknown', diff --git a/apps/cli/src/commands/eval/commands/run.ts b/apps/cli/src/commands/eval/commands/run.ts index 8e6903c52..f57957503 100644 --- a/apps/cli/src/commands/eval/commands/run.ts +++ b/apps/cli/src/commands/eval/commands/run.ts @@ -141,7 +141,8 @@ export const evalRunCommand = command({ retryErrors: option({ type: optional(string), long: 'retry-errors', - description: 'Path to previous output JSONL — re-run only execution_error test cases', + description: + 'Path to a previous run workspace or index.jsonl manifest — re-run only execution_error test cases', }), strict: flag({ long: 'strict', diff --git a/apps/cli/src/commands/eval/retry-errors.ts b/apps/cli/src/commands/eval/retry-errors.ts index 8a39bc3bf..65f76c2ea 100644 --- a/apps/cli/src/commands/eval/retry-errors.ts +++ b/apps/cli/src/commands/eval/retry-errors.ts @@ -1,18 +1,9 @@ -import { readFile } from 'node:fs/promises'; - import type { EvaluationResult } from '@agentv/core'; import { loadManifestResults, resolveResultSourcePath } from '../results/manifest.js'; -import { parseJsonlResults } from './artifact-writer.js'; async function loadRetrySourceResults(jsonlPath: string): Promise { - try { - const resolvedPath = resolveResultSourcePath(jsonlPath); - return 
loadManifestResults(resolvedPath); - } catch { - const content = await readFile(jsonlPath, 'utf8'); - return parseJsonlResults(content); - } + return loadManifestResults(resolveResultSourcePath(jsonlPath)); } /** diff --git a/apps/cli/src/commands/results/export.ts b/apps/cli/src/commands/results/export.ts index 5f11fd489..c31622cc0 100644 --- a/apps/cli/src/commands/results/export.ts +++ b/apps/cli/src/commands/results/export.ts @@ -1,6 +1,6 @@ /** - * `agentv results export` — converts JSONL eval results into a directory - * structure matching the artifact-writer output format. + * `agentv results export` — converts a canonical run workspace or index.jsonl + * manifest into a directory structure matching the artifact-writer output format. * * Output structure: * / @@ -21,8 +21,6 @@ * - To add new per-test workspace files, add them under each test directory. */ -import { existsSync } from 'node:fs'; -import { readFile } from 'node:fs/promises'; import path from 'node:path'; import { command, option, optional, positional, string } from 'cmd-ts'; @@ -57,16 +55,8 @@ export async function exportResults( * Derive the default output directory from a run manifest path. */ export function deriveOutputDir(cwd: string, sourceFile: string): string { - const baseName = path.basename(sourceFile); - if (baseName !== RESULT_INDEX_FILENAME) { - const stem = path.basename(sourceFile, path.extname(sourceFile)); - return path.join( - cwd, - '.agentv', - 'results', - 'export', - stem.startsWith('eval_') ? 
stem.slice(5) : stem, - ); + if (path.basename(sourceFile) !== RESULT_INDEX_FILENAME) { + throw new Error(`Expected a run manifest named ${RESULT_INDEX_FILENAME}: ${sourceFile}`); } const parentDir = path.basename(path.dirname(sourceFile)); @@ -80,35 +70,16 @@ export async function loadExportSource( source: string | undefined, cwd: string, ): Promise<{ sourceFile: string; results: readonly EvaluationResult[] }> { - try { - const { sourceFile } = await resolveSourceFile(source, cwd); - const { results } = await loadSharedResults(source, cwd); - return { sourceFile, results }; - } catch (error) { - if (!source) { - throw error; - } - - const explicitSource = path.isAbsolute(source) ? source : path.resolve(cwd, source); - if (!existsSync(explicitSource) || path.extname(explicitSource) !== '.jsonl') { - throw error; - } - - const content = await readFile(explicitSource, 'utf8'); - const results = parseJsonlResults(content); - if (results.length === 0) { - throw new Error(`No results found in ${explicitSource}`); - } - - return { sourceFile: explicitSource, results }; - } + const { sourceFile } = await resolveSourceFile(source, cwd); + const { results } = await loadSharedResults(source, cwd); + return { sourceFile, results }; } // ── CLI command ────────────────────────────────────────────────────────── export const resultsExportCommand = command({ name: 'export', - description: 'Export JSONL eval results into a per-test directory structure', + description: 'Export a run workspace or index.jsonl manifest into a per-test directory structure', args: { source: positional({ type: optional(string), diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index 2ea30f357..e00e7e837 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -397,7 +397,7 @@ function handleEvalFiles(c: C, { searchDir }: DataContext) { try { const content = readFileSync(meta.path, 'utf8'); const records = 
parseResultManifest(content); - const record = records.find((r) => (r.test_id ?? r.eval_id) === evalId); + const record = records.find((r) => r.test_id === evalId); if (!record) return c.json({ error: 'Eval not found' }, 404); const baseDir = path.dirname(meta.path); diff --git a/apps/cli/test/commands/compare/compare.test.ts b/apps/cli/test/commands/compare/compare.test.ts index 4847d99ba..3b77b1bb1 100644 --- a/apps/cli/test/commands/compare/compare.test.ts +++ b/apps/cli/test/commands/compare/compare.test.ts @@ -26,11 +26,13 @@ describe('compare command', () => { }); describe('loadJsonlResults', () => { - it('should load valid JSONL file with test_id results', () => { - const filePath = path.join(tempDir, 'results.jsonl'); + it('should load index.jsonl manifests from a run workspace', () => { + const runDir = path.join(tempDir, 'eval_2026-03-24T00-00-00-000Z'); + mkdirSync(runDir, { recursive: true }); + const filePath = path.join(runDir, 'index.jsonl'); writeFileSync( filePath, - '{"test_id": "case-1", "score": 0.8}\n{"test_id": "case-2", "score": 0.9}\n', + '{"test_id": "case-1", "score": 0.8, "grading_path": "case-1/grading.json", "timing_path": "case-1/timing.json"}\n{"test_id": "case-2", "score": 0.9, "grading_path": "case-2/grading.json", "timing_path": "case-2/timing.json"}\n', ); const results = loadJsonlResults(filePath); @@ -41,11 +43,13 @@ describe('compare command', () => { ]); }); - it('should handle empty lines in JSONL', () => { - const filePath = path.join(tempDir, 'results.jsonl'); + it('should handle empty lines in index.jsonl manifests', () => { + const runDir = path.join(tempDir, 'eval_2026-03-24T00-00-00-000Z'); + mkdirSync(runDir, { recursive: true }); + const filePath = path.join(runDir, 'index.jsonl'); writeFileSync( filePath, - '{"test_id": "case-1", "score": 0.8}\n\n{"test_id": "case-2", "score": 0.9}\n', + '{"test_id": "case-1", "score": 0.8, "grading_path": "case-1/grading.json", "timing_path": "case-1/timing.json"}\n\n{"test_id": 
"case-2", "score": 0.9, "grading_path": "case-2/grading.json", "timing_path": "case-2/timing.json"}\n', ); const results = loadJsonlResults(filePath); @@ -53,48 +57,52 @@ describe('compare command', () => { expect(results).toHaveLength(2); }); - it('should load index.jsonl manifests from a run workspace', () => { + it('should throw error for missing test_id', () => { const runDir = path.join(tempDir, 'eval_2026-03-24T00-00-00-000Z'); mkdirSync(runDir, { recursive: true }); const filePath = path.join(runDir, 'index.jsonl'); writeFileSync( filePath, - '{"test_id": "case-1", "score": 0.8, "grading_path": "case-1/grading.json", "timing_path": "case-1/timing.json"}\n{"test_id": "case-2", "score": 0.9, "grading_path": "case-2/grading.json", "timing_path": "case-2/timing.json"}\n', + '{"score": 0.8, "grading_path": "case-1/grading.json", "timing_path": "case-1/timing.json"}\n', ); - const results = loadJsonlResults(filePath); - - expect(results).toEqual([ - { testId: 'case-1', score: 0.8 }, - { testId: 'case-2', score: 0.9 }, - ]); + expect(() => loadJsonlResults(filePath)).toThrow('Missing test_id'); }); - it('should throw error for missing test_id', () => { - const filePath = path.join(tempDir, 'results.jsonl'); - writeFileSync(filePath, '{"score": 0.8}\n'); + it('should throw error for missing score', () => { + const runDir = path.join(tempDir, 'eval_2026-03-24T00-00-00-000Z'); + mkdirSync(runDir, { recursive: true }); + const filePath = path.join(runDir, 'index.jsonl'); + writeFileSync( + filePath, + '{"test_id": "case-1", "grading_path": "case-1/grading.json", "timing_path": "case-1/timing.json"}\n', + ); - expect(() => loadJsonlResults(filePath)).toThrow('Missing test_id'); + expect(() => loadJsonlResults(filePath)).toThrow('Missing or invalid score'); }); - it('should throw error for missing score', () => { + it('should reject flat JSONL result files', () => { const filePath = path.join(tempDir, 'results.jsonl'); - writeFileSync(filePath, '{"test_id": 
"case-1"}\n'); + writeFileSync(filePath, '{"test_id": "case-1", "score": 0.8}\n'); - expect(() => loadJsonlResults(filePath)).toThrow('Missing or invalid score'); + expect(() => loadJsonlResults(filePath)).toThrow( + 'Expected a run workspace directory or index.jsonl manifest', + ); }); }); describe('loadCombinedResults', () => { it('should group records by target field', () => { - const filePath = path.join(tempDir, 'combined.jsonl'); + const runDir = path.join(tempDir, 'eval_2026-03-24T00-00-00-000Z'); + mkdirSync(runDir, { recursive: true }); + const filePath = path.join(runDir, 'index.jsonl'); writeFileSync( filePath, [ - '{"test_id": "t1", "score": 0.8, "target": "model-a"}', - '{"test_id": "t2", "score": 0.9, "target": "model-a"}', - '{"test_id": "t1", "score": 0.7, "target": "model-b"}', - '{"test_id": "t2", "score": 0.85, "target": "model-b"}', + '{"test_id": "t1", "score": 0.8, "target": "model-a", "grading_path": "t1/grading.json", "timing_path": "t1/timing.json"}', + '{"test_id": "t2", "score": 0.9, "target": "model-a", "grading_path": "t2/grading.json", "timing_path": "t2/timing.json"}', + '{"test_id": "t1", "score": 0.7, "target": "model-b", "grading_path": "t1/grading.json", "timing_path": "t1/timing.json"}', + '{"test_id": "t2", "score": 0.85, "target": "model-b", "grading_path": "t2/grading.json", "timing_path": "t2/timing.json"}', ].join('\n'), ); @@ -112,13 +120,15 @@ describe('compare command', () => { }); it('should handle three or more targets', () => { - const filePath = path.join(tempDir, 'combined.jsonl'); + const runDir = path.join(tempDir, 'eval_2026-03-24T00-00-00-000Z'); + mkdirSync(runDir, { recursive: true }); + const filePath = path.join(runDir, 'index.jsonl'); writeFileSync( filePath, [ - '{"test_id": "t1", "score": 0.8, "target": "a"}', - '{"test_id": "t1", "score": 0.7, "target": "b"}', - '{"test_id": "t1", "score": 0.9, "target": "c"}', + '{"test_id": "t1", "score": 0.8, "target": "a", "grading_path": "t1/grading.json", 
"timing_path": "t1/timing.json"}', + '{"test_id": "t1", "score": 0.7, "target": "b", "grading_path": "t1/grading.json", "timing_path": "t1/timing.json"}', + '{"test_id": "t1", "score": 0.9, "target": "c", "grading_path": "t1/grading.json", "timing_path": "t1/timing.json"}', ].join('\n'), ); @@ -131,31 +141,48 @@ describe('compare command', () => { }); it('should throw error for missing target field', () => { - const filePath = path.join(tempDir, 'combined.jsonl'); - writeFileSync(filePath, '{"test_id": "t1", "score": 0.8}\n'); + const runDir = path.join(tempDir, 'eval_2026-03-24T00-00-00-000Z'); + mkdirSync(runDir, { recursive: true }); + const filePath = path.join(runDir, 'index.jsonl'); + writeFileSync( + filePath, + '{"test_id": "t1", "score": 0.8, "grading_path": "t1/grading.json", "timing_path": "t1/timing.json"}\n', + ); expect(() => loadCombinedResults(filePath)).toThrow('Missing target field'); }); it('should throw error for missing test_id', () => { - const filePath = path.join(tempDir, 'combined.jsonl'); - writeFileSync(filePath, '{"score": 0.8, "target": "a"}\n'); + const runDir = path.join(tempDir, 'eval_2026-03-24T00-00-00-000Z'); + mkdirSync(runDir, { recursive: true }); + const filePath = path.join(runDir, 'index.jsonl'); + writeFileSync( + filePath, + '{"score": 0.8, "target": "a", "grading_path": "t1/grading.json", "timing_path": "t1/timing.json"}\n', + ); expect(() => loadCombinedResults(filePath)).toThrow('Missing test_id'); }); it('should throw error for missing score', () => { - const filePath = path.join(tempDir, 'combined.jsonl'); - writeFileSync(filePath, '{"test_id": "t1", "target": "a"}\n'); + const runDir = path.join(tempDir, 'eval_2026-03-24T00-00-00-000Z'); + mkdirSync(runDir, { recursive: true }); + const filePath = path.join(runDir, 'index.jsonl'); + writeFileSync( + filePath, + '{"test_id": "t1", "target": "a", "grading_path": "t1/grading.json", "timing_path": "t1/timing.json"}\n', + ); expect(() => 
loadCombinedResults(filePath)).toThrow('Missing or invalid score'); }); it('should handle empty lines', () => { - const filePath = path.join(tempDir, 'combined.jsonl'); + const runDir = path.join(tempDir, 'eval_2026-03-24T00-00-00-000Z'); + mkdirSync(runDir, { recursive: true }); + const filePath = path.join(runDir, 'index.jsonl'); writeFileSync( filePath, - '{"test_id": "t1", "score": 0.8, "target": "a"}\n\n{"test_id": "t2", "score": 0.9, "target": "a"}\n', + '{"test_id": "t1", "score": 0.8, "target": "a", "grading_path": "t1/grading.json", "timing_path": "t1/timing.json"}\n\n{"test_id": "t2", "score": 0.9, "target": "a", "grading_path": "t2/grading.json", "timing_path": "t2/timing.json"}\n', ); const groups = loadCombinedResults(filePath); @@ -179,6 +206,15 @@ describe('compare command', () => { expect(groups.get('model-a')).toEqual([{ testId: 't1', score: 0.8 }]); expect(groups.get('model-b')).toEqual([{ testId: 't1', score: 0.7 }]); }); + + it('should reject flat combined JSONL files', () => { + const filePath = path.join(tempDir, 'combined-results.jsonl'); + writeFileSync(filePath, '{"test_id": "t1", "score": 0.8, "target": "a"}\n'); + + expect(() => loadCombinedResults(filePath)).toThrow( + 'Expected a run workspace directory or index.jsonl manifest', + ); + }); }); describe('classifyOutcome', () => { diff --git a/apps/cli/test/commands/results/export-e2e-providers.test.ts b/apps/cli/test/commands/results/export-e2e-providers.test.ts index e162687b0..47bba1768 100644 --- a/apps/cli/test/commands/results/export-e2e-providers.test.ts +++ b/apps/cli/test/commands/results/export-e2e-providers.test.ts @@ -210,11 +210,8 @@ function toJsonl(...records: object[]): string { return `${records.map((r) => JSON.stringify(r)).join('\n')}\n`; } -function artifactDir( - outputDir: string, - record: { dataset?: string; test_id?: string; eval_id?: string }, -): string { - const testId = record.test_id ?? record.eval_id ?? 
'unknown'; +function artifactDir(outputDir: string, record: { dataset?: string; test_id?: string }): string { + const testId = record.test_id ?? 'unknown'; return path.join(outputDir, ...(record.dataset ? [record.dataset] : []), testId); } @@ -666,30 +663,5 @@ describe('export e2e — multi-provider metrics verification', () => { expect(timing.token_usage.reasoning).toBe(75); expect(timing.duration_ms).toBe(1000); }); - - it('should handle eval_id (legacy) as test_id alias', async () => { - const outputDir = path.join(tempDir, 'legacy'); - const record = { - timestamp: '2026-03-18T10:00:00.000Z', - eval_id: 'legacy-test-id', - dataset: 'test', - score: 1.0, - assertions: [{ text: 'ok', passed: true }], - output_text: 'ok', - target: 'mock', - execution_status: 'ok', - }; - - await exportResults('test.jsonl', toJsonl(record), outputDir); - - expect( - existsSync( - path.join( - artifactDir(outputDir, { ...record, test_id: undefined, target: 'mock' as const }), - 'grading.json', - ), - ), - ).toBe(true); - }); }); }); diff --git a/apps/cli/test/commands/results/export.test.ts b/apps/cli/test/commands/results/export.test.ts index f6f8645ff..8b123bc57 100644 --- a/apps/cli/test/commands/results/export.test.ts +++ b/apps/cli/test/commands/results/export.test.ts @@ -1,5 +1,5 @@ import { afterEach, beforeEach, describe, expect, it } from 'bun:test'; -import { existsSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs'; +import { existsSync, mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs'; import { tmpdir } from 'node:os'; import path from 'node:path'; @@ -99,11 +99,8 @@ function toJsonl(...records: object[]): string { return `${records.map((r) => JSON.stringify(r)).join('\n')}\n`; } -function artifactDir( - outputDir: string, - record: { dataset?: string; test_id?: string; eval_id?: string }, -): string { - const testId = record.test_id ?? record.eval_id ?? 
'unknown'; +function artifactDir(outputDir: string, record: { dataset?: string; test_id?: string }): string { + const testId = record.test_id ?? 'unknown'; return path.join(outputDir, ...(record.dataset ? [record.dataset] : []), testId); } @@ -118,23 +115,33 @@ describe('results export', () => { rmSync(tempDir, { recursive: true, force: true }); }); - it('loadExportSource accepts explicit legacy flat JSONL files', async () => { - const sourceFile = path.join(tempDir, 'eval_2026-03-18.jsonl'); - writeFileSync( - sourceFile, - toJsonl({ ...RESULT_FULL, eval_id: 'legacy-id', test_id: undefined }), - ); + it('loadExportSource resolves run workspaces to index.jsonl', async () => { + const runDir = path.join(tempDir, '2026-03-18T10-00-00-000Z'); + mkdirSync(runDir, { recursive: true }); + const sourceFile = path.join(runDir, 'index.jsonl'); + writeFileSync(sourceFile, toJsonl(RESULT_FULL)); - const { sourceFile: loadedSource, results } = await loadExportSource(sourceFile, tempDir); + const { sourceFile: loadedSource, results } = await loadExportSource(runDir, tempDir); expect(loadedSource).toBe(sourceFile); expect(results).toHaveLength(1); - expect(results[0].testId).toBe('legacy-id'); + expect(results[0].testId).toBe('test-greeting'); + }); + + it('deriveOutputDir uses the run directory name for manifest inputs', () => { + const outputDir = deriveOutputDir( + tempDir, + path.join(tempDir, '2026-03-18T10-00-00-000Z', 'index.jsonl'), + ); + expect(outputDir).toBe( + path.join(tempDir, '.agentv', 'results', 'export', '2026-03-18T10-00-00-000Z'), + ); }); - it('deriveOutputDir uses the source filename for flat JSONL inputs', () => { - const outputDir = deriveOutputDir(tempDir, path.join(tempDir, 'eval_2026-03-18.jsonl')); - expect(outputDir).toBe(path.join(tempDir, '.agentv', 'results', 'export', '2026-03-18')); + it('deriveOutputDir rejects non-manifest paths', () => { + expect(() => deriveOutputDir(tempDir, path.join(tempDir, 'results.jsonl'))).toThrow( + 'Expected a run 
manifest named index.jsonl', + ); }); it('should create benchmark.json matching artifact-writer schema', async () => { diff --git a/apps/cli/test/unit/retry-errors.test.ts b/apps/cli/test/unit/retry-errors.test.ts index bbc54a7ed..9aca5b16b 100644 --- a/apps/cli/test/unit/retry-errors.test.ts +++ b/apps/cli/test/unit/retry-errors.test.ts @@ -14,27 +14,32 @@ describe('retry-errors', () => { } }); - function createJsonlFile(lines: object[]): string { + function createIndexFile(lines: object[]): string { tmpDir = mkdtempSync(path.join(tmpdir(), 'retry-errors-test-')); - const filePath = path.join(tmpDir, 'results.jsonl'); + const filePath = path.join(tmpDir, 'index.jsonl'); + mkdirSync(tmpDir, { recursive: true }); writeFileSync(filePath, lines.map((l) => JSON.stringify(l)).join('\n')); return filePath; } - function createIndexFile(lines: object[]): string { - tmpDir = mkdtempSync(path.join(tmpdir(), 'retry-errors-index-test-')); - const filePath = path.join(tmpDir, 'index.jsonl'); - mkdirSync(tmpDir, { recursive: true }); + function createFlatJsonlFile(lines: object[]): string { + tmpDir = mkdtempSync(path.join(tmpdir(), 'retry-errors-flat-test-')); + const filePath = path.join(tmpDir, 'results.jsonl'); writeFileSync(filePath, lines.map((l) => JSON.stringify(l)).join('\n')); return filePath; } it('loadErrorTestIds returns only execution_error test IDs', async () => { - const filePath = createJsonlFile([ - { testId: 'case-1', executionStatus: 'ok', score: 0.9 }, - { testId: 'case-2', executionStatus: 'execution_error', score: 0, error: 'timeout' }, - { testId: 'case-3', executionStatus: 'quality_failure', score: 0.3 }, - { testId: 'case-4', executionStatus: 'execution_error', score: 0, error: 'provider failed' }, + const filePath = createIndexFile([ + { test_id: 'case-1', execution_status: 'ok', score: 0.9 }, + { test_id: 'case-2', execution_status: 'execution_error', score: 0, error: 'timeout' }, + { test_id: 'case-3', execution_status: 'quality_failure', score: 0.3 
}, + { + test_id: 'case-4', + execution_status: 'execution_error', + score: 0, + error: 'provider failed', + }, ]); const ids = await loadErrorTestIds(filePath); @@ -42,9 +47,9 @@ describe('retry-errors', () => { }); it('loadErrorTestIds deduplicates IDs', async () => { - const filePath = createJsonlFile([ - { testId: 'case-1', executionStatus: 'execution_error', score: 0 }, - { testId: 'case-1', executionStatus: 'execution_error', score: 0 }, + const filePath = createIndexFile([ + { test_id: 'case-1', execution_status: 'execution_error', score: 0 }, + { test_id: 'case-1', execution_status: 'execution_error', score: 0 }, ]); const ids = await loadErrorTestIds(filePath); @@ -52,9 +57,9 @@ describe('retry-errors', () => { }); it('loadErrorTestIds returns empty array when no errors', async () => { - const filePath = createJsonlFile([ - { testId: 'case-1', executionStatus: 'ok', score: 0.9 }, - { testId: 'case-2', executionStatus: 'quality_failure', score: 0.5 }, + const filePath = createIndexFile([ + { test_id: 'case-1', execution_status: 'ok', score: 0.9 }, + { test_id: 'case-2', execution_status: 'quality_failure', score: 0.5 }, ]); const ids = await loadErrorTestIds(filePath); @@ -62,10 +67,10 @@ describe('retry-errors', () => { }); it('loadNonErrorResults returns only non-error results', async () => { - const filePath = createJsonlFile([ - { testId: 'case-1', executionStatus: 'ok', score: 0.9 }, - { testId: 'case-2', executionStatus: 'execution_error', score: 0 }, - { testId: 'case-3', executionStatus: 'quality_failure', score: 0.5 }, + const filePath = createIndexFile([ + { test_id: 'case-1', execution_status: 'ok', score: 0.9 }, + { test_id: 'case-2', execution_status: 'execution_error', score: 0 }, + { test_id: 'case-3', execution_status: 'quality_failure', score: 0.5 }, ]); const results = await loadNonErrorResults(filePath); @@ -74,8 +79,8 @@ describe('retry-errors', () => { expect(results[1].testId).toBe('case-3'); }); - it('supports snake_case result files 
written by the CLI', async () => { - const filePath = createJsonlFile([ + it('supports index.jsonl manifests written by the CLI', async () => { + const filePath = createIndexFile([ { test_id: 'case-1', execution_status: 'ok', score: 0.9 }, { test_id: 'case-2', execution_status: 'execution_error', score: 0 }, { test_id: 'case-3', execution_status: 'quality_failure', score: 0.5 }, @@ -90,7 +95,21 @@ describe('retry-errors', () => { expect(results[1].testId).toBe('case-3'); }); - it('supports index.jsonl manifests during the migration', async () => { + it('rejects flat JSONL result files', async () => { + const filePath = createFlatJsonlFile([ + { test_id: 'case-1', execution_status: 'ok', score: 0.9 }, + { test_id: 'case-2', execution_status: 'execution_error', score: 0 }, + ]); + + await expect(loadErrorTestIds(filePath)).rejects.toThrow( + 'Expected a run workspace directory or index.jsonl manifest', + ); + await expect(loadNonErrorResults(filePath)).rejects.toThrow( + 'Expected a run workspace directory or index.jsonl manifest', + ); + }); + + it('supports index.jsonl manifests', async () => { const filePath = createIndexFile([ { test_id: 'case-1', @@ -112,24 +131,20 @@ describe('retry-errors', () => { expect(ids).toEqual(['case-2']); }); - it('skips malformed JSON lines', async () => { + it('throws on malformed index.jsonl lines', async () => { tmpDir = mkdtempSync(path.join(tmpdir(), 'retry-errors-test-')); - const filePath = path.join(tmpDir, 'results.jsonl'); + const filePath = path.join(tmpDir, 'index.jsonl'); writeFileSync( filePath, [ - JSON.stringify({ testId: 'case-1', executionStatus: 'execution_error', score: 0 }), + JSON.stringify({ test_id: 'case-1', execution_status: 'execution_error', score: 0 }), 'not valid json', '', - JSON.stringify({ testId: 'case-2', executionStatus: 'ok', score: 0.9 }), + JSON.stringify({ test_id: 'case-2', execution_status: 'ok', score: 0.9 }), ].join('\n'), ); - const ids = await loadErrorTestIds(filePath); - 
expect(ids).toEqual(['case-1']); - - const results = await loadNonErrorResults(filePath); - expect(results).toHaveLength(1); - expect(results[0].testId).toBe('case-2'); + await expect(loadErrorTestIds(filePath)).rejects.toThrow(); + await expect(loadNonErrorResults(filePath)).rejects.toThrow(); }); }); diff --git a/examples/features/benchmark-tooling/README.md b/examples/features/benchmark-tooling/README.md index 0af46584c..dc336d10e 100644 --- a/examples/features/benchmark-tooling/README.md +++ b/examples/features/benchmark-tooling/README.md @@ -4,13 +4,13 @@ Utilities for multi-model benchmarking workflows with AgentV. ## N-Way Multi-Model Comparison (built-in) -`agentv compare` natively supports combined JSONL files with a `target` field, enabling N-way matrix comparison without splitting files. +`agentv compare` natively supports canonical run manifests with a `target` field, enabling N-way matrix comparison without splitting files. ### Quick Start ```bash -# Try it now — fixture included, no API keys needed -agentv compare examples/features/benchmark-tooling/fixtures/combined-results.jsonl +# Compare a recent canonical run +agentv compare .agentv/results/runs//index.jsonl ``` Output: @@ -88,7 +88,7 @@ Each line includes a `target` field to identify which model produced the result: ### Key Files - `evals/benchmark.eval.yaml` - Example eval config with 3 tests -- `fixtures/combined-results.jsonl` - Sample combined output (9 records: 3 tests x 3 targets) +- canonical run workspaces under `.agentv/results/runs//` ## split-by-target @@ -123,20 +123,19 @@ Target names are normalized for safe filenames: ### Downstream Compare Workflow -After splitting, use `agentv compare` to perform pairwise model comparisons: +Use `agentv compare` directly on the canonical run manifest for pairwise or matrix comparisons: ```bash -# 1. Run a matrix evaluation that produces a combined results file +# 1. 
Run a matrix evaluation that produces a canonical run workspace bun agentv eval my-eval.yaml -# 2. Split results by target -bun examples/features/benchmark-tooling/scripts/split-by-target.ts results.jsonl ./by-target - -# 3. Compare any two targets -bun agentv compare ./by-target/results.gpt-4.1.jsonl ./by-target/results.claude-sonnet-4.jsonl +# 2. Compare any two targets from the same run +bun agentv compare .agentv/results/runs//index.jsonl \ + --baseline gpt-4.1 --candidate claude-sonnet-4 -# 4. JSON output for CI pipelines -bun agentv compare ./by-target/results.gpt-4.1.jsonl ./by-target/results.claude-sonnet-4.jsonl --json +# 3. JSON output for CI pipelines +bun agentv compare .agentv/results/runs//index.jsonl \ + --baseline gpt-4.1 --candidate claude-sonnet-4 --json ``` The `compare` command matches records by `test_id`, calculates score deltas, and classifies each as win/loss/tie. It exits non-zero on regressions, making it suitable for CI gates. @@ -149,7 +148,8 @@ Computes aggregate win/loss/tie rates from `agentv compare --json` output, makin ```bash # Save comparison output to a file -bun agentv compare baseline.jsonl candidate.jsonl --json > comparison.json +bun agentv compare .agentv/results/runs//index.jsonl \ + .agentv/results/runs//index.jsonl --json > comparison.json # Print a human-readable summary table bun examples/features/benchmark-tooling/scripts/win-rate-summary.ts comparison.json @@ -167,8 +167,10 @@ Pass a directory of comparison JSON files to get per-metric win rates. 
Each file ```bash # Run comparisons for different metrics -bun agentv compare base.jsonl cand.jsonl --json > comparisons/accuracy.json -bun agentv compare base-latency.jsonl cand-latency.jsonl --json > comparisons/latency.json +bun agentv compare .agentv/results/runs//index.jsonl \ + .agentv/results/runs//index.jsonl --json > comparisons/accuracy.json +bun agentv compare .agentv/results/runs//index.jsonl \ + .agentv/results/runs//index.jsonl --json > comparisons/latency.json # Aggregate across all metrics bun examples/features/benchmark-tooling/scripts/win-rate-summary.ts comparisons/ @@ -282,16 +284,14 @@ bun examples/features/benchmark-tooling/scripts/benchmark-report.ts ./by-target/ # 1. Run multi-model evaluation bun agentv eval my-eval.yaml -# 2. Split results by target -bun examples/features/benchmark-tooling/scripts/split-by-target.ts results.jsonl ./by-target - -# 3. Compare two targets -bun agentv compare ./by-target/results.gpt-4.1.jsonl ./by-target/results.claude-sonnet-4.jsonl --json > comparison.json +# 2. Compare two targets from the run manifest +bun agentv compare .agentv/results/runs//index.jsonl \ + --baseline gpt-4.1 --candidate claude-sonnet-4 --json > comparison.json -# 4. Get win-rate summary +# 3. Get win-rate summary bun examples/features/benchmark-tooling/scripts/win-rate-summary.ts comparison.json -# 5. Statistical significance test +# 4. Statistical significance test bun examples/features/benchmark-tooling/scripts/significance-test.ts \ ./by-target/results.gpt-4.1.jsonl ./by-target/results.claude-sonnet-4.jsonl diff --git a/examples/features/compare/README.md b/examples/features/compare/README.md index 04e41cb8e..05df34e8f 100644 --- a/examples/features/compare/README.md +++ b/examples/features/compare/README.md @@ -1,11 +1,11 @@ # Baseline vs Candidate Comparison -Demonstrates comparing evaluation results using the `agentv compare` command. +Demonstrates comparing canonical run manifests using the `agentv compare` command. 
## What This Shows -- N-way matrix comparison from a combined JSONL file -- Two-file pairwise comparison (baseline vs candidate) +- N-way matrix comparison from a run manifest with multiple targets +- Two-run pairwise comparison (baseline vs candidate) - Score delta calculation and win/loss classification - Baseline regression detection via exit codes - Human-readable and JSON output formats @@ -15,33 +15,31 @@ Demonstrates comparing evaluation results using the `agentv compare` command. ```bash # From repository root -# N-way matrix from a combined results file (see ../benchmark-tooling/ for fixture) -agentv compare examples/features/benchmark-tooling/fixtures/combined-results.jsonl +# N-way matrix from a canonical run manifest +agentv compare .agentv/results/runs//index.jsonl -# Pairwise from combined file -agentv compare examples/features/benchmark-tooling/fixtures/combined-results.jsonl \ +# Pairwise from the same combined run manifest +agentv compare .agentv/results/runs//index.jsonl \ --baseline gpt-4.1 --candidate gpt-5-mini # CI regression gate: exit 1 if any target regresses vs baseline -agentv compare examples/features/benchmark-tooling/fixtures/combined-results.jsonl \ +agentv compare .agentv/results/runs//index.jsonl \ --baseline gpt-4.1 -# Two-file pairwise comparison (legacy) -agentv compare examples/features/compare/evals/baseline-results.jsonl \ - examples/features/compare/evals/candidate-results.jsonl +# Two-run pairwise comparison +agentv compare .agentv/results/runs//index.jsonl \ + .agentv/results/runs//index.jsonl # With custom threshold for win/loss classification -agentv compare examples/features/compare/evals/baseline-results.jsonl \ - examples/features/compare/evals/candidate-results.jsonl --threshold 0.05 +agentv compare .agentv/results/runs//index.jsonl \ + .agentv/results/runs//index.jsonl --threshold 0.05 # JSON output for CI pipelines -agentv compare examples/features/compare/evals/baseline-results.jsonl \ - 
examples/features/compare/evals/candidate-results.jsonl --json +agentv compare .agentv/results/runs//index.jsonl \ + .agentv/results/runs//index.jsonl --json ``` ## Key Files -- `evals/baseline-results.jsonl` - Results from baseline configuration -- `evals/candidate-results.jsonl` - Results from candidate configuration +- canonical run workspaces under `.agentv/results/runs//` - `evals/README.md` - Detailed usage documentation -- `../benchmark-tooling/fixtures/combined-results.jsonl` - Combined multi-target fixture for N-way matrix diff --git a/examples/features/compare/evals/README.md b/examples/features/compare/evals/README.md index c01a261f0..79fa226d3 100644 --- a/examples/features/compare/evals/README.md +++ b/examples/features/compare/evals/README.md @@ -1,27 +1,25 @@ # Compare Command Example -The `agentv compare` command supports three modes: N-way matrix from a combined JSONL, pairwise from a combined JSONL, and two-file pairwise. +The `agentv compare` command supports three modes: N-way matrix from a canonical run manifest, pairwise from a canonical run manifest, and two-run pairwise. 
## Use Case Compare model performance across different configurations: -- N-way matrix comparison across 3+ models from a single combined results file +- N-way matrix comparison across 3+ models from a single run manifest - Baseline regression gating in CI (exit 1 if any target regresses) - Head-to-head pairwise between two specific targets -- Before/after optimization runs (two-file pairwise) +- Before/after optimization runs (two-run pairwise) ## Sample Files -- `baseline-results.jsonl` - Results from baseline configuration (GPT-4.1) -- `candidate-results.jsonl` - Results from candidate configuration (GPT-5) -- `../../benchmark-tooling/fixtures/combined-results.jsonl` - Combined multi-target results (3 tests x 3 targets) +- canonical run workspaces under `.agentv/results/runs//` ## Usage -### N-Way Matrix (combined JSONL) +### N-Way Matrix (run manifest) ```bash -agentv compare combined-results.jsonl +agentv compare .agentv/results/runs//index.jsonl ``` Output: @@ -43,14 +41,14 @@ Pairwise Summary: ### Baseline Regression Check ```bash -agentv compare combined-results.jsonl --baseline gpt-4.1 +agentv compare .agentv/results/runs//index.jsonl --baseline gpt-4.1 # Exits 1 if any target regresses vs gpt-4.1 ``` -### Pairwise from Combined JSONL +### Pairwise from a Single Run Manifest ```bash -agentv compare combined-results.jsonl --baseline gpt-4.1 --candidate gpt-5-mini +agentv compare .agentv/results/runs//index.jsonl --baseline gpt-4.1 --candidate gpt-5-mini ``` ``` @@ -65,15 +63,16 @@ Comparing: gpt-4.1 → gpt-5-mini Summary: 0 wins, 0 losses, 3 ties | Mean Δ: -0.017 | Status: regressed ``` -### Two-File Pairwise (legacy) +### Two-Run Pairwise ```bash -agentv compare baseline-results.jsonl candidate-results.jsonl +agentv compare .agentv/results/runs//index.jsonl \ + .agentv/results/runs//index.jsonl ``` Output: ``` -Comparing: baseline-results.jsonl → candidate-results.jsonl +Comparing: .agentv/results/runs//index.jsonl → .agentv/results/runs//index.jsonl Test 
ID Baseline Candidate Delta Result ─────────────── ──────── ───────── ──────── ──────── @@ -91,7 +90,8 @@ Summary: 1 win, 0 losses, 4 ties | Mean Δ: +0.054 | Status: improved Use a stricter threshold (0.05) for win/loss classification: ```bash -agentv compare baseline-results.jsonl candidate-results.jsonl --threshold 0.05 +agentv compare .agentv/results/runs//index.jsonl \ + .agentv/results/runs//index.jsonl --threshold 0.05 ``` ### JSON Output @@ -99,7 +99,7 @@ agentv compare baseline-results.jsonl candidate-results.jsonl --threshold 0.05 For machine-readable output (CI pipelines, scripts): ```bash -agentv compare combined-results.jsonl --json +agentv compare .agentv/results/runs//index.jsonl --json ``` Output uses snake_case for Python ecosystem compatibility: @@ -130,8 +130,8 @@ Use exit codes for automated quality gates: ```bash # N-way: fail if any target regresses vs baseline -agentv compare results.jsonl --baseline gpt-4.1 || echo "Regression detected!" +agentv compare .agentv/results/runs//index.jsonl --baseline gpt-4.1 || echo "Regression detected!" -# Two-file: fail if candidate regresses -agentv compare baseline.jsonl candidate.jsonl || echo "Regression detected!" +# Two-run: fail if candidate regresses +agentv compare .agentv/results/runs//index.jsonl .agentv/results/runs//index.jsonl || echo "Regression detected!" ``` diff --git a/examples/features/compare/evals/dataset.eval.yaml b/examples/features/compare/evals/dataset.eval.yaml index 2d7209118..b9cff5bc0 100644 --- a/examples/features/compare/evals/dataset.eval.yaml +++ b/examples/features/compare/evals/dataset.eval.yaml @@ -1,9 +1,9 @@ # Demo eval for the compare example. 
-# Run against two targets to generate baseline and candidate result files: +# Run against two targets to generate canonical run workspaces: # agentv eval evals/dataset.eval.yaml --target baseline # agentv eval evals/dataset.eval.yaml --target candidate # Then compare: -# agentv compare evals/baseline-results.jsonl evals/candidate-results.jsonl +# agentv compare .agentv/results/runs//index.jsonl .agentv/results/runs//index.jsonl name: compare-demo description: Demo eval for generating baseline and candidate results to compare diff --git a/examples/showcase/multi-model-benchmark/README.md b/examples/showcase/multi-model-benchmark/README.md index 52fcad680..b519a28d4 100644 --- a/examples/showcase/multi-model-benchmark/README.md +++ b/examples/showcase/multi-model-benchmark/README.md @@ -51,20 +51,20 @@ bun agentv eval examples/showcase/multi-model-benchmark/evals/benchmark.eval.yam ## Comparing Models -The eval produces a combined results file with a `target` field per record. Use `agentv compare` to see all models side by side: +The eval produces a canonical run workspace with `target` in each `index.jsonl` record. Use `agentv compare` to see all models side by side: ```bash # N-way matrix — see all models at once -agentv compare results.jsonl +agentv compare .agentv/results/runs//index.jsonl # Designate a baseline for CI regression gating -agentv compare results.jsonl --baseline copilot +agentv compare .agentv/results/runs//index.jsonl --baseline copilot # Pairwise: compare two specific targets -agentv compare results.jsonl --baseline copilot --candidate claude +agentv compare .agentv/results/runs//index.jsonl --baseline copilot --candidate claude # JSON output for CI integration -agentv compare results.jsonl --json +agentv compare .agentv/results/runs//index.jsonl --json ``` ### Expected Output @@ -134,7 +134,7 @@ This surfaces non-determinism — if a model passes on trial 1 but fails on tria ### 4. 
Compare -The `agentv compare` command reads a combined JSONL (with `target` field) and shows an N-way matrix with pairwise summaries. Each pair classifies per-test deltas: +The `agentv compare` command reads a canonical run manifest (`index.jsonl`, with `target` per record) and shows an N-way matrix with pairwise summaries. Each pair classifies per-test deltas: - **Win**: candidate score exceeds baseline by threshold (default 0.10) - **Loss**: baseline score exceeds candidate by threshold @@ -154,8 +154,8 @@ benchmark.eval.yaml └────────┬────────────────┘ │ ▼ - combined results.jsonl - (all targets in one file) + .agentv/results/runs// + index.jsonl │ ▼ ┌─────────────────────────┐ diff --git a/plugins/agentv-dev/skills/agentv-eval-writer/SKILL.md b/plugins/agentv-dev/skills/agentv-eval-writer/SKILL.md index 5d9cc8331..1894d2f14 100644 --- a/plugins/agentv-dev/skills/agentv-eval-writer/SKILL.md +++ b/plugins/agentv-dev/skills/agentv-eval-writer/SKILL.md @@ -541,17 +541,17 @@ agentv eval assert --agent-output "..." --agent-input "..." 
# Import agent transcripts for offline grading agentv import claude --discover latest -# Re-run only execution errors from a previous output -agentv eval --retry-errors +# Re-run only execution errors from a previous run +agentv eval --retry-errors .agentv/results/runs//index.jsonl # Validate eval file agentv validate -# Compare results — N-way matrix from combined JSONL -agentv compare -agentv compare --baseline # CI regression gate -agentv compare --baseline --candidate # pairwise -agentv compare # two-file pairwise +# Compare results — N-way matrix from a canonical run manifest +agentv compare .agentv/results/runs//index.jsonl +agentv compare .agentv/results/runs//index.jsonl --baseline # CI regression gate +agentv compare .agentv/results/runs//index.jsonl --baseline --candidate # pairwise +agentv compare .agentv/results/runs//index.jsonl .agentv/results/runs//index.jsonl # Author assertions directly in the eval file # Prefer simple assertions when they fit the criteria; use deterministic or LLM-based graders when needed