From 6e7d8f9d7b947da2205733526ea5228a8cd4653f Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Sun, 5 Apr 2026 05:24:28 +0000
Subject: [PATCH 1/4] refactor(results): remove flat manifest loading

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 README.md                                     |   2 +-
 apps/cli/src/commands/compare/index.ts        |  67 +++++++++-
 apps/cli/src/commands/eval/artifact-writer.ts |  73 +++++++++-
 apps/cli/src/commands/eval/result-layout.ts   |  18 +++
 apps/cli/src/commands/eval/retry-errors.ts    |  25 ++--
 apps/cli/src/commands/eval/run-cache.ts       |  25 ++--
 apps/cli/src/commands/eval/run-eval.ts        | 125 +++++++++---------
 apps/cli/src/commands/pipeline/bench.ts       |   8 +-
 apps/cli/src/commands/pipeline/grade.ts       |   8 +-
 apps/cli/src/commands/pipeline/input.ts       |  14 +-
 apps/cli/src/commands/pipeline/run.ts         |  16 +--
 apps/cli/src/commands/results/export.ts       |  74 ++++++++---
 apps/cli/src/commands/results/manifest.ts     |  97 +++-----------
 apps/cli/src/commands/results/serve.ts        |  50 ++++---
 apps/cli/src/commands/results/shared.ts       |  26 ++--
 apps/cli/src/commands/trace/list.ts           |  10 +-
 apps/cli/src/commands/trace/score.ts          |   6 +-
 apps/cli/src/commands/trace/utils.ts          |  53 ++------
 .../cli/test/commands/compare/compare.test.ts |  15 +++
 apps/cli/test/commands/eval/run-cache.test.ts |  14 +-
 apps/cli/test/commands/results/export.test.ts |  27 +++-
 apps/cli/test/commands/results/serve.test.ts  |  18 ++-
 apps/cli/test/commands/results/shared.test.ts |  82 +++++++-----
 apps/cli/test/commands/trace/trace.test.ts    | 119 ++++-------------
 .../docs/docs/evaluation/running-evals.mdx    |   6 +-
 .../docs/docs/evaluators/structured-data.mdx  |   2 +-
 .../docs/docs/getting-started/quickstart.mdx  |   2 +-
 .../web/src/content/docs/docs/tools/trace.mdx |   8 +-
 examples/features/benchmark-tooling/README.md |  14 +-
 .../document-extraction/.agentv/targets.yaml  |   4 +-
 .../features/document-extraction/README.md    |  16 +--
 .../evals/confusion-metrics.eval.yaml         |   2 +-
 .../evals/field-accuracy.eval.yaml            |   5 +-
 .../scripts/aggregate_metrics.ts              |   8 +-
 examples/features/trace-analysis/README.md    |  12 +-
 examples/showcase/export-screening/README.md  |   4 +-
 .../src/evaluation/loaders/jsonl-parser.ts    |  49 ++++---
 packages/core/src/evaluation/yaml-parser.ts   |  79 ++++++-----
 38 files changed, 648 insertions(+), 535 deletions(-)
diff --git a/README.md b/README.md
index e51776e2b..23415f112 100644
--- a/README.md
+++ b/README.md
@@ -71,7 +71,7 @@ agentv eval evals/my-eval.yaml
 
 **5. Compare results across targets:**
 ```bash
-agentv compare .agentv/results/runs/eval_<timestamp>/index.jsonl
+agentv compare .agentv/results/runs/<timestamp>/index.jsonl
 ```
 
 ## Output formats
diff --git a/apps/cli/src/commands/compare/index.ts b/apps/cli/src/commands/compare/index.ts
index 927eb6454..5dadaaf06 100644
--- a/apps/cli/src/commands/compare/index.ts
+++ b/apps/cli/src/commands/compare/index.ts
@@ -1,3 +1,6 @@
+import { readFileSync } from 'node:fs';
+import path from 'node:path';
+
 import {
   array,
   command,
@@ -62,6 +65,66 @@ interface MatrixRow {
   scores: Record<string, number>;
 }
 
+interface ParsedCompareResult {
+  testId: string; // flat files: first of test_id / testId / eval_id / evalId that is a string
+  score: number; // flat files: must be a non-NaN number or loading throws
+  target?: string; // undefined when the row has no string `target`
+}
+
+/** Parses a flat JSONL result file into compare records; throws on any malformed row. */
+function loadFlatCompareResults(filePath: string): ParsedCompareResult[] {
+  const idKeys = ['test_id', 'testId', 'eval_id', 'evalId'];
+  const results: ParsedCompareResult[] = [];
+
+  for (const [index, rawLine] of readFileSync(filePath, 'utf8').split('\n').entries()) {
+    const line = rawLine.trim();
+    if (!line) continue; // blank lines (e.g. after the final newline) carry no record
+
+    let parsed: unknown;
+    try {
+      parsed = JSON.parse(line);
+    } catch (error) {
+      const reason = error instanceof Error ? error.message : String(error);
+      throw new SyntaxError(`Invalid JSON on line ${index + 1} of ${filePath}: ${reason}`);
+    }
+    if (!parsed || typeof parsed !== 'object' || Array.isArray(parsed)) {
+      // `null`, primitives and arrays are valid JSON but not result records.
+      throw new Error(`Expected a JSON object on line ${index + 1} of ${filePath}`);
+    }
+    const record = parsed as Record<string, unknown>;
+    // First string-valued id key wins, in the priority order listed in idKeys.
+    const testId = idKeys.map((key) => record[key]).find((value) => typeof value === 'string');
+    if (typeof testId !== 'string' || !testId) {
+      throw new Error(`Missing test_id in result source: ${filePath}`);
+    }
+    if (typeof record.score !== 'number' || Number.isNaN(record.score)) {
+      throw new Error(`Missing or invalid score in result source: ${filePath}`);
+    }
+
+    const target = typeof record.target === 'string' ? record.target : undefined;
+    results.push({ testId, score: record.score, target });
+  }
+
+  return results;
+}
+
+/**
+ * Loads compare records from an index manifest, else from a flat JSONL result file.
+ */
+function loadCompareResults(filePath: string): ParsedCompareResult[] {
+  let resolvedPath: string | undefined;
+  try {
+    resolvedPath = resolveResultSourcePath(filePath);
+  } catch {
+    // Only resolution failures fall back to flat parsing; manifest load errors must surface.
+  }
+  if (resolvedPath === undefined || path.basename(resolvedPath) !== 'index.jsonl') {
+    return loadFlatCompareResults(filePath);
+  }
+  const records = loadLightweightResults(resolvedPath);
+  return records.map(({ testId, score, target }) => ({ testId, score, target }));
+}
+
 export interface MatrixOutput {
   matrix: MatrixRow[];
   pairwise: ComparisonOutput[];
@@ -69,7 +132,7 @@ export interface MatrixOutput {
 }
 
 export function loadJsonlResults(filePath: string): EvalResult[] {
-  return loadLightweightResults(resolveResultSourcePath(filePath)).map((record) => ({
+  return loadCompareResults(filePath).map((record) => ({
     testId: record.testId,
     score: record.score,
   }));
@@ -78,7 +141,7 @@ export function loadJsonlResults(filePath: string): EvalResult[] {
 export function loadCombinedResults(filePath: string): Map<string, EvalResult[]> {
   const groups = new Map<string, EvalResult[]>();
 
-  for (const record of loadLightweightResults(resolveResultSourcePath(filePath))) {
+  for (const record of loadCompareResults(filePath)) {
     if (typeof record.target !== 'string') {
       throw new Error(`Missing target field in combined result source: ${filePath}`);
     }
diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts
index 2111453be..14035f20b 100644
--- a/apps/cli/src/commands/eval/artifact-writer.ts
+++ b/apps/cli/src/commands/eval/artifact-writer.ts
@@ -594,6 +594,74 @@ function toCamelCaseDeep(obj: unknown): unknown {
   return obj;
 }
 
+type ParsedEvaluationResult = Record<string, unknown> & { // built by normalizeParsedResult
+  timestamp: string; // Unix-epoch ISO string when the row had no string timestamp
+  testId: string; // row's testId, else evalId, else 'unknown'
+  score: number; // 0 when the row had no numeric score
+  assertions: EvaluationResult['assertions']; // entries failing isAssertionEntry are dropped
+  target: string; // 'unknown' when the row had no string target
+  output: EvaluationResult['output']; // entries failing isOutputMessage are dropped
+  executionStatus: EvaluationResult['executionStatus']; // 'ok' when missing or unrecognized
+};
+
+const EXECUTION_STATUSES = new Set<EvaluationResult['executionStatus']>([
+  'ok', // also the fallback normalizeParsedResult uses for unrecognized statuses
+  'quality_failure',
+  'execution_error',
+]);
+
+/** Guard for assertion entries: string `text`, boolean `passed`, optional string `evidence`. */
+function isAssertionEntry(value: unknown): value is EvaluationResult['assertions'][number] {
+  if (value === null || typeof value !== 'object' || Array.isArray(value)) {
+    return false;
+  }
+
+  const { text, passed, evidence } = value as Record<string, unknown>;
+  if (typeof text !== 'string' || typeof passed !== 'boolean') {
+    return false;
+  }
+  return evidence === undefined || typeof evidence === 'string';
+}
+
+/** Guard for output messages: any non-array object whose `role` is a string. */
+function isOutputMessage(value: unknown): value is EvaluationResult['output'][number] {
+  if (value === null || typeof value !== 'object' || Array.isArray(value)) {
+    return false;
+  }
+  const { role } = value as Record<string, unknown>;
+  return typeof role === 'string';
+}
+
+/** Guard: `value` is a string contained in EXECUTION_STATUSES. */
+function isExecutionStatus(value: unknown): value is EvaluationResult['executionStatus'] {
+  if (typeof value !== 'string') return false;
+  // Cast only satisfies Set#has typing; membership is still checked at runtime.
+  return EXECUTION_STATUSES.has(value as EvaluationResult['executionStatus']);
+}
+
+/** Normalizes a parsed JSONL row, filling defaults for missing or mistyped required fields. */
+function normalizeParsedResult(value: unknown): ParsedEvaluationResult | undefined {
+  const isPlainObject = !!value && typeof value === 'object' && !Array.isArray(value);
+  if (!isPlainObject) {
+    return undefined;
+  }
+  const raw = value as Record<string, unknown>;
+  const { timestamp, testId, evalId, score, assertions, target, output, executionStatus } = raw;
+  // Fall back to `evalId` when `testId` is not a string.
+  const fallbackId = typeof evalId === 'string' ? evalId : 'unknown';
+
+  return {
+    ...raw,
+    timestamp: typeof timestamp === 'string' ? timestamp : new Date(0).toISOString(),
+    testId: typeof testId === 'string' ? testId : fallbackId,
+    score: typeof score === 'number' ? score : 0,
+    assertions: Array.isArray(assertions) ? assertions.filter(isAssertionEntry) : [],
+    target: typeof target === 'string' ? target : 'unknown',
+    output: Array.isArray(output) ? output.filter(isOutputMessage) : [],
+    executionStatus: isExecutionStatus(executionStatus) ? executionStatus : 'ok',
+  };
+}
+
 // ---------------------------------------------------------------------------
 // JSONL parsing
 // ---------------------------------------------------------------------------
@@ -610,7 +678,10 @@ export function parseJsonlResults(content: string): EvaluationResult[] {
       const parsed = JSON.parse(trimmed);
       // JSONL files from AgentV use snake_case; convert back to camelCase
       const camelCased = toCamelCaseDeep(parsed);
-      results.push(camelCased as EvaluationResult);
+      const normalized = normalizeParsedResult(camelCased);
+      if (normalized) {
+        results.push(normalized);
+      }
     } catch {
       // Skip malformed lines
     }
diff --git a/apps/cli/src/commands/eval/result-layout.ts b/apps/cli/src/commands/eval/result-layout.ts
index 800a62584..b6e6c57b7 100644
--- a/apps/cli/src/commands/eval/result-layout.ts
+++ b/apps/cli/src/commands/eval/result-layout.ts
@@ -20,6 +20,10 @@ export function resolveRunIndexPath(runDir: string): string {
   return path.join(runDir, RESULT_INDEX_FILENAME);
 }
 
+export function isRunManifestPath(filePath: string): boolean {
+  return path.basename(filePath) === RESULT_INDEX_FILENAME; // name check only; no fs access
+}
+
 export function resolveExistingRunPrimaryPath(runDir: string): string | undefined {
   const indexPath = resolveRunIndexPath(runDir);
   if (existsSync(indexPath)) {
@@ -49,3 +53,17 @@ export function resolveWorkspaceOrFilePath(filePath: string): string {
 
   return existing;
 }
+
+/** Accepts a run directory or a RESULT_INDEX_FILENAME manifest path; anything else throws. */
+export function resolveRunManifestPath(filePath: string): string {
+  // Directories are delegated to resolveWorkspaceOrFilePath.
+  if (isDirectoryPath(filePath)) {
+    return resolveWorkspaceOrFilePath(filePath);
+  }
+  if (isRunManifestPath(filePath)) {
+    return filePath;
+  }
+  throw new Error(
+    `Expected a run workspace directory or ${RESULT_INDEX_FILENAME} manifest: ${filePath}`,
+  );
+}
diff --git a/apps/cli/src/commands/eval/retry-errors.ts b/apps/cli/src/commands/eval/retry-errors.ts
index a1760ffa6..8a39bc3bf 100644
--- a/apps/cli/src/commands/eval/retry-errors.ts
+++ b/apps/cli/src/commands/eval/retry-errors.ts
@@ -1,17 +1,25 @@
+import { readFile } from 'node:fs/promises';
+
 import type { EvaluationResult } from '@agentv/core';
 
-import {
-  loadLightweightResults,
-  loadManifestResults,
-  resolveResultSourcePath,
-} from '../results/manifest.js';
+import { loadManifestResults, resolveResultSourcePath } from '../results/manifest.js';
+import { parseJsonlResults } from './artifact-writer.js';
+
+/** Loads results via the manifest loader, falling back to raw JSONL parsing on any failure. */
+async function loadRetrySourceResults(jsonlPath: string): Promise<readonly EvaluationResult[]> {
+  try {
+    return loadManifestResults(resolveResultSourcePath(jsonlPath));
+  } catch {
+    // Best effort: treat the path as a plain JSONL results file instead.
+    return parseJsonlResults(await readFile(jsonlPath, 'utf8'));
+  }
+}
 
 /**
  * Load test IDs from an index/results source that have executionStatus === 'execution_error'.
  */
 export async function loadErrorTestIds(jsonlPath: string): Promise<readonly string[]> {
-  const resolvedPath = resolveResultSourcePath(jsonlPath);
-  const ids = loadLightweightResults(resolvedPath)
+  const ids = (await loadRetrySourceResults(jsonlPath))
     .filter((result) => result.executionStatus === 'execution_error')
     .map((result) => result.testId);
 
@@ -23,8 +31,7 @@ export async function loadErrorTestIds(jsonlPath: string): Promise<readonly stri
  * These are the "good" results that should be preserved when merging retry output.
  */
 export async function loadNonErrorResults(jsonlPath: string): Promise<readonly EvaluationResult[]> {
-  const resolvedPath = resolveResultSourcePath(jsonlPath);
-  return loadManifestResults(resolvedPath).filter(
+  return (await loadRetrySourceResults(jsonlPath)).filter(
     (result) => result.testId && result.executionStatus !== 'execution_error',
   );
 }
diff --git a/apps/cli/src/commands/eval/run-cache.ts b/apps/cli/src/commands/eval/run-cache.ts
index 80c523a26..50c9e7824 100644
--- a/apps/cli/src/commands/eval/run-cache.ts
+++ b/apps/cli/src/commands/eval/run-cache.ts
@@ -16,21 +16,19 @@ const CACHE_FILENAME = 'cache.json';
 export interface RunCache {
   /** Directory path for new per-run directory format (e.g. .agentv/results/runs/<ts>/) */
   readonly lastRunDir?: string;
-  /** JSONL file path for legacy flat-file format. Kept for backward compat. */
+  /** @deprecated Legacy flat-file pointer from old cache files. Ignored on read. */
   readonly lastResultFile?: string;
   readonly timestamp: string;
 }
 
 /**
  * Resolve the primary result manifest path from a RunCache entry.
- * New format: lastRunDir/index.jsonl
- * Legacy format: lastResultFile (flat JSONL path)
  */
 export function resolveRunCacheFile(cache: RunCache): string {
   if (cache.lastRunDir) {
     return resolveExistingRunPrimaryPath(cache.lastRunDir) ?? resolveRunIndexPath(cache.lastRunDir);
   }
-  return cache.lastResultFile ?? '';
+  return ''; // no run directory recorded; legacy lastResultFile is no longer consulted
 }
 
 function cachePath(cwd: string): string {
@@ -47,18 +45,15 @@ export async function loadRunCache(cwd: string): Promise<RunCache | undefined> {
 }
 
 export async function saveRunCache(cwd: string, resultPath: string): Promise<void> {
+  if (path.basename(resultPath) !== RESULT_INDEX_FILENAME) {
+    return; // only run manifests are cached; other result paths are silently skipped
+  }
+
   const dir = path.join(cwd, '.agentv');
   await mkdir(dir, { recursive: true });
-  const basename = path.basename(resultPath);
-  const cache: RunCache =
-    basename === RESULT_INDEX_FILENAME
-      ? {
-          lastRunDir: path.dirname(resultPath),
-          timestamp: new Date().toISOString(),
-        }
-      : {
-          lastResultFile: resultPath,
-          timestamp: new Date().toISOString(),
-        };
+  const cache: RunCache = {
+    lastRunDir: path.dirname(resultPath),
+    timestamp: new Date().toISOString(),
+  };
   await writeFile(cachePath(cwd), `${JSON.stringify(cache, null, 2)}\n`, 'utf-8');
 }
diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
index 1a26fff4b..febbec3cc 100644
--- a/apps/cli/src/commands/eval/run-eval.ts
+++ b/apps/cli/src/commands/eval/run-eval.ts
@@ -400,21 +400,21 @@ function createProgressReporter(
   };
 }
 
-function makeEvalKey(testFilePath: string, evalId: string): string {
-  return `${path.resolve(testFilePath)}::${evalId}`;
+function makeTestCaseKey(testFilePath: string, testId: string): string {
+  return `${path.resolve(testFilePath)}::${testId}`; // "<resolved eval file path>::<test id>"
 }
 
-function createDisplayIdTracker(): { getOrAssign(evalKey: string): number } {
+function createDisplayIdTracker(): { getOrAssign(testCaseKey: string): number } {
   const map = new Map<string, number>();
   let nextId = 1;
   return {
-    getOrAssign(evalKey: string): number {
-      const existing = map.get(evalKey);
+    getOrAssign(testCaseKey: string): number {
+      const existing = map.get(testCaseKey);
       if (existing !== undefined) {
         return existing;
       }
       const assigned = nextId++;
-      map.set(evalKey, assigned);
+      map.set(testCaseKey, assigned);
       return assigned;
     },
   };
@@ -476,11 +476,11 @@ async function prepareFileMetadata(params: {
   readonly cwd: string;
   readonly options: NormalizedOptions;
 }): Promise<{
-  readonly evalIds: readonly string[];
-  readonly evalCases: readonly EvalTest[];
+  readonly testIds: readonly string[];
+  readonly testCases: readonly EvalTest[];
   readonly selections: readonly { selection: TargetSelection; inlineTargetLabel: string }[];
   readonly trialsConfig?: TrialsConfig;
-  readonly suiteTargets?: readonly string[];
+  readonly datasetTargets?: readonly string[];
   readonly yamlWorkers?: number;
   readonly yamlCache?: boolean;
   readonly yamlCachePath?: string;
@@ -501,23 +501,23 @@ async function prepareFileMetadata(params: {
   const relativePath = path.relative(cwd, testFilePath);
   const category = deriveCategory(relativePath);
 
-  const suite = await loadTestSuite(testFilePath, repoRoot, {
+  const dataset = await loadTestSuite(testFilePath, repoRoot, {
     verbose: options.verbose,
     filter: options.filter,
     category,
   });
-  const filteredIds = suite.tests.map((value) => value.id);
+  const testIds = dataset.tests.map((value) => value.id);
 
   // Determine target names: CLI --target flags override YAML
   const cliTargets = options.cliTargets;
-  const suiteTargets = suite.targets;
+  const datasetTargets = dataset.targets;
 
-  // Resolve which target names to use (precedence: CLI > YAML targets > YAML target > default)
+  // Resolve which target names to use (precedence: CLI > dataset YAML targets > default)
   let targetNames: readonly string[];
   if (cliTargets.length > 0) {
     targetNames = cliTargets;
-  } else if (suiteTargets && suiteTargets.length > 0) {
-    targetNames = suiteTargets;
+  } else if (datasetTargets && datasetTargets.length > 0) {
+    targetNames = datasetTargets;
   } else {
     targetNames = [];
   }
@@ -567,18 +567,18 @@ async function prepareFileMetadata(params: {
   }
 
   return {
-    evalIds: filteredIds,
-    evalCases: suite.tests,
+    testIds,
+    testCases: dataset.tests,
     selections,
-    trialsConfig: suite.trials,
-    suiteTargets,
-    yamlWorkers: suite.workers,
-    yamlCache: suite.cacheConfig?.enabled,
-    yamlCachePath: suite.cacheConfig?.cachePath,
-    totalBudgetUsd: suite.totalBudgetUsd,
-    failOnError: suite.failOnError,
-    threshold: suite.threshold,
-    tags: suite.metadata?.tags,
+    trialsConfig: dataset.trials,
+    datasetTargets,
+    yamlWorkers: dataset.workers,
+    yamlCache: dataset.cacheConfig?.enabled,
+    yamlCachePath: dataset.cacheConfig?.cachePath,
+    totalBudgetUsd: dataset.totalBudgetUsd,
+    failOnError: dataset.failOnError,
+    threshold: dataset.threshold,
+    tags: dataset.metadata?.tags,
   };
 }
 
@@ -613,11 +613,11 @@ async function runSingleEvalFile(params: {
   readonly workersOverride?: number;
   readonly yamlWorkers?: number;
   readonly progressReporter: ProgressReporter;
-  readonly seenEvalCases: Set<string>;
-  readonly displayIdTracker: { getOrAssign(evalKey: string): number };
+  readonly seenTestCases: Set<string>;
+  readonly displayIdTracker: { getOrAssign(testCaseKey: string): number };
   readonly selection: TargetSelection;
   readonly inlineTargetLabel: string;
-  readonly evalCases: readonly EvalTest[];
+  readonly testCases: readonly EvalTest[];
   readonly trialsConfig?: TrialsConfig;
   readonly matrixMode?: boolean;
   readonly totalBudgetUsd?: number;
@@ -636,11 +636,11 @@ async function runSingleEvalFile(params: {
     workersOverride,
     yamlWorkers,
     progressReporter,
-    seenEvalCases,
+    seenTestCases,
     displayIdTracker,
     selection,
     inlineTargetLabel,
-    evalCases,
+    testCases,
     trialsConfig,
     matrixMode,
     totalBudgetUsd,
@@ -731,7 +731,7 @@ async function runSingleEvalFile(params: {
       return true;
     })(),
     filter: options.filter,
-    evalCases,
+    evalCases: testCases,
     verbose: options.verbose,
     maxConcurrency: resolvedWorkers,
     workspaceMode: options.workspaceMode,
@@ -747,7 +747,7 @@ async function runSingleEvalFile(params: {
       (
         streamingObserver as { completeFromResult?: (result: EvaluationResult) => void } | null
       )?.completeFromResult?.(result);
-      // Finalize streaming observer span with score
+      // Finalize the streaming observer span with score.
       streamingObserver?.finalizeEvalCase(result.score, result.error);
 
       // Trim output messages for results JSONL based on --output-messages.
@@ -775,13 +775,13 @@ async function runSingleEvalFile(params: {
       }
     },
     onProgress: async (event) => {
-      const evalKeyId = matrixMode ? `${event.testId}@${targetName}` : event.testId;
-      const evalKey = makeEvalKey(testFilePath, evalKeyId);
-      if (event.status === 'pending' && !seenEvalCases.has(evalKey)) {
-        seenEvalCases.add(evalKey);
-        progressReporter.setTotal(seenEvalCases.size);
+      const testCaseKeyId = matrixMode ? `${event.testId}@${targetName}` : event.testId;
+      const testCaseKey = makeTestCaseKey(testFilePath, testCaseKeyId);
+      if (event.status === 'pending' && !seenTestCases.has(testCaseKey)) {
+        seenTestCases.add(testCaseKey);
+        progressReporter.setTotal(seenTestCases.size);
       }
-      const displayId = displayIdTracker.getOrAssign(evalKey);
+      const displayId = displayIdTracker.getOrAssign(testCaseKey);
 
       // Start streaming observer when eval case begins execution
       if (event.status === 'running' && streamingObserver) {
@@ -997,7 +997,7 @@ export async function runEvalCommand(
   // We defer cache creation until after file metadata is loaded
   const evaluationRunner = await resolveEvaluationRunner();
   const allResults: EvaluationResult[] = [];
-  const seenEvalCases = new Set<string>();
+  const seenTestCases = new Set<string>();
   const displayIdTracker = createDisplayIdTracker();
 
   // Derive file-level concurrency from worker count (global) when provided
@@ -1012,14 +1012,14 @@ export async function runEvalCommand(
   const fileMetadata = new Map<
     string,
     {
-      readonly evalIds: readonly string[];
-      readonly evalCases: readonly EvalTest[];
+      readonly testIds: readonly string[];
+      readonly testCases: readonly EvalTest[];
       readonly selections: readonly {
         selection: TargetSelection;
         inlineTargetLabel: string;
       }[];
       readonly trialsConfig?: TrialsConfig;
-      readonly suiteTargets?: readonly string[];
+      readonly datasetTargets?: readonly string[];
       readonly yamlWorkers?: number;
       readonly yamlCache?: boolean;
       readonly yamlCachePath?: string;
@@ -1097,13 +1097,12 @@ export async function runEvalCommand(
   const cache = cacheEnabled
     ? new ResponseCache(yamlCachePath ? path.resolve(yamlCachePath) : undefined)
     : undefined;
-  const useCache = cacheEnabled;
 
   if (cacheEnabled) {
     console.log(`Response cache: enabled${yamlCachePath ? ` (${yamlCachePath})` : ''}`);
   }
 
-  // Resolve suite-level threshold: CLI --threshold takes precedence over YAML execution.threshold
+  // Resolve dataset-level threshold: CLI --threshold takes precedence over YAML execution.threshold.
   const yamlThreshold = firstMeta?.threshold;
   const resolvedThreshold = options.threshold ?? yamlThreshold;
   if (resolvedThreshold !== undefined && (resolvedThreshold < 0 || resolvedThreshold > 1)) {
@@ -1127,13 +1126,13 @@ export async function runEvalCommand(
   // In matrix mode, total eval count is tests × targets (accounting for per-test target overrides)
   let totalEvalCount = 0;
   for (const meta of fileMetadata.values()) {
-    const suiteTargetNames = meta.selections.map((s) => s.selection.targetName);
-    for (const test of meta.evalCases) {
-      // Per-test targets override suite-level targets
+    const datasetTargetNames = meta.selections.map((s) => s.selection.targetName);
+    for (const test of meta.testCases) {
+      // Per-test targets override dataset-level targets.
       const testTargetNames =
         test.targets && test.targets.length > 0
-          ? test.targets.filter((t) => suiteTargetNames.includes(t))
-          : suiteTargetNames;
+          ? test.targets.filter((t) => datasetTargetNames.includes(t))
+          : datasetTargetNames;
       totalEvalCount += testTargetNames.length > 0 ? testTargetNames.length : 1;
     }
   }
@@ -1177,13 +1176,13 @@ export async function runEvalCommand(
   });
   for (const [testFilePath, meta] of fileMetadata.entries()) {
     for (const { selection, inlineTargetLabel } of meta.selections) {
-      for (const testId of meta.evalIds) {
-        const evalKey = makeEvalKey(
+      for (const testId of meta.testIds) {
+        const testCaseKey = makeTestCaseKey(
           testFilePath,
           meta.selections.length > 1 ? `${testId}@${selection.targetName}` : testId,
         );
-        seenEvalCases.add(evalKey);
-        const displayId = displayIdTracker.getOrAssign(evalKey);
+        seenTestCases.add(testCaseKey);
+        const displayId = displayIdTracker.getOrAssign(testCaseKey);
         progressReporter.update(displayId, {
           workerId: displayId,
           testId: meta.selections.length > 1 ? `${testId}@${selection.targetName}` : testId,
@@ -1207,19 +1206,19 @@ export async function runEvalCommand(
       // Run all targets concurrently (each target has its own worker limit)
       const targetResults = await Promise.all(
         targetPrep.selections.map(async ({ selection, inlineTargetLabel }) => {
-          // Filter eval cases to those applicable to this target
+          // Filter test cases to those applicable to this target.
           const targetName = selection.targetName;
-          const applicableEvalCases =
+          const applicableTestCases =
             targetPrep.selections.length > 1
-              ? targetPrep.evalCases.filter((test) => {
+              ? targetPrep.testCases.filter((test) => {
                   if (test.targets && test.targets.length > 0) {
                     return test.targets.includes(targetName);
                   }
                   return true;
                 })
-              : targetPrep.evalCases;
+              : targetPrep.testCases;
 
-          if (applicableEvalCases.length === 0) {
+          if (applicableTestCases.length === 0) {
             return [];
           }
 
@@ -1236,11 +1235,11 @@ export async function runEvalCommand(
               workersOverride: perFileWorkers,
               yamlWorkers: targetPrep.yamlWorkers,
               progressReporter,
-              seenEvalCases,
+              seenTestCases,
               displayIdTracker,
               selection,
               inlineTargetLabel,
-              evalCases: applicableEvalCases,
+              testCases: applicableTestCases,
               trialsConfig: targetPrep.trialsConfig,
               matrixMode: targetPrep.selections.length > 1,
               totalBudgetUsd: targetPrep.totalBudgetUsd,
@@ -1254,9 +1253,9 @@ export async function runEvalCommand(
             // Mark all tests in this file as errors and continue with other files.
             const message = fileError instanceof Error ? fileError.message : String(fileError);
             console.error(`\n⚠ Eval file failed: ${path.basename(testFilePath)} — ${message}\n`);
-            const errorResults: EvaluationResult[] = applicableEvalCases.map((evalCase) => ({
+            const errorResults: EvaluationResult[] = applicableTestCases.map((testCase) => ({
               timestamp: new Date().toISOString(),
-              testId: evalCase.id,
+              testId: testCase.id,
               score: 0,
               assertions: [],
               output: [],
diff --git a/apps/cli/src/commands/pipeline/bench.ts b/apps/cli/src/commands/pipeline/bench.ts
index 58a86c271..ee355c5b2 100644
--- a/apps/cli/src/commands/pipeline/bench.ts
+++ b/apps/cli/src/commands/pipeline/bench.ts
@@ -37,15 +37,15 @@ export const evalBenchCommand = command({
     const manifest = JSON.parse(await readFile(join(exportDir, 'manifest.json'), 'utf8'));
     const testIds: string[] = manifest.test_ids;
     const targetName: string = manifest.target?.name ?? 'unknown';
-    const evalSet: string = manifest.dataset ?? '';
+    const datasetName: string = manifest.dataset ?? '';
     const experiment: string | undefined = manifest.experiment;
-    const safeEvalSet = evalSet ? evalSet.replace(/[\/\\:*?"<>|]/g, '_') : '';
+    const safeDatasetName = datasetName ? datasetName.replace(/[\/\\:*?"<>|]/g, '_') : '';
 
     const indexLines: string[] = [];
     const allPassRates: number[] = [];
 
     for (const testId of testIds) {
-      const subpath = safeEvalSet ? [safeEvalSet, testId] : [testId];
+      const subpath = safeDatasetName ? [safeDatasetName, testId] : [testId];
       const testDir = join(exportDir, ...subpath);
       const artifactSubdir = subpath.join('/');
       const evaluators: EvaluatorScore[] = [];
@@ -177,7 +177,7 @@ export const evalBenchCommand = command({
         JSON.stringify({
           timestamp: manifest.timestamp,
           test_id: testId,
-          dataset: evalSet || undefined,
+          dataset: datasetName || undefined,
           experiment: experiment || undefined,
           score: Math.round(weightedScore * 1000) / 1000,
           target: targetName,
diff --git a/apps/cli/src/commands/pipeline/grade.ts b/apps/cli/src/commands/pipeline/grade.ts
index 80729f0b7..45faa8608 100644
--- a/apps/cli/src/commands/pipeline/grade.ts
+++ b/apps/cli/src/commands/pipeline/grade.ts
@@ -10,7 +10,7 @@
  * Progress is printed to stderr so users see real-time feedback.
  *
  * Export directory additions:
- *   <out-dir>/<eval-set>/<test-id>/code_grader_results/<name>.json
+ *   <out-dir>/<dataset>/<test-id>/code_grader_results/<name>.json
  */
 import { mkdir, readFile, readdir, writeFile } from 'node:fs/promises';
 import { join } from 'node:path';
@@ -196,14 +196,14 @@ export const evalGradeCommand = command({
     const manifestPath = join(exportDir, 'manifest.json');
     const manifest = JSON.parse(await readFile(manifestPath, 'utf8'));
     const testIds: string[] = manifest.test_ids;
-    const evalSet: string = manifest.dataset ?? '';
-    const safeEvalSet = evalSet ? evalSet.replace(/[\/\\:*?"<>|]/g, '_') : '';
+    const datasetName: string = manifest.dataset ?? '';
+    const safeDatasetName = datasetName ? datasetName.replace(/[\/\\:*?"<>|]/g, '_') : '';
 
     // Collect all grader tasks upfront so we know the total count
     const tasks: GraderTask[] = [];
 
     for (const testId of testIds) {
-      const subpath = safeEvalSet ? [safeEvalSet, testId] : [testId];
+      const subpath = safeDatasetName ? [safeDatasetName, testId] : [testId];
       const testDir = join(exportDir, ...subpath);
       const codeGradersDir = join(testDir, 'code_graders');
       const resultsDir = join(testDir, 'code_grader_results');
diff --git a/apps/cli/src/commands/pipeline/input.ts b/apps/cli/src/commands/pipeline/input.ts
index ef53fe5e0..28b43b391 100644
--- a/apps/cli/src/commands/pipeline/input.ts
+++ b/apps/cli/src/commands/pipeline/input.ts
@@ -9,7 +9,7 @@
  * Export directory layout:
  *   <out-dir>/
  *   ├── manifest.json
- *   └── <eval-set>/              (omitted if eval.yaml has no name)
+ *   └── <dataset>/               (omitted if eval.yaml has no name)
  *       └── <test-id>/
  *           ├── input.json
  *           ├── invoke.json
@@ -58,8 +58,8 @@ export const evalInputCommand = command({
     const evalDir = dirname(resolvedEvalPath);
 
     const category = deriveCategory(relative(process.cwd(), resolvedEvalPath));
-    const suite = await loadTestSuite(resolvedEvalPath, repoRoot, { category });
-    const tests = suite.tests;
+    const dataset = await loadTestSuite(resolvedEvalPath, repoRoot, { category });
+    const tests = dataset.tests;
 
     if (tests.length === 0) {
       console.error('No tests found in eval file.');
@@ -107,13 +107,13 @@ export const evalInputCommand = command({
       // No targets file found — subagent-as-target mode
     }
 
-    const evalSetName = suite.metadata?.name?.trim() ?? '';
-    const safeEvalSet = evalSetName ? evalSetName.replace(/[\/\\:*?"<>|]/g, '_') : '';
+    const datasetName = dataset.metadata?.name?.trim() ?? '';
+    const safeDatasetName = datasetName ? datasetName.replace(/[\/\\:*?"<>|]/g, '_') : '';
 
     const testIds: string[] = [];
 
     for (const test of tests) {
-      const subpath = safeEvalSet ? [safeEvalSet, test.id] : [test.id];
+      const subpath = safeDatasetName ? [safeDatasetName, test.id] : [test.id];
       const testDir = join(outDir, ...subpath);
       await mkdir(testDir, { recursive: true });
       testIds.push(test.id);
@@ -168,7 +168,7 @@ export const evalInputCommand = command({
     // manifest.json
     await writeJson(join(outDir, 'manifest.json'), {
       eval_file: resolvedEvalPath,
-      dataset: evalSetName || undefined,
+      dataset: datasetName || undefined,
       experiment: experiment || undefined,
       timestamp: new Date().toISOString(),
       target: {
diff --git a/apps/cli/src/commands/pipeline/run.ts b/apps/cli/src/commands/pipeline/run.ts
index be062a4c7..372bfd04f 100644
--- a/apps/cli/src/commands/pipeline/run.ts
+++ b/apps/cli/src/commands/pipeline/run.ts
@@ -100,8 +100,8 @@ export const evalRunCommand = command({
 
     // ── Step 1: Extract inputs (same as pipeline input) ──────────────
     const category = deriveCategory(relative(process.cwd(), resolvedEvalPath));
-    const suite = await loadTestSuite(resolvedEvalPath, repoRoot, { category });
-    const tests = suite.tests;
+    const dataset = await loadTestSuite(resolvedEvalPath, repoRoot, { category });
+    const tests = dataset.tests;
 
     if (tests.length === 0) {
       console.error('No tests found in eval file.');
@@ -145,13 +145,13 @@ export const evalRunCommand = command({
       // No targets file — subagent-as-target mode
     }
 
-    const evalSetName = suite.metadata?.name?.trim() ?? '';
-    const safeEvalSet = evalSetName ? evalSetName.replace(/[\/\\:*?"<>|]/g, '_') : '';
+    const datasetName = dataset.metadata?.name?.trim() ?? '';
+    const safeDatasetName = datasetName ? datasetName.replace(/[\/\\:*?"<>|]/g, '_') : '';
 
     const testIds: string[] = [];
 
     for (const test of tests) {
-      const subpath = safeEvalSet ? [safeEvalSet, test.id] : [test.id];
+      const subpath = safeDatasetName ? [safeDatasetName, test.id] : [test.id];
       const testDir = join(outDir, ...subpath);
       await mkdir(testDir, { recursive: true });
       testIds.push(test.id);
@@ -198,7 +198,7 @@ export const evalRunCommand = command({
 
     await writeJson(join(outDir, 'manifest.json'), {
       eval_file: resolvedEvalPath,
-      dataset: evalSetName || undefined,
+      dataset: datasetName || undefined,
       experiment: experiment || undefined,
       timestamp: new Date().toISOString(),
       target: { name: targetName, kind: targetKind },
@@ -230,7 +230,7 @@ export const evalRunCommand = command({
       writeInvProgress();
 
       const invokeTarget = async (testId: string): Promise<void> => {
-        const subpath = safeEvalSet ? [safeEvalSet, testId] : [testId];
+        const subpath = safeDatasetName ? [safeDatasetName, testId] : [testId];
         const testDir = join(outDir, ...subpath);
         const invoke = JSON.parse(await readFile(join(testDir, 'invoke.json'), 'utf8'));
         if (invoke.kind !== 'cli') return;
@@ -341,7 +341,7 @@ export const evalRunCommand = command({
     const graderTasks: GraderTask[] = [];
 
     for (const testId of testIds) {
-      const subpath = safeEvalSet ? [safeEvalSet, testId] : [testId];
+      const subpath = safeDatasetName ? [safeDatasetName, testId] : [testId];
       const testDir = join(outDir, ...subpath);
       const codeGradersDir = join(testDir, 'code_graders');
       const resultsDir = join(testDir, 'code_grader_results');
diff --git a/apps/cli/src/commands/results/export.ts b/apps/cli/src/commands/results/export.ts
index 8a73eabef..b03a649af 100644
--- a/apps/cli/src/commands/results/export.ts
+++ b/apps/cli/src/commands/results/export.ts
@@ -21,11 +21,16 @@
  *   - To add new per-test workspace files, add them under each test directory.
  */
 
+import { existsSync } from 'node:fs';
+import { readFile } from 'node:fs/promises';
 import path from 'node:path';
+
 import { command, option, optional, positional, string } from 'cmd-ts';
 
+import type { EvaluationResult } from '@agentv/core';
+
 import { parseJsonlResults, writeArtifactsFromResults } from '../eval/artifact-writer.js';
-import { loadResults as loadSharedResults, patchTestIds, resolveSourceFile } from './shared.js';
+import { loadResults as loadSharedResults, resolveSourceFile } from './shared.js';
 
 // ── Export logic ─────────────────────────────────────────────────────────
 
@@ -40,7 +45,7 @@ export async function exportResults(
     throw new Error(`No results found in ${sourceFile}`);
   }
 
-  await writeArtifactsFromResults(patchTestIds(results), outputDir, {
+  await writeArtifactsFromResults(results, outputDir, {
     evalFile: sourceFile,
   });
 }
@@ -48,23 +53,54 @@ export async function exportResults(
 // ── Helpers ──────────────────────────────────────────────────────────────
 
 /**
- * Derive the default output directory from a JSONL source path.
- * Handles both directory-per-run manifests (<ts>/index.jsonl) and legacy flat files.
+ * Derive the default output directory from an index.jsonl manifest or a standalone result file.
  */
-function deriveOutputDir(cwd: string, sourceFile: string): string {
-  const parentDir = path.basename(path.dirname(sourceFile));
-  // Directory-per-run: parent is the timestamp dir (or legacy eval_<ts> dir)
-  if (/^\d{4}-\d{2}-\d{2}T/.test(parentDir)) {
-    return path.join(cwd, '.agentv', 'results', 'export', parentDir);
+export function deriveOutputDir(cwd: string, sourceFile: string): string {
+  const baseName = path.basename(sourceFile);
+  if (baseName !== 'index.jsonl') {
+    const stem = path.basename(sourceFile, path.extname(sourceFile));
+    return path.join(
+      cwd,
+      '.agentv',
+      'results',
+      'export',
+      stem.startsWith('eval_') ? stem.slice(5) : stem,
+    );
   }
+
+  const parentDir = path.basename(path.dirname(sourceFile));
   if (parentDir.startsWith('eval_')) {
-    // Legacy eval_ prefix: strip it
     return path.join(cwd, '.agentv', 'results', 'export', parentDir.slice(5));
   }
-  // Legacy flat file: extract timestamp from filename
-  const basename = path.basename(sourceFile, '.jsonl');
-  const dirName = basename.startsWith('eval_') ? basename.slice(5) : basename;
-  return path.join(cwd, '.agentv', 'results', 'export', dirName);
+  return path.join(cwd, '.agentv', 'results', 'export', parentDir);
+}
+
+export async function loadExportSource(
+  source: string | undefined,
+  cwd: string,
+): Promise<{ sourceFile: string; results: readonly EvaluationResult[] }> {
+  try {
+    const { sourceFile } = await resolveSourceFile(source, cwd);
+    const { results } = await loadSharedResults(source, cwd);
+    return { sourceFile, results };
+  } catch (error) {
+    if (!source) {
+      throw error;
+    }
+
+    const explicitSource = path.isAbsolute(source) ? source : path.resolve(cwd, source);
+    if (!existsSync(explicitSource) || path.extname(explicitSource) !== '.jsonl') {
+      throw error;
+    }
+
+    const content = await readFile(explicitSource, 'utf8');
+    const results = parseJsonlResults(content);
+    if (results.length === 0) {
+      throw new Error(`No results found in ${explicitSource}`);
+    }
+
+    return { sourceFile: explicitSource, results };
+  }
 }
 
 // ── CLI command ──────────────────────────────────────────────────────────
@@ -76,7 +112,8 @@ export const resultsExportCommand = command({
     source: positional({
       type: optional(string),
       displayName: 'source',
-      description: 'JSONL result file to export (defaults to most recent in .agentv/results/)',
+      description:
+        'Run workspace directory or index.jsonl manifest to export (defaults to most recent in .agentv/results/runs/)',
     }),
     out: option({
       type: optional(string),
@@ -95,8 +132,7 @@ export const resultsExportCommand = command({
     const cwd = dir ?? process.cwd();
 
     try {
-      const { sourceFile } = await resolveSourceFile(source, cwd);
-      const { results } = await loadSharedResults(source, cwd);
+      const { sourceFile, results } = await loadExportSource(source, cwd);
 
       const outputDir = out
         ? path.isAbsolute(out)
@@ -111,9 +147,7 @@ export const resultsExportCommand = command({
       // Report exported test IDs
       console.log(`Exported ${results.length} test(s) to ${outputDir}`);
       for (const result of results) {
-        const id =
-          result.testId ?? (result as unknown as Record<string, unknown>).evalId ?? 'unknown';
-        console.log(`  ${id}`);
+        console.log(`  ${result.testId ?? 'unknown'}`);
       }
     } catch (error) {
       console.error(`Error: ${(error as Error).message}`);
diff --git a/apps/cli/src/commands/results/manifest.ts b/apps/cli/src/commands/results/manifest.ts
index fb3b4e7a4..cffb4760a 100644
--- a/apps/cli/src/commands/results/manifest.ts
+++ b/apps/cli/src/commands/results/manifest.ts
@@ -4,13 +4,15 @@ import path from 'node:path';
 import type { EvaluationResult } from '@agentv/core';
 
 import type { GradingArtifact, TimingArtifact } from '../eval/artifact-writer.js';
-import { parseJsonlResults } from '../eval/artifact-writer.js';
-import { RESULT_INDEX_FILENAME, resolveWorkspaceOrFilePath } from '../eval/result-layout.js';
+import {
+  RESULT_INDEX_FILENAME,
+  isDirectoryPath,
+  resolveRunManifestPath,
+} from '../eval/result-layout.js';
 
 export interface ResultManifestRecord {
   readonly timestamp?: string;
   readonly test_id?: string;
-  readonly eval_id?: string;
   readonly dataset?: string;
   readonly category?: string;
   readonly experiment?: string;
@@ -41,10 +43,6 @@ function parseJsonlLines<T>(content: string): T[] {
     .map((line) => JSON.parse(line) as T);
 }
 
-function isIndexManifestPath(sourceFile: string): boolean {
-  return path.basename(sourceFile) === RESULT_INDEX_FILENAME;
-}
-
 function parseMarkdownMessages(content: string): { role: string; content: string }[] {
   const trimmed = content.trim();
   if (!trimmed.startsWith('@[')) {
@@ -120,7 +118,7 @@ function hydrateOutput(
 function hydrateManifestRecord(baseDir: string, record: ResultManifestRecord): EvaluationResult {
   const grading = readOptionalJson<GradingArtifact>(baseDir, record.grading_path);
   const timing = readOptionalJson<TimingArtifact>(baseDir, record.timing_path);
-  const testId = record.test_id ?? record.eval_id ?? 'unknown';
+  const testId = record.test_id ?? 'unknown';
 
   return {
     timestamp: record.timestamp,
@@ -175,16 +173,14 @@ export function parseResultManifest(content: string): ResultManifestRecord[] {
 
 export function resolveResultSourcePath(source: string, cwd?: string): string {
   const resolved = path.isAbsolute(source) ? source : path.resolve(cwd ?? process.cwd(), source);
-  return resolveWorkspaceOrFilePath(resolved);
+  if (isDirectoryPath(resolved) || path.basename(resolved) === RESULT_INDEX_FILENAME) {
+    return resolveRunManifestPath(resolved);
+  }
+  return resolved;
 }
 
 export function loadManifestResults(sourceFile: string): EvaluationResult[] {
-  const resolvedSourceFile = resolveWorkspaceOrFilePath(sourceFile);
-
-  if (!isIndexManifestPath(resolvedSourceFile)) {
-    return parseJsonlResults(readFileSync(resolvedSourceFile, 'utf8'));
-  }
-
+  const resolvedSourceFile = resolveRunManifestPath(sourceFile);
   const content = readFileSync(resolvedSourceFile, 'utf8');
   const records = parseResultManifest(content);
   const baseDir = path.dirname(resolvedSourceFile);
@@ -193,7 +189,6 @@ export function loadManifestResults(sourceFile: string): EvaluationResult[] {
 
 export interface LightweightResultRecord {
   readonly testId: string;
-  readonly dataset?: string;
   readonly target?: string;
   readonly experiment?: string;
   readonly score: number;
@@ -204,64 +199,16 @@ export interface LightweightResultRecord {
 }
 
 export function loadLightweightResults(sourceFile: string): LightweightResultRecord[] {
-  const resolvedSourceFile = resolveWorkspaceOrFilePath(sourceFile);
+  const resolvedSourceFile = resolveRunManifestPath(sourceFile);
   const content = readFileSync(resolvedSourceFile, 'utf8');
-
-  if (isIndexManifestPath(resolvedSourceFile)) {
-    return parseResultManifest(content).map((record) => ({
-      testId: record.test_id ?? record.eval_id ?? 'unknown',
-      dataset: record.dataset,
-      target: record.target,
-      experiment: record.experiment,
-      score: record.score,
-      scores: record.scores,
-      executionStatus: record.execution_status,
-      error: record.error,
-      timestamp: record.timestamp,
-    }));
-  }
-
-  const records: LightweightResultRecord[] = [];
-  for (const line of content.split(/\r?\n/)) {
-    const trimmed = line.trim();
-    if (!trimmed) {
-      continue;
-    }
-
-    let record: Record<string, unknown>;
-    try {
-      record = JSON.parse(trimmed) as Record<string, unknown>;
-    } catch {
-      continue;
-    }
-
-    const rawTestId = record.test_id ?? record.eval_id ?? record.testId ?? record.evalId;
-    if (typeof rawTestId !== 'string') {
-      throw new Error(`Missing test_id in result: ${trimmed}`);
-    }
-
-    if (typeof record.score !== 'number') {
-      throw new Error(`Missing or invalid score in result: ${trimmed}`);
-    }
-
-    records.push({
-      testId: rawTestId,
-      dataset: typeof record.dataset === 'string' ? record.dataset : undefined,
-      target: typeof record.target === 'string' ? record.target : undefined,
-      score: record.score,
-      scores: Array.isArray(record.scores)
-        ? (record.scores as readonly Record<string, unknown>[])
-        : undefined,
-      executionStatus:
-        typeof record.execution_status === 'string'
-          ? record.execution_status
-          : typeof record.executionStatus === 'string'
-            ? record.executionStatus
-            : undefined,
-      error: typeof record.error === 'string' ? record.error : undefined,
-      timestamp: typeof record.timestamp === 'string' ? record.timestamp : undefined,
-    });
-  }
-
-  return records;
+  return parseResultManifest(content).map((record) => ({
+    testId: record.test_id ?? 'unknown',
+    target: record.target,
+    experiment: record.experiment,
+    score: record.score,
+    scores: record.scores,
+    executionStatus: record.execution_status,
+    error: record.error,
+    timestamp: record.timestamp,
+  }));
 }
diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts
index 555a3a440..671a5aaa6 100644
--- a/apps/cli/src/commands/results/serve.ts
+++ b/apps/cli/src/commands/results/serve.ts
@@ -7,8 +7,8 @@
  *
  * API endpoints:
  *   - GET /           — Studio SPA (React app)
- *   - GET /api/runs   — list available result files with metadata
- *   - GET /api/runs/:filename — load results from a specific run file
+ *   - GET /api/runs   — list available run workspaces with metadata
+ *   - GET /api/runs/:filename — load results from a specific run workspace
  *   - GET /api/feedback  — read feedback reviews
  *   - POST /api/feedback — write feedback reviews
  *   - GET /api/projects  — list registered projects
@@ -20,7 +20,7 @@
  * how searchDir is resolved.
  *
  * Exported functions (for testing):
- *   - resolveSourceFile(source, cwd) — resolves JSONL path
+ *   - resolveSourceFile(source, cwd) — resolves a run manifest path
  *   - loadResults(content) — parses JSONL into EvaluationResult[]
  *   - createApp(results, cwd) — Hono app factory
  */
@@ -43,6 +43,7 @@ import type { Context } from 'hono';
 import { Hono } from 'hono';
 
 import { parseJsonlResults } from '../eval/artifact-writer.js';
+import { resolveRunManifestPath } from '../eval/result-layout.js';
 import { loadRunCache, resolveRunCacheFile } from '../eval/run-cache.js';
 import { listResultFiles } from '../trace/utils.js';
 import {
@@ -51,21 +52,21 @@ import {
   parseResultManifest,
   resolveResultSourcePath,
 } from './manifest.js';
-import { patchTestIds } from './shared.js';
 import { type StudioConfig, loadStudioConfig, saveStudioConfig } from './studio-config.js';
 
 // ── Source resolution ────────────────────────────────────────────────────
 
 /**
- * Resolve the JSONL result file path from an explicit source, run cache,
- * or directory scan. Throws if no file can be found.
+ * Resolve a run manifest path from an explicit source, run cache,
+ * or directory scan. Throws if no run workspace can be found.
  */
 export async function resolveSourceFile(source: string | undefined, cwd: string): Promise<string> {
   if (source) {
-    const resolved = resolveResultSourcePath(source, cwd);
+    let resolved = resolveResultSourcePath(source, cwd);
     if (!existsSync(resolved)) {
       throw new Error(`Source file not found: ${resolved}`);
     }
+    resolved = resolveRunManifestPath(resolved);
     return resolved;
   }
 
@@ -79,11 +80,11 @@ export async function resolveSourceFile(source: string | undefined, cwd: string)
   const metas = listResultFiles(cwd, 10);
   if (metas.length === 0) {
     throw new Error(
-      'No result files found in .agentv/results/\nRun an evaluation first: agentv eval <eval-file>',
+      'No run workspaces found in .agentv/results/runs/\nRun an evaluation first: agentv eval <eval-file>',
     );
   }
   if (metas.length > 1) {
-    console.log('Available result files:');
+    console.log('Available run workspaces:');
     for (const m of metas) {
       console.log(`  ${m.path}`);
     }
@@ -95,8 +96,7 @@ export async function resolveSourceFile(source: string | undefined, cwd: string)
 // ── JSONL parsing ────────────────────────────────────────────────────────
 
 /**
- * Parse JSONL content into EvaluationResult[], with backward-compat
- * patching of eval_id → testId.
+ * Parse JSONL content into EvaluationResult[]; throws when no valid results are found.
  */
 export function loadResults(content: string): EvaluationResult[] {
   const results = parseJsonlResults(content);
@@ -104,12 +104,7 @@ export function loadResults(content: string): EvaluationResult[] {
     throw new Error('No valid results found in JSONL content');
   }
 
-  return results.map((r) => {
-    if (!r.testId && (r as unknown as Record<string, unknown>).evalId) {
-      return { ...r, testId: String((r as unknown as Record<string, unknown>).evalId) };
-    }
-    return r;
-  });
+  return results;
 }
 
 // ── Feedback persistence ─────────────────────────────────────────────────
@@ -273,7 +268,7 @@ function handleRunDetail(c: C, { searchDir }: DataContext) {
   const meta = listResultFiles(searchDir).find((m) => m.filename === filename);
   if (!meta) return c.json({ error: 'Run not found' }, 404);
   try {
-    const loaded = patchTestIds(loadManifestResults(meta.path));
+    const loaded = loadManifestResults(meta.path);
     return c.json({ results: stripHeavyFields(loaded), source: meta.filename });
   } catch {
     return c.json({ error: 'Failed to load run' }, 500);
@@ -285,7 +280,7 @@ function handleRunDatasets(c: C, { searchDir, agentvDir }: DataContext) {
   const meta = listResultFiles(searchDir).find((m) => m.filename === filename);
   if (!meta) return c.json({ error: 'Run not found' }, 404);
   try {
-    const loaded = patchTestIds(loadManifestResults(meta.path));
+    const loaded = loadManifestResults(meta.path);
     const { pass_threshold } = loadStudioConfig(agentvDir);
     const datasetMap = new Map<string, { total: number; passed: number; scoreSum: number }>();
     for (const r of loaded) {
@@ -314,7 +309,7 @@ function handleRunCategories(c: C, { searchDir, agentvDir }: DataContext) {
   const meta = listResultFiles(searchDir).find((m) => m.filename === filename);
   if (!meta) return c.json({ error: 'Run not found' }, 404);
   try {
-    const loaded = patchTestIds(loadManifestResults(meta.path));
+    const loaded = loadManifestResults(meta.path);
     const { pass_threshold } = loadStudioConfig(agentvDir);
     const categoryMap = new Map<
       string,
@@ -354,7 +349,7 @@ function handleCategoryDatasets(c: C, { searchDir, agentvDir }: DataContext) {
   const meta = listResultFiles(searchDir).find((m) => m.filename === filename);
   if (!meta) return c.json({ error: 'Run not found' }, 404);
   try {
-    const loaded = patchTestIds(loadManifestResults(meta.path));
+    const loaded = loadManifestResults(meta.path);
     const { pass_threshold } = loadStudioConfig(agentvDir);
     const filtered = loaded.filter((r) => (r.category ?? DEFAULT_CATEGORY) === category);
     const datasetMap = new Map<string, { total: number; passed: number; scoreSum: number }>();
@@ -385,7 +380,7 @@ function handleEvalDetail(c: C, { searchDir }: DataContext) {
   const meta = listResultFiles(searchDir).find((m) => m.filename === filename);
   if (!meta) return c.json({ error: 'Run not found' }, 404);
   try {
-    const loaded = patchTestIds(loadManifestResults(meta.path));
+    const loaded = loadManifestResults(meta.path);
     const result = loaded.find((r) => r.testId === evalId);
     if (!result) return c.json({ error: 'Eval not found' }, 404);
     return c.json({ eval: result });
@@ -854,7 +849,7 @@ export function createApp(
     const entries = metas.map((m) => {
       let totalCostUsd = 0;
       try {
-        const loaded = patchTestIds(loadManifestResults(m.path));
+        const loaded = loadManifestResults(m.path);
         totalCostUsd = loaded.reduce((sum, r) => sum + (r.costUsd ?? 0), 0);
       } catch {
         // ignore load errors for aggregate
@@ -986,7 +981,8 @@ export const resultsServeCommand = command({
     source: positional({
       type: optional(string),
       displayName: 'source',
-      description: 'JSONL result file to serve (defaults to most recent in .agentv/results/)',
+      description:
+        'Run workspace directory or index.jsonl manifest to serve (defaults to most recent in .agentv/results/runs/)',
     }),
     port: option({
       type: optional(number),
@@ -1078,19 +1074,19 @@ export const resultsServeCommand = command({
           process.exit(1);
         }
         sourceFile = resolved;
-        results = patchTestIds(loadManifestResults(resolved));
+        results = loadManifestResults(resolved);
       } else {
         // Auto-discover: run cache -> directory scan -> empty state
         const cache = await loadRunCache(cwd);
         const cachedFile = cache ? resolveRunCacheFile(cache) : '';
         if (cachedFile && existsSync(cachedFile)) {
           sourceFile = cachedFile;
-          results = patchTestIds(loadManifestResults(cachedFile));
+          results = loadManifestResults(cachedFile);
         } else {
           const metas = listResultFiles(cwd, 1);
           if (metas.length > 0) {
             sourceFile = metas[0].path;
-            results = patchTestIds(loadManifestResults(metas[0].path));
+            results = loadManifestResults(metas[0].path);
           }
           // If no metas, results stays empty — dashboard shows welcome state
         }
diff --git a/apps/cli/src/commands/results/shared.ts b/apps/cli/src/commands/results/shared.ts
index c70267318..874982266 100644
--- a/apps/cli/src/commands/results/shared.ts
+++ b/apps/cli/src/commands/results/shared.ts
@@ -2,8 +2,7 @@
  * Shared utilities for `agentv results` subcommands.
  *
  * Provides:
- * - resolveSourceFile() — find an index/results manifest from explicit path or auto-discover latest
- * - patchTestIds() — backward-compat eval_id -> test_id patching
+ * - resolveSourceFile() — find an index manifest from explicit path or auto-discover latest
  * - sourceArg — cmd-ts positional for optional result source path
  *
  * How to extend:
@@ -14,6 +13,7 @@ import { existsSync } from 'node:fs';
 import { optional, positional, string } from 'cmd-ts';
 
 import type { EvaluationResult } from '@agentv/core';
+import { resolveRunManifestPath } from '../eval/result-layout.js';
 import { loadRunCache, resolveRunCacheFile } from '../eval/run-cache.js';
 import { listResultFiles } from '../trace/utils.js';
 import { loadManifestResults, resolveResultSourcePath } from './manifest.js';
@@ -22,7 +22,8 @@ import { loadManifestResults, resolveResultSourcePath } from './manifest.js';
 export const sourceArg = positional({
   type: optional(string),
   displayName: 'source',
-  description: 'Result file or workspace directory (defaults to most recent in .agentv/results/)',
+  description:
+    'Run workspace directory or index.jsonl manifest (defaults to most recent in .agentv/results/runs/)',
 });
 
 /**
@@ -40,6 +41,7 @@ export async function resolveSourceFile(
       console.error(`Error: File not found: ${sourceFile}`);
       process.exit(1);
     }
+    sourceFile = resolveRunManifestPath(sourceFile);
   } else {
     const cache = await loadRunCache(cwd);
     const cachedFile = cache ? resolveRunCacheFile(cache) : '';
@@ -48,7 +50,7 @@ export async function resolveSourceFile(
     } else {
       const metas = listResultFiles(cwd, 1);
       if (metas.length === 0) {
-        console.error('Error: No result files found in .agentv/results/');
+        console.error('Error: No run workspaces found in .agentv/results/runs/');
         console.error('Run an evaluation first: agentv eval <eval-file>');
         process.exit(1);
       }
@@ -60,7 +62,7 @@ export async function resolveSourceFile(
 }
 
 /**
- * Load and parse eval results from an index/results source file, with backward-compat patching.
+ * Load and parse eval results from a run workspace or index manifest.
  */
 export async function loadResults(
   source: string | undefined,
@@ -74,17 +76,5 @@ export async function loadResults(
     process.exit(1);
   }
 
-  return { results: patchTestIds(results), sourceFile };
-}
-
-/**
- * Patch older JSONL records that used eval_id instead of test_id.
- */
-export function patchTestIds(results: EvaluationResult[]): EvaluationResult[] {
-  return results.map((r) => {
-    if (!r.testId && (r as unknown as Record<string, unknown>).evalId) {
-      return { ...r, testId: String((r as unknown as Record<string, unknown>).evalId) };
-    }
-    return r;
-  });
+  return { results, sourceFile };
 }
diff --git a/apps/cli/src/commands/trace/list.ts b/apps/cli/src/commands/trace/list.ts
index a923013bf..42bea2b72 100644
--- a/apps/cli/src/commands/trace/list.ts
+++ b/apps/cli/src/commands/trace/list.ts
@@ -14,13 +14,13 @@ function formatListTable(metas: ResultFileMeta[]): string {
   const lines: string[] = [];
 
   if (metas.length === 0) {
-    lines.push(`${c.yellow}No result files found in .agentv/results/${c.reset}`);
+    lines.push(`${c.yellow}No run workspaces found in .agentv/results/runs/${c.reset}`);
     lines.push(`${c.dim}Run an evaluation first: agentv run <eval-file>${c.reset}`);
     return lines.join('\n');
   }
 
   lines.push('');
-  lines.push(`${c.bold}Evaluation Results${c.reset} ${c.dim}(.agentv/results/)${c.reset}`);
+  lines.push(`${c.bold}Evaluation Runs${c.reset} ${c.dim}(.agentv/results/runs/)${c.reset}`);
   lines.push('');
 
   // Column widths
@@ -42,7 +42,9 @@ function formatListTable(metas: ResultFileMeta[]): string {
   }
 
   lines.push('');
-  lines.push(`${c.dim}${metas.length} result file${metas.length !== 1 ? 's' : ''} found${c.reset}`);
+  lines.push(
+    `${c.dim}${metas.length} run workspace${metas.length !== 1 ? 's' : ''} found${c.reset}`,
+  );
   lines.push('');
 
   return lines.join('\n');
@@ -50,7 +52,7 @@ function formatListTable(metas: ResultFileMeta[]): string {
 
 export const traceListCommand = command({
   name: 'list',
-  description: 'List recent evaluation result files from .agentv/results/',
+  description: 'List recent evaluation run workspaces from .agentv/results/runs/',
   args: {
     limit: option({
       type: optional(number),
diff --git a/apps/cli/src/commands/trace/score.ts b/apps/cli/src/commands/trace/score.ts
index cf425f3a1..da986096c 100644
--- a/apps/cli/src/commands/trace/score.ts
+++ b/apps/cli/src/commands/trace/score.ts
@@ -144,7 +144,7 @@ function extractCandidate(raw: RawResult): string {
  * Only used to satisfy the EvaluationContext interface — deterministic and
  * trace-based evaluators don't access these fields.
  */
-function buildEvalTest(raw: RawResult): EvalTest {
+function buildTestCase(raw: RawResult): EvalTest {
   return {
     id: raw.test_id ?? 'unknown',
     question: '',
@@ -210,7 +210,7 @@ async function runScore(
     const output = raw.output as readonly Message[] | undefined;
 
     const evalContext: EvaluationContext = {
-      evalCase: buildEvalTest(raw),
+      evalCase: buildTestCase(raw),
       candidate,
       target: { kind: 'custom' as const, name: raw.target ?? 'unknown', config: {} } as never,
       provider: stubProvider,
@@ -295,7 +295,7 @@ function renderTable(scored: ScoreResult[], assertSpec: string): string {
 
 export const traceScoreCommand = command({
   name: 'score',
-  description: 'Run evaluators against existing result files post-hoc',
+  description: 'Run evaluators against existing trace sources post-hoc',
   args: {
     file: positional({
       type: string,
diff --git a/apps/cli/src/commands/trace/utils.ts b/apps/cli/src/commands/trace/utils.ts
index 7baec2dc5..45d865ed6 100644
--- a/apps/cli/src/commands/trace/utils.ts
+++ b/apps/cli/src/commands/trace/utils.ts
@@ -104,7 +104,7 @@ export interface RawTraceSpan {
  *
  * Supported sources:
  * - Run workspace directories / index.jsonl manifests
- * - Legacy simple trace JSONL files
+ * - Standalone trace JSONL files for trace-only workflows
  * - OTLP JSON trace files written via --otel-file
  */
 export function loadResultFile(filePath: string): RawResult[] {
@@ -518,7 +518,7 @@ export function toTraceSummary(result: RawResult): TraceSummary | undefined {
 }
 
 /**
- * Metadata about a result file for listing.
+ * Metadata about a discovered run manifest for listing.
  */
 export interface ResultFileMeta {
   path: string;
@@ -531,62 +531,33 @@ export interface ResultFileMeta {
 }
 
 /**
- * Enumerate result files in the .agentv/results/ directory.
- * Scans runs/ for both directory-per-run layouts (index.jsonl preferred inside subdirs)
- * and legacy flat .jsonl files. Also scans the base directory for pre-runs/ files.
+ * Enumerate canonical run manifests in `.agentv/results/runs/`.
  */
 export function listResultFiles(cwd: string, limit?: number): ResultFileMeta[] {
-  const baseDir = path.join(cwd, '.agentv', 'results');
-  const runsDir = path.join(baseDir, RESULT_RUNS_DIRNAME);
+  const runsDir = path.join(cwd, '.agentv', 'results', RESULT_RUNS_DIRNAME);
 
   const files: { filePath: string; displayName: string }[] = [];
 
-  // Scan runs/ for both directory-based runs and flat JSONL files.
-  // Process directories first so they take priority in dedup over flat files.
   try {
     const entries = readdirSync(runsDir, { withFileTypes: true });
     for (const entry of entries) {
-      if (entry.isDirectory()) {
-        const primaryPath = resolveExistingRunPrimaryPath(path.join(runsDir, entry.name));
-        if (primaryPath) {
-          files.push({ filePath: primaryPath, displayName: entry.name });
-        }
+      if (!entry.isDirectory()) {
+        continue;
       }
-    }
-    for (const entry of entries) {
-      if (!entry.isDirectory() && entry.name.endsWith('.jsonl')) {
-        files.push({ filePath: path.join(runsDir, entry.name), displayName: entry.name });
+
+      const primaryPath = resolveExistingRunPrimaryPath(path.join(runsDir, entry.name));
+      if (primaryPath) {
+        files.push({ filePath: primaryPath, displayName: entry.name });
       }
     }
   } catch {
     // runs/ doesn't exist yet
   }
 
-  // Also scan base directory for legacy files (backward compat)
-  try {
-    const entries = readdirSync(baseDir).filter((f) => f.endsWith('.jsonl'));
-    for (const entry of entries) {
-      files.push({ filePath: path.join(baseDir, entry), displayName: entry });
-    }
-  } catch {
-    // Base directory doesn't exist yet
-  }
-
-  // Deduplicate by normalized name (strip .jsonl so dir "eval_X" matches file "eval_X.jsonl")
-  const seen = new Set<string>();
-  const uniqueFiles: { filePath: string; displayName: string }[] = [];
-  for (const file of files) {
-    const key = file.displayName.replace(/\.jsonl$/, '');
-    if (!seen.has(key)) {
-      seen.add(key);
-      uniqueFiles.push(file);
-    }
-  }
-
   // Sort by display name descending (most recent first)
-  uniqueFiles.sort((a, b) => b.displayName.localeCompare(a.displayName));
+  files.sort((a, b) => b.displayName.localeCompare(a.displayName));
 
-  const limited = limit !== undefined && limit > 0 ? uniqueFiles.slice(0, limit) : uniqueFiles;
+  const limited = limit !== undefined && limit > 0 ? files.slice(0, limit) : files;
 
   const metas: ResultFileMeta[] = [];
 
diff --git a/apps/cli/test/commands/compare/compare.test.ts b/apps/cli/test/commands/compare/compare.test.ts
index 2ffdef178..e548d300d 100644
--- a/apps/cli/test/commands/compare/compare.test.ts
+++ b/apps/cli/test/commands/compare/compare.test.ts
@@ -56,6 +56,21 @@ describe('compare command', () => {
       ]);
     });
 
+    it('should load flat JSONL files with camelCase testId results', () => {
+      const filePath = path.join(tempDir, 'results.jsonl');
+      writeFileSync(
+        filePath,
+        '{"testId": "case-1", "score": 0.8}\n{"testId": "case-2", "score": 0.9}\n',
+      );
+
+      const results = loadJsonlResults(filePath);
+
+      expect(results).toEqual([
+        { testId: 'case-1', score: 0.8 },
+        { testId: 'case-2', score: 0.9 },
+      ]);
+    });
+
     it('should handle empty lines in JSONL', () => {
       const filePath = path.join(tempDir, 'results.jsonl');
       writeFileSync(
diff --git a/apps/cli/test/commands/eval/run-cache.test.ts b/apps/cli/test/commands/eval/run-cache.test.ts
index ff2c852c6..c2ee4f7f6 100644
--- a/apps/cli/test/commands/eval/run-cache.test.ts
+++ b/apps/cli/test/commands/eval/run-cache.test.ts
@@ -5,27 +5,29 @@ import { type RunCache, resolveRunCacheFile } from '../../../src/commands/eval/r
 
 describe('resolveRunCacheFile', () => {
   it('should resolve new directory-based cache to index.jsonl inside dir', () => {
-    const cache: RunCache = { lastRunDir: '/results/runs/eval_2026-03-24', timestamp: '' };
+    const cache: RunCache = { lastRunDir: '/results/runs/2026-03-24T00-00-00-000Z', timestamp: '' };
     expect(resolveRunCacheFile(cache)).toBe(
-      path.join('/results/runs/eval_2026-03-24', 'index.jsonl'),
+      path.join('/results/runs/2026-03-24T00-00-00-000Z', 'index.jsonl'),
     );
   });
 
-  it('should resolve legacy file-based cache to lastResultFile', () => {
+  it('ignores legacy file-based cache entries', () => {
     const cache: RunCache = {
       lastResultFile: '/results/runs/eval_2026-03-24.jsonl',
       timestamp: '',
     };
-    expect(resolveRunCacheFile(cache)).toBe('/results/runs/eval_2026-03-24.jsonl');
+    expect(resolveRunCacheFile(cache)).toBe('');
   });
 
   it('should prefer lastRunDir over lastResultFile when both present', () => {
     const cache: RunCache = {
-      lastRunDir: '/results/runs/eval_dir',
+      lastRunDir: '/results/runs/2026-03-24T00-00-00-000Z',
       lastResultFile: '/results/runs/eval_old.jsonl',
       timestamp: '',
     };
-    expect(resolveRunCacheFile(cache)).toBe(path.join('/results/runs/eval_dir', 'index.jsonl'));
+    expect(resolveRunCacheFile(cache)).toBe(
+      path.join('/results/runs/2026-03-24T00-00-00-000Z', 'index.jsonl'),
+    );
   });
 
   it('should return empty string when neither field is set', () => {
diff --git a/apps/cli/test/commands/results/export.test.ts b/apps/cli/test/commands/results/export.test.ts
index 806ad6ae1..f6f8645ff 100644
--- a/apps/cli/test/commands/results/export.test.ts
+++ b/apps/cli/test/commands/results/export.test.ts
@@ -1,5 +1,5 @@
 import { afterEach, beforeEach, describe, expect, it } from 'bun:test';
-import { existsSync, mkdtempSync, readFileSync, rmSync } from 'node:fs';
+import { existsSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs';
 import { tmpdir } from 'node:os';
 import path from 'node:path';
 
@@ -9,7 +9,11 @@ import type {
   IndexArtifactEntry,
   TimingArtifact,
 } from '../../../src/commands/eval/artifact-writer.js';
-import { exportResults } from '../../../src/commands/results/export.js';
+import {
+  deriveOutputDir,
+  exportResults,
+  loadExportSource,
+} from '../../../src/commands/results/export.js';
 
 // ── Sample JSONL content (snake_case, matching on-disk format) ──────────
 
@@ -114,6 +118,25 @@ describe('results export', () => {
     rmSync(tempDir, { recursive: true, force: true });
   });
 
+  it('loadExportSource accepts explicit legacy flat JSONL files', async () => {
+    const sourceFile = path.join(tempDir, 'eval_2026-03-18.jsonl');
+    writeFileSync(
+      sourceFile,
+      toJsonl({ ...RESULT_FULL, eval_id: 'legacy-id', test_id: undefined }),
+    );
+
+    const { sourceFile: loadedSource, results } = await loadExportSource(sourceFile, tempDir);
+
+    expect(loadedSource).toBe(sourceFile);
+    expect(results).toHaveLength(1);
+    expect(results[0].testId).toBe('legacy-id');
+  });
+
+  it('deriveOutputDir uses the source filename for flat JSONL inputs', () => {
+    const outputDir = deriveOutputDir(tempDir, path.join(tempDir, 'eval_2026-03-18.jsonl'));
+    expect(outputDir).toBe(path.join(tempDir, '.agentv', 'results', 'export', '2026-03-18'));
+  });
+
   it('should create benchmark.json matching artifact-writer schema', async () => {
     const outputDir = path.join(tempDir, 'output');
     const content = toJsonl(RESULT_FULL, RESULT_PARTIAL);
diff --git a/apps/cli/test/commands/results/serve.test.ts b/apps/cli/test/commands/results/serve.test.ts
index 7bee162cd..2d7766622 100644
--- a/apps/cli/test/commands/results/serve.test.ts
+++ b/apps/cli/test/commands/results/serve.test.ts
@@ -58,6 +58,21 @@ describe('resolveSourceFile', () => {
       'Source file not found',
     );
   });
+
+  it('rejects legacy flat result files', async () => {
+    const tempDir = mkdtempSync(path.join(tmpdir(), 'agentv-serve-source-'));
+    // Clean up in `finally` so a failed assertion does not leak the temp directory.
+    try {
+      const flatFile = path.join(tempDir, 'results.jsonl');
+      writeFileSync(flatFile, toJsonl(RESULT_A));
+
+      await expect(resolveSourceFile(flatFile, tempDir)).rejects.toThrow(
+        'Expected a run workspace directory or index.jsonl manifest',
+      );
+    } finally {
+      rmSync(tempDir, { recursive: true, force: true });
+    }
+  });
 });
 
 // ── loadResults ──────────────────────────────────────────────────────────
@@ -327,8 +342,10 @@ describe('serve app', () => {
     it('loads results from an existing run file', async () => {
       const runsDir = path.join(tempDir, '.agentv', 'results', 'runs');
       mkdirSync(runsDir, { recursive: true });
-      const filename = 'eval_2026-03-25T10-00-00-000Z.jsonl';
-      writeFileSync(path.join(runsDir, filename), toJsonl(RESULT_A, RESULT_B));
+      const filename = '2026-03-25T10-00-00-000Z';
+      const runDir = path.join(runsDir, filename);
+      mkdirSync(runDir, { recursive: true });
+      writeFileSync(path.join(runDir, 'index.jsonl'), toJsonl(RESULT_A, RESULT_B));
 
       const app = createApp([], tempDir, tempDir, undefined, { studioDir });
       const res = await app.request(`/api/runs/${filename}`);
diff --git a/apps/cli/test/commands/results/shared.test.ts b/apps/cli/test/commands/results/shared.test.ts
index 6df36ebba..2cd110fc3 100644
--- a/apps/cli/test/commands/results/shared.test.ts
+++ b/apps/cli/test/commands/results/shared.test.ts
@@ -1,43 +1,63 @@
-import { describe, expect, it } from 'bun:test';
-import type { EvaluationResult } from '@agentv/core';
+import { afterEach, beforeEach, describe, expect, it } from 'bun:test';
+import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from 'node:fs';
+import { tmpdir } from 'node:os';
+import path from 'node:path';
 
-import { patchTestIds } from '../../../src/commands/results/shared.js';
+import { resolveRunManifestPath } from '../../../src/commands/eval/result-layout.js';
+import { resolveSourceFile } from '../../../src/commands/results/shared.js';
 
-describe('patchTestIds', () => {
-  it('passes through results with testId', () => {
-    const results = [{ testId: 'test-1', score: 1 }] as unknown as EvaluationResult[];
-    expect(patchTestIds(results)).toEqual(results);
+describe('results shared source resolution', () => {
+  let tempDir: string;
+
+  beforeEach(() => {
+    tempDir = mkdtempSync(path.join(tmpdir(), 'agentv-results-shared-'));
   });
 
-  it('patches evalId to testId for backward compatibility', () => {
-    const results = [{ evalId: 'old-1', score: 1 }] as unknown as EvaluationResult[];
-    const patched = patchTestIds(results);
-    expect(patched[0].testId).toBe('old-1');
+  afterEach(() => {
+    rmSync(tempDir, { recursive: true, force: true });
   });
 
-  it('preserves all other fields when patching evalId', () => {
-    const results = [
-      { evalId: 'old-1', score: 0.8, target: 'gpt-4o', timestamp: '2026-01-01' },
-    ] as unknown as EvaluationResult[];
-    const patched = patchTestIds(results);
-    expect(patched[0]).toEqual({
-      evalId: 'old-1',
-      score: 0.8,
-      target: 'gpt-4o',
-      timestamp: '2026-01-01',
-      testId: 'old-1',
-    });
+  it('resolves an explicit run workspace directory to index.jsonl', async () => {
+    const runDir = path.join(tempDir, '.agentv', 'results', 'runs', '2026-03-25T10-00-00-000Z');
+    mkdirSync(runDir, { recursive: true });
+    writeFileSync(path.join(runDir, 'index.jsonl'), '{"test_id":"t1","score":1}\n');
+
+    const resolved = await resolveSourceFile(runDir, tempDir);
+
+    expect(resolved.sourceFile).toBe(path.join(runDir, 'index.jsonl'));
   });
 
-  it('does not overwrite existing testId with evalId', () => {
-    const results = [
-      { testId: 'test-1', evalId: 'old-1', score: 1 },
-    ] as unknown as EvaluationResult[];
-    const patched = patchTestIds(results);
-    expect(patched[0].testId).toBe('test-1');
+  it('auto-discovers the most recent canonical run workspace', async () => {
+    const olderRunDir = path.join(
+      tempDir,
+      '.agentv',
+      'results',
+      'runs',
+      '2026-03-24T10-00-00-000Z',
+    );
+    const newerRunDir = path.join(
+      tempDir,
+      '.agentv',
+      'results',
+      'runs',
+      '2026-03-25T10-00-00-000Z',
+    );
+    mkdirSync(olderRunDir, { recursive: true });
+    mkdirSync(newerRunDir, { recursive: true });
+    writeFileSync(path.join(olderRunDir, 'index.jsonl'), '{"test_id":"old","score":1}\n');
+    writeFileSync(path.join(newerRunDir, 'index.jsonl'), '{"test_id":"new","score":1}\n');
+
+    const resolved = await resolveSourceFile(undefined, tempDir);
+
+    expect(resolved.sourceFile).toBe(path.join(newerRunDir, 'index.jsonl'));
   });
 
-  it('handles empty array', () => {
-    expect(patchTestIds([])).toEqual([]);
+  it('rejects legacy flat result files as result sources', () => {
+    const flatFile = path.join(tempDir, 'results.jsonl');
+    writeFileSync(flatFile, '{"test_id":"t1","score":1}\n');
+
+    expect(() => resolveRunManifestPath(flatFile)).toThrow(
+      'Expected a run workspace directory or index.jsonl manifest',
+    );
   });
 });
diff --git a/apps/cli/test/commands/trace/trace.test.ts b/apps/cli/test/commands/trace/trace.test.ts
index 9a7d3c939..b813711d4 100644
--- a/apps/cli/test/commands/trace/trace.test.ts
+++ b/apps/cli/test/commands/trace/trace.test.ts
@@ -256,102 +256,78 @@ describe('trace utils', () => {
       expect(metas).toEqual([]);
     });
 
-    it('should enumerate JSONL files in .agentv/results/runs/', () => {
+    it('should enumerate run workspaces in .agentv/results/runs/', () => {
       const runsDir = path.join(tempDir, '.agentv', 'results', 'runs');
       mkdirSync(runsDir, { recursive: true });
 
+      const olderRunDir = path.join(runsDir, '2026-02-20T21-38-05-833Z');
+      const newerRunDir = path.join(runsDir, '2026-02-21T10-00-00-000Z');
+      mkdirSync(olderRunDir, { recursive: true });
+      mkdirSync(newerRunDir, { recursive: true });
       writeFileSync(
-        path.join(runsDir, 'eval_2026-02-20T21-38-05-833Z.jsonl'),
+        path.join(olderRunDir, 'index.jsonl'),
         `${RESULT_WITH_TRACE}\n${RESULT_WITHOUT_TRACE}\n`,
       );
-      writeFileSync(
-        path.join(runsDir, 'eval_2026-02-21T10-00-00-000Z.jsonl'),
-        `${RESULT_FAILING}\n`,
-      );
+      writeFileSync(path.join(newerRunDir, 'index.jsonl'), `${RESULT_FAILING}\n`);
 
       const metas = listResultFiles(tempDir);
 
       expect(metas).toHaveLength(2);
       // Most recent first
-      expect(metas[0].filename).toBe('eval_2026-02-21T10-00-00-000Z.jsonl');
+      expect(metas[0].filename).toBe('2026-02-21T10-00-00-000Z');
       expect(metas[0].testCount).toBe(1);
       expect(metas[0].passRate).toBe(0);
 
-      expect(metas[1].filename).toBe('eval_2026-02-20T21-38-05-833Z.jsonl');
+      expect(metas[1].filename).toBe('2026-02-20T21-38-05-833Z');
       expect(metas[1].testCount).toBe(2);
       expect(metas[1].passRate).toBe(0.5);
     });
 
-    it('should find legacy files in .agentv/results/ (backward compat)', () => {
-      const resultsDir = path.join(tempDir, '.agentv', 'results');
-      mkdirSync(resultsDir, { recursive: true });
-
-      writeFileSync(
-        path.join(resultsDir, 'eval_2026-02-20T21-38-05-833Z.jsonl'),
-        `${RESULT_WITH_TRACE}\n`,
-      );
-
-      const metas = listResultFiles(tempDir);
-      expect(metas).toHaveLength(1);
-      expect(metas[0].filename).toBe('eval_2026-02-20T21-38-05-833Z.jsonl');
-    });
-
-    it('should deduplicate files preferring runs/ over legacy root', () => {
+    it('should ignore legacy flat result files in results roots', () => {
       const resultsDir = path.join(tempDir, '.agentv', 'results');
       const runsDir = path.join(resultsDir, 'runs');
       mkdirSync(runsDir, { recursive: true });
-
-      // Same filename in both locations
-      writeFileSync(
-        path.join(runsDir, 'eval_2026-02-20T21-38-05-833Z.jsonl'),
-        `${RESULT_WITH_TRACE}\n`,
-      );
       writeFileSync(
         path.join(resultsDir, 'eval_2026-02-20T21-38-05-833Z.jsonl'),
         `${RESULT_WITH_TRACE}\n`,
       );
+      writeFileSync(path.join(runsDir, '2026-02-21T10-00-00-000Z.jsonl'), `${RESULT_FAILING}\n`);
 
       const metas = listResultFiles(tempDir);
-      expect(metas).toHaveLength(1);
-      // Should prefer the runs/ version
-      expect(metas[0].path).toContain(path.join('runs', 'eval_2026-02-20T21-38-05-833Z.jsonl'));
+
+      expect(metas).toEqual([]);
     });
 
     it('should respect limit', () => {
       const runsDir = path.join(tempDir, '.agentv', 'results', 'runs');
       mkdirSync(runsDir, { recursive: true });
 
-      writeFileSync(
-        path.join(runsDir, 'eval_2026-02-20T21-38-05-833Z.jsonl'),
-        `${RESULT_WITH_TRACE}\n`,
-      );
-      writeFileSync(
-        path.join(runsDir, 'eval_2026-02-21T10-00-00-000Z.jsonl'),
-        `${RESULT_FAILING}\n`,
-      );
+      const olderRunDir = path.join(runsDir, '2026-02-20T21-38-05-833Z');
+      const newerRunDir = path.join(runsDir, '2026-02-21T10-00-00-000Z');
+      mkdirSync(olderRunDir, { recursive: true });
+      mkdirSync(newerRunDir, { recursive: true });
+      writeFileSync(path.join(olderRunDir, 'index.jsonl'), `${RESULT_WITH_TRACE}\n`);
+      writeFileSync(path.join(newerRunDir, 'index.jsonl'), `${RESULT_FAILING}\n`);
 
       const metas = listResultFiles(tempDir, 1);
       expect(metas).toHaveLength(1);
-      expect(metas[0].filename).toBe('eval_2026-02-21T10-00-00-000Z.jsonl');
+      expect(metas[0].filename).toBe('2026-02-21T10-00-00-000Z');
     });
 
-    it('should ignore non-JSONL files', () => {
+    it('should ignore non-directory entries in runs/', () => {
       const runsDir = path.join(tempDir, '.agentv', 'results', 'runs');
       mkdirSync(runsDir, { recursive: true });
 
       writeFileSync(path.join(runsDir, 'notes.txt'), 'not a result file');
-      writeFileSync(
-        path.join(runsDir, 'eval_2026-02-20T21-38-05-833Z.jsonl'),
-        `${RESULT_WITH_TRACE}\n`,
-      );
+      writeFileSync(path.join(runsDir, '2026-02-20T21-38-05-833Z.jsonl'), `${RESULT_WITH_TRACE}\n`);
 
       const metas = listResultFiles(tempDir);
-      expect(metas).toHaveLength(1);
+      expect(metas).toHaveLength(0);
     });
 
     it('should discover index.jsonl inside run directories in runs/', () => {
       const runsDir = path.join(tempDir, '.agentv', 'results', 'runs');
-      const runDir = path.join(runsDir, 'eval_2026-02-20T21-38-05-833Z');
+      const runDir = path.join(runsDir, '2026-02-20T21-38-05-833Z');
       mkdirSync(runDir, { recursive: true });
 
       writeFileSync(
@@ -364,55 +340,12 @@ describe('trace utils', () => {
       expect(metas).toHaveLength(1);
       expect(metas[0].testCount).toBe(2);
       expect(metas[0].passRate).toBe(0.5);
-      expect(metas[0].filename).toBe('eval_2026-02-20T21-38-05-833Z');
-    });
-
-    it('should list both directory-based and flat-file results together', () => {
-      const runsDir = path.join(tempDir, '.agentv', 'results', 'runs');
-      mkdirSync(runsDir, { recursive: true });
-
-      // New directory-based run
-      const runDir = path.join(runsDir, 'eval_2026-02-21T10-00-00-000Z');
-      mkdirSync(runDir, { recursive: true });
-      writeFileSync(path.join(runDir, 'index.jsonl'), `${RESULT_FAILING}\n`);
-
-      // Legacy flat file
-      writeFileSync(
-        path.join(runsDir, 'eval_2026-02-20T21-38-05-833Z.jsonl'),
-        `${RESULT_WITH_TRACE}\n`,
-      );
-
-      const metas = listResultFiles(tempDir);
-      expect(metas).toHaveLength(2);
-      // Most recent first
-      expect(metas[0].filename).toBe('eval_2026-02-21T10-00-00-000Z');
-      expect(metas[1].filename).toBe('eval_2026-02-20T21-38-05-833Z.jsonl');
-    });
-
-    it('should deduplicate directory and flat file with same timestamp', () => {
-      const runsDir = path.join(tempDir, '.agentv', 'results', 'runs');
-      mkdirSync(runsDir, { recursive: true });
-
-      // Directory-based (preferred)
-      const runDir = path.join(runsDir, 'eval_2026-02-20T21-38-05-833Z');
-      mkdirSync(runDir, { recursive: true });
-      writeFileSync(path.join(runDir, 'index.jsonl'), `${RESULT_WITH_TRACE}\n`);
-
-      // Flat file with same timestamp
-      writeFileSync(
-        path.join(runsDir, 'eval_2026-02-20T21-38-05-833Z.jsonl'),
-        `${RESULT_WITH_TRACE}\n`,
-      );
-
-      const metas = listResultFiles(tempDir);
-      expect(metas).toHaveLength(1);
-      // Prefer directory-based (scanned first)
-      expect(metas[0].filename).toBe('eval_2026-02-20T21-38-05-833Z');
+      expect(metas[0].filename).toBe('2026-02-20T21-38-05-833Z');
     });
 
     it('should skip directories without index.jsonl', () => {
       const runsDir = path.join(tempDir, '.agentv', 'results', 'runs');
-      const emptyDir = path.join(runsDir, 'eval_2026-02-20T21-38-05-833Z');
+      const emptyDir = path.join(runsDir, '2026-02-20T21-38-05-833Z');
       mkdirSync(emptyDir, { recursive: true });
 
       // Directory exists but no manifest/result file inside
diff --git a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx
index 769b18641..5d94ed245 100644
--- a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx
+++ b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx
@@ -11,7 +11,7 @@ sidebar:
 agentv eval evals/my-eval.yaml
 ```
 
-Results are written to `.agentv/results/<timestamp>.jsonl`. Each line is a JSON object with one result per test case.
+Results are written to `.agentv/results/runs/<timestamp>/index.jsonl`, the run manifest. Each line is a JSON object holding the result for one test case; the run workspace directory also stores related artifacts.
 
 Each `scores[]` entry includes per-grader timing:
 
@@ -218,10 +218,10 @@ Notes:
 Re-run only the tests that had infrastructure/execution errors from a previous output:
 
 ```bash
-agentv eval evals/my-eval.yaml --retry-errors .agentv/results/eval_previous.jsonl
+agentv eval evals/my-eval.yaml --retry-errors .agentv/results/runs/<timestamp>/index.jsonl
 ```
 
-This reads the previous JSONL, filters for `executionStatus === 'execution_error'`, and re-runs only those test cases. Non-error results from the previous run are preserved and merged into the new output.
+This reads the previous run manifest, filters for `executionStatus === 'execution_error'`, and re-runs only those test cases. Non-error results from the previous run are preserved and merged into the new output.
 
 ### Execution Error Tolerance
 
diff --git a/apps/web/src/content/docs/docs/evaluators/structured-data.mdx b/apps/web/src/content/docs/docs/evaluators/structured-data.mdx
index bf5b06b36..b6d8e0f23 100644
--- a/apps/web/src/content/docs/docs/evaluators/structured-data.mdx
+++ b/apps/web/src/content/docs/docs/evaluators/structured-data.mdx
@@ -14,7 +14,7 @@ Built-in evaluators for grading structured outputs and gating on execution metri
 
 ## Ground Truth
 
-Put the expected structured output in the evalcase `expected_output` (as an object or message array). Evaluators read expected values from there.
+Put the expected structured output in the test case's `expected_output` field (as an object or message array). Evaluators read expected values from there.
 
 ```yaml
 tests:
diff --git a/apps/web/src/content/docs/docs/getting-started/quickstart.mdx b/apps/web/src/content/docs/docs/getting-started/quickstart.mdx
index f4e4bc9f8..ec1fcd2ec 100644
--- a/apps/web/src/content/docs/docs/getting-started/quickstart.mdx
+++ b/apps/web/src/content/docs/docs/getting-started/quickstart.mdx
@@ -66,7 +66,7 @@ tests:
 agentv eval ./evals/example.yaml
 ```
 
-Results appear in `.agentv/results/eval_<timestamp>.jsonl` with scores, reasoning, and execution traces.
+Results appear in `.agentv/results/runs/<timestamp>/index.jsonl` with scores, reasoning, and execution traces.
 
 ## Next Steps
 
diff --git a/apps/web/src/content/docs/docs/tools/trace.mdx b/apps/web/src/content/docs/docs/tools/trace.mdx
index f1d400622..f3ce06021 100644
--- a/apps/web/src/content/docs/docs/tools/trace.mdx
+++ b/apps/web/src/content/docs/docs/tools/trace.mdx
@@ -19,13 +19,13 @@ For full tool-call inspection, prefer OTLP JSON exports over eval manifests.
 
 ### `trace list`
 
-Enumerate evaluation result files from `.agentv/results/`.
+Enumerate canonical evaluation run workspaces from `.agentv/results/runs/`.
 
 ```bash
 agentv trace list [--limit N] [--format json|table]
 ```
 
-Shows filename, test count, pass rate, average score, file size, and timestamp for each result file.
+Shows filename, test count, pass rate, average score, file size, and timestamp for each run workspace.
 
 ### `trace show`
 
@@ -56,7 +56,7 @@ research-question, 15.1s, 10,167 tok, $0.105
 Scores: response_quality 75% | routing_accuracy 100%
 ```
 
-Falls back to a flat summary when output messages are not present in the result file.
+Falls back to a flat summary when output messages are not present in the run workspace.
 
 ### `trace stats`
 
@@ -94,7 +94,7 @@ agentv trace show trace.otlp.json --format json \
   | jq '[.[] | select(.cost_usd > 0.10) | {test_id, score, cost: .cost_usd}]'
 
 # Compare providers
-agentv trace stats .agentv/results/runs/eval_<timestamp>/index.jsonl --group-by target --format json \
+agentv trace stats .agentv/results/runs/<timestamp>/index.jsonl --group-by target --format json \
   | jq '.groups[] | {label, score_mean: .metrics.score.mean}'
 ```
 
diff --git a/examples/features/benchmark-tooling/README.md b/examples/features/benchmark-tooling/README.md
index 68fe45e0c..0af46584c 100644
--- a/examples/features/benchmark-tooling/README.md
+++ b/examples/features/benchmark-tooling/README.md
@@ -34,19 +34,19 @@ Pairwise Summary:
 
 ```bash
 # N-way matrix (all targets)
-agentv compare .agentv/results/runs/eval_<timestamp>/index.jsonl
+agentv compare .agentv/results/runs/<timestamp>/index.jsonl
 
 # With baseline regression check (exits 1 if any target regresses)
-agentv compare .agentv/results/runs/eval_<timestamp>/index.jsonl --baseline gpt-4.1
+agentv compare .agentv/results/runs/<timestamp>/index.jsonl --baseline gpt-4.1
 
 # Pairwise from combined file
-agentv compare .agentv/results/runs/eval_<timestamp>/index.jsonl --baseline gpt-4.1 --candidate gpt-5-mini
+agentv compare .agentv/results/runs/<timestamp>/index.jsonl --baseline gpt-4.1 --candidate gpt-5-mini
 
 # Filter to specific targets
-agentv compare .agentv/results/runs/eval_<timestamp>/index.jsonl --targets gpt-4.1 --targets gpt-5-mini
+agentv compare .agentv/results/runs/<timestamp>/index.jsonl --targets gpt-4.1 --targets gpt-5-mini
 
 # JSON output
-agentv compare .agentv/results/runs/eval_<timestamp>/index.jsonl --json
+agentv compare .agentv/results/runs/<timestamp>/index.jsonl --json
 ```
 
 ### Pairwise Mode
@@ -54,7 +54,7 @@ agentv compare .agentv/results/runs/eval_<timestamp>/index.jsonl --json
 Extract a head-to-head comparison between two specific targets:
 
 ```bash
-agentv compare .agentv/results/runs/eval_<timestamp>/index.jsonl --baseline gpt-4.1 --candidate gpt-5-mini
+agentv compare .agentv/results/runs/<timestamp>/index.jsonl --baseline gpt-4.1 --candidate gpt-5-mini
 ```
 
 ```
@@ -247,7 +247,7 @@ Generates a consolidated benchmark summary across models and metrics from result
 ### Usage
 
 ```bash
-# Summarize all result files in a directory
+# Summarize all run workspaces in a directory
 bun examples/features/benchmark-tooling/scripts/benchmark-report.ts ./by-target/
 
 # Summarize specific files
diff --git a/examples/features/document-extraction/.agentv/targets.yaml b/examples/features/document-extraction/.agentv/targets.yaml
index 0c3ae6e01..afd7d5ef8 100644
--- a/examples/features/document-extraction/.agentv/targets.yaml
+++ b/examples/features/document-extraction/.agentv/targets.yaml
@@ -4,8 +4,8 @@ targets:
     provider_batching: false
     verbose: true
     
-    # Runs the mock invoice extractor for each evalcase individually
-    # {FILES} is replaced with the input file paths from the evalcase
+    # Runs the mock invoice extractor for each test case individually
+    # {FILES} is replaced with the input file paths from the test case
     # {OUTPUT_FILE} is the temporary file path where output should be written
     command: bun run ../mock_extractor.ts {FILES} {OUTPUT_FILE}
 
diff --git a/examples/features/document-extraction/README.md b/examples/features/document-extraction/README.md
index dd5265de4..4d9416676 100644
--- a/examples/features/document-extraction/README.md
+++ b/examples/features/document-extraction/README.md
@@ -2,14 +2,14 @@
 
 This folder demonstrates two evaluation patterns for document extraction:
 
-1. **`field_accuracy`** (built-in) - Per-evalcase scoring with pass/fail per field
+1. **`field_accuracy`** (built-in) - Per-test-case scoring with pass/fail per field
 2. **`code_grader`** (custom) - TP/TN/FP/FN metrics for cross-document aggregation
 
 ## When to Use Each Pattern
 
 | Pattern | Use Case | Output |
 |---------|----------|--------|
-| `field_accuracy` | Simple pass/fail scoring per evalcase | Score (0-1) per evalcase |
+| `field_accuracy` | Simple pass/fail scoring per test case | Score (0-1) per test case |
 | `code_grader` with `details.metrics` | Aggregate precision/recall across documents | TP/TN/FP/FN per field |
 
 ## Quick Start
@@ -17,7 +17,7 @@ This folder demonstrates two evaluation patterns for document extraction:
 From repo root:
 
 ```bash
-# Pattern 1: Field accuracy (per-evalcase scoring)
+# Pattern 1: Field accuracy (per-test-case scoring)
 bun agentv eval examples/features/document-extraction/evals/field-accuracy.eval.yaml
 
 # Pattern 2: Confusion metrics (cross-document aggregation)
@@ -25,12 +25,12 @@ bun agentv eval examples/features/document-extraction/evals/confusion-metrics.ev
 
 # Aggregate TP/TN/FP/FN into a table (only works with confusion-metrics.eval.yaml)
 bun run examples/features/document-extraction/scripts/aggregate_metrics.ts \
-  .agentv/results/eval_<timestamp>.jsonl
+  .agentv/results/runs/<timestamp>/index.jsonl
 ```
 
 ## Pattern 1: Field Accuracy (`field-accuracy.eval.yaml`)
 
-Uses the built-in `field_accuracy` evaluator for per-evalcase scoring:
+Uses the built-in `field_accuracy` evaluator for per-test-case scoring:
 
 ```yaml
 evaluators:
@@ -47,7 +47,7 @@ evaluators:
         tolerance: 1.0
 ```
 
-**Output**: A score (0-1) per evalcase based on weighted field matches.
+**Output**: A score (0-1) per test case based on weighted field matches.
 
 **Best for**: Quick validation, CI/CD gates, simple pass/fail checks.
 
@@ -71,7 +71,7 @@ evaluators:
 **Output**: Aggregate metrics table with fractional precision/recall:
 
 ```
-Processed 5 evaluation results from .agentv/results/eval_<timestamp>.jsonl
+Processed 5 evaluation results from .agentv/results/runs/<timestamp>/index.jsonl
 
 Field          | TP | TN | FP | FN | Precision | Recall | F1    | Count
 ---------------+----+----+----+----+-----------+--------+-------+------
@@ -96,7 +96,7 @@ Macro-F1: 0.759
 The `aggregate_metrics.ts` script only works with evaluators that emit `details.metrics`:
 
 ```bash
-bun run scripts/aggregate_metrics.ts .agentv/results/runs/eval_<timestamp>/index.jsonl [options]
+bun run scripts/aggregate_metrics.ts .agentv/results/runs/<timestamp>/index.jsonl [options]
 
 Options:
   --evaluator <name>  Filter to a specific evaluator
diff --git a/examples/features/document-extraction/evals/confusion-metrics.eval.yaml b/examples/features/document-extraction/evals/confusion-metrics.eval.yaml
index cacb52a5f..b1f5be5e3 100644
--- a/examples/features/document-extraction/evals/confusion-metrics.eval.yaml
+++ b/examples/features/document-extraction/evals/confusion-metrics.eval.yaml
@@ -22,7 +22,7 @@
 #
 # Aggregate:
 #   bun run examples/features/document-extraction/scripts/aggregate_metrics.ts \
-#     .agentv/results/eval_<timestamp>.jsonl
+#     .agentv/results/runs/<timestamp>/index.jsonl
 #
 
 description: Header field confusion metrics (TP/TN/FP/FN aggregation)
diff --git a/examples/features/document-extraction/evals/field-accuracy.eval.yaml b/examples/features/document-extraction/evals/field-accuracy.eval.yaml
index a16da674a..1e4a52710 100644
--- a/examples/features/document-extraction/evals/field-accuracy.eval.yaml
+++ b/examples/features/document-extraction/evals/field-accuracy.eval.yaml
@@ -1,6 +1,6 @@
 # Field Accuracy Evaluation Dataset
 #
-# This dataset demonstrates the built-in `field_accuracy` evaluator for per-evalcase scoring.
+# This dataset demonstrates the built-in `field_accuracy` evaluator for per-test-case scoring.
 # Use this pattern when you need simple pass/fail scoring per field.
 #
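-# For aggregatable TP/TN/FP/FN metrics across documents, see confusion-metrics.yaml instead.
+# For aggregatable TP/TN/FP/FN metrics across documents, see confusion-metrics.eval.yaml instead.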
@@ -22,7 +22,7 @@
 #   invoice-005: ~1.000 (line items extracted correctly)
 #   invoice-006: ~1.000 (greedy matching handles reordered line items)
 #
-description: Field accuracy evaluator patterns (per-evalcase scoring)
+description: Field accuracy evaluator patterns (per-test-case scoring)
 
 execution:
   target: mock_extractor
@@ -416,4 +416,3 @@ tests:
             value: ../fixtures/invoice-006.json
           - type: text
             value: "Extract line items from invoice (may be reordered)."
-
diff --git a/examples/features/document-extraction/scripts/aggregate_metrics.ts b/examples/features/document-extraction/scripts/aggregate_metrics.ts
index 90bcd6684..c20e52a44 100644
--- a/examples/features/document-extraction/scripts/aggregate_metrics.ts
+++ b/examples/features/document-extraction/scripts/aggregate_metrics.ts
@@ -6,9 +6,9 @@
  * per attribute across the whole dataset.
  *
  * Usage:
- *   bun run scripts/aggregate_metrics.ts .agentv/results/runs/eval_<timestamp>/index.jsonl
- *   bun run scripts/aggregate_metrics.ts .agentv/results/runs/eval_<timestamp>/index.jsonl --evaluator header_confusion
- *   bun run scripts/aggregate_metrics.ts .agentv/results/runs/eval_<timestamp>/index.jsonl --format csv
+ *   bun run scripts/aggregate_metrics.ts .agentv/results/runs/<timestamp>/index.jsonl
+ *   bun run scripts/aggregate_metrics.ts .agentv/results/runs/<timestamp>/index.jsonl --evaluator header_confusion
+ *   bun run scripts/aggregate_metrics.ts .agentv/results/runs/<timestamp>/index.jsonl --format csv
  */
 
 import * as fs from 'node:fs';
@@ -241,7 +241,7 @@ Options:
 
 Example:
   bun run scripts/aggregate_metrics.ts .agentv/results/eval-001.jsonl
-  bun run scripts/aggregate_metrics.ts .agentv/results/runs/eval_<timestamp>/index.jsonl --evaluator header_confusion --format csv
+  bun run scripts/aggregate_metrics.ts .agentv/results/runs/<timestamp>/index.jsonl --evaluator header_confusion --format csv
 `);
     process.exit(0);
   }
diff --git a/examples/features/trace-analysis/README.md b/examples/features/trace-analysis/README.md
index 52c3e86bb..c9f167d39 100644
--- a/examples/features/trace-analysis/README.md
+++ b/examples/features/trace-analysis/README.md
@@ -5,11 +5,11 @@ Demonstrates `agentv trace` subcommands for headless trace inspection and analys
 ## Quick Start
 
 ```bash
-# List result files
+# List run workspaces
 bun agentv trace list
 
 # Show summary trace details from the run manifest
-bun agentv trace show .agentv/results/runs/eval_<timestamp>/index.jsonl
+bun agentv trace show .agentv/results/runs/<timestamp>/index.jsonl
 
 # Show hierarchical trace tree from an OTLP export
 bun agentv trace show traces/eval.otlp.json --tree
@@ -18,13 +18,13 @@ bun agentv trace show traces/eval.otlp.json --tree
 bun agentv trace show traces/eval.otlp.json --test-id research-question --tree
 
 # Compute percentile statistics
-bun agentv trace stats .agentv/results/runs/eval_<timestamp>/index.jsonl
+bun agentv trace stats .agentv/results/runs/<timestamp>/index.jsonl
 
 # Group stats by target provider
-bun agentv trace stats .agentv/results/runs/eval_<timestamp>/index.jsonl --group-by target
+bun agentv trace stats .agentv/results/runs/<timestamp>/index.jsonl --group-by target
 
 # JSON output for piping to jq
-bun agentv trace stats .agentv/results/runs/eval_<timestamp>/index.jsonl --format json | jq '.groups[].metrics'
+bun agentv trace stats .agentv/results/runs/<timestamp>/index.jsonl --format json | jq '.groups[].metrics'
 ```
 
 ## What's in the Example Data
@@ -53,6 +53,6 @@ bun agentv trace show traces/eval.otlp.json --format json \
   | jq '[.[] | select(.cost_usd > 0.10) | {test_id, score, cost: .cost_usd}]'
 
 # Compare scores by target provider
-bun agentv trace stats .agentv/results/runs/eval_<timestamp>/index.jsonl --group-by target --format json \
+bun agentv trace stats .agentv/results/runs/<timestamp>/index.jsonl --group-by target --format json \
   | jq '.groups[] | {label, score_mean: .metrics.score.mean}'
 ```
diff --git a/examples/showcase/export-screening/README.md b/examples/showcase/export-screening/README.md
index 0975b0503..059a8b7de 100644
--- a/examples/showcase/export-screening/README.md
+++ b/examples/showcase/export-screening/README.md
@@ -51,7 +51,7 @@ Use the wrapper script to compute a confusion matrix and policy-weighted overall
 structured CI result JSON file (defaults to `results.ci_check.json`):
 
 ```bash
-bun run ./evals/ci_check.ts .agentv/results/runs/eval_<timestamp>/index.jsonl --threshold 0.95 --check-class High
+bun run ./evals/ci_check.ts .agentv/results/runs/<timestamp>/index.jsonl --threshold 0.95 --check-class High
 ```
 
 ### Multi-Sample CI Gating
@@ -153,7 +153,7 @@ bun run ./evals/ci_check.ts --eval ./evals/dataset.eval.yaml --threshold 0.95
 bun run ./evals/ci_check.ts --eval ./evals/dataset.eval.yaml --samples 5 --threshold 0.90
 
 # Or check an existing run manifest
-bun run ./evals/ci_check.ts .agentv/results/runs/eval_<timestamp>/index.jsonl --threshold 0.95
+bun run ./evals/ci_check.ts .agentv/results/runs/<timestamp>/index.jsonl --threshold 0.95
 ```
 
 ### Options
diff --git a/packages/core/src/evaluation/loaders/jsonl-parser.ts b/packages/core/src/evaluation/loaders/jsonl-parser.ts
index 28b3efa3f..6d5e08a5e 100644
--- a/packages/core/src/evaluation/loaders/jsonl-parser.ts
+++ b/packages/core/src/evaluation/loaders/jsonl-parser.ts
@@ -158,10 +158,10 @@ export async function loadTestsFromJsonl(
   const rawFile = await readFile(absoluteTestPath, 'utf8');
   const rawCases = parseJsonlContent(rawFile, evalFilePath);
 
-  // Derive eval set name: sidecar > filename
-  const fallbackEvalSet = path.basename(absoluteTestPath, '.jsonl') || 'eval';
-  const evalSetName =
-    sidecar.name && sidecar.name.trim().length > 0 ? sidecar.name : fallbackEvalSet;
+  // Derive dataset name: sidecar > filename
+  const fallbackDatasetName = path.basename(absoluteTestPath, '.jsonl') || 'eval';
+  const datasetName =
+    sidecar.name && sidecar.name.trim().length > 0 ? sidecar.name : fallbackDatasetName;
 
   // Global defaults from sidecar
   const globalEvaluator = coerceEvaluator(sidecar.evaluator, 'sidecar') ?? 'llm-grader';
@@ -170,7 +170,7 @@ export async function loadTestsFromJsonl(
   if (verbose) {
     console.log(`\n[JSONL Dataset: ${evalFilePath}]`);
     console.log(`  Cases: ${rawCases.length}`);
-    console.log(`  Eval set: ${evalSetName}`);
+    console.log(`  Dataset: ${datasetName}`);
     if (sidecar.description) {
       console.log(`  Description: ${sidecar.description}`);
     }
@@ -179,34 +179,34 @@ export async function loadTestsFromJsonl(
   const results: EvalTest[] = [];
 
   for (let lineIndex = 0; lineIndex < rawCases.length; lineIndex++) {
-    const evalcase = rawCases[lineIndex];
+    const testCaseConfig = rawCases[lineIndex];
     const lineNumber = lineIndex + 1; // 1-based for user-facing messages
-    const id = asString(evalcase.id);
+    const id = asString(testCaseConfig.id);
 
-    // Skip eval cases that don't match the filter pattern (glob supported)
+    // Skip tests that don't match the filter pattern (glob supported)
     if (filterPattern && (!id || !matchesFilter(id, filterPattern))) {
       continue;
     }
 
-    const conversationId = asString(evalcase.conversation_id);
-    let outcome = asString(evalcase.criteria);
-    if (!outcome && evalcase.expected_outcome !== undefined) {
-      outcome = asString(evalcase.expected_outcome);
+    const conversationId = asString(testCaseConfig.conversation_id);
+    let outcome = asString(testCaseConfig.criteria);
+    if (!outcome && testCaseConfig.expected_outcome !== undefined) {
+      outcome = asString(testCaseConfig.expected_outcome);
       if (outcome) {
         logWarning(
-          `Test '${asString(evalcase.id) ?? 'unknown'}': 'expected_outcome' is deprecated. Use 'criteria' instead.`,
+          `Test '${asString(testCaseConfig.id) ?? 'unknown'}': 'expected_outcome' is deprecated. Use 'criteria' instead.`,
         );
       }
     }
 
     // Resolve input with shorthand support
-    const rawInputMessages = resolveInputMessages(evalcase);
+    const rawInputMessages = resolveInputMessages(testCaseConfig);
     // Resolve expected_output with shorthand support
-    const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
+    const expectedMessages = resolveExpectedMessages(testCaseConfig) ?? [];
 
     // A test is complete when it has id, input, and at least one of: criteria, expected_output, or assert
     const hasEvaluationSpec =
-      !!outcome || expectedMessages.length > 0 || evalcase.assert !== undefined;
+      !!outcome || expectedMessages.length > 0 || testCaseConfig.assert !== undefined;
     if (!id || !hasEvaluationSpec || !rawInputMessages || rawInputMessages.length === 0) {
       logError(
         `Skipping incomplete test at line ${lineNumber}: ${id ?? 'unknown'}. Missing required fields: id, input, and at least one of criteria/expected_output/assert`,
@@ -265,13 +265,20 @@ export async function loadTestsFromJsonl(
       .join(' ');
 
     // Merge execution config: per-case overrides sidecar
-    const caseExecution = isJsonObject(evalcase.execution) ? evalcase.execution : undefined;
+    const caseExecution = isJsonObject(testCaseConfig.execution)
+      ? testCaseConfig.execution
+      : undefined;
     const mergedExecution = caseExecution ?? globalExecution;
 
-    const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
+    const testCaseEvaluatorKind = coerceEvaluator(testCaseConfig.evaluator, id) ?? globalEvaluator;
     let evaluators: Awaited<ReturnType<typeof parseEvaluators>>;
     try {
-      evaluators = await parseEvaluators(evalcase, mergedExecution, searchRoots, id ?? 'unknown');
+      evaluators = await parseEvaluators(
+        testCaseConfig,
+        mergedExecution,
+        searchRoots,
+        id ?? 'unknown',
+      );
     } catch (error) {
       // Skip entire test if evaluator validation fails
       const message = error instanceof Error ? error.message : String(error);
@@ -280,7 +287,7 @@ export async function loadTestsFromJsonl(
     }
 
     // Handle inline rubrics field (deprecated: use assertions: [{type: rubrics, criteria: [...]}] instead)
-    const inlineRubrics = evalcase.rubrics;
+    const inlineRubrics = testCaseConfig.rubrics;
     if (inlineRubrics !== undefined && Array.isArray(inlineRubrics)) {
       const rubricEvaluator = parseInlineRubrics(inlineRubrics);
       if (rubricEvaluator) {
@@ -295,7 +302,7 @@ export async function loadTestsFromJsonl(
 
     const testCase: EvalTest = {
       id,
-      dataset: evalSetName,
+      dataset: datasetName,
       conversation_id: conversationId,
       question: question,
       input: inputMessages,
@@ -303,7 +310,7 @@ export async function loadTestsFromJsonl(
       reference_answer: referenceAnswer,
       file_paths: userFilePaths,
       criteria: outcome ?? '',
-      evaluator: evalCaseEvaluatorKind,
+      evaluator: testCaseEvaluatorKind,
       assertions: evaluators,
     };
 
diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts
index 0e0cc962f..1117dc7ed 100644
--- a/packages/core/src/evaluation/yaml-parser.ts
+++ b/packages/core/src/evaluation/yaml-parser.ts
@@ -273,18 +273,18 @@ async function loadTestsFromYaml(
   }
 
   const suite = interpolated as RawTestSuite;
-  const evalSetNameFromSuite = asString(suite.name)?.trim();
-  const fallbackEvalSet =
+  const datasetNameFromSuite = asString(suite.name)?.trim();
+  const fallbackDatasetName =
     path
       .basename(absoluteTestPath)
       .replace(/\.eval\.ya?ml$/i, '')
       .replace(/\.ya?ml$/i, '') || 'eval';
-  const evalSetName =
-    evalSetNameFromSuite && evalSetNameFromSuite.length > 0
-      ? evalSetNameFromSuite
-      : fallbackEvalSet;
+  const datasetName =
+    datasetNameFromSuite && datasetNameFromSuite.length > 0
+      ? datasetNameFromSuite
+      : fallbackDatasetName;
 
-  const rawTestcases = resolveTests(suite);
+  const rawTestCases = resolveTests(suite);
 
   const globalEvaluator = coerceEvaluator(suite.evaluator, 'global') ?? 'llm-grader';
 
@@ -292,14 +292,14 @@ async function loadTestsFromYaml(
   const evalFileDir = path.dirname(absoluteTestPath);
 
   // Resolve tests: string path to external file, inline array, or error
-  let expandedTestcases: readonly JsonValue[];
-  if (typeof rawTestcases === 'string') {
+  let expandedTestCases: readonly JsonValue[];
+  if (typeof rawTestCases === 'string') {
     // String path: load tests from external file (YAML, JSONL)
-    const externalPath = path.resolve(evalFileDir, rawTestcases);
-    expandedTestcases = await loadCasesFromFile(externalPath);
-  } else if (Array.isArray(rawTestcases)) {
+    const externalPath = path.resolve(evalFileDir, rawTestCases);
+    expandedTestCases = await loadCasesFromFile(externalPath);
+  } else if (Array.isArray(rawTestCases)) {
     // Inline array: expand any file:// references
-    expandedTestcases = await expandFileReferences(rawTestcases, evalFileDir);
+    expandedTestCases = await expandFileReferences(rawTestCases, evalFileDir);
   } else {
     throw new Error(`Invalid test file format: ${evalFilePath} - missing 'tests' field`);
   }
@@ -329,47 +329,49 @@ async function loadTestsFromYaml(
 
   const results: EvalTest[] = [];
 
-  for (const rawEvalcase of expandedTestcases) {
-    if (!isJsonObject(rawEvalcase)) {
+  for (const rawTestCase of expandedTestCases) {
+    if (!isJsonObject(rawTestCase)) {
       logWarning('Skipping invalid test entry (expected object)');
       continue;
     }
 
-    const evalcase = rawEvalcase as RawEvalCase;
-    const id = asString(evalcase.id);
+    const testCaseConfig = rawTestCase as RawEvalCase;
+    const id = asString(testCaseConfig.id);
 
     // Skip tests that don't match the filter pattern (glob supported)
     if (filterPattern && (!id || !matchesFilter(id, filterPattern))) {
       continue;
     }
 
-    const conversationId = asString(evalcase.conversation_id);
-    let outcome = asString(evalcase.criteria);
-    if (!outcome && evalcase.expected_outcome !== undefined) {
-      outcome = asString(evalcase.expected_outcome);
+    const conversationId = asString(testCaseConfig.conversation_id);
+    let outcome = asString(testCaseConfig.criteria);
+    if (!outcome && testCaseConfig.expected_outcome !== undefined) {
+      outcome = asString(testCaseConfig.expected_outcome);
       if (outcome) {
         logWarning(
-          `Test '${asString(evalcase.id) ?? 'unknown'}': 'expected_outcome' is deprecated. Use 'criteria' instead.`,
+          `Test '${asString(testCaseConfig.id) ?? 'unknown'}': 'expected_outcome' is deprecated. Use 'criteria' instead.`,
         );
       }
     }
 
     // Extract per-case execution config early (reused below for skip_defaults)
-    const caseExecution = isJsonObject(evalcase.execution) ? evalcase.execution : undefined;
+    const caseExecution = isJsonObject(testCaseConfig.execution)
+      ? testCaseConfig.execution
+      : undefined;
     const skipDefaults = caseExecution?.skip_defaults === true;
 
     // Resolve input with shorthand support (pass suite-level input_files for merge)
     const effectiveSuiteInputFiles = suiteInputFiles && !skipDefaults ? suiteInputFiles : undefined;
-    const testInputMessages = resolveInputMessages(evalcase, effectiveSuiteInputFiles);
+    const testInputMessages = resolveInputMessages(testCaseConfig, effectiveSuiteInputFiles);
     // Resolve expected_output with shorthand support
-    const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
+    const expectedMessages = resolveExpectedMessages(testCaseConfig) ?? [];
 
     // A test is complete when it has id, input, and at least one of: criteria, expected_output, or assertions
     const hasEvaluationSpec =
       !!outcome ||
       expectedMessages.length > 0 ||
-      evalcase.assertions !== undefined ||
-      evalcase.assert !== undefined;
+      testCaseConfig.assertions !== undefined ||
+      testCaseConfig.assert !== undefined;
     if (!id || !hasEvaluationSpec || !testInputMessages || testInputMessages.length === 0) {
       logError(
         `Skipping incomplete test: ${id ?? 'unknown'}. Missing required fields: id, input, and at least one of criteria/expected_output/assertions`,
@@ -444,10 +446,15 @@ async function loadTestsFromYaml(
       .filter((part) => part.length > 0)
       .join(' ');
 
-    const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
+    const testCaseEvaluatorKind = coerceEvaluator(testCaseConfig.evaluator, id) ?? globalEvaluator;
     let evaluators: Awaited<ReturnType<typeof parseEvaluators>>;
     try {
-      evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? 'unknown');
+      evaluators = await parseEvaluators(
+        testCaseConfig,
+        globalExecution,
+        searchRoots,
+        id ?? 'unknown',
+      );
     } catch (error) {
       // Skip entire test if evaluator validation fails
       const message = error instanceof Error ? error.message : String(error);
@@ -456,7 +463,7 @@ async function loadTestsFromYaml(
     }
 
     // Handle inline rubrics field (deprecated: use assertions: [{type: rubrics, criteria: [...]}] instead)
-    const inlineRubrics = evalcase.rubrics;
+    const inlineRubrics = testCaseConfig.rubrics;
     if (inlineRubrics !== undefined && Array.isArray(inlineRubrics)) {
       const rubricEvaluator = parseInlineRubrics(inlineRubrics);
       if (rubricEvaluator) {
@@ -470,20 +477,20 @@ async function loadTestsFromYaml(
     const userFilePaths = collectResolvedInputFilePaths(inputMessages);
 
     // Parse per-case workspace config and merge with suite-level
-    const caseWorkspace = await resolveWorkspaceConfig(evalcase.workspace, evalFileDir);
+    const caseWorkspace = await resolveWorkspaceConfig(testCaseConfig.workspace, evalFileDir);
     const mergedWorkspace = mergeWorkspaceConfigs(suiteWorkspace, caseWorkspace);
 
     // Parse per-case metadata
-    const metadata = isJsonObject(evalcase.metadata)
-      ? (evalcase.metadata as Record<string, unknown>)
+    const metadata = isJsonObject(testCaseConfig.metadata)
+      ? (testCaseConfig.metadata as Record<string, unknown>)
       : undefined;
 
     // Extract per-test targets override (matrix evaluation)
-    const caseTargets = extractTargetsFromTestCase(evalcase as JsonObject);
+    const caseTargets = extractTargetsFromTestCase(testCaseConfig as JsonObject);
 
     const testCase: EvalTest = {
       id,
-      dataset: evalSetName,
+      dataset: datasetName,
       category: options?.category,
       conversation_id: conversationId,
       question: question,
@@ -492,7 +499,7 @@ async function loadTestsFromYaml(
       reference_answer: referenceAnswer,
       file_paths: userFilePaths,
       criteria: outcome ?? '',
-      evaluator: evalCaseEvaluatorKind,
+      evaluator: testCaseEvaluatorKind,
       assertions: evaluators,
       workspace: mergedWorkspace,
       metadata,

From 24a18dd8e649b342f91fbb000fffaf2873cc3f27 Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Sun, 5 Apr 2026 05:51:01 +0000
Subject: [PATCH 2/4] fix(trace): preserve canonical run timestamps

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 apps/cli/src/commands/compare/index.ts          | 4 +++-
 apps/cli/src/commands/results/export.ts         | 3 ++-
 apps/cli/src/commands/results/serve.ts          | 9 ++-------
 apps/cli/src/commands/trace/utils.ts            | 4 +++-
 apps/cli/test/commands/trace/trace.test.ts      | 7 +++++++
 apps/web/src/content/docs/docs/tools/studio.mdx | 6 ++++--
 6 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/apps/cli/src/commands/compare/index.ts b/apps/cli/src/commands/compare/index.ts
index 5dadaaf06..beab6d2a0 100644
--- a/apps/cli/src/commands/compare/index.ts
+++ b/apps/cli/src/commands/compare/index.ts
@@ -13,7 +13,9 @@ import {
   restPositionals,
   string,
 } from 'cmd-ts';
+
 import { toSnakeCaseDeep } from '../../utils/case-conversion.js';
+import { RESULT_INDEX_FILENAME } from '../eval/result-layout.js';
 import { loadLightweightResults, resolveResultSourcePath } from '../results/manifest.js';
 
 // ANSI color codes (no dependency needed)
@@ -111,7 +113,7 @@ function loadFlatCompareResults(filePath: string): ParsedCompareResult[] {
 function loadCompareResults(filePath: string): ParsedCompareResult[] {
   try {
     const resolvedPath = resolveResultSourcePath(filePath);
-    if (path.basename(resolvedPath) === 'index.jsonl') {
+    if (path.basename(resolvedPath) === RESULT_INDEX_FILENAME) {
       return loadLightweightResults(resolvedPath).map((record) => ({
         testId: record.testId,
         score: record.score,
diff --git a/apps/cli/src/commands/results/export.ts b/apps/cli/src/commands/results/export.ts
index b03a649af..5f11fd489 100644
--- a/apps/cli/src/commands/results/export.ts
+++ b/apps/cli/src/commands/results/export.ts
@@ -30,6 +30,7 @@ import { command, option, optional, positional, string } from 'cmd-ts';
 import type { EvaluationResult } from '@agentv/core';
 
 import { parseJsonlResults, writeArtifactsFromResults } from '../eval/artifact-writer.js';
+import { RESULT_INDEX_FILENAME } from '../eval/result-layout.js';
 import { loadResults as loadSharedResults, resolveSourceFile } from './shared.js';
 
 // ── Export logic ─────────────────────────────────────────────────────────
@@ -57,7 +58,7 @@ export async function exportResults(
  */
 export function deriveOutputDir(cwd: string, sourceFile: string): string {
   const baseName = path.basename(sourceFile);
-  if (baseName !== 'index.jsonl') {
+  if (baseName !== RESULT_INDEX_FILENAME) {
     const stem = path.basename(sourceFile, path.extname(sourceFile));
     return path.join(
       cwd,
diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts
index 671a5aaa6..2ea30f357 100644
--- a/apps/cli/src/commands/results/serve.ts
+++ b/apps/cli/src/commands/results/serve.ts
@@ -1068,13 +1068,8 @@ export const resultsServeCommand = command({
       // When a source is explicitly provided, it must exist.
       // Otherwise, try to auto-discover results; start empty if none found.
       if (source) {
-        const resolved = resolveResultSourcePath(source, cwd);
-        if (!existsSync(resolved)) {
-          console.error(`Error: Source file not found: ${resolved}`);
-          process.exit(1);
-        }
-        sourceFile = resolved;
-        results = loadManifestResults(resolved);
+        sourceFile = await resolveSourceFile(source, cwd);
+        results = loadManifestResults(sourceFile);
       } else {
         // Auto-discover: run cache -> directory scan -> empty state
         const cache = await loadRunCache(cwd);
diff --git a/apps/cli/src/commands/trace/utils.ts b/apps/cli/src/commands/trace/utils.ts
index 45d865ed6..443a1466f 100644
--- a/apps/cli/src/commands/trace/utils.ts
+++ b/apps/cli/src/commands/trace/utils.ts
@@ -595,7 +595,9 @@ export function listResultFiles(cwd: string, limit?: number): ResultFileMeta[] {
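- * Extract ISO timestamp from eval filename like eval_2026-02-20T21-38-05-833Z.jsonl
+ * Extract ISO timestamp from a run directory name (2026-02-20T21-38-05-833Z) or eval_<timestamp>.jsonl filename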
  */
 export function extractTimestampFromFilename(filename: string): string | undefined {
-  const match = filename.match(/eval_(\d{4}-\d{2}-\d{2}T[\d-]+Z)/);
+  const match = filename.match(
+    /(?:^|eval_)(\d{4}-\d{2}-\d{2}T\d{2}-\d{2}-\d{2}-\d{3}Z)(?:\.jsonl)?$/,
+  );
   if (!match) return undefined;
   // Re-convert dashes back to colons/dots for display
   return match[1].replace(/-(\d{2})-(\d{2})-(\d{3})Z$/, ':$1:$2.$3Z');
diff --git a/apps/cli/test/commands/trace/trace.test.ts b/apps/cli/test/commands/trace/trace.test.ts
index b813711d4..32ea668cb 100644
--- a/apps/cli/test/commands/trace/trace.test.ts
+++ b/apps/cli/test/commands/trace/trace.test.ts
@@ -275,10 +275,12 @@ describe('trace utils', () => {
       expect(metas).toHaveLength(2);
       // Most recent first
       expect(metas[0].filename).toBe('2026-02-21T10-00-00-000Z');
+      expect(metas[0].timestamp).toBe('2026-02-21T10:00:00.000Z');
       expect(metas[0].testCount).toBe(1);
       expect(metas[0].passRate).toBe(0);
 
       expect(metas[1].filename).toBe('2026-02-20T21-38-05-833Z');
+      expect(metas[1].timestamp).toBe('2026-02-20T21:38:05.833Z');
       expect(metas[1].testCount).toBe(2);
       expect(metas[1].passRate).toBe(0.5);
     });
@@ -371,6 +373,11 @@ describe('trace utils', () => {
       const result = extractTimestampFromFilename('eval_2026-01-01T00-00-00-000Z.jsonl');
       expect(result).toBe('2026-01-01T00:00:00.000Z');
     });
+
+    it('should extract and format timestamp from bare run directory names', () => {
+      const result = extractTimestampFromFilename('2026-02-20T21-38-05-833Z');
+      expect(result).toBe('2026-02-20T21:38:05.833Z');
+    });
   });
 
   describe('formatDuration', () => {
diff --git a/apps/web/src/content/docs/docs/tools/studio.mdx b/apps/web/src/content/docs/docs/tools/studio.mdx
index 6f4bc3925..2365b11ef 100644
--- a/apps/web/src/content/docs/docs/tools/studio.mdx
+++ b/apps/web/src/content/docs/docs/tools/studio.mdx
@@ -19,12 +19,14 @@ The `studio` command launches a web-based dashboard for browsing evaluation runs
 agentv studio
 ```
 
-Studio auto-discovers results from `.agentv/results/` in the current directory and opens at `http://localhost:3117`.
+Studio auto-discovers run workspaces from `.agentv/results/runs/` in the current directory and opens at `http://localhost:3117`.
 
-You can also point it at a specific results file:
+You can also point it at a specific run workspace or `index.jsonl` manifest:
 
 ```bash
 agentv studio .agentv/results/runs/2026-03-30T11-45-56-989Z/index.jsonl
+# or
+agentv studio .agentv/results/runs/2026-03-30T11-45-56-989Z
 ```
 
 ## Options

From f7210663efa8fc7af8087f971be7279818e03554 Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Sun, 5 Apr 2026 06:04:00 +0000
Subject: [PATCH 3/4] fix(compare): require test_id in flat inputs

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 apps/cli/src/commands/compare/index.ts        | 11 +-----
 .../cli/test/commands/compare/compare.test.ts | 38 -------------------
 2 files changed, 1 insertion(+), 48 deletions(-)

diff --git a/apps/cli/src/commands/compare/index.ts b/apps/cli/src/commands/compare/index.ts
index beab6d2a0..a8cf5f5f8 100644
--- a/apps/cli/src/commands/compare/index.ts
+++ b/apps/cli/src/commands/compare/index.ts
@@ -82,16 +82,7 @@ function loadFlatCompareResults(filePath: string): ParsedCompareResult[] {
     if (!line) continue;
 
     const parsed = JSON.parse(line) as Record<string, unknown>;
-    const testId =
-      typeof parsed.test_id === 'string'
-        ? parsed.test_id
-        : typeof parsed.testId === 'string'
-          ? parsed.testId
-          : typeof parsed.eval_id === 'string'
-            ? parsed.eval_id
-            : typeof parsed.evalId === 'string'
-              ? parsed.evalId
-              : undefined;
+    const testId = typeof parsed.test_id === 'string' ? parsed.test_id : undefined;
     if (!testId) {
       throw new Error(`Missing test_id in result source: ${filePath}`);
     }
diff --git a/apps/cli/test/commands/compare/compare.test.ts b/apps/cli/test/commands/compare/compare.test.ts
index e548d300d..4847d99ba 100644
--- a/apps/cli/test/commands/compare/compare.test.ts
+++ b/apps/cli/test/commands/compare/compare.test.ts
@@ -41,36 +41,6 @@ describe('compare command', () => {
       ]);
     });
 
-    it('should load valid JSONL file with legacy eval_id results', () => {
-      const filePath = path.join(tempDir, 'results.jsonl');
-      writeFileSync(
-        filePath,
-        '{"eval_id": "case-1", "score": 0.8}\n{"eval_id": "case-2", "score": 0.9}\n',
-      );
-
-      const results = loadJsonlResults(filePath);
-
-      expect(results).toEqual([
-        { testId: 'case-1', score: 0.8 },
-        { testId: 'case-2', score: 0.9 },
-      ]);
-    });
-
-    it('should load flat JSONL files with camelCase testId results', () => {
-      const filePath = path.join(tempDir, 'results.jsonl');
-      writeFileSync(
-        filePath,
-        '{"testId": "case-1", "score": 0.8}\n{"testId": "case-2", "score": 0.9}\n',
-      );
-
-      const results = loadJsonlResults(filePath);
-
-      expect(results).toEqual([
-        { testId: 'case-1', score: 0.8 },
-        { testId: 'case-2', score: 0.9 },
-      ]);
-    });
-
     it('should handle empty lines in JSONL', () => {
       const filePath = path.join(tempDir, 'results.jsonl');
       writeFileSync(
@@ -192,14 +162,6 @@ describe('compare command', () => {
       expect(groups.get('a')).toHaveLength(2);
     });
 
-    it('should support legacy eval_id field', () => {
-      const filePath = path.join(tempDir, 'combined.jsonl');
-      writeFileSync(filePath, '{"eval_id": "t1", "score": 0.8, "target": "a"}\n');
-
-      const groups = loadCombinedResults(filePath);
-      expect(groups.get('a')).toEqual([{ testId: 't1', score: 0.8 }]);
-    });
-
     it('should group records from index.jsonl manifests', () => {
       const runDir = path.join(tempDir, 'eval_2026-03-24T00-00-00-000Z');
       mkdirSync(runDir, { recursive: true });

From 4c8e22dbf5fb98ce597d6d11fe1b4071e9d3f7b1 Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Sun, 5 Apr 2026 07:25:21 +0000
Subject: [PATCH 4/4] refactor(results): drop flat jsonl compatibility

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 apps/cli/src/commands/compare/index.ts        |  79 ++++--------
 apps/cli/src/commands/eval/artifact-writer.ts |   7 +-
 apps/cli/src/commands/eval/commands/run.ts    |   3 +-
 apps/cli/src/commands/eval/retry-errors.ts    |  11 +-
 apps/cli/src/commands/results/export.ts       |  45 ++-----
 apps/cli/src/commands/results/serve.ts        |   2 +-
 .../cli/test/commands/compare/compare.test.ts | 112 ++++++++++++------
 .../results/export-e2e-providers.test.ts      |  32 +----
 apps/cli/test/commands/results/export.test.ts |  41 ++++---
 apps/cli/test/unit/retry-errors.test.ts       |  83 +++++++------
 examples/features/benchmark-tooling/README.md |  46 +++----
 examples/features/compare/README.md           |  34 +++---
 examples/features/compare/evals/README.md     |  38 +++---
 .../features/compare/evals/dataset.eval.yaml  |   4 +-
 .../showcase/multi-model-benchmark/README.md  |  16 +--
 .../skills/agentv-eval-writer/SKILL.md        |  14 +--
 16 files changed, 258 insertions(+), 309 deletions(-)

diff --git a/apps/cli/src/commands/compare/index.ts b/apps/cli/src/commands/compare/index.ts
index a8cf5f5f8..53b0c7651 100644
--- a/apps/cli/src/commands/compare/index.ts
+++ b/apps/cli/src/commands/compare/index.ts
@@ -1,6 +1,3 @@
-import { readFileSync } from 'node:fs';
-import path from 'node:path';
-
 import {
   array,
   command,
@@ -15,7 +12,6 @@ import {
 } from 'cmd-ts';
 
 import { toSnakeCaseDeep } from '../../utils/case-conversion.js';
-import { RESULT_INDEX_FILENAME } from '../eval/result-layout.js';
 import { loadLightweightResults, resolveResultSourcePath } from '../results/manifest.js';
 
 // ANSI color codes (no dependency needed)
@@ -67,55 +63,24 @@ interface MatrixRow {
   scores: Record<string, number>;
 }
 
-interface ParsedCompareResult {
-  testId: string;
-  score: number;
+interface CompareInputRecord extends EvalResult {
   target?: string;
 }
 
-function loadFlatCompareResults(filePath: string): ParsedCompareResult[] {
-  const content = readFileSync(filePath, 'utf8');
-  const results: ParsedCompareResult[] = [];
-
-  for (const rawLine of content.split('\n')) {
-    const line = rawLine.trim();
-    if (!line) continue;
-
-    const parsed = JSON.parse(line) as Record<string, unknown>;
-    const testId = typeof parsed.test_id === 'string' ? parsed.test_id : undefined;
-    if (!testId) {
+function loadCompareResults(filePath: string): CompareInputRecord[] {
+  return loadLightweightResults(resolveResultSourcePath(filePath)).map((record) => {
+    if (!record.testId || record.testId === 'unknown') {
       throw new Error(`Missing test_id in result source: ${filePath}`);
     }
-
-    if (typeof parsed.score !== 'number' || Number.isNaN(parsed.score)) {
+    if (typeof record.score !== 'number' || Number.isNaN(record.score)) {
       throw new Error(`Missing or invalid score in result source: ${filePath}`);
     }
-
-    results.push({
-      testId,
-      score: parsed.score,
-      target: typeof parsed.target === 'string' ? parsed.target : undefined,
-    });
-  }
-
-  return results;
-}
-
-function loadCompareResults(filePath: string): ParsedCompareResult[] {
-  try {
-    const resolvedPath = resolveResultSourcePath(filePath);
-    if (path.basename(resolvedPath) === RESULT_INDEX_FILENAME) {
-      return loadLightweightResults(resolvedPath).map((record) => ({
-        testId: record.testId,
-        score: record.score,
-        target: record.target,
-      }));
-    }
-  } catch {
-    // Fall back to direct JSONL parsing for explicit flat result files.
-  }
-
-  return loadFlatCompareResults(filePath);
+    return {
+      testId: record.testId,
+      score: record.score,
+      target: record.target,
+    };
+  });
 }
 
 export interface MatrixOutput {
@@ -125,10 +90,7 @@ export interface MatrixOutput {
 }
 
 export function loadJsonlResults(filePath: string): EvalResult[] {
-  return loadCompareResults(filePath).map((record) => ({
-    testId: record.testId,
-    score: record.score,
-  }));
+  return loadCompareResults(filePath).map(({ testId, score }) => ({ testId, score }));
 }
 
 export function loadCombinedResults(filePath: string): Map<string, EvalResult[]> {
@@ -469,12 +431,13 @@ export function formatMatrix(matrixOutput: MatrixOutput, baselineTarget?: string
 export const compareCommand = command({
   name: 'compare',
   description:
-    'Compare evaluation result files: two-file pairwise, combined JSONL pairwise, or N-way matrix',
+    'Compare evaluation run manifests: two-run pairwise, single-run pairwise, or N-way matrix',
   args: {
     results: restPositionals({
       type: string,
       displayName: 'results',
-      description: 'JSONL result file path(s). One file: combined mode. Two files: pairwise mode.',
+      description:
+        'Run workspace or index.jsonl manifest path(s). One source: single-run mode. Two sources: pairwise mode.',
     }),
     threshold: option({
       type: optional(number),
@@ -486,13 +449,13 @@ export const compareCommand = command({
       type: optional(string),
       long: 'baseline',
       short: 'b',
-      description: 'Target name to use as baseline (filters combined JSONL)',
+      description: 'Target name to use as baseline (filters a single run manifest)',
     }),
     candidate: option({
       type: optional(string),
       long: 'candidate',
       short: 'c',
-      description: 'Target name to use as candidate (filters combined JSONL)',
+      description: 'Target name to use as candidate (filters a single run manifest)',
     }),
     targets: multioption({
       type: array(string),
@@ -516,7 +479,7 @@ export const compareCommand = command({
 
     try {
       if (results.length === 0) {
-        throw new Error('At least one JSONL result file is required');
+        throw new Error('At least one run workspace or index.jsonl manifest is required');
       }
 
       if (results.length === 2) {
@@ -534,7 +497,7 @@ export const compareCommand = command({
         const exitCode = determineExitCode(comparison.summary.meanDelta);
         process.exit(exitCode);
       } else if (results.length === 1) {
-        // Combined JSONL mode
+        // Single-run manifest mode
         let groups = loadCombinedResults(results[0]);
 
         // Filter by --targets if specified
@@ -570,7 +533,7 @@ export const compareCommand = command({
         }
 
         if (baseline && candidate) {
-          // Pairwise mode from combined JSONL
+          // Pairwise mode from a single run manifest
           const baselineResults = groups.get(baseline);
           const candidateResults = groups.get(candidate);
           if (!baselineResults) {
@@ -604,7 +567,7 @@ export const compareCommand = command({
           process.exit(exitCode);
         }
       } else {
-        throw new Error('Expected 1 or 2 JSONL result files');
+        throw new Error('Expected 1 or 2 run workspaces or index.jsonl manifests');
       }
     } catch (error) {
       console.error(`Error: ${(error as Error).message}`);
diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts
index 14035f20b..4c072d661 100644
--- a/apps/cli/src/commands/eval/artifact-writer.ts
+++ b/apps/cli/src/commands/eval/artifact-writer.ts
@@ -648,12 +648,7 @@ function normalizeParsedResult(value: unknown): ParsedEvaluationResult | undefin
   return {
     ...result,
     timestamp: typeof result.timestamp === 'string' ? result.timestamp : new Date(0).toISOString(),
-    testId:
-      typeof result.testId === 'string'
-        ? result.testId
-        : typeof result.evalId === 'string'
-          ? result.evalId
-          : 'unknown',
+    testId: typeof result.testId === 'string' ? result.testId : 'unknown',
     score: typeof result.score === 'number' ? result.score : 0,
     assertions: Array.isArray(result.assertions) ? result.assertions.filter(isAssertionEntry) : [],
     target: typeof result.target === 'string' ? result.target : 'unknown',
diff --git a/apps/cli/src/commands/eval/commands/run.ts b/apps/cli/src/commands/eval/commands/run.ts
index 8e6903c52..f57957503 100644
--- a/apps/cli/src/commands/eval/commands/run.ts
+++ b/apps/cli/src/commands/eval/commands/run.ts
@@ -141,7 +141,8 @@ export const evalRunCommand = command({
     retryErrors: option({
       type: optional(string),
       long: 'retry-errors',
-      description: 'Path to previous output JSONL — re-run only execution_error test cases',
+      description:
+        'Path to a previous run workspace or index.jsonl manifest — re-run only execution_error test cases',
     }),
     strict: flag({
       long: 'strict',
diff --git a/apps/cli/src/commands/eval/retry-errors.ts b/apps/cli/src/commands/eval/retry-errors.ts
index 8a39bc3bf..65f76c2ea 100644
--- a/apps/cli/src/commands/eval/retry-errors.ts
+++ b/apps/cli/src/commands/eval/retry-errors.ts
@@ -1,18 +1,9 @@
-import { readFile } from 'node:fs/promises';
-
 import type { EvaluationResult } from '@agentv/core';
 
 import { loadManifestResults, resolveResultSourcePath } from '../results/manifest.js';
-import { parseJsonlResults } from './artifact-writer.js';
 
 async function loadRetrySourceResults(jsonlPath: string): Promise<readonly EvaluationResult[]> {
-  try {
-    const resolvedPath = resolveResultSourcePath(jsonlPath);
-    return loadManifestResults(resolvedPath);
-  } catch {
-    const content = await readFile(jsonlPath, 'utf8');
-    return parseJsonlResults(content);
-  }
+  return loadManifestResults(resolveResultSourcePath(jsonlPath));
 }
 
 /**
diff --git a/apps/cli/src/commands/results/export.ts b/apps/cli/src/commands/results/export.ts
index 5f11fd489..c31622cc0 100644
--- a/apps/cli/src/commands/results/export.ts
+++ b/apps/cli/src/commands/results/export.ts
@@ -1,6 +1,6 @@
 /**
- * `agentv results export` — converts JSONL eval results into a directory
- * structure matching the artifact-writer output format.
+ * `agentv results export` — converts a canonical run workspace or index.jsonl
+ * manifest into a directory structure matching the artifact-writer output format.
  *
  * Output structure:
  *   <output-dir>/
@@ -21,8 +21,6 @@
  *   - To add new per-test workspace files, add them under each test directory.
  */
 
-import { existsSync } from 'node:fs';
-import { readFile } from 'node:fs/promises';
 import path from 'node:path';
 
 import { command, option, optional, positional, string } from 'cmd-ts';
@@ -57,16 +55,8 @@ export async function exportResults(
  * Derive the default output directory from a run manifest path.
  */
 export function deriveOutputDir(cwd: string, sourceFile: string): string {
-  const baseName = path.basename(sourceFile);
-  if (baseName !== RESULT_INDEX_FILENAME) {
-    const stem = path.basename(sourceFile, path.extname(sourceFile));
-    return path.join(
-      cwd,
-      '.agentv',
-      'results',
-      'export',
-      stem.startsWith('eval_') ? stem.slice(5) : stem,
-    );
+  if (path.basename(sourceFile) !== RESULT_INDEX_FILENAME) {
+    throw new Error(`Expected a run manifest named ${RESULT_INDEX_FILENAME}: ${sourceFile}`);
   }
 
   const parentDir = path.basename(path.dirname(sourceFile));
@@ -80,35 +70,16 @@ export async function loadExportSource(
   source: string | undefined,
   cwd: string,
 ): Promise<{ sourceFile: string; results: readonly EvaluationResult[] }> {
-  try {
-    const { sourceFile } = await resolveSourceFile(source, cwd);
-    const { results } = await loadSharedResults(source, cwd);
-    return { sourceFile, results };
-  } catch (error) {
-    if (!source) {
-      throw error;
-    }
-
-    const explicitSource = path.isAbsolute(source) ? source : path.resolve(cwd, source);
-    if (!existsSync(explicitSource) || path.extname(explicitSource) !== '.jsonl') {
-      throw error;
-    }
-
-    const content = await readFile(explicitSource, 'utf8');
-    const results = parseJsonlResults(content);
-    if (results.length === 0) {
-      throw new Error(`No results found in ${explicitSource}`);
-    }
-
-    return { sourceFile: explicitSource, results };
-  }
+  const { sourceFile } = await resolveSourceFile(source, cwd);
+  const { results } = await loadSharedResults(source, cwd);
+  return { sourceFile, results };
 }
 
 // ── CLI command ──────────────────────────────────────────────────────────
 
 export const resultsExportCommand = command({
   name: 'export',
-  description: 'Export JSONL eval results into a per-test directory structure',
+  description: 'Export a run workspace or index.jsonl manifest into a per-test directory structure',
   args: {
     source: positional({
       type: optional(string),
diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts
index 2ea30f357..e00e7e837 100644
--- a/apps/cli/src/commands/results/serve.ts
+++ b/apps/cli/src/commands/results/serve.ts
@@ -397,7 +397,7 @@ function handleEvalFiles(c: C, { searchDir }: DataContext) {
   try {
     const content = readFileSync(meta.path, 'utf8');
     const records = parseResultManifest(content);
-    const record = records.find((r) => (r.test_id ?? r.eval_id) === evalId);
+    const record = records.find((r) => r.test_id === evalId);
     if (!record) return c.json({ error: 'Eval not found' }, 404);
 
     const baseDir = path.dirname(meta.path);
diff --git a/apps/cli/test/commands/compare/compare.test.ts b/apps/cli/test/commands/compare/compare.test.ts
index 4847d99ba..3b77b1bb1 100644
--- a/apps/cli/test/commands/compare/compare.test.ts
+++ b/apps/cli/test/commands/compare/compare.test.ts
@@ -26,11 +26,13 @@ describe('compare command', () => {
   });
 
   describe('loadJsonlResults', () => {
-    it('should load valid JSONL file with test_id results', () => {
-      const filePath = path.join(tempDir, 'results.jsonl');
+    it('should load index.jsonl manifests from a run workspace', () => {
+      const runDir = path.join(tempDir, 'eval_2026-03-24T00-00-00-000Z');
+      mkdirSync(runDir, { recursive: true });
+      const filePath = path.join(runDir, 'index.jsonl');
       writeFileSync(
         filePath,
-        '{"test_id": "case-1", "score": 0.8}\n{"test_id": "case-2", "score": 0.9}\n',
+        '{"test_id": "case-1", "score": 0.8, "grading_path": "case-1/grading.json", "timing_path": "case-1/timing.json"}\n{"test_id": "case-2", "score": 0.9, "grading_path": "case-2/grading.json", "timing_path": "case-2/timing.json"}\n',
       );
 
       const results = loadJsonlResults(filePath);
@@ -41,11 +43,13 @@ describe('compare command', () => {
       ]);
     });
 
-    it('should handle empty lines in JSONL', () => {
-      const filePath = path.join(tempDir, 'results.jsonl');
+    it('should handle empty lines in index.jsonl manifests', () => {
+      const runDir = path.join(tempDir, 'eval_2026-03-24T00-00-00-000Z');
+      mkdirSync(runDir, { recursive: true });
+      const filePath = path.join(runDir, 'index.jsonl');
       writeFileSync(
         filePath,
-        '{"test_id": "case-1", "score": 0.8}\n\n{"test_id": "case-2", "score": 0.9}\n',
+        '{"test_id": "case-1", "score": 0.8, "grading_path": "case-1/grading.json", "timing_path": "case-1/timing.json"}\n\n{"test_id": "case-2", "score": 0.9, "grading_path": "case-2/grading.json", "timing_path": "case-2/timing.json"}\n',
       );
 
       const results = loadJsonlResults(filePath);
@@ -53,48 +57,52 @@ describe('compare command', () => {
       expect(results).toHaveLength(2);
     });
 
-    it('should load index.jsonl manifests from a run workspace', () => {
+    it('should throw error for missing test_id', () => {
       const runDir = path.join(tempDir, 'eval_2026-03-24T00-00-00-000Z');
       mkdirSync(runDir, { recursive: true });
       const filePath = path.join(runDir, 'index.jsonl');
       writeFileSync(
         filePath,
-        '{"test_id": "case-1", "score": 0.8, "grading_path": "case-1/grading.json", "timing_path": "case-1/timing.json"}\n{"test_id": "case-2", "score": 0.9, "grading_path": "case-2/grading.json", "timing_path": "case-2/timing.json"}\n',
+        '{"score": 0.8, "grading_path": "case-1/grading.json", "timing_path": "case-1/timing.json"}\n',
       );
 
-      const results = loadJsonlResults(filePath);
-
-      expect(results).toEqual([
-        { testId: 'case-1', score: 0.8 },
-        { testId: 'case-2', score: 0.9 },
-      ]);
+      expect(() => loadJsonlResults(filePath)).toThrow('Missing test_id');
     });
 
-    it('should throw error for missing test_id', () => {
-      const filePath = path.join(tempDir, 'results.jsonl');
-      writeFileSync(filePath, '{"score": 0.8}\n');
+    it('should throw error for missing score', () => {
+      const runDir = path.join(tempDir, 'eval_2026-03-24T00-00-00-000Z');
+      mkdirSync(runDir, { recursive: true });
+      const filePath = path.join(runDir, 'index.jsonl');
+      writeFileSync(
+        filePath,
+        '{"test_id": "case-1", "grading_path": "case-1/grading.json", "timing_path": "case-1/timing.json"}\n',
+      );
 
-      expect(() => loadJsonlResults(filePath)).toThrow('Missing test_id');
+      expect(() => loadJsonlResults(filePath)).toThrow('Missing or invalid score');
     });
 
-    it('should throw error for missing score', () => {
+    it('should reject flat JSONL result files', () => {
       const filePath = path.join(tempDir, 'results.jsonl');
-      writeFileSync(filePath, '{"test_id": "case-1"}\n');
+      writeFileSync(filePath, '{"test_id": "case-1", "score": 0.8}\n');
 
-      expect(() => loadJsonlResults(filePath)).toThrow('Missing or invalid score');
+      expect(() => loadJsonlResults(filePath)).toThrow(
+        'Expected a run workspace directory or index.jsonl manifest',
+      );
     });
   });
 
   describe('loadCombinedResults', () => {
     it('should group records by target field', () => {
-      const filePath = path.join(tempDir, 'combined.jsonl');
+      const runDir = path.join(tempDir, 'eval_2026-03-24T00-00-00-000Z');
+      mkdirSync(runDir, { recursive: true });
+      const filePath = path.join(runDir, 'index.jsonl');
       writeFileSync(
         filePath,
         [
-          '{"test_id": "t1", "score": 0.8, "target": "model-a"}',
-          '{"test_id": "t2", "score": 0.9, "target": "model-a"}',
-          '{"test_id": "t1", "score": 0.7, "target": "model-b"}',
-          '{"test_id": "t2", "score": 0.85, "target": "model-b"}',
+          '{"test_id": "t1", "score": 0.8, "target": "model-a", "grading_path": "t1/grading.json", "timing_path": "t1/timing.json"}',
+          '{"test_id": "t2", "score": 0.9, "target": "model-a", "grading_path": "t2/grading.json", "timing_path": "t2/timing.json"}',
+          '{"test_id": "t1", "score": 0.7, "target": "model-b", "grading_path": "t1/grading.json", "timing_path": "t1/timing.json"}',
+          '{"test_id": "t2", "score": 0.85, "target": "model-b", "grading_path": "t2/grading.json", "timing_path": "t2/timing.json"}',
         ].join('\n'),
       );
 
@@ -112,13 +120,15 @@ describe('compare command', () => {
     });
 
     it('should handle three or more targets', () => {
-      const filePath = path.join(tempDir, 'combined.jsonl');
+      const runDir = path.join(tempDir, 'eval_2026-03-24T00-00-00-000Z');
+      mkdirSync(runDir, { recursive: true });
+      const filePath = path.join(runDir, 'index.jsonl');
       writeFileSync(
         filePath,
         [
-          '{"test_id": "t1", "score": 0.8, "target": "a"}',
-          '{"test_id": "t1", "score": 0.7, "target": "b"}',
-          '{"test_id": "t1", "score": 0.9, "target": "c"}',
+          '{"test_id": "t1", "score": 0.8, "target": "a", "grading_path": "t1/grading.json", "timing_path": "t1/timing.json"}',
+          '{"test_id": "t1", "score": 0.7, "target": "b", "grading_path": "t1/grading.json", "timing_path": "t1/timing.json"}',
+          '{"test_id": "t1", "score": 0.9, "target": "c", "grading_path": "t1/grading.json", "timing_path": "t1/timing.json"}',
         ].join('\n'),
       );
 
@@ -131,31 +141,48 @@ describe('compare command', () => {
     });
 
     it('should throw error for missing target field', () => {
-      const filePath = path.join(tempDir, 'combined.jsonl');
-      writeFileSync(filePath, '{"test_id": "t1", "score": 0.8}\n');
+      const runDir = path.join(tempDir, 'eval_2026-03-24T00-00-00-000Z');
+      mkdirSync(runDir, { recursive: true });
+      const filePath = path.join(runDir, 'index.jsonl');
+      writeFileSync(
+        filePath,
+        '{"test_id": "t1", "score": 0.8, "grading_path": "t1/grading.json", "timing_path": "t1/timing.json"}\n',
+      );
 
       expect(() => loadCombinedResults(filePath)).toThrow('Missing target field');
     });
 
     it('should throw error for missing test_id', () => {
-      const filePath = path.join(tempDir, 'combined.jsonl');
-      writeFileSync(filePath, '{"score": 0.8, "target": "a"}\n');
+      const runDir = path.join(tempDir, 'eval_2026-03-24T00-00-00-000Z');
+      mkdirSync(runDir, { recursive: true });
+      const filePath = path.join(runDir, 'index.jsonl');
+      writeFileSync(
+        filePath,
+        '{"score": 0.8, "target": "a", "grading_path": "t1/grading.json", "timing_path": "t1/timing.json"}\n',
+      );
 
       expect(() => loadCombinedResults(filePath)).toThrow('Missing test_id');
     });
 
     it('should throw error for missing score', () => {
-      const filePath = path.join(tempDir, 'combined.jsonl');
-      writeFileSync(filePath, '{"test_id": "t1", "target": "a"}\n');
+      const runDir = path.join(tempDir, 'eval_2026-03-24T00-00-00-000Z');
+      mkdirSync(runDir, { recursive: true });
+      const filePath = path.join(runDir, 'index.jsonl');
+      writeFileSync(
+        filePath,
+        '{"test_id": "t1", "target": "a", "grading_path": "t1/grading.json", "timing_path": "t1/timing.json"}\n',
+      );
 
       expect(() => loadCombinedResults(filePath)).toThrow('Missing or invalid score');
     });
 
     it('should handle empty lines', () => {
-      const filePath = path.join(tempDir, 'combined.jsonl');
+      const runDir = path.join(tempDir, 'eval_2026-03-24T00-00-00-000Z');
+      mkdirSync(runDir, { recursive: true });
+      const filePath = path.join(runDir, 'index.jsonl');
       writeFileSync(
         filePath,
-        '{"test_id": "t1", "score": 0.8, "target": "a"}\n\n{"test_id": "t2", "score": 0.9, "target": "a"}\n',
+        '{"test_id": "t1", "score": 0.8, "target": "a", "grading_path": "t1/grading.json", "timing_path": "t1/timing.json"}\n\n{"test_id": "t2", "score": 0.9, "target": "a", "grading_path": "t2/grading.json", "timing_path": "t2/timing.json"}\n',
       );
 
       const groups = loadCombinedResults(filePath);
@@ -179,6 +206,15 @@ describe('compare command', () => {
       expect(groups.get('model-a')).toEqual([{ testId: 't1', score: 0.8 }]);
       expect(groups.get('model-b')).toEqual([{ testId: 't1', score: 0.7 }]);
     });
+
+    it('should reject flat combined JSONL files', () => {
+      const filePath = path.join(tempDir, 'combined-results.jsonl');
+      writeFileSync(filePath, '{"test_id": "t1", "score": 0.8, "target": "a"}\n');
+
+      expect(() => loadCombinedResults(filePath)).toThrow(
+        'Expected a run workspace directory or index.jsonl manifest',
+      );
+    });
   });
 
   describe('classifyOutcome', () => {
diff --git a/apps/cli/test/commands/results/export-e2e-providers.test.ts b/apps/cli/test/commands/results/export-e2e-providers.test.ts
index e162687b0..47bba1768 100644
--- a/apps/cli/test/commands/results/export-e2e-providers.test.ts
+++ b/apps/cli/test/commands/results/export-e2e-providers.test.ts
@@ -210,11 +210,8 @@ function toJsonl(...records: object[]): string {
   return `${records.map((r) => JSON.stringify(r)).join('\n')}\n`;
 }
 
-function artifactDir(
-  outputDir: string,
-  record: { dataset?: string; test_id?: string; eval_id?: string },
-): string {
-  const testId = record.test_id ?? record.eval_id ?? 'unknown';
+function artifactDir(outputDir: string, record: { dataset?: string; test_id?: string }): string {
+  const testId = record.test_id ?? 'unknown';
   return path.join(outputDir, ...(record.dataset ? [record.dataset] : []), testId);
 }
 
@@ -666,30 +663,5 @@ describe('export e2e — multi-provider metrics verification', () => {
       expect(timing.token_usage.reasoning).toBe(75);
       expect(timing.duration_ms).toBe(1000);
     });
-
-    it('should handle eval_id (legacy) as test_id alias', async () => {
-      const outputDir = path.join(tempDir, 'legacy');
-      const record = {
-        timestamp: '2026-03-18T10:00:00.000Z',
-        eval_id: 'legacy-test-id',
-        dataset: 'test',
-        score: 1.0,
-        assertions: [{ text: 'ok', passed: true }],
-        output_text: 'ok',
-        target: 'mock',
-        execution_status: 'ok',
-      };
-
-      await exportResults('test.jsonl', toJsonl(record), outputDir);
-
-      expect(
-        existsSync(
-          path.join(
-            artifactDir(outputDir, { ...record, test_id: undefined, target: 'mock' as const }),
-            'grading.json',
-          ),
-        ),
-      ).toBe(true);
-    });
   });
 });
diff --git a/apps/cli/test/commands/results/export.test.ts b/apps/cli/test/commands/results/export.test.ts
index f6f8645ff..8b123bc57 100644
--- a/apps/cli/test/commands/results/export.test.ts
+++ b/apps/cli/test/commands/results/export.test.ts
@@ -1,5 +1,5 @@
 import { afterEach, beforeEach, describe, expect, it } from 'bun:test';
-import { existsSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs';
+import { existsSync, mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs';
 import { tmpdir } from 'node:os';
 import path from 'node:path';
 
@@ -99,11 +99,8 @@ function toJsonl(...records: object[]): string {
   return `${records.map((r) => JSON.stringify(r)).join('\n')}\n`;
 }
 
-function artifactDir(
-  outputDir: string,
-  record: { dataset?: string; test_id?: string; eval_id?: string },
-): string {
-  const testId = record.test_id ?? record.eval_id ?? 'unknown';
+function artifactDir(outputDir: string, record: { dataset?: string; test_id?: string }): string {
+  const testId = record.test_id ?? 'unknown';
   return path.join(outputDir, ...(record.dataset ? [record.dataset] : []), testId);
 }
 
@@ -118,23 +115,33 @@ describe('results export', () => {
     rmSync(tempDir, { recursive: true, force: true });
   });
 
-  it('loadExportSource accepts explicit legacy flat JSONL files', async () => {
-    const sourceFile = path.join(tempDir, 'eval_2026-03-18.jsonl');
-    writeFileSync(
-      sourceFile,
-      toJsonl({ ...RESULT_FULL, eval_id: 'legacy-id', test_id: undefined }),
-    );
+  it('loadExportSource resolves run workspaces to index.jsonl', async () => {
+    const runDir = path.join(tempDir, '2026-03-18T10-00-00-000Z');
+    mkdirSync(runDir, { recursive: true });
+    const sourceFile = path.join(runDir, 'index.jsonl');
+    writeFileSync(sourceFile, toJsonl(RESULT_FULL));
 
-    const { sourceFile: loadedSource, results } = await loadExportSource(sourceFile, tempDir);
+    const { sourceFile: loadedSource, results } = await loadExportSource(runDir, tempDir);
 
     expect(loadedSource).toBe(sourceFile);
     expect(results).toHaveLength(1);
-    expect(results[0].testId).toBe('legacy-id');
+    expect(results[0].testId).toBe('test-greeting');
+  });
+
+  it('deriveOutputDir uses the run directory name for manifest inputs', () => {
+    const outputDir = deriveOutputDir(
+      tempDir,
+      path.join(tempDir, '2026-03-18T10-00-00-000Z', 'index.jsonl'),
+    );
+    expect(outputDir).toBe(
+      path.join(tempDir, '.agentv', 'results', 'export', '2026-03-18T10-00-00-000Z'),
+    );
   });
 
-  it('deriveOutputDir uses the source filename for flat JSONL inputs', () => {
-    const outputDir = deriveOutputDir(tempDir, path.join(tempDir, 'eval_2026-03-18.jsonl'));
-    expect(outputDir).toBe(path.join(tempDir, '.agentv', 'results', 'export', '2026-03-18'));
+  it('deriveOutputDir rejects non-manifest paths', () => {
+    expect(() => deriveOutputDir(tempDir, path.join(tempDir, 'results.jsonl'))).toThrow(
+      'Expected a run manifest named index.jsonl',
+    );
   });
 
   it('should create benchmark.json matching artifact-writer schema', async () => {
diff --git a/apps/cli/test/unit/retry-errors.test.ts b/apps/cli/test/unit/retry-errors.test.ts
index bbc54a7ed..9aca5b16b 100644
--- a/apps/cli/test/unit/retry-errors.test.ts
+++ b/apps/cli/test/unit/retry-errors.test.ts
@@ -14,27 +14,32 @@ describe('retry-errors', () => {
     }
   });
 
-  function createJsonlFile(lines: object[]): string {
+  function createIndexFile(lines: object[]): string {
     tmpDir = mkdtempSync(path.join(tmpdir(), 'retry-errors-test-'));
-    const filePath = path.join(tmpDir, 'results.jsonl');
+    const filePath = path.join(tmpDir, 'index.jsonl');
+    mkdirSync(tmpDir, { recursive: true });
     writeFileSync(filePath, lines.map((l) => JSON.stringify(l)).join('\n'));
     return filePath;
   }
 
-  function createIndexFile(lines: object[]): string {
-    tmpDir = mkdtempSync(path.join(tmpdir(), 'retry-errors-index-test-'));
-    const filePath = path.join(tmpDir, 'index.jsonl');
-    mkdirSync(tmpDir, { recursive: true });
+  function createFlatJsonlFile(lines: object[]): string {
+    tmpDir = mkdtempSync(path.join(tmpdir(), 'retry-errors-flat-test-'));
+    const filePath = path.join(tmpDir, 'results.jsonl');
     writeFileSync(filePath, lines.map((l) => JSON.stringify(l)).join('\n'));
     return filePath;
   }
 
   it('loadErrorTestIds returns only execution_error test IDs', async () => {
-    const filePath = createJsonlFile([
-      { testId: 'case-1', executionStatus: 'ok', score: 0.9 },
-      { testId: 'case-2', executionStatus: 'execution_error', score: 0, error: 'timeout' },
-      { testId: 'case-3', executionStatus: 'quality_failure', score: 0.3 },
-      { testId: 'case-4', executionStatus: 'execution_error', score: 0, error: 'provider failed' },
+    const filePath = createIndexFile([
+      { test_id: 'case-1', execution_status: 'ok', score: 0.9 },
+      { test_id: 'case-2', execution_status: 'execution_error', score: 0, error: 'timeout' },
+      { test_id: 'case-3', execution_status: 'quality_failure', score: 0.3 },
+      {
+        test_id: 'case-4',
+        execution_status: 'execution_error',
+        score: 0,
+        error: 'provider failed',
+      },
     ]);
 
     const ids = await loadErrorTestIds(filePath);
@@ -42,9 +47,9 @@ describe('retry-errors', () => {
   });
 
   it('loadErrorTestIds deduplicates IDs', async () => {
-    const filePath = createJsonlFile([
-      { testId: 'case-1', executionStatus: 'execution_error', score: 0 },
-      { testId: 'case-1', executionStatus: 'execution_error', score: 0 },
+    const filePath = createIndexFile([
+      { test_id: 'case-1', execution_status: 'execution_error', score: 0 },
+      { test_id: 'case-1', execution_status: 'execution_error', score: 0 },
     ]);
 
     const ids = await loadErrorTestIds(filePath);
@@ -52,9 +57,9 @@ describe('retry-errors', () => {
   });
 
   it('loadErrorTestIds returns empty array when no errors', async () => {
-    const filePath = createJsonlFile([
-      { testId: 'case-1', executionStatus: 'ok', score: 0.9 },
-      { testId: 'case-2', executionStatus: 'quality_failure', score: 0.5 },
+    const filePath = createIndexFile([
+      { test_id: 'case-1', execution_status: 'ok', score: 0.9 },
+      { test_id: 'case-2', execution_status: 'quality_failure', score: 0.5 },
     ]);
 
     const ids = await loadErrorTestIds(filePath);
@@ -62,10 +67,10 @@ describe('retry-errors', () => {
   });
 
   it('loadNonErrorResults returns only non-error results', async () => {
-    const filePath = createJsonlFile([
-      { testId: 'case-1', executionStatus: 'ok', score: 0.9 },
-      { testId: 'case-2', executionStatus: 'execution_error', score: 0 },
-      { testId: 'case-3', executionStatus: 'quality_failure', score: 0.5 },
+    const filePath = createIndexFile([
+      { test_id: 'case-1', execution_status: 'ok', score: 0.9 },
+      { test_id: 'case-2', execution_status: 'execution_error', score: 0 },
+      { test_id: 'case-3', execution_status: 'quality_failure', score: 0.5 },
     ]);
 
     const results = await loadNonErrorResults(filePath);
@@ -74,8 +79,8 @@ describe('retry-errors', () => {
     expect(results[1].testId).toBe('case-3');
   });
 
-  it('supports snake_case result files written by the CLI', async () => {
-    const filePath = createJsonlFile([
+  it('supports index.jsonl manifests written by the CLI', async () => {
+    const filePath = createIndexFile([
       { test_id: 'case-1', execution_status: 'ok', score: 0.9 },
       { test_id: 'case-2', execution_status: 'execution_error', score: 0 },
       { test_id: 'case-3', execution_status: 'quality_failure', score: 0.5 },
@@ -90,7 +95,21 @@ describe('retry-errors', () => {
     expect(results[1].testId).toBe('case-3');
   });
 
-  it('supports index.jsonl manifests during the migration', async () => {
+  it('rejects flat JSONL result files', async () => {
+    const filePath = createFlatJsonlFile([
+      { test_id: 'case-1', execution_status: 'ok', score: 0.9 },
+      { test_id: 'case-2', execution_status: 'execution_error', score: 0 },
+    ]);
+
+    await expect(loadErrorTestIds(filePath)).rejects.toThrow(
+      'Expected a run workspace directory or index.jsonl manifest',
+    );
+    await expect(loadNonErrorResults(filePath)).rejects.toThrow(
+      'Expected a run workspace directory or index.jsonl manifest',
+    );
+  });
+
+  it('supports index.jsonl manifests', async () => {
     const filePath = createIndexFile([
       {
         test_id: 'case-1',
@@ -112,24 +131,20 @@ describe('retry-errors', () => {
     expect(ids).toEqual(['case-2']);
   });
 
-  it('skips malformed JSON lines', async () => {
+  it('throws on malformed index.jsonl lines', async () => {
     tmpDir = mkdtempSync(path.join(tmpdir(), 'retry-errors-test-'));
-    const filePath = path.join(tmpDir, 'results.jsonl');
+    const filePath = path.join(tmpDir, 'index.jsonl');
     writeFileSync(
       filePath,
       [
-        JSON.stringify({ testId: 'case-1', executionStatus: 'execution_error', score: 0 }),
+        JSON.stringify({ test_id: 'case-1', execution_status: 'execution_error', score: 0 }),
         'not valid json',
         '',
-        JSON.stringify({ testId: 'case-2', executionStatus: 'ok', score: 0.9 }),
+        JSON.stringify({ test_id: 'case-2', execution_status: 'ok', score: 0.9 }),
       ].join('\n'),
     );
 
-    const ids = await loadErrorTestIds(filePath);
-    expect(ids).toEqual(['case-1']);
-
-    const results = await loadNonErrorResults(filePath);
-    expect(results).toHaveLength(1);
-    expect(results[0].testId).toBe('case-2');
+    await expect(loadErrorTestIds(filePath)).rejects.toThrow();
+    await expect(loadNonErrorResults(filePath)).rejects.toThrow();
   });
 });
diff --git a/examples/features/benchmark-tooling/README.md b/examples/features/benchmark-tooling/README.md
index 0af46584c..dc336d10e 100644
--- a/examples/features/benchmark-tooling/README.md
+++ b/examples/features/benchmark-tooling/README.md
@@ -4,13 +4,13 @@ Utilities for multi-model benchmarking workflows with AgentV.
 
 ## N-Way Multi-Model Comparison (built-in)
 
-`agentv compare` natively supports combined JSONL files with a `target` field, enabling N-way matrix comparison without splitting files.
+`agentv compare` natively supports canonical run manifests (`index.jsonl`) whose records carry a `target` field, enabling N-way matrix comparison without splitting files.
 
 ### Quick Start
 
 ```bash
-# Try it now — fixture included, no API keys needed
-agentv compare examples/features/benchmark-tooling/fixtures/combined-results.jsonl
+# Compare a recent canonical run
+agentv compare .agentv/results/runs/<timestamp>/index.jsonl
 ```
 
 Output:
@@ -88,7 +88,7 @@ Each line includes a `target` field to identify which model produced the result:
 ### Key Files
 
 - `evals/benchmark.eval.yaml` - Example eval config with 3 tests
-- `fixtures/combined-results.jsonl` - Sample combined output (9 records: 3 tests x 3 targets)
+- `.agentv/results/runs/<timestamp>/` - Canonical run workspace (with its `index.jsonl` manifest) produced by `agentv eval`
 
 ## split-by-target
 
@@ -123,20 +123,19 @@ Target names are normalized for safe filenames:
 
 ### Downstream Compare Workflow
 
-After splitting, use `agentv compare` to perform pairwise model comparisons:
+Use `agentv compare` directly on the canonical run manifest for pairwise or matrix comparisons:
 
 ```bash
-# 1. Run a matrix evaluation that produces a combined results file
+# 1. Run a matrix evaluation that produces a canonical run workspace
 bun agentv eval my-eval.yaml
 
-# 2. Split results by target
-bun examples/features/benchmark-tooling/scripts/split-by-target.ts results.jsonl ./by-target
-
-# 3. Compare any two targets
-bun agentv compare ./by-target/results.gpt-4.1.jsonl ./by-target/results.claude-sonnet-4.jsonl
+# 2. Compare any two targets from the same run
+bun agentv compare .agentv/results/runs/<timestamp>/index.jsonl \
+  --baseline gpt-4.1 --candidate claude-sonnet-4
 
-# 4. JSON output for CI pipelines
-bun agentv compare ./by-target/results.gpt-4.1.jsonl ./by-target/results.claude-sonnet-4.jsonl --json
+# 3. JSON output for CI pipelines
+bun agentv compare .agentv/results/runs/<timestamp>/index.jsonl \
+  --baseline gpt-4.1 --candidate claude-sonnet-4 --json
 ```
 
 The `compare` command matches records by `test_id`, calculates score deltas, and classifies each as win/loss/tie. It exits non-zero on regressions, making it suitable for CI gates.
@@ -149,7 +148,8 @@ Computes aggregate win/loss/tie rates from `agentv compare --json` output, makin
 
 ```bash
 # Save comparison output to a file
-bun agentv compare baseline.jsonl candidate.jsonl --json > comparison.json
+bun agentv compare .agentv/results/runs/<baseline-timestamp>/index.jsonl \
+  .agentv/results/runs/<candidate-timestamp>/index.jsonl --json > comparison.json
 
 # Print a human-readable summary table
 bun examples/features/benchmark-tooling/scripts/win-rate-summary.ts comparison.json
@@ -167,8 +167,10 @@ Pass a directory of comparison JSON files to get per-metric win rates. Each file
 
 ```bash
 # Run comparisons for different metrics
-bun agentv compare base.jsonl cand.jsonl --json > comparisons/accuracy.json
-bun agentv compare base-latency.jsonl cand-latency.jsonl --json > comparisons/latency.json
+bun agentv compare .agentv/results/runs/<baseline-accuracy>/index.jsonl \
+  .agentv/results/runs/<candidate-accuracy>/index.jsonl --json > comparisons/accuracy.json
+bun agentv compare .agentv/results/runs/<baseline-latency>/index.jsonl \
+  .agentv/results/runs/<candidate-latency>/index.jsonl --json > comparisons/latency.json
 
 # Aggregate across all metrics
 bun examples/features/benchmark-tooling/scripts/win-rate-summary.ts comparisons/
@@ -282,16 +284,14 @@ bun examples/features/benchmark-tooling/scripts/benchmark-report.ts ./by-target/
 # 1. Run multi-model evaluation
 bun agentv eval my-eval.yaml
 
-# 2. Split results by target
-bun examples/features/benchmark-tooling/scripts/split-by-target.ts results.jsonl ./by-target
-
-# 3. Compare two targets
-bun agentv compare ./by-target/results.gpt-4.1.jsonl ./by-target/results.claude-sonnet-4.jsonl --json > comparison.json
+# 2. Compare two targets from the run manifest
+bun agentv compare .agentv/results/runs/<timestamp>/index.jsonl \
+  --baseline gpt-4.1 --candidate claude-sonnet-4 --json > comparison.json
 
-# 4. Get win-rate summary
+# 3. Get win-rate summary
 bun examples/features/benchmark-tooling/scripts/win-rate-summary.ts comparison.json
 
-# 5. Statistical significance test
+# 4. Statistical significance test (reads the per-target files that split-by-target writes to ./by-target)
 bun examples/features/benchmark-tooling/scripts/significance-test.ts \
   ./by-target/results.gpt-4.1.jsonl ./by-target/results.claude-sonnet-4.jsonl
 
diff --git a/examples/features/compare/README.md b/examples/features/compare/README.md
index 04e41cb8e..05df34e8f 100644
--- a/examples/features/compare/README.md
+++ b/examples/features/compare/README.md
@@ -1,11 +1,11 @@
 # Baseline vs Candidate Comparison
 
-Demonstrates comparing evaluation results using the `agentv compare` command.
+Demonstrates comparing canonical run manifests using the `agentv compare` command.
 
 ## What This Shows
 
-- N-way matrix comparison from a combined JSONL file
-- Two-file pairwise comparison (baseline vs candidate)
+- N-way matrix comparison from a run manifest with multiple targets
+- Two-run pairwise comparison (baseline vs candidate)
 - Score delta calculation and win/loss classification
 - Baseline regression detection via exit codes
 - Human-readable and JSON output formats
@@ -15,33 +15,31 @@ Demonstrates comparing evaluation results using the `agentv compare` command.
 ```bash
 # From repository root
 
-# N-way matrix from a combined results file (see ../benchmark-tooling/ for fixture)
-agentv compare examples/features/benchmark-tooling/fixtures/combined-results.jsonl
+# N-way matrix from a canonical run manifest
+agentv compare .agentv/results/runs/<timestamp>/index.jsonl
 
-# Pairwise from combined file
-agentv compare examples/features/benchmark-tooling/fixtures/combined-results.jsonl \
+# Pairwise from the same combined run manifest
+agentv compare .agentv/results/runs/<timestamp>/index.jsonl \
   --baseline gpt-4.1 --candidate gpt-5-mini
 
 # CI regression gate: exit 1 if any target regresses vs baseline
-agentv compare examples/features/benchmark-tooling/fixtures/combined-results.jsonl \
+agentv compare .agentv/results/runs/<timestamp>/index.jsonl \
   --baseline gpt-4.1
 
-# Two-file pairwise comparison (legacy)
-agentv compare examples/features/compare/evals/baseline-results.jsonl \
-  examples/features/compare/evals/candidate-results.jsonl
+# Two-run pairwise comparison
+agentv compare .agentv/results/runs/<baseline-timestamp>/index.jsonl \
+  .agentv/results/runs/<candidate-timestamp>/index.jsonl
 
 # With custom threshold for win/loss classification
-agentv compare examples/features/compare/evals/baseline-results.jsonl \
-  examples/features/compare/evals/candidate-results.jsonl --threshold 0.05
+agentv compare .agentv/results/runs/<baseline-timestamp>/index.jsonl \
+  .agentv/results/runs/<candidate-timestamp>/index.jsonl --threshold 0.05
 
 # JSON output for CI pipelines
-agentv compare examples/features/compare/evals/baseline-results.jsonl \
-  examples/features/compare/evals/candidate-results.jsonl --json
+agentv compare .agentv/results/runs/<baseline-timestamp>/index.jsonl \
+  .agentv/results/runs/<candidate-timestamp>/index.jsonl --json
 ```
 
 ## Key Files
 
-- `evals/baseline-results.jsonl` - Results from baseline configuration
-- `evals/candidate-results.jsonl` - Results from candidate configuration
+- `.agentv/results/runs/<timestamp>/` - Canonical run workspace (with its `index.jsonl` manifest) produced by `agentv eval`
 - `evals/README.md` - Detailed usage documentation
-- `../benchmark-tooling/fixtures/combined-results.jsonl` - Combined multi-target fixture for N-way matrix
diff --git a/examples/features/compare/evals/README.md b/examples/features/compare/evals/README.md
index c01a261f0..79fa226d3 100644
--- a/examples/features/compare/evals/README.md
+++ b/examples/features/compare/evals/README.md
@@ -1,27 +1,25 @@
 # Compare Command Example
 
-The `agentv compare` command supports three modes: N-way matrix from a combined JSONL, pairwise from a combined JSONL, and two-file pairwise.
+The `agentv compare` command supports three modes: N-way matrix from a canonical run manifest, pairwise from a canonical run manifest, and two-run pairwise.
 
 ## Use Case
 
 Compare model performance across different configurations:
-- N-way matrix comparison across 3+ models from a single combined results file
+- N-way matrix comparison across 3+ models from a single run manifest
 - Baseline regression gating in CI (exit 1 if any target regresses)
 - Head-to-head pairwise between two specific targets
-- Before/after optimization runs (two-file pairwise)
+- Before/after optimization runs (two-run pairwise)
 
 ## Sample Files
 
-- `baseline-results.jsonl` - Results from baseline configuration (GPT-4.1)
-- `candidate-results.jsonl` - Results from candidate configuration (GPT-5)
-- `../../benchmark-tooling/fixtures/combined-results.jsonl` - Combined multi-target results (3 tests x 3 targets)
+- `.agentv/results/runs/<timestamp>/` - Canonical run workspaces (each with an `index.jsonl` manifest) generated by running `agentv eval` on `dataset.eval.yaml`
 
 ## Usage
 
-### N-Way Matrix (combined JSONL)
+### N-Way Matrix (run manifest)
 
 ```bash
-agentv compare combined-results.jsonl
+agentv compare .agentv/results/runs/<timestamp>/index.jsonl
 ```
 
 Output:
@@ -43,14 +41,14 @@ Pairwise Summary:
 ### Baseline Regression Check
 
 ```bash
-agentv compare combined-results.jsonl --baseline gpt-4.1
+agentv compare .agentv/results/runs/<timestamp>/index.jsonl --baseline gpt-4.1
 # Exits 1 if any target regresses vs gpt-4.1
 ```
 
-### Pairwise from Combined JSONL
+### Pairwise from a Single Run Manifest
 
 ```bash
-agentv compare combined-results.jsonl --baseline gpt-4.1 --candidate gpt-5-mini
+agentv compare .agentv/results/runs/<timestamp>/index.jsonl --baseline gpt-4.1 --candidate gpt-5-mini
 ```
 
 ```
@@ -65,15 +63,16 @@ Comparing: gpt-4.1 → gpt-5-mini
 Summary: 0 wins, 0 losses, 3 ties | Mean Δ: -0.017 | Status: regressed
 ```
 
-### Two-File Pairwise (legacy)
+### Two-Run Pairwise
 
 ```bash
-agentv compare baseline-results.jsonl candidate-results.jsonl
+agentv compare .agentv/results/runs/<baseline-timestamp>/index.jsonl \
+  .agentv/results/runs/<candidate-timestamp>/index.jsonl
 ```
 
 Output:
 ```
-Comparing: baseline-results.jsonl → candidate-results.jsonl
+Comparing: .agentv/results/runs/<baseline-timestamp>/index.jsonl → .agentv/results/runs/<candidate-timestamp>/index.jsonl
 
   Test ID          Baseline  Candidate     Delta  Result
   ───────────────  ────────  ─────────  ────────  ────────
@@ -91,7 +90,8 @@ Summary: 1 win, 0 losses, 4 ties | Mean Δ: +0.054 | Status: improved
 Use a stricter threshold (0.05) for win/loss classification:
 
 ```bash
-agentv compare baseline-results.jsonl candidate-results.jsonl --threshold 0.05
+agentv compare .agentv/results/runs/<baseline-timestamp>/index.jsonl \
+  .agentv/results/runs/<candidate-timestamp>/index.jsonl --threshold 0.05
 ```
 
 ### JSON Output
@@ -99,7 +99,7 @@ agentv compare baseline-results.jsonl candidate-results.jsonl --threshold 0.05
 For machine-readable output (CI pipelines, scripts):
 
 ```bash
-agentv compare combined-results.jsonl --json
+agentv compare .agentv/results/runs/<timestamp>/index.jsonl --json
 ```
 
 Output uses snake_case for Python ecosystem compatibility:
@@ -130,8 +130,8 @@ Use exit codes for automated quality gates:
 
 ```bash
 # N-way: fail if any target regresses vs baseline
-agentv compare results.jsonl --baseline gpt-4.1 || echo "Regression detected!"
+agentv compare .agentv/results/runs/<timestamp>/index.jsonl --baseline gpt-4.1 || echo "Regression detected!"
 
-# Two-file: fail if candidate regresses
-agentv compare baseline.jsonl candidate.jsonl || echo "Regression detected!"
+# Two-run: fail if candidate regresses
+agentv compare .agentv/results/runs/<baseline-timestamp>/index.jsonl .agentv/results/runs/<candidate-timestamp>/index.jsonl || echo "Regression detected!"
 ```
diff --git a/examples/features/compare/evals/dataset.eval.yaml b/examples/features/compare/evals/dataset.eval.yaml
index 2d7209118..b9cff5bc0 100644
--- a/examples/features/compare/evals/dataset.eval.yaml
+++ b/examples/features/compare/evals/dataset.eval.yaml
@@ -1,9 +1,9 @@
 # Demo eval for the compare example.
-# Run against two targets to generate baseline and candidate result files:
+# Run against two targets to generate canonical run workspaces:
 #   agentv eval evals/dataset.eval.yaml --target baseline
 #   agentv eval evals/dataset.eval.yaml --target candidate
 # Then compare:
-#   agentv compare evals/baseline-results.jsonl evals/candidate-results.jsonl
+#   agentv compare .agentv/results/runs/<baseline-timestamp>/index.jsonl .agentv/results/runs/<candidate-timestamp>/index.jsonl
 
 name: compare-demo
 description: Demo eval for generating baseline and candidate results to compare
diff --git a/examples/showcase/multi-model-benchmark/README.md b/examples/showcase/multi-model-benchmark/README.md
index 52fcad680..b519a28d4 100644
--- a/examples/showcase/multi-model-benchmark/README.md
+++ b/examples/showcase/multi-model-benchmark/README.md
@@ -51,20 +51,20 @@ bun agentv eval examples/showcase/multi-model-benchmark/evals/benchmark.eval.yam
 
 ## Comparing Models
 
-The eval produces a combined results file with a `target` field per record. Use `agentv compare` to see all models side by side:
+The eval produces a canonical run workspace with `target` in each `index.jsonl` record. Use `agentv compare` to see all models side by side:
 
 ```bash
 # N-way matrix — see all models at once
-agentv compare results.jsonl
+agentv compare .agentv/results/runs/<timestamp>/index.jsonl
 
 # Designate a baseline for CI regression gating
-agentv compare results.jsonl --baseline copilot
+agentv compare .agentv/results/runs/<timestamp>/index.jsonl --baseline copilot
 
 # Pairwise: compare two specific targets
-agentv compare results.jsonl --baseline copilot --candidate claude
+agentv compare .agentv/results/runs/<timestamp>/index.jsonl --baseline copilot --candidate claude
 
 # JSON output for CI integration
-agentv compare results.jsonl --json
+agentv compare .agentv/results/runs/<timestamp>/index.jsonl --json
 ```
 
 ### Expected Output
@@ -134,7 +134,7 @@ This surfaces non-determinism — if a model passes on trial 1 but fails on tria
 
 ### 4. Compare
 
-The `agentv compare` command reads a combined JSONL (with `target` field) and shows an N-way matrix with pairwise summaries. Each pair classifies per-test deltas:
+The `agentv compare` command reads a canonical run manifest (`index.jsonl`, with `target` per record) and shows an N-way matrix with pairwise summaries. Each pair classifies per-test deltas:
 
 - **Win**: candidate score exceeds baseline by threshold (default 0.10)
 - **Loss**: baseline score exceeds candidate by threshold
@@ -154,8 +154,8 @@ benchmark.eval.yaml
 └────────┬────────────────┘
          │
          ▼
-   combined results.jsonl
-   (all targets in one file)
+  .agentv/results/runs/<timestamp>/
+           index.jsonl
          │
          ▼
 ┌─────────────────────────┐
diff --git a/plugins/agentv-dev/skills/agentv-eval-writer/SKILL.md b/plugins/agentv-dev/skills/agentv-eval-writer/SKILL.md
index 5d9cc8331..1894d2f14 100644
--- a/plugins/agentv-dev/skills/agentv-eval-writer/SKILL.md
+++ b/plugins/agentv-dev/skills/agentv-eval-writer/SKILL.md
@@ -541,17 +541,17 @@ agentv eval assert <grader-name> --agent-output "..." --agent-input "..."
 # Import agent transcripts for offline grading
 agentv import claude --discover latest
 
-# Re-run only execution errors from a previous output
-agentv eval <file.yaml> --retry-errors <previous-output.jsonl>
+# Re-run only execution errors from a previous run
+agentv eval <file.yaml> --retry-errors .agentv/results/runs/<timestamp>/index.jsonl
 
 # Validate eval file
 agentv validate <file.yaml>
 
-# Compare results — N-way matrix from combined JSONL
-agentv compare <combined-results.jsonl>
-agentv compare <combined-results.jsonl> --baseline <target>                   # CI regression gate
-agentv compare <combined-results.jsonl> --baseline <target> --candidate <target>  # pairwise
-agentv compare <results1.jsonl> <results2.jsonl>                              # two-file pairwise
+# Compare results — N-way matrix from a canonical run manifest
+agentv compare .agentv/results/runs/<timestamp>/index.jsonl
+agentv compare .agentv/results/runs/<timestamp>/index.jsonl --baseline <target>                   # CI regression gate
+agentv compare .agentv/results/runs/<timestamp>/index.jsonl --baseline <target> --candidate <target>  # pairwise
+agentv compare .agentv/results/runs/<baseline-timestamp>/index.jsonl .agentv/results/runs/<candidate-timestamp>/index.jsonl  # two-run pairwise
 
 # Author assertions directly in the eval file
 # Prefer simple assertions when they fit the criteria; use deterministic or LLM-based graders when needed