EntityProcess · christso · Apr 5, 2026 · Apr 5, 2026 · Apr 5, 2026 · Apr 5, 2026
diff --git a/README.md b/README.md
@@ -71,7 +71,7 @@ agentv eval evals/my-eval.yaml
 
 **5. Compare results across targets:**
 ```bash
-agentv compare .agentv/results/runs/eval_<timestamp>/index.jsonl
+agentv compare .agentv/results/runs/<timestamp>/index.jsonl
 ```
 
 ## Output formats

diff --git a/apps/cli/src/commands/compare/index.ts b/apps/cli/src/commands/compare/index.ts
@@ -10,6 +10,7 @@ import {
   restPositionals,
   string,
 } from 'cmd-ts';
+
 import { toSnakeCaseDeep } from '../../utils/case-conversion.js';
 import { loadLightweightResults, resolveResultSourcePath } from '../results/manifest.js';
 
@@ -62,23 +63,40 @@ interface MatrixRow {
   scores: Record<string, number>;
 }
 
+interface CompareInputRecord extends EvalResult { // EvalResult plus the target name used to group combined results
+  target?: string; // optional here; loadCombinedResults throws when it is not a string
+}
+
+function loadCompareResults(filePath: string): CompareInputRecord[] { // load records for compare, throwing on the first invalid one
+  return loadLightweightResults(resolveResultSourcePath(filePath)).map((record) => {
+    if (!record.testId || record.testId === 'unknown') { // the literal 'unknown' is rejected like an empty id
+      throw new Error(`Missing test_id in result source: ${filePath}`);
+    }
+    if (typeof record.score !== 'number' || Number.isNaN(record.score)) { // NaN is typeof 'number', hence the extra check
+      throw new Error(`Missing or invalid score in result source: ${filePath}`);
+    }
+    return { // project each record down to the three fields below
+      testId: record.testId,
+      score: record.score,
+      target: record.target,
+    };
+  });
+}
+
 export interface MatrixOutput {
   matrix: MatrixRow[];
   pairwise: ComparisonOutput[];
   targets: string[];
 }
 
 export function loadJsonlResults(filePath: string): EvalResult[] {
-  return loadLightweightResults(resolveResultSourcePath(filePath)).map((record) => ({
-    testId: record.testId,
-    score: record.score,
-  }));
+  return loadCompareResults(filePath).map(({ testId, score }) => ({ testId, score }));
 }
 
 export function loadCombinedResults(filePath: string): Map<string, EvalResult[]> {
   const groups = new Map<string, EvalResult[]>();
 
-  for (const record of loadLightweightResults(resolveResultSourcePath(filePath))) {
+  for (const record of loadCompareResults(filePath)) {
     if (typeof record.target !== 'string') {
       throw new Error(`Missing target field in combined result source: ${filePath}`);
     }
@@ -413,12 +431,13 @@ export function formatMatrix(matrixOutput: MatrixOutput, baselineTarget?: string
 export const compareCommand = command({
   name: 'compare',
   description:
-    'Compare evaluation result files: two-file pairwise, combined JSONL pairwise, or N-way matrix',
+    'Compare evaluation run manifests: two-run pairwise, single-run pairwise, or N-way matrix',
   args: {
     results: restPositionals({
       type: string,
       displayName: 'results',
-      description: 'JSONL result file path(s). One file: combined mode. Two files: pairwise mode.',
+      description:
+        'Run workspace or index.jsonl manifest path(s). One source: single-run mode. Two sources: pairwise mode.',
     }),
     threshold: option({
       type: optional(number),
@@ -430,13 +449,13 @@ export const compareCommand = command({
       type: optional(string),
       long: 'baseline',
       short: 'b',
-      description: 'Target name to use as baseline (filters combined JSONL)',
+      description: 'Target name to use as baseline (filters a single run manifest)',
     }),
     candidate: option({
       type: optional(string),
       long: 'candidate',
       short: 'c',
-      description: 'Target name to use as candidate (filters combined JSONL)',
+      description: 'Target name to use as candidate (filters a single run manifest)',
     }),
     targets: multioption({
       type: array(string),
@@ -460,7 +479,7 @@ export const compareCommand = command({
 
     try {
       if (results.length === 0) {
-        throw new Error('At least one JSONL result file is required');
+        throw new Error('At least one run workspace or index.jsonl manifest is required');
       }
 
       if (results.length === 2) {
@@ -478,7 +497,7 @@ export const compareCommand = command({
         const exitCode = determineExitCode(comparison.summary.meanDelta);
         process.exit(exitCode);
       } else if (results.length === 1) {
-        // Combined JSONL mode
+        // Single-run manifest mode
         let groups = loadCombinedResults(results[0]);
 
         // Filter by --targets if specified
@@ -514,7 +533,7 @@ export const compareCommand = command({
         }
 
         if (baseline && candidate) {
-          // Pairwise mode from combined JSONL
+          // Pairwise mode from a single run manifest
           const baselineResults = groups.get(baseline);
           const candidateResults = groups.get(candidate);
           if (!baselineResults) {
@@ -548,7 +567,7 @@ export const compareCommand = command({
           process.exit(exitCode);
         }
       } else {
-        throw new Error('Expected 1 or 2 JSONL result files');
+        throw new Error('Expected 1 or 2 run workspaces or index.jsonl manifests');
       }
     } catch (error) {
       console.error(`Error: ${(error as Error).message}`);

diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts
@@ -594,6 +594,69 @@ function toCamelCaseDeep(obj: unknown): unknown {
   return obj;
 }
 
+type ParsedEvaluationResult = Record<string, unknown> & { // open record with the fields below pinned to concrete types
+  timestamp: string;
+  testId: string;
+  score: number;
+  assertions: EvaluationResult['assertions'];
+  target: string;
+  output: EvaluationResult['output'];
+  executionStatus: EvaluationResult['executionStatus'];
+};
+
+const EXECUTION_STATUSES = new Set<EvaluationResult['executionStatus']>([ // accepted status strings; consulted by isExecutionStatus
+  'ok',
+  'quality_failure',
+  'execution_error',
+]);
+
+function isAssertionEntry(value: unknown): value is EvaluationResult['assertions'][number] { // type guard for one assertion entry
+  if (!value || typeof value !== 'object' || Array.isArray(value)) { // rejects falsy values, non-objects and arrays
+    return false;
+  }
+
+  const candidate = value as { text?: unknown; passed?: unknown; evidence?: unknown };
+  return ( // text and passed are required; evidence may be undefined but must be a string otherwise
+    typeof candidate.text === 'string' &&
+    typeof candidate.passed === 'boolean' &&
+    (candidate.evidence === undefined || typeof candidate.evidence === 'string')
+  );
+}
+
+function isOutputMessage(value: unknown): value is EvaluationResult['output'][number] { // type guard for one output message
+  if (!value || typeof value !== 'object' || Array.isArray(value)) { // rejects falsy values, non-objects and arrays
+    return false;
+  }
+
+  const candidate = value as { role?: unknown };
+  return typeof candidate.role === 'string'; // only role is checked; no other message field is inspected
+}
+
+function isExecutionStatus(value: unknown): value is EvaluationResult['executionStatus'] { // type guard backed by EXECUTION_STATUSES
+  return (
+    typeof value === 'string' && // non-strings fail before the set lookup
+    EXECUTION_STATUSES.has(value as EvaluationResult['executionStatus'])
+  );
+}
+
+function normalizeParsedResult(value: unknown): ParsedEvaluationResult | undefined { // fill typed defaults; undefined for non-objects
+  if (!value || typeof value !== 'object' || Array.isArray(value)) {
+    return undefined; // falsy values, primitives and arrays are not normalized
+  }
+
+  const result = value as Record<string, unknown>;
+  return {
+    ...result, // spread first so the normalized fields below override same-named keys
+    timestamp: typeof result.timestamp === 'string' ? result.timestamp : new Date(0).toISOString(),
+    testId: typeof result.testId === 'string' ? result.testId : 'unknown', // placeholder when missing or mistyped
+    score: typeof result.score === 'number' ? result.score : 0, // non-numeric scores become 0
+    assertions: Array.isArray(result.assertions) ? result.assertions.filter(isAssertionEntry) : [],
+    target: typeof result.target === 'string' ? result.target : 'unknown', // placeholder when missing or mistyped
+    output: Array.isArray(result.output) ? result.output.filter(isOutputMessage) : [],
+    executionStatus: isExecutionStatus(result.executionStatus) ? result.executionStatus : 'ok',
+  };
+}
+
 // ---------------------------------------------------------------------------
 // JSONL parsing
 // ---------------------------------------------------------------------------
@@ -610,7 +673,10 @@ export function parseJsonlResults(content: string): EvaluationResult[] {
       const parsed = JSON.parse(trimmed);
       // JSONL files from AgentV use snake_case; convert back to camelCase
       const camelCased = toCamelCaseDeep(parsed);
-      results.push(camelCased as EvaluationResult);
+      const normalized = normalizeParsedResult(camelCased);
+      if (normalized) {
+        results.push(normalized);
+      }
     } catch {
       // Skip malformed lines
     }

diff --git a/apps/cli/src/commands/eval/commands/run.ts b/apps/cli/src/commands/eval/commands/run.ts
@@ -141,7 +141,8 @@ export const evalRunCommand = command({
     retryErrors: option({
       type: optional(string),
       long: 'retry-errors',
-      description: 'Path to previous output JSONL — re-run only execution_error test cases',
+      description:
+        'Path to a previous run workspace or index.jsonl manifest — re-run only execution_error test cases',
     }),
     strict: flag({
       long: 'strict',

diff --git a/apps/cli/src/commands/eval/result-layout.ts b/apps/cli/src/commands/eval/result-layout.ts
@@ -20,6 +20,10 @@ export function resolveRunIndexPath(runDir: string): string {
   return path.join(runDir, RESULT_INDEX_FILENAME);
 }
 
+export function isRunManifestPath(filePath: string): boolean { // name-only check; the path is not tested for existence
+  return path.basename(filePath) === RESULT_INDEX_FILENAME; // true when the final path segment is the index filename
+}
+
 export function resolveExistingRunPrimaryPath(runDir: string): string | undefined {
   const indexPath = resolveRunIndexPath(runDir);
   if (existsSync(indexPath)) {
@@ -49,3 +53,17 @@ export function resolveWorkspaceOrFilePath(filePath: string): string {
 
   return existing;
 }
+
+export function resolveRunManifestPath(filePath: string): string { // accept a run directory or a manifest file path
+  if (isDirectoryPath(filePath)) {
+    return resolveWorkspaceOrFilePath(filePath); // directories are delegated to the workspace resolver
+  }
+
+  if (!isRunManifestPath(filePath)) { // any non-directory path must be named RESULT_INDEX_FILENAME
+    throw new Error(
+      `Expected a run workspace directory or ${RESULT_INDEX_FILENAME} manifest: ${filePath}`,
+    );
+  }
+
+  return filePath; // manifest paths are returned unchanged
+}
diff --git a/apps/cli/src/commands/eval/retry-errors.ts b/apps/cli/src/commands/eval/retry-errors.ts
@@ -1,17 +1,16 @@
 import type { EvaluationResult } from '@agentv/core';
 
-import {
-  loadLightweightResults,
-  loadManifestResults,
-  resolveResultSourcePath,
-} from '../results/manifest.js';
+import { loadManifestResults, resolveResultSourcePath } from '../results/manifest.js';
+
+async function loadRetrySourceResults(jsonlPath: string): Promise<readonly EvaluationResult[]> { // shared loader for the retry helpers
+  return loadManifestResults(resolveResultSourcePath(jsonlPath)); // async wrapper: a throw from either call becomes a rejection
+}
 
 /**
  * Load test IDs from an index/results source that have executionStatus === 'execution_error'.
  */
 export async function loadErrorTestIds(jsonlPath: string): Promise<readonly string[]> {
-  const resolvedPath = resolveResultSourcePath(jsonlPath);
-  const ids = loadLightweightResults(resolvedPath)
+  const ids = (await loadRetrySourceResults(jsonlPath))
     .filter((result) => result.executionStatus === 'execution_error')
     .map((result) => result.testId);
 
@@ -23,8 +22,7 @@ export async function loadErrorTestIds(jsonlPath: string): Promise<readonly stri
  * These are the "good" results that should be preserved when merging retry output.
  */
 export async function loadNonErrorResults(jsonlPath: string): Promise<readonly EvaluationResult[]> {
-  const resolvedPath = resolveResultSourcePath(jsonlPath);
-  return loadManifestResults(resolvedPath).filter(
+  return (await loadRetrySourceResults(jsonlPath)).filter(
     (result) => result.testId && result.executionStatus !== 'execution_error',
   );
 }
diff --git a/apps/cli/src/commands/eval/run-cache.ts b/apps/cli/src/commands/eval/run-cache.ts
@@ -16,21 +16,19 @@ const CACHE_FILENAME = 'cache.json';
 export interface RunCache {
   /** Directory path for new per-run directory format (e.g. .agentv/results/runs/<ts>/) */
   readonly lastRunDir?: string;
-  /** JSONL file path for legacy flat-file format. Kept for backward compat. */
+  /** @deprecated Legacy flat-file pointer from old cache files. Ignored on read. */
   readonly lastResultFile?: string;
   readonly timestamp: string;
 }
 
 /**
  * Resolve the primary result manifest path from a RunCache entry.
- * New format: lastRunDir/index.jsonl
- * Legacy format: lastResultFile (flat JSONL path)
  */
 export function resolveRunCacheFile(cache: RunCache): string {
   if (cache.lastRunDir) {
     return resolveExistingRunPrimaryPath(cache.lastRunDir) ?? resolveRunIndexPath(cache.lastRunDir);
   }
-  return cache.lastResultFile ?? '';
+  return '';
 }
 
 function cachePath(cwd: string): string {
@@ -47,18 +45,15 @@ export async function loadRunCache(cwd: string): Promise<RunCache | undefined> {
 }
 
 export async function saveRunCache(cwd: string, resultPath: string): Promise<void> {
+  if (path.basename(resultPath) !== RESULT_INDEX_FILENAME) {
+    return;
+  }
+
   const dir = path.join(cwd, '.agentv');
   await mkdir(dir, { recursive: true });
-  const basename = path.basename(resultPath);
-  const cache: RunCache =
-    basename === RESULT_INDEX_FILENAME
-      ? {
-          lastRunDir: path.dirname(resultPath),
-          timestamp: new Date().toISOString(),
-        }
-      : {
-          lastResultFile: resultPath,
-          timestamp: new Date().toISOString(),
-        };
+  const cache: RunCache = {
+    lastRunDir: path.dirname(resultPath),
+    timestamp: new Date().toISOString(),
+  };
   await writeFile(cachePath(cwd), `${JSON.stringify(cache, null, 2)}\n`, 'utf-8');
 }