EntityProcess · christso · Mar 28, 2026 · Mar 28, 2026 · Mar 28, 2026
diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts
@@ -94,7 +94,7 @@ export interface AggregateGradingArtifact {
 export interface IndexArtifactEntry {
   readonly timestamp: string;
   readonly test_id: string;
-  readonly eval_set?: string;
+  readonly dataset?: string;
   readonly conversation_id?: string;
   readonly score: number;
   readonly target: string;
@@ -462,14 +462,14 @@ function safeTargetId(target: string | undefined): string {
   return safeArtifactPathSegment(target, 'default');
 }
 
-function getEvalSet(result: EvaluationResult): string | undefined {
-  const record = result as EvaluationResult & { evalSet?: string };
-  return result.eval_set ?? record.evalSet;
+function getDataset(result: EvaluationResult): string | undefined {
+  const record = result as EvaluationResult & { eval_set?: string; evalSet?: string };
+  return result.dataset ?? record.eval_set ?? record.evalSet;
 }
 
 function buildArtifactSubdir(result: EvaluationResult): string {
   const segments = [];
-  const evalSet = getEvalSet(result);
+  const evalSet = getDataset(result);
   if (evalSet) {
     segments.push(safeArtifactPathSegment(evalSet, 'default'));
   }
@@ -508,7 +508,7 @@ export function buildIndexArtifactEntry(
   return {
     timestamp: result.timestamp,
     test_id: result.testId ?? 'unknown',
-    eval_set: getEvalSet(result),
+    dataset: getDataset(result),
     conversation_id: result.conversationId,
     score: result.score,
     target: result.target ?? 'unknown',
@@ -539,7 +539,7 @@ export function buildResultIndexArtifact(result: EvaluationResult): ResultIndexA
   return {
     timestamp: result.timestamp,
     test_id: result.testId ?? 'unknown',
-    eval_set: getEvalSet(result),
+    dataset: getDataset(result),
     conversation_id: result.conversationId,
     score: result.score,
     target: result.target ?? 'unknown',

diff --git a/apps/cli/src/commands/eval/junit-writer.ts b/apps/cli/src/commands/eval/junit-writer.ts
@@ -47,7 +47,7 @@ export class JunitWriter {
 
     const grouped = new Map<string, EvaluationResult[]>();
     for (const result of this.results) {
-      const suite = result.eval_set ?? 'default';
+      const suite = result.dataset ?? 'default';
       const existing = grouped.get(suite);
       if (existing) {
         existing.push(result);

diff --git a/apps/cli/src/commands/pipeline/bench.ts b/apps/cli/src/commands/pipeline/bench.ts
@@ -45,7 +45,7 @@ export const evalBenchCommand = command({
     const manifest = JSON.parse(await readFile(join(exportDir, 'manifest.json'), 'utf8'));
     const testIds: string[] = manifest.test_ids;
     const targetName: string = manifest.target?.name ?? 'unknown';
-    const evalSet: string = manifest.eval_set ?? '';
+    const evalSet: string = manifest.dataset ?? manifest.eval_set ?? '';
     const experiment: string | undefined = manifest.experiment;
     const safeEvalSet = evalSet ? evalSet.replace(/[\/\\:*?"<>|]/g, '_') : '';
 
@@ -189,7 +189,7 @@ export const evalBenchCommand = command({
         JSON.stringify({
           timestamp: manifest.timestamp,
           test_id: testId,
-          eval_set: evalSet || undefined,
+          dataset: evalSet || undefined,
           experiment: experiment || undefined,
           score: Math.round(weightedScore * 1000) / 1000,
           target: targetName,

diff --git a/apps/cli/src/commands/pipeline/grade.ts b/apps/cli/src/commands/pipeline/grade.ts
@@ -40,7 +40,7 @@ export const evalGradeCommand = command({
     const manifestPath = join(exportDir, 'manifest.json');
     const manifest = JSON.parse(await readFile(manifestPath, 'utf8'));
     const testIds: string[] = manifest.test_ids;
-    const evalSet: string = manifest.eval_set ?? '';
+    const evalSet: string = manifest.dataset ?? manifest.eval_set ?? '';
     const safeEvalSet = evalSet ? evalSet.replace(/[\/\\:*?"<>|]/g, '_') : '';
 
     let totalGraders = 0;

diff --git a/apps/cli/src/commands/pipeline/input.ts b/apps/cli/src/commands/pipeline/input.ts
@@ -167,7 +167,7 @@ export const evalInputCommand = command({
     // manifest.json
     await writeJson(join(outDir, 'manifest.json'), {
       eval_file: resolvedEvalPath,
-      eval_set: evalSetName || undefined,
+      dataset: evalSetName || undefined,
       experiment: experiment || undefined,
       timestamp: new Date().toISOString(),
       target: {

diff --git a/apps/cli/src/commands/pipeline/run.ts b/apps/cli/src/commands/pipeline/run.ts
@@ -187,7 +187,7 @@ export const evalRunCommand = command({
 
     await writeJson(join(outDir, 'manifest.json'), {
       eval_file: resolvedEvalPath,
-      eval_set: evalSetName || undefined,
+      dataset: evalSetName || undefined,
       experiment: experiment || undefined,
       timestamp: new Date().toISOString(),
       target: { name: targetName, kind: targetKind },

diff --git a/apps/cli/src/commands/results/manifest.ts b/apps/cli/src/commands/results/manifest.ts
@@ -11,7 +11,8 @@ export interface ResultManifestRecord {
   readonly timestamp?: string;
   readonly test_id?: string;
   readonly eval_id?: string;
-  readonly eval_set?: string;
+  readonly dataset?: string;
+  readonly eval_set?: string; // deprecated alias for dataset
   readonly experiment?: string;
   readonly target?: string;
   readonly score: number;
@@ -124,7 +125,7 @@ function hydrateManifestRecord(baseDir: string, record: ResultManifestRecord): E
   return {
     timestamp: record.timestamp,
     testId,
-    eval_set: record.eval_set,
+    dataset: record.dataset ?? record.eval_set,
     target: record.target,
     score: record.score,
     executionStatus: record.execution_status,

diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts
@@ -272,8 +272,8 @@ export function createApp(
 
   // ── New Studio API endpoints ──────────────────────────────────────────
 
-  // Categories for a specific run (grouped by eval_set or target)
-  app.get('/api/runs/:filename/categories', (c) => {
+  // Datasets for a specific run (grouped by dataset or target)
+  app.get('/api/runs/:filename/datasets', (c) => {
     const filename = c.req.param('filename');
     const metas = listResultFiles(searchDir);
     const meta = metas.find((m) => m.filename === filename);
@@ -282,25 +282,25 @@ export function createApp(
     }
     try {
       const loaded = patchTestIds(loadManifestResults(meta.path));
-      const categoryMap = new Map<string, { total: number; passed: number; scoreSum: number }>();
+      const datasetMap = new Map<string, { total: number; passed: number; scoreSum: number }>();
       for (const r of loaded) {
-        const cat = r.eval_set ?? r.target ?? 'default';
-        const entry = categoryMap.get(cat) ?? { total: 0, passed: 0, scoreSum: 0 };
+        const ds = r.dataset ?? r.target ?? 'default';
+        const entry = datasetMap.get(ds) ?? { total: 0, passed: 0, scoreSum: 0 };
         entry.total++;
         if (r.score >= 1) entry.passed++;
         entry.scoreSum += r.score;
-        categoryMap.set(cat, entry);
+        datasetMap.set(ds, entry);
       }
-      const categories = [...categoryMap.entries()].map(([name, entry]) => ({
+      const datasets = [...datasetMap.entries()].map(([name, entry]) => ({
         name,
         total: entry.total,
         passed: entry.passed,
         failed: entry.total - entry.passed,
         avg_score: entry.total > 0 ? entry.scoreSum / entry.total : 0,
       }));
-      return c.json({ categories });
+      return c.json({ datasets });
     } catch {
-      return c.json({ error: 'Failed to load categories' }, 500);
+      return c.json({ error: 'Failed to load datasets' }, 500);
     }
   });
 

diff --git a/apps/cli/src/commands/trace/show.ts b/apps/cli/src/commands/trace/show.ts
@@ -225,7 +225,7 @@ function formatResultDetail(result: RawResult, index: number, tree: boolean): st
   // Standard flat view
   const scoreColor = result.score >= 0.9 ? c.green : result.score >= 0.5 ? c.yellow : c.red;
   lines.push(
-    `${c.bold}${testId}${c.reset}  ${scoreColor}${formatScore(result.score)}${c.reset}${result.target ? `  ${c.dim}target: ${result.target}${c.reset}` : ''}${result.eval_set ? `  ${c.dim}eval-set: ${result.eval_set}${c.reset}` : ''}`,
+    `${c.bold}${testId}${c.reset}  ${scoreColor}${formatScore(result.score)}${c.reset}${result.target ? `  ${c.dim}target: ${result.target}${c.reset}` : ''}${result.dataset ? `  ${c.dim}dataset: ${result.dataset}${c.reset}` : ''}`,
   );
 
   if (result.error) {

diff --git a/apps/cli/src/commands/trace/stats.ts b/apps/cli/src/commands/trace/stats.ts
@@ -109,8 +109,9 @@ function groupResults(results: RawResult[], groupBy?: string): GroupedResults[]
       case 'target':
         key = result.target ?? 'unknown';
         break;
+      case 'dataset':
       case 'eval-set':
-        key = result.eval_set ?? 'unknown';
+        key = result.dataset ?? 'unknown';
         break;
       case 'test-id':
         key = result.test_id ?? result.eval_id ?? 'unknown';

diff --git a/apps/cli/src/commands/trace/utils.ts b/apps/cli/src/commands/trace/utils.ts
@@ -51,7 +51,7 @@ export interface RawResult {
   timestamp?: string;
   test_id?: string;
   eval_id?: string;
-  eval_set?: string;
+  dataset?: string;
   conversation_id?: string;
   score: number;
   assertions?: { text: string; passed: boolean; evidence?: string }[];
@@ -149,7 +149,7 @@ function toRawResult(result: EvaluationResult): RawResult {
   return {
     timestamp: result.timestamp,
     test_id: result.testId,
-    eval_set: result.eval_set,
+    dataset: result.dataset,
     conversation_id: result.conversationId,
     score: result.score,
     assertions: result.assertions?.map((assertion) => ({
@@ -334,7 +334,8 @@ function loadOtlpTraceFile(filePath: string): RawResult[] {
         stringAttr(rootAttrs.agentv_test_id) ??
         stringAttr(rootAttrs.agentv_eval_id) ??
         `trace-${index + 1}`,
-      eval_set: stringAttr(rootAttrs.agentv_eval_set),
+      // Prefer the renamed attribute; fall back to the legacy agentv_eval_set this loader read before.
+      dataset: stringAttr(rootAttrs.agentv_dataset) ?? stringAttr(rootAttrs.agentv_eval_set),
       target: stringAttr(rootAttrs.agentv_target),
       score,
       error: root.status?.code === 2 ? root.status.message : undefined,

diff --git a/apps/cli/test/commands/eval/artifact-writer.test.ts b/apps/cli/test/commands/eval/artifact-writer.test.ts
@@ -417,7 +417,7 @@ describe('buildIndexArtifactEntry', () => {
       makeResult({
         testId: 'alpha',
         target: 'claude',
-        eval_set: 'demo',
+        dataset: 'demo',
         scores: [makeEvaluatorResult({ name: 'quality', score: 0.7 })],
         executionStatus: 'quality_failure',
         error: 'model drift',
@@ -434,7 +434,7 @@ describe('buildIndexArtifactEntry', () => {
     expect(JSON.parse(JSON.stringify(entry))).toEqual({
       timestamp: '2026-03-13T00:00:00.000Z',
       test_id: 'alpha',
-      eval_set: 'demo',
+      dataset: 'demo',
       score: 0.9,
       target: 'claude',
       scores: [
@@ -717,9 +717,9 @@ describe('writeArtifactsFromResults', () => {
     expect(candidateGrading.assertions[0].text).toBe('candidate-check');
   });
 
-  it('prefixes artifact paths with eval_set when present', async () => {
+  it('prefixes artifact paths with dataset when present', async () => {
     const paths = await writeArtifactsFromResults(
-      [makeResult({ eval_set: 'eval-top-months-chart', testId: 'shared-id', target: 'baseline' })],
+      [makeResult({ dataset: 'eval-top-months-chart', testId: 'shared-id', target: 'baseline' })],
       testDir,
     );
 

diff --git a/apps/cli/test/commands/eval/output-writers.test.ts b/apps/cli/test/commands/eval/output-writers.test.ts
@@ -125,9 +125,9 @@ describe('JunitWriter', () => {
   it('should group results by dataset as testsuites', async () => {
     const writer = await JunitWriter.open(testFilePath);
 
-    await writer.append(makeResult({ testId: 'a-1', eval_set: 'suite-a', score: 1.0 }));
-    await writer.append(makeResult({ testId: 'a-2', eval_set: 'suite-a', score: 0.8 }));
-    await writer.append(makeResult({ testId: 'b-1', eval_set: 'suite-b', score: 0.5 }));
+    await writer.append(makeResult({ testId: 'a-1', dataset: 'suite-a', score: 1.0 }));
+    await writer.append(makeResult({ testId: 'a-2', dataset: 'suite-a', score: 0.8 }));
+    await writer.append(makeResult({ testId: 'b-1', dataset: 'suite-b', score: 0.5 }));
     await writer.close();
 
     const xml = await readFile(testFilePath, 'utf8');

diff --git a/apps/cli/test/commands/results/export-e2e-providers.test.ts b/apps/cli/test/commands/results/export-e2e-providers.test.ts
@@ -23,7 +23,7 @@ import { exportResults } from '../../../src/commands/results/export.js';
 const CLAUDE_CLI_RESULT = {
   timestamp: '2026-03-18T10:00:00.000Z',
   test_id: 'test-claude-reasoning',
-  eval_set: 'multi-provider',
+  dataset: 'multi-provider',
   score: 1.0,
   assertions: [
     { text: 'Correct answer', passed: true, evidence: 'Matched expected output' },
@@ -60,7 +60,7 @@ const CLAUDE_CLI_RESULT = {
 const CODEX_RESULT = {
   timestamp: '2026-03-18T10:01:00.000Z',
   test_id: 'test-codex-edit',
-  eval_set: 'multi-provider',
+  dataset: 'multi-provider',
   score: 0.9,
   assertions: [
     { text: 'File edited correctly', passed: true },
@@ -96,7 +96,7 @@ const CODEX_RESULT = {
 const COPILOT_RESULT = {
   timestamp: '2026-03-18T10:02:00.000Z',
   test_id: 'test-copilot-complete',
-  eval_set: 'multi-provider',
+  dataset: 'multi-provider',
   score: 0.85,
   assertions: [
     { text: 'Code completion correct', passed: true },
@@ -125,7 +125,7 @@ const COPILOT_RESULT = {
 const PI_RESULT = {
   timestamp: '2026-03-18T10:03:00.000Z',
   test_id: 'test-pi-refactor',
-  eval_set: 'multi-provider',
+  dataset: 'multi-provider',
   score: 0.75,
   assertions: [
     { text: 'Refactored correctly', passed: true },
@@ -143,7 +143,7 @@ const PI_RESULT = {
 const LLM_AZURE_RESULT = {
   timestamp: '2026-03-18T10:04:00.000Z',
   test_id: 'test-llm-analysis',
-  eval_set: 'multi-provider',
+  dataset: 'multi-provider',
   score: 1.0,
   assertions: [{ text: 'Analysis correct', passed: true }],
   output: [{ role: 'assistant', content: 'The code has a race condition in the connection pool.' }],
@@ -166,7 +166,7 @@ const LLM_AZURE_RESULT = {
 const LLM_GPT_RESULT = {
   timestamp: '2026-03-18T10:05:00.000Z',
   test_id: 'test-llm-analysis',
-  eval_set: 'multi-provider',
+  dataset: 'multi-provider',
   score: 0.8,
   assertions: [{ text: 'Analysis correct', passed: true }],
   output: [{ role: 'assistant', content: 'There might be a concurrency issue.' }],
@@ -181,7 +181,7 @@ const LLM_GPT_RESULT = {
 const MINIMAL_RESULT = {
   timestamp: '2026-03-18T10:06:00.000Z',
   test_id: 'test-minimal',
-  eval_set: 'multi-provider',
+  dataset: 'multi-provider',
   score: 0.5,
   assertions: [{ text: 'Exists', passed: true }],
   output: [{ role: 'assistant', content: 'Response.' }],
@@ -193,7 +193,7 @@ const MINIMAL_RESULT = {
 const ERROR_RESULT = {
   timestamp: '2026-03-18T10:07:00.000Z',
   test_id: 'test-error-case',
-  eval_set: 'multi-provider',
+  dataset: 'multi-provider',
   score: 0,
   assertions: [],
   output: [],
@@ -212,12 +212,12 @@ function toJsonl(...records: object[]): string {
 
 function artifactDir(
   outputDir: string,
-  record: { eval_set?: string; test_id?: string; eval_id?: string; target?: string },
+  record: { dataset?: string; test_id?: string; eval_id?: string; target?: string },
 ): string {
   const testId = record.test_id ?? record.eval_id ?? 'unknown';
   return path.join(
     outputDir,
-    ...(record.eval_set ? [record.eval_set] : []),
+    ...(record.dataset ? [record.dataset] : []),
     testId,
     record.target ?? 'default',
   );
@@ -642,7 +642,7 @@ describe('export e2e — multi-provider metrics verification', () => {
       const record = {
         timestamp: '2026-03-18T10:00:00.000Z',
         test_id: 'test-case-convert',
-        eval_set: 'test',
+        dataset: 'test',
         score: 1.0,
         assertions: [{ text: 'ok', passed: true }],
         output_text: 'ok',
@@ -677,7 +677,7 @@ describe('export e2e — multi-provider metrics verification', () => {
       const record = {
         timestamp: '2026-03-18T10:00:00.000Z',
         eval_id: 'legacy-test-id',
-        eval_set: 'test',
+        dataset: 'test',
         score: 1.0,
         assertions: [{ text: 'ok', passed: true }],
         output_text: 'ok',