Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions apps/cli/src/commands/eval/artifact-writer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ export interface AggregateGradingArtifact {
export interface IndexArtifactEntry {
readonly timestamp: string;
readonly test_id: string;
readonly eval_set?: string;
readonly dataset?: string;
readonly conversation_id?: string;
readonly score: number;
readonly target: string;
Expand Down Expand Up @@ -462,14 +462,14 @@ function safeTargetId(target: string | undefined): string {
return safeArtifactPathSegment(target, 'default');
}

function getEvalSet(result: EvaluationResult): string | undefined {
const record = result as EvaluationResult & { evalSet?: string };
return result.eval_set ?? record.evalSet;
/**
 * Resolves the dataset name recorded on an evaluation result.
 *
 * Lookup order: `dataset`, then `eval_set`, then `evalSet`. A field is only
 * skipped when it is `null` or `undefined`, so an empty string is returned
 * as-is rather than falling through to the next field.
 *
 * @param result - Evaluation result to inspect.
 * @returns The first non-nullish name found, otherwise the (nullish) value of `evalSet`.
 */
function getDataset(result: EvaluationResult): string | undefined {
  // Widen the type so the alternate spellings can be read without `any`.
  const withAliases = result as EvaluationResult & { eval_set?: string; evalSet?: string };

  if (result.dataset != null) {
    return result.dataset;
  }
  if (withAliases.eval_set != null) {
    return withAliases.eval_set;
  }
  return withAliases.evalSet;
}

function buildArtifactSubdir(result: EvaluationResult): string {
const segments = [];
const evalSet = getEvalSet(result);
const evalSet = getDataset(result);
if (evalSet) {
segments.push(safeArtifactPathSegment(evalSet, 'default'));
}
Expand Down Expand Up @@ -508,7 +508,7 @@ export function buildIndexArtifactEntry(
return {
timestamp: result.timestamp,
test_id: result.testId ?? 'unknown',
eval_set: getEvalSet(result),
dataset: getDataset(result),
conversation_id: result.conversationId,
score: result.score,
target: result.target ?? 'unknown',
Expand Down Expand Up @@ -539,7 +539,7 @@ export function buildResultIndexArtifact(result: EvaluationResult): ResultIndexA
return {
timestamp: result.timestamp,
test_id: result.testId ?? 'unknown',
eval_set: getEvalSet(result),
dataset: getDataset(result),
conversation_id: result.conversationId,
score: result.score,
target: result.target ?? 'unknown',
Expand Down
2 changes: 1 addition & 1 deletion apps/cli/src/commands/eval/junit-writer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ export class JunitWriter {

const grouped = new Map<string, EvaluationResult[]>();
for (const result of this.results) {
const suite = result.eval_set ?? 'default';
const suite = result.dataset ?? 'default';
const existing = grouped.get(suite);
if (existing) {
existing.push(result);
Expand Down
4 changes: 2 additions & 2 deletions apps/cli/src/commands/pipeline/bench.ts
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ export const evalBenchCommand = command({
const manifest = JSON.parse(await readFile(join(exportDir, 'manifest.json'), 'utf8'));
const testIds: string[] = manifest.test_ids;
const targetName: string = manifest.target?.name ?? 'unknown';
const evalSet: string = manifest.eval_set ?? '';
const evalSet: string = manifest.dataset ?? manifest.eval_set ?? '';
const experiment: string | undefined = manifest.experiment;
const safeEvalSet = evalSet ? evalSet.replace(/[\/\\:*?"<>|]/g, '_') : '';

Expand Down Expand Up @@ -189,7 +189,7 @@ export const evalBenchCommand = command({
JSON.stringify({
timestamp: manifest.timestamp,
test_id: testId,
eval_set: evalSet || undefined,
dataset: evalSet || undefined,
experiment: experiment || undefined,
score: Math.round(weightedScore * 1000) / 1000,
target: targetName,
Expand Down
2 changes: 1 addition & 1 deletion apps/cli/src/commands/pipeline/grade.ts
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ export const evalGradeCommand = command({
const manifestPath = join(exportDir, 'manifest.json');
const manifest = JSON.parse(await readFile(manifestPath, 'utf8'));
const testIds: string[] = manifest.test_ids;
const evalSet: string = manifest.eval_set ?? '';
const evalSet: string = manifest.dataset ?? manifest.eval_set ?? '';
const safeEvalSet = evalSet ? evalSet.replace(/[\/\\:*?"<>|]/g, '_') : '';

let totalGraders = 0;
Expand Down
2 changes: 1 addition & 1 deletion apps/cli/src/commands/pipeline/input.ts
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ export const evalInputCommand = command({
// manifest.json
await writeJson(join(outDir, 'manifest.json'), {
eval_file: resolvedEvalPath,
eval_set: evalSetName || undefined,
dataset: evalSetName || undefined,
experiment: experiment || undefined,
timestamp: new Date().toISOString(),
target: {
Expand Down
2 changes: 1 addition & 1 deletion apps/cli/src/commands/pipeline/run.ts
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ export const evalRunCommand = command({

await writeJson(join(outDir, 'manifest.json'), {
eval_file: resolvedEvalPath,
eval_set: evalSetName || undefined,
dataset: evalSetName || undefined,
experiment: experiment || undefined,
timestamp: new Date().toISOString(),
target: { name: targetName, kind: targetKind },
Expand Down
5 changes: 3 additions & 2 deletions apps/cli/src/commands/results/manifest.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ export interface ResultManifestRecord {
readonly timestamp?: string;
readonly test_id?: string;
readonly eval_id?: string;
readonly eval_set?: string;
readonly dataset?: string;
readonly eval_set?: string; // deprecated alias for dataset
readonly experiment?: string;
readonly target?: string;
readonly score: number;
Expand Down Expand Up @@ -124,7 +125,7 @@ function hydrateManifestRecord(baseDir: string, record: ResultManifestRecord): E
return {
timestamp: record.timestamp,
testId,
eval_set: record.eval_set,
dataset: record.dataset ?? record.eval_set,
target: record.target,
score: record.score,
executionStatus: record.execution_status,
Expand Down
18 changes: 9 additions & 9 deletions apps/cli/src/commands/results/serve.ts
Original file line number Diff line number Diff line change
Expand Up @@ -272,8 +272,8 @@ export function createApp(

// ── New Studio API endpoints ──────────────────────────────────────────

// Categories for a specific run (grouped by eval_set or target)
app.get('/api/runs/:filename/categories', (c) => {
// Datasets for a specific run (grouped by dataset or target)
app.get('/api/runs/:filename/datasets', (c) => {
const filename = c.req.param('filename');
const metas = listResultFiles(searchDir);
const meta = metas.find((m) => m.filename === filename);
Expand All @@ -282,25 +282,25 @@ export function createApp(
}
try {
const loaded = patchTestIds(loadManifestResults(meta.path));
const categoryMap = new Map<string, { total: number; passed: number; scoreSum: number }>();
const datasetMap = new Map<string, { total: number; passed: number; scoreSum: number }>();
for (const r of loaded) {
const cat = r.eval_set ?? r.target ?? 'default';
const entry = categoryMap.get(cat) ?? { total: 0, passed: 0, scoreSum: 0 };
const ds = r.dataset ?? r.target ?? 'default';
const entry = datasetMap.get(ds) ?? { total: 0, passed: 0, scoreSum: 0 };
entry.total++;
if (r.score >= 1) entry.passed++;
entry.scoreSum += r.score;
categoryMap.set(cat, entry);
datasetMap.set(ds, entry);
}
const categories = [...categoryMap.entries()].map(([name, entry]) => ({
const datasets = [...datasetMap.entries()].map(([name, entry]) => ({
name,
total: entry.total,
passed: entry.passed,
failed: entry.total - entry.passed,
avg_score: entry.total > 0 ? entry.scoreSum / entry.total : 0,
}));
return c.json({ categories });
return c.json({ datasets });
} catch {
return c.json({ error: 'Failed to load categories' }, 500);
return c.json({ error: 'Failed to load datasets' }, 500);
}
});

Expand Down
2 changes: 1 addition & 1 deletion apps/cli/src/commands/trace/show.ts
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@ function formatResultDetail(result: RawResult, index: number, tree: boolean): st
// Standard flat view
const scoreColor = result.score >= 0.9 ? c.green : result.score >= 0.5 ? c.yellow : c.red;
lines.push(
`${c.bold}${testId}${c.reset} ${scoreColor}${formatScore(result.score)}${c.reset}${result.target ? ` ${c.dim}target: ${result.target}${c.reset}` : ''}${result.eval_set ? ` ${c.dim}eval-set: ${result.eval_set}${c.reset}` : ''}`,
`${c.bold}${testId}${c.reset} ${scoreColor}${formatScore(result.score)}${c.reset}${result.target ? ` ${c.dim}target: ${result.target}${c.reset}` : ''}${result.dataset ? ` ${c.dim}dataset: ${result.dataset}${c.reset}` : ''}`,
);

if (result.error) {
Expand Down
3 changes: 2 additions & 1 deletion apps/cli/src/commands/trace/stats.ts
Original file line number Diff line number Diff line change
Expand Up @@ -109,8 +109,9 @@ function groupResults(results: RawResult[], groupBy?: string): GroupedResults[]
case 'target':
key = result.target ?? 'unknown';
break;
case 'dataset':
case 'eval-set':
key = result.eval_set ?? 'unknown';
key = result.dataset ?? 'unknown';
break;
case 'test-id':
key = result.test_id ?? result.eval_id ?? 'unknown';
Expand Down
6 changes: 3 additions & 3 deletions apps/cli/src/commands/trace/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ export interface RawResult {
timestamp?: string;
test_id?: string;
eval_id?: string;
eval_set?: string;
dataset?: string;
conversation_id?: string;
score: number;
assertions?: { text: string; passed: boolean; evidence?: string }[];
Expand Down Expand Up @@ -149,7 +149,7 @@ function toRawResult(result: EvaluationResult): RawResult {
return {
timestamp: result.timestamp,
test_id: result.testId,
eval_set: result.eval_set,
dataset: result.dataset,
conversation_id: result.conversationId,
score: result.score,
assertions: result.assertions?.map((assertion) => ({
Expand Down Expand Up @@ -334,7 +334,7 @@ function loadOtlpTraceFile(filePath: string): RawResult[] {
stringAttr(rootAttrs.agentv_test_id) ??
stringAttr(rootAttrs.agentv_eval_id) ??
`trace-${index + 1}`,
eval_set: stringAttr(rootAttrs.agentv_eval_set),
dataset: stringAttr(rootAttrs.agentv_dataset),
target: stringAttr(rootAttrs.agentv_target),
score,
error: root.status?.code === 2 ? root.status.message : undefined,
Expand Down
8 changes: 4 additions & 4 deletions apps/cli/test/commands/eval/artifact-writer.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -417,7 +417,7 @@ describe('buildIndexArtifactEntry', () => {
makeResult({
testId: 'alpha',
target: 'claude',
eval_set: 'demo',
dataset: 'demo',
scores: [makeEvaluatorResult({ name: 'quality', score: 0.7 })],
executionStatus: 'quality_failure',
error: 'model drift',
Expand All @@ -434,7 +434,7 @@ describe('buildIndexArtifactEntry', () => {
expect(JSON.parse(JSON.stringify(entry))).toEqual({
timestamp: '2026-03-13T00:00:00.000Z',
test_id: 'alpha',
eval_set: 'demo',
dataset: 'demo',
score: 0.9,
target: 'claude',
scores: [
Expand Down Expand Up @@ -717,9 +717,9 @@ describe('writeArtifactsFromResults', () => {
expect(candidateGrading.assertions[0].text).toBe('candidate-check');
});

it('prefixes artifact paths with eval_set when present', async () => {
it('prefixes artifact paths with dataset when present', async () => {
const paths = await writeArtifactsFromResults(
[makeResult({ eval_set: 'eval-top-months-chart', testId: 'shared-id', target: 'baseline' })],
[makeResult({ dataset: 'eval-top-months-chart', testId: 'shared-id', target: 'baseline' })],
testDir,
);

Expand Down
6 changes: 3 additions & 3 deletions apps/cli/test/commands/eval/output-writers.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -125,9 +125,9 @@ describe('JunitWriter', () => {
it('should group results by dataset as testsuites', async () => {
const writer = await JunitWriter.open(testFilePath);

await writer.append(makeResult({ testId: 'a-1', eval_set: 'suite-a', score: 1.0 }));
await writer.append(makeResult({ testId: 'a-2', eval_set: 'suite-a', score: 0.8 }));
await writer.append(makeResult({ testId: 'b-1', eval_set: 'suite-b', score: 0.5 }));
await writer.append(makeResult({ testId: 'a-1', dataset: 'suite-a', score: 1.0 }));
await writer.append(makeResult({ testId: 'a-2', dataset: 'suite-a', score: 0.8 }));
await writer.append(makeResult({ testId: 'b-1', dataset: 'suite-b', score: 0.5 }));
await writer.close();

const xml = await readFile(testFilePath, 'utf8');
Expand Down
24 changes: 12 additions & 12 deletions apps/cli/test/commands/results/export-e2e-providers.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ import { exportResults } from '../../../src/commands/results/export.js';
const CLAUDE_CLI_RESULT = {
timestamp: '2026-03-18T10:00:00.000Z',
test_id: 'test-claude-reasoning',
eval_set: 'multi-provider',
dataset: 'multi-provider',
score: 1.0,
assertions: [
{ text: 'Correct answer', passed: true, evidence: 'Matched expected output' },
Expand Down Expand Up @@ -60,7 +60,7 @@ const CLAUDE_CLI_RESULT = {
const CODEX_RESULT = {
timestamp: '2026-03-18T10:01:00.000Z',
test_id: 'test-codex-edit',
eval_set: 'multi-provider',
dataset: 'multi-provider',
score: 0.9,
assertions: [
{ text: 'File edited correctly', passed: true },
Expand Down Expand Up @@ -96,7 +96,7 @@ const CODEX_RESULT = {
const COPILOT_RESULT = {
timestamp: '2026-03-18T10:02:00.000Z',
test_id: 'test-copilot-complete',
eval_set: 'multi-provider',
dataset: 'multi-provider',
score: 0.85,
assertions: [
{ text: 'Code completion correct', passed: true },
Expand Down Expand Up @@ -125,7 +125,7 @@ const COPILOT_RESULT = {
const PI_RESULT = {
timestamp: '2026-03-18T10:03:00.000Z',
test_id: 'test-pi-refactor',
eval_set: 'multi-provider',
dataset: 'multi-provider',
score: 0.75,
assertions: [
{ text: 'Refactored correctly', passed: true },
Expand All @@ -143,7 +143,7 @@ const PI_RESULT = {
const LLM_AZURE_RESULT = {
timestamp: '2026-03-18T10:04:00.000Z',
test_id: 'test-llm-analysis',
eval_set: 'multi-provider',
dataset: 'multi-provider',
score: 1.0,
assertions: [{ text: 'Analysis correct', passed: true }],
output: [{ role: 'assistant', content: 'The code has a race condition in the connection pool.' }],
Expand All @@ -166,7 +166,7 @@ const LLM_AZURE_RESULT = {
const LLM_GPT_RESULT = {
timestamp: '2026-03-18T10:05:00.000Z',
test_id: 'test-llm-analysis',
eval_set: 'multi-provider',
dataset: 'multi-provider',
score: 0.8,
assertions: [{ text: 'Analysis correct', passed: true }],
output: [{ role: 'assistant', content: 'There might be a concurrency issue.' }],
Expand All @@ -181,7 +181,7 @@ const LLM_GPT_RESULT = {
const MINIMAL_RESULT = {
timestamp: '2026-03-18T10:06:00.000Z',
test_id: 'test-minimal',
eval_set: 'multi-provider',
dataset: 'multi-provider',
score: 0.5,
assertions: [{ text: 'Exists', passed: true }],
output: [{ role: 'assistant', content: 'Response.' }],
Expand All @@ -193,7 +193,7 @@ const MINIMAL_RESULT = {
const ERROR_RESULT = {
timestamp: '2026-03-18T10:07:00.000Z',
test_id: 'test-error-case',
eval_set: 'multi-provider',
dataset: 'multi-provider',
score: 0,
assertions: [],
output: [],
Expand All @@ -212,12 +212,12 @@ function toJsonl(...records: object[]): string {

function artifactDir(
outputDir: string,
record: { eval_set?: string; test_id?: string; eval_id?: string; target?: string },
record: { dataset?: string; test_id?: string; eval_id?: string; target?: string },
): string {
const testId = record.test_id ?? record.eval_id ?? 'unknown';
return path.join(
outputDir,
...(record.eval_set ? [record.eval_set] : []),
...(record.dataset ? [record.dataset] : []),
testId,
record.target ?? 'default',
);
Expand Down Expand Up @@ -642,7 +642,7 @@ describe('export e2e — multi-provider metrics verification', () => {
const record = {
timestamp: '2026-03-18T10:00:00.000Z',
test_id: 'test-case-convert',
eval_set: 'test',
dataset: 'test',
score: 1.0,
assertions: [{ text: 'ok', passed: true }],
output_text: 'ok',
Expand Down Expand Up @@ -677,7 +677,7 @@ describe('export e2e — multi-provider metrics verification', () => {
const record = {
timestamp: '2026-03-18T10:00:00.000Z',
eval_id: 'legacy-test-id',
eval_set: 'test',
dataset: 'test',
score: 1.0,
assertions: [{ text: 'ok', passed: true }],
output_text: 'ok',
Expand Down
Loading
Loading