Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 16 additions & 4 deletions apps/cli/src/commands/results/serve.ts
Original file line number Diff line number Diff line change
Expand Up @@ -264,19 +264,31 @@ interface DataContext {
// biome-ignore lint/suspicious/noExplicitAny: Hono Context generic varies by route
type C = Context<any, any, any>;

/**
 * Derive an experiment name from a run id shaped like `<experiment>::<suffix>`.
 *
 * The text before the last `::` is trimmed and treated as the experiment name.
 *
 * @param runId - Run identifier to inspect.
 * @returns The trimmed prefix, or `undefined` when the id contains no `::`,
 *   the prefix is blank, or the prefix is the literal `default`.
 */
function inferExperimentFromRunId(runId: string): string | undefined {
  const cut = runId.lastIndexOf('::');
  // Without a separator there is no prefix to read; treat it like a blank one.
  const candidate = cut === -1 ? '' : runId.slice(0, cut).trim();
  return candidate !== '' && candidate !== 'default' ? candidate : undefined;
}

async function handleRuns(c: C, { searchDir, agentvDir }: DataContext) {
const { runs: metas } = await listMergedResultFiles(searchDir);
const { threshold: passThreshold } = loadStudioConfig(agentvDir);
return c.json({
runs: metas.map((m) => {
let target: string | undefined;
let experiment: string | undefined;
let experiment = inferExperimentFromRunId(m.raw_filename);
let passRate = m.passRate;
try {
const records = loadLightweightResults(m.path);
if (records.length > 0) {
target = records[0].target;
experiment = records[0].experiment;
experiment = records[0].experiment ?? experiment;
passRate = records.filter((r) => r.score >= passThreshold).length / records.length;
}
} catch {
Expand Down Expand Up @@ -1041,12 +1053,12 @@ export function createApp(
const { runs: metas } = await listMergedResultFiles(p.path);
for (const m of metas) {
let target: string | undefined;
let experiment: string | undefined;
let experiment = inferExperimentFromRunId(m.raw_filename);
try {
const records = loadLightweightResults(m.path);
if (records.length > 0) {
target = records[0].target;
experiment = records[0].experiment;
experiment = records[0].experiment ?? experiment;
}
} catch {
// ignore enrichment errors
Expand Down
66 changes: 66 additions & 0 deletions apps/cli/test/commands/results/serve.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ import { existsSync, mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync
import { tmpdir } from 'node:os';
import path from 'node:path';

import { addBenchmark } from '@agentv/core';

import {
createApp,
loadResults,
Expand Down Expand Up @@ -485,6 +487,28 @@ describe('serve app', () => {
expect(data.runs[0].pass_rate).toBe(1);
});

it('infers the experiment name from the run id when live results have not written it yet', async () => {
  // On-disk layout: <tempDir>/.agentv/results/runs/<experiment>/<timestamp>/index.jsonl
  const experimentDir = path.join(tempDir, '.agentv', 'results', 'runs', 'issue-1198-live-name');
  mkdirSync(experimentDir, { recursive: true });
  const timestampDir = path.join(experimentDir, '2026-03-25T12-00-00-000Z');
  mkdirSync(timestampDir, { recursive: true });
  // NOTE(review): presumably RESULT_A carries target 'gpt-4o' and no experiment field — confirm against the fixture.
  writeFileSync(path.join(timestampDir, 'index.jsonl'), toJsonl(RESULT_A));

  const server = createApp([], tempDir, tempDir, undefined, { studioDir });
  const response = await server.request('/api/runs');
  expect(response.status).toBe(200);

  const { runs } = (await response.json()) as {
    runs: Array<{ experiment?: string; target?: string }>;
  };
  expect(runs).toHaveLength(1);
  // The experiment is expected to match the directory name above.
  expect(runs[0]).toMatchObject({
    experiment: 'issue-1198-live-name',
    target: 'gpt-4o',
  });
});

it('merges cached remote runs and tags them with remote source metadata', async () => {
const previousHome = process.env.AGENTV_HOME;
process.env.AGENTV_HOME = path.join(tempDir, 'agentv-home');
Expand Down Expand Up @@ -536,6 +560,48 @@ describe('serve app', () => {
});
});

describe('GET /api/benchmarks/all-runs', () => {
  it('infers experiment names for live benchmark runs before records persist them', async () => {
    // Redirect AGENTV_HOME to a scratch location for the duration of this test.
    // NOTE(review): presumably this keeps addBenchmark from touching the real home directory — confirm.
    const previousHome = process.env.AGENTV_HOME;
    process.env.AGENTV_HOME = path.join(tempDir, 'agentv-home');

    try {
      // Layout: <benchmarkDir>/.agentv/results/runs/<experiment>/<timestamp>/index.jsonl
      const benchmarkDir = path.join(tempDir, 'bench-one');
      const runDir = path.join(
        benchmarkDir,
        '.agentv',
        'results',
        'runs',
        'issue-1198-benchmark',
        '2026-03-25T12-00-00-000Z',
      );
      mkdirSync(runDir, { recursive: true });
      writeFileSync(path.join(runDir, 'index.jsonl'), toJsonl(RESULT_A));
      const benchmark = addBenchmark(benchmarkDir);

      const app = createApp([], tempDir, tempDir, undefined, { studioDir });
      const res = await app.request('/api/benchmarks/all-runs');

      expect(res.status).toBe(200);
      const data = (await res.json()) as {
        runs: Array<{ benchmark_id: string; experiment?: string; target?: string }>;
      };
      expect(data.runs).toHaveLength(1);
      // The experiment is expected to match the run's parent directory name.
      expect(data.runs[0]).toMatchObject({
        benchmark_id: benchmark.id,
        experiment: 'issue-1198-benchmark',
        target: 'gpt-4o',
      });
    } finally {
      // Restore the caller's environment. Assigning `undefined` to a process.env
      // key is coerced to the string 'undefined' by Node, which would leak a bogus
      // AGENTV_HOME into later tests, so the key is removed instead.
      if (previousHome === undefined) {
        Reflect.deleteProperty(process.env, 'AGENTV_HOME');
      } else {
        process.env.AGENTV_HOME = previousHome;
      }
    }
  });
});

describe('GET /api/remote/status', () => {
it('reports configured remote status with graceful local-only fallback', async () => {
mkdirSync(path.join(tempDir, '.agentv'), { recursive: true });
Expand Down
11 changes: 2 additions & 9 deletions apps/studio/src/components/RunList.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import type React from 'react';
import { Link } from '@tanstack/react-router';

import { DEFAULT_PASS_THRESHOLD, useStudioConfig } from '~/lib/api';
import { formatRunLabel } from '~/lib/run-label';
import type { RunMeta } from '~/lib/types';

import { PassRatePill } from './PassRatePill';
Expand Down Expand Up @@ -41,14 +42,6 @@ function formatDate(ts: string | undefined | null): { date: string; full: string
}
}

/**
 * Build the text shown for a run in the list.
 *
 * Joins the target and experiment with " · " after dropping empty values and
 * the placeholders `default` and `-`. When neither survives, falls back to the
 * raw target (if truthy), then the display name, then the filename.
 */
function runLabel(run: RunMeta): string {
  const { target, experiment } = run;
  const segments = [target, experiment].filter(
    (value) => Boolean(value) && value !== 'default' && value !== '-',
  );
  if (segments.length === 0) {
    // A placeholder target such as 'default' still wins over the filename-based fallbacks.
    return target ? target : (run.display_name ?? run.filename);
  }
  return segments.join(' · ');
}

export function RunList({ runs, benchmarkId, emptyMessage }: RunListProps) {
const { data: config } = useStudioConfig();
const passThreshold = config?.threshold ?? DEFAULT_PASS_THRESHOLD;
Expand Down Expand Up @@ -89,7 +82,7 @@ export function RunList({ runs, benchmarkId, emptyMessage }: RunListProps) {
{runs.map((run) => {
const ts = formatDate(run.timestamp);
const passing = run.pass_rate >= passThreshold;
const label = runLabel(run);
const label = formatRunLabel(run);
const passedCount = Math.round(run.pass_rate * run.test_count);
const failedCount = run.test_count - passedCount;
return (
Expand Down
7 changes: 4 additions & 3 deletions apps/studio/src/components/Sidebar.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import {
useRunList,
useStudioConfig,
} from '~/lib/api';
import { formatRunLabel } from '~/lib/run-label';
import { useSidebarContext } from '~/lib/sidebar-context';

/** Responsive <aside> wrapper. Handles mobile overlay and desktop static placement. */
Expand Down Expand Up @@ -213,7 +214,7 @@ function RunSidebar() {
className="mb-0.5 block truncate rounded-md px-2 py-1.5 text-sm text-gray-400 transition-colors hover:bg-gray-800/50 hover:text-gray-200"
title={run.benchmark_name}
>
{run.display_name ?? run.filename}
{formatRunLabel(run)}
</Link>
);
}
Expand All @@ -229,7 +230,7 @@ function RunSidebar() {
: 'text-gray-400 hover:bg-gray-800/50 hover:text-gray-200'
}`}
>
{run.display_name ?? run.filename}
{formatRunLabel(run)}
</Link>
);
})}
Expand Down Expand Up @@ -451,7 +452,7 @@ function BenchmarkRunDetailSidebar({
: 'text-gray-400 hover:bg-gray-800/50 hover:text-gray-200'
}`}
>
{run.display_name ?? run.filename}
{formatRunLabel(run)}
</Link>
);
})}
Expand Down
25 changes: 25 additions & 0 deletions apps/studio/src/lib/run-label.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import { describe, expect, it } from 'bun:test';

import { formatRunLabel } from './run-label';

describe('formatRunLabel', () => {
  it('prefers target and experiment over the timestamp display name', () => {
    // Both target and experiment are present, so the timestamp display name must not be used.
    const label = formatRunLabel({
      filename: 'issue-1198::2026-04-29T09-17-30-111Z',
      display_name: '2026-04-29T09-17-30-111Z',
      target: 'llm-dry-run',
      experiment: 'issue-1198',
    });
    expect(label).toBe('llm-dry-run · issue-1198');
  });

  it('falls back to the display name when no richer metadata is available', () => {
    // Neither target nor experiment is supplied here.
    const label = formatRunLabel({
      filename: '2026-04-29T09-17-30-111Z',
      display_name: '2026-04-29T09-17-30-111Z',
    });
    expect(label).toBe('2026-04-29T09-17-30-111Z');
  });
});
14 changes: 14 additions & 0 deletions apps/studio/src/lib/run-label.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import type { RunMeta } from './types';

type RunLabelInput = Pick<RunMeta, 'display_name' | 'experiment' | 'filename' | 'target'>;

/**
 * Produce the label used for a run in tables and navigation.
 *
 * Target and experiment are joined with " · " once empty values and the
 * placeholders `default` and `-` have been discarded. If nothing remains, the
 * display name is used, or the filename when the display name is null or undefined.
 */
export function formatRunLabel(run: RunLabelInput): string {
  const meaningful: string[] = [];
  for (const candidate of [run.target, run.experiment]) {
    if (candidate && candidate !== 'default' && candidate !== '-') {
      meaningful.push(candidate);
    }
  }
  return meaningful.length === 0 ? (run.display_name ?? run.filename) : meaningful.join(' · ');
}
Loading