From 828aab8d21b7c3ebbe452f891fb10607ae481a6a Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 30 Apr 2026 03:39:35 +0200 Subject: [PATCH 1/2] fix: show live run names before completion Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- apps/cli/src/commands/results/serve.ts | 20 ++++-- apps/cli/test/commands/results/serve.test.ts | 66 ++++++++++++++++++++ apps/studio/src/components/RunList.tsx | 11 +--- apps/studio/src/components/Sidebar.tsx | 7 ++- apps/studio/src/lib/run-label.test.ts | 25 ++++++++ apps/studio/src/lib/run-label.ts | 14 +++++ 6 files changed, 127 insertions(+), 16 deletions(-) create mode 100644 apps/studio/src/lib/run-label.test.ts create mode 100644 apps/studio/src/lib/run-label.ts diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index 580d4e1f..acea1e5a 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -264,19 +264,31 @@ interface DataContext { // biome-ignore lint/suspicious/noExplicitAny: Hono Context generic varies by route type C = Context; +function inferExperimentFromRunId(runId: string): string | undefined { + const separatorIndex = runId.lastIndexOf('::'); + if (separatorIndex === -1) { + return undefined; + } + const experiment = runId.slice(0, separatorIndex).trim(); + if (!experiment || experiment === 'default') { + return undefined; + } + return experiment; +} + async function handleRuns(c: C, { searchDir, agentvDir }: DataContext) { const { runs: metas } = await listMergedResultFiles(searchDir); const { threshold: passThreshold } = loadStudioConfig(agentvDir); return c.json({ runs: metas.map((m) => { let target: string | undefined; - let experiment: string | undefined; + let experiment = inferExperimentFromRunId(m.raw_filename); let passRate = m.passRate; try { const records = loadLightweightResults(m.path); if (records.length > 0) { target = records[0].target; - experiment = records[0].experiment; + experiment = records[0].experiment ?? experiment; passRate = records.filter((r) => r.score >= passThreshold).length / records.length; } } catch { @@ -1041,12 +1053,12 @@ export function createApp( const { runs: metas } = await listMergedResultFiles(p.path); for (const m of metas) { let target: string | undefined; - let experiment: string | undefined; + let experiment = inferExperimentFromRunId(m.raw_filename); try { const records = loadLightweightResults(m.path); if (records.length > 0) { target = records[0].target; - experiment = records[0].experiment; + experiment = records[0].experiment ?? experiment; } } catch { // ignore enrichment errors diff --git a/apps/cli/test/commands/results/serve.test.ts b/apps/cli/test/commands/results/serve.test.ts index 702cd9ec..bf81540f 100644 --- a/apps/cli/test/commands/results/serve.test.ts +++ b/apps/cli/test/commands/results/serve.test.ts @@ -3,6 +3,8 @@ import { existsSync, mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync import { tmpdir } from 'node:os'; import path from 'node:path'; +import { addBenchmark } from '@agentv/core'; + import { createApp, loadResults, @@ -485,6 +487,28 @@ describe('serve app', () => { expect(data.runs[0].pass_rate).toBe(1); }); + it('infers the experiment name from the run id when live results have not written it yet', async () => { + const runsDir = path.join(tempDir, '.agentv', 'results', 'runs', 'issue-1198-live-name'); + mkdirSync(runsDir, { recursive: true }); + const filename = '2026-03-25T12-00-00-000Z'; + const runDir = path.join(runsDir, filename); + mkdirSync(runDir, { recursive: true }); + writeFileSync(path.join(runDir, 'index.jsonl'), toJsonl(RESULT_A)); + + const app = createApp([], tempDir, tempDir, undefined, { studioDir }); + const res = await app.request('/api/runs'); + + expect(res.status).toBe(200); + const data = (await res.json()) as { + runs: Array<{ experiment?: string; target?: string }>; + }; + expect(data.runs).toHaveLength(1); + expect(data.runs[0]).toMatchObject({ + experiment: 'issue-1198-live-name', + target: 'gpt-4o', + }); + }); + it('merges cached remote runs and tags them with remote source metadata', async () => { const previousHome = process.env.AGENTV_HOME; process.env.AGENTV_HOME = path.join(tempDir, 'agentv-home'); @@ -536,6 +560,48 @@ describe('serve app', () => { }); }); + describe('GET /api/benchmarks/all-runs', () => { + it('infers experiment names for live benchmark runs before records persist them', async () => { + const previousHome = process.env.AGENTV_HOME; + process.env.AGENTV_HOME = path.join(tempDir, 'agentv-home'); + + try { + const benchmarkDir = path.join(tempDir, 'bench-one'); + const runDir = path.join( + benchmarkDir, + '.agentv', + 'results', + 'runs', + 'issue-1198-benchmark', + '2026-03-25T12-00-00-000Z', + ); + mkdirSync(runDir, { recursive: true }); + writeFileSync(path.join(runDir, 'index.jsonl'), toJsonl(RESULT_A)); + addBenchmark(benchmarkDir); + + const app = createApp([], tempDir, tempDir, undefined, { studioDir }); + const res = await app.request('/api/benchmarks/all-runs'); + + expect(res.status).toBe(200); + const data = (await res.json()) as { + runs: Array<{ benchmark_id: string; experiment?: string; target?: string }>; + }; + expect(data.runs).toHaveLength(1); + expect(data.runs[0]).toMatchObject({ + benchmark_id: 'bench-one', + experiment: 'issue-1198-benchmark', + target: 'gpt-4o', + }); + } finally { + if (previousHome === undefined) { + process.env.AGENTV_HOME = undefined; + } else { + process.env.AGENTV_HOME = previousHome; + } + } + }); + }); + describe('GET /api/remote/status', () => { it('reports configured remote status with graceful local-only fallback', async () => { mkdirSync(path.join(tempDir, '.agentv'), { recursive: true }); diff --git a/apps/studio/src/components/RunList.tsx b/apps/studio/src/components/RunList.tsx index 1d3f7be7..03f90779 100644 --- a/apps/studio/src/components/RunList.tsx +++ b/apps/studio/src/components/RunList.tsx @@ -11,6 +11,7 @@ import type React from 'react'; import { Link } from '@tanstack/react-router'; import { DEFAULT_PASS_THRESHOLD, useStudioConfig } from '~/lib/api'; +import { formatRunLabel } from '~/lib/run-label'; import type { RunMeta } from '~/lib/types'; import { PassRatePill } from './PassRatePill'; @@ -41,14 +42,6 @@ function formatDate(ts: string | undefined | null): { date: string; full: string } } -/** Human-readable run label: "target · experiment" or filename fallback. */ -function runLabel(run: RunMeta): string { - const parts = [run.target, run.experiment].filter((p) => p && p !== 'default' && p !== '-'); - if (parts.length > 0) return parts.join(' · '); - if (run.target) return run.target; - return run.display_name ?? run.filename; -} - export function RunList({ runs, benchmarkId, emptyMessage }: RunListProps) { const { data: config } = useStudioConfig(); const passThreshold = config?.threshold ?? DEFAULT_PASS_THRESHOLD; @@ -89,7 +82,7 @@ export function RunList({ runs, benchmarkId, emptyMessage }: RunListProps) { {runs.map((run) => { const ts = formatDate(run.timestamp); const passing = run.pass_rate >= passThreshold; - const label = runLabel(run); + const label = formatRunLabel(run); const passedCount = Math.round(run.pass_rate * run.test_count); const failedCount = run.test_count - passedCount; return ( diff --git a/apps/studio/src/components/Sidebar.tsx b/apps/studio/src/components/Sidebar.tsx index 5933d791..9ae5cc8d 100644 --- a/apps/studio/src/components/Sidebar.tsx +++ b/apps/studio/src/components/Sidebar.tsx @@ -29,6 +29,7 @@ import { useRunList, useStudioConfig, } from '~/lib/api'; +import { formatRunLabel } from '~/lib/run-label'; import { useSidebarContext } from '~/lib/sidebar-context'; /** Responsive