From 828aab8d21b7c3ebbe452f891fb10607ae481a6a Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Thu, 30 Apr 2026 03:39:35 +0200
Subject: [PATCH 1/2] fix: show live run names before completion

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 apps/cli/src/commands/results/serve.ts       | 20 ++++--
 apps/cli/test/commands/results/serve.test.ts | 66 ++++++++++++++++++++
 apps/studio/src/components/RunList.tsx       | 11 +---
 apps/studio/src/components/Sidebar.tsx       |  7 ++-
 apps/studio/src/lib/run-label.test.ts        | 25 ++++++++
 apps/studio/src/lib/run-label.ts             | 14 +++++
 6 files changed, 127 insertions(+), 16 deletions(-)
 create mode 100644 apps/studio/src/lib/run-label.test.ts
 create mode 100644 apps/studio/src/lib/run-label.ts

diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts
index 580d4e1f..acea1e5a 100644
--- a/apps/cli/src/commands/results/serve.ts
+++ b/apps/cli/src/commands/results/serve.ts
@@ -264,19 +264,31 @@ interface DataContext {
 // biome-ignore lint/suspicious/noExplicitAny: Hono Context generic varies by route
 type C = Context<any, any, any>;
 
+/**
+ * Derives the experiment name from the text before the last '::' in a run id.
+ * Yields undefined when there is no '::', or that text is blank or 'default'.
+ */
+function inferExperimentFromRunId(runId: string): string | undefined {
+  const cut = runId.lastIndexOf('::');
+  if (cut < 0) return undefined;
+  const prefix = runId.slice(0, cut).trim();
+  const usable = prefix !== '' && prefix !== 'default';
+  return usable ? prefix : undefined;
+}
+
 async function handleRuns(c: C, { searchDir, agentvDir }: DataContext) {
   const { runs: metas } = await listMergedResultFiles(searchDir);
   const { threshold: passThreshold } = loadStudioConfig(agentvDir);
   return c.json({
     runs: metas.map((m) => {
       let target: string | undefined;
-      let experiment: string | undefined;
+      let experiment = inferExperimentFromRunId(m.raw_filename);
       let passRate = m.passRate;
       try {
         const records = loadLightweightResults(m.path);
         if (records.length > 0) {
           target = records[0].target;
-          experiment = records[0].experiment;
+          experiment = records[0].experiment ?? experiment;
           passRate = records.filter((r) => r.score >= passThreshold).length / records.length;
         }
       } catch {
@@ -1041,12 +1053,12 @@ export function createApp(
         const { runs: metas } = await listMergedResultFiles(p.path);
         for (const m of metas) {
           let target: string | undefined;
-          let experiment: string | undefined;
+          let experiment = inferExperimentFromRunId(m.raw_filename);
           try {
             const records = loadLightweightResults(m.path);
             if (records.length > 0) {
               target = records[0].target;
-              experiment = records[0].experiment;
+              experiment = records[0].experiment ?? experiment;
             }
           } catch {
             // ignore enrichment errors
diff --git a/apps/cli/test/commands/results/serve.test.ts b/apps/cli/test/commands/results/serve.test.ts
index 702cd9ec..bf81540f 100644
--- a/apps/cli/test/commands/results/serve.test.ts
+++ b/apps/cli/test/commands/results/serve.test.ts
@@ -3,6 +3,8 @@ import { existsSync, mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync
 import { tmpdir } from 'node:os';
 import path from 'node:path';
 
+import { addBenchmark } from '@agentv/core';
+
 import {
   createApp,
   loadResults,
@@ -485,6 +487,28 @@ describe('serve app', () => {
       expect(data.runs[0].pass_rate).toBe(1);
     });
 
+    it('infers the experiment name from the run id when live results have not written it yet', async () => {
+      // Fixture layout: runs/<experiment>/<timestamp>/index.jsonl. Creating the
+      // innermost directory recursively also creates its parents.
+      const expDir = path.join(tempDir, '.agentv', 'results', 'runs', 'issue-1198-live-name');
+      const stampDir = path.join(expDir, '2026-03-25T12-00-00-000Z');
+      mkdirSync(stampDir, { recursive: true });
+      writeFileSync(path.join(stampDir, 'index.jsonl'), toJsonl(RESULT_A));
+
+      const app = createApp([], tempDir, tempDir, undefined, { studioDir });
+      const response = await app.request('/api/runs');
+      expect(response.status).toBe(200);
+
+      type RunsPayload = { runs: Array<{ experiment?: string; target?: string }> };
+      const { runs } = (await response.json()) as RunsPayload;
+      expect(runs).toHaveLength(1);
+      // The experiment is expected to equal the directory name used above.
+      expect(runs[0]).toMatchObject({
+        experiment: 'issue-1198-live-name',
+        target: 'gpt-4o',
+      });
+    });
+
     it('merges cached remote runs and tags them with remote source metadata', async () => {
       const previousHome = process.env.AGENTV_HOME;
       process.env.AGENTV_HOME = path.join(tempDir, 'agentv-home');
@@ -536,6 +560,48 @@ describe('serve app', () => {
     });
   });
 
+  describe('GET /api/benchmarks/all-runs', () => {
+    it('infers experiment names for live benchmark runs before records persist them', async () => {
+      const previousHome = process.env.AGENTV_HOME;
+      process.env.AGENTV_HOME = path.join(tempDir, 'agentv-home');
+
+      try {
+        const benchmarkDir = path.join(tempDir, 'bench-one');
+        const runDir = path.join(
+          benchmarkDir,
+          '.agentv',
+          'results',
+          'runs',
+          'issue-1198-benchmark',
+          '2026-03-25T12-00-00-000Z',
+        );
+        mkdirSync(runDir, { recursive: true });
+        writeFileSync(path.join(runDir, 'index.jsonl'), toJsonl(RESULT_A));
+        addBenchmark(benchmarkDir);
+
+        const app = createApp([], tempDir, tempDir, undefined, { studioDir });
+        const res = await app.request('/api/benchmarks/all-runs');
+
+        expect(res.status).toBe(200);
+        const data = (await res.json()) as {
+          runs: Array<{ benchmark_id: string; experiment?: string; target?: string }>;
+        };
+        expect(data.runs).toHaveLength(1);
+        expect(data.runs[0]).toMatchObject({
+          benchmark_id: 'bench-one',
+          experiment: 'issue-1198-benchmark',
+          target: 'gpt-4o',
+        });
+      } finally {
+        // Unset rather than assign undefined, which Node stores as 'undefined'.
+        Reflect.deleteProperty(process.env, 'AGENTV_HOME');
+        if (previousHome !== undefined) {
+          process.env.AGENTV_HOME = previousHome;
+        }
+      }
+    });
+  });
+
   describe('GET /api/remote/status', () => {
     it('reports configured remote status with graceful local-only fallback', async () => {
       mkdirSync(path.join(tempDir, '.agentv'), { recursive: true });
diff --git a/apps/studio/src/components/RunList.tsx b/apps/studio/src/components/RunList.tsx
index 1d3f7be7..03f90779 100644
--- a/apps/studio/src/components/RunList.tsx
+++ b/apps/studio/src/components/RunList.tsx
@@ -11,6 +11,7 @@ import type React from 'react';
 import { Link } from '@tanstack/react-router';
 
 import { DEFAULT_PASS_THRESHOLD, useStudioConfig } from '~/lib/api';
+import { formatRunLabel } from '~/lib/run-label';
 import type { RunMeta } from '~/lib/types';
 
 import { PassRatePill } from './PassRatePill';
@@ -41,14 +42,6 @@ function formatDate(ts: string | undefined | null): { date: string; full: string
   }
 }
 
-/** Human-readable run label: "target · experiment" or filename fallback. */
-function runLabel(run: RunMeta): string {
-  const parts = [run.target, run.experiment].filter((p) => p && p !== 'default' && p !== '-');
-  if (parts.length > 0) return parts.join(' · ');
-  if (run.target) return run.target;
-  return run.display_name ?? run.filename;
-}
-
 export function RunList({ runs, benchmarkId, emptyMessage }: RunListProps) {
   const { data: config } = useStudioConfig();
   const passThreshold = config?.threshold ?? DEFAULT_PASS_THRESHOLD;
@@ -89,7 +82,7 @@ export function RunList({ runs, benchmarkId, emptyMessage }: RunListProps) {
           {runs.map((run) => {
             const ts = formatDate(run.timestamp);
             const passing = run.pass_rate >= passThreshold;
-            const label = runLabel(run);
+            const label = formatRunLabel(run);
             const passedCount = Math.round(run.pass_rate * run.test_count);
             const failedCount = run.test_count - passedCount;
             return (
diff --git a/apps/studio/src/components/Sidebar.tsx b/apps/studio/src/components/Sidebar.tsx
index 5933d791..9ae5cc8d 100644
--- a/apps/studio/src/components/Sidebar.tsx
+++ b/apps/studio/src/components/Sidebar.tsx
@@ -29,6 +29,7 @@ import {
   useRunList,
   useStudioConfig,
 } from '~/lib/api';
+import { formatRunLabel } from '~/lib/run-label';
 import { useSidebarContext } from '~/lib/sidebar-context';
 
 /** Responsive <aside> wrapper. Handles mobile overlay and desktop static placement. */
@@ -213,7 +214,7 @@ function RunSidebar() {
                 className="mb-0.5 block truncate rounded-md px-2 py-1.5 text-sm text-gray-400 transition-colors hover:bg-gray-800/50 hover:text-gray-200"
                 title={run.benchmark_name}
               >
-                {run.display_name ?? run.filename}
+                {formatRunLabel(run)}
               </Link>
             );
           }
@@ -229,7 +230,7 @@ function RunSidebar() {
                   : 'text-gray-400 hover:bg-gray-800/50 hover:text-gray-200'
               }`}
             >
-              {run.display_name ?? run.filename}
+              {formatRunLabel(run)}
             </Link>
           );
         })}
@@ -451,7 +452,7 @@ function BenchmarkRunDetailSidebar({
                   : 'text-gray-400 hover:bg-gray-800/50 hover:text-gray-200'
               }`}
             >
-              {run.display_name ?? run.filename}
+              {formatRunLabel(run)}
             </Link>
           );
         })}
diff --git a/apps/studio/src/lib/run-label.test.ts b/apps/studio/src/lib/run-label.test.ts
new file mode 100644
index 00000000..57628799
--- /dev/null
+++ b/apps/studio/src/lib/run-label.test.ts
@@ -0,0 +1,25 @@
+import { describe, expect, it } from 'bun:test';
+
+import { formatRunLabel } from './run-label';
+
+describe('formatRunLabel', () => {
+  it('prefers target and experiment over the timestamp display name', () => {
+    // display_name is set too, so this pins the precedence of target/experiment.
+    const label = formatRunLabel({
+      filename: 'issue-1198::2026-04-29T09-17-30-111Z',
+      display_name: '2026-04-29T09-17-30-111Z',
+      target: 'llm-dry-run',
+      experiment: 'issue-1198',
+    });
+    expect(label).toBe('llm-dry-run · issue-1198');
+  });
+
+  it('falls back to the display name when no richer metadata is available', () => {
+    // Neither target nor experiment is supplied here.
+    const bareRun = {
+      filename: '2026-04-29T09-17-30-111Z',
+      display_name: '2026-04-29T09-17-30-111Z',
+    };
+    expect(formatRunLabel(bareRun)).toBe('2026-04-29T09-17-30-111Z');
+  });
+});
diff --git a/apps/studio/src/lib/run-label.ts b/apps/studio/src/lib/run-label.ts
new file mode 100644
index 00000000..8f0ded6e
--- /dev/null
+++ b/apps/studio/src/lib/run-label.ts
@@ -0,0 +1,14 @@
+import type { RunMeta } from './types';
+
+type RunLabelInput = Pick<RunMeta, 'display_name' | 'experiment' | 'filename' | 'target'>;
+
+/** Label a run as "target · experiment", else its display name or filename. */
+export function formatRunLabel(run: RunLabelInput): string {
+  // Empty values, 'default' and '-' are left out of the label.
+  const segments = [run.target, run.experiment].filter(
+    (value): value is string => Boolean(value) && value !== 'default' && value !== '-',
+  );
+
+  const fallback = run.display_name ?? run.filename;
+  return segments.length === 0 ? fallback : segments.join(' · ');
+}

From 3e21872940d8ab8b5685252512a5bf1c7cc39ffe Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Thu, 30 Apr 2026 03:41:19 +0200
Subject: [PATCH 2/2] test: stabilize benchmark live-name regression

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 apps/cli/test/commands/results/serve.test.ts | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/apps/cli/test/commands/results/serve.test.ts b/apps/cli/test/commands/results/serve.test.ts
index bf81540f..5c411718 100644
--- a/apps/cli/test/commands/results/serve.test.ts
+++ b/apps/cli/test/commands/results/serve.test.ts
@@ -577,7 +577,7 @@ describe('serve app', () => {
         );
         mkdirSync(runDir, { recursive: true });
         writeFileSync(path.join(runDir, 'index.jsonl'), toJsonl(RESULT_A));
-        addBenchmark(benchmarkDir);
+        const benchmark = addBenchmark(benchmarkDir);
 
         const app = createApp([], tempDir, tempDir, undefined, { studioDir });
         const res = await app.request('/api/benchmarks/all-runs');
@@ -588,7 +588,7 @@ describe('serve app', () => {
         };
         expect(data.runs).toHaveLength(1);
         expect(data.runs[0]).toMatchObject({
-          benchmark_id: 'bench-one',
+          benchmark_id: benchmark.id,
           experiment: 'issue-1198-benchmark',
           target: 'gpt-4o',
         });