From dbf1920c00edde01183014fce1627b98bf5511f9 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Mon, 20 Apr 2026 06:53:02 +0200 Subject: [PATCH 01/12] feat(studio): runtime benchmark discovery without server restart Persist optional discoveryRoots in ~/.agentv/projects.yaml and resolve the effective benchmark set at request time so repos appearing or disappearing under a root are reflected in Studio without restarting `agentv serve`. - Add BenchmarkEntry.source ('manual' | 'discovered') and BenchmarkRegistry.discoveryRoots; keep YAML unchanged when empty. - Add resolveActiveBenchmarks / getActiveBenchmark: merges persisted entries with a live rescan of every root; persisted wins on path conflict; discovered entries are never written to disk. - Route /api/benchmarks, /api/benchmarks/all-runs, /api/benchmarks/:id/summary and withBenchmark / registerEvalRoutes through the active list so discovered repos participate in every benchmark-scoped route. - New HTTP endpoints: GET/POST/DELETE /api/benchmarks/discovery-roots and POST /api/benchmarks/rescan. DELETE /api/benchmarks/:id rejects discovered entries with a clear error. - New --discovery-root CLI flag (repeatable) that persists a root and continues to start the server; --discover's one-shot semantics are preserved. - Count active benchmarks when picking single/multi dashboard mode. - Unit tests in packages/core/test/benchmarks.test.ts cover add/remove/ idempotency, live appear/disappear, manual-vs-discovered precedence, and standalone manual entries. Closes #1144. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- apps/cli/src/commands/results/serve.ts | 162 ++++++++++++++++-- .../plans/1144-runtime-benchmark-discovery.md | 70 ++++++++ packages/core/src/benchmarks.ts | 139 ++++++++++++++- packages/core/src/index.ts | 6 + packages/core/test/benchmarks.test.ts | 97 +++++++++++ 5 files changed, 450 insertions(+), 24 deletions(-) create mode 100644 docs/plans/1144-runtime-benchmark-discovery.md create mode 100644 packages/core/test/benchmarks.test.ts diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index eeb3a4d6..901b2bb2 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -11,7 +11,9 @@ * - GET /api/runs/:filename — load results from a specific run workspace * - GET /api/feedback — read feedback reviews * - POST /api/feedback — write feedback reviews - * - GET /api/benchmarks — list registered benchmarks + * - GET /api/benchmarks — list active benchmarks (persisted + live-discovered) + * - POST /api/benchmarks/rescan — force a discovery-root rescan + * - GET/POST/DELETE /api/benchmarks/discovery-roots — manage runtime discovery roots * - GET /api/benchmarks/:benchmarkId/runs — benchmark-scoped run list * * All data routes (runs, suites, categories, evals, experiments, targets) @@ -32,17 +34,30 @@ import { existsSync, readFileSync, readdirSync, statSync, writeFileSync } from 'node:fs'; import path from 'node:path'; import { fileURLToPath } from 'node:url'; -import { command, flag, number, option, optional, positional, string } from 'cmd-ts'; +import { + array, + command, + flag, + multioption, + number, + option, + optional, + positional, + string, +} from 'cmd-ts'; import { DEFAULT_CATEGORY, type EvaluationResult, addBenchmark, + addDiscoveryRoot, discoverBenchmarks, - getBenchmark, - loadBenchmarkRegistry, + getActiveBenchmark, + getDiscoveryRoots, loadConfig, removeBenchmark, + removeDiscoveryRoot, + resolveActiveBenchmarks, } from 
'@agentv/core'; import type { Context } from 'hono'; import { Hono } from 'hono'; @@ -897,11 +912,13 @@ export function createApp( // ── Benchmark resolution wrapper ────────────────────────────────────── // Resolves benchmarkId → DataContext, returning 404 if not found. + // Looks up against the *active* set (persisted + live-discovered) so repos + // under a configured discovery root resolve without a server restart. function withBenchmark( c: C, handler: (c: C, ctx: DataContext) => Response | Promise, ): Response | Promise { - const benchmark = getBenchmark(c.req.param('benchmarkId') ?? ''); + const benchmark = getActiveBenchmark(c.req.param('benchmarkId') ?? ''); if (!benchmark || !existsSync(benchmark.path)) { return c.json({ error: 'Project not found' }, 404); } @@ -940,6 +957,7 @@ export function createApp( path: string; addedAt: string; lastOpenedAt: string; + source?: 'manual' | 'discovered'; }) { return { id: entry.id, @@ -947,13 +965,14 @@ export function createApp( path: entry.path, added_at: entry.addedAt, last_opened_at: entry.lastOpenedAt, + source: entry.source ?? 'manual', }; } app.get('/api/benchmarks', async (c) => { - const registry = loadBenchmarkRegistry(); + const active = resolveActiveBenchmarks(); const benchmarks = await Promise.all( - registry.benchmarks.map(async (p) => { + active.map(async (p) => { let runCount = 0; let passRate = 0; let lastRun: string | null = null; @@ -997,13 +1016,24 @@ export function createApp( if (readOnly) { return c.json({ error: 'Studio is running in read-only mode' }, 403); } - const removed = removeBenchmark(c.req.param('benchmarkId') ?? ''); + const benchmarkId = c.req.param('benchmarkId') ?? ''; + const active = getActiveBenchmark(benchmarkId); + if (active?.source === 'discovered') { + return c.json( + { + error: + 'This project was discovered from a configured root. 
Remove the root or delete its .agentv/ directory to drop it.', + }, + 400, + ); + } + const removed = removeBenchmark(benchmarkId); if (!removed) return c.json({ error: 'Project not found' }, 404); return c.json({ ok: true }); }); app.get('/api/benchmarks/:benchmarkId/summary', async (c) => { - const benchmark = getBenchmark(c.req.param('benchmarkId') ?? ''); + const benchmark = getActiveBenchmark(c.req.param('benchmarkId') ?? ''); if (!benchmark) return c.json({ error: 'Project not found' }, 404); try { const { runs: metas } = await listMergedResultFiles(benchmark.path); @@ -1038,9 +1068,9 @@ export function createApp( } }); - /** Aggregate runs from all registered benchmarks, sorted by timestamp descending. */ + /** Aggregate runs from all active benchmarks, sorted by timestamp descending. */ app.get('/api/benchmarks/all-runs', async (c) => { - const registry = loadBenchmarkRegistry(); + const active = resolveActiveBenchmarks(); const allRuns: Array<{ filename: string; display_name: string; @@ -1057,7 +1087,7 @@ export function createApp( project_name: string; }> = []; - for (const p of registry.benchmarks) { + for (const p of active) { try { const { runs: metas } = await listMergedResultFiles(p.path); for (const m of metas) { @@ -1097,6 +1127,74 @@ export function createApp( return c.json({ runs: allRuns }); }); + // ── Discovery roots (runtime benchmark auto-discovery) ─────────────── + // Roots are persisted in ~/.agentv/projects.yaml. On each GET + // /api/benchmarks, Studio rescans them and surfaces new `.agentv/` repos — + // no server restart required (#1144). 
+ + app.get('/api/benchmarks/discovery-roots', (c) => { + return c.json({ roots: getDiscoveryRoots() }); + }); + + app.post('/api/benchmarks/discovery-roots', async (c) => { + if (readOnly) { + return c.json({ error: 'Studio is running in read-only mode' }, 403); + } + try { + const body = await c.req.json<{ path: string }>(); + if (!body.path) return c.json({ error: 'Missing path' }, 400); + const root = addDiscoveryRoot(body.path); + return c.json({ root }, 201); + } catch (err) { + return c.json({ error: (err as Error).message }, 400); + } + }); + + app.delete('/api/benchmarks/discovery-roots', async (c) => { + if (readOnly) { + return c.json({ error: 'Studio is running in read-only mode' }, 403); + } + try { + const body = await c.req.json<{ path: string }>(); + if (!body.path) return c.json({ error: 'Missing path' }, 400); + const removed = removeDiscoveryRoot(body.path); + if (!removed) return c.json({ error: 'Root not found' }, 404); + return c.json({ ok: true }); + } catch (err) { + return c.json({ error: (err as Error).message }, 400); + } + }); + + /** Explicit rescan hook — useful when the UI wants a refresh without the poll tick. 
*/ + app.post('/api/benchmarks/rescan', async (c) => { + const active = resolveActiveBenchmarks(); + const benchmarks = await Promise.all( + active.map(async (p) => { + let runCount = 0; + let passRate = 0; + let lastRun: string | null = null; + try { + const { runs: metas } = await listMergedResultFiles(p.path); + runCount = metas.length; + if (metas.length > 0) { + const totalPassRate = metas.reduce((sum, m) => sum + m.passRate, 0); + passRate = totalPassRate / metas.length; + lastRun = metas[0].timestamp; + } + } catch { + // inaccessible + } + return { + ...benchmarkEntryToWire(p), + run_count: runCount, + pass_rate: passRate, + last_run: lastRun, + }; + }), + ); + return c.json({ projects: benchmarks }); + }); + // ── Data routes (unscoped) ──────────────────────────────────────────── app.get('/api/config', (c) => @@ -1276,7 +1374,7 @@ export function createApp( // For benchmark-scoped routes, resolve to benchmark path; otherwise use searchDir const benchmarkId = c.req.param('benchmarkId'); if (benchmarkId) { - const benchmark = getBenchmark(benchmarkId); + const benchmark = getActiveBenchmark(benchmarkId); if (benchmark) return benchmark.path; } return searchDir; @@ -1408,14 +1506,31 @@ export const resultsServeCommand = command({ discover: option({ type: optional(string), long: 'discover', - description: 'Scan a directory tree for repos with .agentv/', + description: 'Scan a directory tree for repos with .agentv/ (one-shot; exits after)', + }), + discoveryRoot: multioption({ + type: array(string), + long: 'discovery-root', + description: + 'Persist a directory that Studio continuously rescans for .agentv/ repos. 
Repeatable.', }), readOnly: flag({ long: 'read-only', description: 'Disable write operations and launch Studio in read-only leaderboard mode', }), }, - handler: async ({ source, port, dir, multi, single, add, remove, discover, readOnly }) => { + handler: async ({ + source, + port, + dir, + multi, + single, + add, + remove, + discover, + discoveryRoot, + readOnly, + }) => { const cwd = dir ?? process.cwd(); const listenPort = port ?? (process.env.PORT ? Number(process.env.PORT) : 3117); @@ -1456,6 +1571,15 @@ export const resultsServeCommand = command({ return; } + // Persist --discovery-root paths before starting the server. The server + // keeps running after this so Studio continuously rescans the roots. + if (discoveryRoot.length > 0) { + for (const root of discoveryRoot) { + const abs = addDiscoveryRoot(root); + console.log(`Watching discovery root: ${abs}`); + } + } + // ── Version check ──────────────────────────────────────────────── // Enforce `required_version` from .agentv/config.yaml so Studio/serve // match `agentv eval` behavior. Same prompt in TTY, warn+continue @@ -1469,8 +1593,10 @@ export const resultsServeCommand = command({ } // ── Determine multi-project mode ──────────────────────────────── - const registry = loadBenchmarkRegistry(); - const { isMultiProject, showMultiWarning } = resolveDashboardMode(registry.benchmarks.length, { + // Count active (persisted + live-discovered) benchmarks so that the + // dashboard mode reflects what the user will actually see in the UI. 
+ const activeBenchmarks = resolveActiveBenchmarks(); + const { isMultiProject, showMultiWarning } = resolveDashboardMode(activeBenchmarks.length, { multi, single, }); @@ -1515,7 +1641,7 @@ export const resultsServeCommand = command({ } if (isMultiProject) { - console.log(`Multi-project mode: ${registry.benchmarks.length} project(s) registered`); + console.log(`Multi-project mode: ${activeBenchmarks.length} project(s) active`); } else if (results.length > 0 && sourceFile) { console.log(`Serving ${results.length} result(s) from ${sourceFile}`); } else { diff --git a/docs/plans/1144-runtime-benchmark-discovery.md b/docs/plans/1144-runtime-benchmark-discovery.md new file mode 100644 index 00000000..442e0a7e --- /dev/null +++ b/docs/plans/1144-runtime-benchmark-discovery.md @@ -0,0 +1,70 @@ +# Studio Runtime Benchmark Discovery (#1144) + +## Problem +Studio reads `~/.agentv/projects.yaml` fresh on every `/api/benchmarks` request, so +edits to that file are already picked up live. What doesn't work is **filesystem +discovery**: `--discover <path>` is a one-shot scan at startup, so any `.agentv/` +repo that appears/disappears under that path while `agentv serve` is running is +invisible until restart. + +## Design + +### Persisted state (projects.yaml) +Extend `BenchmarkRegistry` with an optional `discoveryRoots?: string[]`. This is +the persisted list of directories Studio should continuously scan for +`.agentv/` repos. Existing `benchmarks` entries remain untouched. + +### Active-vs-persisted split +Introduce `resolveActiveBenchmarks()` in `packages/core/src/benchmarks.ts`: +- Start with the persisted `benchmarks` array (manually added entries). +- For each discovery root, call `discoverBenchmarks(root)` and generate + synthetic entries with `source: 'discovered'`. Absolute path is the identity; + id is derived from basename + dedup against persisted ids. +- Persisted wins on path conflict (so a user can opt a discovered repo into + manual management). 
+- Return the merged list. Nothing is written to disk. + +This is cheap (depth-2 `readdirSync`) and avoids write contention. Discovered +entries are ephemeral — removing a `.agentv/` directory causes the next scan to +drop it. Manually-added entries are never auto-removed. + +### API changes (apps/cli/src/commands/results/serve.ts) +- `/api/benchmarks`, `/api/benchmarks/all-runs`, `/api/benchmarks/:id/summary`, + and `withBenchmark()` switch from `loadBenchmarkRegistry()` / + `getBenchmark()` to the resolved list, so discovered entries participate in + every benchmark-scoped route. +- New endpoints: + - `GET /api/benchmarks/discovery-roots` → `{ roots: string[] }` + - `POST /api/benchmarks/discovery-roots` `{ path }` → `{ root }` + - `DELETE /api/benchmarks/discovery-roots` `{ path }` → `{ ok: true }` + - `POST /api/benchmarks/rescan` → same shape as `GET /api/benchmarks` + +### CLI changes +Add `--discovery-root <path>` (repeatable via `multioption`). Paths are resolved +to absolute and appended to the persisted `discoveryRoots` (idempotent). The +server still starts — this is not a one-shot flag. + +The existing `--discover <path>` flag keeps its one-shot semantics for backward +compatibility. + +### Wire format +Discovered entries return `source: "discovered"` in the snake_case response so +the frontend can optionally disable the Remove button for them. The default is +`"manual"` (preserving the existing response shape for registered repos). + +## Acceptance-criteria mapping + +| Criterion | Handled by | +| ------------------------------------------- | --------------------------------------- | +| Start with zero projects, stay healthy | Already works; no change | +| New `.agentv/` repo appears without restart | `resolveActiveBenchmarks()` on each GET | +| Removed repo disappears without restart | Same — scan is recomputed per request | +| `/api/benchmarks` reflects live state | Same | + +## Test plan +1. 
Unit test `resolveActiveBenchmarks` with temp directories (add + remove + `.agentv/` and assert the returned list reflects it). +2. Unit test that persisted entries win over discovered ones at the same path. +3. Red/green UAT: start `agentv serve --discovery-root <root>`; `curl + /api/benchmarks` → empty; `mkdir -p <root>/r1/.agentv`; re-curl → shows `r1`; + `rm -rf <root>/r1/.agentv`; re-curl → gone. Same server process throughout. diff --git a/packages/core/src/benchmarks.ts b/packages/core/src/benchmarks.ts index 53a8a2ed..7b7ae5fe 100644 --- a/packages/core/src/benchmarks.ts +++ b/packages/core/src/benchmarks.ts @@ -2,7 +2,9 @@ * Benchmark registry for AgentV Studio multi-benchmark support. * * A Benchmark = any directory containing a `.agentv/` folder. - * The registry lives at `~/.agentv/projects.yaml` and tracks registered benchmarks. + * The registry lives at `~/.agentv/projects.yaml` and tracks registered benchmarks + * plus an optional list of discovery roots that Studio continuously rescans at + * runtime so repos can appear/disappear without a server restart (#1144). * * YAML format: * benchmarks: @@ -11,9 +13,21 @@ * path: /home/user/projects/my-app * addedAt: "2026-03-20T10:00:00Z" * lastOpenedAt: "2026-03-30T14:00:00Z" + * discoveryRoots: + * - /home/user/agentv-repos * - * To extend: use loadBenchmarkRegistry() / saveBenchmarkRegistry() for CRUD, - * discoverBenchmarks() to scan a directory tree for `.agentv/` directories. + * Runtime model: + * - Entries in `benchmarks` are persisted (manual add/remove). + * - Entries under `discoveryRoots` are resolved live on each call to + * `resolveActiveBenchmarks()` — they are NOT written to disk. This means + * a repo appearing or disappearing under a root is reflected immediately, + * and manual entries are never auto-removed. + * + * To extend: + * - For CRUD on persisted entries: loadBenchmarkRegistry() / saveBenchmarkRegistry(). 
+ * - For live discovery: addDiscoveryRoot() / removeDiscoveryRoot() / + * resolveActiveBenchmarks(). + * - discoverBenchmarks() scans a single directory tree for `.agentv/` folders. */ import { @@ -33,16 +47,22 @@ import { getAgentvConfigDir, getAgentvHome } from './paths.js'; // ── Types ─────────────────────────────────────────────────────────────── +export type BenchmarkSource = 'manual' | 'discovered'; + export interface BenchmarkEntry { id: string; name: string; path: string; addedAt: string; lastOpenedAt: string; + /** How this entry was registered. Absent (undefined) ≡ 'manual'. */ + source?: BenchmarkSource; } export interface BenchmarkRegistry { benchmarks: BenchmarkEntry[]; + /** Directories continuously rescanned for `.agentv/` repos. Optional. */ + discoveryRoots?: string[]; } // ── Registry path ─────────────────────────────────────────────────────── @@ -79,10 +99,16 @@ export function loadBenchmarkRegistry(): BenchmarkRegistry { try { const raw = readFileSync(registryPath, 'utf-8'); const parsed = parseYaml(raw); - if (!parsed || !Array.isArray(parsed.benchmarks)) { + if (!parsed || typeof parsed !== 'object') { return { benchmarks: [] }; } - return { benchmarks: parsed.benchmarks as BenchmarkEntry[] }; + const benchmarks = Array.isArray(parsed.benchmarks) + ? (parsed.benchmarks as BenchmarkEntry[]) + : []; + const discoveryRoots = Array.isArray(parsed.discoveryRoots) + ? (parsed.discoveryRoots as unknown[]).filter((v): v is string => typeof v === 'string') + : undefined; + return discoveryRoots !== undefined ? 
{ benchmarks, discoveryRoots } : { benchmarks }; } catch { return { benchmarks: [] }; } @@ -94,7 +120,13 @@ export function saveBenchmarkRegistry(registry: BenchmarkRegistry): void { if (!existsSync(dir)) { mkdirSync(dir, { recursive: true }); } - writeFileSync(registryPath, stringifyYaml({ benchmarks: registry.benchmarks }), 'utf-8'); + // Omit empty/undefined discoveryRoots from the serialized form so existing + // registries without the feature don't grow a stray key. + const payload: Record = { benchmarks: registry.benchmarks }; + if (registry.discoveryRoots && registry.discoveryRoots.length > 0) { + payload.discoveryRoots = registry.discoveryRoots; + } + writeFileSync(registryPath, stringifyYaml(payload), 'utf-8'); } // ── CRUD operations ───────────────────────────────────────────────────── @@ -226,3 +258,98 @@ export function discoverBenchmarks(rootDir: string, maxDepth = 2): string[] { scan(absRoot, 0); return results; } + +// ── Discovery roots (persisted) ───────────────────────────────────────── + +/** + * Return the persisted discovery roots as absolute paths. Never returns undefined. + */ +export function getDiscoveryRoots(): string[] { + const registry = loadBenchmarkRegistry(); + return [...(registry.discoveryRoots ?? [])]; +} + +/** + * Add an absolute discovery root to the persisted registry (idempotent). + * Returns the resolved absolute path. Does NOT validate that the directory + * currently exists — a root may become populated after Studio starts. + */ +export function addDiscoveryRoot(rootPath: string): string { + const absRoot = path.resolve(rootPath); + const registry = loadBenchmarkRegistry(); + const roots = registry.discoveryRoots ?? []; + if (!roots.includes(absRoot)) { + roots.push(absRoot); + } + saveBenchmarkRegistry({ benchmarks: registry.benchmarks, discoveryRoots: roots }); + return absRoot; +} + +/** + * Remove a discovery root. Returns true if it was present, false otherwise. 
+ */ +export function removeDiscoveryRoot(rootPath: string): boolean { + const absRoot = path.resolve(rootPath); + const registry = loadBenchmarkRegistry(); + const roots = registry.discoveryRoots ?? []; + const idx = roots.indexOf(absRoot); + if (idx < 0) return false; + roots.splice(idx, 1); + saveBenchmarkRegistry({ benchmarks: registry.benchmarks, discoveryRoots: roots }); + return true; +} + +// ── Active benchmarks (persisted + live-discovered) ───────────────────── + +/** + * Return the effective benchmark list: persisted entries merged with a live + * scan of every discovery root. Discovered entries are synthesized on the fly + * (tagged `source: 'discovered'`) and are NOT written to disk, so a repo + * disappearing from a root drops out of subsequent calls. Persisted entries + * win on absolute-path conflict, letting a user opt a discovered repo into + * manual management. + */ +export function resolveActiveBenchmarks(): BenchmarkEntry[] { + const registry = loadBenchmarkRegistry(); + const persisted = registry.benchmarks.map((b) => ({ + ...b, + source: b.source ?? ('manual' as const), + })); + const roots = registry.discoveryRoots ?? []; + if (roots.length === 0) return persisted; + + const takenPaths = new Set(persisted.map((b) => b.path)); + const takenIds = new Set(persisted.map((b) => b.id)); + const discovered: BenchmarkEntry[] = []; + for (const root of roots) { + for (const repoPath of discoverBenchmarks(root)) { + if (takenPaths.has(repoPath)) continue; + takenPaths.add(repoPath); + const id = deriveBenchmarkId(repoPath, [...takenIds]); + takenIds.add(id); + // Synthetic timestamps: use the .agentv dir mtime if readable, else now. + let ts = new Date().toISOString(); + try { + ts = statSync(path.join(repoPath, '.agentv')).mtime.toISOString(); + } catch { + // Keep the fallback timestamp. 
+ } + discovered.push({ + id, + name: path.basename(repoPath), + path: repoPath, + addedAt: ts, + lastOpenedAt: ts, + source: 'discovered', + }); + } + } + return [...persisted, ...discovered]; +} + +/** + * Look up an active benchmark (persisted or discovered) by id. + */ +export function getActiveBenchmark(benchmarkId: string): BenchmarkEntry | undefined { + return resolveActiveBenchmarks().find((b) => b.id === benchmarkId); +} diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index 3e9a475d..f418339a 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -88,6 +88,7 @@ export { export { type BenchmarkEntry, type BenchmarkRegistry, + type BenchmarkSource, loadBenchmarkRegistry, saveBenchmarkRegistry, addBenchmark, @@ -97,6 +98,11 @@ export { discoverBenchmarks, deriveBenchmarkId, getBenchmarksRegistryPath, + getDiscoveryRoots, + addDiscoveryRoot, + removeDiscoveryRoot, + resolveActiveBenchmarks, + getActiveBenchmark, } from './benchmarks.js'; export { trimBaselineResult } from './evaluation/baseline.js'; export { DEFAULT_CATEGORY, deriveCategory } from './evaluation/category.js'; diff --git a/packages/core/test/benchmarks.test.ts b/packages/core/test/benchmarks.test.ts new file mode 100644 index 00000000..116e1c24 --- /dev/null +++ b/packages/core/test/benchmarks.test.ts @@ -0,0 +1,97 @@ +import { afterEach, beforeEach, describe, expect, it, spyOn } from 'bun:test'; +import { mkdirSync, mkdtempSync, rmSync } from 'node:fs'; +import os from 'node:os'; +import path from 'node:path'; + +import { + addBenchmark, + addDiscoveryRoot, + getDiscoveryRoots, + loadBenchmarkRegistry, + removeDiscoveryRoot, + resolveActiveBenchmarks, +} from '../src/benchmarks.js'; + +describe('benchmarks registry + runtime discovery', () => { + let fakeHome: string; + let reposRoot: string; + // biome-ignore lint/suspicious/noExplicitAny: spy typing from bun:test is intentionally loose. 
+ let homedirSpy: any; + + beforeEach(() => { + fakeHome = mkdtempSync(path.join(os.tmpdir(), 'agentv-benchmarks-')); + reposRoot = mkdtempSync(path.join(os.tmpdir(), 'agentv-repos-')); + homedirSpy = spyOn(os, 'homedir').mockReturnValue(fakeHome); + }); + + afterEach(() => { + homedirSpy?.mockRestore?.(); + rmSync(fakeHome, { recursive: true, force: true }); + rmSync(reposRoot, { recursive: true, force: true }); + }); + + function makeRepo(name: string): string { + const dir = path.join(reposRoot, name); + mkdirSync(path.join(dir, '.agentv'), { recursive: true }); + return dir; + } + + it('persists and lists discovery roots, omitting the key when empty', () => { + expect(getDiscoveryRoots()).toEqual([]); + expect(loadBenchmarkRegistry().discoveryRoots).toBeUndefined(); + + const added = addDiscoveryRoot(reposRoot); + expect(added).toBe(path.resolve(reposRoot)); + expect(getDiscoveryRoots()).toEqual([path.resolve(reposRoot)]); + + // Adding the same root again is idempotent. + addDiscoveryRoot(reposRoot); + expect(getDiscoveryRoots()).toEqual([path.resolve(reposRoot)]); + + expect(removeDiscoveryRoot(reposRoot)).toBe(true); + expect(getDiscoveryRoots()).toEqual([]); + expect(loadBenchmarkRegistry().discoveryRoots).toBeUndefined(); + }); + + it('surfaces repos appearing under a discovery root without restart', () => { + addDiscoveryRoot(reposRoot); + + expect(resolveActiveBenchmarks()).toEqual([]); + + makeRepo('r1'); + const afterAdd = resolveActiveBenchmarks(); + expect(afterAdd).toHaveLength(1); + expect(afterAdd[0]).toMatchObject({ + name: 'r1', + path: path.resolve(reposRoot, 'r1'), + source: 'discovered', + }); + + // Simulate removal: rm -rf the repo dir. 
+ rmSync(path.join(reposRoot, 'r1'), { recursive: true, force: true }); + expect(resolveActiveBenchmarks()).toEqual([]); + }); + + it('keeps manually-added entries even when their path is not under a root', () => { + const outside = makeRepo('manual'); + const entry = addBenchmark(outside); + + const active = resolveActiveBenchmarks(); + expect(active).toHaveLength(1); + expect(active[0].id).toBe(entry.id); + expect(active[0].source).toBe('manual'); + }); + + it('prefers the persisted entry when a discovery root would produce a duplicate path', () => { + const repoPath = makeRepo('shared'); + // Register manually first. + const manual = addBenchmark(repoPath); + // Then configure a discovery root covering the same repo. + addDiscoveryRoot(reposRoot); + + const active = resolveActiveBenchmarks(); + expect(active).toHaveLength(1); + expect(active[0].id).toBe(manual.id); + expect(active[0].source).toBe('manual'); + }); +}); From 31f8b5a259618fcbed3368755e9bcab8a57052c6 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Mon, 20 Apr 2026 07:00:47 +0200 Subject: [PATCH 02/12] review: address code-review notes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Drop external-issue-reference comments per AGENTS.md §7 (AI-First). - Document single-writer assumption in benchmarks.ts header; the existing read-modify-write model is safe for the single-process Studio case that motivated the change. - Sort discoverBenchmarks output so id assignment under basename collisions is deterministic across filesystems. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- apps/cli/src/commands/results/serve.ts | 2 +- packages/core/src/benchmarks.ts | 16 +++++++++++++--- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index 901b2bb2..8ee3c8e0 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -1130,7 +1130,7 @@ export function createApp( // ── Discovery roots (runtime benchmark auto-discovery) ─────────────── // Roots are persisted in ~/.agentv/projects.yaml. On each GET // /api/benchmarks, Studio rescans them and surfaces new `.agentv/` repos — - // no server restart required (#1144). + // no server restart required. app.get('/api/benchmarks/discovery-roots', (c) => { return c.json({ roots: getDiscoveryRoots() }); diff --git a/packages/core/src/benchmarks.ts b/packages/core/src/benchmarks.ts index 7b7ae5fe..9177884f 100644 --- a/packages/core/src/benchmarks.ts +++ b/packages/core/src/benchmarks.ts @@ -4,7 +4,7 @@ * A Benchmark = any directory containing a `.agentv/` folder. * The registry lives at `~/.agentv/projects.yaml` and tracks registered benchmarks * plus an optional list of discovery roots that Studio continuously rescans at - * runtime so repos can appear/disappear without a server restart (#1144). + * runtime so repos can appear/disappear without a server restart. * * YAML format: * benchmarks: @@ -23,11 +23,19 @@ * a repo appearing or disappearing under a root is reflected immediately, * and manual entries are never auto-removed. * + * Concurrency: the registry assumes a single writer. All mutating calls + * (add/remove/touchBenchmark, add/removeDiscoveryRoot) do read-modify-write on + * projects.yaml without a lock. Interleaved writes from multiple processes + * can clobber each other; Studio's HTTP handlers are serialized by Node's + * single-threaded event loop, which satisfies the 24/7 Studio case. 
Run only + * one `agentv` process against a given home at a time. + * * To extend: * - For CRUD on persisted entries: loadBenchmarkRegistry() / saveBenchmarkRegistry(). * - For live discovery: addDiscoveryRoot() / removeDiscoveryRoot() / * resolveActiveBenchmarks(). - * - discoverBenchmarks() scans a single directory tree for `.agentv/` folders. + * - discoverBenchmarks() scans a single directory tree for `.agentv/` folders; + * its output is sorted for deterministic id assignment under basename collisions. */ import { @@ -256,7 +264,9 @@ export function discoverBenchmarks(rootDir: string, maxDepth = 2): string[] { } scan(absRoot, 0); - return results; + // Sort for deterministic iteration — readdirSync order is filesystem-dependent, + // and basename collisions produce collision-suffix ids that must be stable. + return results.sort(); } // ── Discovery roots (persisted) ───────────────────────────────────────── From 968525336bbf8e5b2db972e0e53604d24c245e70 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Mon, 20 Apr 2026 07:54:49 +0200 Subject: [PATCH 03/12] fix(core): serialize discovery_roots with snake_case YAML key MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit AGENTS.md §"Wire Format Convention" mandates snake_case for YAML config fields, with camelCase reserved for internal TypeScript. The previous commit emitted discoveryRoots (camelCase) on disk. TS field name stays discoveryRoots; only the serialization boundary changes. Adds a regression test that reads projects.yaml after a write and asserts the on-disk key is discovery_roots. Pre-existing benchmarks[] fields (addedAt, lastOpenedAt) are left as-is in this PR since changing them would be a back-compat-breaking migration orthogonal to runtime discovery; they're flagged in the file header. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/core/src/benchmarks.ts | 17 +++++++++-------- packages/core/test/benchmarks.test.ts | 9 ++++++++- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/packages/core/src/benchmarks.ts b/packages/core/src/benchmarks.ts index 9177884f..4c278e2b 100644 --- a/packages/core/src/benchmarks.ts +++ b/packages/core/src/benchmarks.ts @@ -11,9 +11,9 @@ * - id: my-app * name: My App * path: /home/user/projects/my-app - * addedAt: "2026-03-20T10:00:00Z" - * lastOpenedAt: "2026-03-30T14:00:00Z" - * discoveryRoots: + * addedAt: "2026-03-20T10:00:00Z" # camelCase: pre-existing, kept for back-compat + * lastOpenedAt: "2026-03-30T14:00:00Z" # camelCase: pre-existing, kept for back-compat + * discovery_roots: # snake_case per AGENTS.md §"Wire Format Convention" * - /home/user/agentv-repos * * Runtime model: @@ -113,8 +113,8 @@ export function loadBenchmarkRegistry(): BenchmarkRegistry { const benchmarks = Array.isArray(parsed.benchmarks) ? (parsed.benchmarks as BenchmarkEntry[]) : []; - const discoveryRoots = Array.isArray(parsed.discoveryRoots) - ? (parsed.discoveryRoots as unknown[]).filter((v): v is string => typeof v === 'string') + const discoveryRoots = Array.isArray(parsed.discovery_roots) + ? (parsed.discovery_roots as unknown[]).filter((v): v is string => typeof v === 'string') : undefined; return discoveryRoots !== undefined ? { benchmarks, discoveryRoots } : { benchmarks }; } catch { @@ -128,11 +128,12 @@ export function saveBenchmarkRegistry(registry: BenchmarkRegistry): void { if (!existsSync(dir)) { mkdirSync(dir, { recursive: true }); } - // Omit empty/undefined discoveryRoots from the serialized form so existing - // registries without the feature don't grow a stray key. + // Omit empty/undefined discovery_roots from the serialized form so existing + // registries without the feature don't grow a stray key. YAML uses snake_case + // per AGENTS.md §"Wire Format Convention"; TS internals stay camelCase. 
const payload: Record = { benchmarks: registry.benchmarks }; if (registry.discoveryRoots && registry.discoveryRoots.length > 0) { - payload.discoveryRoots = registry.discoveryRoots; + payload.discovery_roots = registry.discoveryRoots; } writeFileSync(registryPath, stringifyYaml(payload), 'utf-8'); } diff --git a/packages/core/test/benchmarks.test.ts b/packages/core/test/benchmarks.test.ts index 116e1c24..051f7a1a 100644 --- a/packages/core/test/benchmarks.test.ts +++ b/packages/core/test/benchmarks.test.ts @@ -1,11 +1,12 @@ import { afterEach, beforeEach, describe, expect, it, spyOn } from 'bun:test'; -import { mkdirSync, mkdtempSync, rmSync } from 'node:fs'; +import { mkdirSync, mkdtempSync, readFileSync, rmSync } from 'node:fs'; import os from 'node:os'; import path from 'node:path'; import { addBenchmark, addDiscoveryRoot, + getBenchmarksRegistryPath, getDiscoveryRoots, loadBenchmarkRegistry, removeDiscoveryRoot, @@ -44,6 +45,12 @@ describe('benchmarks registry + runtime discovery', () => { expect(added).toBe(path.resolve(reposRoot)); expect(getDiscoveryRoots()).toEqual([path.resolve(reposRoot)]); + // Serialized key on disk is snake_case per AGENTS.md wire-format convention, + // even though the in-memory TS field is discoveryRoots. + const yamlOnDisk = readFileSync(getBenchmarksRegistryPath(), 'utf-8'); + expect(yamlOnDisk).toContain('discovery_roots:'); + expect(yamlOnDisk).not.toContain('discoveryRoots:'); + // Adding the same root again is idempotent. 
addDiscoveryRoot(reposRoot); expect(getDiscoveryRoots()).toEqual([path.resolve(reposRoot)]); From 5ea77d93caa9c564efe28222c1e8d6b193706944 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Mon, 20 Apr 2026 08:00:32 +0200 Subject: [PATCH 04/12] fix(core)!: migrate projects.yaml benchmark keys to snake_case MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BREAKING: benchmark entries in ~/.agentv/projects.yaml are serialized with snake_case keys (added_at, last_opened_at, source) instead of camelCase (addedAt, lastOpenedAt). Single-project Studio users are unaffected because they don't touch projects.yaml; multi-project users on pre-release builds must re-register projects (`agentv serve --add <path>`). - Introduce BenchmarkEntryYaml + fromYaml/toYaml in packages/core/src/benchmarks.ts so TS internals stay camelCase and the YAML boundary stays snake_case. - Drop the camelCase → snake_case carry-over for addedAt / lastOpenedAt; the file header now documents the fully snake_case format as canonical. - Tighten AGENTS.md §"Wire Format Convention" to apply the rule blanket across all on-disk YAML (eval configs, projects.yaml, future files), add an anti-patterns list, and cite fromYaml/toYaml as the reference pattern for YAML boundaries. - Add a regression test asserting serialized keys on disk are snake_case and that the file round-trips into the camelCase TS shape.
Co-Authored-By: Claude Opus 4.7 (1M context) --- AGENTS.md | 70 +++++++++++++++++++-------- packages/core/src/benchmarks.ts | 60 +++++++++++++++++++---- packages/core/test/benchmarks.test.ts | 23 ++++++++- 3 files changed, 121 insertions(+), 32 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 7d78e1cc..d159bd37 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -146,36 +146,64 @@ cd ../agentv.worktrees/- ## Wire Format Convention -**All external-facing JSON and JSONL output uses `snake_case` keys.** This applies to: -- JSONL result files on disk (`test_id`, `token_usage`, `duration_ms`) -- Artifact-writer output (`pass_rate`, `tests_run`, `total_tool_calls`) -- CLI command JSON output (`results summary`, `results failures`, `results show`) -- YAML eval config fields +**Everything that crosses a process boundary uses `snake_case` keys. Internal TypeScript uses `camelCase`. Translate at the boundary — never in the middle.** -**Internal TypeScript uses `camelCase`** as standard. Convert at the serialization boundary only: +The rule is blanket: if the key is going to disk, to a user's editor, into a JSON response, or onto a CLI, it's snake_case. There is no "well this file is internal-ish" carve-out. If in doubt, snake_case. + +### snake_case surfaces +- All YAML files on disk: `*.eval.yaml`, `agentv.config.yaml`, `projects.yaml`, `studio/config.yaml`, any future YAML we add. +- JSONL result files (`test_id`, `token_usage`, `duration_ms`). +- Artifact-writer output (`pass_rate`, `tests_run`, `total_tool_calls`). +- HTTP response bodies from `agentv serve` / Studio (`added_at`, `pass_rate`, `project_id`). +- CLI JSON output (`agentv results summary`, `results failures`, `results show`). +- Anything consumed by non-TS tooling (Python, jq pipelines, external dashboards). + +### camelCase surfaces +- TypeScript source: all variables, parameters, fields, type members. +- Internal in-memory shapes passed between TS modules. 
+ +### Translate only at the boundary +Define a second interface for the wire shape and convert in one place — don't smear snake_case through TS internals. ```typescript -// Interfaces for JSON output use snake_case (they define the wire format) -interface SummaryJson { - total: number; - pass_rate: number; - failed_test_ids: string[]; +// Wire shape — snake_case, matches what hits disk / the network +interface BenchmarkEntryYaml { + id: string; + name: string; + path: string; + added_at: string; + last_opened_at: string; } -// Function internals use camelCase (idiomatic TypeScript) -function formatSummary(results: EvaluationResult[]): SummaryJson { - const passRate = computePassRate(results); - const failedTestIds = findFailed(results); +// Internal shape — camelCase, what every TS call site sees +interface BenchmarkEntry { + id: string; + name: string; + path: string; + addedAt: string; + lastOpenedAt: string; +} - return { - total: results.length, - pass_rate: passRate, - failed_test_ids: failedTestIds, - }; +function fromYaml(e: BenchmarkEntryYaml): BenchmarkEntry { + return { id: e.id, name: e.name, path: e.path, addedAt: e.added_at, lastOpenedAt: e.last_opened_at }; +} + +function toYaml(e: BenchmarkEntry): BenchmarkEntryYaml { + return { id: e.id, name: e.name, path: e.path, added_at: e.addedAt, last_opened_at: e.lastOpenedAt }; } ``` -**Reading back:** `parseJsonlResults()` in `artifact-writer.ts` converts snake_case → camelCase when reading JSONL into TypeScript. +Yes, this is two interfaces and two functions per entity. That's the price of keeping TS idiomatic while staying faithful to the wire contract. Don't skip it — dumping TS objects directly to YAML leaks `addedAt`-style camelCase onto disk and breaks jq/Python consumers. + +### Anti-patterns +- `writeFileSync(path, stringifyYaml(tsObject))` — dumps TS field names verbatim. Wrong. +- `interface Foo { testId: string; ... }` for a JSON response body — `test_id`, always. 
+- Accepting both `testId` and `test_id` on input "for back-compat" when nothing is shipped yet. Just snake_case. + +### Existing divergences +If you spot a camelCase key already on disk or in a response (e.g. historical `projects.yaml`, a legacy endpoint), treat it as a bug: migrate it to snake_case in the same PR where you touch that code path. Don't grandfather it in. + +**Reading back:** `parseJsonlResults()` in `artifact-writer.ts` converts snake_case → camelCase when reading JSONL into TypeScript. `fromYaml` / `toYaml` in `packages/core/src/benchmarks.ts` is the model for YAML boundaries. **Why:** Aligns with skill-creator (claude-plugins-official) and broader Python/JSON ecosystem conventions where snake_case is the standard wire format. diff --git a/packages/core/src/benchmarks.ts b/packages/core/src/benchmarks.ts index 4c278e2b..dd848120 100644 --- a/packages/core/src/benchmarks.ts +++ b/packages/core/src/benchmarks.ts @@ -6,14 +6,14 @@ * plus an optional list of discovery roots that Studio continuously rescans at * runtime so repos can appear/disappear without a server restart. * - * YAML format: + * YAML format (all keys snake_case per AGENTS.md §"Wire Format Convention"): * benchmarks: * - id: my-app * name: My App * path: /home/user/projects/my-app - * addedAt: "2026-03-20T10:00:00Z" # camelCase: pre-existing, kept for back-compat - * lastOpenedAt: "2026-03-30T14:00:00Z" # camelCase: pre-existing, kept for back-compat - * discovery_roots: # snake_case per AGENTS.md §"Wire Format Convention" + * added_at: "2026-03-20T10:00:00Z" + * last_opened_at: "2026-03-30T14:00:00Z" + * discovery_roots: * - /home/user/agentv-repos * * Runtime model: @@ -95,6 +95,45 @@ function migrateProjectsYaml(targetPath: string): void { } // ── Load / Save ───────────────────────────────────────────────────────── +// YAML uses snake_case per AGENTS.md §"Wire Format Convention"; TypeScript +// internals stay camelCase. 
fromYaml / toYaml handle the translation; every +// other function in this module works in camelCase only. + +interface BenchmarkEntryYaml { + id: string; + name: string; + path: string; + added_at: string; + last_opened_at: string; + source?: BenchmarkSource; +} + +function fromYaml(raw: unknown): BenchmarkEntry | null { + if (!raw || typeof raw !== 'object') return null; + const e = raw as Partial; + if (typeof e.id !== 'string' || typeof e.name !== 'string' || typeof e.path !== 'string') { + return null; + } + return { + id: e.id, + name: e.name, + path: e.path, + addedAt: typeof e.added_at === 'string' ? e.added_at : '', + lastOpenedAt: typeof e.last_opened_at === 'string' ? e.last_opened_at : '', + ...(e.source && { source: e.source }), + }; +} + +function toYaml(entry: BenchmarkEntry): BenchmarkEntryYaml { + return { + id: entry.id, + name: entry.name, + path: entry.path, + added_at: entry.addedAt, + last_opened_at: entry.lastOpenedAt, + ...(entry.source && { source: entry.source }), + }; +} export function loadBenchmarkRegistry(): BenchmarkRegistry { const registryPath = getBenchmarksRegistryPath(); @@ -111,7 +150,9 @@ export function loadBenchmarkRegistry(): BenchmarkRegistry { return { benchmarks: [] }; } const benchmarks = Array.isArray(parsed.benchmarks) - ? (parsed.benchmarks as BenchmarkEntry[]) + ? (parsed.benchmarks as unknown[]) + .map(fromYaml) + .filter((e): e is BenchmarkEntry => e !== null) : []; const discoveryRoots = Array.isArray(parsed.discovery_roots) ? (parsed.discovery_roots as unknown[]).filter((v): v is string => typeof v === 'string') @@ -128,10 +169,11 @@ export function saveBenchmarkRegistry(registry: BenchmarkRegistry): void { if (!existsSync(dir)) { mkdirSync(dir, { recursive: true }); } - // Omit empty/undefined discovery_roots from the serialized form so existing - // registries without the feature don't grow a stray key. YAML uses snake_case - // per AGENTS.md §"Wire Format Convention"; TS internals stay camelCase. 
- const payload: Record = { benchmarks: registry.benchmarks }; + // Omit empty/undefined discovery_roots from the serialized form so registries + // without the feature don't grow a stray key. + const payload: Record = { + benchmarks: registry.benchmarks.map(toYaml), + }; if (registry.discoveryRoots && registry.discoveryRoots.length > 0) { payload.discovery_roots = registry.discoveryRoots; } diff --git a/packages/core/test/benchmarks.test.ts b/packages/core/test/benchmarks.test.ts index 051f7a1a..2d1a533f 100644 --- a/packages/core/test/benchmarks.test.ts +++ b/packages/core/test/benchmarks.test.ts @@ -45,8 +45,8 @@ describe('benchmarks registry + runtime discovery', () => { expect(added).toBe(path.resolve(reposRoot)); expect(getDiscoveryRoots()).toEqual([path.resolve(reposRoot)]); - // Serialized key on disk is snake_case per AGENTS.md wire-format convention, - // even though the in-memory TS field is discoveryRoots. + // Serialized keys on disk are snake_case per AGENTS.md wire-format convention, + // even though the in-memory TS fields are camelCase. const yamlOnDisk = readFileSync(getBenchmarksRegistryPath(), 'utf-8'); expect(yamlOnDisk).toContain('discovery_roots:'); expect(yamlOnDisk).not.toContain('discoveryRoots:'); @@ -79,6 +79,25 @@ describe('benchmarks registry + runtime discovery', () => { expect(resolveActiveBenchmarks()).toEqual([]); }); + it('serializes benchmark entries with snake_case keys on disk', () => { + const repoPath = makeRepo('snake'); + const entry = addBenchmark(repoPath); + + const yamlOnDisk = readFileSync(getBenchmarksRegistryPath(), 'utf-8'); + expect(yamlOnDisk).toContain('added_at:'); + expect(yamlOnDisk).toContain('last_opened_at:'); + expect(yamlOnDisk).not.toContain('addedAt:'); + expect(yamlOnDisk).not.toContain('lastOpenedAt:'); + + // Round-trips cleanly back into the camelCase TS shape. 
+ const reloaded = loadBenchmarkRegistry().benchmarks.find((b) => b.id === entry.id); + expect(reloaded).toMatchObject({ + id: entry.id, + addedAt: entry.addedAt, + lastOpenedAt: entry.lastOpenedAt, + }); + }); + it('keeps manually-added entries even when their path is not under a root', () => { const outside = makeRepo('manual'); const entry = addBenchmark(outside); From e9ceba395b61a4e20da7455532a31582113fa7ff Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Mon, 20 Apr 2026 09:13:40 +0200 Subject: [PATCH 05/12] feat(studio)!: Remove on discovered entry hides via excluded_paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BREAKING: hard-remove the --discover CLI flag and POST /api/benchmarks/discover endpoint. Callers should use --discovery-root + POST /api/benchmarks/discovery-roots for the runtime-watching model. The core discoverBenchmarks util stays exported. Better UX for Remove on a discovered entry: instead of 400-ing, add the repo's path to a new persisted excluded_paths[] list and hide it from future scans. The .agentv/ directory stays on disk, so the user can re-show the repo (via DELETE /api/benchmarks/exclusions) or pin it manually (POST /api/benchmarks, which auto-unexcludes). - New BenchmarkRegistry.excludedPaths?: string[] (YAML key: excluded_paths). - New core helpers: getExcludedPaths, addExcludedPath, removeExcludedPath. - resolveActiveBenchmarks filters the discovered set by excludedPaths; pinned entries are never filtered. - addBenchmark() strips the path from excludedPaths if present — explicit pin wins over a prior hide. - DELETE /api/benchmarks/:id on a discovered entry calls addExcludedPath and returns { ok: true, excluded: <path> }; on a manual entry it still removes from benchmarks[] as before. - GET /api/benchmarks/exclusions lists excluded paths; DELETE unhides one.
- Route-ordering fix: DELETE /api/benchmarks/:benchmarkId is now registered after all /api/benchmarks/ sub-paths so Hono doesn't route DELETE /api/benchmarks/exclusions (or /discovery-roots) through the :id handler with benchmarkId=. Inline comment documents the constraint. - Tests: exclude-hides-then-unexclude-shows round-trip, snake_case on-disk key assertion, pin-beats-exclusion flow. All 2260 tests pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- apps/cli/src/commands/results/serve.ts | 117 +++++++++++-------------- packages/core/src/benchmarks.ts | 83 ++++++++++++++++-- packages/core/src/index.ts | 3 + packages/core/test/benchmarks.test.ts | 40 +++++++++ 4 files changed, 170 insertions(+), 73 deletions(-) diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index 8ee3c8e0..0ae2ef07 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -14,6 +14,7 @@ * - GET /api/benchmarks — list active benchmarks (persisted + live-discovered) * - POST /api/benchmarks/rescan — force a discovery-root rescan * - GET/POST/DELETE /api/benchmarks/discovery-roots — manage runtime discovery roots + * - GET/DELETE /api/benchmarks/exclusions — list / un-hide paths hidden via "Remove" * - GET /api/benchmarks/:benchmarkId/runs — benchmark-scoped run list * * All data routes (runs, suites, categories, evals, experiments, targets) @@ -51,12 +52,14 @@ import { type EvaluationResult, addBenchmark, addDiscoveryRoot, - discoverBenchmarks, + addExcludedPath, getActiveBenchmark, getDiscoveryRoots, + getExcludedPaths, loadConfig, removeBenchmark, removeDiscoveryRoot, + removeExcludedPath, resolveActiveBenchmarks, } from '@agentv/core'; import type { Context } from 'hono'; @@ -1012,26 +1015,6 @@ export function createApp( } }); - app.delete('/api/benchmarks/:benchmarkId', (c) => { - if (readOnly) { - return c.json({ error: 'Studio is running in read-only mode' }, 403); - } - const benchmarkId = 
c.req.param('benchmarkId') ?? ''; - const active = getActiveBenchmark(benchmarkId); - if (active?.source === 'discovered') { - return c.json( - { - error: - 'This project was discovered from a configured root. Remove the root or delete its .agentv/ directory to drop it.', - }, - 400, - ); - } - const removed = removeBenchmark(benchmarkId); - if (!removed) return c.json({ error: 'Project not found' }, 404); - return c.json({ ok: true }); - }); - app.get('/api/benchmarks/:benchmarkId/summary', async (c) => { const benchmark = getActiveBenchmark(c.req.param('benchmarkId') ?? ''); if (!benchmark) return c.json({ error: 'Project not found' }, 404); @@ -1053,21 +1036,6 @@ export function createApp( } }); - app.post('/api/benchmarks/discover', async (c) => { - if (readOnly) { - return c.json({ error: 'Studio is running in read-only mode' }, 403); - } - try { - const body = await c.req.json<{ path: string }>(); - if (!body.path) return c.json({ error: 'Missing path' }, 400); - const discovered = discoverBenchmarks(body.path); - const registered = discovered.map((p) => benchmarkEntryToWire(addBenchmark(p))); - return c.json({ discovered: registered }); - } catch (err) { - return c.json({ error: (err as Error).message }, 400); - } - }); - /** Aggregate runs from all active benchmarks, sorted by timestamp descending. */ app.get('/api/benchmarks/all-runs', async (c) => { const active = resolveActiveBenchmarks(); @@ -1165,6 +1133,51 @@ export function createApp( } }); + // ── Exclusions (hide a discovered repo from the UI) ───────────────── + // DELETE /api/benchmarks/:id on a discovered entry adds its path here; + // these endpoints let users list or un-hide those paths. 
+ + app.get('/api/benchmarks/exclusions', (c) => { + return c.json({ excluded_paths: getExcludedPaths() }); + }); + + app.delete('/api/benchmarks/exclusions', async (c) => { + if (readOnly) { + return c.json({ error: 'Studio is running in read-only mode' }, 403); + } + try { + const body = await c.req.json<{ path: string }>(); + if (!body.path) return c.json({ error: 'Missing path' }, 400); + const removed = removeExcludedPath(body.path); + if (!removed) return c.json({ error: 'Path not in exclusions' }, 404); + return c.json({ ok: true }); + } catch (err) { + return c.json({ error: (err as Error).message }, 400); + } + }); + + // Registered after all `/api/benchmarks/` sub-paths so Hono doesn't + // route e.g. `DELETE /api/benchmarks/exclusions` into this handler with + // benchmarkId="exclusions". + app.delete('/api/benchmarks/:benchmarkId', (c) => { + if (readOnly) { + return c.json({ error: 'Studio is running in read-only mode' }, 403); + } + const benchmarkId = c.req.param('benchmarkId') ?? ''; + const active = getActiveBenchmark(benchmarkId); + if (!active) return c.json({ error: 'Project not found' }, 404); + // For a discovered entry, "remove" means hide it from the UI. The + // .agentv/ dir stays on disk; the path goes onto the exclusion list + // and is filtered out of resolveActiveBenchmarks on the next rescan. + if (active.source === 'discovered') { + addExcludedPath(active.path); + return c.json({ ok: true, excluded: active.path }); + } + const removed = removeBenchmark(benchmarkId); + if (!removed) return c.json({ error: 'Project not found' }, 404); + return c.json({ ok: true }); + }); + /** Explicit rescan hook — useful when the UI wants a refresh without the poll tick. 
*/ app.post('/api/benchmarks/rescan', async (c) => { const active = resolveActiveBenchmarks(); @@ -1503,11 +1516,6 @@ export const resultsServeCommand = command({ long: 'remove', description: 'Unregister a project by ID', }), - discover: option({ - type: optional(string), - long: 'discover', - description: 'Scan a directory tree for repos with .agentv/ (one-shot; exits after)', - }), discoveryRoot: multioption({ type: array(string), long: 'discovery-root', @@ -1519,18 +1527,7 @@ export const resultsServeCommand = command({ description: 'Disable write operations and launch Studio in read-only leaderboard mode', }), }, - handler: async ({ - source, - port, - dir, - multi, - single, - add, - remove, - discover, - discoveryRoot, - readOnly, - }) => { + handler: async ({ source, port, dir, multi, single, add, remove, discoveryRoot, readOnly }) => { const cwd = dir ?? process.cwd(); const listenPort = port ?? (process.env.PORT ? Number(process.env.PORT) : 3117); @@ -1557,20 +1554,6 @@ export const resultsServeCommand = command({ return; } - if (discover) { - const discovered = discoverBenchmarks(discover); - if (discovered.length === 0) { - console.log(`No projects with .agentv/ found under ${discover}`); - return; - } - for (const p of discovered) { - const entry = addBenchmark(p); - console.log(`Registered: ${entry.name} (${entry.id}) at ${entry.path}`); - } - console.log(`\nDiscovered ${discovered.length} project(s).`); - return; - } - // Persist --discovery-root paths before starting the server. The server // keeps running after this so Studio continuously rescans the roots. 
if (discoveryRoot.length > 0) { diff --git a/packages/core/src/benchmarks.ts b/packages/core/src/benchmarks.ts index dd848120..bea9d5bd 100644 --- a/packages/core/src/benchmarks.ts +++ b/packages/core/src/benchmarks.ts @@ -15,6 +15,8 @@ * last_opened_at: "2026-03-30T14:00:00Z" * discovery_roots: * - /home/user/agentv-repos + * excluded_paths: # discovered repos to hide from Studio + * - /home/user/agentv-repos/experiment-v0 * * Runtime model: * - Entries in `benchmarks` are persisted (manual add/remove). @@ -71,6 +73,12 @@ export interface BenchmarkRegistry { benchmarks: BenchmarkEntry[]; /** Directories continuously rescanned for `.agentv/` repos. Optional. */ discoveryRoots?: string[]; + /** + * Absolute paths to exclude from the discovered set. Clicking "Remove" on a + * discovered entry in Studio adds its path here so the repo stays on disk + * but disappears from the UI. Has no effect on manually-pinned entries. + */ + excludedPaths?: string[]; } // ── Registry path ─────────────────────────────────────────────────────── @@ -157,7 +165,13 @@ export function loadBenchmarkRegistry(): BenchmarkRegistry { const discoveryRoots = Array.isArray(parsed.discovery_roots) ? (parsed.discovery_roots as unknown[]).filter((v): v is string => typeof v === 'string') : undefined; - return discoveryRoots !== undefined ? { benchmarks, discoveryRoots } : { benchmarks }; + const excludedPaths = Array.isArray(parsed.excluded_paths) + ? 
(parsed.excluded_paths as unknown[]).filter((v): v is string => typeof v === 'string') + : undefined; + const result: BenchmarkRegistry = { benchmarks }; + if (discoveryRoots !== undefined) result.discoveryRoots = discoveryRoots; + if (excludedPaths !== undefined) result.excludedPaths = excludedPaths; + return result; } catch { return { benchmarks: [] }; } @@ -169,14 +183,17 @@ export function saveBenchmarkRegistry(registry: BenchmarkRegistry): void { if (!existsSync(dir)) { mkdirSync(dir, { recursive: true }); } - // Omit empty/undefined discovery_roots from the serialized form so registries - // without the feature don't grow a stray key. + // Omit empty/undefined optional lists from the serialized form so registries + // without the feature don't grow stray keys. const payload: Record = { benchmarks: registry.benchmarks.map(toYaml), }; if (registry.discoveryRoots && registry.discoveryRoots.length > 0) { payload.discovery_roots = registry.discoveryRoots; } + if (registry.excludedPaths && registry.excludedPaths.length > 0) { + payload.excluded_paths = registry.excludedPaths; + } writeFileSync(registryPath, stringifyYaml(payload), 'utf-8'); } @@ -217,8 +234,14 @@ export function addBenchmark(benchmarkPath: string): BenchmarkEntry { } const registry = loadBenchmarkRegistry(); + // Pinning overrides a prior exclusion: if the user explicitly adds a path + // they had previously hidden from discovery, they clearly want to see it. 
+ if (registry.excludedPaths?.includes(absPath)) { + registry.excludedPaths = registry.excludedPaths.filter((p) => p !== absPath); + } const existing = registry.benchmarks.find((p) => p.path === absPath); if (existing) { + saveBenchmarkRegistry(registry); return existing; } @@ -334,7 +357,8 @@ export function addDiscoveryRoot(rootPath: string): string { if (!roots.includes(absRoot)) { roots.push(absRoot); } - saveBenchmarkRegistry({ benchmarks: registry.benchmarks, discoveryRoots: roots }); + registry.discoveryRoots = roots; + saveBenchmarkRegistry(registry); return absRoot; } @@ -348,7 +372,51 @@ export function removeDiscoveryRoot(rootPath: string): boolean { const idx = roots.indexOf(absRoot); if (idx < 0) return false; roots.splice(idx, 1); - saveBenchmarkRegistry({ benchmarks: registry.benchmarks, discoveryRoots: roots }); + registry.discoveryRoots = roots; + saveBenchmarkRegistry(registry); + return true; +} + +// ── Exclusions (hide a discovered repo without deleting its .agentv/) ── + +/** + * Return the persisted exclusion list as absolute paths. + */ +export function getExcludedPaths(): string[] { + return [...(loadBenchmarkRegistry().excludedPaths ?? [])]; +} + +/** + * Append a path to the exclusion list (idempotent). Used when the user + * clicks "Remove" on a discovered entry — the .agentv/ dir stays on disk, + * but it's suppressed from the active set until the user unexcludes it. + * Returns the resolved absolute path. + */ +export function addExcludedPath(excludePath: string): string { + const abs = path.resolve(excludePath); + const registry = loadBenchmarkRegistry(); + const excluded = registry.excludedPaths ?? []; + if (!excluded.includes(abs)) { + excluded.push(abs); + } + registry.excludedPaths = excluded; + saveBenchmarkRegistry(registry); + return abs; +} + +/** + * Remove a path from the exclusion list. Returns true if it was present. + * The repo will reappear on the next discovery rescan if still under a root. 
+ */ +export function removeExcludedPath(excludePath: string): boolean { + const abs = path.resolve(excludePath); + const registry = loadBenchmarkRegistry(); + const excluded = registry.excludedPaths ?? []; + const idx = excluded.indexOf(abs); + if (idx < 0) return false; + excluded.splice(idx, 1); + registry.excludedPaths = excluded; + saveBenchmarkRegistry(registry); return true; } @@ -360,7 +428,8 @@ export function removeDiscoveryRoot(rootPath: string): boolean { * (tagged `source: 'discovered'`) and are NOT written to disk, so a repo * disappearing from a root drops out of subsequent calls. Persisted entries * win on absolute-path conflict, letting a user opt a discovered repo into - * manual management. + * manual management. Paths in `excludedPaths` are filtered out of the + * discovered set (but never from pinned entries). */ export function resolveActiveBenchmarks(): BenchmarkEntry[] { const registry = loadBenchmarkRegistry(); @@ -371,12 +440,14 @@ export function resolveActiveBenchmarks(): BenchmarkEntry[] { const roots = registry.discoveryRoots ?? []; if (roots.length === 0) return persisted; + const excluded = new Set(registry.excludedPaths ?? 
[]); const takenPaths = new Set(persisted.map((b) => b.path)); const takenIds = new Set(persisted.map((b) => b.id)); const discovered: BenchmarkEntry[] = []; for (const root of roots) { for (const repoPath of discoverBenchmarks(root)) { if (takenPaths.has(repoPath)) continue; + if (excluded.has(repoPath)) continue; takenPaths.add(repoPath); const id = deriveBenchmarkId(repoPath, [...takenIds]); takenIds.add(id); diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index f418339a..b1da23c8 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -101,6 +101,9 @@ export { getDiscoveryRoots, addDiscoveryRoot, removeDiscoveryRoot, + getExcludedPaths, + addExcludedPath, + removeExcludedPath, resolveActiveBenchmarks, getActiveBenchmark, } from './benchmarks.js'; diff --git a/packages/core/test/benchmarks.test.ts b/packages/core/test/benchmarks.test.ts index 2d1a533f..9d9bd163 100644 --- a/packages/core/test/benchmarks.test.ts +++ b/packages/core/test/benchmarks.test.ts @@ -6,10 +6,13 @@ import path from 'node:path'; import { addBenchmark, addDiscoveryRoot, + addExcludedPath, getBenchmarksRegistryPath, getDiscoveryRoots, + getExcludedPaths, loadBenchmarkRegistry, removeDiscoveryRoot, + removeExcludedPath, resolveActiveBenchmarks, } from '../src/benchmarks.js'; @@ -108,6 +111,43 @@ describe('benchmarks registry + runtime discovery', () => { expect(active[0].source).toBe('manual'); }); + it('hides a discovered repo once its path is excluded, and shows it again when unexcluded', () => { + addDiscoveryRoot(reposRoot); + const repoPath = makeRepo('junk'); + + expect(resolveActiveBenchmarks().map((b) => b.path)).toEqual([repoPath]); + + const excluded = addExcludedPath(repoPath); + expect(excluded).toBe(path.resolve(repoPath)); + expect(getExcludedPaths()).toEqual([path.resolve(repoPath)]); + expect(resolveActiveBenchmarks()).toEqual([]); + + // Serialized form uses snake_case. 
+ const yamlOnDisk = readFileSync(getBenchmarksRegistryPath(), 'utf-8'); + expect(yamlOnDisk).toContain('excluded_paths:'); + expect(yamlOnDisk).not.toContain('excludedPaths:'); + + // Unexclude → the repo reappears on the next scan. + expect(removeExcludedPath(repoPath)).toBe(true); + expect(getExcludedPaths()).toEqual([]); + expect(resolveActiveBenchmarks().map((b) => b.path)).toEqual([repoPath]); + }); + + it('auto-unexcludes a path when it is manually pinned', () => { + addDiscoveryRoot(reposRoot); + const repoPath = makeRepo('pin-me'); + addExcludedPath(repoPath); + expect(resolveActiveBenchmarks()).toEqual([]); + + // Pinning wins: addBenchmark should drop the exclusion. + const entry = addBenchmark(repoPath); + expect(getExcludedPaths()).toEqual([]); + const active = resolveActiveBenchmarks(); + expect(active).toHaveLength(1); + expect(active[0].id).toBe(entry.id); + expect(active[0].source).toBe('manual'); + }); + it('prefers the persisted entry when a discovery root would produce a duplicate path', () => { const repoPath = makeRepo('shared'); // Register manually first. From 2206c96663958afdf26f57d185d79d99c2fe311c Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Mon, 20 Apr 2026 09:41:35 +0200 Subject: [PATCH 06/12] =?UTF-8?q?refactor!:=20unify=20project=20=E2=86=92?= =?UTF-8?q?=20benchmark=20across=20file,=20API,=20CLI,=20UI?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Studio already called the domain concept a "benchmark" in code (BenchmarkRegistry, BenchmarkEntry, /api/benchmarks), but a long tail of surfaces still said "project": the on-disk registry filename, API response keys, CLI help text, error messages, docs, Studio routes, and component names. This sweeps every remaining "project" surface to "benchmark". BREAKING (safe — multi-benchmark Studio isn't in use yet): - File rename: ~/.agentv/projects.yaml → ~/.agentv/benchmarks.yaml. 
migrateLegacyRegistry() copies any existing projects.yaml on first load (from either the AGENTV_HOME or the config-dir location) and deletes the legacy file. - API response shape: /api/benchmarks now returns { benchmarks: [...] } (was projects); /api/benchmarks/all-runs rows now carry benchmark_id / benchmark_name (were project_id / project_name); /api/config returns benchmark_name + multi_benchmark_dashboard. - Studio URLs: /projects/$benchmarkId/... → /benchmarks/$benchmarkId/...; the routes/projects/ directory is renamed to routes/benchmarks/. - CLI: --discover flag removed (use --discovery-root instead); --add/--remove/--multi/--single help text says "benchmark"; error messages say "Benchmark not found" / "Registered benchmark: …"; console.log is "Multi-benchmark mode". - Core: BenchmarkEntry / BenchmarkRegistry unchanged (already benchmark-named); saveBenchmarkRegistry now preserves excludedPaths alongside discoveryRoots; the old migrateProjectsYaml is folded into migrateLegacyRegistry. - Frontend: ProjectCard component → BenchmarkCard; all Project*Sidebar / Project*Tab internal components renamed; UI strings say "benchmark". - Docs: studio.mdx option table updated; Auto-Discovery section replaced with Runtime Discovery that describes --discovery-root and the excluded_paths flow. running-evals.mdx filename reference updated. - AGENTS.md: Wire Format Convention lists benchmarks.yaml instead of projects.yaml. - Tests: resolveDashboardMode tests renamed ("single/multi-benchmark"); /api/config test reads the new keys; new benchmarks.test.ts case asserts migration from legacy projects.yaml. All 2261 tests pass; build, typecheck, and lint clean. Manual UAT confirmed: legacy projects.yaml → benchmarks.yaml migration, on-disk snake_case keys, response shape and /api/config emit the new names end-to-end. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- AGENTS.md | 4 +- apps/cli/src/commands/results/serve.ts | 76 ++++++------ apps/cli/test/commands/results/serve.test.ts | 28 ++--- .../{ProjectCard.tsx => BenchmarkCard.tsx} | 26 ++-- apps/studio/src/components/RunDetail.tsx | 2 +- apps/studio/src/components/RunList.tsx | 2 +- apps/studio/src/components/Sidebar.tsx | 76 ++++++------ apps/studio/src/lib/api.ts | 19 +-- apps/studio/src/lib/types.ts | 10 +- apps/studio/src/routeTree.gen.ts | 116 +++++++++--------- .../{projects => benchmarks}/$benchmarkId.tsx | 28 ++--- .../$benchmarkId_/evals/$runId.$evalId.tsx | 8 +- .../$benchmarkId_/runs/$runId.tsx | 8 +- apps/studio/src/routes/index.tsx | 70 +++++------ .../docs/docs/evaluation/running-evals.mdx | 2 +- .../src/content/docs/docs/tools/studio.mdx | 30 ++--- .../plans/1144-runtime-benchmark-discovery.md | 4 +- packages/core/src/benchmarks.ts | 55 ++++++--- packages/core/src/paths.ts | 2 +- packages/core/test/benchmarks.test.ts | 21 +++- 20 files changed, 321 insertions(+), 266 deletions(-) rename apps/studio/src/components/{ProjectCard.tsx => BenchmarkCard.tsx} (68%) rename apps/studio/src/routes/{projects => benchmarks}/$benchmarkId.tsx (89%) rename apps/studio/src/routes/{projects => benchmarks}/$benchmarkId_/evals/$runId.$evalId.tsx (92%) rename apps/studio/src/routes/{projects => benchmarks}/$benchmarkId_/runs/$runId.tsx (93%) diff --git a/AGENTS.md b/AGENTS.md index d159bd37..511987db 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -151,7 +151,7 @@ cd ../agentv.worktrees/- The rule is blanket: if the key is going to disk, to a user's editor, into a JSON response, or onto a CLI, it's snake_case. There is no "well this file is internal-ish" carve-out. If in doubt, snake_case. ### snake_case surfaces -- All YAML files on disk: `*.eval.yaml`, `agentv.config.yaml`, `projects.yaml`, `studio/config.yaml`, any future YAML we add. 
+- All YAML files on disk: `*.eval.yaml`, `agentv.config.yaml`, `benchmarks.yaml`, `studio/config.yaml`, any future YAML we add. - JSONL result files (`test_id`, `token_usage`, `duration_ms`). - Artifact-writer output (`pass_rate`, `tests_run`, `total_tool_calls`). - HTTP response bodies from `agentv serve` / Studio (`added_at`, `pass_rate`, `project_id`). @@ -201,7 +201,7 @@ Yes, this is two interfaces and two functions per entity. That's the price of ke - Accepting both `testId` and `test_id` on input "for back-compat" when nothing is shipped yet. Just snake_case. ### Existing divergences -If you spot a camelCase key already on disk or in a response (e.g. historical `projects.yaml`, a legacy endpoint), treat it as a bug: migrate it to snake_case in the same PR where you touch that code path. Don't grandfather it in. +If you spot a camelCase key already on disk or in a response (e.g. a legacy endpoint), treat it as a bug: migrate it to snake_case in the same PR where you touch that code path. Don't grandfather it in. **Reading back:** `parseJsonlResults()` in `artifact-writer.ts` converts snake_case → camelCase when reading JSONL into TypeScript. `fromYaml` / `toYaml` in `packages/core/src/benchmarks.ts` is the model for YAML boundaries. diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index 0ae2ef07..ab79cf4e 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -23,7 +23,7 @@ * how searchDir is resolved. * * Before starting the server, the command enforces `required_version` from - * the cwd's `.agentv/config.yaml` (single-project scope) via + * the cwd's `.agentv/config.yaml` (single-benchmark scope) via * `enforceRequiredVersion()`, matching the behavior of `agentv eval`. 
* * Exported functions (for testing): @@ -143,16 +143,16 @@ export function loadResults(content: string): EvaluationResult[] { export function resolveDashboardMode( projectCount: number, options: { multi?: boolean; single?: boolean }, -): { isMultiProject: boolean; showMultiWarning: boolean } { +): { isMultiBenchmark: boolean; showMultiWarning: boolean } { if (options.single === true) { - return { isMultiProject: false, showMultiWarning: options.multi === true }; + return { isMultiBenchmark: false, showMultiWarning: options.multi === true }; } if (options.multi === true) { - return { isMultiProject: true, showMultiWarning: true }; + return { isMultiBenchmark: true, showMultiWarning: true }; } - return { isMultiProject: projectCount > 1, showMultiWarning: false }; + return { isMultiBenchmark: projectCount > 1, showMultiWarning: false }; } // ── Feedback persistence ───────────────────────────────────────────────── @@ -833,13 +833,13 @@ async function handleTargets(c: C, { searchDir, agentvDir }: DataContext) { function handleConfig( c: C, { agentvDir, searchDir }: DataContext, - options?: { readOnly?: boolean; multiProjectDashboard?: boolean }, + options?: { readOnly?: boolean; multiBenchmarkDashboard?: boolean }, ) { return c.json({ ...loadStudioConfig(agentvDir), read_only: options?.readOnly === true, - project_name: path.basename(searchDir), - multi_project_dashboard: options?.multiProjectDashboard === true, + benchmark_name: path.basename(searchDir), + multi_benchmark_dashboard: options?.multiBenchmarkDashboard === true, }); } @@ -905,7 +905,7 @@ export function createApp( resultDir: string, cwd?: string, sourceFile?: string, - options?: { studioDir?: string; readOnly?: boolean; multiProjectDashboard?: boolean }, + options?: { studioDir?: string; readOnly?: boolean; multiBenchmarkDashboard?: boolean }, ): Hono { const searchDir = cwd ?? 
resultDir; const agentvDir = path.join(searchDir, '.agentv'); @@ -923,7 +923,7 @@ export function createApp( ): Response | Promise { const benchmark = getActiveBenchmark(c.req.param('benchmarkId') ?? ''); if (!benchmark || !existsSync(benchmark.path)) { - return c.json({ error: 'Project not found' }, 404); + return c.json({ error: 'Benchmark not found' }, 404); } return handler(c, { searchDir: benchmark.path, @@ -998,7 +998,7 @@ export function createApp( }; }), ); - return c.json({ projects: benchmarks }); + return c.json({ benchmarks }); }); app.post('/api/benchmarks', async (c) => { @@ -1017,7 +1017,7 @@ export function createApp( app.get('/api/benchmarks/:benchmarkId/summary', async (c) => { const benchmark = getActiveBenchmark(c.req.param('benchmarkId') ?? ''); - if (!benchmark) return c.json({ error: 'Project not found' }, 404); + if (!benchmark) return c.json({ error: 'Benchmark not found' }, 404); try { const { runs: metas } = await listMergedResultFiles(benchmark.path); const runCount = metas.length; @@ -1051,8 +1051,8 @@ export function createApp( target?: string; experiment?: string; source: 'local' | 'remote'; - project_id: string; - project_name: string; + benchmark_id: string; + benchmark_name: string; }> = []; for (const p of active) { @@ -1082,12 +1082,12 @@ export function createApp( source: m.source, ...(target && { target }), ...(experiment && { experiment }), - project_id: p.id, - project_name: p.name, + benchmark_id: p.id, + benchmark_name: p.name, }); } } catch { - // skip inaccessible projects + // skip inaccessible benchmarks } } @@ -1096,7 +1096,7 @@ export function createApp( }); // ── Discovery roots (runtime benchmark auto-discovery) ─────────────── - // Roots are persisted in ~/.agentv/projects.yaml. On each GET + // Roots are persisted in ~/.agentv/benchmarks.yaml. On each GET // /api/benchmarks, Studio rescans them and surfaces new `.agentv/` repos — // no server restart required. 
@@ -1165,7 +1165,7 @@ export function createApp( } const benchmarkId = c.req.param('benchmarkId') ?? ''; const active = getActiveBenchmark(benchmarkId); - if (!active) return c.json({ error: 'Project not found' }, 404); + if (!active) return c.json({ error: 'Benchmark not found' }, 404); // For a discovered entry, "remove" means hide it from the UI. The // .agentv/ dir stays on disk; the path goes onto the exclusion list // and is filtered out of resolveActiveBenchmarks on the next rescan. @@ -1174,7 +1174,7 @@ export function createApp( return c.json({ ok: true, excluded: active.path }); } const removed = removeBenchmark(benchmarkId); - if (!removed) return c.json({ error: 'Project not found' }, 404); + if (!removed) return c.json({ error: 'Benchmark not found' }, 404); return c.json({ ok: true }); }); @@ -1205,7 +1205,7 @@ export function createApp( }; }), ); - return c.json({ projects: benchmarks }); + return c.json({ benchmarks }); }); // ── Data routes (unscoped) ──────────────────────────────────────────── @@ -1213,7 +1213,7 @@ export function createApp( app.get('/api/config', (c) => handleConfig(c, defaultCtx, { readOnly, - multiProjectDashboard: options?.multiProjectDashboard, + multiBenchmarkDashboard: options?.multiBenchmarkDashboard, }), ); app.get('/api/remote/status', async (c) => c.json(await getRemoteResultsStatus(searchDir))); @@ -1330,7 +1330,7 @@ export function createApp( withBenchmark(c, (ctx, dataCtx) => handleConfig(ctx, dataCtx, { readOnly, - multiProjectDashboard: options?.multiProjectDashboard, + multiBenchmarkDashboard: options?.multiBenchmarkDashboard, }), ), ); @@ -1500,21 +1500,21 @@ export const resultsServeCommand = command({ multi: flag({ long: 'multi', description: - 'Launch in multi-project dashboard mode (deprecated; use auto-detect or --single)', + 'Launch in multi-benchmark dashboard mode (deprecated; use auto-detect or --single)', }), single: flag({ long: 'single', - description: 'Force single-project dashboard mode', + 
description: 'Force single-benchmark dashboard mode', }), add: option({ type: optional(string), long: 'add', - description: 'Register a project by path', + description: 'Register a benchmark by path', }), remove: option({ type: optional(string), long: 'remove', - description: 'Unregister a project by ID', + description: 'Unregister a benchmark by ID', }), discoveryRoot: multioption({ type: array(string), @@ -1535,7 +1535,7 @@ export const resultsServeCommand = command({ if (add) { try { const entry = addBenchmark(add); - console.log(`Registered project: ${entry.name} (${entry.id}) at ${entry.path}`); + console.log(`Registered benchmark: ${entry.name} (${entry.id}) at ${entry.path}`); } catch (err) { console.error(`Error: ${(err as Error).message}`); process.exit(1); @@ -1546,9 +1546,9 @@ export const resultsServeCommand = command({ if (remove) { const removed = removeBenchmark(remove); if (removed) { - console.log(`Unregistered project: ${remove}`); + console.log(`Unregistered benchmark: ${remove}`); } else { - console.error(`Project not found: ${remove}`); + console.error(`Benchmark not found: ${remove}`); process.exit(1); } return; @@ -1566,20 +1566,20 @@ export const resultsServeCommand = command({ // ── Version check ──────────────────────────────────────────────── // Enforce `required_version` from .agentv/config.yaml so Studio/serve // match `agentv eval` behavior. Same prompt in TTY, warn+continue - // otherwise. Single-project scope only — when one agentv instance + // otherwise. Single-benchmark scope only — when one agentv instance // serves multiple repos with differing version requirements, a - // per-project local install is required instead. + // per-benchmark local install is required instead. 
const repoRoot = await findRepoRoot(cwd); const yamlConfig = await loadConfig(path.join(cwd, '_'), repoRoot); if (yamlConfig?.required_version) { await enforceRequiredVersion(yamlConfig.required_version); } - // ── Determine multi-project mode ──────────────────────────────── + // ── Determine multi-benchmark mode ─────────────────────────────── // Count active (persisted + live-discovered) benchmarks so that the // dashboard mode reflects what the user will actually see in the UI. const activeBenchmarks = resolveActiveBenchmarks(); - const { isMultiProject, showMultiWarning } = resolveDashboardMode(activeBenchmarks.length, { + const { isMultiBenchmark, showMultiWarning } = resolveDashboardMode(activeBenchmarks.length, { multi, single, }); @@ -1614,17 +1614,17 @@ export const resultsServeCommand = command({ const resultDir = sourceFile ? path.dirname(path.resolve(sourceFile)) : cwd; const app = createApp(results, resultDir, cwd, sourceFile, { readOnly, - multiProjectDashboard: isMultiProject, + multiBenchmarkDashboard: isMultiBenchmark, }); if (showMultiWarning) { console.warn( - 'Warning: --multi is deprecated. Studio now auto-detects multi-project mode when multiple projects are registered. Use --single to force the single-project view.', + 'Warning: --multi is deprecated. Studio now auto-detects multi-benchmark mode when multiple benchmarks are registered. 
Use --single to force the single-benchmark view.', ); } - if (isMultiProject) { - console.log(`Multi-project mode: ${activeBenchmarks.length} project(s) active`); + if (isMultiBenchmark) { + console.log(`Multi-benchmark mode: ${activeBenchmarks.length} benchmark(s) active`); } else if (results.length > 0 && sourceFile) { console.log(`Serving ${results.length} result(s) from ${sourceFile}`); } else { diff --git a/apps/cli/test/commands/results/serve.test.ts b/apps/cli/test/commands/results/serve.test.ts index 6b79b5c1..702cd9ec 100644 --- a/apps/cli/test/commands/results/serve.test.ts +++ b/apps/cli/test/commands/results/serve.test.ts @@ -102,44 +102,44 @@ describe('loadResults', () => { // ── resolveDashboardMode ─────────────────────────────────────────────── describe('resolveDashboardMode', () => { - it('defaults to single-project mode when no projects are registered', () => { + it('defaults to single-benchmark mode when no benchmarks are registered', () => { expect(resolveDashboardMode(0, {})).toEqual({ - isMultiProject: false, + isMultiBenchmark: false, showMultiWarning: false, }); }); - it('defaults to single-project mode when exactly one project is registered', () => { + it('defaults to single-benchmark mode when exactly one benchmark is registered', () => { expect(resolveDashboardMode(1, {})).toEqual({ - isMultiProject: false, + isMultiBenchmark: false, showMultiWarning: false, }); }); - it('defaults to multi-project mode when multiple projects are registered', () => { + it('defaults to multi-benchmark mode when multiple benchmarks are registered', () => { expect(resolveDashboardMode(2, {})).toEqual({ - isMultiProject: true, + isMultiBenchmark: true, showMultiWarning: false, }); }); - it('forces multi-project mode with a deprecation warning when --multi is used', () => { + it('forces multi-benchmark mode with a deprecation warning when --multi is used', () => { expect(resolveDashboardMode(1, { multi: true })).toEqual({ - isMultiProject: true, + 
isMultiBenchmark: true, showMultiWarning: true, }); }); - it('forces single-project mode when --single is used', () => { + it('forces single-benchmark mode when --single is used', () => { expect(resolveDashboardMode(3, { single: true })).toEqual({ - isMultiProject: false, + isMultiBenchmark: false, showMultiWarning: false, }); }); it('lets --single override --multi', () => { expect(resolveDashboardMode(3, { multi: true, single: true })).toEqual({ - isMultiProject: false, + isMultiBenchmark: false, showMultiWarning: true, }); }); @@ -370,17 +370,17 @@ describe('serve app', () => { const app = createApp(results, tempDir, undefined, undefined, { studioDir, readOnly: true, - multiProjectDashboard: true, + multiBenchmarkDashboard: true, }); const res = await app.request('/api/config'); expect(res.status).toBe(200); const data = (await res.json()) as { read_only?: boolean; - multi_project_dashboard?: boolean; + multi_benchmark_dashboard?: boolean; }; expect(data.read_only).toBe(true); - expect(data.multi_project_dashboard).toBe(true); + expect(data.multi_benchmark_dashboard).toBe(true); }); }); diff --git a/apps/studio/src/components/ProjectCard.tsx b/apps/studio/src/components/BenchmarkCard.tsx similarity index 68% rename from apps/studio/src/components/ProjectCard.tsx rename to apps/studio/src/components/BenchmarkCard.tsx index a7af8ceb..a498c1a1 100644 --- a/apps/studio/src/components/ProjectCard.tsx +++ b/apps/studio/src/components/BenchmarkCard.tsx @@ -1,8 +1,8 @@ /** - * Project card for the multi-project dashboard. + * Benchmark card for the multi-benchmark dashboard. * - * Shows project name, path, run count, pass rate, and last run time. - * Click navigates to the project's run list. + * Shows benchmark name, path, run count, pass rate, and last run time. + * Click navigates to the benchmark's run list. 
*/ import { Link } from '@tanstack/react-router'; @@ -23,34 +23,34 @@ function formatTimeAgo(timestamp: string | null): string { return `${days}d ago`; } -export function ProjectCard({ project }: { project: BenchmarkSummary }) { - const passPercent = Math.round(project.pass_rate * 100); +export function BenchmarkCard({ benchmark }: { benchmark: BenchmarkSummary }) { + const passPercent = Math.round(benchmark.pass_rate * 100); return (

- {project.name} + {benchmark.name}

-

{project.path}

+

{benchmark.path}

Runs

-

{project.run_count}

+

{benchmark.run_count}

Pass Rate

= 80 ? 'text-emerald-400' @@ -59,12 +59,12 @@ export function ProjectCard({ project }: { project: BenchmarkSummary }) { : 'text-red-400' }`} > - {project.run_count > 0 ? `${passPercent}%` : '--'} + {benchmark.run_count > 0 ? `${passPercent}%` : '--'}

Last Run

-

{formatTimeAgo(project.last_run)}

+

{formatTimeAgo(benchmark.last_run)}

diff --git a/apps/studio/src/components/RunDetail.tsx b/apps/studio/src/components/RunDetail.tsx index 7e78431a..019885ee 100644 --- a/apps/studio/src/components/RunDetail.tsx +++ b/apps/studio/src/components/RunDetail.tsx @@ -193,7 +193,7 @@ export function RunDetail({ results, runId, benchmarkId }: RunDetailProps) { {benchmarkId ? ( diff --git a/apps/studio/src/components/RunList.tsx b/apps/studio/src/components/RunList.tsx index 6a82b78e..1d3f7be7 100644 --- a/apps/studio/src/components/RunList.tsx +++ b/apps/studio/src/components/RunList.tsx @@ -107,7 +107,7 @@ export function RunList({ runs, benchmarkId, emptyMessage }: RunListProps) { {benchmarkId ? ( diff --git a/apps/studio/src/components/Sidebar.tsx b/apps/studio/src/components/Sidebar.tsx index 3bc35f0f..0d420311 100644 --- a/apps/studio/src/components/Sidebar.tsx +++ b/apps/studio/src/components/Sidebar.tsx @@ -70,44 +70,48 @@ function SidebarShell({ children }: { children: ReactNode }) { export function Sidebar() { const matchRoute = useMatchRoute(); - // ── Project-scoped route matching ──────────────────────────────────── - const projectEvalMatch = matchRoute({ - to: '/projects/$benchmarkId/evals/$runId/$evalId', + // ── Benchmark-scoped route matching ────────────────────────────────── + const benchmarkEvalMatch = matchRoute({ + to: '/benchmarks/$benchmarkId/evals/$runId/$evalId', fuzzy: true, }); - const projectRunMatch = matchRoute({ - to: '/projects/$benchmarkId/runs/$runId', + const benchmarkRunMatch = matchRoute({ + to: '/benchmarks/$benchmarkId/runs/$runId', fuzzy: true, }); - const projectMatch = matchRoute({ - to: '/projects/$benchmarkId', + const benchmarkMatch = matchRoute({ + to: '/benchmarks/$benchmarkId', fuzzy: true, }); - // Project-scoped eval detail + // Benchmark-scoped eval detail if ( - projectEvalMatch && - typeof projectEvalMatch === 'object' && - 'benchmarkId' in projectEvalMatch + benchmarkEvalMatch && + typeof benchmarkEvalMatch === 'object' && + 'benchmarkId' in 
benchmarkEvalMatch ) { - const { benchmarkId, runId, evalId } = projectEvalMatch as { + const { benchmarkId, runId, evalId } = benchmarkEvalMatch as { benchmarkId: string; runId: string; evalId: string; }; - return ; + return ; } - // Project-scoped run detail - if (projectRunMatch && typeof projectRunMatch === 'object' && 'benchmarkId' in projectRunMatch) { - const { benchmarkId, runId } = projectRunMatch as { benchmarkId: string; runId: string }; - return ; + // Benchmark-scoped run detail + if ( + benchmarkRunMatch && + typeof benchmarkRunMatch === 'object' && + 'benchmarkId' in benchmarkRunMatch + ) { + const { benchmarkId, runId } = benchmarkRunMatch as { benchmarkId: string; runId: string }; + return ; } - // Project home (runs/experiments/targets) - if (projectMatch && typeof projectMatch === 'object' && 'benchmarkId' in projectMatch) { - const { benchmarkId } = projectMatch as { benchmarkId: string }; - return ; + // Benchmark home (runs/experiments/targets) + if (benchmarkMatch && typeof benchmarkMatch === 'object' && 'benchmarkId' in benchmarkMatch) { + const { benchmarkId } = benchmarkMatch as { benchmarkId: string }; + return ; } // ── Unscoped route matching ────────────────────────────────────────── @@ -154,14 +158,14 @@ export function Sidebar() { function RunSidebar() { const matchRoute = useMatchRoute(); - const { data: projectData } = useBenchmarkList(); - const hasProjects = (projectData?.projects.length ?? 0) > 0; + const { data: benchmarkData } = useBenchmarkList(); + const hasBenchmarks = (benchmarkData?.benchmarks.length ?? 
0) > 0; const isHome = matchRoute({ to: '/' }); const runMatch = matchRoute({ to: '/runs/$runId', fuzzy: true }); - // On the projects landing page, show aggregated runs from all projects - const useAggregated = hasProjects && isHome !== false; + // On the benchmarks landing page, show aggregated runs from all benchmarks + const useAggregated = hasBenchmarks && isHome !== false; const { data: localData } = useRunList(); const { data: aggregatedData } = useAllBenchmarkRuns(); @@ -199,15 +203,15 @@ function RunSidebar() { 'runId' in runMatch && (runMatch as { runId: string }).runId === run.filename; - // Aggregated runs link to their project's run detail - if (run.project_id) { + // Aggregated runs link to their benchmark's run detail + if (run.benchmark_id) { return ( {run.display_name ?? run.filename} @@ -406,7 +410,7 @@ function CategorySidebar({ runId, category }: { runId: string; category: string // ── Project-scoped sidebars ────────────────────────────────────────────── -function ProjectRunDetailSidebar({ +function BenchmarkRunDetailSidebar({ benchmarkId, currentRunId, }: { @@ -439,7 +443,7 @@ function ProjectRunDetailSidebar({ return ( @@ -498,7 +502,7 @@ function ProjectEvalSidebar({ return ( ; } @@ -277,22 +277,27 @@ export async function removeBenchmarkApi(benchmarkId: string): Promise { method: 'DELETE', }); if (!res.ok) { - throw new Error(`Failed to remove project: ${res.status}`); + throw new Error(`Failed to remove benchmark: ${res.status}`); } } -export async function discoverBenchmarksApi(dirPath: string): Promise { - const res = await fetch('/api/benchmarks/discover', { +/** + * Persist a directory as a discovery root. Studio rescans every configured + * root on each `/api/benchmarks` read so benchmarks under it appear/disappear + * live without a server restart. 
+ */ +export async function addDiscoveryRootApi(dirPath: string): Promise { + const res = await fetch('/api/benchmarks/discovery-roots', { method: 'POST', headers: { 'Content-Type': 'application/json' }, body: JSON.stringify({ path: dirPath }), }); if (!res.ok) { const err = (await res.json()) as { error: string }; - throw new Error(err.error || `Failed to discover: ${res.status}`); + throw new Error(err.error || `Failed to add discovery root: ${res.status}`); } - const data = (await res.json()) as { discovered: BenchmarkEntry[] }; - return data.discovered; + const data = (await res.json()) as { root: string }; + return data.root; } /** Build the API base URL for a benchmark-scoped request. */ diff --git a/apps/studio/src/lib/types.ts b/apps/studio/src/lib/types.ts index 98dfd03b..6ca18934 100644 --- a/apps/studio/src/lib/types.ts +++ b/apps/studio/src/lib/types.ts @@ -17,8 +17,8 @@ export interface RunMeta { target?: string; experiment?: string; source: 'local' | 'remote'; - project_id?: string; - project_name?: string; + benchmark_id?: string; + benchmark_name?: string; /** Optional user-assigned tags from the run's sidecar tags.json. 
*/ tags?: string[]; } @@ -236,8 +236,8 @@ export interface StudioConfigResponse { /** @deprecated Use threshold */ pass_threshold?: number; read_only?: boolean; - project_name?: string; - multi_project_dashboard?: boolean; + benchmark_name?: string; + multi_benchmark_dashboard?: boolean; } export interface RemoteStatusResponse { @@ -267,7 +267,7 @@ export interface BenchmarkSummary { } export interface BenchmarkListResponse { - projects: BenchmarkSummary[]; + benchmarks: BenchmarkSummary[]; } export interface BenchmarkEntry { diff --git a/apps/studio/src/routeTree.gen.ts b/apps/studio/src/routeTree.gen.ts index f1552a6c..dda530e9 100644 --- a/apps/studio/src/routeTree.gen.ts +++ b/apps/studio/src/routeTree.gen.ts @@ -12,14 +12,14 @@ import { Route as rootRouteImport } from './routes/__root' import { Route as SettingsRouteImport } from './routes/settings' import { Route as IndexRouteImport } from './routes/index' import { Route as RunsRunIdRouteImport } from './routes/runs/$runId' -import { Route as ProjectsBenchmarkIdRouteImport } from './routes/projects/$benchmarkId' import { Route as JobsRunIdRouteImport } from './routes/jobs/$runId' import { Route as ExperimentsExperimentNameRouteImport } from './routes/experiments/$experimentName' +import { Route as BenchmarksBenchmarkIdRouteImport } from './routes/benchmarks/$benchmarkId' import { Route as EvalsRunIdEvalIdRouteImport } from './routes/evals/$runId.$evalId' import { Route as RunsRunIdSuiteSuiteRouteImport } from './routes/runs/$runId_.suite.$suite' import { Route as RunsRunIdCategoryCategoryRouteImport } from './routes/runs/$runId_.category.$category' -import { Route as ProjectsBenchmarkIdRunsRunIdRouteImport } from './routes/projects/$benchmarkId_/runs/$runId' -import { Route as ProjectsBenchmarkIdEvalsRunIdEvalIdRouteImport } from './routes/projects/$benchmarkId_/evals/$runId.$evalId' +import { Route as BenchmarksBenchmarkIdRunsRunIdRouteImport } from './routes/benchmarks/$benchmarkId_/runs/$runId' +import { 
Route as BenchmarksBenchmarkIdEvalsRunIdEvalIdRouteImport } from './routes/benchmarks/$benchmarkId_/evals/$runId.$evalId' const SettingsRoute = SettingsRouteImport.update({ id: '/settings', @@ -36,11 +36,6 @@ const RunsRunIdRoute = RunsRunIdRouteImport.update({ path: '/runs/$runId', getParentRoute: () => rootRouteImport, } as any) -const ProjectsBenchmarkIdRoute = ProjectsBenchmarkIdRouteImport.update({ - id: '/projects/$benchmarkId', - path: '/projects/$benchmarkId', - getParentRoute: () => rootRouteImport, -} as any) const JobsRunIdRoute = JobsRunIdRouteImport.update({ id: '/jobs/$runId', path: '/jobs/$runId', @@ -52,6 +47,11 @@ const ExperimentsExperimentNameRoute = path: '/experiments/$experimentName', getParentRoute: () => rootRouteImport, } as any) +const BenchmarksBenchmarkIdRoute = BenchmarksBenchmarkIdRouteImport.update({ + id: '/benchmarks/$benchmarkId', + path: '/benchmarks/$benchmarkId', + getParentRoute: () => rootRouteImport, +} as any) const EvalsRunIdEvalIdRoute = EvalsRunIdEvalIdRouteImport.update({ id: '/evals/$runId/$evalId', path: '/evals/$runId/$evalId', @@ -68,113 +68,113 @@ const RunsRunIdCategoryCategoryRoute = path: '/runs/$runId/category/$category', getParentRoute: () => rootRouteImport, } as any) -const ProjectsBenchmarkIdRunsRunIdRoute = - ProjectsBenchmarkIdRunsRunIdRouteImport.update({ - id: '/projects/$benchmarkId_/runs/$runId', - path: '/projects/$benchmarkId/runs/$runId', +const BenchmarksBenchmarkIdRunsRunIdRoute = + BenchmarksBenchmarkIdRunsRunIdRouteImport.update({ + id: '/benchmarks/$benchmarkId_/runs/$runId', + path: '/benchmarks/$benchmarkId/runs/$runId', getParentRoute: () => rootRouteImport, } as any) -const ProjectsBenchmarkIdEvalsRunIdEvalIdRoute = - ProjectsBenchmarkIdEvalsRunIdEvalIdRouteImport.update({ - id: '/projects/$benchmarkId_/evals/$runId/$evalId', - path: '/projects/$benchmarkId/evals/$runId/$evalId', +const BenchmarksBenchmarkIdEvalsRunIdEvalIdRoute = + BenchmarksBenchmarkIdEvalsRunIdEvalIdRouteImport.update({ 
+ id: '/benchmarks/$benchmarkId_/evals/$runId/$evalId', + path: '/benchmarks/$benchmarkId/evals/$runId/$evalId', getParentRoute: () => rootRouteImport, } as any) export interface FileRoutesByFullPath { '/': typeof IndexRoute '/settings': typeof SettingsRoute + '/benchmarks/$benchmarkId': typeof BenchmarksBenchmarkIdRoute '/experiments/$experimentName': typeof ExperimentsExperimentNameRoute '/jobs/$runId': typeof JobsRunIdRoute - '/projects/$benchmarkId': typeof ProjectsBenchmarkIdRoute '/runs/$runId': typeof RunsRunIdRoute '/evals/$runId/$evalId': typeof EvalsRunIdEvalIdRoute - '/projects/$benchmarkId/runs/$runId': typeof ProjectsBenchmarkIdRunsRunIdRoute + '/benchmarks/$benchmarkId/runs/$runId': typeof BenchmarksBenchmarkIdRunsRunIdRoute '/runs/$runId/category/$category': typeof RunsRunIdCategoryCategoryRoute '/runs/$runId/suite/$suite': typeof RunsRunIdSuiteSuiteRoute - '/projects/$benchmarkId/evals/$runId/$evalId': typeof ProjectsBenchmarkIdEvalsRunIdEvalIdRoute + '/benchmarks/$benchmarkId/evals/$runId/$evalId': typeof BenchmarksBenchmarkIdEvalsRunIdEvalIdRoute } export interface FileRoutesByTo { '/': typeof IndexRoute '/settings': typeof SettingsRoute + '/benchmarks/$benchmarkId': typeof BenchmarksBenchmarkIdRoute '/experiments/$experimentName': typeof ExperimentsExperimentNameRoute '/jobs/$runId': typeof JobsRunIdRoute - '/projects/$benchmarkId': typeof ProjectsBenchmarkIdRoute '/runs/$runId': typeof RunsRunIdRoute '/evals/$runId/$evalId': typeof EvalsRunIdEvalIdRoute - '/projects/$benchmarkId/runs/$runId': typeof ProjectsBenchmarkIdRunsRunIdRoute + '/benchmarks/$benchmarkId/runs/$runId': typeof BenchmarksBenchmarkIdRunsRunIdRoute '/runs/$runId/category/$category': typeof RunsRunIdCategoryCategoryRoute '/runs/$runId/suite/$suite': typeof RunsRunIdSuiteSuiteRoute - '/projects/$benchmarkId/evals/$runId/$evalId': typeof ProjectsBenchmarkIdEvalsRunIdEvalIdRoute + '/benchmarks/$benchmarkId/evals/$runId/$evalId': typeof BenchmarksBenchmarkIdEvalsRunIdEvalIdRoute } 
export interface FileRoutesById { __root__: typeof rootRouteImport '/': typeof IndexRoute '/settings': typeof SettingsRoute + '/benchmarks/$benchmarkId': typeof BenchmarksBenchmarkIdRoute '/experiments/$experimentName': typeof ExperimentsExperimentNameRoute '/jobs/$runId': typeof JobsRunIdRoute - '/projects/$benchmarkId': typeof ProjectsBenchmarkIdRoute '/runs/$runId': typeof RunsRunIdRoute '/evals/$runId/$evalId': typeof EvalsRunIdEvalIdRoute - '/projects/$benchmarkId_/runs/$runId': typeof ProjectsBenchmarkIdRunsRunIdRoute + '/benchmarks/$benchmarkId_/runs/$runId': typeof BenchmarksBenchmarkIdRunsRunIdRoute '/runs/$runId_/category/$category': typeof RunsRunIdCategoryCategoryRoute '/runs/$runId_/suite/$suite': typeof RunsRunIdSuiteSuiteRoute - '/projects/$benchmarkId_/evals/$runId/$evalId': typeof ProjectsBenchmarkIdEvalsRunIdEvalIdRoute + '/benchmarks/$benchmarkId_/evals/$runId/$evalId': typeof BenchmarksBenchmarkIdEvalsRunIdEvalIdRoute } export interface FileRouteTypes { fileRoutesByFullPath: FileRoutesByFullPath fullPaths: | '/' | '/settings' + | '/benchmarks/$benchmarkId' | '/experiments/$experimentName' | '/jobs/$runId' - | '/projects/$benchmarkId' | '/runs/$runId' | '/evals/$runId/$evalId' - | '/projects/$benchmarkId/runs/$runId' + | '/benchmarks/$benchmarkId/runs/$runId' | '/runs/$runId/category/$category' | '/runs/$runId/suite/$suite' - | '/projects/$benchmarkId/evals/$runId/$evalId' + | '/benchmarks/$benchmarkId/evals/$runId/$evalId' fileRoutesByTo: FileRoutesByTo to: | '/' | '/settings' + | '/benchmarks/$benchmarkId' | '/experiments/$experimentName' | '/jobs/$runId' - | '/projects/$benchmarkId' | '/runs/$runId' | '/evals/$runId/$evalId' - | '/projects/$benchmarkId/runs/$runId' + | '/benchmarks/$benchmarkId/runs/$runId' | '/runs/$runId/category/$category' | '/runs/$runId/suite/$suite' - | '/projects/$benchmarkId/evals/$runId/$evalId' + | '/benchmarks/$benchmarkId/evals/$runId/$evalId' id: | '__root__' | '/' | '/settings' + | '/benchmarks/$benchmarkId' | 
'/experiments/$experimentName' | '/jobs/$runId' - | '/projects/$benchmarkId' | '/runs/$runId' | '/evals/$runId/$evalId' - | '/projects/$benchmarkId_/runs/$runId' + | '/benchmarks/$benchmarkId_/runs/$runId' | '/runs/$runId_/category/$category' | '/runs/$runId_/suite/$suite' - | '/projects/$benchmarkId_/evals/$runId/$evalId' + | '/benchmarks/$benchmarkId_/evals/$runId/$evalId' fileRoutesById: FileRoutesById } export interface RootRouteChildren { IndexRoute: typeof IndexRoute SettingsRoute: typeof SettingsRoute + BenchmarksBenchmarkIdRoute: typeof BenchmarksBenchmarkIdRoute ExperimentsExperimentNameRoute: typeof ExperimentsExperimentNameRoute JobsRunIdRoute: typeof JobsRunIdRoute - ProjectsBenchmarkIdRoute: typeof ProjectsBenchmarkIdRoute RunsRunIdRoute: typeof RunsRunIdRoute EvalsRunIdEvalIdRoute: typeof EvalsRunIdEvalIdRoute - ProjectsBenchmarkIdRunsRunIdRoute: typeof ProjectsBenchmarkIdRunsRunIdRoute + BenchmarksBenchmarkIdRunsRunIdRoute: typeof BenchmarksBenchmarkIdRunsRunIdRoute RunsRunIdCategoryCategoryRoute: typeof RunsRunIdCategoryCategoryRoute RunsRunIdSuiteSuiteRoute: typeof RunsRunIdSuiteSuiteRoute - ProjectsBenchmarkIdEvalsRunIdEvalIdRoute: typeof ProjectsBenchmarkIdEvalsRunIdEvalIdRoute + BenchmarksBenchmarkIdEvalsRunIdEvalIdRoute: typeof BenchmarksBenchmarkIdEvalsRunIdEvalIdRoute } declare module '@tanstack/react-router' { @@ -200,13 +200,6 @@ declare module '@tanstack/react-router' { preLoaderRoute: typeof RunsRunIdRouteImport parentRoute: typeof rootRouteImport } - '/projects/$benchmarkId': { - id: '/projects/$benchmarkId' - path: '/projects/$benchmarkId' - fullPath: '/projects/$benchmarkId' - preLoaderRoute: typeof ProjectsBenchmarkIdRouteImport - parentRoute: typeof rootRouteImport - } '/jobs/$runId': { id: '/jobs/$runId' path: '/jobs/$runId' @@ -221,6 +214,13 @@ declare module '@tanstack/react-router' { preLoaderRoute: typeof ExperimentsExperimentNameRouteImport parentRoute: typeof rootRouteImport } + '/benchmarks/$benchmarkId': { + id: 
'/benchmarks/$benchmarkId' + path: '/benchmarks/$benchmarkId' + fullPath: '/benchmarks/$benchmarkId' + preLoaderRoute: typeof BenchmarksBenchmarkIdRouteImport + parentRoute: typeof rootRouteImport + } '/evals/$runId/$evalId': { id: '/evals/$runId/$evalId' path: '/evals/$runId/$evalId' @@ -242,18 +242,18 @@ declare module '@tanstack/react-router' { preLoaderRoute: typeof RunsRunIdCategoryCategoryRouteImport parentRoute: typeof rootRouteImport } - '/projects/$benchmarkId_/runs/$runId': { - id: '/projects/$benchmarkId_/runs/$runId' - path: '/projects/$benchmarkId/runs/$runId' - fullPath: '/projects/$benchmarkId/runs/$runId' - preLoaderRoute: typeof ProjectsBenchmarkIdRunsRunIdRouteImport + '/benchmarks/$benchmarkId_/runs/$runId': { + id: '/benchmarks/$benchmarkId_/runs/$runId' + path: '/benchmarks/$benchmarkId/runs/$runId' + fullPath: '/benchmarks/$benchmarkId/runs/$runId' + preLoaderRoute: typeof BenchmarksBenchmarkIdRunsRunIdRouteImport parentRoute: typeof rootRouteImport } - '/projects/$benchmarkId_/evals/$runId/$evalId': { - id: '/projects/$benchmarkId_/evals/$runId/$evalId' - path: '/projects/$benchmarkId/evals/$runId/$evalId' - fullPath: '/projects/$benchmarkId/evals/$runId/$evalId' - preLoaderRoute: typeof ProjectsBenchmarkIdEvalsRunIdEvalIdRouteImport + '/benchmarks/$benchmarkId_/evals/$runId/$evalId': { + id: '/benchmarks/$benchmarkId_/evals/$runId/$evalId' + path: '/benchmarks/$benchmarkId/evals/$runId/$evalId' + fullPath: '/benchmarks/$benchmarkId/evals/$runId/$evalId' + preLoaderRoute: typeof BenchmarksBenchmarkIdEvalsRunIdEvalIdRouteImport parentRoute: typeof rootRouteImport } } @@ -262,16 +262,16 @@ declare module '@tanstack/react-router' { const rootRouteChildren: RootRouteChildren = { IndexRoute: IndexRoute, SettingsRoute: SettingsRoute, + BenchmarksBenchmarkIdRoute: BenchmarksBenchmarkIdRoute, ExperimentsExperimentNameRoute: ExperimentsExperimentNameRoute, JobsRunIdRoute: JobsRunIdRoute, - ProjectsBenchmarkIdRoute: ProjectsBenchmarkIdRoute, 
RunsRunIdRoute: RunsRunIdRoute, EvalsRunIdEvalIdRoute: EvalsRunIdEvalIdRoute, - ProjectsBenchmarkIdRunsRunIdRoute: ProjectsBenchmarkIdRunsRunIdRoute, + BenchmarksBenchmarkIdRunsRunIdRoute: BenchmarksBenchmarkIdRunsRunIdRoute, RunsRunIdCategoryCategoryRoute: RunsRunIdCategoryCategoryRoute, RunsRunIdSuiteSuiteRoute: RunsRunIdSuiteSuiteRoute, - ProjectsBenchmarkIdEvalsRunIdEvalIdRoute: - ProjectsBenchmarkIdEvalsRunIdEvalIdRoute, + BenchmarksBenchmarkIdEvalsRunIdEvalIdRoute: + BenchmarksBenchmarkIdEvalsRunIdEvalIdRoute, } export const routeTree = rootRouteImport ._addFileChildren(rootRouteChildren) diff --git a/apps/studio/src/routes/projects/$benchmarkId.tsx b/apps/studio/src/routes/benchmarks/$benchmarkId.tsx similarity index 89% rename from apps/studio/src/routes/projects/$benchmarkId.tsx rename to apps/studio/src/routes/benchmarks/$benchmarkId.tsx index 3bcdaf28..f8d69819 100644 --- a/apps/studio/src/routes/projects/$benchmarkId.tsx +++ b/apps/studio/src/routes/benchmarks/$benchmarkId.tsx @@ -1,7 +1,7 @@ /** - * Project home route: tabbed view (Runs, Experiments, Analytics, Targets) scoped to a project. + * Benchmark home route: tabbed view (Runs, Experiments, Analytics, Targets) scoped to a benchmark. * - * Mirrors the single-project home page but fetches from project-scoped API endpoints. + * Mirrors the single-benchmark home page but fetches from benchmark-scoped API endpoints. 
*/ import { createFileRoute, useNavigate, useRouterState } from '@tanstack/react-router'; @@ -32,11 +32,11 @@ const tabs: { id: TabId; label: string }[] = [ { id: 'targets', label: 'Targets' }, ]; -export const Route = createFileRoute('/projects/$benchmarkId')({ - component: ProjectHomePage, +export const Route = createFileRoute('/benchmarks/$benchmarkId')({ + component: BenchmarkHomePage, }); -function ProjectHomePage() { +function BenchmarkHomePage() { const { benchmarkId } = Route.useParams(); const routerState = useRouterState(); const searchParams = routerState.location.search as Record; @@ -72,7 +72,7 @@ function ProjectHomePage() { key={t.id} onClick={() => navigate({ - to: '/projects/$benchmarkId', + to: '/benchmarks/$benchmarkId', params: { benchmarkId }, search: { tab: t.id } as Record, }) @@ -89,12 +89,12 @@ function ProjectHomePage() { - {activeTab === 'runs' && } - {activeTab === 'experiments' && } + {activeTab === 'runs' && } + {activeTab === 'experiments' && } {activeTab === 'analytics' && ( - + )} - {activeTab === 'targets' && } + {activeTab === 'targets' && } {!isReadOnly && ( 0; - const multiProjectDashboard = config?.multi_project_dashboard; + const hasBenchmarks = (benchmarkData?.benchmarks.length ?? 
0) > 0; + const multiBenchmarkDashboard = config?.multi_benchmark_dashboard; - if (projectsLoading || configLoading) { + if (benchmarksLoading || configLoading) { return ; } - if (multiProjectDashboard === true || (multiProjectDashboard === undefined && hasProjects)) { - return ; + if ( + multiBenchmarkDashboard === true || + (multiBenchmarkDashboard === undefined && hasBenchmarks) + ) { + return ; } - return ; + return ; } -// ── Projects Dashboard ────────────────────────────────────────────────── +// ── Benchmarks Dashboard ──────────────────────────────────────────────── -function ProjectsDashboard() { +function BenchmarksDashboard() { const { data } = useBenchmarkList(); const { data: config } = useStudioConfig(); const queryClient = useQueryClient(); const [addPath, setAddPath] = useState(''); - const [discoverPath, setDiscoverPath] = useState(''); + const [rootPath, setRootPath] = useState(''); const [error, setError] = useState(null); const [showAddForm, setShowAddForm] = useState(false); const [showRunEval, setShowRunEval] = useState(false); - const projects = data?.projects ?? []; + const benchmarks = data?.benchmarks ?? 
[]; const isReadOnly = config?.read_only === true; - async function handleAddProject(e: React.FormEvent) { + async function handleAddBenchmark(e: React.FormEvent) { e.preventDefault(); if (!addPath.trim()) return; setError(null); @@ -88,16 +91,13 @@ function ProjectsDashboard() { } } - async function handleDiscover(e: React.FormEvent) { + async function handleAddDiscoveryRoot(e: React.FormEvent) { e.preventDefault(); - if (!discoverPath.trim()) return; + if (!rootPath.trim()) return; setError(null); try { - const discovered = await discoverBenchmarksApi(discoverPath.trim()); - setDiscoverPath(''); - if (discovered.length === 0) { - setError('No projects with .agentv/ found in that directory.'); - } + await addDiscoveryRootApi(rootPath.trim()); + setRootPath(''); queryClient.invalidateQueries({ queryKey: ['benchmarks'] }); } catch (err) { setError((err as Error).message); @@ -138,7 +138,7 @@ function ProjectsDashboard() { {!isReadOnly && showAddForm && (
-
+
-
+ setDiscoverPath(e.target.value)} - placeholder="Discover benchmarks in directory..." + value={rootPath} + onChange={(e) => setRootPath(e.target.value)} + placeholder="Watch a directory for .agentv/ repos..." className="flex-1 rounded-md border border-gray-700 bg-gray-800 px-3 py-1.5 text-sm text-white placeholder-gray-500 focus:border-cyan-600 focus:outline-none" />
)}
- {projects.map((project) => ( - + {benchmarks.map((benchmark) => ( + ))}
@@ -182,9 +182,9 @@ function ProjectsDashboard() { ); } -// ── Single-project home (existing behavior) ───────────────────────────── +// ── Single-benchmark home (existing behavior) ─────────────────────────── -function SingleProjectHome() { +function SingleBenchmarkHome() { const routerState = useRouterState(); const searchParams = routerState.location.search as Record; const tab = searchParams.tab as TabId | undefined; @@ -225,8 +225,8 @@ function SingleProjectHome() {

Evaluation Runs

- {config?.project_name && ( -

{config.project_name}

+ {config?.benchmark_name && ( +

{config.benchmark_name}

)}
{!isReadOnly && ( diff --git a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx index b4c33362..49a8947c 100644 --- a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx +++ b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx @@ -409,7 +409,7 @@ The `{timestamp}` placeholder is replaced with an ISO-like timestamp (e.g., `202 ### AGENTV_HOME -Override the data directory for heavy runtime artifacts — workspaces, workspace pool, subagents, trace state, git cache, and downloaded dependencies. Lightweight config and cache files (`version-check.json`, `last-config.json`, `projects.yaml`) always stay in `~/.agentv` regardless of this setting. +Override the data directory for heavy runtime artifacts — workspaces, workspace pool, subagents, trace state, git cache, and downloaded dependencies. Lightweight config and cache files (`version-check.json`, `last-config.json`, `benchmarks.yaml`) always stay in `~/.agentv` regardless of this setting. 
```bash # Linux/macOS diff --git a/apps/web/src/content/docs/docs/tools/studio.mdx b/apps/web/src/content/docs/docs/tools/studio.mdx index 1df0b42a..3d485366 100644 --- a/apps/web/src/content/docs/docs/tools/studio.mdx +++ b/apps/web/src/content/docs/docs/tools/studio.mdx @@ -45,11 +45,11 @@ agentv studio .agentv/results/runs/2026-03-30T11-45-56-989Z |--------|-------------| | `--port`, `-p` | Port to listen on (flag > `PORT` env var > 3117) | | `--dir`, `-d` | Working directory (default: current directory) | -| `--multi` | Launch in multi-project dashboard mode (deprecated; use auto-detect or `--single`) | -| `--single` | Force single-project dashboard mode | -| `--add <path>` | Register a project by path | -| `--remove <id>` | Unregister a project by ID | -| `--discover <path>` | Scan a directory tree for repos with `.agentv/` | +| `--multi` | Launch in multi-benchmark dashboard mode (deprecated; use auto-detect or `--single`) | +| `--single` | Force single-benchmark dashboard mode | +| `--add <path>` | Register a benchmark by path | +| `--remove <id>` | Unregister a benchmark by ID | +| `--discovery-root <path>` | Watch a directory: benchmarks under it appear/disappear live without restart. Repeatable. | ## Features @@ -152,28 +152,30 @@ agentv studio --add /path/to/my-evals agentv studio --add /path/to/other-evals ``` -Each path must contain a `.agentv/` directory. Registered benchmarks are stored in `~/.agentv/projects.yaml`. +Each path must contain a `.agentv/` directory. Registered benchmarks are stored in `~/.agentv/benchmarks.yaml`. -### Auto-Discovery +### Runtime Discovery -Scan a parent directory to find and register all benchmark repos: +For a 24/7 Studio deployment, tell it which directory to watch: ```bash -agentv studio --discover /path/to/repos +agentv studio --discovery-root /path/to/repos ``` -This recursively searches (up to 2 levels deep) for directories containing `.agentv/` and registers them. 
+Studio rescans every configured root on each `/api/benchmarks` read (every ~10 s via the UI poll), so any `.agentv/` repo appearing or disappearing under a root shows up without restarting the server. The flag is repeatable and persisted to `~/.agentv/benchmarks.yaml` under `discovery_roots`, so you only need to pass it once. + +Click **Remove** on a discovered benchmark to hide it from the UI — its path goes into the `excluded_paths` list. Pinning it via `--add` or the UI's Add form un-hides it (manual always wins). ### Launching the Dashboard -Studio auto-detects the mode based on how many benchmarks are registered: +Studio auto-detects the mode based on how many benchmarks are active: -- `0` or `1` registered: single-project view -- `2+` registered: Benchmarks dashboard +- `0` or `1` active: single-benchmark view +- `2+` active: Benchmarks dashboard ```bash agentv studio # auto-detects -agentv studio --single # force single-project view +agentv studio --single # force single-benchmark view ``` The landing page shows a card for each benchmark with run count, pass rate, and last run time. diff --git a/docs/plans/1144-runtime-benchmark-discovery.md b/docs/plans/1144-runtime-benchmark-discovery.md index 442e0a7e..74a7ad82 100644 --- a/docs/plans/1144-runtime-benchmark-discovery.md +++ b/docs/plans/1144-runtime-benchmark-discovery.md @@ -1,7 +1,7 @@ # Studio Runtime Benchmark Discovery (#1144) ## Problem -Studio reads `~/.agentv/projects.yaml` fresh on every `/api/benchmarks` request, so +Studio reads `~/.agentv/benchmarks.yaml` fresh on every `/api/benchmarks` request, so edits to that file are already picked up live. What doesn't work is **filesystem discovery**: `--discover ` is a one-shot scan at startup, so any `.agentv/` repo that appears/disappears under that path while `agentv serve` is running is @@ -9,7 +9,7 @@ invisible until restart. 
## Design -### Persisted state (projects.yaml) +### Persisted state (benchmarks.yaml) Extend `BenchmarkRegistry` with an optional `discoveryRoots?: string[]`. This is the persisted list of directories Studio should continuously scan for `.agentv/` repos. Existing `benchmarks` entries remain untouched. diff --git a/packages/core/src/benchmarks.ts b/packages/core/src/benchmarks.ts index bea9d5bd..11a5da6b 100644 --- a/packages/core/src/benchmarks.ts +++ b/packages/core/src/benchmarks.ts @@ -2,9 +2,9 @@ * Benchmark registry for AgentV Studio multi-benchmark support. * * A Benchmark = any directory containing a `.agentv/` folder. - * The registry lives at `~/.agentv/projects.yaml` and tracks registered benchmarks - * plus an optional list of discovery roots that Studio continuously rescans at - * runtime so repos can appear/disappear without a server restart. + * The registry lives at `~/.agentv/benchmarks.yaml` and tracks registered + * benchmarks plus an optional list of discovery roots that Studio continuously + * rescans at runtime so repos can appear/disappear without a server restart. * * YAML format (all keys snake_case per AGENTS.md §"Wire Format Convention"): * benchmarks: @@ -27,11 +27,15 @@ * * Concurrency: the registry assumes a single writer. All mutating calls * (add/remove/touchBenchmark, add/removeDiscoveryRoot) do read-modify-write on - * projects.yaml without a lock. Interleaved writes from multiple processes + * benchmarks.yaml without a lock. Interleaved writes from multiple processes * can clobber each other; Studio's HTTP handlers are serialized by Node's * single-threaded event loop, which satisfies the 24/7 Studio case. Run only * one `agentv` process against a given home at a time. * + * Legacy filename: pre-rename installs stored this at `~/.agentv/projects.yaml`. + * On first load, migrateLegacyRegistry() copies that file to benchmarks.yaml + * (and deletes the old one) so nobody loses their registrations. 
+ * * To extend: * - For CRUD on persisted entries: loadBenchmarkRegistry() / saveBenchmarkRegistry(). * - For live discovery: addDiscoveryRoot() / removeDiscoveryRoot() / @@ -46,6 +50,7 @@ import { mkdirSync, readFileSync, readdirSync, + rmSync, statSync, writeFileSync, } from 'node:fs'; @@ -84,22 +89,42 @@ export interface BenchmarkRegistry { // ── Registry path ─────────────────────────────────────────────────────── export function getBenchmarksRegistryPath(): string { - return path.join(getAgentvConfigDir(), 'projects.yaml'); + return path.join(getAgentvConfigDir(), 'benchmarks.yaml'); } /** - * One-time migration: if projects.yaml exists at the old AGENTV_HOME location - * but not in ~/.agentv, copy it over. This handles the case where users had - * AGENTV_HOME set and projects.yaml was created there before the config/data split. + * One-time migration run before every load. Two legacy forms are handled: + * 1. AGENTV_HOME-relative `projects.yaml` — from before the config/data + * split, when the registry lived under AGENTV_HOME. + * 2. `~/.agentv/projects.yaml` — from before the projects→benchmarks + * rename. + * Whichever is found first (1 beats 2) is copied to the current + * benchmarks.yaml path and the legacy file is removed so the migration + * doesn't keep repeating. 
*/ -function migrateProjectsYaml(targetPath: string): void { +function migrateLegacyRegistry(targetPath: string): void { + if (existsSync(targetPath)) return; + const dataHome = getAgentvHome(); const configDir = getAgentvConfigDir(); - if (dataHome === configDir) return; - const legacyPath = path.join(dataHome, 'projects.yaml'); - if (!existsSync(legacyPath)) return; - mkdirSync(path.dirname(targetPath), { recursive: true }); - copyFileSync(legacyPath, targetPath); + const legacyCandidates: string[] = []; + if (dataHome !== configDir) { + legacyCandidates.push(path.join(dataHome, 'projects.yaml')); + } + legacyCandidates.push(path.join(configDir, 'projects.yaml')); + + for (const legacy of legacyCandidates) { + if (!existsSync(legacy)) continue; + mkdirSync(path.dirname(targetPath), { recursive: true }); + copyFileSync(legacy, targetPath); + try { + rmSync(legacy, { force: true }); + } catch { + // Leaving the legacy file behind is harmless — next load sees + // targetPath exists and skips the migration entirely. + } + return; + } } // ── Load / Save ───────────────────────────────────────────────────────── @@ -146,7 +171,7 @@ function toYaml(entry: BenchmarkEntry): BenchmarkEntryYaml { export function loadBenchmarkRegistry(): BenchmarkRegistry { const registryPath = getBenchmarksRegistryPath(); if (!existsSync(registryPath)) { - migrateProjectsYaml(registryPath); + migrateLegacyRegistry(registryPath); } if (!existsSync(registryPath)) { return { benchmarks: [] }; diff --git a/packages/core/src/paths.ts b/packages/core/src/paths.ts index e864319c..f11cd4c8 100644 --- a/packages/core/src/paths.ts +++ b/packages/core/src/paths.ts @@ -6,7 +6,7 @@ let logged = false; /** * The default config directory (~/.agentv). Always resolves to the user's home * directory regardless of AGENTV_HOME. Used for lightweight, machine-local files - * like version-check.json, last-config.json, and projects.yaml. + * like version-check.json, last-config.json, and benchmarks.yaml. 
*/ export function getAgentvConfigDir(): string { return path.join(os.homedir(), '.agentv'); diff --git a/packages/core/test/benchmarks.test.ts b/packages/core/test/benchmarks.test.ts index 9d9bd163..e01ba316 100644 --- a/packages/core/test/benchmarks.test.ts +++ b/packages/core/test/benchmarks.test.ts @@ -1,5 +1,5 @@ import { afterEach, beforeEach, describe, expect, it, spyOn } from 'bun:test'; -import { mkdirSync, mkdtempSync, readFileSync, rmSync } from 'node:fs'; +import { existsSync, mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs'; import os from 'node:os'; import path from 'node:path'; @@ -40,6 +40,25 @@ describe('benchmarks registry + runtime discovery', () => { return dir; } + it('migrates a legacy ~/.agentv/projects.yaml to benchmarks.yaml on first load', () => { + const legacyPath = path.join(fakeHome, '.agentv', 'projects.yaml'); + mkdirSync(path.dirname(legacyPath), { recursive: true }); + // Write a legacy registry by hand using the current snake_case format. + writeFileSync( + legacyPath, + 'benchmarks:\n - id: legacy\n name: legacy\n path: /legacy/path\n added_at: "2026-01-01T00:00:00Z"\n last_opened_at: "2026-01-01T00:00:00Z"\n', + 'utf-8', + ); + + const registry = loadBenchmarkRegistry(); + expect(registry.benchmarks).toHaveLength(1); + expect(registry.benchmarks[0].id).toBe('legacy'); + + // File moved, not copied: the legacy path is gone, the new one exists. 
+ expect(existsSync(legacyPath)).toBe(false); + expect(existsSync(path.join(fakeHome, '.agentv', 'benchmarks.yaml'))).toBe(true); + }); + it('persists and lists discovery roots, omitting the key when empty', () => { expect(getDiscoveryRoots()).toEqual([]); expect(loadBenchmarkRegistry().discoveryRoots).toBeUndefined(); From 15478e066b880044ec0c455ee6139e24986d56fc Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Mon, 20 Apr 2026 09:49:01 +0200 Subject: [PATCH 07/12] chore(core): mark migrateLegacyRegistry for v5.0.0 removal MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The projects.yaml → benchmarks.yaml migration shim is one-time: any surviving file is migrated on the first post-upgrade `agentv` invocation. Leaving the code in place past v5.0.0 is dead weight. Flag it with a concrete TODO so future maintainers know where and when to delete it. Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/core/src/benchmarks.ts | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/packages/core/src/benchmarks.ts b/packages/core/src/benchmarks.ts index 11a5da6b..06f860cd 100644 --- a/packages/core/src/benchmarks.ts +++ b/packages/core/src/benchmarks.ts @@ -101,6 +101,12 @@ export function getBenchmarksRegistryPath(): string { * Whichever is found first (1 beats 2) is copied to the current * benchmarks.yaml path and the legacy file is removed so the migration * doesn't keep repeating. + * + * TODO(v5.0.0): delete this function and the `rmSync` import. By the next + * major release, any surviving projects.yaml has been migrated on the first + * `agentv` invocation after upgrade; keeping the shim beyond then is dead + * weight. Callers can switch `loadBenchmarkRegistry` to skip the migration + * check unconditionally. 
*/ function migrateLegacyRegistry(targetPath: string): void { if (existsSync(targetPath)) return; From 716dcb48525b369c80b72b806f6133d9a0a907b3 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Mon, 20 Apr 2026 09:55:55 +0200 Subject: [PATCH 08/12] revert: drop projects.yaml migration shim MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Multi-benchmark Studio only shipped last week — nobody has a populated projects.yaml to migrate from in practice. The shim (and its TODO pointing at a future v5.0.0 cleanup) is dead code that future maintainers have to carry. Delete it now while there's no adoption to protect, and let any (hypothetical) stale file simply mean "re-register your benchmarks." - Remove migrateLegacyRegistry() and its call site in loadBenchmarkRegistry. - Remove the legacy-filename paragraph from the benchmarks.ts header. - Remove the unused copyFileSync / rmSync / getAgentvHome imports. - Remove the migration test case from benchmarks.test.ts. Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/core/src/benchmarks.ts | 61 +-------------------------- packages/core/test/benchmarks.test.ts | 21 +-------- 2 files changed, 3 insertions(+), 79 deletions(-) diff --git a/packages/core/src/benchmarks.ts b/packages/core/src/benchmarks.ts index 06f860cd..5f948be3 100644 --- a/packages/core/src/benchmarks.ts +++ b/packages/core/src/benchmarks.ts @@ -32,10 +32,6 @@ * single-threaded event loop, which satisfies the 24/7 Studio case. Run only * one `agentv` process against a given home at a time. * - * Legacy filename: pre-rename installs stored this at `~/.agentv/projects.yaml`. - * On first load, migrateLegacyRegistry() copies that file to benchmarks.yaml - * (and deletes the old one) so nobody loses their registrations. - * * To extend: * - For CRUD on persisted entries: loadBenchmarkRegistry() / saveBenchmarkRegistry(). 
* - For live discovery: addDiscoveryRoot() / removeDiscoveryRoot() / @@ -44,21 +40,12 @@ * its output is sorted for deterministic id assignment under basename collisions. */ -import { - copyFileSync, - existsSync, - mkdirSync, - readFileSync, - readdirSync, - rmSync, - statSync, - writeFileSync, -} from 'node:fs'; +import { existsSync, mkdirSync, readFileSync, readdirSync, statSync, writeFileSync } from 'node:fs'; import path from 'node:path'; import { parse as parseYaml, stringify as stringifyYaml } from 'yaml'; -import { getAgentvConfigDir, getAgentvHome } from './paths.js'; +import { getAgentvConfigDir } from './paths.js'; // ── Types ─────────────────────────────────────────────────────────────── @@ -92,47 +79,6 @@ export function getBenchmarksRegistryPath(): string { return path.join(getAgentvConfigDir(), 'benchmarks.yaml'); } -/** - * One-time migration run before every load. Two legacy forms are handled: - * 1. AGENTV_HOME-relative `projects.yaml` — from before the config/data - * split, when the registry lived under AGENTV_HOME. - * 2. `~/.agentv/projects.yaml` — from before the projects→benchmarks - * rename. - * Whichever is found first (1 beats 2) is copied to the current - * benchmarks.yaml path and the legacy file is removed so the migration - * doesn't keep repeating. - * - * TODO(v5.0.0): delete this function and the `rmSync` import. By the next - * major release, any surviving projects.yaml has been migrated on the first - * `agentv` invocation after upgrade; keeping the shim beyond then is dead - * weight. Callers can switch `loadBenchmarkRegistry` to skip the migration - * check unconditionally. 
- */ -function migrateLegacyRegistry(targetPath: string): void { - if (existsSync(targetPath)) return; - - const dataHome = getAgentvHome(); - const configDir = getAgentvConfigDir(); - const legacyCandidates: string[] = []; - if (dataHome !== configDir) { - legacyCandidates.push(path.join(dataHome, 'projects.yaml')); - } - legacyCandidates.push(path.join(configDir, 'projects.yaml')); - - for (const legacy of legacyCandidates) { - if (!existsSync(legacy)) continue; - mkdirSync(path.dirname(targetPath), { recursive: true }); - copyFileSync(legacy, targetPath); - try { - rmSync(legacy, { force: true }); - } catch { - // Leaving the legacy file behind is harmless — next load sees - // targetPath exists and skips the migration entirely. - } - return; - } -} - // ── Load / Save ───────────────────────────────────────────────────────── // YAML uses snake_case per AGENTS.md §"Wire Format Convention"; TypeScript // internals stay camelCase. fromYaml / toYaml handle the translation; every @@ -176,9 +122,6 @@ function toYaml(entry: BenchmarkEntry): BenchmarkEntryYaml { export function loadBenchmarkRegistry(): BenchmarkRegistry { const registryPath = getBenchmarksRegistryPath(); - if (!existsSync(registryPath)) { - migrateLegacyRegistry(registryPath); - } if (!existsSync(registryPath)) { return { benchmarks: [] }; } diff --git a/packages/core/test/benchmarks.test.ts b/packages/core/test/benchmarks.test.ts index e01ba316..9d9bd163 100644 --- a/packages/core/test/benchmarks.test.ts +++ b/packages/core/test/benchmarks.test.ts @@ -1,5 +1,5 @@ import { afterEach, beforeEach, describe, expect, it, spyOn } from 'bun:test'; -import { existsSync, mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs'; +import { mkdirSync, mkdtempSync, readFileSync, rmSync } from 'node:fs'; import os from 'node:os'; import path from 'node:path'; @@ -40,25 +40,6 @@ describe('benchmarks registry + runtime discovery', () => { return dir; } - it('migrates a legacy 
~/.agentv/projects.yaml to benchmarks.yaml on first load', () => { - const legacyPath = path.join(fakeHome, '.agentv', 'projects.yaml'); - mkdirSync(path.dirname(legacyPath), { recursive: true }); - // Write a legacy registry by hand using the current snake_case format. - writeFileSync( - legacyPath, - 'benchmarks:\n - id: legacy\n name: legacy\n path: /legacy/path\n added_at: "2026-01-01T00:00:00Z"\n last_opened_at: "2026-01-01T00:00:00Z"\n', - 'utf-8', - ); - - const registry = loadBenchmarkRegistry(); - expect(registry.benchmarks).toHaveLength(1); - expect(registry.benchmarks[0].id).toBe('legacy'); - - // File moved, not copied: the legacy path is gone, the new one exists. - expect(existsSync(legacyPath)).toBe(false); - expect(existsSync(path.join(fakeHome, '.agentv', 'benchmarks.yaml'))).toBe(true); - }); - it('persists and lists discovery roots, omitting the key when empty', () => { expect(getDiscoveryRoots()).toEqual([]); expect(loadBenchmarkRegistry().discoveryRoots).toBeUndefined(); From dadc0524f7f0f11874229d0f403bc831ee458527 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Mon, 20 Apr 2026 14:08:25 +0200 Subject: [PATCH 09/12] =?UTF-8?q?docs:=20sweep=20trailing=20project=20?= =?UTF-8?q?=E2=86=92=20benchmark=20references?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - AGENTS.md §"Wire Format Convention" now uses benchmark_id in its HTTP-body example, matching the actual wire shape. - Delete docs/plans/1144-runtime-benchmark-discovery.md. Per AGENTS.md §"Plans and Worktrees", plans are working materials and should be removed before merging; the design decisions it captured have all landed in code, the header docstring, and studio.mdx. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- AGENTS.md | 2 +- .../plans/1144-runtime-benchmark-discovery.md | 70 ------------------- 2 files changed, 1 insertion(+), 71 deletions(-) delete mode 100644 docs/plans/1144-runtime-benchmark-discovery.md diff --git a/AGENTS.md b/AGENTS.md index 511987db..90e3209c 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -154,7 +154,7 @@ The rule is blanket: if the key is going to disk, to a user's editor, into a JSO - All YAML files on disk: `*.eval.yaml`, `agentv.config.yaml`, `benchmarks.yaml`, `studio/config.yaml`, any future YAML we add. - JSONL result files (`test_id`, `token_usage`, `duration_ms`). - Artifact-writer output (`pass_rate`, `tests_run`, `total_tool_calls`). -- HTTP response bodies from `agentv serve` / Studio (`added_at`, `pass_rate`, `project_id`). +- HTTP response bodies from `agentv serve` / Studio (`added_at`, `pass_rate`, `benchmark_id`). - CLI JSON output (`agentv results summary`, `results failures`, `results show`). - Anything consumed by non-TS tooling (Python, jq pipelines, external dashboards). diff --git a/docs/plans/1144-runtime-benchmark-discovery.md b/docs/plans/1144-runtime-benchmark-discovery.md deleted file mode 100644 index 74a7ad82..00000000 --- a/docs/plans/1144-runtime-benchmark-discovery.md +++ /dev/null @@ -1,70 +0,0 @@ -# Studio Runtime Benchmark Discovery (#1144) - -## Problem -Studio reads `~/.agentv/benchmarks.yaml` fresh on every `/api/benchmarks` request, so -edits to that file are already picked up live. What doesn't work is **filesystem -discovery**: `--discover ` is a one-shot scan at startup, so any `.agentv/` -repo that appears/disappears under that path while `agentv serve` is running is -invisible until restart. - -## Design - -### Persisted state (benchmarks.yaml) -Extend `BenchmarkRegistry` with an optional `discoveryRoots?: string[]`. This is -the persisted list of directories Studio should continuously scan for -`.agentv/` repos. 
Existing `benchmarks` entries remain untouched. - -### Active-vs-persisted split -Introduce `resolveActiveBenchmarks()` in `packages/core/src/benchmarks.ts`: -- Start with the persisted `benchmarks` array (manually added entries). -- For each discovery root, call `discoverBenchmarks(root)` and generate - synthetic entries with `source: 'discovered'`. Absolute path is the identity; - id is derived from basename + dedup against persisted ids. -- Persisted wins on path conflict (so a user can opt a discovered repo into - manual management). -- Return the merged list. Nothing is written to disk. - -This is cheap (depth-2 `readdirSync`) and avoids write contention. Discovered -entries are ephemeral — removing a `.agentv/` directory causes the next scan to -drop it. Manually-added entries are never auto-removed. - -### API changes (apps/cli/src/commands/results/serve.ts) -- `/api/benchmarks`, `/api/benchmarks/all-runs`, `/api/benchmarks/:id/summary`, - and `withBenchmark()` switch from `loadBenchmarkRegistry()` / - `getBenchmark()` to the resolved list, so discovered entries participate in - every benchmark-scoped route. -- New endpoints: - - `GET /api/benchmarks/discovery-roots` → `{ roots: string[] }` - - `POST /api/benchmarks/discovery-roots` `{ path }` → `{ root }` - - `DELETE /api/benchmarks/discovery-roots` `{ path }` → `{ ok: true }` - - `POST /api/benchmarks/rescan` → same shape as `GET /api/benchmarks` - -### CLI changes -Add `--discovery-root ` (repeatable via `multioption`). Paths are resolved -to absolute and appended to the persisted `discoveryRoots` (idempotent). The -server still starts — this is not a one-shot flag. - -The existing `--discover ` flag keeps its one-shot semantics for backward -compatibility. - -### Wire format -Discovered entries return `source: "discovered"` in the snake_case response so -the frontend can optionally disable the Remove button for them. 
The default is -`"manual"` (preserving the existing response shape for registered repos). - -## Acceptance-criteria mapping - -| Criterion | Handled by | -| ------------------------------------------- | --------------------------------------- | -| Start with zero projects, stay healthy | Already works; no change | -| New `.agentv/` repo appears without restart | `resolveActiveBenchmarks()` on each GET | -| Removed repo disappears without restart | Same — scan is recomputed per request | -| `/api/benchmarks` reflects live state | Same | - -## Test plan -1. Unit test `resolveActiveBenchmarks` with temp directories (add + remove - `.agentv/` and assert the returned list reflects it). -2. Unit test that persisted entries win over discovered ones at the same path. -3. Red/green UAT: start `agentv serve --discovery-root `; `curl - /api/benchmarks` → empty; `mkdir /r1/.agentv`; re-curl → shows `r1`; - `rm -rf /r1/.agentv`; re-curl → gone. Same server process throughout. From 70baab8299a81006677cf0913ad9398ee2b14b8b Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Mon, 20 Apr 2026 14:27:02 +0200 Subject: [PATCH 10/12] review: fix stale projects query key + tidy rename holdouts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Final-review findings from a fresh subagent pass over commits since the first review: - RunEvalModal invalidated queryKey: ['projects'] after an eval run finished. That key never existed after the rename (the benchmark list uses ['benchmarks']), so the multi-benchmark dashboard's pass-rate / last-run columns did not refresh when an eval completed from the modal. Rename to ['benchmarks']. Real regression — the only functional bug the reviewer found. - /api/benchmarks/:id/summary returned "Failed to read project" on 500. Bring it in line with the rest of the API: "Failed to read benchmark". 
- resolveDashboardMode took a projectCount parameter and one of its internal comments still said "project-scoped routes"; Sidebar.tsx had a "Project-scoped sidebars" section header. Pure TS drift from the rename sweep. - addExcludedPath now early-returns when the path is already pinned in benchmarks[]. The exclusion filter only applies to the discovered set, so recording an exclusion for a pinned path is meaningless state; the guard keeps the YAML invariant crisp and mirrors the auto-unexclude that addBenchmark already does. New unit test covers the invariant. Skipped nits (per YAGNI): defensive literal-path guard inside DELETE /api/benchmarks/:id and the pre-existing benchmark_name-from-basename quirk in single-benchmark mode. Route ordering already works and the inline comment documents the constraint; the basename issue was there before this PR. 2261 tests pass; build, typecheck, lint clean. Co-Authored-By: Claude Opus 4.7 (1M context) --- apps/cli/src/commands/results/serve.ts | 8 ++++---- apps/studio/src/components/RunEvalModal.tsx | 2 +- apps/studio/src/components/Sidebar.tsx | 2 +- packages/core/src/benchmarks.ts | 8 +++++++- packages/core/test/benchmarks.test.ts | 11 +++++++++++ 5 files changed, 24 insertions(+), 7 deletions(-) diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index ab79cf4e..fc50a24f 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -141,7 +141,7 @@ export function loadResults(content: string): EvaluationResult[] { } export function resolveDashboardMode( - projectCount: number, + benchmarkCount: number, options: { multi?: boolean; single?: boolean }, ): { isMultiBenchmark: boolean; showMultiWarning: boolean } { if (options.single === true) { @@ -152,7 +152,7 @@ export function resolveDashboardMode( return { isMultiBenchmark: true, showMultiWarning: true }; } - return { isMultiBenchmark: projectCount > 1, showMultiWarning: false }; + return { 
isMultiBenchmark: benchmarkCount > 1, showMultiWarning: false }; } // ── Feedback persistence ───────────────────────────────────────────────── @@ -270,7 +270,7 @@ function stripHeavyFields(results: readonly EvaluationResult[]) { // ── Shared data-route handlers ─────────────────────────────────────────── // // Each handler takes a Hono Context and a DataContext (resolved directories). -// Both unscoped and project-scoped routes call the same handler, differing +// Both unscoped and benchmark-scoped routes call the same handler, differing // only in how the DataContext is constructed. interface DataContext { @@ -1032,7 +1032,7 @@ export function createApp( last_run: lastRun, }); } catch { - return c.json({ error: 'Failed to read project' }, 500); + return c.json({ error: 'Failed to read benchmark' }, 500); } }); diff --git a/apps/studio/src/components/RunEvalModal.tsx b/apps/studio/src/components/RunEvalModal.tsx index db7bd0d1..9c281a9d 100644 --- a/apps/studio/src/components/RunEvalModal.tsx +++ b/apps/studio/src/components/RunEvalModal.tsx @@ -89,7 +89,7 @@ export function RunEvalModal({ open, onClose, benchmarkId, prefill }: RunEvalMod useEffect(() => { if (runStatus?.status === 'finished' || runStatus?.status === 'failed') { queryClient.invalidateQueries({ queryKey: ['runs'] }); - queryClient.invalidateQueries({ queryKey: ['projects'] }); + queryClient.invalidateQueries({ queryKey: ['benchmarks'] }); } }, [runStatus?.status, queryClient]); diff --git a/apps/studio/src/components/Sidebar.tsx b/apps/studio/src/components/Sidebar.tsx index 0d420311..5933d791 100644 --- a/apps/studio/src/components/Sidebar.tsx +++ b/apps/studio/src/components/Sidebar.tsx @@ -408,7 +408,7 @@ function CategorySidebar({ runId, category }: { runId: string; category: string ); } -// ── Project-scoped sidebars ────────────────────────────────────────────── +// ── Benchmark-scoped sidebars ──────────────────────────────────────────── function BenchmarkRunDetailSidebar({ benchmarkId, diff 
--git a/packages/core/src/benchmarks.ts b/packages/core/src/benchmarks.ts index 5f948be3..58801e40 100644 --- a/packages/core/src/benchmarks.ts +++ b/packages/core/src/benchmarks.ts @@ -364,11 +364,17 @@ export function getExcludedPaths(): string[] { * Append a path to the exclusion list (idempotent). Used when the user * clicks "Remove" on a discovered entry — the .agentv/ dir stays on disk, * but it's suppressed from the active set until the user unexcludes it. - * Returns the resolved absolute path. + * Returns the resolved absolute path. No-op when the path is already + * pinned in `benchmarks[]`: exclusions only filter the discovered set, so + * tracking an excluded pin is meaningless state; `removeBenchmark` is the + * right tool for dropping a pin. */ export function addExcludedPath(excludePath: string): string { const abs = path.resolve(excludePath); const registry = loadBenchmarkRegistry(); + if (registry.benchmarks.some((b) => b.path === abs)) { + return abs; + } const excluded = registry.excludedPaths ?? []; if (!excluded.includes(abs)) { excluded.push(abs); diff --git a/packages/core/test/benchmarks.test.ts b/packages/core/test/benchmarks.test.ts index 9d9bd163..a3276902 100644 --- a/packages/core/test/benchmarks.test.ts +++ b/packages/core/test/benchmarks.test.ts @@ -133,6 +133,17 @@ describe('benchmarks registry + runtime discovery', () => { expect(resolveActiveBenchmarks().map((b) => b.path)).toEqual([repoPath]); }); + it('treats addExcludedPath on a pinned repo as a no-op', () => { + const repoPath = makeRepo('already-pinned'); + addBenchmark(repoPath); + + // Returns the resolved path but does not persist an exclusion. + expect(addExcludedPath(repoPath)).toBe(path.resolve(repoPath)); + expect(getExcludedPaths()).toEqual([]); + // Pinned benchmark still shows up, unchanged. 
+ expect(resolveActiveBenchmarks().map((b) => b.path)).toEqual([repoPath]); + }); + it('auto-unexcludes a path when it is manually pinned', () => { addDiscoveryRoot(reposRoot); const repoPath = makeRepo('pin-me'); From c4c4d58e9880427acad705d2c131b048110e949b Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Mon, 20 Apr 2026 14:52:29 +0200 Subject: [PATCH 11/12] refactor!: replace filesystem discovery with config-only benchmark registry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Studio runtime-update acceptance criteria for #1144 are satisfied by runtime reload of benchmarks.yaml (option 2 of the issue's "Proposed direction"), which is what loadBenchmarkRegistry() has always done on every request. The filesystem-scanning path added in earlier commits (discovery_roots, excluded_paths, source=manual|discovered, the active-vs-persisted split, per-request depth-2 readdirSync) was significant code surface for a single niche workflow — dropping a .agentv/ directory into a watched folder and having it appear without an explicit API call or file edit. Trade it for the simpler declarative model: benchmarks.yaml is the single source of truth; edits to it (direct, via POST /api/benchmarks, via --add/--remove, or via a Kubernetes ConfigMap mount) propagate within the UI's 10 s poll interval. Deployments that want declarative config get the clean path; deployments that want ad-hoc repo drops can script POST /api/benchmarks. BREAKING (safe — multi-benchmark Studio shipped last week and nothing adopted yet): - Remove BenchmarkSource, BenchmarkEntry.source, BenchmarkRegistry.discoveryRoots, BenchmarkRegistry.excludedPaths. - Remove core helpers: addDiscoveryRoot, removeDiscoveryRoot, getDiscoveryRoots, addExcludedPath, removeExcludedPath, getExcludedPaths, resolveActiveBenchmarks, getActiveBenchmark.
- Remove HTTP endpoints: GET/POST/DELETE /api/benchmarks/discovery-roots, GET/DELETE /api/benchmarks/exclusions, POST /api/benchmarks/rescan. - Remove --discovery-root CLI flag (and the multioption/array cmd-ts imports it needed). - Remove wire-format source field from /api/benchmarks responses. - Remove Watch form + addDiscoveryRootApi from the Studio frontend. - Simplify DELETE /api/benchmarks/:benchmarkId back to a straight remove. - Docs: studio.mdx drops --discovery-root from the options table and replaces the Runtime Discovery section with "Runtime behavior: no restart needed" covering the 10 s poll model and ConfigMap flow. - Tests: rewrite benchmarks.test.ts to cover the core CRUD surface (start-empty, add/remove, idempotency, touch, snake_case on-disk round-trip) and drop the discovery/exclusion/precedence cases. Net -472 lines. All 2259 tests pass; build, typecheck, lint clean. UAT confirmed: POST add, external YAML edit, DELETE by id all reflect live; removed endpoints return 404; removed flag rejected with "Unknown arguments". 
Co-Authored-By: Claude Opus 4.7 (1M context) --- apps/cli/src/commands/results/serve.ts | 190 ++------------ apps/studio/src/lib/api.ts | 19 -- apps/studio/src/routes/index.tsx | 30 --- .../src/content/docs/docs/tools/studio.mdx | 22 +- packages/core/src/benchmarks.ts | 238 ++---------------- packages/core/src/index.ts | 9 - packages/core/test/benchmarks.test.ts | 154 +++--------- 7 files changed, 95 insertions(+), 567 deletions(-) diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index fc50a24f..580d4e1f 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -11,10 +11,9 @@ * - GET /api/runs/:filename — load results from a specific run workspace * - GET /api/feedback — read feedback reviews * - POST /api/feedback — write feedback reviews - * - GET /api/benchmarks — list active benchmarks (persisted + live-discovered) - * - POST /api/benchmarks/rescan — force a discovery-root rescan - * - GET/POST/DELETE /api/benchmarks/discovery-roots — manage runtime discovery roots - * - GET/DELETE /api/benchmarks/exclusions — list / un-hide paths hidden via "Remove" + * - GET /api/benchmarks — list registered benchmarks + * - POST /api/benchmarks — register a benchmark by path + * - DELETE /api/benchmarks/:benchmarkId — unregister a benchmark * - GET /api/benchmarks/:benchmarkId/runs — benchmark-scoped run list * * All data routes (runs, suites, categories, evals, experiments, targets) @@ -35,32 +34,16 @@ import { existsSync, readFileSync, readdirSync, statSync, writeFileSync } from 'node:fs'; import path from 'node:path'; import { fileURLToPath } from 'node:url'; -import { - array, - command, - flag, - multioption, - number, - option, - optional, - positional, - string, -} from 'cmd-ts'; +import { command, flag, number, option, optional, positional, string } from 'cmd-ts'; import { DEFAULT_CATEGORY, type EvaluationResult, addBenchmark, - addDiscoveryRoot, - addExcludedPath, - 
getActiveBenchmark, - getDiscoveryRoots, - getExcludedPaths, + getBenchmark, + loadBenchmarkRegistry, loadConfig, removeBenchmark, - removeDiscoveryRoot, - removeExcludedPath, - resolveActiveBenchmarks, } from '@agentv/core'; import type { Context } from 'hono'; import { Hono } from 'hono'; @@ -914,14 +897,14 @@ export function createApp( const app = new Hono(); // ── Benchmark resolution wrapper ────────────────────────────────────── - // Resolves benchmarkId → DataContext, returning 404 if not found. - // Looks up against the *active* set (persisted + live-discovered) so repos - // under a configured discovery root resolve without a server restart. + // Resolves benchmarkId → DataContext, returning 404 if not found. The + // registry is re-read on every request, so edits to benchmarks.yaml (or + // POST /api/benchmarks) take effect without restarting the server. function withBenchmark( c: C, handler: (c: C, ctx: DataContext) => Response | Promise, ): Response | Promise { - const benchmark = getActiveBenchmark(c.req.param('benchmarkId') ?? ''); + const benchmark = getBenchmark(c.req.param('benchmarkId') ?? ''); if (!benchmark || !existsSync(benchmark.path)) { return c.json({ error: 'Benchmark not found' }, 404); } @@ -960,7 +943,6 @@ export function createApp( path: string; addedAt: string; lastOpenedAt: string; - source?: 'manual' | 'discovered'; }) { return { id: entry.id, @@ -968,14 +950,13 @@ export function createApp( path: entry.path, added_at: entry.addedAt, last_opened_at: entry.lastOpenedAt, - source: entry.source ?? 
'manual', }; } app.get('/api/benchmarks', async (c) => { - const active = resolveActiveBenchmarks(); + const registry = loadBenchmarkRegistry(); const benchmarks = await Promise.all( - active.map(async (p) => { + registry.benchmarks.map(async (p) => { let runCount = 0; let passRate = 0; let lastRun: string | null = null; @@ -1016,7 +997,7 @@ export function createApp( }); app.get('/api/benchmarks/:benchmarkId/summary', async (c) => { - const benchmark = getActiveBenchmark(c.req.param('benchmarkId') ?? ''); + const benchmark = getBenchmark(c.req.param('benchmarkId') ?? ''); if (!benchmark) return c.json({ error: 'Benchmark not found' }, 404); try { const { runs: metas } = await listMergedResultFiles(benchmark.path); @@ -1036,9 +1017,9 @@ export function createApp( } }); - /** Aggregate runs from all active benchmarks, sorted by timestamp descending. */ + /** Aggregate runs from all registered benchmarks, sorted by timestamp descending. */ app.get('/api/benchmarks/all-runs', async (c) => { - const active = resolveActiveBenchmarks(); + const registry = loadBenchmarkRegistry(); const allRuns: Array<{ filename: string; display_name: string; @@ -1055,7 +1036,7 @@ export function createApp( benchmark_name: string; }> = []; - for (const p of active) { + for (const p of registry.benchmarks) { try { const { runs: metas } = await listMergedResultFiles(p.path); for (const m of metas) { @@ -1095,119 +1076,15 @@ export function createApp( return c.json({ runs: allRuns }); }); - // ── Discovery roots (runtime benchmark auto-discovery) ─────────────── - // Roots are persisted in ~/.agentv/benchmarks.yaml. On each GET - // /api/benchmarks, Studio rescans them and surfaces new `.agentv/` repos — - // no server restart required. 
- - app.get('/api/benchmarks/discovery-roots', (c) => { - return c.json({ roots: getDiscoveryRoots() }); - }); - - app.post('/api/benchmarks/discovery-roots', async (c) => { - if (readOnly) { - return c.json({ error: 'Studio is running in read-only mode' }, 403); - } - try { - const body = await c.req.json<{ path: string }>(); - if (!body.path) return c.json({ error: 'Missing path' }, 400); - const root = addDiscoveryRoot(body.path); - return c.json({ root }, 201); - } catch (err) { - return c.json({ error: (err as Error).message }, 400); - } - }); - - app.delete('/api/benchmarks/discovery-roots', async (c) => { - if (readOnly) { - return c.json({ error: 'Studio is running in read-only mode' }, 403); - } - try { - const body = await c.req.json<{ path: string }>(); - if (!body.path) return c.json({ error: 'Missing path' }, 400); - const removed = removeDiscoveryRoot(body.path); - if (!removed) return c.json({ error: 'Root not found' }, 404); - return c.json({ ok: true }); - } catch (err) { - return c.json({ error: (err as Error).message }, 400); - } - }); - - // ── Exclusions (hide a discovered repo from the UI) ───────────────── - // DELETE /api/benchmarks/:id on a discovered entry adds its path here; - // these endpoints let users list or un-hide those paths. 
- - app.get('/api/benchmarks/exclusions', (c) => { - return c.json({ excluded_paths: getExcludedPaths() }); - }); - - app.delete('/api/benchmarks/exclusions', async (c) => { - if (readOnly) { - return c.json({ error: 'Studio is running in read-only mode' }, 403); - } - try { - const body = await c.req.json<{ path: string }>(); - if (!body.path) return c.json({ error: 'Missing path' }, 400); - const removed = removeExcludedPath(body.path); - if (!removed) return c.json({ error: 'Path not in exclusions' }, 404); - return c.json({ ok: true }); - } catch (err) { - return c.json({ error: (err as Error).message }, 400); - } - }); - - // Registered after all `/api/benchmarks/` sub-paths so Hono doesn't - // route e.g. `DELETE /api/benchmarks/exclusions` into this handler with - // benchmarkId="exclusions". app.delete('/api/benchmarks/:benchmarkId', (c) => { if (readOnly) { return c.json({ error: 'Studio is running in read-only mode' }, 403); } - const benchmarkId = c.req.param('benchmarkId') ?? ''; - const active = getActiveBenchmark(benchmarkId); - if (!active) return c.json({ error: 'Benchmark not found' }, 404); - // For a discovered entry, "remove" means hide it from the UI. The - // .agentv/ dir stays on disk; the path goes onto the exclusion list - // and is filtered out of resolveActiveBenchmarks on the next rescan. - if (active.source === 'discovered') { - addExcludedPath(active.path); - return c.json({ ok: true, excluded: active.path }); - } - const removed = removeBenchmark(benchmarkId); + const removed = removeBenchmark(c.req.param('benchmarkId') ?? ''); if (!removed) return c.json({ error: 'Benchmark not found' }, 404); return c.json({ ok: true }); }); - /** Explicit rescan hook — useful when the UI wants a refresh without the poll tick. 
*/ - app.post('/api/benchmarks/rescan', async (c) => { - const active = resolveActiveBenchmarks(); - const benchmarks = await Promise.all( - active.map(async (p) => { - let runCount = 0; - let passRate = 0; - let lastRun: string | null = null; - try { - const { runs: metas } = await listMergedResultFiles(p.path); - runCount = metas.length; - if (metas.length > 0) { - const totalPassRate = metas.reduce((sum, m) => sum + m.passRate, 0); - passRate = totalPassRate / metas.length; - lastRun = metas[0].timestamp; - } - } catch { - // inaccessible - } - return { - ...benchmarkEntryToWire(p), - run_count: runCount, - pass_rate: passRate, - last_run: lastRun, - }; - }), - ); - return c.json({ benchmarks }); - }); - // ── Data routes (unscoped) ──────────────────────────────────────────── app.get('/api/config', (c) => @@ -1387,7 +1264,7 @@ export function createApp( // For benchmark-scoped routes, resolve to benchmark path; otherwise use searchDir const benchmarkId = c.req.param('benchmarkId'); if (benchmarkId) { - const benchmark = getActiveBenchmark(benchmarkId); + const benchmark = getBenchmark(benchmarkId); if (benchmark) return benchmark.path; } return searchDir; @@ -1516,18 +1393,12 @@ export const resultsServeCommand = command({ long: 'remove', description: 'Unregister a benchmark by ID', }), - discoveryRoot: multioption({ - type: array(string), - long: 'discovery-root', - description: - 'Persist a directory that Studio continuously rescans for .agentv/ repos. Repeatable.', - }), readOnly: flag({ long: 'read-only', description: 'Disable write operations and launch Studio in read-only leaderboard mode', }), }, - handler: async ({ source, port, dir, multi, single, add, remove, discoveryRoot, readOnly }) => { + handler: async ({ source, port, dir, multi, single, add, remove, readOnly }) => { const cwd = dir ?? process.cwd(); const listenPort = port ?? (process.env.PORT ? 
Number(process.env.PORT) : 3117); @@ -1554,15 +1425,6 @@ export const resultsServeCommand = command({ return; } - // Persist --discovery-root paths before starting the server. The server - // keeps running after this so Studio continuously rescans the roots. - if (discoveryRoot.length > 0) { - for (const root of discoveryRoot) { - const abs = addDiscoveryRoot(root); - console.log(`Watching discovery root: ${abs}`); - } - } - // ── Version check ──────────────────────────────────────────────── // Enforce `required_version` from .agentv/config.yaml so Studio/serve // match `agentv eval` behavior. Same prompt in TTY, warn+continue @@ -1576,13 +1438,11 @@ export const resultsServeCommand = command({ } // ── Determine multi-benchmark mode ─────────────────────────────── - // Count active (persisted + live-discovered) benchmarks so that the - // dashboard mode reflects what the user will actually see in the UI. - const activeBenchmarks = resolveActiveBenchmarks(); - const { isMultiBenchmark, showMultiWarning } = resolveDashboardMode(activeBenchmarks.length, { - multi, - single, - }); + const registry = loadBenchmarkRegistry(); + const { isMultiBenchmark, showMultiWarning } = resolveDashboardMode( + registry.benchmarks.length, + { multi, single }, + ); try { let results: EvaluationResult[] = []; @@ -1624,7 +1484,7 @@ export const resultsServeCommand = command({ } if (isMultiBenchmark) { - console.log(`Multi-benchmark mode: ${activeBenchmarks.length} benchmark(s) active`); + console.log(`Multi-benchmark mode: ${registry.benchmarks.length} benchmark(s) registered`); } else if (results.length > 0 && sourceFile) { console.log(`Serving ${results.length} result(s) from ${sourceFile}`); } else { diff --git a/apps/studio/src/lib/api.ts b/apps/studio/src/lib/api.ts index 4373bfd9..ea48bfd6 100644 --- a/apps/studio/src/lib/api.ts +++ b/apps/studio/src/lib/api.ts @@ -281,25 +281,6 @@ export async function removeBenchmarkApi(benchmarkId: string): Promise { } } -/** - * Persist a 
directory as a discovery root. Studio rescans every configured - * root on each `/api/benchmarks` read so benchmarks under it appear/disappear - * live without a server restart. - */ -export async function addDiscoveryRootApi(dirPath: string): Promise { - const res = await fetch('/api/benchmarks/discovery-roots', { - method: 'POST', - headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ path: dirPath }), - }); - if (!res.ok) { - const err = (await res.json()) as { error: string }; - throw new Error(err.error || `Failed to add discovery root: ${res.status}`); - } - const data = (await res.json()) as { root: string }; - return data.root; -} - /** Build the API base URL for a benchmark-scoped request. */ function benchmarkApiBase(benchmarkId: string): string { return `/api/benchmarks/${encodeURIComponent(benchmarkId)}`; diff --git a/apps/studio/src/routes/index.tsx b/apps/studio/src/routes/index.tsx index b5a771e1..391ea875 100644 --- a/apps/studio/src/routes/index.tsx +++ b/apps/studio/src/routes/index.tsx @@ -19,7 +19,6 @@ import { type RunSourceFilter, RunSourceToolbar } from '~/components/RunSourceTo import { TargetsTab } from '~/components/TargetsTab'; import { addBenchmarkApi, - addDiscoveryRootApi, syncRemoteResultsApi, useBenchmarkList, useCompare, @@ -69,7 +68,6 @@ function BenchmarksDashboard() { const { data: config } = useStudioConfig(); const queryClient = useQueryClient(); const [addPath, setAddPath] = useState(''); - const [rootPath, setRootPath] = useState(''); const [error, setError] = useState(null); const [showAddForm, setShowAddForm] = useState(false); const [showRunEval, setShowRunEval] = useState(false); @@ -91,19 +89,6 @@ function BenchmarksDashboard() { } } - async function handleAddDiscoveryRoot(e: React.FormEvent) { - e.preventDefault(); - if (!rootPath.trim()) return; - setError(null); - try { - await addDiscoveryRootApi(rootPath.trim()); - setRootPath(''); - queryClient.invalidateQueries({ queryKey: ['benchmarks'] }); - 
} catch (err) { - setError((err as Error).message); - } - } - return (
@@ -153,21 +138,6 @@ function BenchmarksDashboard() { Add -
- setRootPath(e.target.value)} - placeholder="Watch a directory for .agentv/ repos..." - className="flex-1 rounded-md border border-gray-700 bg-gray-800 px-3 py-1.5 text-sm text-white placeholder-gray-500 focus:border-cyan-600 focus:outline-none" - /> - -
)} diff --git a/apps/web/src/content/docs/docs/tools/studio.mdx b/apps/web/src/content/docs/docs/tools/studio.mdx index 3d485366..33b96455 100644 --- a/apps/web/src/content/docs/docs/tools/studio.mdx +++ b/apps/web/src/content/docs/docs/tools/studio.mdx @@ -49,7 +49,6 @@ agentv studio .agentv/results/runs/2026-03-30T11-45-56-989Z | `--single` | Force single-benchmark dashboard mode | | `--add ` | Register a benchmark by path | | `--remove ` | Unregister a benchmark by ID | -| `--discovery-root ` | Watch a directory: benchmarks under it appear/disappear live without restart. Repeatable. | ## Features @@ -154,24 +153,23 @@ agentv studio --add /path/to/other-evals Each path must contain a `.agentv/` directory. Registered benchmarks are stored in `~/.agentv/benchmarks.yaml`. -### Runtime Discovery +### Runtime behavior: no restart needed -For a 24/7 Studio deployment, tell it which directory to watch: +`benchmarks.yaml` is the single source of truth. Studio re-reads it on every `/api/benchmarks` request (which the UI polls every ~10 s), so any of these changes appear live without restarting `agentv serve`: -```bash -agentv studio --discovery-root /path/to/repos -``` - -Studio rescans every configured root on each `/api/benchmarks` read (every ~10 s via the UI poll), so any `.agentv/` repo appearing or disappearing under a root shows up without restarting the server. The flag is repeatable and persisted to `~/.agentv/benchmarks.yaml` under `discovery_roots`, so you only need to pass it once. +- Adding via the UI's **Add Benchmark** form or `POST /api/benchmarks`. +- Removing via the UI's **Remove** button or `DELETE /api/benchmarks/:id`. +- Editing `~/.agentv/benchmarks.yaml` directly. +- Mounting the file via a Kubernetes ConfigMap — GitOps the ConfigMap and Studio reflects it within the next poll. -Click **Remove** on a discovered benchmark to hide it from the UI — its path goes into the `excluded_paths` list. 
Pinning it via `--add` or the UI's Add form un-hides it (manual always wins). +This satisfies the 24/7-Studio use case: the server stays up; benchmarks come and go through config edits or API calls. ### Launching the Dashboard -Studio auto-detects the mode based on how many benchmarks are active: +Studio auto-detects the mode based on how many benchmarks are registered: -- `0` or `1` active: single-benchmark view -- `2+` active: Benchmarks dashboard +- `0` or `1` registered: single-benchmark view +- `2+` registered: Benchmarks dashboard ```bash agentv studio # auto-detects diff --git a/packages/core/src/benchmarks.ts b/packages/core/src/benchmarks.ts index 58801e40..39fa6b3c 100644 --- a/packages/core/src/benchmarks.ts +++ b/packages/core/src/benchmarks.ts @@ -2,9 +2,11 @@ * Benchmark registry for AgentV Studio multi-benchmark support. * * A Benchmark = any directory containing a `.agentv/` folder. - * The registry lives at `~/.agentv/benchmarks.yaml` and tracks registered - * benchmarks plus an optional list of discovery roots that Studio continuously - * rescans at runtime so repos can appear/disappear without a server restart. + * The registry lives at `~/.agentv/benchmarks.yaml` and is the single source of + * truth for which benchmarks Studio shows. Studio re-reads the file on every + * `/api/benchmarks` request, so edits (direct, via POST /api/benchmarks, via + * the CLI's --add/--remove, or via a Kubernetes ConfigMap mount) are reflected + * without restarting `agentv serve`. * * YAML format (all keys snake_case per AGENTS.md §"Wire Format Convention"): * benchmarks: @@ -13,31 +15,18 @@ * path: /home/user/projects/my-app * added_at: "2026-03-20T10:00:00Z" * last_opened_at: "2026-03-30T14:00:00Z" - * discovery_roots: - * - /home/user/agentv-repos - * excluded_paths: # discovered repos to hide from Studio - * - /home/user/agentv-repos/experiment-v0 - * - * Runtime model: - * - Entries in `benchmarks` are persisted (manual add/remove). 
- * - Entries under `discoveryRoots` are resolved live on each call to - * `resolveActiveBenchmarks()` — they are NOT written to disk. This means - * a repo appearing or disappearing under a root is reflected immediately, - * and manual entries are never auto-removed. * * Concurrency: the registry assumes a single writer. All mutating calls - * (add/remove/touchBenchmark, add/removeDiscoveryRoot) do read-modify-write on - * benchmarks.yaml without a lock. Interleaved writes from multiple processes - * can clobber each other; Studio's HTTP handlers are serialized by Node's - * single-threaded event loop, which satisfies the 24/7 Studio case. Run only - * one `agentv` process against a given home at a time. + * (add/remove/touchBenchmark) do read-modify-write on benchmarks.yaml + * without a lock. Studio's HTTP handlers are serialized by Node's + * single-threaded event loop, which satisfies the 24/7 deployment case. + * Run only one `agentv` process against a given home at a time. * * To extend: - * - For CRUD on persisted entries: loadBenchmarkRegistry() / saveBenchmarkRegistry(). - * - For live discovery: addDiscoveryRoot() / removeDiscoveryRoot() / - * resolveActiveBenchmarks(). - * - discoverBenchmarks() scans a single directory tree for `.agentv/` folders; - * its output is sorted for deterministic id assignment under basename collisions. + * - CRUD: loadBenchmarkRegistry() / saveBenchmarkRegistry() + the + * add/remove/touch helpers. + * - discoverBenchmarks() is a one-shot filesystem utility for bulk + * registration; it does not run in the request path. 
*/ import { existsSync, mkdirSync, readFileSync, readdirSync, statSync, writeFileSync } from 'node:fs'; @@ -49,28 +38,16 @@ import { getAgentvConfigDir } from './paths.js'; // ── Types ─────────────────────────────────────────────────────────────── -export type BenchmarkSource = 'manual' | 'discovered'; - export interface BenchmarkEntry { id: string; name: string; path: string; addedAt: string; lastOpenedAt: string; - /** How this entry was registered. Absent (undefined) ≡ 'manual'. */ - source?: BenchmarkSource; } export interface BenchmarkRegistry { benchmarks: BenchmarkEntry[]; - /** Directories continuously rescanned for `.agentv/` repos. Optional. */ - discoveryRoots?: string[]; - /** - * Absolute paths to exclude from the discovered set. Clicking "Remove" on a - * discovered entry in Studio adds its path here so the repo stays on disk - * but disappears from the UI. Has no effect on manually-pinned entries. - */ - excludedPaths?: string[]; } // ── Registry path ─────────────────────────────────────────────────────── @@ -90,7 +67,6 @@ interface BenchmarkEntryYaml { path: string; added_at: string; last_opened_at: string; - source?: BenchmarkSource; } function fromYaml(raw: unknown): BenchmarkEntry | null { @@ -105,7 +81,6 @@ function fromYaml(raw: unknown): BenchmarkEntry | null { path: e.path, addedAt: typeof e.added_at === 'string' ? e.added_at : '', lastOpenedAt: typeof e.last_opened_at === 'string' ? e.last_opened_at : '', - ...(e.source && { source: e.source }), }; } @@ -116,7 +91,6 @@ function toYaml(entry: BenchmarkEntry): BenchmarkEntryYaml { path: entry.path, added_at: entry.addedAt, last_opened_at: entry.lastOpenedAt, - ...(entry.source && { source: entry.source }), }; } @@ -136,16 +110,7 @@ export function loadBenchmarkRegistry(): BenchmarkRegistry { .map(fromYaml) .filter((e): e is BenchmarkEntry => e !== null) : []; - const discoveryRoots = Array.isArray(parsed.discovery_roots) - ? 
(parsed.discovery_roots as unknown[]).filter((v): v is string => typeof v === 'string') - : undefined; - const excludedPaths = Array.isArray(parsed.excluded_paths) - ? (parsed.excluded_paths as unknown[]).filter((v): v is string => typeof v === 'string') - : undefined; - const result: BenchmarkRegistry = { benchmarks }; - if (discoveryRoots !== undefined) result.discoveryRoots = discoveryRoots; - if (excludedPaths !== undefined) result.excludedPaths = excludedPaths; - return result; + return { benchmarks }; } catch { return { benchmarks: [] }; } @@ -157,17 +122,7 @@ export function saveBenchmarkRegistry(registry: BenchmarkRegistry): void { if (!existsSync(dir)) { mkdirSync(dir, { recursive: true }); } - // Omit empty/undefined optional lists from the serialized form so registries - // without the feature don't grow stray keys. - const payload: Record = { - benchmarks: registry.benchmarks.map(toYaml), - }; - if (registry.discoveryRoots && registry.discoveryRoots.length > 0) { - payload.discovery_roots = registry.discoveryRoots; - } - if (registry.excludedPaths && registry.excludedPaths.length > 0) { - payload.excluded_paths = registry.excludedPaths; - } + const payload = { benchmarks: registry.benchmarks.map(toYaml) }; writeFileSync(registryPath, stringifyYaml(payload), 'utf-8'); } @@ -208,14 +163,8 @@ export function addBenchmark(benchmarkPath: string): BenchmarkEntry { } const registry = loadBenchmarkRegistry(); - // Pinning overrides a prior exclusion: if the user explicitly adds a path - // they had previously hidden from discovery, they clearly want to see it. 
- if (registry.excludedPaths?.includes(absPath)) { - registry.excludedPaths = registry.excludedPaths.filter((p) => p !== absPath); - } const existing = registry.benchmarks.find((p) => p.path === absPath); if (existing) { - saveBenchmarkRegistry(registry); return existing; } @@ -266,11 +215,13 @@ export function touchBenchmark(benchmarkId: string): void { } } -// ── Discovery ─────────────────────────────────────────────────────────── +// ── Discovery utility ─────────────────────────────────────────────────── /** * Scan a directory tree (up to maxDepth levels) for directories containing `.agentv/`. - * Returns absolute paths of discovered benchmark directories. + * Returns absolute paths of discovered benchmark directories, sorted for + * deterministic iteration. This is a one-shot helper for bulk registration; + * Studio does not scan at request time. */ export function discoverBenchmarks(rootDir: string, maxDepth = 2): string[] { const absRoot = path.resolve(rootDir); @@ -304,156 +255,5 @@ export function discoverBenchmarks(rootDir: string, maxDepth = 2): string[] { } scan(absRoot, 0); - // Sort for deterministic iteration — readdirSync order is filesystem-dependent, - // and basename collisions produce collision-suffix ids that must be stable. return results.sort(); } - -// ── Discovery roots (persisted) ───────────────────────────────────────── - -/** - * Return the persisted discovery roots as absolute paths. Never returns undefined. - */ -export function getDiscoveryRoots(): string[] { - const registry = loadBenchmarkRegistry(); - return [...(registry.discoveryRoots ?? [])]; -} - -/** - * Add an absolute discovery root to the persisted registry (idempotent). - * Returns the resolved absolute path. Does NOT validate that the directory - * currently exists — a root may become populated after Studio starts. 
- */ -export function addDiscoveryRoot(rootPath: string): string { - const absRoot = path.resolve(rootPath); - const registry = loadBenchmarkRegistry(); - const roots = registry.discoveryRoots ?? []; - if (!roots.includes(absRoot)) { - roots.push(absRoot); - } - registry.discoveryRoots = roots; - saveBenchmarkRegistry(registry); - return absRoot; -} - -/** - * Remove a discovery root. Returns true if it was present, false otherwise. - */ -export function removeDiscoveryRoot(rootPath: string): boolean { - const absRoot = path.resolve(rootPath); - const registry = loadBenchmarkRegistry(); - const roots = registry.discoveryRoots ?? []; - const idx = roots.indexOf(absRoot); - if (idx < 0) return false; - roots.splice(idx, 1); - registry.discoveryRoots = roots; - saveBenchmarkRegistry(registry); - return true; -} - -// ── Exclusions (hide a discovered repo without deleting its .agentv/) ── - -/** - * Return the persisted exclusion list as absolute paths. - */ -export function getExcludedPaths(): string[] { - return [...(loadBenchmarkRegistry().excludedPaths ?? [])]; -} - -/** - * Append a path to the exclusion list (idempotent). Used when the user - * clicks "Remove" on a discovered entry — the .agentv/ dir stays on disk, - * but it's suppressed from the active set until the user unexcludes it. - * Returns the resolved absolute path. No-op when the path is already - * pinned in `benchmarks[]`: exclusions only filter the discovered set, so - * tracking an excluded pin is meaningless state; `removeBenchmark` is the - * right tool for dropping a pin. - */ -export function addExcludedPath(excludePath: string): string { - const abs = path.resolve(excludePath); - const registry = loadBenchmarkRegistry(); - if (registry.benchmarks.some((b) => b.path === abs)) { - return abs; - } - const excluded = registry.excludedPaths ?? 
[]; - if (!excluded.includes(abs)) { - excluded.push(abs); - } - registry.excludedPaths = excluded; - saveBenchmarkRegistry(registry); - return abs; -} - -/** - * Remove a path from the exclusion list. Returns true if it was present. - * The repo will reappear on the next discovery rescan if still under a root. - */ -export function removeExcludedPath(excludePath: string): boolean { - const abs = path.resolve(excludePath); - const registry = loadBenchmarkRegistry(); - const excluded = registry.excludedPaths ?? []; - const idx = excluded.indexOf(abs); - if (idx < 0) return false; - excluded.splice(idx, 1); - registry.excludedPaths = excluded; - saveBenchmarkRegistry(registry); - return true; -} - -// ── Active benchmarks (persisted + live-discovered) ───────────────────── - -/** - * Return the effective benchmark list: persisted entries merged with a live - * scan of every discovery root. Discovered entries are synthesized on the fly - * (tagged `source: 'discovered'`) and are NOT written to disk, so a repo - * disappearing from a root drops out of subsequent calls. Persisted entries - * win on absolute-path conflict, letting a user opt a discovered repo into - * manual management. Paths in `excludedPaths` are filtered out of the - * discovered set (but never from pinned entries). - */ -export function resolveActiveBenchmarks(): BenchmarkEntry[] { - const registry = loadBenchmarkRegistry(); - const persisted = registry.benchmarks.map((b) => ({ - ...b, - source: b.source ?? ('manual' as const), - })); - const roots = registry.discoveryRoots ?? []; - if (roots.length === 0) return persisted; - - const excluded = new Set(registry.excludedPaths ?? 
[]); - const takenPaths = new Set(persisted.map((b) => b.path)); - const takenIds = new Set(persisted.map((b) => b.id)); - const discovered: BenchmarkEntry[] = []; - for (const root of roots) { - for (const repoPath of discoverBenchmarks(root)) { - if (takenPaths.has(repoPath)) continue; - if (excluded.has(repoPath)) continue; - takenPaths.add(repoPath); - const id = deriveBenchmarkId(repoPath, [...takenIds]); - takenIds.add(id); - // Synthetic timestamps: use the .agentv dir mtime if readable, else now. - let ts = new Date().toISOString(); - try { - ts = statSync(path.join(repoPath, '.agentv')).mtime.toISOString(); - } catch { - // Keep the fallback timestamp. - } - discovered.push({ - id, - name: path.basename(repoPath), - path: repoPath, - addedAt: ts, - lastOpenedAt: ts, - source: 'discovered', - }); - } - } - return [...persisted, ...discovered]; -} - -/** - * Look up an active benchmark (persisted or discovered) by id. - */ -export function getActiveBenchmark(benchmarkId: string): BenchmarkEntry | undefined { - return resolveActiveBenchmarks().find((b) => b.id === benchmarkId); -} diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index b1da23c8..3e9a475d 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -88,7 +88,6 @@ export { export { type BenchmarkEntry, type BenchmarkRegistry, - type BenchmarkSource, loadBenchmarkRegistry, saveBenchmarkRegistry, addBenchmark, @@ -98,14 +97,6 @@ export { discoverBenchmarks, deriveBenchmarkId, getBenchmarksRegistryPath, - getDiscoveryRoots, - addDiscoveryRoot, - removeDiscoveryRoot, - getExcludedPaths, - addExcludedPath, - removeExcludedPath, - resolveActiveBenchmarks, - getActiveBenchmark, } from './benchmarks.js'; export { trimBaselineResult } from './evaluation/baseline.js'; export { DEFAULT_CATEGORY, deriveCategory } from './evaluation/category.js'; diff --git a/packages/core/test/benchmarks.test.ts b/packages/core/test/benchmarks.test.ts index a3276902..5c995389 100644 --- 
a/packages/core/test/benchmarks.test.ts +++ b/packages/core/test/benchmarks.test.ts @@ -5,18 +5,14 @@ import path from 'node:path'; import { addBenchmark, - addDiscoveryRoot, - addExcludedPath, + getBenchmark, getBenchmarksRegistryPath, - getDiscoveryRoots, - getExcludedPaths, loadBenchmarkRegistry, - removeDiscoveryRoot, - removeExcludedPath, - resolveActiveBenchmarks, + removeBenchmark, + touchBenchmark, } from '../src/benchmarks.js'; -describe('benchmarks registry + runtime discovery', () => { +describe('benchmarks registry', () => { let fakeHome: string; let reposRoot: string; // biome-ignore lint/suspicious/noExplicitAny: spy typing from bun:test is intentionally loose. @@ -40,51 +36,54 @@ describe('benchmarks registry + runtime discovery', () => { return dir; } - it('persists and lists discovery roots, omitting the key when empty', () => { - expect(getDiscoveryRoots()).toEqual([]); - expect(loadBenchmarkRegistry().discoveryRoots).toBeUndefined(); + it('starts empty and surfaces new entries after addBenchmark', () => { + expect(loadBenchmarkRegistry().benchmarks).toEqual([]); - const added = addDiscoveryRoot(reposRoot); - expect(added).toBe(path.resolve(reposRoot)); - expect(getDiscoveryRoots()).toEqual([path.resolve(reposRoot)]); - - // Serialized keys on disk are snake_case per AGENTS.md wire-format convention, - // even though the in-memory TS fields are camelCase. - const yamlOnDisk = readFileSync(getBenchmarksRegistryPath(), 'utf-8'); - expect(yamlOnDisk).toContain('discovery_roots:'); - expect(yamlOnDisk).not.toContain('discoveryRoots:'); + const repoPath = makeRepo('alpha'); + const entry = addBenchmark(repoPath); + expect(entry.name).toBe('alpha'); + expect(entry.path).toBe(path.resolve(repoPath)); - // Adding the same root again is idempotent. - addDiscoveryRoot(reposRoot); - expect(getDiscoveryRoots()).toEqual([path.resolve(reposRoot)]); + // Subsequent load reflects the write (per-request reload model). 
+ expect(loadBenchmarkRegistry().benchmarks).toHaveLength(1); + expect(getBenchmark(entry.id)?.path).toBe(entry.path); + }); - expect(removeDiscoveryRoot(reposRoot)).toBe(true); - expect(getDiscoveryRoots()).toEqual([]); - expect(loadBenchmarkRegistry().discoveryRoots).toBeUndefined(); + it('addBenchmark refuses a path with no .agentv/ directory', () => { + const bare = mkdtempSync(path.join(os.tmpdir(), 'agentv-bare-')); + expect(() => addBenchmark(bare)).toThrow(/No \.agentv\/ directory found/); + rmSync(bare, { recursive: true, force: true }); }); - it('surfaces repos appearing under a discovery root without restart', () => { - addDiscoveryRoot(reposRoot); + it('addBenchmark is idempotent on the same path', () => { + const repoPath = makeRepo('idempotent'); + const first = addBenchmark(repoPath); + const second = addBenchmark(repoPath); + expect(first.id).toBe(second.id); + expect(loadBenchmarkRegistry().benchmarks).toHaveLength(1); + }); - expect(resolveActiveBenchmarks()).toEqual([]); + it('removeBenchmark drops the entry by id', () => { + const entry = addBenchmark(makeRepo('to-remove')); + expect(removeBenchmark(entry.id)).toBe(true); + expect(loadBenchmarkRegistry().benchmarks).toEqual([]); + expect(removeBenchmark(entry.id)).toBe(false); + }); - makeRepo('r1'); - const afterAdd = resolveActiveBenchmarks(); - expect(afterAdd).toHaveLength(1); - expect(afterAdd[0]).toMatchObject({ - name: 'r1', - path: path.resolve(reposRoot, 'r1'), - source: 'discovered', - }); + it('touchBenchmark updates lastOpenedAt without affecting other entries', () => { + const a = addBenchmark(makeRepo('a')); + const b = addBenchmark(makeRepo('b')); + const originalB = loadBenchmarkRegistry().benchmarks.find((e) => e.id === b.id); - // Simulate removal: rm -rf the repo dir. 
- rmSync(path.join(reposRoot, 'r1'), { recursive: true, force: true }); - expect(resolveActiveBenchmarks()).toEqual([]); + touchBenchmark(a.id); + const reloadedA = loadBenchmarkRegistry().benchmarks.find((e) => e.id === a.id); + const reloadedB = loadBenchmarkRegistry().benchmarks.find((e) => e.id === b.id); + expect(reloadedA?.lastOpenedAt).not.toBe(a.lastOpenedAt); + expect(reloadedB?.lastOpenedAt).toBe(originalB?.lastOpenedAt); }); it('serializes benchmark entries with snake_case keys on disk', () => { - const repoPath = makeRepo('snake'); - const entry = addBenchmark(repoPath); + const entry = addBenchmark(makeRepo('snake')); const yamlOnDisk = readFileSync(getBenchmarksRegistryPath(), 'utf-8'); expect(yamlOnDisk).toContain('added_at:'); @@ -100,75 +99,4 @@ describe('benchmarks registry + runtime discovery', () => { lastOpenedAt: entry.lastOpenedAt, }); }); - - it('keeps manually-added entries even when their path is not under a root', () => { - const outside = makeRepo('manual'); - const entry = addBenchmark(outside); - - const active = resolveActiveBenchmarks(); - expect(active).toHaveLength(1); - expect(active[0].id).toBe(entry.id); - expect(active[0].source).toBe('manual'); - }); - - it('hides a discovered repo once its path is excluded, and shows it again when unexcluded', () => { - addDiscoveryRoot(reposRoot); - const repoPath = makeRepo('junk'); - - expect(resolveActiveBenchmarks().map((b) => b.path)).toEqual([repoPath]); - - const excluded = addExcludedPath(repoPath); - expect(excluded).toBe(path.resolve(repoPath)); - expect(getExcludedPaths()).toEqual([path.resolve(repoPath)]); - expect(resolveActiveBenchmarks()).toEqual([]); - - // Serialized form uses snake_case. - const yamlOnDisk = readFileSync(getBenchmarksRegistryPath(), 'utf-8'); - expect(yamlOnDisk).toContain('excluded_paths:'); - expect(yamlOnDisk).not.toContain('excludedPaths:'); - - // Unexclude → the repo reappears on the next scan. 
- expect(removeExcludedPath(repoPath)).toBe(true); - expect(getExcludedPaths()).toEqual([]); - expect(resolveActiveBenchmarks().map((b) => b.path)).toEqual([repoPath]); - }); - - it('treats addExcludedPath on a pinned repo as a no-op', () => { - const repoPath = makeRepo('already-pinned'); - addBenchmark(repoPath); - - // Returns the resolved path but does not persist an exclusion. - expect(addExcludedPath(repoPath)).toBe(path.resolve(repoPath)); - expect(getExcludedPaths()).toEqual([]); - // Pinned benchmark still shows up, unchanged. - expect(resolveActiveBenchmarks().map((b) => b.path)).toEqual([repoPath]); - }); - - it('auto-unexcludes a path when it is manually pinned', () => { - addDiscoveryRoot(reposRoot); - const repoPath = makeRepo('pin-me'); - addExcludedPath(repoPath); - expect(resolveActiveBenchmarks()).toEqual([]); - - // Pinning wins: addBenchmark should drop the exclusion. - const entry = addBenchmark(repoPath); - expect(getExcludedPaths()).toEqual([]); - const active = resolveActiveBenchmarks(); - expect(active).toHaveLength(1); - expect(active[0].id).toBe(entry.id); - expect(active[0].source).toBe('manual'); - }); - - it('prefers the persisted entry when a discovery root would produce a duplicate path', () => { - const repoPath = makeRepo('shared'); - // Register manually first. - const manual = addBenchmark(repoPath); - // Then configure a discovery root covering the same repo. 
- addDiscoveryRoot(reposRoot); - - const active = resolveActiveBenchmarks(); - expect(active).toHaveLength(1); - expect(active[0].id).toBe(manual.id); - expect(active[0].source).toBe('manual'); - }); }); From b97f20e03d87a4af8d38296956686486670dd1a9 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Mon, 20 Apr 2026 22:40:25 +0200 Subject: [PATCH 12/12] docs(agents): expand YAGNI with checks against overengineering real requests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Retro on #1145: the PR started as a modest runtime-discovery feature, grew to include source/excluded_paths/route-ordering machinery, and was later torn back out in favor of the one-line runtime-reload that the existing registry already provided. YAGNI was in AGENTS.md but only covered "don't build features nobody asked for" — it didn't catch "someone asked for X and I built a bigger X than necessary." Add five habits to §YAGNI that would have caught the miss: 1. Audit existing primitives before adding new ones. 2. Treat issue language as a hint; summarize acceptance criteria in your own words, strip implementation nouns, then check existing primitives before designing. 3. Prefer data/config changes over new mechanisms. 4. Stop and re-plan when scope doubles — don't push through. 5. Stop when you're about to add a second mode, precedence rules, or invariants between optional fields. Those are complexity tells. Also add a "call out existing overengineering" rule: when working on a task, if you spot an overengineered existing feature, open a cleanup tracking issue rather than widening the current PR. Names the shape of issue to open so it's actionable. Co-Authored-By: Claude Opus 4.7 (1M context) --- AGENTS.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/AGENTS.md b/AGENTS.md index 90e3209c..13c3cd0c 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -49,6 +49,16 @@ Before adding features, research how peer frameworks solve the problem. 
Prefer t ### 5. YAGNI — You Aren't Gonna Need It Don't build features until there's a concrete need. Before adding a new capability, ask: "Is there real demand for this today, or am I anticipating future needs?" Numeric thresholds, extra tracking fields, and configurable knobs should be omitted until users actually request them. Start with the simplest version (e.g., boolean over numeric range) and extend later if needed. +**YAGNI applies to *how* you meet a real request, not just *whether* to meet it.** The common failure mode is not "I built X and nobody wanted it." It's "someone asked for X and I built a bigger X than they asked for." Guard against that with these habits: + +1. **Audit existing primitives before adding new ones.** When an issue asks for capability Y, the first question is not "how do I build Y?" — it's **"what does the codebase already do that addresses Y?"** Grep for existing functions, endpoints, and config shapes. Many requests are satisfied by a behavior that already exists and just needs to be surfaced, configured, or exercised differently. +2. **Treat issue language as a hint, not a spec.** Issues describe problems *and* implementations. "We need a discovery root" is one implementation of "we need the registry to update live." When an issue lists multiple acceptable approaches (or its acceptance criteria don't actually require the implementation it names), pick the one with the least code surface. Summarize the acceptance criteria in your own words, strip out implementation nouns ("discovery root," "watcher," "registry reload"), then match them against existing primitives before designing anything new. +3. **Prefer data/config changes over new mechanisms.** If the observable effect is "this list should be editable at runtime," prefer "re-read the file per request" over "add a watcher + a new field + a precedence rule + a new endpoint." Config-driven beats code-driven when both are sufficient. +4. 
**Stop when scope doubles.** If an implementation's surface area grows more than ~2× the starting estimate (extra types, extra endpoints, extra invariants), that's a red flag to re-plan, not a sign to push through. Pause and ask: "What would the smallest possible version look like? Does the issue actually require more than that?" +5. **If you are about to add a second mode, two-layer precedence, or an invariant between two optional fields, stop.** `source: manual | discovered`, "pinned wins over discovered," `excluded_paths` filtering the discovered set — every one of these is a sign that you're in complexity territory that a simpler data model would have avoided. + +**Call out existing overengineering.** If, while working on a task, you notice a *current* feature in the repo that looks overengineered relative to what it's used for (multiple modes, optional precedence rules, dead-looking extensibility scaffolding), flag it — don't silently fix it. Open a tracking issue titled "cleanup: simplify X" that lists: the observable behavior today, the simpler model that would cover it, and the migration notes. Link to the code. Do not widen your current PR to absorb the cleanup unless the user asks. + ### 6. Non-Breaking Extensions New fields should be optional. Existing configurations must continue working unchanged.