From c2358e2d8ec8675ec245c28388dec904f0523304 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 8 Apr 2026 22:22:16 +0000 Subject: [PATCH 1/3] feat(studio): add cross-model comparison matrix view (#981) Add a Compare tab to the Studio UI that displays a matrix of experiment (columns) x target (rows) cells. Each cell shows pass rate, average score, and test counts, color-coded by performance thresholds (green >80%, yellow 50-80%, red <50%). Cells are expandable to show per-test-case breakdown. Backend: new /api/compare and /api/projects/:projectId/compare endpoints that group runs by experiment x target and compute pass_rate + avg_score. Closes #981 Co-Authored-By: Claude Opus 4.6 (1M context) --- apps/cli/src/commands/results/serve.ts | 78 ++++++ apps/studio/src/components/CompareTab.tsx | 261 ++++++++++++++++++ apps/studio/src/lib/api.ts | 18 ++ apps/studio/src/lib/types.ts | 23 ++ apps/studio/src/routes/index.tsx | 11 +- .../studio/src/routes/projects/$projectId.tsx | 14 +- 6 files changed, 401 insertions(+), 4 deletions(-) create mode 100644 apps/studio/src/components/CompareTab.tsx diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index 4e54155b8..8d00849f1 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -518,6 +518,82 @@ function handleExperiments(c: C, { searchDir, agentvDir }: DataContext) { return c.json({ experiments }); } +function handleCompare(c: C, { searchDir, agentvDir }: DataContext) { + const metas = listResultFiles(searchDir); + const { threshold: pass_threshold } = loadStudioConfig(agentvDir); + + // Collect per-test-case results keyed by experiment × target + const cellMap = new Map< + string, + { + experiment: string; + target: string; + evalCount: number; + passedCount: number; + scoreSum: number; + tests: Array<{ + test_id: string; + score: number; + passed: boolean; + execution_status?: string; + }>; + } + >(); + + const experimentsSet = new Set(); + const targetsSet = new Set(); + + for (const m of metas) { + try { + const records = loadLightweightResults(m.path); + for (const r of records) { + const experiment = r.experiment ?? 'default'; + const target = r.target ?? 'default'; + experimentsSet.add(experiment); + targetsSet.add(target); + const key = `${experiment}\0${target}`; + const entry = cellMap.get(key) ?? { + experiment, + target, + evalCount: 0, + passedCount: 0, + scoreSum: 0, + tests: [], + }; + const passed = r.score >= pass_threshold; + entry.evalCount++; + if (passed) entry.passedCount++; + entry.scoreSum += r.score; + entry.tests.push({ + test_id: r.testId, + score: r.score, + passed, + execution_status: r.executionStatus, + }); + cellMap.set(key, entry); + } + } catch { + // skip runs that fail to load + } + } + + const cells = [...cellMap.values()].map((entry) => ({ + experiment: entry.experiment, + target: entry.target, + eval_count: entry.evalCount, + passed_count: entry.passedCount, + pass_rate: entry.evalCount > 0 ? entry.passedCount / entry.evalCount : 0, + avg_score: entry.evalCount > 0 ? entry.scoreSum / entry.evalCount : 0, + tests: entry.tests, + })); + + return c.json({ + experiments: [...experimentsSet].sort(), + targets: [...targetsSet].sort(), + cells, + }); +} + function handleTargets(c: C, { searchDir, agentvDir }: DataContext) { const metas = listResultFiles(searchDir); const { threshold: pass_threshold } = loadStudioConfig(agentvDir); @@ -808,6 +884,7 @@ export function createApp( app.get('/api/runs/:filename/evals/:evalId/files', (c) => handleEvalFiles(c, defaultCtx)); app.get('/api/runs/:filename/evals/:evalId/files/*', (c) => handleEvalFileContent(c, defaultCtx)); app.get('/api/experiments', (c) => handleExperiments(c, defaultCtx)); + app.get('/api/compare', (c) => handleCompare(c, defaultCtx)); app.get('/api/targets', (c) => handleTargets(c, defaultCtx)); // Feedback (unscoped — read uses defaultCtx.searchDir as resultDir) @@ -914,6 +991,7 @@ export function createApp( withProject(c, handleEvalFileContent), ); app.get('/api/projects/:projectId/experiments', (c) => withProject(c, handleExperiments)); + app.get('/api/projects/:projectId/compare', (c) => withProject(c, handleCompare)); app.get('/api/projects/:projectId/targets', (c) => withProject(c, handleTargets)); app.get('/api/projects/:projectId/feedback', (c) => withProject(c, handleFeedbackRead)); diff --git a/apps/studio/src/components/CompareTab.tsx b/apps/studio/src/components/CompareTab.tsx new file mode 100644 index 000000000..82943746e --- /dev/null +++ b/apps/studio/src/components/CompareTab.tsx @@ -0,0 +1,261 @@ +/** + * Cross-model comparison matrix component. + * + * Displays a grid of experiment (columns) x target (rows) cells, + * each showing pass rate, average score, and test counts. Color-coded + * by performance: green (>80%), yellow (50-80%), red (<50%). + * Cells are expandable to show per-test-case breakdown. + * + * Used in both unscoped and project-scoped views. + */ + +import { useState } from 'react'; + +import type { CompareCell, CompareResponse, CompareTestResult } from '~/lib/types'; + +interface CompareTabProps { + data: CompareResponse | undefined; + isLoading: boolean; +} + +export function CompareTab({ data, isLoading }: CompareTabProps) { + if (isLoading) { + return ; + } + + if (!data || data.cells.length === 0) { + return ( +
+

No comparison data available

+

+ Run evaluations with different experiment and target combinations to see a comparison + matrix. +

+
+ ); + } + + const { experiments, targets, cells } = data; + + // If there is only one experiment and one target, the matrix is trivial + if (experiments.length <= 1 && targets.length <= 1) { + return ( +
+

Not enough variation to compare

+

+ The comparison matrix requires at least 2 experiments or 2 targets. Currently there{' '} + {experiments.length === 1 ? 'is 1 experiment' : `are ${experiments.length} experiments`}{' '} + and {targets.length === 1 ? '1 target' : `${targets.length} targets`}. +

+
+ ); + } + + // Build a lookup map for cells + const cellMap = new Map(); + for (const cell of cells) { + cellMap.set(`${cell.experiment}\0${cell.target}`, cell); + } + + // Find best pass rate per row (target) for highlighting + const bestByTarget = new Map(); + const worstByTarget = new Map(); + for (const target of targets) { + let best = -1; + let worst = 2; + for (const experiment of experiments) { + const cell = cellMap.get(`${experiment}\0${target}`); + if (cell) { + if (cell.pass_rate > best) best = cell.pass_rate; + if (cell.pass_rate < worst) worst = cell.pass_rate; + } + } + bestByTarget.set(target, best); + worstByTarget.set(target, worst); + } + + return ( +
+
+ + + >80% + + + + 50-80% + + + + <50% + + + + No data + +
+ +
+ + + + + {experiments.map((exp) => ( + + ))} + + + + {targets.map((target) => ( + + ))} + +
Target + {exp} +
+
+
+ ); +} + +function CompareRow({ + target, + experiments, + cellMap, + bestRate, + worstRate, +}: { + target: string; + experiments: string[]; + cellMap: Map; + bestRate: number; + worstRate: number; +}) { + return ( + + {target} + {experiments.map((exp) => { + const cell = cellMap.get(`${exp}\0${target}`); + return ( + + {cell ? ( + 1 && cell.pass_rate === bestRate && bestRate !== worstRate} + isWorst={experiments.length > 1 && cell.pass_rate === worstRate && bestRate !== worstRate} + /> + ) : ( +
+ -- +
+ )} + + ); + })} + + ); +} + +function passRateColorClass(rate: number): string { + if (rate >= 0.8) return 'bg-emerald-900/60 ring-emerald-700/40'; + if (rate >= 0.5) return 'bg-amber-900/40 ring-amber-700/40'; + return 'bg-red-900/40 ring-red-700/40'; +} + +function passRateTextClass(rate: number): string { + if (rate >= 0.8) return 'text-emerald-400'; + if (rate >= 0.5) return 'text-amber-400'; + return 'text-red-400'; +} + +function CompareMatrixCell({ + cell, + isBest, + isWorst, +}: { + cell: CompareCell; + isBest: boolean; + isWorst: boolean; +}) { + const [expanded, setExpanded] = useState(false); + const pct = Math.round(cell.pass_rate * 100); + const avgPct = Math.round(cell.avg_score * 100); + + return ( +
+ + + {expanded && ( + + )} +
+ ); +} + +function TestCaseBreakdown({ tests }: { tests: CompareTestResult[] }) { + return ( +
+
Test Cases
+
+ {tests.map((t) => ( +
+ + {t.passed ? '\u2713' : '\u2717'} + + + {t.test_id} + + {Math.round(t.score * 100)}% +
+ ))} +
+
+ ); +} + +function LoadingSkeleton() { + return ( +
+
+
+
+
+ {['sk-1', 'sk-2', 'sk-3'].map((id) => ( +
+
+
+
+
+
+ ))} +
+
+ ); +} diff --git a/apps/studio/src/lib/api.ts b/apps/studio/src/lib/api.ts index 267106f81..0ffc8a868 100644 --- a/apps/studio/src/lib/api.ts +++ b/apps/studio/src/lib/api.ts @@ -9,6 +9,7 @@ import { queryOptions, useQuery } from '@tanstack/react-query'; import type { CategoriesResponse, + CompareResponse, EvalDetailResponse, EvalDiscoverResponse, EvalPreviewResponse, @@ -88,6 +89,11 @@ export const experimentsOptions = queryOptions({ queryFn: () => fetchJson('/api/experiments'), }); +export const compareOptions = queryOptions({ + queryKey: ['compare'], + queryFn: () => fetchJson('/api/compare'), +}); + export const targetsOptions = queryOptions({ queryKey: ['targets'], queryFn: () => fetchJson('/api/targets'), @@ -171,6 +177,10 @@ export function useExperiments() { return useQuery(experimentsOptions); } +export function useCompare() { + return useQuery(compareOptions); +} + export function useTargets() { return useQuery(targetsOptions); } @@ -372,6 +382,14 @@ export function projectExperimentsOptions(projectId: string) { }); } +export function projectCompareOptions(projectId: string) { + return queryOptions({ + queryKey: ['projects', projectId, 'compare'], + queryFn: () => fetchJson(`${projectApiBase(projectId)}/compare`), + enabled: !!projectId, + }); +} + export function projectTargetsOptions(projectId: string) { return queryOptions({ queryKey: ['projects', projectId, 'targets'], diff --git a/apps/studio/src/lib/types.ts b/apps/studio/src/lib/types.ts index bc6be5908..97300baa1 100644 --- a/apps/studio/src/lib/types.ts +++ b/apps/studio/src/lib/types.ts @@ -127,6 +127,29 @@ export interface ExperimentsResponse { experiments: ExperimentSummary[]; } +export interface CompareTestResult { + test_id: string; + score: number; + passed: boolean; + execution_status?: string; +} + +export interface CompareCell { + experiment: string; + target: string; + eval_count: number; + passed_count: number; + pass_rate: number; + avg_score: number; + tests: CompareTestResult[]; +} + +export interface CompareResponse { + experiments: string[]; + targets: string[]; + cells: CompareCell[]; +} + export interface TargetSummary { name: string; run_count: number; diff --git a/apps/studio/src/routes/index.tsx b/apps/studio/src/routes/index.tsx index cf95c22bf..ed6185d4c 100644 --- a/apps/studio/src/routes/index.tsx +++ b/apps/studio/src/routes/index.tsx @@ -10,6 +10,7 @@ import { createFileRoute, useNavigate, useRouterState } from '@tanstack/react-ro import { useState } from 'react'; import { useQueryClient } from '@tanstack/react-query'; +import { CompareTab } from '~/components/CompareTab'; import { ExperimentsTab } from '~/components/ExperimentsTab'; import { ProjectCard } from '~/components/ProjectCard'; import { RunEvalModal } from '~/components/RunEvalModal'; @@ -18,16 +19,18 @@ import { TargetsTab } from '~/components/TargetsTab'; import { addProjectApi, discoverProjectsApi, + useCompare, useProjectList, useRunList, useStudioConfig, } from '~/lib/api'; -type TabId = 'runs' | 'experiments' | 'targets'; +type TabId = 'runs' | 'experiments' | 'compare' | 'targets'; const tabs: { id: TabId; label: string }[] = [ { id: 'runs', label: 'Recent Runs' }, { id: 'experiments', label: 'Experiments' }, + { id: 'compare', label: 'Compare' }, { id: 'targets', label: 'Targets' }, ]; @@ -225,6 +228,7 @@ function SingleProjectHome() { {/* Tab content */} {activeTab === 'runs' && } {activeTab === 'experiments' && } + {activeTab === 'compare' && } {activeTab === 'targets' && } {!isReadOnly && setShowRunEval(false)} />} @@ -232,6 +236,11 @@ function SingleProjectHome() { ); } +function CompareTabContent() { + const { data, isLoading } = useCompare(); + return ; +} + function RunsTabContent({ data, isLoading, diff --git a/apps/studio/src/routes/projects/$projectId.tsx b/apps/studio/src/routes/projects/$projectId.tsx index b38d112e4..12bc5309a 100644 --- a/apps/studio/src/routes/projects/$projectId.tsx +++ b/apps/studio/src/routes/projects/$projectId.tsx @@ -8,17 +8,19 @@ import { createFileRoute, useNavigate, useRouterState } from '@tanstack/react-ro import { useState } from 'react'; import { useQuery } from '@tanstack/react-query'; +import { CompareTab } from '~/components/CompareTab'; import { RunEvalModal } from '~/components/RunEvalModal'; import { RunList } from '~/components/RunList'; import { useProjectRunList, useStudioConfig } from '~/lib/api'; -import { projectExperimentsOptions, projectTargetsOptions } from '~/lib/api'; -import type { ExperimentsResponse, TargetsResponse } from '~/lib/types'; +import { projectCompareOptions, projectExperimentsOptions, projectTargetsOptions } from '~/lib/api'; +import type { CompareResponse, ExperimentsResponse, TargetsResponse } from '~/lib/types'; -type TabId = 'runs' | 'experiments' | 'targets'; +type TabId = 'runs' | 'experiments' | 'compare' | 'targets'; const tabs: { id: TabId; label: string }[] = [ { id: 'runs', label: 'Recent Runs' }, { id: 'experiments', label: 'Experiments' }, + { id: 'compare', label: 'Compare' }, { id: 'targets', label: 'Targets' }, ]; @@ -81,6 +83,7 @@ function ProjectHomePage() { {activeTab === 'runs' && } {activeTab === 'experiments' && } + {activeTab === 'compare' && } {activeTab === 'targets' && } {!isReadOnly && ( @@ -162,6 +165,11 @@ function ProjectExperimentsTab({ projectId }: { projectId: string }) { ); } +function ProjectCompareTab({ projectId }: { projectId: string }) { + const { data, isLoading } = useQuery(projectCompareOptions(projectId)); + return ; +} + function ProjectTargetsTab({ projectId }: { projectId: string }) { const { data, isLoading } = useQuery(projectTargetsOptions(projectId)); const targets = (data as TargetsResponse | undefined)?.targets ?? []; From 8aee759d740e0743f1bb5ecd78645dfff5e9cf82 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 8 Apr 2026 22:23:50 +0000 Subject: [PATCH 2/3] style(studio): fix biome formatting in CompareTab Co-Authored-By: Claude Opus 4.6 (1M context) --- apps/studio/src/components/CompareTab.tsx | 33 ++++++++++++++--------- 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/apps/studio/src/components/CompareTab.tsx b/apps/studio/src/components/CompareTab.tsx index 82943746e..dc99ccdbf 100644 --- a/apps/studio/src/components/CompareTab.tsx +++ b/apps/studio/src/components/CompareTab.tsx @@ -148,8 +148,12 @@ function CompareRow({ {cell ? ( 1 && cell.pass_rate === bestRate && bestRate !== worstRate} - isWorst={experiments.length > 1 && cell.pass_rate === worstRate && bestRate !== worstRate} + isBest={ + experiments.length > 1 && cell.pass_rate === bestRate && bestRate !== worstRate + } + isWorst={ + experiments.length > 1 && cell.pass_rate === worstRate && bestRate !== worstRate + } /> ) : (
@@ -198,20 +202,28 @@ function CompareMatrixCell({ }`} >
- + {pct}% - {isBest && } - {isWorst && } + {isBest && ( + + ▲ + + )} + {isWorst && ( + + ▼ + + )}
{cell.passed_count}/{cell.eval_count} pass | avg {avgPct}%
- {expanded && ( - - )} + {expanded && }
); } @@ -222,10 +234,7 @@ function TestCaseBreakdown({ tests }: { tests: CompareTestResult[] }) {
Test Cases
{tests.map((t) => ( -
+
{t.passed ? '\u2713' : '\u2717'} From 87ca80b7f0469f487533bbab9282399b9c94f583 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 8 Apr 2026 22:42:10 +0000 Subject: [PATCH 3/3] fix(studio): address code review findings for comparison view - Use JSON.stringify key to prevent cell collisions - Cap tests array per cell to prevent unbounded payload - Deduplicate test results (keep latest per test_id) - Add aria-expanded to expandable cells - Thread error state into CompareTab - Remove type assertion in project compare tab Co-Authored-By: Claude Opus 4.6 (1M context) --- apps/cli/src/commands/results/serve.ts | 34 +++++++++++++------ apps/studio/src/components/CompareTab.tsx | 19 ++++++++--- apps/studio/src/routes/index.tsx | 4 +-- .../studio/src/routes/projects/$projectId.tsx | 6 ++-- 4 files changed, 44 insertions(+), 19 deletions(-) diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index 8d00849f1..57686a897 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -551,7 +551,7 @@ function handleCompare(c: C, { searchDir, agentvDir }: DataContext) { const target = r.target ?? 'default'; experimentsSet.add(experiment); targetsSet.add(target); - const key = `${experiment}\0${target}`; + const key = JSON.stringify([experiment, target]); const entry = cellMap.get(key) ?? { experiment, target, @@ -577,15 +577,29 @@ function handleCompare(c: C, { searchDir, agentvDir }: DataContext) { } } - const cells = [...cellMap.values()].map((entry) => ({ - experiment: entry.experiment, - target: entry.target, - eval_count: entry.evalCount, - passed_count: entry.passedCount, - pass_rate: entry.evalCount > 0 ? entry.passedCount / entry.evalCount : 0, - avg_score: entry.evalCount > 0 ? entry.scoreSum / entry.evalCount : 0, - tests: entry.tests, - })); + const MAX_TESTS_PER_CELL = 100; + + const cells = [...cellMap.values()].map((entry) => { + // Deduplicate tests: keep only the latest entry per test_id (last wins by insertion order) + const dedupMap = new Map(); + for (const t of entry.tests) { + dedupMap.set(t.test_id, t); + } + const dedupedTests = [...dedupMap.values()]; + + // Cap to most recent entries to prevent unbounded payloads + const cappedTests = dedupedTests.slice(-MAX_TESTS_PER_CELL); + + return { + experiment: entry.experiment, + target: entry.target, + eval_count: entry.evalCount, + passed_count: entry.passedCount, + pass_rate: entry.evalCount > 0 ? entry.passedCount / entry.evalCount : 0, + avg_score: entry.evalCount > 0 ? entry.scoreSum / entry.evalCount : 0, + tests: cappedTests, + }; + }); return c.json({ experiments: [...experimentsSet].sort(), diff --git a/apps/studio/src/components/CompareTab.tsx b/apps/studio/src/components/CompareTab.tsx index dc99ccdbf..a448214d3 100644 --- a/apps/studio/src/components/CompareTab.tsx +++ b/apps/studio/src/components/CompareTab.tsx @@ -16,13 +16,23 @@ import type { CompareCell, CompareResponse, CompareTestResult } from '~/lib/type interface CompareTabProps { data: CompareResponse | undefined; isLoading: boolean; + isError?: boolean; + error?: Error | null; } -export function CompareTab({ data, isLoading }: CompareTabProps) { +export function CompareTab({ data, isLoading, isError, error }: CompareTabProps) { if (isLoading) { return ; } + if (isError && error) { + return ( +
+ Failed to load comparison data: {error.message} +
+ ); + } + if (!data || data.cells.length === 0) { return (
@@ -54,7 +64,7 @@ export function CompareTab({ data, isLoading }: CompareTabProps) { // Build a lookup map for cells const cellMap = new Map(); for (const cell of cells) { - cellMap.set(`${cell.experiment}\0${cell.target}`, cell); + cellMap.set(JSON.stringify([cell.experiment, cell.target]), cell); } // Find best pass rate per row (target) for highlighting @@ -64,7 +74,7 @@ export function CompareTab({ data, isLoading }: CompareTabProps) { let best = -1; let worst = 2; for (const experiment of experiments) { - const cell = cellMap.get(`${experiment}\0${target}`); + const cell = cellMap.get(JSON.stringify([experiment, target])); if (cell) { if (cell.pass_rate > best) best = cell.pass_rate; if (cell.pass_rate < worst) worst = cell.pass_rate; @@ -142,7 +152,7 @@ function CompareRow({ {target} {experiments.map((exp) => { - const cell = cellMap.get(`${exp}\0${target}`); + const cell = cellMap.get(JSON.stringify([exp, target])); return ( {cell ? ( @@ -197,6 +207,7 @@ function CompareMatrixCell({