Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 92 additions & 0 deletions apps/cli/src/commands/results/serve.ts
Original file line number Diff line number Diff line change
Expand Up @@ -518,6 +518,96 @@ function handleExperiments(c: C, { searchDir, agentvDir }: DataContext) {
return c.json({ experiments });
}

/**
 * Builds the experiment × target comparison matrix and sends it as JSON.
 *
 * Every record of every result file listed for `searchDir` is folded into one
 * cell per (experiment, target) pair. The aggregates (`eval_count`,
 * `passed_count`, `pass_rate`, `avg_score`) cover all records seen, while
 * `tests` holds only the last-seen record per test id, ordered by when each id
 * was last seen and capped at the 100 most recently seen ids.
 *
 * @param c - Request context; only `c.json` is used, to emit the response.
 * @param ctx - `searchDir` is scanned for result files; `agentvDir` supplies
 *   the pass threshold from the studio config.
 * @returns The result of `c.json({ experiments, targets, cells })`, where
 *   `experiments` and `targets` are sorted lists of the distinct names seen.
 */
function handleCompare(c: C, { searchDir, agentvDir }: DataContext) {
  const MAX_TESTS_PER_CELL = 100;

  type TestSummary = {
    test_id: string;
    score: number;
    passed: boolean;
    execution_status?: string;
  };

  type CellAccumulator = {
    experiment: string;
    target: string;
    evalCount: number;
    passedCount: number;
    scoreSum: number;
    // Keyed by test id; iteration order is "least recently seen first".
    latestTests: Map<string, TestSummary>;
  };

  const metas = listResultFiles(searchDir);
  const { threshold: pass_threshold } = loadStudioConfig(agentvDir);

  // Collect per-test-case results keyed by experiment × target
  const cellMap = new Map<string, CellAccumulator>();
  const experimentsSet = new Set<string>();
  const targetsSet = new Set<string>();

  for (const m of metas) {
    try {
      const records = loadLightweightResults(m.path);
      for (const r of records) {
        const experiment = r.experiment ?? 'default';
        const target = r.target ?? 'default';
        experimentsSet.add(experiment);
        targetsSet.add(target);

        // JSON-encode the pair so names containing separators cannot collide.
        const key = JSON.stringify([experiment, target]);
        const entry: CellAccumulator = cellMap.get(key) ?? {
          experiment,
          target,
          evalCount: 0,
          passedCount: 0,
          scoreSum: 0,
          latestTests: new Map<string, TestSummary>(),
        };

        const passed = r.score >= pass_threshold;
        entry.evalCount++;
        if (passed) entry.passedCount++;
        entry.scoreSum += r.score;

        // Map.set on an existing key keeps the key's original position, so
        // delete first: a re-run test must move to the end, otherwise the cap
        // below could discard the newest results instead of the oldest.
        entry.latestTests.delete(r.testId);
        entry.latestTests.set(r.testId, {
          test_id: r.testId,
          score: r.score,
          passed,
          execution_status: r.executionStatus,
        });

        cellMap.set(key, entry);
      }
    } catch {
      // Best effort: skip runs that fail to load
    }
  }

  const cells = [...cellMap.values()].map((entry) => {
    // Cap to the most recently seen tests to prevent unbounded payloads
    const cappedTests = [...entry.latestTests.values()].slice(-MAX_TESTS_PER_CELL);

    return {
      experiment: entry.experiment,
      target: entry.target,
      eval_count: entry.evalCount,
      passed_count: entry.passedCount,
      pass_rate: entry.evalCount > 0 ? entry.passedCount / entry.evalCount : 0,
      avg_score: entry.evalCount > 0 ? entry.scoreSum / entry.evalCount : 0,
      tests: cappedTests,
    };
  });

  return c.json({
    experiments: [...experimentsSet].sort(),
    targets: [...targetsSet].sort(),
    cells,
  });
}

function handleTargets(c: C, { searchDir, agentvDir }: DataContext) {
const metas = listResultFiles(searchDir);
const { threshold: pass_threshold } = loadStudioConfig(agentvDir);
Expand Down Expand Up @@ -808,6 +898,7 @@ export function createApp(
app.get('/api/runs/:filename/evals/:evalId/files', (c) => handleEvalFiles(c, defaultCtx));
app.get('/api/runs/:filename/evals/:evalId/files/*', (c) => handleEvalFileContent(c, defaultCtx));
app.get('/api/experiments', (c) => handleExperiments(c, defaultCtx));
app.get('/api/compare', (c) => handleCompare(c, defaultCtx));
app.get('/api/targets', (c) => handleTargets(c, defaultCtx));

// Feedback (unscoped — read uses defaultCtx.searchDir as resultDir)
Expand Down Expand Up @@ -914,6 +1005,7 @@ export function createApp(
withProject(c, handleEvalFileContent),
);
app.get('/api/projects/:projectId/experiments', (c) => withProject(c, handleExperiments));
app.get('/api/projects/:projectId/compare', (c) => withProject(c, handleCompare));
app.get('/api/projects/:projectId/targets', (c) => withProject(c, handleTargets));
app.get('/api/projects/:projectId/feedback', (c) => withProject(c, handleFeedbackRead));

Expand Down
281 changes: 281 additions & 0 deletions apps/studio/src/components/CompareTab.tsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,281 @@
/**
* Cross-model comparison matrix component.
*
* Displays a grid of experiment (columns) x target (rows) cells,
* each showing pass rate, average score, and test counts. Color-coded
* by performance: green (>=80%), yellow (>=50% and <80%), red (<50%).
* Cells are expandable to show per-test-case breakdown.
*
* Used in both unscoped and project-scoped views.
*/

import { useState } from 'react';

import type { CompareCell, CompareResponse, CompareTestResult } from '~/lib/types';

/** Props accepted by the CompareTab component. */
interface CompareTabProps {
/** Comparison payload to render; when undefined or without cells, an empty-state message is rendered. */
data: CompareResponse | undefined;
/** When true, a loading skeleton is rendered instead of the matrix. */
isLoading: boolean;
/** Marks the load as failed; consulted together with `error` when deciding whether to show the failure banner. */
isError?: boolean;
/** Failure details; its `message` is displayed in the failure banner. */
error?: Error | null;
}

/**
 * Renders the experiment (columns) × target (rows) comparison matrix.
 *
 * States are checked in this order: loading skeleton, failure banner, empty
 * state (no data or no cells), "not enough variation" notice (at most one
 * experiment and at most one target), and finally the matrix with its legend.
 *
 * @param data - Comparison payload; `experiments`, `targets` and `cells` are read from it.
 * @param isLoading - Shows the loading skeleton while true.
 * @param isError - Shows the failure banner when true, even if no `error` object is supplied.
 * @param error - Optional failure details; `error.message` is shown when available.
 */
export function CompareTab({ data, isLoading, isError, error }: CompareTabProps) {
  if (isLoading) {
    return <LoadingSkeleton />;
  }

  // A failure without an Error object must still surface as a failure rather
  // than fall through to the misleading "No comparison data available" state.
  if (isError) {
    return (
      <div className="rounded-lg border border-red-900/50 bg-red-950/20 p-6 text-red-400">
        Failed to load comparison data: {error?.message ?? 'Unknown error'}
      </div>
    );
  }

  if (!data || data.cells.length === 0) {
    return (
      <div className="rounded-lg border border-gray-800 bg-gray-900 p-8 text-center">
        <p className="text-lg text-gray-400">No comparison data available</p>
        <p className="mt-2 text-sm text-gray-500">
          Run evaluations with different experiment and target combinations to see a comparison
          matrix.
        </p>
      </div>
    );
  }

  const { experiments, targets, cells } = data;

  // If there is only one experiment and one target, the matrix is trivial
  if (experiments.length <= 1 && targets.length <= 1) {
    return (
      <div className="rounded-lg border border-gray-800 bg-gray-900 p-8 text-center">
        <p className="text-lg text-gray-400">Not enough variation to compare</p>
        <p className="mt-2 text-sm text-gray-500">
          The comparison matrix requires at least 2 experiments or 2 targets. Currently there{' '}
          {experiments.length === 1 ? 'is 1 experiment' : `are ${experiments.length} experiments`}{' '}
          and {targets.length === 1 ? '1 target' : `${targets.length} targets`}.
        </p>
      </div>
    );
  }

  // Build a lookup map for cells, keyed by the JSON-encoded [experiment, target] pair
  const cellMap = new Map<string, CompareCell>();
  for (const cell of cells) {
    cellMap.set(JSON.stringify([cell.experiment, cell.target]), cell);
  }

  // Find best and worst pass rate per row (target) for highlighting.
  // The sentinels (-1 and 2) lie outside the 0..1 range of a pass rate, so
  // the first populated cell in a row always replaces them.
  const bestByTarget = new Map<string, number>();
  const worstByTarget = new Map<string, number>();
  for (const target of targets) {
    let best = -1;
    let worst = 2;
    for (const experiment of experiments) {
      const cell = cellMap.get(JSON.stringify([experiment, target]));
      if (cell) {
        if (cell.pass_rate > best) best = cell.pass_rate;
        if (cell.pass_rate < worst) worst = cell.pass_rate;
      }
    }
    bestByTarget.set(target, best);
    worstByTarget.set(target, worst);
  }

  return (
    <div className="space-y-4">
      {/* Legend: labels mirror the >= 0.8 / >= 0.5 thresholds used to colour the cells */}
      <div className="flex items-center gap-4 text-sm text-gray-400">
        <span className="flex items-center gap-1.5">
          <span className="inline-block h-3 w-3 rounded-sm bg-emerald-900/60 ring-1 ring-emerald-700/40" />
          &#8805;80%
        </span>
        <span className="flex items-center gap-1.5">
          <span className="inline-block h-3 w-3 rounded-sm bg-amber-900/40 ring-1 ring-amber-700/40" />
          50% to &lt;80%
        </span>
        <span className="flex items-center gap-1.5">
          <span className="inline-block h-3 w-3 rounded-sm bg-red-900/40 ring-1 ring-red-700/40" />
          &lt;50%
        </span>
        <span className="flex items-center gap-1.5">
          <span className="inline-block h-3 w-3 rounded-sm border border-dashed border-gray-700" />
          No data
        </span>
      </div>

      <div className="overflow-x-auto rounded-lg border border-gray-800">
        <table className="w-full text-left text-sm">
          <thead className="border-b border-gray-800 bg-gray-900/50">
            <tr>
              <th className="px-4 py-3 font-medium text-gray-400">Target</th>
              {experiments.map((exp) => (
                <th key={exp} className="px-4 py-3 text-center font-medium text-gray-400">
                  {exp}
                </th>
              ))}
            </tr>
          </thead>
          <tbody className="divide-y divide-gray-800/50">
            {targets.map((target) => (
              <CompareRow
                key={target}
                target={target}
                experiments={experiments}
                cellMap={cellMap}
                bestRate={bestByTarget.get(target) ?? 0}
                worstRate={worstByTarget.get(target) ?? 0}
              />
            ))}
          </tbody>
        </table>
      </div>
    </div>
  );
}

/**
 * One matrix row: the target name followed by one column per experiment.
 *
 * A column shows the matching cell when `cellMap` has one for the
 * (experiment, target) pair, otherwise a dashed "--" placeholder. Best/worst
 * markers are only applied when there is more than one experiment and the
 * row's best and worst pass rates differ.
 */
function CompareRow({
  target,
  experiments,
  cellMap,
  bestRate,
  worstRate,
}: {
  target: string;
  experiments: string[];
  cellMap: Map<string, CompareCell>;
  bestRate: number;
  worstRate: number;
}) {
  // Shared precondition for both markers, evaluated once per row.
  const canHighlight = experiments.length > 1 && bestRate !== worstRate;

  const columns = experiments.map((experimentName) => {
    const entry = cellMap.get(JSON.stringify([experimentName, target]));

    if (!entry) {
      return (
        <td key={experimentName} className="px-2 py-2">
          <div className="flex items-center justify-center rounded-lg border border-dashed border-gray-700 px-3 py-4 text-gray-600">
            --
          </div>
        </td>
      );
    }

    return (
      <td key={experimentName} className="px-2 py-2">
        <CompareMatrixCell
          cell={entry}
          isBest={canHighlight && entry.pass_rate === bestRate}
          isWorst={canHighlight && entry.pass_rate === worstRate}
        />
      </td>
    );
  });

  return (
    <tr className="transition-colors hover:bg-gray-900/30">
      <td className="px-4 py-3 font-medium text-gray-200">{target}</td>
      {columns}
    </tr>
  );
}

/**
 * Maps a pass rate to the background/ring classes of a matrix cell:
 * emerald at 0.8 and above, amber from 0.5 up to (not including) 0.8,
 * red for everything else.
 */
function passRateColorClass(rate: number): string {
  return rate >= 0.8
    ? 'bg-emerald-900/60 ring-emerald-700/40'
    : rate >= 0.5
      ? 'bg-amber-900/40 ring-amber-700/40'
      : 'bg-red-900/40 ring-red-700/40';
}

/**
 * Maps a pass rate to the text colour class of the percentage label:
 * emerald at 0.8 and above, amber from 0.5 up to (not including) 0.8,
 * red for everything else.
 */
function passRateTextClass(rate: number): string {
  // Negated ">=" (rather than "<") so a NaN rate still lands in the red bucket.
  if (!(rate >= 0.5)) {
    return 'text-red-400';
  }
  return rate >= 0.8 ? 'text-emerald-400' : 'text-amber-400';
}

/**
 * A single matrix cell: a toggle button summarising pass rate, pass counts and
 * average score, with the per-test breakdown rendered underneath while the
 * cell is expanded.
 */
function CompareMatrixCell({
  cell,
  isBest,
  isWorst,
}: {
  cell: CompareCell;
  isBest: boolean;
  isWorst: boolean;
}) {
  const [expanded, setExpanded] = useState(false);

  const pct = Math.round(cell.pass_rate * 100);
  const avgPct = Math.round(cell.avg_score * 100);

  // Best takes precedence over worst; unmarked cells get no extra ring classes.
  let emphasis = '';
  if (isBest) {
    emphasis = 'ring-2 ring-emerald-500/60';
  } else if (isWorst) {
    emphasis = 'ring-2 ring-red-500/40';
  }

  const buttonClasses = [
    'w-full rounded-lg px-3 py-3 text-center ring-1 transition-colors',
    passRateColorClass(cell.pass_rate),
    'hover:brightness-110',
    emphasis,
  ].join(' ');

  const toggleExpanded = () => setExpanded(!expanded);

  return (
    <div className="space-y-1">
      <button
        type="button"
        onClick={toggleExpanded}
        aria-expanded={expanded}
        className={buttonClasses}
      >
        <div className="flex items-center justify-center gap-1">
          <span
            className={`text-lg font-semibold tabular-nums ${passRateTextClass(cell.pass_rate)}`}
          >
            {pct}%
          </span>
          {isBest && (
            <span className="text-xs text-emerald-400" title="Best performer">
              &#9650;
            </span>
          )}
          {isWorst && (
            <span className="text-xs text-red-400" title="Worst performer">
              &#9660;
            </span>
          )}
        </div>
        <div className="mt-0.5 text-xs text-gray-400">
          {cell.passed_count}/{cell.eval_count} pass | avg {avgPct}%
        </div>
      </button>

      {expanded && <TestCaseBreakdown tests={cell.tests} />}
    </div>
  );
}

/**
 * Scrollable list of a cell's test results: a pass/fail mark, the test id
 * (truncated, with the full id as tooltip) and the score as a rounded percentage.
 */
function TestCaseBreakdown({ tests }: { tests: CompareTestResult[] }) {
  const rows = tests.map((t) => {
    const markClass = t.passed ? 'text-emerald-400' : 'text-red-400';
    const scorePct = Math.round(t.score * 100);

    return (
      <div key={t.test_id} className="flex items-center gap-2 rounded px-1.5 py-0.5 text-xs">
        <span className={markClass}>{t.passed ? '\u2713' : '\u2717'}</span>
        <span className="flex-1 truncate text-gray-300" title={t.test_id}>
          {t.test_id}
        </span>
        <span className="tabular-nums text-gray-500">{scorePct}%</span>
      </div>
    );
  });

  return (
    <div className="mt-1 max-h-48 overflow-y-auto rounded-md border border-gray-800 bg-gray-950/80 p-2">
      <div className="mb-1 text-xs font-medium text-gray-500">Test Cases</div>
      <div className="space-y-0.5">{rows}</div>
    </div>
  );
}

/**
 * Pulsing placeholder shown while comparison data loads: a header bar followed
 * by three rows, each with one label block and three cell blocks.
 */
function LoadingSkeleton() {
  const rowIds = ['sk-1', 'sk-2', 'sk-3'];

  const placeholderRows = rowIds.map((rowId) => (
    <div key={rowId} className="flex gap-4 border-b border-gray-800/50 px-4 py-6">
      <div className="h-4 w-24 rounded bg-gray-800" />
      <div className="h-16 w-32 rounded bg-gray-800" />
      <div className="h-16 w-32 rounded bg-gray-800" />
      <div className="h-16 w-32 rounded bg-gray-800" />
    </div>
  ));

  return (
    <div className="overflow-hidden rounded-lg border border-gray-800">
      <div className="animate-pulse">
        <div className="border-b border-gray-800 bg-gray-900/50 px-4 py-3">
          <div className="h-4 w-48 rounded bg-gray-800" />
        </div>
        {placeholderRows}
      </div>
    </div>
  );
}
Loading
Loading