26 changes: 26 additions & 0 deletions apps/cli/src/cli.ts
@@ -1,6 +1,32 @@
#!/usr/bin/env node
import { killAllTrackedChildren } from '@agentv/core';

import { runCli } from './index.js';

// Forward SIGINT/SIGTERM to spawned provider subprocesses before exiting.
// Without this, Studio's `child.kill('SIGTERM')` against the CLI orphans
// any in-flight `claude`/`codex`/`pi`/`copilot` subprocess. The partial
// `index.jsonl` is already row-by-row durable, so finished tests survive.
//
// First signal: kill children, exit with the conventional 128+signal code.
// Second signal within the same process: hard-exit immediately so a hung
// child cannot leave the user stuck waiting.
let interrupted = false;
function installShutdown(signal: NodeJS.Signals, exitCode: number) {
process.on(signal, () => {
if (interrupted) {
process.exit(1);
}
interrupted = true;
killAllTrackedChildren('SIGTERM');
// Defer exit briefly (50 ms) so the SIGTERM sent to children has a chance
// to dispatch before the event loop tears down.
setTimeout(() => process.exit(exitCode), 50);
});
}
installShutdown('SIGINT', 130);
installShutdown('SIGTERM', 143);

runCli()
.then(() => {
process.exit(0);
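For context, a minimal sketch of the child tracking that `killAllTrackedChildren` is assumed to implement in `@agentv/core` — a module-level registry that provider spawners register into. Only `killAllTrackedChildren` appears in this diff; `trackChild` and the Set-based registry are assumptions.

import type { ChildProcess } from 'node:child_process';

// Hypothetical registry — the real @agentv/core internals may differ.
const tracked = new Set<ChildProcess>();

export function trackChild(child: ChildProcess): void {
  tracked.add(child);
  child.once('close', () => tracked.delete(child));
}

export function killAllTrackedChildren(signal: NodeJS.Signals): void {
  for (const child of tracked) {
    try {
      child.kill(signal);
    } catch {
      // The child may have exited between tracking and this kill.
    }
  }
}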
78 changes: 73 additions & 5 deletions apps/cli/src/commands/eval/artifact-writer.ts
@@ -34,7 +34,7 @@ export function deduplicateByTestIdTarget(

export async function aggregateRunDir(
runDir: string,
options?: { evalFile?: string; experiment?: string },
options?: { evalFile?: string; experiment?: string; plannedTestCount?: number },
): Promise<{ benchmarkPath: string; timingPath: string; testCount: number; targetCount: number }> {
const indexPath = path.join(runDir, RESULT_INDEX_FILENAME);
const content = await readFile(indexPath, 'utf8');
@@ -45,14 +45,36 @@
const timingPath = path.join(runDir, 'timing.json');
await writeFile(timingPath, `${JSON.stringify(timing, null, 2)}\n`, 'utf8');

const benchmark = buildBenchmarkArtifact(results, options?.evalFile, options?.experiment);
// Preserve `planned_test_count` from any pre-existing benchmark.json (e.g.
// the stub written at run start, or from the original run when this is a
// resume) unless an explicit value was passed.
const plannedTestCount =
options?.plannedTestCount ?? (await readPlannedTestCount(path.join(runDir, 'benchmark.json')));

const benchmark = buildBenchmarkArtifact(
results,
options?.evalFile,
options?.experiment,
plannedTestCount,
);
const benchmarkPath = path.join(runDir, 'benchmark.json');
await writeFile(benchmarkPath, `${JSON.stringify(benchmark, null, 2)}\n`, 'utf8');

const targetSet = new Set(results.map((r) => r.target ?? 'unknown'));
return { benchmarkPath, timingPath, testCount: results.length, targetCount: targetSet.size };
}

async function readPlannedTestCount(benchmarkPath: string): Promise<number | undefined> {
try {
const raw = await readFile(benchmarkPath, 'utf8');
const parsed = JSON.parse(raw) as { metadata?: { planned_test_count?: number } };
const value = parsed.metadata?.planned_test_count;
return typeof value === 'number' && Number.isFinite(value) ? value : undefined;
} catch {
return undefined;
}
}

// ---------------------------------------------------------------------------
// Artifact interfaces (snake_case to match skill-creator conventions)
// ---------------------------------------------------------------------------
@@ -110,6 +132,13 @@ export interface BenchmarkArtifact {
readonly targets: readonly string[];
readonly tests_run: readonly string[];
readonly experiment?: string;
/**
* Total number of test cases the run was planned to execute (across all
* targets and eval files). Written at run start so an interrupted run
* can be diagnosed as resumable when `tests_run.length < planned_test_count`,
* even if every recorded row has `execution_status: ok`.
*/
readonly planned_test_count?: number;
};
readonly run_summary: Record<
string,
@@ -364,6 +393,7 @@ export function buildBenchmarkArtifact(
results: readonly EvaluationResult[],
evalFile = '',
experiment?: string,
plannedTestCount?: number,
): BenchmarkArtifact {
const targetSet = new Set<string>();
const testIdSet = new Set<string>();
@@ -457,13 +487,43 @@ export function buildBenchmarkArtifact(
targets,
tests_run: testIds,
experiment,
planned_test_count: plannedTestCount,
},
run_summary: runSummary,
per_grader_summary: perEvaluatorSummary,
notes,
};
}

/**
* Write a stub `benchmark.json` at the start of a run, before any tests
* have executed. Carries `planned_test_count` so an interrupted run can
* still be detected as resumable even when every recorded row has
* `execution_status: ok`.
*
 * The end-of-run write (writeArtifactsFromResults / aggregateRunDir)
 * overwrites this file with the full summary but preserves
 * `planned_test_count` by reading it back via readPlannedTestCount.
*/
export async function writeInitialBenchmarkArtifact(
runDir: string,
options: {
evalFile: string;
plannedTestCount: number;
experiment?: string;
},
): Promise<void> {
await mkdir(runDir, { recursive: true });
const stub = buildBenchmarkArtifact(
[],
options.evalFile,
options.experiment,
options.plannedTestCount,
);
const benchmarkPath = path.join(runDir, 'benchmark.json');
await writeFile(benchmarkPath, `${JSON.stringify(stub, null, 2)}\n`, 'utf8');
}

export function buildAggregateGradingArtifact(
results: readonly EvaluationResult[],
): AggregateGradingArtifact {
@@ -826,7 +886,7 @@ export async function writePerTestArtifacts(
export async function writeArtifactsFromResults(
results: readonly EvaluationResult[],
outputDir: string,
options?: { evalFile?: string; experiment?: string },
options?: { evalFile?: string; experiment?: string; plannedTestCount?: number },
): Promise<{
testArtifactDir: string;
timingPath: string;
@@ -877,8 +937,16 @@
const timing = buildTimingArtifact(results);
await writeFile(timingPath, `${JSON.stringify(timing, null, 2)}\n`, 'utf8');

// Write benchmark
const benchmark = buildBenchmarkArtifact(results, options?.evalFile, options?.experiment);
// Write benchmark — preserve `planned_test_count` from the run-start stub
// (or from the original run when this is a resume) unless an explicit
// value was passed by the caller.
const plannedTestCount = options?.plannedTestCount ?? (await readPlannedTestCount(benchmarkPath));
const benchmark = buildBenchmarkArtifact(
results,
options?.evalFile,
options?.experiment,
plannedTestCount,
);
await writeFile(benchmarkPath, `${JSON.stringify(benchmark, null, 2)}\n`, 'utf8');

await writeJsonlFile(indexPath, indexRecords);
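To make the stub concrete: a hedged sketch of what `writeInitialBenchmarkArtifact` leaves on disk before any tests run. Values are illustrative; the exact shape is whatever `buildBenchmarkArtifact` produces for an empty results array.

// Illustrative benchmark.json metadata right after run start:
const stubMetadata = {
  eval_file: 'evals/smoke.yaml', // illustrative path
  targets: [],                   // no results yet
  tests_run: [],                 // no results yet
  planned_test_count: 12,        // totalEvalCount at dispatch time
};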
16 changes: 16 additions & 0 deletions apps/cli/src/commands/eval/run-eval.ts
@@ -38,6 +38,7 @@ import {
deduplicateByTestIdTarget,
parseJsonlResults,
writeArtifactsFromResults,
writeInitialBenchmarkArtifact,
} from './artifact-writer.js';
import { writeBenchmarkJson } from './benchmark-writer.js';
import { loadEnvFromHierarchy } from './env.js';
@@ -1447,6 +1448,21 @@ export async function runEvalCommand(
);
}

// Write a stub benchmark.json before dispatching tests, carrying the planned
// execution count so an interrupted run can still surface as resumable in
// Studio (results.length < planned_test_count) even when every recorded row
// has execution_status: ok. The end-of-run write preserves this value via
// readPlannedTestCount inside aggregateRunDir / writeArtifactsFromResults.
// Skip on resume — we want to preserve the *original* planned count.
if (!isResumeAppend && usesDefaultArtifactWorkspace && totalEvalCount > 0) {
const evalFile = activeTestFiles.length === 1 ? activeTestFiles[0] : '';
await writeInitialBenchmarkArtifact(runDir, {
evalFile,
plannedTestCount: totalEvalCount,
experiment: normalizeExperimentName(options.experiment),
});
}

// Eval files run sequentially; within each file, --workers N test cases run in parallel.
// This matches industry practice (promptfoo, deepeval, OpenAI Evals) and avoids cross-file
// workspace races without any grouping complexity.
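A hedged sketch of how `totalEvalCount` is assumed to be accumulated before this point — the real computation lives earlier in run-eval.ts and may differ; `PlannedEvalFile` and the sample data are illustrative.

// Hypothetical: planned count = sum over eval files of tests × targets.
interface PlannedEvalFile {
  tests: readonly string[];
  targets: readonly string[];
}

const evalFiles: PlannedEvalFile[] = [
  { tests: ['t1', 't2', 't3'], targets: ['claude', 'codex'] }, // 6 planned
  { tests: ['t4'], targets: ['claude'] },                      // 1 planned
];

const totalEvalCount = evalFiles.reduce(
  (sum, file) => sum + file.tests.length * file.targets.length,
  0,
); // 7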
47 changes: 47 additions & 0 deletions apps/cli/src/commands/results/eval-runner.ts
@@ -412,6 +412,35 @@ export function registerEvalRoutes(
}
});

// ── Stop a running eval ────────────────────────────────────────────────
// POST (not DELETE) because Stop is part of the stop → resume → complete
// workflow, not a destructive cancel. The run remains resumable from the
// partial index.jsonl on disk. Idempotent: hitting /stop on a terminal
// run returns 200 with `stopped: false, reason: 'already_terminal'`
// rather than 4xx, so clients can fire-and-forget.
//
// SIGTERM the spawned CLI; the existing child.on('close') flips status
// to 'finished'/'failed'. The CLI's own signal handler walks its tracked
// grandchildren (claude/codex/pi/copilot subprocesses) and kills them
// before exiting.
app.post('/api/eval/run/:id/stop', (c) => {
if (readOnly) {
return c.json({ error: 'Studio is running in read-only mode' }, 403);
}
const id = c.req.param('id');
const run = activeRuns.get(id ?? '');
if (!run) return c.json({ error: 'Run not found' }, 404);
if (run.status === 'finished' || run.status === 'failed' || !run.process) {
return c.json({ stopped: false, reason: 'already_terminal', status: run.status });
}
try {
run.process.kill('SIGTERM');
} catch (err) {
return c.json({ error: (err as Error).message }, 500);
}
return c.json({ stopped: true, status: run.status });
});

// ── Run status ─────────────────────────────────────────────────────────
app.get('/api/eval/status/:id', (c) => {
const id = c.req.param('id');
@@ -576,6 +605,24 @@ export function registerEvalRoutes(
}
});

app.post('/api/benchmarks/:benchmarkId/eval/run/:id/stop', (c) => {
if (readOnly) {
return c.json({ error: 'Studio is running in read-only mode' }, 403);
}
const id = c.req.param('id');
const run = activeRuns.get(id ?? '');
if (!run) return c.json({ error: 'Run not found' }, 404);
if (run.status === 'finished' || run.status === 'failed' || !run.process) {
return c.json({ stopped: false, reason: 'already_terminal', status: run.status });
}
try {
run.process.kill('SIGTERM');
} catch (err) {
return c.json({ error: (err as Error).message }, 500);
}
return c.json({ stopped: true, status: run.status });
});

app.get('/api/benchmarks/:benchmarkId/eval/status/:id', (c) => {
const id = c.req.param('id');
const run = activeRuns.get(id ?? '');
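A usage sketch of the idempotency contract from a client's perspective — the route and response fields come from this diff; the base URL and `runId` are illustrative.

// Fire-and-forget stop; safe to call even if the run already finished.
const runId = 'abc123'; // illustrative
const res = await fetch(`http://localhost:3000/api/eval/run/${runId}/stop`, {
  method: 'POST',
});
if (res.ok) {
  const body = (await res.json()) as { stopped: boolean; reason?: string };
  // stopped === false with reason 'already_terminal' is the no-op case;
  // stopped === true means SIGTERM was delivered and status will flip
  // via the existing child.on('close') handler.
}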
12 changes: 8 additions & 4 deletions apps/cli/src/commands/results/serve.ts
@@ -347,8 +347,8 @@ async function handleRunDetail(c: C, { searchDir }: DataContext) {
function deriveResumeMeta(
cwd: string,
manifestPath: string,
): { run_dir?: string; suite_filter?: string } {
const out: { run_dir?: string; suite_filter?: string } = {};
): { run_dir?: string; suite_filter?: string; planned_test_count?: number } {
const out: { run_dir?: string; suite_filter?: string; planned_test_count?: number } = {};
const runDir = path.dirname(manifestPath);
const relative = path.relative(cwd, runDir);
// path.relative returns '..'-prefixed paths when runDir is outside cwd; keep
@@ -359,15 +359,19 @@
const benchmarkPath = path.join(runDir, 'benchmark.json');
if (existsSync(benchmarkPath)) {
const parsed = JSON.parse(readFileSync(benchmarkPath, 'utf8')) as {
metadata?: { eval_file?: string };
metadata?: { eval_file?: string; planned_test_count?: number };
};
const evalFile = parsed.metadata?.eval_file;
if (typeof evalFile === 'string' && evalFile.trim()) {
out.suite_filter = evalFile.trim();
}
const planned = parsed.metadata?.planned_test_count;
if (typeof planned === 'number' && Number.isFinite(planned) && planned > 0) {
out.planned_test_count = planned;
}
}
} catch {
// benchmark.json missing / unreadable / malformed — leave suite_filter unset.
// benchmark.json missing / unreadable / malformed — leave fields unset.
}
return out;
}
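An illustrative input/output pair for `deriveResumeMeta` with the new field (paths and counts are made up):

// Given .agentv/runs/2024-06-01/benchmark.json containing:
//   { "metadata": { "eval_file": "evals/smoke.yaml", "planned_test_count": 12 } }
// deriveResumeMeta(cwd, manifestPath) would yield:
const resumeMeta = {
  run_dir: '.agentv/runs/2024-06-01', // relative to cwd
  suite_filter: 'evals/smoke.yaml',
  planned_test_count: 12,
};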
46 changes: 46 additions & 0 deletions apps/cli/test/commands/results/serve.test.ts
@@ -1003,6 +1003,52 @@ describe('serve app', () => {
});
});

// ── POST /api/eval/run/:id/stop — interrupt a running eval ─────────────
//
// Stop is part of the stop → resume workflow, not a destructive cancel —
// POST (not DELETE) and idempotent on already-terminal runs. These tests
// validate routing/auth shape (404 unknown id, 403 read-only). The happy
// path SIGTERM behavior is covered by manual UAT because it requires a
// live subprocess that is reliably mid-run; unit tests that race a launch
// against a stop are flaky.

describe('POST /api/eval/run/:id/stop (stop API)', () => {
function makeAppForStop(opts?: { readOnly?: boolean }) {
return createApp([], tempDir, undefined, undefined, {
studioDir,
readOnly: opts?.readOnly === true,
});
}

it('returns 404 for an unknown run id', async () => {
const app = makeAppForStop();
const res = await app.request('/api/eval/run/no-such-id/stop', { method: 'POST' });
expect(res.status).toBe(404);
});

it('returns 403 in read-only mode', async () => {
const app = makeAppForStop({ readOnly: true });
const res = await app.request('/api/eval/run/anything/stop', { method: 'POST' });
expect(res.status).toBe(403);
});

it('returns 404 for benchmark-scoped stop with unknown run id', async () => {
const app = makeAppForStop();
const res = await app.request('/api/benchmarks/some-id/eval/run/no-such-id/stop', {
method: 'POST',
});
expect(res.status).toBe(404);
});

it('returns 403 in read-only mode for benchmark-scoped stop', async () => {
const app = makeAppForStop({ readOnly: true });
const res = await app.request('/api/benchmarks/some-id/eval/run/anything/stop', {
method: 'POST',
});
expect(res.status).toBe(403);
});
});

// ── POST /api/eval/preview — argument shaping for resume flags ─────────
//
// /api/eval/preview is a lightweight endpoint that returns the CLI
4 changes: 3 additions & 1 deletion apps/studio/src/components/ResumeRunActions.tsx
@@ -35,6 +35,7 @@ export interface ResumeRunActionsProps {
target?: string;
benchmarkId?: string;
isReadOnly: boolean;
plannedTestCount?: number;
}

export function ResumeRunActions({
@@ -44,12 +45,13 @@ export function ResumeRunActions({
target,
benchmarkId,
isReadOnly,
plannedTestCount,
}: ResumeRunActionsProps) {
const navigate = useNavigate();
const [busy, setBusy] = useState<ResumeMode | null>(null);
const [error, setError] = useState<string | null>(null);

if (!shouldShowResumeActions(results, isReadOnly)) return null;
if (!shouldShowResumeActions(results, isReadOnly, plannedTestCount)) return null;

// Both actions need the run dir + the original eval file. Without those
// we can't target the existing run workspace, so we render the buttons
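`shouldShowResumeActions` itself is outside this hunk; a minimal sketch of what the third argument plausibly adds, assuming the pre-existing check keyed off non-`ok` rows (the real helper may differ):

// Hypothetical sketch — the actual helper lives outside this diff.
interface ResultRow {
  execution_status?: string;
}

function shouldShowResumeActions(
  results: readonly ResultRow[],
  isReadOnly: boolean,
  plannedTestCount?: number,
): boolean {
  if (isReadOnly) return false;
  const hasNonOkRows = results.some(
    (r) => r.execution_status !== undefined && r.execution_status !== 'ok',
  );
  // New: a short-count run is resumable even when every recorded row is ok.
  const isShortCount =
    plannedTestCount !== undefined && results.length < plannedTestCount;
  return hasNonOkRows || isShortCount;
}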