26 changes: 26 additions & 0 deletions apps/cli/src/cli.ts
@@ -1,6 +1,32 @@
#!/usr/bin/env node
import { killAllTrackedChildren } from '@agentv/core';

import { runCli } from './index.js';

// Forward SIGINT/SIGTERM to spawned provider subprocesses before exiting.
// Without this, Studio's `child.kill('SIGTERM')` against the CLI orphans
// any in-flight `claude`/`codex`/`pi`/`copilot` subprocess. The partial
// `index.jsonl` is already row-by-row durable, so finished tests survive.
//
// First signal: kill children, exit with the conventional 128+signal code.
// Second signal within the same process: hard-exit immediately so a hung
// child cannot leave the user stuck waiting.
let interrupted = false;
function installShutdown(signal: NodeJS.Signals, exitCode: number) {
process.on(signal, () => {
if (interrupted) {
process.exit(1);
}
interrupted = true;
killAllTrackedChildren('SIGTERM');
// Defer exit briefly (50 ms) so the SIGTERM sent to children has a chance
// to dispatch before the event loop tears down.
setTimeout(() => process.exit(exitCode), 50);
});
}
installShutdown('SIGINT', 130);
installShutdown('SIGTERM', 143);

runCli()
.then(() => {
process.exit(0);
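For context, a minimal sketch of the child tracking that `killAllTrackedChildren` is assumed to implement in `@agentv/core` — a module-level registry that provider spawners register into. Only `killAllTrackedChildren` appears in this diff; `trackChild` and the Set-based registry are assumptions.

import type { ChildProcess } from 'node:child_process';

// Hypothetical registry — the real @agentv/core internals may differ.
const tracked = new Set<ChildProcess>();

export function trackChild(child: ChildProcess): void {
  tracked.add(child);
  child.once('close', () => tracked.delete(child));
}

export function killAllTrackedChildren(signal: NodeJS.Signals): void {
  for (const child of tracked) {
    try {
      child.kill(signal);
    } catch {
      // The child may have exited between tracking and this kill.
    }
  }
}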
78 changes: 73 additions & 5 deletions apps/cli/src/commands/eval/artifact-writer.ts
@@ -34,7 +34,7 @@ export function deduplicateByTestIdTarget(

export async function aggregateRunDir(
runDir: string,
options?: { evalFile?: string; experiment?: string },
options?: { evalFile?: string; experiment?: string; plannedTestCount?: number },
): Promise<{ benchmarkPath: string; timingPath: string; testCount: number; targetCount: number }> {
const indexPath = path.join(runDir, RESULT_INDEX_FILENAME);
const content = await readFile(indexPath, 'utf8');
@@ -45,14 +45,36 @@
const timingPath = path.join(runDir, 'timing.json');
await writeFile(timingPath, `${JSON.stringify(timing, null, 2)}\n`, 'utf8');

const benchmark = buildBenchmarkArtifact(results, options?.evalFile, options?.experiment);
// Preserve `planned_test_count` from any pre-existing benchmark.json (e.g.
// the stub written at run start, or from the original run when this is a
// resume) unless an explicit value was passed.
const plannedTestCount =
options?.plannedTestCount ?? (await readPlannedTestCount(path.join(runDir, 'benchmark.json')));

const benchmark = buildBenchmarkArtifact(
results,
options?.evalFile,
options?.experiment,
plannedTestCount,
);
const benchmarkPath = path.join(runDir, 'benchmark.json');
await writeFile(benchmarkPath, `${JSON.stringify(benchmark, null, 2)}\n`, 'utf8');

const targetSet = new Set(results.map((r) => r.target ?? 'unknown'));
return { benchmarkPath, timingPath, testCount: results.length, targetCount: targetSet.size };
}

async function readPlannedTestCount(benchmarkPath: string): Promise<number | undefined> {
try {
const raw = await readFile(benchmarkPath, 'utf8');
const parsed = JSON.parse(raw) as { metadata?: { planned_test_count?: number } };
const value = parsed.metadata?.planned_test_count;
return typeof value === 'number' && Number.isFinite(value) ? value : undefined;
} catch {
return undefined;
}
}

// ---------------------------------------------------------------------------
// Artifact interfaces (snake_case to match skill-creator conventions)
// ---------------------------------------------------------------------------
@@ -110,6 +132,13 @@ export interface BenchmarkArtifact {
readonly targets: readonly string[];
readonly tests_run: readonly string[];
readonly experiment?: string;
/**
* Total number of test cases the run was planned to execute (across all
* targets and eval files). Written at run start so an interrupted run
* can be diagnosed as resumable when `tests_run.length < planned_test_count`,
* even if every recorded row has `execution_status: ok`.
*/
readonly planned_test_count?: number;
};
readonly run_summary: Record<
string,
@@ -364,6 +393,7 @@ export function buildBenchmarkArtifact(
results: readonly EvaluationResult[],
evalFile = '',
experiment?: string,
plannedTestCount?: number,
): BenchmarkArtifact {
const targetSet = new Set<string>();
const testIdSet = new Set<string>();
@@ -457,13 +487,43 @@ export function buildBenchmarkArtifact(
targets,
tests_run: testIds,
experiment,
planned_test_count: plannedTestCount,
},
run_summary: runSummary,
per_grader_summary: perEvaluatorSummary,
notes,
};
}

/**
* Write a stub `benchmark.json` at the start of a run, before any tests
* have executed. Carries `planned_test_count` so an interrupted run can
* still be detected as resumable even when every recorded row has
* `execution_status: ok`.
*
 * The end-of-run write (writeArtifactsFromResults / aggregateRunDir)
 * overwrites this file with the full summary but preserves
 * `planned_test_count` by reading it back via readPlannedTestCount.
*/
export async function writeInitialBenchmarkArtifact(
runDir: string,
options: {
evalFile: string;
plannedTestCount: number;
experiment?: string;
},
): Promise<void> {
await mkdir(runDir, { recursive: true });
const stub = buildBenchmarkArtifact(
[],
options.evalFile,
options.experiment,
options.plannedTestCount,
);
const benchmarkPath = path.join(runDir, 'benchmark.json');
await writeFile(benchmarkPath, `${JSON.stringify(stub, null, 2)}\n`, 'utf8');
}

export function buildAggregateGradingArtifact(
results: readonly EvaluationResult[],
): AggregateGradingArtifact {
@@ -826,7 +886,7 @@ export async function writePerTestArtifacts(
export async function writeArtifactsFromResults(
results: readonly EvaluationResult[],
outputDir: string,
options?: { evalFile?: string; experiment?: string },
options?: { evalFile?: string; experiment?: string; plannedTestCount?: number },
): Promise<{
testArtifactDir: string;
timingPath: string;
@@ -877,8 +937,16 @@
const timing = buildTimingArtifact(results);
await writeFile(timingPath, `${JSON.stringify(timing, null, 2)}\n`, 'utf8');

// Write benchmark
const benchmark = buildBenchmarkArtifact(results, options?.evalFile, options?.experiment);
// Write benchmark — preserve `planned_test_count` from the run-start stub
// (or from the original run when this is a resume) unless an explicit
// value was passed by the caller.
const plannedTestCount = options?.plannedTestCount ?? (await readPlannedTestCount(benchmarkPath));
const benchmark = buildBenchmarkArtifact(
results,
options?.evalFile,
options?.experiment,
plannedTestCount,
);
await writeFile(benchmarkPath, `${JSON.stringify(benchmark, null, 2)}\n`, 'utf8');

await writeJsonlFile(indexPath, indexRecords);
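To make the stub concrete: a hedged sketch of what `writeInitialBenchmarkArtifact` leaves on disk before any tests run. Values are illustrative; the exact shape is whatever `buildBenchmarkArtifact` produces for an empty results array.

// Illustrative benchmark.json metadata right after run start:
const stubMetadata = {
  eval_file: 'evals/smoke.yaml', // illustrative path
  targets: [],                   // no results yet
  tests_run: [],                 // no results yet
  planned_test_count: 12,        // totalEvalCount at dispatch time
};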
16 changes: 16 additions & 0 deletions apps/cli/src/commands/eval/run-eval.ts
@@ -38,6 +38,7 @@ import {
deduplicateByTestIdTarget,
parseJsonlResults,
writeArtifactsFromResults,
writeInitialBenchmarkArtifact,
} from './artifact-writer.js';
import { writeBenchmarkJson } from './benchmark-writer.js';
import { loadEnvFromHierarchy } from './env.js';
@@ -1447,6 +1448,21 @@ export async function runEvalCommand(
);
}

// Write a stub benchmark.json before dispatching tests, carrying the planned
// execution count so an interrupted run can still surface as resumable in
// Studio (results.length < planned_test_count) even when every recorded row
// has execution_status: ok. The end-of-run write preserves this value via
// readPlannedTestCount inside aggregateRunDir / writeArtifactsFromResults.
// Skip on resume — we want to preserve the *original* planned count.
if (!isResumeAppend && usesDefaultArtifactWorkspace && totalEvalCount > 0) {
const evalFile = activeTestFiles.length === 1 ? activeTestFiles[0] : '';
await writeInitialBenchmarkArtifact(runDir, {
evalFile,
plannedTestCount: totalEvalCount,
experiment: normalizeExperimentName(options.experiment),
});
}

// Eval files run sequentially; within each file, --workers N test cases run in parallel.
// This matches industry practice (promptfoo, deepeval, OpenAI Evals) and avoids cross-file
// workspace races without any grouping complexity.
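A hedged sketch of how `totalEvalCount` is assumed to be accumulated before this point — the real computation lives earlier in run-eval.ts and may differ; `PlannedEvalFile` and the sample data are illustrative.

// Hypothetical: planned count = sum over eval files of tests × targets.
interface PlannedEvalFile {
  tests: readonly string[];
  targets: readonly string[];
}

const evalFiles: PlannedEvalFile[] = [
  { tests: ['t1', 't2', 't3'], targets: ['claude', 'codex'] }, // 6 planned
  { tests: ['t4'], targets: ['claude'] },                      // 1 planned
];

const totalEvalCount = evalFiles.reduce(
  (sum, file) => sum + file.tests.length * file.targets.length,
  0,
); // 7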
47 changes: 47 additions & 0 deletions apps/cli/src/commands/results/eval-runner.ts
@@ -412,6 +412,35 @@ export function registerEvalRoutes(
}
});

// ── Stop a running eval ────────────────────────────────────────────────
// POST (not DELETE) because Stop is part of the stop → resume → complete
// workflow, not a destructive cancel. The run remains resumable from the
// partial index.jsonl on disk. Idempotent: hitting /stop on a terminal
// run returns 200 with `stopped: false, reason: 'already_terminal'`
// rather than 4xx, so clients can fire-and-forget.
//
// SIGTERM the spawned CLI; the existing child.on('close') flips status
// to 'finished'/'failed'. The CLI's own signal handler walks its tracked
// grandchildren (claude/codex/pi/copilot subprocesses) and kills them
// before exiting.
app.post('/api/eval/run/:id/stop', (c) => {
if (readOnly) {
return c.json({ error: 'Studio is running in read-only mode' }, 403);
}
const id = c.req.param('id');
const run = activeRuns.get(id ?? '');
if (!run) return c.json({ error: 'Run not found' }, 404);
if (run.status === 'finished' || run.status === 'failed' || !run.process) {
return c.json({ stopped: false, reason: 'already_terminal', status: run.status });
}
try {
run.process.kill('SIGTERM');
} catch (err) {
return c.json({ error: (err as Error).message }, 500);
}
return c.json({ stopped: true, status: run.status });
});

// ── Run status ─────────────────────────────────────────────────────────
app.get('/api/eval/status/:id', (c) => {
const id = c.req.param('id');
@@ -576,6 +605,24 @@ export function registerEvalRoutes(
}
});

app.post('/api/benchmarks/:benchmarkId/eval/run/:id/stop', (c) => {
if (readOnly) {
return c.json({ error: 'Studio is running in read-only mode' }, 403);
}
const id = c.req.param('id');
const run = activeRuns.get(id ?? '');
if (!run) return c.json({ error: 'Run not found' }, 404);
if (run.status === 'finished' || run.status === 'failed' || !run.process) {
return c.json({ stopped: false, reason: 'already_terminal', status: run.status });
}
try {
run.process.kill('SIGTERM');
} catch (err) {
return c.json({ error: (err as Error).message }, 500);
}
return c.json({ stopped: true, status: run.status });
});

app.get('/api/benchmarks/:benchmarkId/eval/status/:id', (c) => {
const id = c.req.param('id');
const run = activeRuns.get(id ?? '');
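A usage sketch of the idempotency contract from a client's perspective — the route and response fields come from this diff; the base URL and `runId` are illustrative.

// Fire-and-forget stop; safe to call even if the run already finished.
const runId = 'abc123'; // illustrative
const res = await fetch(`http://localhost:3000/api/eval/run/${runId}/stop`, {
  method: 'POST',
});
if (res.ok) {
  const body = (await res.json()) as { stopped: boolean; reason?: string };
  // stopped === false with reason 'already_terminal' is the no-op case;
  // stopped === true means SIGTERM was delivered and status will flip
  // via the existing child.on('close') handler.
}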
12 changes: 8 additions & 4 deletions apps/cli/src/commands/results/serve.ts
@@ -347,8 +347,8 @@ async function handleRunDetail(c: C, { searchDir }: DataContext) {
function deriveResumeMeta(
cwd: string,
manifestPath: string,
): { run_dir?: string; suite_filter?: string } {
const out: { run_dir?: string; suite_filter?: string } = {};
): { run_dir?: string; suite_filter?: string; planned_test_count?: number } {
const out: { run_dir?: string; suite_filter?: string; planned_test_count?: number } = {};
const runDir = path.dirname(manifestPath);
const relative = path.relative(cwd, runDir);
// path.relative returns '..'-prefixed paths when runDir is outside cwd; keep
@@ -359,15 +359,19 @@
const benchmarkPath = path.join(runDir, 'benchmark.json');
if (existsSync(benchmarkPath)) {
const parsed = JSON.parse(readFileSync(benchmarkPath, 'utf8')) as {
metadata?: { eval_file?: string };
metadata?: { eval_file?: string; planned_test_count?: number };
};
const evalFile = parsed.metadata?.eval_file;
if (typeof evalFile === 'string' && evalFile.trim()) {
out.suite_filter = evalFile.trim();
}
const planned = parsed.metadata?.planned_test_count;
if (typeof planned === 'number' && Number.isFinite(planned) && planned > 0) {
out.planned_test_count = planned;
}
}
} catch {
// benchmark.json missing / unreadable / malformed — leave suite_filter unset.
// benchmark.json missing / unreadable / malformed — leave fields unset.
}
return out;
}
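An illustrative input/output pair for `deriveResumeMeta` with the new field (paths and counts are made up):

// Given .agentv/runs/2024-06-01/benchmark.json containing:
//   { "metadata": { "eval_file": "evals/smoke.yaml", "planned_test_count": 12 } }
// deriveResumeMeta(cwd, manifestPath) would yield:
const resumeMeta = {
  run_dir: '.agentv/runs/2024-06-01', // relative to cwd
  suite_filter: 'evals/smoke.yaml',
  planned_test_count: 12,
};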
46 changes: 46 additions & 0 deletions apps/cli/test/commands/results/serve.test.ts
@@ -1003,6 +1003,52 @@ describe('serve app', () => {
});
});

// ── POST /api/eval/run/:id/stop — interrupt a running eval ─────────────
//
// Stop is part of the stop → resume workflow, not a destructive cancel —
// POST (not DELETE) and idempotent on already-terminal runs. These tests
// validate routing/auth shape (404 unknown id, 403 read-only). The happy
// path SIGTERM behavior is covered by manual UAT because it requires a
// live subprocess that is reliably mid-run; unit tests that race a launch
// against a stop are flaky.

describe('POST /api/eval/run/:id/stop (stop API)', () => {
function makeAppForStop(opts?: { readOnly?: boolean }) {
return createApp([], tempDir, undefined, undefined, {
studioDir,
readOnly: opts?.readOnly === true,
});
}

it('returns 404 for an unknown run id', async () => {
const app = makeAppForStop();
const res = await app.request('/api/eval/run/no-such-id/stop', { method: 'POST' });
expect(res.status).toBe(404);
});

it('returns 403 in read-only mode', async () => {
const app = makeAppForStop({ readOnly: true });
const res = await app.request('/api/eval/run/anything/stop', { method: 'POST' });
expect(res.status).toBe(403);
});

it('returns 404 for benchmark-scoped stop with unknown run id', async () => {
const app = makeAppForStop();
const res = await app.request('/api/benchmarks/some-id/eval/run/no-such-id/stop', {
method: 'POST',
});
expect(res.status).toBe(404);
});

it('returns 403 in read-only mode for benchmark-scoped stop', async () => {
const app = makeAppForStop({ readOnly: true });
const res = await app.request('/api/benchmarks/some-id/eval/run/anything/stop', {
method: 'POST',
});
expect(res.status).toBe(403);
});
});

// ── POST /api/eval/preview — argument shaping for resume flags ─────────
//
// /api/eval/preview is a lightweight endpoint that returns the CLI
4 changes: 3 additions & 1 deletion apps/studio/src/components/ResumeRunActions.tsx
@@ -35,6 +35,7 @@ export interface ResumeRunActionsProps {
target?: string;
benchmarkId?: string;
isReadOnly: boolean;
plannedTestCount?: number;
}

export function ResumeRunActions({
@@ -44,12 +45,13 @@ export function ResumeRunActions({
target,
benchmarkId,
isReadOnly,
plannedTestCount,
}: ResumeRunActionsProps) {
const navigate = useNavigate();
const [busy, setBusy] = useState<ResumeMode | null>(null);
const [error, setError] = useState<string | null>(null);

if (!shouldShowResumeActions(results, isReadOnly)) return null;
if (!shouldShowResumeActions(results, isReadOnly, plannedTestCount)) return null;

// Both actions need the run dir + the original eval file. Without those
// we can't target the existing run workspace, so we render the buttons
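`shouldShowResumeActions` itself is outside this hunk; a minimal sketch of what the third argument plausibly adds, assuming the pre-existing check keyed off non-`ok` rows (the real helper may differ):

// Hypothetical sketch — the actual helper lives outside this diff.
interface ResultRow {
  execution_status?: string;
}

function shouldShowResumeActions(
  results: readonly ResultRow[],
  isReadOnly: boolean,
  plannedTestCount?: number,
): boolean {
  if (isReadOnly) return false;
  const hasNonOkRows = results.some(
    (r) => r.execution_status !== undefined && r.execution_status !== 'ok',
  );
  // New: a short-count run is resumable even when every recorded row is ok.
  const isShortCount =
    plannedTestCount !== undefined && results.length < plannedTestCount;
  return hasNonOkRows || isShortCount;
}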