From f63cf2e93ce116fd2f647eaa1fccf553471374b5 Mon Sep 17 00:00:00 2001
From: Chris Huber <chubes@extrachill.com>
Date: Thu, 4 Jun 2026 10:09:22 -0400
Subject: [PATCH] add generic benchmark summary helpers

---
 README.md                              |  18 ++
 docs/benchmark-contract.md             | 152 ++++++++++++++
 package.json                           |   1 +
 packages/cli/src/cli-entry.ts          |   3 +
 packages/cli/src/command-router.ts     |  14 ++
 packages/cli/src/commands/benchmark.ts | 273 +++++++++++++++++++++++++
 packages/cli/src/output.ts             |   3 +
 scripts/benchmark-summary-smoke.ts     |  75 +++++++
 8 files changed, 539 insertions(+)
 create mode 100644 docs/benchmark-contract.md
 create mode 100644 packages/cli/src/commands/benchmark.ts
 create mode 100644 scripts/benchmark-summary-smoke.ts

diff --git a/README.md b/README.md
index 514fd65b..cf258503 100644
--- a/README.md
+++ b/README.md
@@ -939,6 +939,24 @@ npm run wp-codebox -- recipe-run \
 
 Each workload file returns a callable. The callable may return numeric metrics directly or a payload with `metrics` and `metadata` keys. The recipe output reports duration percentiles, custom metric aggregates, peak memory, runtime artifacts, and the parsed `benchResults` object in JSON output when a single `wordpress.bench` step runs. If earlier `wordpress.browser-probe` steps in the same recipe captured generic `performance` or `memory` artifacts, `wordpress.bench` promotes selected numeric browser values into each scenario's metrics using `browser_*` names, while the raw browser artifacts remain available under `files/browser/`.
 
+Use `bench summarize` to extract a stable automation envelope from saved `recipe-run --json` output:
+
+```bash
+npm run wp-codebox -- bench summarize \
+  --input ./artifacts/bench-plugin/recipe-run.json \
+  --json
+```
+
+Use `artifacts bench-results` to extract benchmark results from an artifact bundle's command log (`logs/commands.log`):
+
+```bash
+npm run wp-codebox -- artifacts bench-results \
+  --bundle ./artifacts/bench-plugin \
+  --json
+```
+
+See [`docs/benchmark-contract.md`](docs/benchmark-contract.md) for the generic benchmark contract, result shape, artifact/provenance expectations, and the boundary between WP Codebox responsibilities and caller-owned scoring or product semantics.
+
 ### `agent-runtime-probe`
 
 Boot a sandbox with Agents API, Data Machine, and Data Machine Code mounted, then verify the stack loads.
diff --git a/docs/benchmark-contract.md b/docs/benchmark-contract.md
new file mode 100644
index 00000000..77552d02
--- /dev/null
+++ b/docs/benchmark-contract.md
@@ -0,0 +1,152 @@
+# Benchmark Contract
+
+WP Codebox provides a generic benchmark substrate for disposable WordPress
+runtimes. It owns workload execution, normalized metric envelopes, runtime
+evidence, and artifact extraction helpers. Callers own product semantics such as
+scenario catalogs, scoring, grading, model comparison, reward policy, retry
+policy, and reports.
+
+```text
+caller benchmark suite
+  -> writes a WP Codebox recipe
+  -> runs recipe-run in an isolated WordPress runtime
+  -> receives recipe output plus artifact bundle
+  -> extracts generic benchmark results
+  -> applies caller-owned scoring/reporting outside WP Codebox
+```
+
+## WP Codebox Responsibilities
+
+- Execute declared recipe steps in a disposable WordPress runtime.
+- Register generic benchmark commands such as `wordpress.bench`.
+- Capture runtime artifacts, command logs, browser evidence, and provenance.
+- Emit `benchResults` and `benchResultsList` in `wp-codebox/recipe-run/v1` JSON output when `wordpress.bench` steps succeed.
+- Provide CLI helpers that extract benchmark envelopes from saved `recipe-run` output or artifact bundles.
+- Keep helper output stable, JSON-friendly, and free of product-specific scoring fields.
+
+## Caller Responsibilities
+
+- Define the suite, scenario ids, task taxonomy, expected behavior, and run matrix.
+- Decide which metrics matter and how to compare them.
+- Score, grade, rank, retry, regress, or publish benchmark reports.
+- Store durable benchmark history and model/product metadata.
+- Interpret browser metrics or runtime artifacts in a product-specific context.
+
+## Workloads
+
+`wordpress.bench` currently supports plugin workloads discovered from
+`tests/bench/*.php` plus explicit `workloads-json` entries. Workloads can run PHP
+code and, through configured workload steps, WP-CLI commands. Each workload
+returns numeric metrics directly or an object with `metrics` and `metadata`.
+
+The command contract is intentionally broad enough for future workload types:
+
+- **PHP:** direct workload callables and inline configured workload steps.
+- **WP-CLI:** configured workload steps that execute in the same sandbox.
+- **Ability:** future ability-backed workload steps should still return generic numeric metrics and metadata.
+- **Browser:** `wordpress.browser-probe` captures generic browser performance and memory artifacts. When a recipe runs browser probes before `wordpress.bench`, selected numeric `browser_*` metrics are promoted into each benchmark scenario while raw browser artifacts remain in the bundle.
+
+## Result Shape
+
+The benchmark envelope is a JSON object with generic fields:
+
+```json
+{
+  "component_id": "bench-plugin",
+  "iterations": 3,
+  "warmup_iterations": 1,
+  "scenarios": [
+    {
+      "id": "noop",
+      "source": "file",
+      "iterations": 3,
+      "metrics": {
+        "duration_ms_mean": 1.23,
+        "peak_memory_bytes_mean": 123456
+      },
+      "metadata": {},
+      "artifacts": {}
+    }
+  ]
+}
+```
+
+Metrics are numeric and named by the workload/runtime surface. WP Codebox records
+them; it does not decide whether a value is good, bad, passing, failing, or
+regressed.
+
+## Running Benchmarks
+
+Use a recipe workflow step with `wordpress.bench`:
+
+```bash
+npm run wp-codebox -- recipe-run \
+  --recipe ./examples/recipes/bench-plugin.json \
+  --artifacts ./artifacts/bench-plugin \
+  --json > ./artifacts/bench-plugin/recipe-run.json
+```
+
+The `recipe-run` JSON output includes `benchResults` when exactly one successful
+`wordpress.bench` step ran, and `benchResultsList` when one or more successful
+`wordpress.bench` steps ran.
+
+## Extracting Results
+
+Summarize saved `recipe-run` JSON:
+
+```bash
+npm run wp-codebox -- bench summarize \
+  --input ./artifacts/bench-plugin/recipe-run.json \
+  --json
+```
+
+Summarize an artifact bundle by reading its command log (`logs/commands.log`):
+
+```bash
+npm run wp-codebox -- artifacts bench-results \
+  --bundle ./artifacts/bench-plugin \
+  --json
+```
+
+Both commands emit `wp-codebox/benchmark-summary/v1` with the raw benchmark envelopes plus a
+flattened scenario summary for automation (`benchmarks` and `metrics` are elided in this example):
+
+```json
+{
+  "schema": "wp-codebox/benchmark-summary/v1",
+  "source": { "type": "recipe-run-output", "path": "/abs/recipe-run.json" },
+  "hasBenchResults": true,
+  "benchmarkCount": 1,
+  "scenarioCount": 1,
+  "benchmarks": [],
+  "scenarios": [
+    {
+      "componentId": "bench-plugin",
+      "id": "noop",
+      "source": "file",
+      "iterations": 3,
+      "metricCount": 2,
+      "metrics": {},
+      "artifacts": {}
+    }
+  ]
+}
+```
+
+Omit `--json` for a compact human-readable listing (source, counts, and one line per
+scenario). The human form is for quick inspection; automation should consume the JSON envelope.
+
+## Non-Responsibilities
+
+WP Codebox benchmark helpers do not define or store:
+
+- Product benchmark suites.
+- Rewards or graders.
+- Pass/fail scoring policies.
+- Model-eval metadata.
+- Competitor comparisons.
+- Historical regression decisions.
+- Publishing or PR/report workflows.
+
+Those belong to callers such as wp-gym, Studio Web, Homeboy rigs, or other eval
+harnesses that project WP Codebox evidence into their own product schemas.
diff --git a/package.json b/package.json
index b7993e70..be4406e2 100644
--- a/package.json
+++ b/package.json
@@ -61,6 +61,7 @@
     "phpunit-diagnostic-artifact-smoke": "tsx scripts/phpunit-diagnostic-artifact-smoke.ts",
     "plugin-check-normalization-smoke": "tsx scripts/plugin-check-normalization-smoke.ts",
     "bench-bootstrap-files-smoke": "tsx scripts/bench-bootstrap-files-smoke.ts",
+    "benchmark-summary-smoke": "tsx scripts/benchmark-summary-smoke.ts",
     "wordpress-recipe-builders-smoke": "tsx scripts/wordpress-recipe-builders-smoke.ts",
     "recipe-bench-smoke": "tsx scripts/recipe-bench-smoke.ts",
     "recipe-build-cli-smoke": "tsx scripts/recipe-build-cli-smoke.ts",
diff --git a/packages/cli/src/cli-entry.ts b/packages/cli/src/cli-entry.ts
index 4894c479..b800199f 100644
--- a/packages/cli/src/cli-entry.ts
+++ b/packages/cli/src/cli-entry.ts
@@ -1,5 +1,6 @@
 import { routeCliCommand } from "./command-router.js"
 import { runArtifactsBrowserMetricsCommand, runArtifactsVerifyCommand } from "./commands/artifacts.js"
+import { runArtifactsBenchResultsCommand, runBenchSummarizeCommand } from "./commands/benchmark.js"
 import { runCommandsCommand, runRecipeSchemaCommand } from "./commands/discovery.js"
 import { runCleanupCommand, runDoctorCommand } from "./commands/doctor.js"
 import { runRecipeBuildCommand } from "./commands/recipe-build.js"
@@ -20,6 +21,8 @@ export async function runCli(args: string[]): Promise<number> {
     workspacePolicyCheck: runWorkspacePolicyCheckCommand,
     artifactsVerify: runArtifactsVerifyCommand,
     artifactsBrowserMetrics: runArtifactsBrowserMetricsCommand,
+    artifactsBenchResults: runArtifactsBenchResultsCommand,
+    benchSummarize: runBenchSummarizeCommand,
     runsStatus: runRunsStatusCommand,
     runsArtifacts: runRunsArtifactsCommand,
     commands: runCommandsCommand,
diff --git a/packages/cli/src/command-router.ts b/packages/cli/src/command-router.ts
index e2c9eec7..fc0cab88 100644
--- a/packages/cli/src/command-router.ts
+++ b/packages/cli/src/command-router.ts
@@ -12,6 +12,8 @@ interface CliCommandRouter {
   workspacePolicyCheck: CliCommandHandler
   artifactsVerify: CliCommandHandler
   artifactsBrowserMetrics: CliCommandHandler
+  artifactsBenchResults: CliCommandHandler
+  benchSummarize: CliCommandHandler
   runsStatus: CliCommandHandler
   runsArtifacts: CliCommandHandler
   commands: CliCommandHandler
@@ -73,10 +75,22 @@ export async function routeCliCommand(argv: string[], router: CliCommandRouter):
       if (subcommand === "browser-metrics") {
         return router.artifactsBrowserMetrics(args)
       }
+      if (subcommand === "bench-results") {
+        return router.artifactsBenchResults(args)
+      }
       console.error(`Unknown artifacts command: ${subcommand ?? ""}`)
       router.printHelp()
       return 1
     }
+    case "bench": {
+      const subcommand = args.shift()
+      if (subcommand === "summarize") {
+        return router.benchSummarize(args)
+      }
+      console.error(`Unknown bench command: ${subcommand ?? ""}`)
+      router.printHelp()
+      return 1
+    }
     case "runs": {
       const subcommand = args.shift()
       if (subcommand === "status") {
diff --git a/packages/cli/src/commands/benchmark.ts b/packages/cli/src/commands/benchmark.ts
new file mode 100644
index 00000000..c180f3c7
--- /dev/null
+++ b/packages/cli/src/commands/benchmark.ts
@@ -0,0 +1,273 @@
+import { readFile } from "node:fs/promises"
+import { join, resolve } from "node:path"
+
+interface BenchmarkSummaryOptions { // parsed CLI options shared by `bench summarize` and `artifacts bench-results`
+  inputPath?: string // set by `--input`: path to saved recipe-run JSON output
+  bundleDirectory?: string // set by `--bundle` or `--artifacts`: artifact bundle directory
+  json: boolean // set by `--json`: print the JSON envelope instead of the human-readable report
+}
+
+interface BenchResults { // raw benchmark envelope; recognized when `component_id` is a string and `scenarios` is an array
+  component_id?: string
+  iterations?: number
+  warmup_iterations?: number
+  scenarios?: unknown[] // entries are checked one by one when they are flattened into summaries
+  [key: string]: unknown // any other envelope fields are carried along unchanged
+}
+
+interface BenchmarkScenarioSummary { // one flattened scenario row of the summary output
+  componentId: string // the envelope's `component_id`, or "unknown" when it is not a string
+  id: string // the scenario's `id`, or `scenario-<n>` when it is not a string
+  source?: string // present only when the scenario's `source` is a string
+  iterations?: number // present only when the scenario's `iterations` is a finite number
+  metricCount: number // number of keys in `metrics`
+  metrics: Record<string, number> // finite numeric metrics only, keys sorted
+  artifacts: Record<string, unknown> // the scenario's `artifacts` object, or `{}` when it is not an object
+}
+
+interface BenchmarkSummaryOutput { // the `wp-codebox/benchmark-summary/v1` envelope produced for both commands
+  schema: "wp-codebox/benchmark-summary/v1"
+  source: {
+    type: "recipe-run-output" | "artifact-bundle" // which kind of input was summarized
+    path: string // resolved path of the input file or bundle directory
+  }
+  hasBenchResults: boolean // true when at least one envelope was found
+  benchmarkCount: number // length of `benchmarks`
+  scenarioCount: number // length of `scenarios`
+  benchmarks: BenchResults[] // raw envelopes as found in the input
+  scenarios: BenchmarkScenarioSummary[] // scenarios flattened across all envelopes
+}
+
+export async function runBenchSummarizeCommand(args: string[]): Promise<number> {
+  // `bench summarize`: prints the JSON envelope with `--json`, else the human report; resolves to 0 (failures throw).
+  const parsedOptions = parseBenchmarkSummaryOptions(args)
+  const summary = await summarizeBenchmarks(parsedOptions)
+  if (parsedOptions.json) {
+    process.stdout.write(`${JSON.stringify(summary, null, 2)}\n`)
+  } else {
+    printBenchmarkSummaryHumanOutput(summary)
+  }
+  return 0
+}
+
+export async function runArtifactsBenchResultsCommand(args: string[]): Promise<number> {
+  // `artifacts bench-results`: same summary as `bench summarize`, but `--bundle` is mandatory and `--input` is rejected.
+  const parsedOptions = parseBenchmarkSummaryOptions(args, { requireBundle: true })
+  const summary = await summarizeBenchmarks(parsedOptions)
+  if (parsedOptions.json) {
+    process.stdout.write(`${JSON.stringify(summary, null, 2)}\n`)
+  } else {
+    printBenchmarkSummaryHumanOutput(summary)
+  }
+  return 0
+}
+
+async function summarizeBenchmarks(options: BenchmarkSummaryOptions): Promise<BenchmarkSummaryOutput> {
+  // `inputPath` wins when both sources are set; a bundle without logs/commands.log summarizes as empty.
+  if (options.inputPath) {
+    const recipeRunPath = resolve(options.inputPath)
+    const recipeRun: unknown = JSON.parse(await readFile(recipeRunPath, "utf8"))
+    return benchmarkSummaryOutput({ type: "recipe-run-output", path: recipeRunPath }, extractBenchResultsFromRecipeRun(recipeRun))
+  }
+  if (!options.bundleDirectory) {
+    throw new Error("Missing required option: --input or --bundle")
+  }
+
+  const bundlePath = resolve(options.bundleDirectory)
+  let logText = ""
+  try {
+    logText = await readFile(join(bundlePath, "logs", "commands.log"), "utf8")
+  } catch (error: unknown) {
+    if (!isRecord(error) || error.code !== "ENOENT") throw error
+  }
+  return benchmarkSummaryOutput({ type: "artifact-bundle", path: bundlePath }, extractBenchResultsFromText(logText))
+}
+
+function benchmarkSummaryOutput(source: BenchmarkSummaryOutput["source"], benchmarks: BenchResults[]): BenchmarkSummaryOutput {
+  // Wraps raw envelopes in the v1 summary schema and derives the flattened
+  // scenario list and the counts from them. Field order below is the serialized order.
+  let scenarios: BenchmarkScenarioSummary[] = []
+  for (const benchmark of benchmarks) {
+    scenarios = scenarios.concat(benchmarkScenarioSummaries(benchmark))
+  }
+  const benchmarkCount = benchmarks.length
+  const hasBenchResults = benchmarkCount > 0
+  const scenarioCount = scenarios.length
+  return { schema: "wp-codebox/benchmark-summary/v1", source, hasBenchResults, benchmarkCount, scenarioCount, benchmarks, scenarios }
+}
+
+function extractBenchResultsFromRecipeRun(value: unknown): BenchResults[] {
+  // Reads benchmark envelopes from parsed `recipe-run --json` output. Prefers the valid entries of
+  // `benchResultsList`; when that yields nothing (absent, empty, or all malformed) it falls back to
+  // the singular `benchResults` so a usable envelope is not masked by an unusable list.
+  if (!isRecord(value)) {
+    return []
+  }
+
+  const listed = Array.isArray(value.benchResultsList) ? value.benchResultsList.filter(isBenchResults) : []
+  if (listed.length > 0) {
+    return listed
+  }
+
+  return isBenchResults(value.benchResults) ? [value.benchResults] : []
+}
+
+function extractBenchResultsFromText(text: string): BenchResults[] {
+  // Scans free-form text for balanced `{...}` spans and keeps, in order of appearance,
+  // the ones that parse as JSON and pass the benchmark-envelope check.
+  const envelopes: BenchResults[] = []
+  for (const candidate of jsonObjectsInText(text)) {
+    const value = parseJsonObject(candidate)
+    if (isBenchResults(value)) envelopes.push(value)
+  }
+  return envelopes
+}
+
+function* jsonObjectsInText(text: string): Generator<string> {
+  // Yields each balanced top-level `{...}` span found in free-form text. Braces inside
+  // double-quoted strings are ignored; spans are candidates only and may not be valid JSON.
+  let start = -1
+  let depth = 0
+  let inString = false
+  let escaped = false
+
+  for (let index = 0; index < text.length; index++) {
+    const char = text[index]
+    if (inString) {
+      if (char === "\n") {
+        // JSON strings cannot contain a raw newline, so a quote still open at end of line is
+        // stray log text; reset instead of hiding the braces of every later object in the log.
+        inString = false
+        escaped = false
+      } else if (escaped) {
+        escaped = false
+      } else if (char === "\\") {
+        escaped = true
+      } else if (char === "\"") {
+        inString = false
+      }
+      continue
+    }
+
+    if (char === "\"") {
+      inString = true
+    } else if (char === "{") {
+      if (depth === 0) {
+        start = index
+      }
+      depth += 1
+    } else if (char === "}" && depth > 0) {
+      depth -= 1
+      if (depth === 0 && start >= 0) {
+        yield text.slice(start, index + 1)
+        start = -1
+      }
+    }
+  }
+}
+
+function parseJsonObject(value: string): unknown {
+  let parsed: unknown // stays `undefined` when the candidate is not valid JSON
+  try {
+    parsed = JSON.parse(value)
+  } catch { /* malformed candidate: best-effort, fall through */ }
+  return parsed
+}
+
+function isBenchResults(value: unknown): value is BenchResults { // envelope = record with string `component_id` and array `scenarios`
+  return isRecord(value) && typeof value.component_id === "string" && Array.isArray(value.scenarios)
+}
+
+function benchmarkScenarioSummaries(benchmark: BenchResults): BenchmarkScenarioSummary[] {
+  // Flattens one envelope into summary rows. Non-object scenarios are dropped first, so
+  // fallback ids (`scenario-<n>`) number the surviving scenarios from 1.
+  const componentId = typeof benchmark.component_id === "string" ? benchmark.component_id : "unknown"
+  const records = (benchmark.scenarios ?? []).filter(isRecord)
+  return records.map((record, position) => {
+    const id = typeof record.id === "string" ? record.id : `scenario-${position + 1}`
+    const source = typeof record.source === "string" ? { source: record.source } : {}
+    const iterations = typeof record.iterations === "number" && Number.isFinite(record.iterations) ? { iterations: record.iterations } : {}
+    const metrics = numericRecord(record.metrics)
+    const artifacts = isRecord(record.artifacts) ? record.artifacts : {}
+    // Key order below is the serialized field order; keep it stable.
+    return { componentId, id, ...source, ...iterations, metricCount: Object.keys(metrics).length, metrics, artifacts }
+  })
+}
+
+function numericRecord(value: unknown): Record<string, number> {
+  // Non-record input has no metrics. Otherwise keep finite numbers only and
+  // insert keys in `localeCompare` order so the serialized output is stable.
+  const pairs: Array<[string, number]> = []
+  for (const [key, candidate] of Object.entries(isRecord(value) ? value : {})) {
+    if (typeof candidate === "number" && Number.isFinite(candidate)) {
+      pairs.push([key, candidate])
+    }
+  }
+  return Object.fromEntries(pairs.sort(([left], [right]) => left.localeCompare(right)))
+}
+
+function printBenchmarkSummaryHumanOutput(output: BenchmarkSummaryOutput): void {
+  // Human-readable report: a header with source path and counts, then one
+  // `component/scenario: N metrics` line per scenario (section omitted when there are none).
+  const header = [
+    "WP Codebox benchmark summary",
+    `Source: ${output.source.path}`,
+    `Benchmarks: ${output.benchmarkCount}`,
+    `Scenarios: ${output.scenarioCount}`,
+  ]
+  header.forEach((line) => console.log(line))
+  if (output.scenarios.length > 0) {
+    console.log("Scenarios:")
+    output.scenarios.forEach(({ componentId, id, metricCount }) => console.log(`  ${componentId}/${id}: ${metricCount} metrics`))
+  }
+}
+
+function parseBenchmarkSummaryOptions(args: string[], config: { requireBundle?: boolean } = {}): BenchmarkSummaryOptions {
+  // Parses `--json`, `--input <path>` and `--bundle`/`--artifacts <dir>`; a value may be inline
+  // (`--name=value`) or the following argument. Throws on malformed, unknown, or missing options.
+  const options: Partial<BenchmarkSummaryOptions> = { json: false }
+
+  for (let index = 0; index < args.length; index++) {
+    const arg = args[index]
+    if (arg === "--json") {
+      options.json = true
+      continue
+    }
+
+    // Split on the first "=" only; `split("=", 2)` drops everything after a second "=",
+    // silently truncating paths such as `--input=./runs/seed=1/recipe-run.json`.
+    const separator = arg.indexOf("=")
+    const name = separator < 0 ? arg : arg.slice(0, separator)
+    const value = separator < 0 ? args[++index] : arg.slice(separator + 1)
+    if (!name.startsWith("--") || value === undefined) {
+      throw new Error(`Invalid argument: ${arg}`)
+    }
+
+    switch (name) {
+      case "--input":
+        options.inputPath = value
+        break
+      case "--bundle":
+      case "--artifacts":
+        options.bundleDirectory = value
+        break
+      default:
+        throw new Error(`Unknown option: ${name}`)
+    }
+  }
+
+  if (config.requireBundle && !options.bundleDirectory) {
+    throw new Error("Missing required option: --bundle")
+  }
+  if (config.requireBundle && options.inputPath) {
+    throw new Error("artifacts bench-results only accepts --bundle")
+  }
+  if (!config.requireBundle && !options.inputPath && !options.bundleDirectory) {
+    throw new Error("Missing required option: --input or --bundle")
+  }
+  return options as BenchmarkSummaryOptions
+}
+
+function isRecord(value: unknown): value is Record<string, unknown> { // true for non-null, non-array objects
+  return value !== null && typeof value === "object" && !Array.isArray(value)
+}
diff --git a/packages/cli/src/output.ts b/packages/cli/src/output.ts
index 048ece36..348073f1 100644
--- a/packages/cli/src/output.ts
+++ b/packages/cli/src/output.ts
@@ -252,8 +252,10 @@ export function printHelp(): void {
   wp-codebox workspace-policy check --workspace-root <path> --writable-root <path> [options]
   wp-codebox recipe build phpunit --options <path> [--output <path>]
   wp-codebox recipe validate --recipe <path> [--json]
+  wp-codebox bench summarize (--input <recipe-run.json>|--bundle <dir>) [--json]
   wp-codebox artifacts verify --bundle <dir> [--json]
   wp-codebox artifacts browser-metrics --bundle <dir> [--json]
+  wp-codebox artifacts bench-results --bundle <dir> [--json]
   wp-codebox runs status --registry <dir> --run-id <id> [--json]
   wp-codebox runs artifacts --registry <dir> --run-id <id> [--json]
   wp-codebox validate-blueprint --blueprint <json|file> [options]
@@ -266,6 +268,7 @@ Options:
   --options <path>    Recipe builder options JSON file for recipe build.
   --output <path>     Optional output JSON path for recipe build; defaults to stdout.
   --bundle <dir>      Artifact bundle directory for artifacts verify.
+  --input <path>      Saved recipe-run JSON output for benchmark summarization.
   --artifacts <dir>   Artifact root directory. Also accepted by artifacts verify.
   --run-registry <dir>
                        Durable run registry directory for recipe-run.
diff --git a/scripts/benchmark-summary-smoke.ts b/scripts/benchmark-summary-smoke.ts
new file mode 100644
index 00000000..5b42ae7d
--- /dev/null
+++ b/scripts/benchmark-summary-smoke.ts
@@ -0,0 +1,75 @@
+import assert from "node:assert/strict"
+import { spawnSync } from "node:child_process"
+import { mkdirSync, rmSync, writeFileSync } from "node:fs"
+import { dirname, resolve } from "node:path"
+import { fileURLToPath } from "node:url"
+
+const root = resolve(dirname(fileURLToPath(import.meta.url)), "..")
+const cli = resolve(root, "packages/cli/dist/index.js") // compiled CLI entry under dist/
+const workspace = resolve(root, "artifacts/benchmark-summary-smoke")
+const recipeRunOutput = resolve(workspace, "recipe-run.json")
+const bundle = resolve(workspace, "bundle")
+// Start from a clean workspace that contains bundle/logs.
+rmSync(workspace, { recursive: true, force: true })
+mkdirSync(resolve(bundle, "logs"), { recursive: true })
+// Fixture: one envelope whose scenario has two numeric metrics plus one string-valued metric.
+const benchResults = {
+  component_id: "bench-plugin",
+  iterations: 2,
+  warmup_iterations: 0,
+  scenarios: [
+    {
+      id: "noop",
+      source: "file",
+      iterations: 2,
+      metrics: {
+        duration_ms_mean: 3.5,
+        peak_memory_bytes_mean: 1234,
+        ignored_string: "not numeric",
+      },
+      artifacts: { report: { path: "workloads/report.json", kind: "json" } },
+    },
+  ],
+}
+// Saved recipe-run output carrying the envelope under `benchResults`.
+writeFileSync(recipeRunOutput, `${JSON.stringify({
+  schema: "wp-codebox/recipe-run/v1",
+  success: true,
+  benchResults,
+}, null, 2)}\n`)
+// Bundle command log: two free-form header lines followed by the envelope as pretty-printed JSON.
+writeFileSync(resolve(bundle, "logs", "commands.log"), `[2026-06-04T00:00:00.000Z] wordpress.bench component-id=bench-plugin
+exitCode=0
+${JSON.stringify(benchResults, null, 2)}
+`)
+// `bench summarize --input`: the scenario is flattened and only the two numeric metrics are counted.
+const inputSummary = runJson("bench", "summarize", "--input", recipeRunOutput, "--json")
+assert.equal(inputSummary.schema, "wp-codebox/benchmark-summary/v1")
+assert.equal(inputSummary.source.type, "recipe-run-output")
+assert.equal(inputSummary.hasBenchResults, true)
+assert.equal(inputSummary.benchmarkCount, 1)
+assert.equal(inputSummary.scenarioCount, 1)
+assert.equal(inputSummary.scenarios[0].componentId, "bench-plugin")
+assert.equal(inputSummary.scenarios[0].id, "noop")
+assert.equal(inputSummary.scenarios[0].metricCount, 2)
+assert.equal(inputSummary.scenarios[0].metrics.duration_ms_mean, 3.5)
+assert.equal(inputSummary.scenarios[0].artifacts.report.path, "workloads/report.json")
+// `artifacts bench-results --bundle`: the envelope is recovered from the command log text.
+const bundleSummary = runJson("artifacts", "bench-results", "--bundle", bundle, "--json")
+assert.equal(bundleSummary.schema, "wp-codebox/benchmark-summary/v1")
+assert.equal(bundleSummary.source.type, "artifact-bundle")
+assert.equal(bundleSummary.hasBenchResults, true)
+assert.equal(bundleSummary.scenarioCount, 1)
+// Without `--json` the human-readable report is printed instead.
+const human = spawnSync(process.execPath, [cli, "bench", "summarize", "--input", recipeRunOutput], { cwd: root, encoding: "utf8" })
+assert.equal(human.status, 0, human.stderr || human.stdout)
+assert.match(human.stdout, /WP Codebox benchmark summary/)
+assert.match(human.stdout, /bench-plugin\/noop: 2 metrics/)
+
+console.log("benchmark summary smoke passed")
+
+function runJson(...args: string[]): any { // runs the CLI with `args`, asserts exit status 0, returns stdout parsed as JSON
+  const { status, stderr, stdout } = spawnSync(process.execPath, [cli, ...args], { cwd: root, encoding: "utf8" })
+  assert.equal(status, 0, stderr || stdout)
+  return JSON.parse(stdout)
+}