Automattic · chubes4 · Jun 4, 2026 · Jun 4, 2026
diff --git a/README.md b/README.md
@@ -939,6 +939,24 @@ npm run wp-codebox -- recipe-run \
 
 Each workload file returns a callable. The callable may return numeric metrics directly or a payload with `metrics` and `metadata` keys. The recipe output reports duration percentiles, custom metric aggregates, peak memory, runtime artifacts, and the parsed `benchResults` object in JSON output when a single `wordpress.bench` step runs. If earlier `wordpress.browser-probe` steps in the same recipe captured generic `performance` or `memory` artifacts, `wordpress.bench` promotes selected numeric browser values into each scenario's metrics using `browser_*` names, while the raw browser artifacts remain available under `files/browser/`.
 
+Use `bench summarize` to extract a stable automation envelope from saved `recipe-run --json` output:
+
+```bash
+npm run wp-codebox -- bench summarize \
+  --input ./artifacts/bench-plugin/recipe-run.json \
+  --json
+```
+
+Use `artifacts bench-results` to extract benchmark results from an artifact bundle's command log:
+
+```bash
+npm run wp-codebox -- artifacts bench-results \
+  --bundle ./artifacts/bench-plugin \
+  --json
+```
+
+See [`docs/benchmark-contract.md`](docs/benchmark-contract.md) for the generic benchmark contract, result shape, artifact/provenance expectations, and the boundary between WP Codebox responsibilities and caller-owned scoring or product semantics.
+
 ### `agent-runtime-probe`
 
 Boot a sandbox with Agents API, Data Machine, and Data Machine Code mounted, then verify the stack loads.

diff --git a/docs/benchmark-contract.md b/docs/benchmark-contract.md
@@ -0,0 +1,152 @@
+# Benchmark Contract
+
+WP Codebox provides a generic benchmark substrate for disposable WordPress
+runtimes. It owns workload execution, normalized metric envelopes, runtime
+evidence, and artifact extraction helpers. Callers own product semantics such as
+scenario catalogs, scoring, grading, model comparison, reward policy, retry
+policy, and reports.
+
+```text
+caller benchmark suite
+  -> writes a WP Codebox recipe
+  -> runs recipe-run in an isolated WordPress runtime
+  -> receives recipe output plus artifact bundle
+  -> extracts generic benchmark results
+  -> applies caller-owned scoring/reporting outside WP Codebox
+```
+
+## WP Codebox Responsibilities
+
+- Execute declared recipe steps in a disposable WordPress runtime.
+- Register generic benchmark commands such as `wordpress.bench`.
+- Capture runtime artifacts, command logs, browser evidence, and provenance.
+- Emit `benchResults` (when exactly one `wordpress.bench` step succeeds) and `benchResultsList` (when one or more benchmark steps run) in `wp-codebox/recipe-run/v1` JSON output.
+- Provide CLI helpers that extract benchmark envelopes from saved `recipe-run` output or artifact bundles.
+- Keep helper output stable, JSON-friendly, and free of product-specific scoring fields.
+
+## Caller Responsibilities
+
+- Define the suite, scenario ids, task taxonomy, expected behavior, and run matrix.
+- Decide which metrics matter and how to compare them.
+- Score, grade, rank, retry, regress, or publish benchmark reports.
+- Store durable benchmark history and model/product metadata.
+- Interpret browser metrics or runtime artifacts in a product-specific context.
+
+## Workloads
+
+`wordpress.bench` currently supports plugin workloads discovered from
+`tests/bench/*.php` plus explicit `workloads-json` entries. Workloads can run PHP
+code and, through configured workload steps, WP-CLI commands. Each workload
+returns numeric metrics directly or an object with `metrics` and `metadata`.
+
+The command contract is intentionally broad enough to cover current and future workload types:
+
+- **PHP:** direct workload callables and inline configured workload steps.
+- **WP-CLI:** configured workload steps that execute in the same sandbox.
+- **Ability:** future ability-backed workload steps should still return generic numeric metrics and metadata.
+- **Browser:** `wordpress.browser-probe` captures generic browser performance and memory artifacts. When a recipe runs browser probes before `wordpress.bench`, selected numeric `browser_*` metrics are promoted into each benchmark scenario while raw browser artifacts remain in the bundle.
+
+## Result Shape
+
+The benchmark envelope is a JSON object with generic fields:
+
+```json
+{
+  "component_id": "bench-plugin",
+  "iterations": 3,
+  "warmup_iterations": 1,
+  "scenarios": [
+    {
+      "id": "noop",
+      "source": "file",
+      "iterations": 3,
+      "metrics": {
+        "duration_ms_mean": 1.23,
+        "peak_memory_bytes_mean": 123456
+      },
+      "metadata": {},
+      "artifacts": {}
+    }
+  ]
+}
+```
+
+Metrics are numeric, and their names come from the workload or runtime surface
+that produced them. WP Codebox records them; it does not decide whether a value
+is good, bad, passing, failing, or regressed.
+
+## Running Benchmarks
+
+Use a recipe workflow step with `wordpress.bench`:
+
+```bash
+npm run wp-codebox -- recipe-run \
+  --recipe ./examples/recipes/bench-plugin.json \
+  --artifacts ./artifacts/bench-plugin \
+  --json > ./artifacts/bench-plugin/recipe-run.json
+```
+
+The `recipe-run` JSON output includes `benchResults` when exactly one successful
+`wordpress.bench` step ran, and `benchResultsList` when one or more benchmark
+steps ran.
+
+## Extracting Results
+
+Summarize saved `recipe-run` JSON:
+
+```bash
+npm run wp-codebox -- bench summarize \
+  --input ./artifacts/bench-plugin/recipe-run.json \
+  --json
+```
+
+Summarize an artifact bundle by reading its command log:
+
+```bash
+npm run wp-codebox -- artifacts bench-results \
+  --bundle ./artifacts/bench-plugin \
+  --json
+```
+
+Both commands emit `wp-codebox/benchmark-summary/v1` with the raw benchmark envelopes plus a flattened
+scenario summary for automation (`benchmarks` and per-scenario `metrics` are shown empty here for brevity):
+
+```json
+{
+  "schema": "wp-codebox/benchmark-summary/v1",
+  "source": { "type": "recipe-run-output", "path": "/abs/recipe-run.json" },
+  "hasBenchResults": true,
+  "benchmarkCount": 1,
+  "scenarioCount": 1,
+  "benchmarks": [],
+  "scenarios": [
+    {
+      "componentId": "bench-plugin",
+      "id": "noop",
+      "source": "file",
+      "iterations": 3,
+      "metricCount": 2,
+      "metrics": {},
+      "artifacts": {}
+    }
+  ]
+}
+```
+
+Omit `--json` for a compact human-readable table. The human form is for quick
+inspection; automation should consume the JSON envelope.
+
+## Non-Responsibilities
+
+WP Codebox benchmark helpers do not define or store:
+
+- Product benchmark suites.
+- Rewards or graders.
+- Pass/fail scoring policies.
+- Model-eval metadata.
+- Competitor comparisons.
+- Historical regression decisions.
+- Publishing or PR/report workflows.
+
+Those belong to callers such as wp-gym, Studio Web, Homeboy rigs, or other eval
+harnesses that project WP Codebox evidence into their own product schemas.
diff --git a/package.json b/package.json
@@ -61,6 +61,7 @@
     "phpunit-diagnostic-artifact-smoke": "tsx scripts/phpunit-diagnostic-artifact-smoke.ts",
     "plugin-check-normalization-smoke": "tsx scripts/plugin-check-normalization-smoke.ts",
     "bench-bootstrap-files-smoke": "tsx scripts/bench-bootstrap-files-smoke.ts",
+    "benchmark-summary-smoke": "tsx scripts/benchmark-summary-smoke.ts",
     "wordpress-recipe-builders-smoke": "tsx scripts/wordpress-recipe-builders-smoke.ts",
     "recipe-bench-smoke": "tsx scripts/recipe-bench-smoke.ts",
     "recipe-build-cli-smoke": "tsx scripts/recipe-build-cli-smoke.ts",

diff --git a/packages/cli/src/cli-entry.ts b/packages/cli/src/cli-entry.ts
@@ -1,5 +1,6 @@
 import { routeCliCommand } from "./command-router.js"
 import { runArtifactsBrowserMetricsCommand, runArtifactsVerifyCommand } from "./commands/artifacts.js"
+import { runArtifactsBenchResultsCommand, runBenchSummarizeCommand } from "./commands/benchmark.js"
 import { runCommandsCommand, runRecipeSchemaCommand } from "./commands/discovery.js"
 import { runCleanupCommand, runDoctorCommand } from "./commands/doctor.js"
 import { runRecipeBuildCommand } from "./commands/recipe-build.js"
@@ -20,6 +21,8 @@ export async function runCli(args: string[]): Promise<number> {
     workspacePolicyCheck: runWorkspacePolicyCheckCommand,
     artifactsVerify: runArtifactsVerifyCommand,
     artifactsBrowserMetrics: runArtifactsBrowserMetricsCommand,
+    artifactsBenchResults: runArtifactsBenchResultsCommand,
+    benchSummarize: runBenchSummarizeCommand,
     runsStatus: runRunsStatusCommand,
     runsArtifacts: runRunsArtifactsCommand,
     commands: runCommandsCommand,

diff --git a/packages/cli/src/command-router.ts b/packages/cli/src/command-router.ts
@@ -12,6 +12,8 @@ interface CliCommandRouter {
   workspacePolicyCheck: CliCommandHandler
   artifactsVerify: CliCommandHandler
   artifactsBrowserMetrics: CliCommandHandler
+  artifactsBenchResults: CliCommandHandler
+  benchSummarize: CliCommandHandler
   runsStatus: CliCommandHandler
   runsArtifacts: CliCommandHandler
   commands: CliCommandHandler
@@ -73,10 +75,22 @@ export async function routeCliCommand(argv: string[], router: CliCommandRouter):
       if (subcommand === "browser-metrics") {
         return router.artifactsBrowserMetrics(args)
       }
+      if (subcommand === "bench-results") {
+        return router.artifactsBenchResults(args)
+      }
       console.error(`Unknown artifacts command: ${subcommand ?? ""}`)
       router.printHelp()
       return 1
     }
+    case "bench": {
+      const subcommand = args.shift()
+      if (subcommand === "summarize") {
+        return router.benchSummarize(args)
+      }
+      console.error(`Unknown bench command: ${subcommand ?? ""}`)
+      router.printHelp()
+      return 1
+    }
     case "runs": {
       const subcommand = args.shift()
       if (subcommand === "status") {