From f63cf2e93ce116fd2f647eaa1fccf553471374b5 Mon Sep 17 00:00:00 2001 From: Chris Huber Date: Thu, 4 Jun 2026 10:09:22 -0400 Subject: [PATCH] add generic benchmark summary helpers --- README.md | 18 ++ docs/benchmark-contract.md | 152 ++++++++++++++ package.json | 1 + packages/cli/src/cli-entry.ts | 3 + packages/cli/src/command-router.ts | 14 ++ packages/cli/src/commands/benchmark.ts | 273 +++++++++++++++++++++++++ packages/cli/src/output.ts | 3 + scripts/benchmark-summary-smoke.ts | 75 +++++++ 8 files changed, 539 insertions(+) create mode 100644 docs/benchmark-contract.md create mode 100644 packages/cli/src/commands/benchmark.ts create mode 100644 scripts/benchmark-summary-smoke.ts diff --git a/README.md b/README.md index 514fd65b..cf258503 100644 --- a/README.md +++ b/README.md @@ -939,6 +939,24 @@ npm run wp-codebox -- recipe-run \ Each workload file returns a callable. The callable may return numeric metrics directly or a payload with `metrics` and `metadata` keys. The recipe output reports duration percentiles, custom metric aggregates, peak memory, runtime artifacts, and the parsed `benchResults` object in JSON output when a single `wordpress.bench` step runs. If earlier `wordpress.browser-probe` steps in the same recipe captured generic `performance` or `memory` artifacts, `wordpress.bench` promotes selected numeric browser values into each scenario's metrics using `browser_*` names, while the raw browser artifacts remain available under `files/browser/`. 
+Use `bench summarize` to extract a stable automation envelope from saved `recipe-run --json` output: + +```bash +npm run wp-codebox -- bench summarize \ + --input ./artifacts/bench-plugin/recipe-run.json \ + --json +``` + +Use `artifacts bench-results` to extract benchmark results from an artifact bundle command log: + +```bash +npm run wp-codebox -- artifacts bench-results \ + --bundle ./artifacts/bench-plugin \ + --json +``` + +See [`docs/benchmark-contract.md`](docs/benchmark-contract.md) for the generic benchmark contract, result shape, artifact/provenance expectations, and the boundary between WP Codebox responsibilities and caller-owned scoring or product semantics. + ### `agent-runtime-probe` Boot a sandbox with Agents API, Data Machine, and Data Machine Code mounted, then verify the stack loads. diff --git a/docs/benchmark-contract.md b/docs/benchmark-contract.md new file mode 100644 index 00000000..77552d02 --- /dev/null +++ b/docs/benchmark-contract.md @@ -0,0 +1,152 @@ +# Benchmark Contract + +WP Codebox provides a generic benchmark substrate for disposable WordPress +runtimes. It owns workload execution, normalized metric envelopes, runtime +evidence, and artifact extraction helpers. Callers own product semantics such as +scenario catalogs, scoring, grading, model comparison, reward policy, retry +policy, and reports. + +```text +caller benchmark suite + -> writes a WP Codebox recipe + -> runs recipe-run in an isolated WordPress runtime + -> receives recipe output plus artifact bundle + -> extracts generic benchmark results + -> applies caller-owned scoring/reporting outside WP Codebox +``` + +## WP Codebox Responsibilities + +- Execute declared recipe steps in a disposable WordPress runtime. +- Register generic benchmark commands such as `wordpress.bench`. +- Capture runtime artifacts, command logs, browser evidence, and provenance. 
+- Emit `benchResults` and `benchResultsList` in `wp-codebox/recipe-run/v1` JSON output when `wordpress.bench` steps succeed. +- Provide CLI helpers that extract benchmark envelopes from saved `recipe-run` output or artifact bundles. +- Keep helper output stable, JSON-friendly, and free of product-specific scoring fields. + +## Caller Responsibilities + +- Define the suite, scenario ids, task taxonomy, expected behavior, and run matrix. +- Decide which metrics matter and how to compare them. +- Score, grade, rank, retry, regress, or publish benchmark reports. +- Store durable benchmark history and model/product metadata. +- Interpret browser metrics or runtime artifacts in a product-specific context. + +## Workloads + +`wordpress.bench` currently supports plugin workloads discovered from +`tests/bench/*.php` plus explicit `workloads-json` entries. Workloads can run PHP +code and, through configured workload steps, WP-CLI commands. Each workload +returns numeric metrics directly or an object with `metrics` and `metadata`. + +The command contract is intentionally broad enough for future workload types: + +- **PHP:** direct workload callables and inline configured workload steps. +- **WP-CLI:** configured workload steps that execute in the same sandbox. +- **Ability:** future ability-backed workload steps should still return generic numeric metrics and metadata. +- **Browser:** `wordpress.browser-probe` captures generic browser performance and memory artifacts. When a recipe runs browser probes before `wordpress.bench`, selected numeric `browser_*` metrics are promoted into each benchmark scenario while raw browser artifacts remain in the bundle. 
+ +## Result Shape + +The benchmark envelope is a JSON object with generic fields: + +```json +{ + "component_id": "bench-plugin", + "iterations": 3, + "warmup_iterations": 1, + "scenarios": [ + { + "id": "noop", + "source": "file", + "iterations": 3, + "metrics": { + "duration_ms_mean": 1.23, + "peak_memory_bytes_mean": 123456 + }, + "metadata": {}, + "artifacts": {} + } + ] +} +``` + +Metrics are numeric and named by the workload/runtime surface. WP Codebox records +them; it does not decide whether a value is good, bad, passing, failing, or +regressed. + +## Running Benchmarks + +Use a recipe workflow step with `wordpress.bench`: + +```bash +npm run wp-codebox -- recipe-run \ + --recipe ./examples/recipes/bench-plugin.json \ + --artifacts ./artifacts/bench-plugin \ + --json > ./artifacts/bench-plugin/recipe-run.json +``` + +The `recipe-run` JSON output includes `benchResults` when exactly one successful +`wordpress.bench` step ran, and `benchResultsList` when one or more benchmark +steps ran. + +## Extracting Results + +Summarize saved `recipe-run` JSON: + +```bash +npm run wp-codebox -- bench summarize \ + --input ./artifacts/bench-plugin/recipe-run.json \ + --json +``` + +Summarize an artifact bundle by reading its command log: + +```bash +npm run wp-codebox -- artifacts bench-results \ + --bundle ./artifacts/bench-plugin \ + --json +``` + +Both commands emit `wp-codebox/benchmark-summary/v1` with the raw benchmark +envelopes plus a flattened scenario summary for automation: + +```json +{ + "schema": "wp-codebox/benchmark-summary/v1", + "source": { "type": "recipe-run-output", "path": "/abs/recipe-run.json" }, + "hasBenchResults": true, + "benchmarkCount": 1, + "scenarioCount": 1, + "benchmarks": [], + "scenarios": [ + { + "componentId": "bench-plugin", + "id": "noop", + "source": "file", + "iterations": 3, + "metricCount": 2, + "metrics": {}, + "artifacts": {} + } + ] +} +``` + +Omit `--json` for a compact human-readable summary.
The human form is for quick +inspection; automation should consume the JSON envelope. + +## Non-Responsibilities + +WP Codebox benchmark helpers do not define or store: + +- Product benchmark suites. +- Rewards or graders. +- Pass/fail scoring policies. +- Model-eval metadata. +- Competitor comparisons. +- Historical regression decisions. +- Publishing or PR/report workflows. + +Those belong to callers such as wp-gym, Studio Web, Homeboy rigs, or other eval +harnesses that project WP Codebox evidence into their own product schemas. diff --git a/package.json b/package.json index b7993e70..be4406e2 100644 --- a/package.json +++ b/package.json @@ -61,6 +61,7 @@ "phpunit-diagnostic-artifact-smoke": "tsx scripts/phpunit-diagnostic-artifact-smoke.ts", "plugin-check-normalization-smoke": "tsx scripts/plugin-check-normalization-smoke.ts", "bench-bootstrap-files-smoke": "tsx scripts/bench-bootstrap-files-smoke.ts", + "benchmark-summary-smoke": "tsx scripts/benchmark-summary-smoke.ts", "wordpress-recipe-builders-smoke": "tsx scripts/wordpress-recipe-builders-smoke.ts", "recipe-bench-smoke": "tsx scripts/recipe-bench-smoke.ts", "recipe-build-cli-smoke": "tsx scripts/recipe-build-cli-smoke.ts", diff --git a/packages/cli/src/cli-entry.ts b/packages/cli/src/cli-entry.ts index 4894c479..b800199f 100644 --- a/packages/cli/src/cli-entry.ts +++ b/packages/cli/src/cli-entry.ts @@ -1,5 +1,6 @@ import { routeCliCommand } from "./command-router.js" import { runArtifactsBrowserMetricsCommand, runArtifactsVerifyCommand } from "./commands/artifacts.js" +import { runArtifactsBenchResultsCommand, runBenchSummarizeCommand } from "./commands/benchmark.js" import { runCommandsCommand, runRecipeSchemaCommand } from "./commands/discovery.js" import { runCleanupCommand, runDoctorCommand } from "./commands/doctor.js" import { runRecipeBuildCommand } from "./commands/recipe-build.js" @@ -20,6 +21,8 @@ export async function runCli(args: string[]): Promise { workspacePolicyCheck: 
runWorkspacePolicyCheckCommand, artifactsVerify: runArtifactsVerifyCommand, artifactsBrowserMetrics: runArtifactsBrowserMetricsCommand, + artifactsBenchResults: runArtifactsBenchResultsCommand, + benchSummarize: runBenchSummarizeCommand, runsStatus: runRunsStatusCommand, runsArtifacts: runRunsArtifactsCommand, commands: runCommandsCommand, diff --git a/packages/cli/src/command-router.ts b/packages/cli/src/command-router.ts index e2c9eec7..fc0cab88 100644 --- a/packages/cli/src/command-router.ts +++ b/packages/cli/src/command-router.ts @@ -12,6 +12,8 @@ interface CliCommandRouter { workspacePolicyCheck: CliCommandHandler artifactsVerify: CliCommandHandler artifactsBrowserMetrics: CliCommandHandler + artifactsBenchResults: CliCommandHandler + benchSummarize: CliCommandHandler runsStatus: CliCommandHandler runsArtifacts: CliCommandHandler commands: CliCommandHandler @@ -73,10 +75,22 @@ export async function routeCliCommand(argv: string[], router: CliCommandRouter): if (subcommand === "browser-metrics") { return router.artifactsBrowserMetrics(args) } + if (subcommand === "bench-results") { + return router.artifactsBenchResults(args) + } console.error(`Unknown artifacts command: ${subcommand ?? ""}`) router.printHelp() return 1 } + case "bench": { + const subcommand = args.shift() + if (subcommand === "summarize") { + return router.benchSummarize(args) + } + console.error(`Unknown bench command: ${subcommand ?? 
// Patch context preserved verbatim from the original line (tail of the command-router.ts hunk, then the new-file header for packages/cli/src/commands/benchmark.ts):
// ""}`) + router.printHelp() + return 1 + } case "runs": { const subcommand = args.shift() if (subcommand === "status") { diff --git a/packages/cli/src/commands/benchmark.ts b/packages/cli/src/commands/benchmark.ts new file mode 100644 index 00000000..c180f3c7 --- /dev/null +++ b/packages/cli/src/commands/benchmark.ts @@ -0,0 +1,273 @@
import { readFile } from "node:fs/promises"
import { join, resolve } from "node:path"

/** Options shared by `bench summarize` and `artifacts bench-results`. */
interface BenchmarkSummaryOptions {
  /** Value of `--input`: a saved `recipe-run --json` output file. */
  inputPath?: string
  /** Value of `--bundle` / `--artifacts`: an artifact bundle directory. */
  bundleDirectory?: string
  /** True when `--json` was passed; selects the JSON envelope over the human summary. */
  json: boolean
}

/** Raw benchmark envelope; unknown extra keys are passed through untouched. */
interface BenchResults {
  component_id?: string
  iterations?: number
  warmup_iterations?: number
  scenarios?: unknown[]
  [key: string]: unknown
}

/** One scenario flattened out of a benchmark envelope. */
interface BenchmarkScenarioSummary {
  componentId: string
  id: string
  source?: string
  iterations?: number
  metricCount: number
  metrics: Record<string, number>
  artifacts: Record<string, unknown>
}

/** The `wp-codebox/benchmark-summary/v1` envelope written by both commands. */
interface BenchmarkSummaryOutput {
  schema: "wp-codebox/benchmark-summary/v1"
  source: {
    type: "recipe-run-output" | "artifact-bundle"
    path: string
  }
  hasBenchResults: boolean
  benchmarkCount: number
  scenarioCount: number
  benchmarks: BenchResults[]
  scenarios: BenchmarkScenarioSummary[]
}

/**
 * `bench summarize`: summarizes saved recipe-run output (`--input`) or an
 * artifact bundle (`--bundle`).
 *
 * @param args Arguments that follow the subcommand.
 * @returns Exit code 0; the promise rejects on invalid arguments or unreadable input.
 */
export async function runBenchSummarizeCommand(args: string[]): Promise<number> {
  return runBenchmarkSummary(parseBenchmarkSummaryOptions(args))
}

/**
 * `artifacts bench-results`: summarizes an artifact bundle; `--bundle` is required
 * and `--input` is rejected.
 *
 * @param args Arguments that follow the subcommand.
 * @returns Exit code 0; the promise rejects on invalid arguments or unreadable input.
 */
export async function runArtifactsBenchResultsCommand(args: string[]): Promise<number> {
  return runBenchmarkSummary(parseBenchmarkSummaryOptions(args, { requireBundle: true }))
}

/** Builds the summary and prints it as JSON or as the human-readable listing. */
async function runBenchmarkSummary(options: BenchmarkSummaryOptions): Promise<number> {
  const output = await summarizeBenchmarks(options)
  if (options.json) {
    process.stdout.write(`${JSON.stringify(output, null, 2)}\n`)
  } else {
    printBenchmarkSummaryHumanOutput(output)
  }
  return 0
}

/**
 * Loads benchmark envelopes from the selected source. `inputPath` takes
 * precedence when both sources are set.
 *
 * @throws Error when neither source is set, when the input file is unreadable or
 *   not valid JSON, or when the command log fails to read for a reason other
 *   than not existing.
 */
async function summarizeBenchmarks(options: BenchmarkSummaryOptions): Promise<BenchmarkSummaryOutput> {
  if (options.inputPath) {
    const inputPath = resolve(options.inputPath)
    const parsed = JSON.parse(await readFile(inputPath, "utf8")) as unknown
    return benchmarkSummaryOutput({ type: "recipe-run-output", path: inputPath }, extractBenchResultsFromRecipeRun(parsed))
  }

  if (options.bundleDirectory) {
    const bundleDirectory = resolve(options.bundleDirectory)
    const commandsLog = await readFile(join(bundleDirectory, "logs", "commands.log"), "utf8").catch((error: unknown) => {
      // A bundle without a command log is summarized as "no benchmark results".
      if (isRecord(error) && error.code === "ENOENT") {
        return ""
      }
      throw error
    })
    return benchmarkSummaryOutput({ type: "artifact-bundle", path: bundleDirectory }, extractBenchResultsFromText(commandsLog))
  }

  throw new Error("Missing required option: --input or --bundle")
}

/** Wraps raw envelopes and their flattened scenarios in the summary schema. */
function benchmarkSummaryOutput(source: BenchmarkSummaryOutput["source"], benchmarks: BenchResults[]): BenchmarkSummaryOutput {
  const scenarios = benchmarks.flatMap((benchmark) => benchmarkScenarioSummaries(benchmark))
  return {
    schema: "wp-codebox/benchmark-summary/v1",
    source,
    hasBenchResults: benchmarks.length > 0,
    benchmarkCount: benchmarks.length,
    scenarioCount: scenarios.length,
    benchmarks,
    scenarios,
  }
}

/**
 * Reads envelopes from parsed recipe-run output. An array-valued
 * `benchResultsList` wins over the single `benchResults` field; entries that do
 * not look like benchmark envelopes are dropped.
 */
function extractBenchResultsFromRecipeRun(value: unknown): BenchResults[] {
  if (!isRecord(value)) {
    return []
  }

  if (Array.isArray(value.benchResultsList)) {
    return value.benchResultsList.filter(isBenchResults)
  }

  if (isBenchResults(value.benchResults)) {
    return [value.benchResults]
  }

  return []
}

/** Collects every top-level JSON object in `text` that is a benchmark envelope. */
function extractBenchResultsFromText(text: string): BenchResults[] {
  const results: BenchResults[] = []
  for (const jsonObject of jsonObjectsInText(text)) {
    const parsed = parseJsonObject(jsonObject)
    if (isBenchResults(parsed)) {
      results.push(parsed)
    }
  }
  return results
}

/**
 * Yields each brace-balanced `{...}` span found in free-form text. The spans are
 * candidates only; callers still have to parse them.
 */
function* jsonObjectsInText(text: string): Generator<string> {
  let start = -1
  let depth = 0
  let inString = false
  let escaped = false

  for (let index = 0; index < text.length; index++) {
    const char = text[index]

    if (inString) {
      if (escaped) {
        escaped = false
      } else if (char === "\\") {
        escaped = true
      } else if (char === "\"") {
        inString = false
      }
      continue
    }

    if (char === "\"") {
      // Quotes only delimit strings inside a candidate object. Outside one they
      // are ordinary log text, and an unbalanced quote there must not hide the
      // braces that follow.
      if (depth > 0) {
        inString = true
      }
      continue
    }

    if (char === "{") {
      if (depth === 0) {
        start = index
      }
      depth += 1
      continue
    }

    if (char === "}" && depth > 0) {
      depth -= 1
      if (depth === 0 && start >= 0) {
        yield text.slice(start, index + 1)
        start = -1
      }
    }
  }
}

/** `JSON.parse` that returns `undefined` instead of throwing. */
function parseJsonObject(value: string): unknown {
  try {
    return JSON.parse(value)
  } catch {
    return undefined
  }
}

/** A benchmark envelope is an object with a string `component_id` and a `scenarios` array. */
function isBenchResults(value: unknown): value is BenchResults {
  return isRecord(value) && Array.isArray(value.scenarios) && typeof value.component_id === "string"
}

/**
 * Flattens one envelope into per-scenario summaries. Non-object scenario entries
 * are skipped; a scenario without a string `id` is named `scenario-<n>`, where
 * `n` counts the remaining entries from 1.
 */
function benchmarkScenarioSummaries(benchmark: BenchResults): BenchmarkScenarioSummary[] {
  const componentId = typeof benchmark.component_id === "string" ? benchmark.component_id : "unknown"
  return (benchmark.scenarios ?? []).filter(isRecord).map((scenario, index) => {
    const metrics = numericRecord(scenario.metrics)
    return {
      componentId,
      id: typeof scenario.id === "string" ? scenario.id : `scenario-${index + 1}`,
      ...(typeof scenario.source === "string" ? { source: scenario.source } : {}),
      ...(typeof scenario.iterations === "number" && Number.isFinite(scenario.iterations) ? { iterations: scenario.iterations } : {}),
      metricCount: Object.keys(metrics).length,
      metrics,
      artifacts: isRecord(scenario.artifacts) ? scenario.artifacts : {},
    }
  })
}

/** Keeps only the finite-number entries of an object, ordered by key. */
function numericRecord(value: unknown): Record<string, number> {
  if (!isRecord(value)) {
    return {}
  }

  return Object.fromEntries(
    Object.entries(value)
      .filter((entry): entry is [string, number] => typeof entry[1] === "number" && Number.isFinite(entry[1]))
      .sort(([left], [right]) => left.localeCompare(right)),
  )
}

/** Prints the summary for people: counts first, then one line per scenario. */
function printBenchmarkSummaryHumanOutput(output: BenchmarkSummaryOutput): void {
  console.log("WP Codebox benchmark summary")
  console.log(`Source: ${output.source.path}`)
  console.log(`Benchmarks: ${output.benchmarkCount}`)
  console.log(`Scenarios: ${output.scenarioCount}`)

  if (output.scenarios.length === 0) {
    return
  }

  console.log("Scenarios:")
  for (const scenario of output.scenarios) {
    console.log(`  ${scenario.componentId}/${scenario.id}: ${scenario.metricCount} metrics`)
  }
}

/**
 * Parses `--json`, `--input <path>` and `--bundle <dir>` (alias `--artifacts`).
 * Values may be given as the next argument or inline as `--name=value`.
 *
 * @param args Arguments that follow the subcommand.
 * @param config `requireBundle` makes `--bundle` mandatory and rejects `--input`.
 * @throws Error on a malformed or unknown argument, a missing value, or a
 *   missing required option.
 */
function parseBenchmarkSummaryOptions(args: string[], config: { requireBundle?: boolean } = {}): BenchmarkSummaryOptions {
  const options: BenchmarkSummaryOptions = { json: false }

  for (let index = 0; index < args.length; index++) {
    const arg = args[index]

    if (arg === "--json") {
      options.json = true
      continue
    }

    // Split on the first "=" only. `split("=", 2)` would discard the rest of a
    // value that itself contains "=", such as a path like ./runs/a=b/out.json.
    const separator = arg.indexOf("=")
    const name = separator === -1 ? arg : arg.slice(0, separator)
    const value = separator === -1 ? args[++index] : arg.slice(separator + 1)

    if (!name.startsWith("--") || value === undefined) {
      throw new Error(`Invalid argument: ${arg}`)
    }

    switch (name) {
      case "--input":
        options.inputPath = value
        break
      case "--bundle":
      case "--artifacts":
        options.bundleDirectory = value
        break
      default:
        throw new Error(`Unknown option: ${name}`)
    }
  }

  if (config.requireBundle && !options.bundleDirectory) {
    throw new Error("Missing required option: --bundle")
  }

  if (config.requireBundle && options.inputPath) {
    throw new Error("artifacts bench-results only accepts --bundle")
  }

  if (!config.requireBundle && !options.inputPath && !options.bundleDirectory) {
    throw new Error("Missing required option: --input or --bundle")
  }

  return options
}

/** True for non-null, non-array objects. */
function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === "object" && value !== null && !Array.isArray(value)
}
// Patch context preserved verbatim from the original line (hunks against packages/cli/src/output.ts):
// diff --git a/packages/cli/src/output.ts b/packages/cli/src/output.ts index 048ece36..348073f1 100644 --- a/packages/cli/src/output.ts +++ b/packages/cli/src/output.ts @@ -252,8 +252,10 @@ export function printHelp(): void { wp-codebox workspace-policy check --workspace-root --writable-root [options] wp-codebox recipe build phpunit --options [--output ] wp-codebox recipe validate --recipe [--json] + wp-codebox bench summarize (--input |--bundle ) [--json] wp-codebox artifacts verify --bundle [--json] wp-codebox artifacts browser-metrics --bundle [--json] + wp-codebox artifacts bench-results --bundle [--json] wp-codebox runs status --registry --run-id [--json] wp-codebox runs artifacts --registry --run-id [--json] wp-codebox validate-blueprint --blueprint [options] @@ -266,6 +268,7 @@ Options: --options Recipe builder options JSON file for recipe build. --output Optional output JSON path for recipe build; defaults to stdout. --bundle Artifact bundle directory for artifacts verify. + --input Saved recipe-run JSON output for benchmark summarization.
// Patch context preserved verbatim from the original line (tail of the output.ts hunk, then the new-file header for scripts/benchmark-summary-smoke.ts):
// --artifacts Artifact root directory. Also accepted by artifacts verify. --run-registry Durable run registry directory for recipe-run. diff --git a/scripts/benchmark-summary-smoke.ts b/scripts/benchmark-summary-smoke.ts new file mode 100644 index 00000000..5b42ae7d --- /dev/null +++ b/scripts/benchmark-summary-smoke.ts @@ -0,0 +1,75 @@
import assert from "node:assert/strict"
import { spawnSync } from "node:child_process"
import { mkdirSync, rmSync, writeFileSync } from "node:fs"
import { dirname, resolve } from "node:path"
import { fileURLToPath } from "node:url"

// Smoke test for `bench summarize` and `artifacts bench-results`: writes a fake
// recipe-run output and a fake artifact bundle, runs the built CLI against both,
// and checks the emitted summaries.

const root = resolve(dirname(fileURLToPath(import.meta.url)), "..")
const cli = resolve(root, "packages/cli/dist/index.js")
const workspace = resolve(root, "artifacts/benchmark-summary-smoke")
const recipeRunOutput = resolve(workspace, "recipe-run.json")
const bundle = resolve(workspace, "bundle")
const bundleLogs = resolve(bundle, "logs")

// Start from a clean workspace on every run.
rmSync(workspace, { recursive: true, force: true })
mkdirSync(bundleLogs, { recursive: true })

// One scenario with two numeric metrics and one string metric; the summary is
// expected to count only the numeric ones.
const noopScenario = {
  id: "noop",
  source: "file",
  iterations: 2,
  metrics: {
    duration_ms_mean: 3.5,
    peak_memory_bytes_mean: 1234,
    ignored_string: "not numeric",
  },
  artifacts: { report: { path: "workloads/report.json", kind: "json" } },
}

const benchResults = {
  component_id: "bench-plugin",
  iterations: 2,
  warmup_iterations: 0,
  scenarios: [noopScenario],
}

const recipeRunDocument = {
  schema: "wp-codebox/recipe-run/v1",
  success: true,
  benchResults,
}
writeFileSync(recipeRunOutput, `${JSON.stringify(recipeRunDocument, null, 2)}\n`)

const benchResultsJson = JSON.stringify(benchResults, null, 2)
writeFileSync(resolve(bundleLogs, "commands.log"), `[2026-06-04T00:00:00.000Z] wordpress.bench component-id=bench-plugin
exitCode=0
${benchResultsJson}
`)

// Summary of saved recipe-run output.
const inputSummary = runJson("bench", "summarize", "--input", recipeRunOutput, "--json")
assertSummaryHeader(inputSummary, "recipe-run-output")
assert.equal(inputSummary.benchmarkCount, 1)
assert.equal(inputSummary.scenarioCount, 1)
const [firstScenario] = inputSummary.scenarios
assert.equal(firstScenario.componentId, "bench-plugin")
assert.equal(firstScenario.id, "noop")
assert.equal(firstScenario.metricCount, 2)
assert.equal(firstScenario.metrics.duration_ms_mean, 3.5)
assert.equal(firstScenario.artifacts.report.path, "workloads/report.json")

// Summary of an artifact bundle's command log.
const bundleSummary = runJson("artifacts", "bench-results", "--bundle", bundle, "--json")
assertSummaryHeader(bundleSummary, "artifact-bundle")
assert.equal(bundleSummary.scenarioCount, 1)

// Human-readable output (no --json).
const human = runCli(["bench", "summarize", "--input", recipeRunOutput])
assert.equal(human.status, 0, human.stderr || human.stdout)
assert.match(human.stdout, /WP Codebox benchmark summary/)
assert.match(human.stdout, /bench-plugin\/noop: 2 metrics/)

console.log("benchmark summary smoke passed")

/** Checks the fields both summaries share: schema, source type, and the results flag. */
function assertSummaryHeader(summary: any, sourceType: string): void {
  assert.equal(summary.schema, "wp-codebox/benchmark-summary/v1")
  assert.equal(summary.source.type, sourceType)
  assert.equal(summary.hasBenchResults, true)
}

/** Runs the built CLI with the current Node binary from the repository root. */
function runCli(cliArgs: string[]) {
  return spawnSync(process.execPath, [cli, ...cliArgs], { cwd: root, encoding: "utf8" })
}

/** Runs the CLI, requires exit status 0, and parses stdout as JSON. */
function runJson(...args: string[]): any {
  const result = runCli(args)
  assert.equal(result.status, 0, result.stderr || result.stdout)
  return JSON.parse(result.stdout)
}