Automattic · chubes4 · May 30, 2026 · May 30, 2026
diff --git a/README.md b/README.md
@@ -462,15 +462,34 @@ Supported runtime commands today:
 - `wordpress.wp-cli`: run WP-CLI; accepts `command='wp option get home'` or plain args.
 - `wordpress.ability`: execute a registered WordPress Ability; accepts `name=<ability>` and optional JSON `input=<object>`.
 - `wordpress.browser-probe`: boot the live preview, visit `url=<path-or-url>` with Playwright, and capture generic browser replay/audit evidence under `files/browser/`.
-- `wordpress.browser-actions`: boot the live preview, run generic browser interactions, and capture replay/audit evidence under `files/browser/`.
+- `wordpress.browser-actions`: boot the live preview, drive it with an ordered interaction script (`steps-json`), assert browser behavior, and capture replay/audit evidence under `files/browser/`.
 
 `wordpress.run-php` loads `/wordpress/wp-load.php` by default. Use `--arg bootstrap=none` for raw PHP.
 
 `wordpress.wp-cli` automatically enables Playground's `wp-cli` extra library when the command is allowed by runtime policy.
 
 `wordpress.browser-probe` accepts `wait-for=domcontentloaded|load|networkidle|selector:<selector>|duration`, `duration=<n>s`, and `capture=console,errors,html,network,performance,memory,screenshot`. It records machine-readable evidence refs such as `files/browser/console.jsonl`, `files/browser/errors.jsonl`, `files/browser/network.jsonl`, `files/browser/performance.json`, `files/browser/memory.json`, `files/browser/checkpoints.jsonl`, `files/browser/snapshot.html`, `files/browser/screenshot.png`, and `files/browser/summary.json` when those captures are enabled. The summary includes requested/final URLs, viewport/device metadata, HTML and screenshot hashes, network event counts, optional final/peak browser memory and performance summaries, and a generic `artifact-backed|partial|diagnostic-only` replayability classification. Performance and memory captures use generic browser/CDP data only: JS heap when available, CDP `Performance.getMetrics`, CDP DOM counters, DOM/resource counts and byte totals, and long task counts/duration. Probe scripts may call `window.__wpCodeboxProbeCheckpoint(name, metadata)` when `performance` or `memory` capture is enabled to record named generic checkpoint snapshots. WP Codebox intentionally keeps these browser evidence fields generic; consumers such as eval harnesses may interpret them without WP Codebox adding scoring, grading, or benchmark semantics.
 
-`wordpress.browser-actions` accepts `actions-json=<array>` with ordered `navigate`, `click`, `fill`, `press`, `wait`, and `capture` actions. `navigate` uses `url` plus optional `waitFor=domcontentloaded|load|networkidle`; `click` uses `selector` or `text`; `fill` uses `selector` and `value`; `press` uses `key` plus optional `selector`; `wait` uses `selector` or `waitFor=domcontentloaded|load|networkidle|duration` with `duration=<n>s|<n>ms`. It records `files/browser/actions.jsonl`, `files/browser/action-summary.json`, and optional `console`, `errors`, `network`, `html`, and `screenshot` captures. Failures identify the failed action index/type in the action log, include serialized browser errors, and still write the requested audit artifacts when possible.
+`wordpress.browser-actions` drives the preview with an ordered interaction script so Codebox can prove a plugin still *works* under interaction, not just that it renders. Pass the script as `steps-json=<array>` (inline JSON, or `@<path>` to read it from a file); the legacy `actions-json=<array>` shape is still accepted and normalized to steps. Each step is a thin, stable mapping over a Playwright locator action — this is not a test-runner DSL.
+
+Step kinds: `navigate` (`url`, optional `waitFor=domcontentloaded|load|networkidle`), `click`/`hover` (`selector` or `text`), `fill`/`type` (`selector`, `value`), `press` (`key`, optional `selector`), `drag` (`from` selector, `to` as `{ "selector": ... }` or `{ "x": n, "y": n }`), `select` (`selector`, `value` or `values`), `waitFor` (`selector` or `waitFor=domcontentloaded|load|networkidle|duration|selector:<sel>`), `evaluate` (`expression`, optional `assert` to deep-equal the result), `expect` (`selector`, optional `state=visible|hidden|attached|detached|enabled|disabled|checked|unchecked|editable`), and `screenshot` (optional `name` for a named capture). Every step may set its own `timeout=<n>s`; the command also accepts a global `step-timeout=<n>s|<n>ms` (per step) and `timeout=<n>s|<n>ms` (total-script budget). Both global budgets are bounded and deterministic — the run stops cleanly on the first failing step, with no silent partial success.
+
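+The arbitrary-JS `evaluate` step is policy-gated **separately** from the non-JS interaction steps: a script containing `evaluate` requires `wordpress.browser-actions.evaluate` in the runtime policy in addition to `wordpress.browser-actions`. Click/fill/drag/expect and friends never require the extra grant, so a consumer can allow UI driving while still forbidding arbitrary page JS. Recipes add the `wordpress.browser-actions.evaluate` grant automatically when a step's inline `steps-json` contains an `evaluate` step; scripts referenced as `@<path>` are not inspected for this, and direct `run` invocations control the gate through `--policy`.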
+
+It records `files/browser/steps.jsonl` (per-step index, kind, selector, ok/fail, timing, and any named screenshot), `files/browser/action-summary.json` (with a machine-readable `assertions` block of `total`/`passed`/`failed` plus each `expect`/`evaluate` result), named `files/browser/screenshot-<name>.png` captures, and optional `console`, `errors`, `network`, `html`, and `screenshot` artifacts (capture defaults to `steps,console,errors,network,html,screenshot`; `actions` is accepted as an alias for `steps`). Failures identify the failed step index/kind in `steps.jsonl`, include serialized browser errors, and still write the requested audit artifacts when possible. Existing navigate-only invocations (just `url=`, no `steps-json`) behave exactly as before.
+
+```jsonc
+// steps-json: open the editor, drive the crop modal, assert it still works, capture it
+[
+  { "kind": "click",      "selector": "role=button[name='Social']" },
+  { "kind": "waitFor",    "selector": ".reactEasyCrop_Container" },
+  { "kind": "drag",       "from": ".reactEasyCrop_CropArea", "to": { "x": 40, "y": 40 } },
+  { "kind": "fill",       "selector": "#caption", "value": "smoke test" },
+  { "kind": "evaluate",   "expression": "document.querySelector('.crop').isConnected", "assert": true },
+  { "kind": "expect",     "selector": ".crop-confirm", "state": "visible" },
+  { "kind": "screenshot", "name": "after-crop" }
+]
+```
 
 WP Codebox defaults to WordPress `7.0` because the agent and AI plugin stacks need the modern WordPress AI surface. Override with `--wp trunk`, `--wp nightly`, or another supported Playground version.
 

diff --git a/package.json b/package.json
@@ -54,6 +54,7 @@
     "recipe-browser-smoke": "tsx scripts/recipe-browser-smoke.ts",
     "browser-probe-artifact-smoke": "tsx scripts/browser-probe-artifact-smoke.ts",
     "browser-actions-artifact-smoke": "tsx scripts/browser-actions-artifact-smoke.ts",
+    "browser-interaction-script-validation-smoke": "tsx scripts/browser-interaction-script-validation-smoke.ts",
     "preview-port-smoke": "tsx scripts/preview-port-smoke.ts",
     "preview-options-contract-smoke": "tsx scripts/preview-options-contract-smoke.ts",
     "preview-public-url-canonical-smoke": "tsx scripts/preview-public-url-canonical-smoke.ts",

diff --git a/packages/cli/src/index.ts b/packages/cli/src/index.ts
@@ -6,7 +6,7 @@ import { tmpdir } from "node:os"
 import { basename, dirname, join, relative, resolve } from "node:path"
 import { fileURLToPath } from "node:url"
 import { promisify } from "node:util"
-import { SANDBOX_DMC_PARENT_ONLY_ABILITIES, SANDBOX_DMC_SAFE_ABILITIES, SANDBOX_WORKSPACE_ROOT, calculateArtifactManifestFileSha256, checkWorkspacePolicy, commandRegistry, createRuntime, createWorkspaceRecipeJsonSchema, recipeCommandDefinitions, validateRuntimePolicy, verifyArtifactBundle, type ArtifactBundle, type ArtifactBundleVerificationResult, type ArtifactManifest, type CommandDefinition, type ExecutionResult, type MountSpec, type Runtime, type RuntimeInfo, type RuntimePolicy, type SandboxWorkspaceContract, type SandboxWorkspaceMode, type WorkspacePolicyResult, type WorkspaceRecipe, type WorkspaceRecipeExtraPlugin, type WorkspaceRecipeJsonSchema, type WorkspaceRecipePluginRuntime, type WorkspaceRecipePluginRuntimeHealthProbe, type WorkspaceRecipeSiteSeed, type WorkspaceRecipeStagedFile, type WorkspaceRecipeWorkspace } from "@chubes4/wp-codebox-core"
+import { SANDBOX_DMC_PARENT_ONLY_ABILITIES, SANDBOX_DMC_SAFE_ABILITIES, SANDBOX_WORKSPACE_ROOT, calculateArtifactManifestFileSha256, checkWorkspacePolicy, commandRegistry, createRuntime, createWorkspaceRecipeJsonSchema, recipeCommandDefinitions, validateBrowserInteractionScript, validateRuntimePolicy, verifyArtifactBundle, type ArtifactBundle, type ArtifactBundleVerificationResult, type ArtifactManifest, type CommandDefinition, type ExecutionResult, type MountSpec, type Runtime, type RuntimeInfo, type RuntimePolicy, type SandboxWorkspaceContract, type SandboxWorkspaceMode, type WorkspacePolicyResult, type WorkspaceRecipe, type WorkspaceRecipeExtraPlugin, type WorkspaceRecipeJsonSchema, type WorkspaceRecipePluginRuntime, type WorkspaceRecipePluginRuntimeHealthProbe, type WorkspaceRecipeSiteSeed, type WorkspaceRecipeStagedFile, type WorkspaceRecipeWorkspace } from "@chubes4/wp-codebox-core"
 import { createPlaygroundRuntimeBackend } from "@chubes4/wp-codebox-playground"
 import { agentRuntimeProbeCode, agentSandboxRunCode, resolveSandboxTaskCode } from "./agent-code.js"
 import { captureStdout, printArtifactVerifyHumanOutput, printBatchHumanOutput, printBlueprintValidateHumanOutput, printBootHumanOutput, printCommandCatalogHumanOutput, printHelp, printHumanOutput, printRecipeHumanOutput, printRecipeSchemaHumanOutput, printRecipeValidateHumanOutput, serializeError } from "./output.js"
@@ -3647,6 +3647,48 @@ async function validateRecipeStepArgs(step: WorkspaceRecipe["workflow"]["steps"]
     return
   }
 
+  if (step.command === "wordpress.browser-actions") {
+    const stepsJson = recipeStepArgValue(step.args ?? [], "steps-json")
+    const actionsJson = recipeStepArgValue(step.args ?? [], "actions-json")
+    const url = recipeStepArgValue(step.args ?? [], "url")?.trim()
+    if (!stepsJson && !actionsJson && !url) {
+      addIssue("missing-steps", `${path}.args`, "wordpress.browser-actions requires steps-json=<array> (or actions-json=<array>) or url=<path-or-url>.")
+    }
+
+    if (stepsJson && !stepsJson.startsWith("@")) {
+      let parsed: unknown
+      try {
+        parsed = JSON.parse(stepsJson)
+      } catch (error) {
+        addIssue("invalid-steps-json", `${path}.args`, `wordpress.browser-actions steps-json must be valid JSON: ${error instanceof Error ? error.message : String(error)}`)
+        parsed = undefined
+      }
+      if (parsed !== undefined) {
+        const result = validateBrowserInteractionScript(parsed)
+        for (const issue of result.issues) {
+          addIssue("invalid-step", `${path}.args`, `wordpress.browser-actions steps-json[${issue.index}]: ${issue.message}`)
+        }
+      }
+    }
+
+    for (const name of ["step-timeout", "timeout"] as const) {
+      const value = recipeStepArgValue(step.args ?? [], name)
+      if (value && !/^(\d+(?:\.\d+)?)(ms|s)$/.test(value)) {
+        addIssue("invalid-duration", `${path}.args`, `wordpress.browser-actions ${name} must look like 500ms or 2s.`)
+      }
+    }
+
+    const capture = recipeStepArgValue(step.args ?? [], "capture")
+    if (capture) {
+      for (const item of capture.split(",").map((value) => value.trim()).filter(Boolean)) {
+        if (!["steps", "actions", "console", "errors", "html", "network", "screenshot"].includes(item)) {
+          addIssue("invalid-capture", `${path}.args`, `wordpress.browser-actions capture does not support: ${item}`)
+        }
+      }
+    }
+    return
+  }
+
   if (step.command === "wordpress.ability") {
     if (!recipeStepArgValue(step.args ?? [], "name")?.trim()) {
       addIssue("missing-ability-name", `${path}.args`, "wordpress.ability requires name=<ability-name>.")
@@ -3949,13 +3991,32 @@ function recipePolicy(recipe: WorkspaceRecipe): RuntimePolicy {
   if ((recipe.inputs?.siteSeeds ?? []).some((siteSeed) => siteSeed.type === "fixture")) {
     commands.unshift("wordpress.run-php")
   }
+  // Auto-grant the evaluate capability when a browser-actions step opts into the
+  // arbitrary-JS escape hatch by including an evaluate step. Recipe authors opt in
+  // by writing the step; direct `run` invocations still control the gate via --policy.
+  if (recipeWorkflowSteps(recipe).some(({ step }) => step.command === "wordpress.browser-actions" && recipeStepUsesEvaluate(step))) {
+    commands.push("wordpress.browser-actions.evaluate")
+  }
 
   return {
     ...defaultPolicy,
     commands: [...new Set(commands)],
   }
 }
 
+function recipeStepUsesEvaluate(step: WorkspaceRecipe["workflow"]["steps"][number]): boolean { // true when the step's inline steps-json contains an entry whose kind is "evaluate"
+  const raw = recipeStepArgValue(step.args ?? [], "steps-json") // only the steps-json arg is consulted; actions-json is not read here
+  if (!raw || raw.startsWith("@")) { // missing/empty value, or an @-prefixed value, which is not parsed by this function
+    return false
+  }
+  try {
+    const parsed = JSON.parse(raw)
+    return Array.isArray(parsed) && parsed.some((entry) => entry && typeof entry === "object" && (entry as { kind?: unknown }).kind === "evaluate") // scans top-level array entries only; non-array JSON yields false
+  } catch { // unparseable JSON is treated as "no evaluate step"; the parse error is not surfaced from here
+    return false
+  }
+}
+
 function runPolicy(command: string): RuntimePolicy {
   return {
     ...defaultPolicy,