AgentWorkforce · kjgbot · May 31, 2026 · May 31, 2026 · May 31, 2026 · May 31, 2026
diff --git a/docs/product/ricky-skill-embedding-boundary.md b/docs/product/ricky-skill-embedding-boundary.md
@@ -12,7 +12,7 @@ For strict TypeScript or proof-oriented workflow generation, the expected loaded
 
 `writing-agent-relay-workflows` affects the generated workflow contract by shaping the dedicated channel, explicit agents, step dependencies, review stages, and final signoff. `relay-80-100-workflow` affects validation by shaping soft validation, review/fix/final-review flow, final hard validation, git diff, and regression gates. These are generation-time effects because they are materialized into the workflow text and deterministic metadata before any workflow runner launches agents.
 
-The generated workflow also includes a deterministic `skill-boundary-metadata-gate`. This gate checks that the generated boundary metadata exists, records `generation_time_only`, names the loaded skills, includes the `generation_selection`, `generation_loading`, and applicable `generation_rendering` stages, and records effects such as `workflow_contract` and `validation_gates`. The gate proves the artifact carries the skill boundary forward as metadata; it does not prove runtime agents load skills.
+The generated workflow materializes this boundary as context metadata, including `loaded-skills.txt`, `skill-matches.json`, and `skill-application-boundary.json`. Ricky verifies that metadata in generation tests rather than using runtime shell text matching to re-check its own serialized files. Runtime gates should focus on agent-produced artifacts, validation commands, scoped diff evidence, and blocker files.
 
 ## Runtime Boundary
 

diff --git a/evals/suites/workflow-authoring/cases.jsonl b/evals/suites/workflow-authoring/cases.jsonl
@@ -4,7 +4,7 @@
 {"id":"workflow-authoring.distinct-reviewer","suite":"workflow-authoring","executor":"manual","kind":"regression","input":{"message":"Write a workflow that has Codex generate a convention update and then review it."},"expected":{"maxToolCalls":0,"must":["Assign a reviewer agent distinct from the writer when possible.","Persist significant review artifacts under `.workflow-artifacts/`.","Keep convention-only edits scoped to the declared convention files."],"mustNot":["Let the same agent both write and rubber-stamp the change without an explicit reason.","Skip deterministic file-existence, grep, symlink, or scoped change-detection checks.","Edit unrelated package metadata or generated workflows for a convention-only request."],"humanReviewRequired":true},"tags":["workflow-authoring","review"]}
 {"id":"workflow-authoring.fresh-eyes-loop-simple-test","suite":"workflow-authoring","executor":"ricky-cli","kind":"regression","input":{"message":"Generate a small Agent Relay workflow that adds one missing Vitest unit test for a TypeScript helper and proves the test passes."},"expected":{"ok":true,"contentIncludes":["stage\": \"generate","status\": \"ok","--- GENERATED ARTIFACT:",".agent(\"reviewer-claude\"",".agent(\"validator-claude\"",".agent(\"reviewer-codex\"",".agent(\"validator-codex\"","verdict: FINDINGS | NO_ISSUES_FOUND | BLOCKED","add or update appropriate tests, fixtures, assertions, or deterministic proofs","dependsOn: [\"final-fix-codex\"]"],"contentMatches":["\\.step\\(\"review-claude\"[\\s\\S]*\\.step\\(\"fix-loop\"[\\s\\S]*\\.step\\(\"final-review-claude\"[\\s\\S]*\\.step\\(\"final-fix-claude\"[\\s\\S]*\\.step\\(\"review-codex\"[\\s\\S]*\\.step\\(\"fix-loop-codex\"[\\s\\S]*\\.step\\(\"final-review-codex\"[\\s\\S]*\\.step\\(\"final-fix-codex\"[\\s\\S]*\\.step\\(\"final-review-pass-gate\"[\\s\\S]*\\.step\\(\"final-hard-validation\""],"forbidPhrases":["TypeError","ReferenceError","needs_clarification"],"maxToolCalls":1,"must":["Include the mandatory fresh-eyes review/fix loop even though the workflow is small.","Run the loop in this order: Claude review, Claude fix, Claude final review, Claude final fix, then Codex review, Codex fix, Codex final review, Codex final fix.","Require review output to use a structured verdict such as `FINDINGS`, `NO_ISSUES_FOUND`, or `BLOCKED`.","Require fix steps to add or update tests, fixtures, assertions, or deterministic proof for testable findings.","Put final deterministic acceptance after the Codex final fix."],"mustNot":["Treat the first passing test run as a substitute for fresh-eyes review.","Run Claude and Codex reviews in parallel before fixing.","Collapse all findings into one generic fix step with no final re-review.","Commit, open a PR, or hand off before the Codex loop 
finishes."],"humanReviewRequired":false},"tags":["workflow-authoring","review","tests","fresh-eyes"],"mock":{"cwd":"temp","specFileContent":"Generate a small Agent Relay workflow that adds one missing Vitest unit test for a TypeScript helper and proves the test passes.","argv":"--mode local --spec-file {{specFile}} --no-run --json --no-workforce-persona","includeGeneratedArtifacts":true}}
 {"id":"workflow-authoring.fresh-eyes-loop-medium-source-and-test","suite":"workflow-authoring","executor":"ricky-cli","kind":"regression","input":{"message":"Generate a Ricky workflow that changes one source file and one test file for a CLI parsing bug, with scoped diff evidence and a targeted Vitest command."},"expected":{"ok":true,"contentIncludes":["stage\": \"generate","status\": \"ok","src/surfaces/cli/flows/power-user-parser.ts","src/surfaces/cli/flows/power-user-parser.test.ts","npx vitest run src/surfaces/cli/flows/power-user-parser.test.ts","git diff --name-only","git ls-files --others --exclude-standard","review-claude.md","final-review-codex.md","codex-final-fix.md","dependsOn: [\"final-fix-codex\"]"],"contentMatches":["\\.step\\(\"review-claude\"[\\s\\S]*\\.step\\(\"fix-loop\"[\\s\\S]*\\.step\\(\"final-review-claude\"[\\s\\S]*\\.step\\(\"final-fix-claude\"[\\s\\S]*\\.step\\(\"review-codex\"[\\s\\S]*\\.step\\(\"fix-loop-codex\"[\\s\\S]*\\.step\\(\"final-review-codex\"[\\s\\S]*\\.step\\(\"final-fix-codex\"[\\s\\S]*\\.step\\(\"final-review-pass-gate\"[\\s\\S]*\\.step\\(\"final-hard-validation\""],"forbidPhrases":["TypeError","ReferenceError","needs_clarification"],"maxToolCalls":1,"must":["Preserve the Claude-then-Codex review/fix/final-review/final-fix order before final acceptance.","Keep deterministic file gates and scoped `git diff --name-only` / untracked-file checks limited to the declared source and test targets.","Feed review findings into fix steps and require fixers to harden tests when findings are testable.","Write review, fix, final-review, final-fix, validation, and signoff artifacts under `.workflow-artifacts/`."],"mustNot":["Use broad repo-wide change detection as the only proof.","Allow a single reviewer to rubber-stamp its own work without a distinct fresh-eyes pass.","Skip the Codex final review/fix loop because Claude already reviewed.","Move final hard validation before the Codex final 
fix."],"humanReviewRequired":false},"tags":["workflow-authoring","review","generation","fresh-eyes"],"mock":{"cwd":"temp","specFileContent":"Generate a Ricky workflow that changes one source file and one test file for a CLI parsing bug.\\n\\n## Target Files\\n\\n- src/surfaces/cli/flows/power-user-parser.ts\\n- src/surfaces/cli/flows/power-user-parser.test.ts\\n\\n## Acceptance\\n\\nRun `npx vitest run src/surfaces/cli/flows/power-user-parser.test.ts`.","argv":"--mode local --spec-file {{specFile}} --no-run --json --no-workforce-persona","includeGeneratedArtifacts":true}}
-{"id":"workflow-authoring.fresh-eyes-loop-complex-multitrack","suite":"workflow-authoring","executor":"ricky-cli","kind":"capability","input":{"message":"Generate a serious multi-track master executor workflow for three independent product slices: runtime evidence, CLI status copy, and generation validation. Each track owns separate files and the final workflow may create a PR."},"expected":{"ok":true,"contentIncludes":["stage\": \"generate","status\": \"ok","RICKY_MASTER_EXECUTOR_WORKFLOW","Master plan:","RICKY_CHILD_WORKFLOW_COMPLETE","review-claude","final-fix-codex","RICKY_CHILD_FRESH_EYES_LOOP_READY","BLOCKED_NO_COMMIT"],"contentMatches":["review-claude[\\s\\S]*fix-loop[\\s\\S]*final-review-claude[\\s\\S]*final-fix-claude[\\s\\S]*review-codex[\\s\\S]*fix-loop-codex[\\s\\S]*final-review-codex[\\s\\S]*final-fix-codex[\\s\\S]*final-review-pass-gate[\\s\\S]*final-hard-validation"],"forbidPhrases":["TypeError","ReferenceError","needs_clarification"],"maxToolCalls":1,"must":["Use a coordination shape that fits independent tracks while preserving deterministic gates after every editing step.","Ensure each implementation track or child workflow has the mandatory Claude-then-Codex fresh-eyes review/fix loop before track signoff.","Run final deterministic acceptance only after all Codex final fixes and post-fix reviews have completed.","Use `BLOCKED_NO_COMMIT` with evidence when a finding cannot be fixed, and skip commit or PR creation in that state.","Use the GitHub primitive for PR creation when shipping is in scope."],"mustNot":["Put one global review at the end and call that sufficient for all tracks.","Serialize independent tracks without a stated dependency reason.","Let PR creation, commit, or handoff race ahead of unresolved review findings.","Present tests, typecheck, or dry-run alone as the complete proof bar."],"humanReviewRequired":false},"tags":["workflow-authoring","review","multitrack","fresh-eyes"],"mock":{"cwd":"temp","specFileContent":"Generate a 
serious multi-track workflow for three independent product slices as smaller workflows run by a master executor: runtime evidence, CLI status copy, and generation validation. Each track owns separate files and the final workflow may create a PR.\\n\\nUse independent child workflows with deterministic validation, fresh-eyes review/fix loops, and GitHub primitive PR creation when shipping is in scope.","argv":"--mode local --spec-file {{specFile}} --no-run --json --no-workforce-persona","includeGeneratedArtifacts":true}}
+{"id":"workflow-authoring.fresh-eyes-loop-complex-multitrack","suite":"workflow-authoring","executor":"ricky-cli","kind":"capability","input":{"message":"Generate a serious multi-track master executor workflow for three independent product slices: runtime evidence, CLI status copy, and generation validation. Each track owns separate files and the final workflow may create a PR."},"expected":{"ok":true,"contentIncludes":["stage\": \"generate","status\": \"ok","RICKY_MASTER_EXECUTOR_WORKFLOW","Master plan:","final-signoff","review-claude","final-fix-codex","RICKY_CHILD_FINAL_REVIEW_FILES_READY","BLOCKED_NO_COMMIT"],"contentMatches":["review-claude[\\s\\S]*fix-loop[\\s\\S]*final-review-claude[\\s\\S]*final-fix-claude[\\s\\S]*review-codex[\\s\\S]*fix-loop-codex[\\s\\S]*final-review-codex[\\s\\S]*final-fix-codex[\\s\\S]*final-review-pass-gate[\\s\\S]*final-hard-validation"],"forbidPhrases":["TypeError","ReferenceError","needs_clarification"],"maxToolCalls":1,"must":["Use a coordination shape that fits independent tracks while preserving deterministic gates after every editing step.","Ensure each implementation track or child workflow has the mandatory Claude-then-Codex fresh-eyes review/fix loop before track signoff.","Run final deterministic acceptance only after all Codex final fixes and post-fix reviews have completed.","Use `BLOCKED_NO_COMMIT` with evidence when a finding cannot be fixed, and skip commit or PR creation in that state.","Use the GitHub primitive for PR creation when shipping is in scope."],"mustNot":["Put one global review at the end and call that sufficient for all tracks.","Serialize independent tracks without a stated dependency reason.","Let PR creation, commit, or handoff race ahead of unresolved review findings.","Present tests, typecheck, or dry-run alone as the complete proof bar."],"humanReviewRequired":false},"tags":["workflow-authoring","review","multitrack","fresh-eyes"],"mock":{"cwd":"temp","specFileContent":"Generate a serious 
multi-track workflow for three independent product slices as smaller workflows run by a master executor: runtime evidence, CLI status copy, and generation validation. Each track owns separate files and the final workflow may create a PR.\\n\\nUse independent child workflows with deterministic validation, fresh-eyes review/fix loops, and GitHub primitive PR creation when shipping is in scope.","argv":"--mode local --spec-file {{specFile}} --no-run --json --no-workforce-persona","includeGeneratedArtifacts":true}}
 {"id":"workflow-authoring.no-silent-mode-fallback","suite":"workflow-authoring","executor":"manual","kind":"regression","input":{"message":"Create a workflow for a user who asked to run in Cloud, but Cloud auth is missing."},"expected":{"maxToolCalls":0,"must":["Detect and report the missing Cloud readiness or auth condition before any expensive generation or run step.","Ask for an explicit user decision before switching to local/BYOH execution.","Preserve the requested execution mode in the workflow context and user-facing summary."],"mustNot":["Silently fall back from Cloud to local.","Claim a provider, account, credential, or integration is connected without a deterministic check.","Hide mode changes inside generic \"auto\" wording."],"humanReviewRequired":true},"tags":["workflow-authoring","local","cloud"]}
 {"id":"workflow-authoring.agent-assistant-boundary","suite":"workflow-authoring","executor":"manual","kind":"capability","input":{"message":"Update Ricky to reuse a new Agent Assistant primitive while preserving Ricky-owned local execution behavior."},"expected":{"maxToolCalls":0,"must":["Reuse the shared Agent Assistant package for neutral assistant/runtime mechanics where appropriate.","State the Ricky-owned behavior that must remain local, including workflow generation, LocalResponse, blocker taxonomy, recovery wording, and evidence semantics.","Add proof that the shared primitive is exercised in a real Ricky path, not only imported or documented."],"mustNot":["Move product-specific Ricky execution contracts into Agent Assistant without an explicit proof boundary.","Overclaim broad Agent Assistant adoption from a narrow adapter change.","Replace Ricky's local blocker and recovery contract with generic assistant output."],"humanReviewRequired":true},"tags":["workflow-authoring","agent-assistant","boundary"]}
 {"id":"workflow-authoring.evidence-trail","suite":"workflow-authoring","executor":"manual","kind":"capability","input":{"message":"Design a workflow that watches a long-running workflow, diagnoses a failure, attempts a safe repair, and reports the outcome."},"expected":{"maxToolCalls":0,"must":["Preserve an evidence trail that names commands, artifacts, failed steps, log locations, assertions, and side effects.","Distinguish successful repair, actionable blocker, unsupported condition, and unrecoverable error.","Include resumability guidance such as failed step, previous run id, or exact rerun command when available."],"mustNot":["Claim the workflow succeeded when a blocker or missing dependency stopped execution.","Drop log paths or side-effect summaries from the final outcome.","Retry destructive or credentialed actions without explicit authorization."],"humanReviewRequired":true},"tags":["workflow-authoring","evidence"]}

diff --git a/evals/suites/workflow-authoring/cases.md b/evals/suites/workflow-authoring/cases.md
@@ -168,10 +168,10 @@ contentIncludes:
 - status": "ok
 - RICKY_MASTER_EXECUTOR_WORKFLOW
 - Master plan:
-- RICKY_CHILD_WORKFLOW_COMPLETE
+- final-signoff
 - review-claude
 - final-fix-codex
-- RICKY_CHILD_FRESH_EYES_LOOP_READY
+- RICKY_CHILD_FINAL_REVIEW_FILES_READY
 - BLOCKED_NO_COMMIT
 contentMatches:
 - review-claude[\s\S]*fix-loop[\s\S]*final-review-claude[\s\S]*final-fix-claude[\s\S]*review-codex[\s\S]*fix-loop-codex[\s\S]*final-review-codex[\s\S]*final-fix-codex[\s\S]*final-review-pass-gate[\s\S]*final-hard-validation

diff --git a/scripts/evals/ci-review-comment.mjs b/scripts/evals/ci-review-comment.mjs
@@ -39,6 +39,8 @@ if (process.env.GITHUB_TOKEN && process.env.GITHUB_REPOSITORY && process.env.PR_
 function renderComment({ result, runDir }) {
   const failed = result.tests.filter((test) => test.status === 'failed');
   const skipped = result.tests.filter((test) => test.status === 'skipped');
+  const providerInfraSkipped = skipped.filter(isProviderInfrastructureSkip);
+  const blockingSkipped = skipped.filter((test) => !isProviderInfrastructureSkip(test));
   const needsHuman = result.tests.filter((test) => test.status === 'needs-human');
   const reviewableNeedsHuman = needsHuman.filter(hasCapturedOutput);
   const missingOutputNeedsHuman = needsHuman.filter((test) => !hasCapturedOutput(test));
@@ -50,13 +52,25 @@ function renderComment({ result, runDir }) {
     `Mode: \`${result.mode}\``,
     `Git SHA: \`${result.git_sha}\``,
     '',
-    `**Passed:** ${result.passed} | **Needs human:** ${result.needs_human} | **Reviewable:** ${reviewableNeedsHuman.length} | **Missing output:** ${missingOutputNeedsHuman.length} | **Failed:** ${result.failed} | **Skipped:** ${result.skipped}`,
+    `**Passed:** ${result.passed} | **Needs human:** ${result.needs_human} | **Reviewable:** ${reviewableNeedsHuman.length} | **Missing output:** ${missingOutputNeedsHuman.length} | **Failed:** ${result.failed} | **Skipped:** ${result.skipped} | **Provider infra skipped:** ${providerInfraSkipped.length}`,
     '',
   ];
 
-  if (failed.length > 0 || skipped.length > 0) {
+  if (failed.length > 0 || blockingSkipped.length > 0) {
     lines.push('## Blocking Cases', '');
-    for (const test of [...failed, ...skipped]) {
+    for (const test of [...failed, ...blockingSkipped]) {
+      appendCaseDetails(lines, test, { forceOpen: true });
+    }
+  }
+
+  if (providerInfraSkipped.length > 0) {
+    lines.push(
+      '## Provider Infrastructure Skips',
+      '',
+      'These provider-backed cases were skipped after retryable provider outages. They are not treated as Ricky product regressions.',
+      '',
+    );
+    for (const test of providerInfraSkipped) {
       appendCaseDetails(lines, test, { forceOpen: true });
     }
   }
@@ -128,6 +142,11 @@ function appendCaseDetails(lines, test, { forceOpen }) {
   lines.push('</details>', '');
 }
 
+function isProviderInfrastructureSkip(test) { // Returns true when a test result is a skip whose error text marks a transient provider-infrastructure outage.
+  if (test.status !== 'skipped') return false; // Any status other than 'skipped' never qualifies.
+  return String(test.error ?? '').startsWith('openrouter executor skipped; transient provider infrastructure unavailable'); // A null/undefined error coerces to '' and fails the prefix match. NOTE(review): presumably this prefix must stay in sync with the message the executor writes — confirm.
+}
+
 function appendRickyOutput(lines, test) {
   const actualContent = getCapturedOutput(test).trim();
   lines.push('**Ricky output**', '');

diff --git a/scripts/evals/ci-summary.mjs b/scripts/evals/ci-summary.mjs
@@ -23,6 +23,8 @@ const result = readResultJson(resultPath);
 
 const failed = result.tests.filter((test) => test.status === 'failed');
 const skipped = result.tests.filter((test) => test.status === 'skipped');
+const providerInfraSkipped = skipped.filter(isProviderInfrastructureSkip);
+const blockingSkipped = skipped.filter((test) => !isProviderInfrastructureSkip(test));
 const needsHuman = result.tests.filter((test) => test.status === 'needs-human');
 const reviewableNeedsHuman = needsHuman.filter(hasCapturedOutput);
 const missingOutputNeedsHuman = needsHuman.filter((test) => !hasCapturedOutput(test));
@@ -39,11 +41,14 @@ const lines = [
   `- Human cases missing Ricky output: ${missingOutputNeedsHuman.length}`,
   `- Failed: ${result.failed}`,
   `- Skipped: ${result.skipped}`,
+  `- Provider infrastructure skipped: ${providerInfraSkipped.length}`,
+  `- Blocking skipped: ${blockingSkipped.length}`,
   '',
 ];
 
 appendStatusSection(lines, 'Failed', failed);
-appendStatusSection(lines, 'Skipped', skipped);
+appendStatusSection(lines, 'Skipped', blockingSkipped);
+appendStatusSection(lines, 'Provider Infrastructure Skips', providerInfraSkipped);
 appendHumanReviewSection(lines, reviewableNeedsHuman, missingOutputNeedsHuman);
 
 const summary = `${lines.join('\n')}\n`;
@@ -53,7 +58,7 @@ if (process.env.GITHUB_STEP_SUMMARY) {
   writeFileSync(process.env.GITHUB_STEP_SUMMARY, summary, { flag: 'a' });
 }
 
-if (failed.length > 0 || skipped.length > 0 || missingOutputNeedsHuman.length > 0) {
+if (failed.length > 0 || blockingSkipped.length > 0 || missingOutputNeedsHuman.length > 0) {
   process.exitCode = 1;
 }
 
@@ -129,6 +134,11 @@ function getCapturedOutput(test) {
   );
 }
 
+function isProviderInfrastructureSkip(test) { // Returns true when a test result is a skip whose error text marks a transient provider-infrastructure outage.
+  if (test.status !== 'skipped') return false; // Any status other than 'skipped' never qualifies.
+  return String(test.error ?? '').startsWith('openrouter executor skipped; transient provider infrastructure unavailable'); // A null/undefined error coerces to '' and fails the prefix match. NOTE(review): presumably this prefix must stay in sync with the message the executor writes — confirm.
+}
+
 function findLatestRunDir() {
   if (!existsSync(RUNS_DIR)) return null;
   const runs = readdirSync(RUNS_DIR)