Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/product/ricky-skill-embedding-boundary.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ For strict TypeScript or proof-oriented workflow generation, the expected loaded

`writing-agent-relay-workflows` affects the generated workflow contract by shaping the dedicated channel, explicit agents, step dependencies, review stages, and final signoff. `relay-80-100-workflow` affects validation by shaping soft validation, review/fix/final-review flow, final hard validation, git diff, and regression gates. These are generation-time effects because they are materialized into the workflow text and deterministic metadata before any workflow runner launches agents.

The generated workflow also includes a deterministic `skill-boundary-metadata-gate`. This gate checks that the generated boundary metadata exists, records `generation_time_only`, names the loaded skills, includes the `generation_selection`, `generation_loading`, and applicable `generation_rendering` stages, and records effects such as `workflow_contract` and `validation_gates`. The gate proves the artifact carries the skill boundary forward as metadata; it does not prove runtime agents load skills.
The generated workflow materializes this boundary as context metadata, including `loaded-skills.txt`, `skill-matches.json`, and `skill-application-boundary.json`. Ricky verifies that metadata in generation tests rather than re-checking its own serialized files with runtime shell text matches. Runtime gates should focus on agent-produced artifacts, validation commands, scoped diff evidence, and blocker files.

## Runtime Boundary

Expand Down
2 changes: 1 addition & 1 deletion evals/suites/workflow-authoring/cases.jsonl
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
{"id":"workflow-authoring.distinct-reviewer","suite":"workflow-authoring","executor":"manual","kind":"regression","input":{"message":"Write a workflow that has Codex generate a convention update and then review it."},"expected":{"maxToolCalls":0,"must":["Assign a reviewer agent distinct from the writer when possible.","Persist significant review artifacts under `.workflow-artifacts/`.","Keep convention-only edits scoped to the declared convention files."],"mustNot":["Let the same agent both write and rubber-stamp the change without an explicit reason.","Skip deterministic file-existence, grep, symlink, or scoped change-detection checks.","Edit unrelated package metadata or generated workflows for a convention-only request."],"humanReviewRequired":true},"tags":["workflow-authoring","review"]}
{"id":"workflow-authoring.fresh-eyes-loop-simple-test","suite":"workflow-authoring","executor":"ricky-cli","kind":"regression","input":{"message":"Generate a small Agent Relay workflow that adds one missing Vitest unit test for a TypeScript helper and proves the test passes."},"expected":{"ok":true,"contentIncludes":["stage\": \"generate","status\": \"ok","--- GENERATED ARTIFACT:",".agent(\"reviewer-claude\"",".agent(\"validator-claude\"",".agent(\"reviewer-codex\"",".agent(\"validator-codex\"","verdict: FINDINGS | NO_ISSUES_FOUND | BLOCKED","add or update appropriate tests, fixtures, assertions, or deterministic proofs","dependsOn: [\"final-fix-codex\"]"],"contentMatches":["\\.step\\(\"review-claude\"[\\s\\S]*\\.step\\(\"fix-loop\"[\\s\\S]*\\.step\\(\"final-review-claude\"[\\s\\S]*\\.step\\(\"final-fix-claude\"[\\s\\S]*\\.step\\(\"review-codex\"[\\s\\S]*\\.step\\(\"fix-loop-codex\"[\\s\\S]*\\.step\\(\"final-review-codex\"[\\s\\S]*\\.step\\(\"final-fix-codex\"[\\s\\S]*\\.step\\(\"final-review-pass-gate\"[\\s\\S]*\\.step\\(\"final-hard-validation\""],"forbidPhrases":["TypeError","ReferenceError","needs_clarification"],"maxToolCalls":1,"must":["Include the mandatory fresh-eyes review/fix loop even though the workflow is small.","Run the loop in this order: Claude review, Claude fix, Claude final review, Claude final fix, then Codex review, Codex fix, Codex final review, Codex final fix.","Require review output to use a structured verdict such as `FINDINGS`, `NO_ISSUES_FOUND`, or `BLOCKED`.","Require fix steps to add or update tests, fixtures, assertions, or deterministic proof for testable findings.","Put final deterministic acceptance after the Codex final fix."],"mustNot":["Treat the first passing test run as a substitute for fresh-eyes review.","Run Claude and Codex reviews in parallel before fixing.","Collapse all findings into one generic fix step with no final re-review.","Commit, open a PR, or hand off before the Codex loop 
finishes."],"humanReviewRequired":false},"tags":["workflow-authoring","review","tests","fresh-eyes"],"mock":{"cwd":"temp","specFileContent":"Generate a small Agent Relay workflow that adds one missing Vitest unit test for a TypeScript helper and proves the test passes.","argv":"--mode local --spec-file {{specFile}} --no-run --json --no-workforce-persona","includeGeneratedArtifacts":true}}
{"id":"workflow-authoring.fresh-eyes-loop-medium-source-and-test","suite":"workflow-authoring","executor":"ricky-cli","kind":"regression","input":{"message":"Generate a Ricky workflow that changes one source file and one test file for a CLI parsing bug, with scoped diff evidence and a targeted Vitest command."},"expected":{"ok":true,"contentIncludes":["stage\": \"generate","status\": \"ok","src/surfaces/cli/flows/power-user-parser.ts","src/surfaces/cli/flows/power-user-parser.test.ts","npx vitest run src/surfaces/cli/flows/power-user-parser.test.ts","git diff --name-only","git ls-files --others --exclude-standard","review-claude.md","final-review-codex.md","codex-final-fix.md","dependsOn: [\"final-fix-codex\"]"],"contentMatches":["\\.step\\(\"review-claude\"[\\s\\S]*\\.step\\(\"fix-loop\"[\\s\\S]*\\.step\\(\"final-review-claude\"[\\s\\S]*\\.step\\(\"final-fix-claude\"[\\s\\S]*\\.step\\(\"review-codex\"[\\s\\S]*\\.step\\(\"fix-loop-codex\"[\\s\\S]*\\.step\\(\"final-review-codex\"[\\s\\S]*\\.step\\(\"final-fix-codex\"[\\s\\S]*\\.step\\(\"final-review-pass-gate\"[\\s\\S]*\\.step\\(\"final-hard-validation\""],"forbidPhrases":["TypeError","ReferenceError","needs_clarification"],"maxToolCalls":1,"must":["Preserve the Claude-then-Codex review/fix/final-review/final-fix order before final acceptance.","Keep deterministic file gates and scoped `git diff --name-only` / untracked-file checks limited to the declared source and test targets.","Feed review findings into fix steps and require fixers to harden tests when findings are testable.","Write review, fix, final-review, final-fix, validation, and signoff artifacts under `.workflow-artifacts/`."],"mustNot":["Use broad repo-wide change detection as the only proof.","Allow a single reviewer to rubber-stamp its own work without a distinct fresh-eyes pass.","Skip the Codex final review/fix loop because Claude already reviewed.","Move final hard validation before the Codex final 
fix."],"humanReviewRequired":false},"tags":["workflow-authoring","review","generation","fresh-eyes"],"mock":{"cwd":"temp","specFileContent":"Generate a Ricky workflow that changes one source file and one test file for a CLI parsing bug.\\n\\n## Target Files\\n\\n- src/surfaces/cli/flows/power-user-parser.ts\\n- src/surfaces/cli/flows/power-user-parser.test.ts\\n\\n## Acceptance\\n\\nRun `npx vitest run src/surfaces/cli/flows/power-user-parser.test.ts`.","argv":"--mode local --spec-file {{specFile}} --no-run --json --no-workforce-persona","includeGeneratedArtifacts":true}}
{"id":"workflow-authoring.fresh-eyes-loop-complex-multitrack","suite":"workflow-authoring","executor":"ricky-cli","kind":"capability","input":{"message":"Generate a serious multi-track master executor workflow for three independent product slices: runtime evidence, CLI status copy, and generation validation. Each track owns separate files and the final workflow may create a PR."},"expected":{"ok":true,"contentIncludes":["stage\": \"generate","status\": \"ok","RICKY_MASTER_EXECUTOR_WORKFLOW","Master plan:","RICKY_CHILD_WORKFLOW_COMPLETE","review-claude","final-fix-codex","RICKY_CHILD_FRESH_EYES_LOOP_READY","BLOCKED_NO_COMMIT"],"contentMatches":["review-claude[\\s\\S]*fix-loop[\\s\\S]*final-review-claude[\\s\\S]*final-fix-claude[\\s\\S]*review-codex[\\s\\S]*fix-loop-codex[\\s\\S]*final-review-codex[\\s\\S]*final-fix-codex[\\s\\S]*final-review-pass-gate[\\s\\S]*final-hard-validation"],"forbidPhrases":["TypeError","ReferenceError","needs_clarification"],"maxToolCalls":1,"must":["Use a coordination shape that fits independent tracks while preserving deterministic gates after every editing step.","Ensure each implementation track or child workflow has the mandatory Claude-then-Codex fresh-eyes review/fix loop before track signoff.","Run final deterministic acceptance only after all Codex final fixes and post-fix reviews have completed.","Use `BLOCKED_NO_COMMIT` with evidence when a finding cannot be fixed, and skip commit or PR creation in that state.","Use the GitHub primitive for PR creation when shipping is in scope."],"mustNot":["Put one global review at the end and call that sufficient for all tracks.","Serialize independent tracks without a stated dependency reason.","Let PR creation, commit, or handoff race ahead of unresolved review findings.","Present tests, typecheck, or dry-run alone as the complete proof bar."],"humanReviewRequired":false},"tags":["workflow-authoring","review","multitrack","fresh-eyes"],"mock":{"cwd":"temp","specFileContent":"Generate a 
serious multi-track workflow for three independent product slices as smaller workflows run by a master executor: runtime evidence, CLI status copy, and generation validation. Each track owns separate files and the final workflow may create a PR.\\n\\nUse independent child workflows with deterministic validation, fresh-eyes review/fix loops, and GitHub primitive PR creation when shipping is in scope.","argv":"--mode local --spec-file {{specFile}} --no-run --json --no-workforce-persona","includeGeneratedArtifacts":true}}
{"id":"workflow-authoring.fresh-eyes-loop-complex-multitrack","suite":"workflow-authoring","executor":"ricky-cli","kind":"capability","input":{"message":"Generate a serious multi-track master executor workflow for three independent product slices: runtime evidence, CLI status copy, and generation validation. Each track owns separate files and the final workflow may create a PR."},"expected":{"ok":true,"contentIncludes":["stage\": \"generate","status\": \"ok","RICKY_MASTER_EXECUTOR_WORKFLOW","Master plan:","final-signoff","review-claude","final-fix-codex","RICKY_CHILD_FINAL_REVIEW_FILES_READY","BLOCKED_NO_COMMIT"],"contentMatches":["review-claude[\\s\\S]*fix-loop[\\s\\S]*final-review-claude[\\s\\S]*final-fix-claude[\\s\\S]*review-codex[\\s\\S]*fix-loop-codex[\\s\\S]*final-review-codex[\\s\\S]*final-fix-codex[\\s\\S]*final-review-pass-gate[\\s\\S]*final-hard-validation"],"forbidPhrases":["TypeError","ReferenceError","needs_clarification"],"maxToolCalls":1,"must":["Use a coordination shape that fits independent tracks while preserving deterministic gates after every editing step.","Ensure each implementation track or child workflow has the mandatory Claude-then-Codex fresh-eyes review/fix loop before track signoff.","Run final deterministic acceptance only after all Codex final fixes and post-fix reviews have completed.","Use `BLOCKED_NO_COMMIT` with evidence when a finding cannot be fixed, and skip commit or PR creation in that state.","Use the GitHub primitive for PR creation when shipping is in scope."],"mustNot":["Put one global review at the end and call that sufficient for all tracks.","Serialize independent tracks without a stated dependency reason.","Let PR creation, commit, or handoff race ahead of unresolved review findings.","Present tests, typecheck, or dry-run alone as the complete proof bar."],"humanReviewRequired":false},"tags":["workflow-authoring","review","multitrack","fresh-eyes"],"mock":{"cwd":"temp","specFileContent":"Generate a serious 
multi-track workflow for three independent product slices as smaller workflows run by a master executor: runtime evidence, CLI status copy, and generation validation. Each track owns separate files and the final workflow may create a PR.\\n\\nUse independent child workflows with deterministic validation, fresh-eyes review/fix loops, and GitHub primitive PR creation when shipping is in scope.","argv":"--mode local --spec-file {{specFile}} --no-run --json --no-workforce-persona","includeGeneratedArtifacts":true}}
{"id":"workflow-authoring.no-silent-mode-fallback","suite":"workflow-authoring","executor":"manual","kind":"regression","input":{"message":"Create a workflow for a user who asked to run in Cloud, but Cloud auth is missing."},"expected":{"maxToolCalls":0,"must":["Detect and report the missing Cloud readiness or auth condition before any expensive generation or run step.","Ask for an explicit user decision before switching to local/BYOH execution.","Preserve the requested execution mode in the workflow context and user-facing summary."],"mustNot":["Silently fall back from Cloud to local.","Claim a provider, account, credential, or integration is connected without a deterministic check.","Hide mode changes inside generic \"auto\" wording."],"humanReviewRequired":true},"tags":["workflow-authoring","local","cloud"]}
{"id":"workflow-authoring.agent-assistant-boundary","suite":"workflow-authoring","executor":"manual","kind":"capability","input":{"message":"Update Ricky to reuse a new Agent Assistant primitive while preserving Ricky-owned local execution behavior."},"expected":{"maxToolCalls":0,"must":["Reuse the shared Agent Assistant package for neutral assistant/runtime mechanics where appropriate.","State the Ricky-owned behavior that must remain local, including workflow generation, LocalResponse, blocker taxonomy, recovery wording, and evidence semantics.","Add proof that the shared primitive is exercised in a real Ricky path, not only imported or documented."],"mustNot":["Move product-specific Ricky execution contracts into Agent Assistant without an explicit proof boundary.","Overclaim broad Agent Assistant adoption from a narrow adapter change.","Replace Ricky's local blocker and recovery contract with generic assistant output."],"humanReviewRequired":true},"tags":["workflow-authoring","agent-assistant","boundary"]}
{"id":"workflow-authoring.evidence-trail","suite":"workflow-authoring","executor":"manual","kind":"capability","input":{"message":"Design a workflow that watches a long-running workflow, diagnoses a failure, attempts a safe repair, and reports the outcome."},"expected":{"maxToolCalls":0,"must":["Preserve an evidence trail that names commands, artifacts, failed steps, log locations, assertions, and side effects.","Distinguish successful repair, actionable blocker, unsupported condition, and unrecoverable error.","Include resumability guidance such as failed step, previous run id, or exact rerun command when available."],"mustNot":["Claim the workflow succeeded when a blocker or missing dependency stopped execution.","Drop log paths or side-effect summaries from the final outcome.","Retry destructive or credentialed actions without explicit authorization."],"humanReviewRequired":true},"tags":["workflow-authoring","evidence"]}
Expand Down
4 changes: 2 additions & 2 deletions evals/suites/workflow-authoring/cases.md
Original file line number Diff line number Diff line change
Expand Up @@ -168,10 +168,10 @@ contentIncludes:
- status": "ok
- RICKY_MASTER_EXECUTOR_WORKFLOW
- Master plan:
- RICKY_CHILD_WORKFLOW_COMPLETE
- final-signoff
- review-claude
- final-fix-codex
- RICKY_CHILD_FRESH_EYES_LOOP_READY
- RICKY_CHILD_FINAL_REVIEW_FILES_READY
- BLOCKED_NO_COMMIT
contentMatches:
- review-claude[\s\S]*fix-loop[\s\S]*final-review-claude[\s\S]*final-fix-claude[\s\S]*review-codex[\s\S]*fix-loop-codex[\s\S]*final-review-codex[\s\S]*final-fix-codex[\s\S]*final-review-pass-gate[\s\S]*final-hard-validation
Expand Down
25 changes: 22 additions & 3 deletions scripts/evals/ci-review-comment.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ if (process.env.GITHUB_TOKEN && process.env.GITHUB_REPOSITORY && process.env.PR_
function renderComment({ result, runDir }) {
const failed = result.tests.filter((test) => test.status === 'failed');
const skipped = result.tests.filter((test) => test.status === 'skipped');
const providerInfraSkipped = skipped.filter(isProviderInfrastructureSkip);
const blockingSkipped = skipped.filter((test) => !isProviderInfrastructureSkip(test));
const needsHuman = result.tests.filter((test) => test.status === 'needs-human');
const reviewableNeedsHuman = needsHuman.filter(hasCapturedOutput);
const missingOutputNeedsHuman = needsHuman.filter((test) => !hasCapturedOutput(test));
Expand All @@ -50,13 +52,25 @@ function renderComment({ result, runDir }) {
`Mode: \`${result.mode}\``,
`Git SHA: \`${result.git_sha}\``,
'',
`**Passed:** ${result.passed} | **Needs human:** ${result.needs_human} | **Reviewable:** ${reviewableNeedsHuman.length} | **Missing output:** ${missingOutputNeedsHuman.length} | **Failed:** ${result.failed} | **Skipped:** ${result.skipped}`,
`**Passed:** ${result.passed} | **Needs human:** ${result.needs_human} | **Reviewable:** ${reviewableNeedsHuman.length} | **Missing output:** ${missingOutputNeedsHuman.length} | **Failed:** ${result.failed} | **Skipped:** ${result.skipped} | **Provider infra skipped:** ${providerInfraSkipped.length}`,
'',
];

if (failed.length > 0 || skipped.length > 0) {
if (failed.length > 0 || blockingSkipped.length > 0) {
lines.push('## Blocking Cases', '');
for (const test of [...failed, ...skipped]) {
for (const test of [...failed, ...blockingSkipped]) {
appendCaseDetails(lines, test, { forceOpen: true });
}
}

if (providerInfraSkipped.length > 0) {
lines.push(
'## Provider Infrastructure Skips',
'',
'These provider-backed cases were skipped after retryable provider outages. They are not treated as Ricky product regressions.',
'',
);
for (const test of providerInfraSkipped) {
appendCaseDetails(lines, test, { forceOpen: true });
}
}
Expand Down Expand Up @@ -128,6 +142,11 @@ function appendCaseDetails(lines, test, { forceOpen }) {
lines.push('</details>', '');
}

function isProviderInfrastructureSkip(test) {
if (test.status !== 'skipped') return false;
return String(test.error ?? '').startsWith('openrouter executor skipped; transient provider infrastructure unavailable');
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2: Provider-infrastructure skip detection is string-literal duplicated instead of using a shared source, making skip classification brittle to message drift.

Prompt for AI agents
Check if this issue is valid — if so, understand the root cause and fix it. At scripts/evals/ci-review-comment.mjs, line 147:

<comment>Provider-infrastructure skip detection is string-literal duplicated instead of using a shared source, making skip classification brittle to message drift.</comment>

<file context>
@@ -128,6 +142,11 @@ function appendCaseDetails(lines, test, { forceOpen }) {
 
+function isProviderInfrastructureSkip(test) {
+  if (test.status !== 'skipped') return false;
+  return String(test.error ?? '').startsWith('openrouter executor skipped; transient provider infrastructure unavailable');
+}
+
</file context>

}

function appendRickyOutput(lines, test) {
const actualContent = getCapturedOutput(test).trim();
lines.push('**Ricky output**', '');
Expand Down
14 changes: 12 additions & 2 deletions scripts/evals/ci-summary.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ const result = readResultJson(resultPath);

const failed = result.tests.filter((test) => test.status === 'failed');
const skipped = result.tests.filter((test) => test.status === 'skipped');
const providerInfraSkipped = skipped.filter(isProviderInfrastructureSkip);
const blockingSkipped = skipped.filter((test) => !isProviderInfrastructureSkip(test));
const needsHuman = result.tests.filter((test) => test.status === 'needs-human');
const reviewableNeedsHuman = needsHuman.filter(hasCapturedOutput);
const missingOutputNeedsHuman = needsHuman.filter((test) => !hasCapturedOutput(test));
Expand All @@ -39,11 +41,14 @@ const lines = [
`- Human cases missing Ricky output: ${missingOutputNeedsHuman.length}`,
`- Failed: ${result.failed}`,
`- Skipped: ${result.skipped}`,
`- Provider infrastructure skipped: ${providerInfraSkipped.length}`,
`- Blocking skipped: ${blockingSkipped.length}`,
'',
];

appendStatusSection(lines, 'Failed', failed);
appendStatusSection(lines, 'Skipped', skipped);
appendStatusSection(lines, 'Skipped', blockingSkipped);
appendStatusSection(lines, 'Provider Infrastructure Skips', providerInfraSkipped);
appendHumanReviewSection(lines, reviewableNeedsHuman, missingOutputNeedsHuman);

const summary = `${lines.join('\n')}\n`;
Expand All @@ -53,7 +58,7 @@ if (process.env.GITHUB_STEP_SUMMARY) {
writeFileSync(process.env.GITHUB_STEP_SUMMARY, summary, { flag: 'a' });
}

if (failed.length > 0 || skipped.length > 0 || missingOutputNeedsHuman.length > 0) {
if (failed.length > 0 || blockingSkipped.length > 0 || missingOutputNeedsHuman.length > 0) {
process.exitCode = 1;
}

Expand Down Expand Up @@ -129,6 +134,11 @@ function getCapturedOutput(test) {
);
}

/**
 * Tells whether a test result is a skip caused by the provider being
 * unavailable rather than a skip that should count as blocking.
 *
 * A result qualifies only when its status is `skipped` and its error text
 * begins with the openrouter "transient provider infrastructure unavailable"
 * message; a missing (null/undefined) error is treated as an empty string.
 *
 * @param {{ status: string, error?: unknown }} test - One entry of `result.tests`.
 * @returns {boolean} `true` for a provider-infrastructure skip, otherwise `false`.
 */
function isProviderInfrastructureSkip(test) {
  const providerSkipPrefix = 'openrouter executor skipped; transient provider infrastructure unavailable';
  // Short-circuit on status first so the error text is only inspected for skipped cases.
  return test.status === 'skipped' && String(test.error ?? '').startsWith(providerSkipPrefix);
}

function findLatestRunDir() {
if (!existsSync(RUNS_DIR)) return null;
const runs = readdirSync(RUNS_DIR)
Expand Down
Loading
Loading