diff --git a/.msd/autofix-findings-summary.txt b/.msd/autofix-findings-summary.txt new file mode 100644 index 000000000..bbcc95c4e --- /dev/null +++ b/.msd/autofix-findings-summary.txt @@ -0,0 +1,10 @@ +1. [MEDIUM] packages/sdk/src/runner.ts — packages/sdk/src/runner.ts +2. [MEDIUM] packages/sdk/src/runner.ts — packages/sdk/src/runner.ts +3. [LOW] packages/sdk/src/runner.ts — packages/sdk/src/runner.ts +4. [LOW] packages/sdk/src/runner.ts — packages/sdk/src/runner.ts +5. [LOW] packages/sdk/src/runner.ts — packages/sdk/src/runner.ts +6. [LOW] packages/sdk/src/runner.ts — packages/sdk/src/runner.ts +7. [LOW] packages/sdk/src/cli.ts — packages/sdk/src/cli.ts +8. [LOW] packages/sdk/src/cli.ts — packages/sdk/src/cli.ts +9. [LOW] packages/sdk/src/file-db.ts — packages/sdk/src/file-db.ts +10. [LOW] packages/sdk/src/file-db.ts — packages/sdk/src/file-db.ts diff --git a/.msd/autofix-plan.json b/.msd/autofix-plan.json new file mode 100644 index 000000000..7a78bf66d --- /dev/null +++ b/.msd/autofix-plan.json @@ -0,0 +1,49 @@ +{ + "groups": [ + { + "id": "group-1", + "label": "runner.ts reliability and readability fixes", + "domain": "reliability", + "findings": [ + "packages/sdk/src/runner.ts-Silent failure: optional config param on resume()-historian-review, developer-cross-review, security-cross-review-medium", + "packages/sdk/src/runner.ts-Synthetic timestamps in reconstructed runs-historian-review, developer-cross-review-medium", + "packages/sdk/src/runner.ts-Path traversal via unsanitized runId-security-review, all cross-reviewers-low", + "packages/sdk/src/runner.ts-failedStepName heuristic asymmetry with startFrom-historian-review, developer-cross-review-low", + "packages/sdk/src/runner.ts-.report.json files invisible to cache reconstruction-historian-review, developer-cross-review-low", + "packages/sdk/src/runner.ts-Nested ternary workflow selection readability-developer-review, all cross-reviewers-low" + ], + "files": [ + "packages/sdk/src/runner.ts" + ], + "rationale": "All 6 findings target runner.ts — grouping avoids file conflicts and lets one worker handle related resume/reconstruction logic" + }, + { + "id": "group-2", + "label": "cli.ts flag parsing and config version fixes", + "domain": "code-quality", + "findings": [ + "packages/sdk/src/cli.ts-String matching brittleness for flag parsing-developer-review, historian-cross-review-low", + "packages/sdk/src/cli.ts-Config version mismatch during resume-historian-cross-review-low" + ], + "files": [ + "packages/sdk/src/cli.ts" + ], + "rationale": "Both findings target cli.ts — same file, related CLI parsing concerns" + }, + { + "id": "group-3", + "label": "file-db.ts output validation and error handling", + "domain": "reliability", + "findings": [ + "packages/sdk/src/file-db.ts-hasStepOutputs false positive potential-historian-review-low", + "packages/sdk/src/file-db.ts-File path disclosure in append() catch — DISPUTED NON-ISSUE-security-review (disputed by security-cross-review, developer-cross-review)-low" + ], + "files": [ + "packages/sdk/src/file-db.ts" + ], + "rationale": "Both findings target file-db.ts — same file, related data integrity concerns" + } + ], + "totalGroups": 3, + "conflictCheck": "no file appears in multiple groups" +} \ No newline at end of file diff --git a/.trajectories/active/traj_1774617159310_87c5f71c.json b/.trajectories/active/traj_1774617159310_87c5f71c.json new file mode 100644 index 000000000..01569663d --- /dev/null +++ b/.trajectories/active/traj_1774617159310_87c5f71c.json @@ -0,0 +1,406 @@ +{ + "id": "traj_1774617159310_87c5f71c", + "version": 1, + "task": { + "title": "autofix-swarm-Agentworkforce-relay-workflow", + "source": { + "system": "workflow-runner", + "id": "8f29a399f4d82a7d88bf39bb" + } + }, + "status": "active", + "startedAt": "2026-03-27T13:12:39.310Z", + "agents": [ + { + "name": "orchestrator", + "role": "workflow-runner", + "joinedAt": "2026-03-27T13:12:39.310Z" + }, + { + "name": "lead", + "role": "specialist", + "joinedAt": "2026-03-27T13:12:43.433Z" + }, + { + "name": "fix-worker-1", + "role": "specialist", + "joinedAt": "2026-03-27T13:14:23.398Z" + }, + { + "name": "fix-worker-2", + "role": "specialist", + "joinedAt": "2026-03-27T13:14:25.399Z" + }, + { + "name": "fix-worker-3", + "role": "specialist", + "joinedAt": "2026-03-27T13:14:27.399Z" + }, + { + "name": "fix-worker-4", + "role": "specialist", + "joinedAt": "2026-03-27T13:14:29.399Z" + }, + { + "name": "verifier", + "role": "specialist", + "joinedAt": "2026-03-27T13:18:31.200Z" + } + ], + "chapters": [ + { + "id": "ch_47b43958", + "title": "Planning", + "agentName": "orchestrator", + "startedAt": "2026-03-27T13:12:39.310Z", + "events": [ + { + "ts": 1774617159310, + "type": "note", + "content": "Purpose: Swarm autofix: 10 findings for Agentworkforce/relay (source: pr_review)" + }, + { + "ts": 1774617159310, + "type": "note", + "content": "Approach: 14-step dag workflow — Parsed 14 steps, 2 parallel tracks, 12 dependent steps, DAG validated, no cycles" + } + ], + "endedAt": "2026-03-27T13:12:43.285Z" + }, + { + "id": "ch_0ffd568d", + "title": "Execution: init-msd-dir, read-context", + "agentName": "orchestrator", + "startedAt": "2026-03-27T13:12:43.285Z", + "events": [], + "endedAt": "2026-03-27T13:12:43.414Z" + }, + { + "id": "ch_181743b4", + "title": "Convergence: init-msd-dir + read-context", + "agentName": "orchestrator", + "startedAt": "2026-03-27T13:12:43.414Z", + "events": [ + { + "ts": 1774617163415, + "type": "reflection", + "content": "init-msd-dir + read-context resolved. 2/2 steps completed. All steps completed on first attempt. Unblocking: write-findings, plan.", + "significance": "high", + "raw": { + "confidence": 0.75, + "focalPoints": [ + "init-msd-dir: completed", + "read-context: completed" + ] + } + } + ], + "endedAt": "2026-03-27T13:12:43.433Z" + }, + { + "id": "ch_df21ba53", + "title": "Execution: plan", + "agentName": "lead", + "startedAt": "2026-03-27T13:12:43.433Z", + "events": [ + { + "ts": 1774617163433, + "type": "note", + "content": "\"plan\": You are the LEAD ARCHITECT for an autofix session on Agentworkforce/relay", + "raw": { + "agent": "lead" + } + }, + { + "ts": 1774617263325, + "type": "completion-marker", + "content": "\"plan\" marker-based completion — Legacy STEP_COMPLETE marker observed (6 signal(s), 1 relevant channel post(s), 3 file change(s); signals=plan, COMPLETE, >0q>4m0q>4m0q>4m **Status:** ✅ Completed +> **Confidence:** 83% +> **Started:** March 26, 2026 at 11:14 PM +> **Completed:** March 26, 2026 at 11:17 PM + +--- + +## Summary + +Added resume fallback tests covering cache reconstruction cases and file-db append diagnostics. + +**Approach:** Standard approach + +--- + +## Key Decisions + +### Added targeted resume-fallback and file-db diagnostics tests before implementation +- **Chose:** Added targeted resume-fallback and file-db diagnostics tests before implementation +- **Reasoning:** These tests codify the new resume reconstruction contract and preserve the existing DB happy path while exposing the current missing fallback behavior. + +--- + +## Chapters + +### 1. Work +*Agent: default* + +- Added targeted resume-fallback and file-db diagnostics tests before implementation: Added targeted resume-fallback and file-db diagnostics tests before implementation diff --git a/.trajectories/completed/traj_1774563046213_6359d1ff.json b/.trajectories/completed/traj_1774563046213_6359d1ff.json new file mode 100644 index 000000000..de34079ae --- /dev/null +++ b/.trajectories/completed/traj_1774563046213_6359d1ff.json @@ -0,0 +1,463 @@ +{ + "id": "traj_1774563046213_6359d1ff", + "version": 1, + "task": { + "title": "fix-resume-fallback-workflow", + "source": { + "system": "workflow-runner", + "id": "beaff872dcafd78fa86bc1f7" + } + }, + "status": "completed", + "startedAt": "2026-03-26T22:10:46.213Z", + "agents": [ + { + "name": "orchestrator", + "role": "workflow-runner", + "joinedAt": "2026-03-26T22:10:46.213Z" + }, + { + "name": "architect", + "role": "specialist", + "joinedAt": "2026-03-26T22:11:06.721Z" + }, + { + "name": "runner-worker", + "role": "specialist", + "joinedAt": "2026-03-26T22:14:19.775Z" + }, + { + "name": "db-worker", + "role": "specialist", + "joinedAt": "2026-03-26T22:14:19.775Z" + }, + { + "name": "test-worker", + "role": "specialist", + "joinedAt": "2026-03-26T22:14:19.775Z" + }, + { + "name": "reviewer", + "role": "specialist", + "joinedAt": "2026-03-26T22:20:07.221Z" + } + ], + "chapters": [ + { + "id": "ch_ebc1566f", + "title": "Planning", + "agentName": "orchestrator", + "startedAt": "2026-03-26T22:10:46.213Z", + "events": [ + { + "ts": 1774563046213, + "type": "note", + "content": "Purpose: Fix --resume to fall back to step-output cache when workflow-runs.jsonl is missing" + }, + { + "ts": 1774563046213, + "type": "note", + "content": "Approach: 16-step dag workflow — Parsed 16 steps, 8 parallel tracks, 8 dependent steps, DAG validated, no cycles" + } + ], + "endedAt": "2026-03-26T22:10:51.529Z" + }, + { + "id": "ch_0c0edcfe", + "title": "Execution: read-runner-resume, read-runner-execute, read-runner-fs-helpers, read-file-db, read-types, read-cli, read-existing-tests, read-runner-constructor", + "agentName": "orchestrator", + "startedAt": "2026-03-26T22:10:51.529Z", + "events": [], + "endedAt": "2026-03-26T22:11:06.718Z" + }, + { + "id": "ch_5e690eeb", + "title": "Convergence: read-runner-resume + read-runner-execute + read-runner-fs-helpers + read-file-db + read-types + read-cli + read-existing-tests + read-runner-constructor", + "agentName": "orchestrator", + "startedAt": "2026-03-26T22:11:06.718Z", + "events": [ + { + "ts": 1774563066720, + "type": "reflection", + "content": "read-runner-resume + read-runner-execute + read-runner-fs-helpers + read-file-db + read-types + read-cli + read-existing-tests + read-runner-constructor resolved. 8/8 steps completed. All steps completed on first attempt. Unblocking: design.", + "significance": "high", + "raw": { + "confidence": 0.75, + "focalPoints": [ + "read-runner-resume: completed", + "read-runner-execute: completed", + "read-runner-fs-helpers: completed", + "read-file-db: completed", + "read-types: completed", + "read-cli: completed", + "read-existing-tests: completed", + "read-runner-constructor: completed" + ] + } + } + ], + "endedAt": "2026-03-26T22:11:06.721Z" + }, + { + "id": "ch_3eefedd2", + "title": "Execution: design", + "agentName": "architect", + "startedAt": "2026-03-26T22:11:06.721Z", + "events": [ + { + "ts": 1774563066721, + "type": "note", + "content": "\"design\": Design the fix for --resume falling back to the step-output filesystem cache", + "raw": { + "agent": "architect" + } + }, + { + "ts": 1774563259772, + "type": "completion-marker", + "content": "\"design\" marker-based completion — Legacy STEP_COMPLETE marker observed (5 signal(s), 1 relevant channel post(s), 2 file change(s); signals=**Design complete.** Written to `docs/resume-fallback-design.md`., design, COMPLETE, >1u>4;2m>0q>4m1u>4;2m▗ ▗ ▖ ▖ Claude Code v2.1.84, DESIGN_COMPLETE; channel=**Design complete.** Written to `docs/resume-fallback-design.md`.\n\n**Summary of approach:**\n\n1. **New method** `reconstructRunFromCache(runId, config, workflowN; files=modified:.claude/settings.json, created:docs/resume-fallback-design.md)", + "significance": "medium", + "raw": { + "stepName": "design", + "completionMode": "marker", + "reason": "Legacy STEP_COMPLETE marker observed", + "evidence": { + "summary": "5 signal(s), 1 relevant channel post(s), 2 file change(s)", + "signals": [ + "**Design complete.** Written to `docs/resume-fallback-design.md`.", + "design", + "COMPLETE", + ">1u>4;2m>0q>4m1u>4;2m▗ ▗ ▖ ▖ Claude Code v2.1.84", + "DESIGN_COMPLETE" + ], + "channelPosts": [ + "**Design complete.** Written to `docs/resume-fallback-design.md`.\n\n**Summary of approach:**\n\n1. **New method** `reconstructRunFromCache(runId, config, workflowN" + ], + "files": [ + "modified:.claude/settings.json", + "created:docs/resume-fallback-design.md" + ] + } + } + }, + { + "ts": 1774563259773, + "type": "finding", + "content": "\"design\" completed → ✢", + "significance": "medium" + } + ], + "endedAt": "2026-03-26T22:14:19.775Z" + }, + { + "id": "ch_fa296109", + "title": "Execution: impl-resume-fallback, impl-db-diagnostics, impl-tests", + "agentName": "orchestrator", + "startedAt": "2026-03-26T22:14:19.775Z", + "events": [], + "endedAt": "2026-03-26T22:14:19.776Z" + }, + { + "id": "ch_b06d79ad", + "title": "Execution: impl-resume-fallback", + "agentName": "runner-worker", + "startedAt": "2026-03-26T22:14:19.776Z", + "events": [ + { + "ts": 1774563259776, + "type": "note", + "content": "\"impl-resume-fallback\": Implement the resume fallback in runner.ts", + "raw": { + "agent": "runner-worker" + } + } + ], + "endedAt": "2026-03-26T22:14:19.776Z" + }, + { + "id": "ch_a9803898", + "title": "Execution: impl-db-diagnostics", + "agentName": "db-worker", + "startedAt": "2026-03-26T22:14:19.776Z", + "events": [ + { + "ts": 1774563259776, + "type": "note", + "content": "\"impl-db-diagnostics\": Improve file-db.ts diagnostics and add CLI warning improvements", + "raw": { + "agent": "db-worker" + } + } + ], + "endedAt": "2026-03-26T22:14:19.776Z" + }, + { + "id": "ch_a5d7528c", + "title": "Execution: impl-tests", + "agentName": "test-worker", + "startedAt": "2026-03-26T22:14:19.776Z", + "events": [ + { + "ts": 1774563259776, + "type": "note", + "content": "\"impl-tests\": Write tests for the resume fallback behavior", + "raw": { + "agent": "test-worker" + } + }, + { + "ts": 1774563332201, + "type": "completion-evidence", + "content": "\"impl-db-diagnostics\" verification-based completion — Verification passed (6 signal(s), 2 file change(s), exit=0; signals=0, design, COMPLETE, 2026-03-26T22:14:22.209873Z ERROR codex_core::codex: failed to load skill /Users/khaliqgant/Projects/AgentWorkforce/relay/.agents/skills/creating-agent-skills-skill/SKILL.md: missing YAML frontmatter delimited by ---, 2026-03-26T22:14:22.209873Z ERROR codex_core::codex: failed to load skill /Users/khaliqgant/Projects/AgentWorkforce/relay/.agents/skills/creating-agent-skills-skill/SKILL.md: missing YAML frontmatter delimited by ---, Verification passed; files=modified:packages/sdk/src/workflows/cli.ts, modified:packages/sdk/src/workflows/file-db.ts; exit=0)", + "significance": "medium", + "raw": { + "stepName": "impl-db-diagnostics", + "completionMode": "verification", + "reason": "Verification passed", + "evidence": { + "summary": "6 signal(s), 2 file change(s), exit=0", + "signals": [ + "0", + "design", + "COMPLETE", + "2026-03-26T22:14:22.209873Z ERROR codex_core::codex: failed to load skill /Users/khaliqgant/Projects/AgentWorkforce/relay/.agents/skills/creating-agent-skills-skill/SKILL.md: missing YAML frontmatter delimited by ---", + "2026-03-26T22:14:22.209873Z ERROR codex_core::codex: failed to load skill /Users/khaliqgant/Projects/AgentWorkforce/relay/.agents/skills/creating-agent-skills-skill/SKILL.md: missing YAML frontmatter delimited by ---", + "Verification passed" + ], + "files": [ + "modified:packages/sdk/src/workflows/cli.ts", + "modified:packages/sdk/src/workflows/file-db.ts" + ], + "exitCode": 0 + } + } + }, + { + "ts": 1774563332201, + "type": "finding", + "content": "\"impl-db-diagnostics\" completed → Implemented the requested changes.\n\nChanges written to disk:\n- [file-db.ts](/Users/khaliqgant/Projects/AgentWorkforce/re", + "significance": "medium" + }, + { + "ts": 1774563483555, + "type": "completion-evidence", + "content": "\"impl-tests\" verification-based completion — Verification passed (6 signal(s), 3 file change(s), exit=0; signals=design, unknown, COMPLETE, APPROVE, 2026-03-26T22:14:22.190950Z ERROR codex_core::codex: failed to load skill /Users/khaliqgant/Projects/AgentWorkforce/relay/.agents/skills/creating-agent-skills-skill/SKILL.md: missing YAML frontmatter delimited by ---, Verification passed; files=created:packages/sdk/src/__tests__/resume-fallback.test.ts, modified:packages/sdk/src/workflows/cli.ts, modified:packages/sdk/src/workflows/file-db.ts; exit=0)", + "significance": "medium", + "raw": { + "stepName": "impl-tests", + "completionMode": "verification", + "reason": "Verification passed", + "evidence": { + "summary": "6 signal(s), 3 file change(s), exit=0", + "signals": [ + "design", + "unknown", + "COMPLETE", + "APPROVE", + "2026-03-26T22:14:22.190950Z ERROR codex_core::codex: failed to load skill /Users/khaliqgant/Projects/AgentWorkforce/relay/.agents/skills/creating-agent-skills-skill/SKILL.md: missing YAML frontmatter delimited by ---", + "Verification passed" + ], + "files": [ + "created:packages/sdk/src/__tests__/resume-fallback.test.ts", + "modified:packages/sdk/src/workflows/cli.ts", + "modified:packages/sdk/src/workflows/file-db.ts" + ], + "exitCode": 0 + } + } + }, + { + "ts": 1774563483555, + "type": "finding", + "content": "\"impl-tests\" completed → Added [resume-fallback.test.ts](/Users/khaliqgant/Projects/AgentWorkforce/relay/packages/sdk/src/__tests__/resume-fallba", + "significance": "medium" + }, + { + "ts": 1774563600439, + "type": "completion-evidence", + "content": "\"impl-resume-fallback\" verification-based completion — Verification passed (6 signal(s), 4 file change(s), exit=0; signals=0, design, COMPLETE, 2026-03-26T22:14:22.130729Z ERROR codex_core::codex: failed to load skill /Users/khaliqgant/Projects/AgentWorkforce/relay/.agents/skills/creating-agent-skills-skill/SKILL.md: missing YAML frontmatter delimited by ---, 2026-03-26T22:14:22.130729Z ERROR codex_core::codex: failed to load skill /Users/khaliqgant/Projects/AgentWorkforce/relay/.agents/skills/creating-agent-skills-skill/SKILL.md: missing YAML frontmatter delimited by ---, Verification passed; files=created:packages/sdk/src/__tests__/resume-fallback.test.ts, modified:packages/sdk/src/workflows/cli.ts, modified:packages/sdk/src/workflows/file-db.ts, modified:packages/sdk/src/workflows/runner.ts; exit=0)", + "significance": "medium", + "raw": { + "stepName": "impl-resume-fallback", + "completionMode": "verification", + "reason": "Verification passed", + "evidence": { + "summary": "6 signal(s), 4 file change(s), exit=0", + "signals": [ + "0", + "design", + "COMPLETE", + "2026-03-26T22:14:22.130729Z ERROR codex_core::codex: failed to load skill /Users/khaliqgant/Projects/AgentWorkforce/relay/.agents/skills/creating-agent-skills-skill/SKILL.md: missing YAML frontmatter delimited by ---", + "2026-03-26T22:14:22.130729Z ERROR codex_core::codex: failed to load skill /Users/khaliqgant/Projects/AgentWorkforce/relay/.agents/skills/creating-agent-skills-skill/SKILL.md: missing YAML frontmatter delimited by ---", + "Verification passed" + ], + "files": [ + "created:packages/sdk/src/__tests__/resume-fallback.test.ts", + "modified:packages/sdk/src/workflows/cli.ts", + "modified:packages/sdk/src/workflows/file-db.ts", + "modified:packages/sdk/src/workflows/runner.ts" + ], + "exitCode": 0 + } + } + }, + { + "ts": 1774563600439, + "type": "finding", + "content": "\"impl-resume-fallback\" completed → Implemented the resume fallback in [`packages/sdk/src/workflows/runner.ts`](/Users/khaliqgant/Projects/AgentWorkforce/re", + "significance": "medium" + } + ], + "endedAt": "2026-03-26T22:20:00.441Z" + }, + { + "id": "ch_6069dd38", + "title": "Convergence: impl-resume-fallback + impl-db-diagnostics + impl-tests", + "agentName": "orchestrator", + "startedAt": "2026-03-26T22:20:00.441Z", + "events": [ + { + "ts": 1774563600442, + "type": "reflection", + "content": "impl-resume-fallback + impl-db-diagnostics + impl-tests resolved. 3/3 steps completed. All steps completed on first attempt. Unblocking: verify-build.", + "significance": "high", + "raw": { + "confidence": 1, + "focalPoints": [ + "impl-resume-fallback: completed", + "impl-db-diagnostics: completed", + "impl-tests: completed" + ] + } + } + ], + "endedAt": "2026-03-26T22:20:07.222Z" + }, + { + "id": "ch_c4a26591", + "title": "Execution: review", + "agentName": "reviewer", + "startedAt": "2026-03-26T22:20:07.222Z", + "events": [ + { + "ts": 1774563607222, + "type": "note", + "content": "\"review\": Review the resume fallback implementation", + "raw": { + "agent": "reviewer" + } + }, + { + "ts": 1774563699435, + "type": "completion-evidence", + "content": "\"review\" verification-based completion — Verification passed (2 signal(s), 2 file change(s), exit=0; signals=0, Verification passed; files=modified:.claude/settings.json, created:docs/resume-fallback-review.md; exit=0)", + "significance": "medium", + "raw": { + "stepName": "review", + "completionMode": "verification", + "reason": "Verification passed", + "evidence": { + "summary": "2 signal(s), 2 file change(s), exit=0", + "signals": [ + "0", + "Verification passed" + ], + "files": [ + "modified:.claude/settings.json", + "created:docs/resume-fallback-review.md" + ], + "exitCode": 0 + } + } + }, + { + "ts": 1774563699435, + "type": "finding", + "content": "\"review\" completed → Review written to `docs/resume-fallback-review.md`.", + "significance": "medium" + } + ], + "endedAt": "2026-03-26T22:21:39.437Z" + }, + { + "id": "ch_85a4f8b8", + "title": "Execution: fix-issues", + "agentName": "architect", + "startedAt": "2026-03-26T22:21:39.437Z", + "events": [ + { + "ts": 1774563699437, + "type": "note", + "content": "\"fix-issues\": Address any review feedback or build/test failures", + "raw": { + "agent": "architect" + } + }, + { + "ts": 1774563807587, + "type": "completion-marker", + "content": "\"fix-issues\" marker-based completion — Legacy STEP_COMPLETE marker observed (6 signal(s), 2 relevant channel post(s), 6 file change(s); signals=COMPLETE, **FIX_COMPLETE**, fix-issues, >1u>4;2m>0q>4m1u>4;2m▗ ▗ ▖ ▖ Claude Code v2.1.84, FIX_COMPLETE, **[fix-issues] Output:**; channel=**FIX_COMPLETE**\n\n**Summary:** Review's blocking issue resolved. Tests executed and all passing.\n\n**What was fixed:**\n- `runner.ts` `reconstructRunFromCache` — | **[fix-issues] Output:**\n```\npackages/sdk && npx\n vitest --run src/__tests__/resume-fallback.test.ts 2>&1)\n RUN v3.2.4 /Users/khaliqgant/Projects/Age; files=modified:.claude/settings.json, created:.turbo/cache/4d0e6994e2101327-manifest.json, created:.turbo/cache/4d0e6994e2101327-meta.json, created:.turbo/cache/4d0e6994e2101327.tar.zst, created:.turbo/cache/b92de645f3f74692-manifest.json, created:.turbo/cache/b92de645f3f74692-meta.json)", + "significance": "medium", + "raw": { + "stepName": "fix-issues", + "completionMode": "marker", + "reason": "Legacy STEP_COMPLETE marker observed", + "evidence": { + "summary": "6 signal(s), 2 relevant channel post(s), 6 file change(s)", + "signals": [ + "COMPLETE", + "**FIX_COMPLETE**", + "fix-issues", + ">1u>4;2m>0q>4m1u>4;2m▗ ▗ ▖ ▖ Claude Code v2.1.84", + "FIX_COMPLETE", + "**[fix-issues] Output:**" + ], + "channelPosts": [ + "**FIX_COMPLETE**\n\n**Summary:** Review's blocking issue resolved. Tests executed and all passing.\n\n**What was fixed:**\n- `runner.ts` `reconstructRunFromCache` — ", + "**[fix-issues] Output:**\n```\npackages/sdk && npx\n vitest --run src/__tests__/resume-fallback.test.ts 2>&1)\n RUN v3.2.4 /Users/khaliqgant/Projects/Age" + ], + "files": [ + "modified:.claude/settings.json", + "created:.turbo/cache/4d0e6994e2101327-manifest.json", + "created:.turbo/cache/4d0e6994e2101327-meta.json", + "created:.turbo/cache/4d0e6994e2101327.tar.zst", + "created:.turbo/cache/b92de645f3f74692-manifest.json", + "created:.turbo/cache/b92de645f3f74692-meta.json" + ] + } + } + }, + { + "ts": 1774563807588, + "type": "finding", + "content": "\"fix-issues\" completed → ✢", + "significance": "medium" + } + ], + "endedAt": "2026-03-26T22:23:27.597Z" + }, + { + "id": "ch_dce801dd", + "title": "Retrospective", + "agentName": "orchestrator", + "startedAt": "2026-03-26T22:23:27.597Z", + "events": [ + { + "ts": 1774563807597, + "type": "reflection", + "content": "All 16 steps completed in 13min. (completed in 13 minutes)", + "significance": "high" + } + ], + "endedAt": "2026-03-26T22:23:27.597Z" + } + ], + "completedAt": "2026-03-26T22:23:27.597Z", + "retrospective": { + "summary": "All 16 steps completed in 13min.", + "approach": "dag workflow (5 agents)", + "confidence": 0.84375, + "learnings": [], + "challenges": [] + } +} \ No newline at end of file diff --git a/.trajectories/index.json b/.trajectories/index.json index b7ad69bc3..c6b013436 100644 --- a/.trajectories/index.json +++ b/.trajectories/index.json @@ -1,6 +1,6 @@ { "version": 1, - "lastUpdated": "2026-03-26T20:11:23.431Z", + "lastUpdated": "2026-03-26T22:17:47.879Z", "trajectories": { "traj_1b1dj40sl6jl": { "title": "Revert aggressive retry logic in relay-pty-orchestrator", @@ -919,6 +919,13 @@ "startedAt": "2026-03-26T19:53:02.196Z", "completedAt": "2026-03-26T20:11:23.315Z", "path": "/Users/khaliqgant/Projects/Agent Workforce/relay/.trajectories/completed/2026-03/traj_jhnp7malmci4.json" + }, + "traj_4ghb800vy5ti": { + "title": "Write tests for resume fallback behavior", + "status": "completed", + "startedAt": "2026-03-26T22:14:28.964Z", + "completedAt": "2026-03-26T22:17:47.783Z", + "path": "/Users/khaliqgant/Projects/AgentWorkforce/relay/.trajectories/completed/2026-03/traj_4ghb800vy5ti.json" } } -} \ No newline at end of file +} diff --git a/package-lock.json b/package-lock.json index f00a8f87a..d145f3708 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "agent-relay", - "version": "3.2.18", + "version": "3.2.21", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "agent-relay", - "version": "3.2.18", + "version": "3.2.21", "bundleDependencies": [ "@agent-relay/cloud", "@agent-relay/config", @@ -24,14 +24,14 @@ "web" ], "dependencies": { - "@agent-relay/cloud": "3.2.18", - "@agent-relay/config": "3.2.18", - "@agent-relay/hooks": "3.2.18", - "@agent-relay/sdk": "3.2.18", - "@agent-relay/telemetry": "3.2.18", - "@agent-relay/trajectory": "3.2.18", - "@agent-relay/user-directory": "3.2.18", - "@agent-relay/utils": "3.2.18", + "@agent-relay/cloud": "3.2.21", + "@agent-relay/config": "3.2.21", + "@agent-relay/hooks": "3.2.21", + "@agent-relay/sdk": "3.2.21", + "@agent-relay/telemetry": "3.2.21", + "@agent-relay/trajectory": "3.2.21", + "@agent-relay/user-directory": "3.2.21", + "@agent-relay/utils": "3.2.21", "@aws-sdk/client-s3": "^3.1004.0", "@modelcontextprotocol/sdk": "^1.0.0", "@relaycast/mcp": "1.0.0", @@ -1308,7 +1308,7 @@ }, "node_modules/@clack/prompts/node_modules/is-unicode-supported": { "version": "1.3.0", - "extraneous": true, + "dev": true, "inBundle": true, "license": "MIT", "engines": { @@ -14961,10 +14961,10 @@ }, "packages/acp-bridge": { "name": "@agent-relay/acp-bridge", - "version": "3.2.18", + "version": "3.2.21", "license": "Apache-2.0", "dependencies": { - "@agent-relay/sdk": "3.2.18", + "@agent-relay/sdk": "3.2.21", "@agentclientprotocol/sdk": "^0.12.0" }, "bin": { @@ -14981,13 +14981,13 @@ }, "packages/brand": { "name": "@agent-relay/brand", - "version": "1.1.0" + "version": "3.2.21" }, "packages/cloud": { "name": "@agent-relay/cloud", - "version": "3.2.18", + "version": "3.2.21", "dependencies": { - "@agent-relay/config": "3.2.18", + "@agent-relay/config": "3.2.21", "@aws-sdk/client-s3": "^3.1004.0", "ignore": "^7.0.5", "tar": "^7.5.10" @@ -15000,7 +15000,7 @@ }, "packages/config": { "name": "@agent-relay/config", - "version": "3.2.18", + "version": "3.2.21", "dependencies": { "zod": "^3.23.8", "zod-to-json-schema": "^3.23.1" @@ -15013,11 +15013,11 @@ }, "packages/hooks": { "name": "@agent-relay/hooks", - "version": "3.2.18", + "version": "3.2.21", "dependencies": { - "@agent-relay/config": "3.2.18", - "@agent-relay/sdk": "3.2.18", - "@agent-relay/trajectory": "3.2.18" + "@agent-relay/config": "3.2.21", + "@agent-relay/sdk": "3.2.21", + "@agent-relay/trajectory": "3.2.21" }, "devDependencies": { "@types/node": "^22.19.3", @@ -15027,9 +15027,9 @@ }, "packages/memory": { "name": "@agent-relay/memory", - "version": "3.2.18", + "version": "3.2.21", "dependencies": { - "@agent-relay/hooks": "3.2.18" + "@agent-relay/hooks": "3.2.21" }, "devDependencies": { "@types/node": "^22.19.3", @@ -15039,11 +15039,11 @@ }, "packages/openclaw": { "name": "@agent-relay/openclaw", - "version": "3.2.18", + "version": "3.2.21", "hasInstallScript": true, "license": "Apache-2.0", "dependencies": { - "@agent-relay/sdk": "3.2.18", + "@agent-relay/sdk": "3.2.21", "@relaycast/sdk": "^1.0.0", "ws": "^8.0.0" }, @@ -15867,9 +15867,9 @@ }, "packages/policy": { "name": "@agent-relay/policy", - "version": "3.2.18", + "version": "3.2.21", "dependencies": { - "@agent-relay/config": "3.2.18" + "@agent-relay/config": "3.2.21" }, "devDependencies": { "@types/node": "^22.19.3", @@ -15879,9 +15879,9 @@ }, "packages/sdk": { "name": "@agent-relay/sdk", - "version": "3.2.18", + "version": "3.2.21", "dependencies": { - "@agent-relay/config": "3.2.18", + "@agent-relay/config": "3.2.21", "@relaycast/sdk": "^1.1.0", "@sinclair/typebox": "^0.34.48", "chalk": "^4.1.2", @@ -15965,7 +15965,7 @@ }, "packages/telemetry": { "name": "@agent-relay/telemetry", - "version": "3.2.18", + "version": "3.2.21", "dependencies": { "posthog-node": "^4.0.1" }, @@ -15977,9 +15977,9 @@ }, "packages/trajectory": { "name": "@agent-relay/trajectory", - "version": "3.2.18", + "version": "3.2.21", "dependencies": { - "@agent-relay/config": "3.2.18" + "@agent-relay/config": "3.2.21" }, "devDependencies": { "@types/node": "^22.19.3", @@ -15989,9 +15989,9 @@ }, "packages/user-directory": { "name": "@agent-relay/user-directory", - "version": "3.2.18", + "version": "3.2.21", "dependencies": { - "@agent-relay/utils": "3.2.18" + "@agent-relay/utils": "3.2.21" }, "devDependencies": { "@types/node": "^22.19.3", @@ -16001,9 +16001,9 @@ }, "packages/utils": { "name": "@agent-relay/utils", - "version": "3.2.18", + "version": "3.2.21", "dependencies": { - "@agent-relay/config": "3.2.18", + "@agent-relay/config": "3.2.21", "compare-versions": "^6.1.1" }, "devDependencies": { diff --git a/packages/sdk/src/__tests__/resume-fallback.test.ts b/packages/sdk/src/__tests__/resume-fallback.test.ts new file mode 100644 index 000000000..5f4e4f147 --- /dev/null +++ b/packages/sdk/src/__tests__/resume-fallback.test.ts @@ -0,0 +1,415 @@ +/** + * Tests for resuming workflow execution from cached step outputs when the JSONL + * run database is missing or unavailable. + */ + +import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; +import { + chmodSync, + mkdirSync, + mkdtempSync, + rmSync, + writeFileSync, +} from 'node:fs'; +import os from 'node:os'; +import path from 'node:path'; +import type { WorkflowDb } from '../workflows/runner.js'; +import type { RelayYamlConfig, WorkflowRunRow, WorkflowStepRow } from '../workflows/types.js'; + +// ── Mock fetch ─────────────────────────────────────────────────────────────── + +const mockFetch = vi.fn().mockResolvedValue({ + ok: true, + json: () => Promise.resolve({ data: { api_key: 'rk_live_test', workspace_id: 'ws-test' } }), + text: () => Promise.resolve(''), +}); +vi.stubGlobal('fetch', mockFetch); + +// ── Mock RelayCast SDK ─────────────────────────────────────────────────────── + +const mockRelaycastAgent = { + send: vi.fn().mockResolvedValue(undefined), + heartbeat: vi.fn().mockResolvedValue(undefined), + channels: { + create: vi.fn().mockResolvedValue(undefined), + join: vi.fn().mockResolvedValue(undefined), + invite: vi.fn().mockResolvedValue(undefined), + }, +}; + +const mockRelaycast = { + agents: { + register: vi.fn().mockResolvedValue({ token: 'token-1' }), + }, + as: vi.fn().mockReturnValue(mockRelaycastAgent), +}; + +class MockRelayError extends Error { + code: string; + constructor(code: string, message: string, status = 400) { + super(message); + this.code = code; + this.name = 'RelayError'; + (this as any).status = status; + } +} + +vi.mock('@relaycast/sdk', () => ({ + RelayCast: vi.fn().mockImplementation(() => mockRelaycast), + RelayError: MockRelayError, +})); + +// ── Mock AgentRelay ────────────────────────────────────────────────────────── + +let waitForExitFn: (ms?: number) => Promise<'exited' | 'timeout' | 'released'>; + +const mockAgent = { + name: 'test-agent-abc', + get waitForExit() { return waitForExitFn; }, + get waitForIdle() { return vi.fn().mockImplementation(() => new Promise(() => {})); }, + release: vi.fn().mockResolvedValue(undefined), +}; + +const mockHuman = { + name: 'WorkflowRunner', + sendMessage: vi.fn().mockResolvedValue(undefined), +}; + +const mockRelayInstance = { + spawnPty: vi.fn().mockImplementation(async ({ name, task }: { name: string; task?: string }) => { + const stepComplete = task?.match(/STEP_COMPLETE:([^\n]+)/)?.[1]?.trim(); + const isReview = task?.includes('REVIEW_DECISION: APPROVE or REJECT'); + const output = isReview + ? 'REVIEW_DECISION: APPROVE\nREVIEW_REASON: looks good\n' + : stepComplete + ? `STEP_COMPLETE:${stepComplete}\n` + : 'STEP_COMPLETE:unknown\n'; + + queueMicrotask(() => { + if (typeof mockRelayInstance.onWorkerOutput === 'function') { + mockRelayInstance.onWorkerOutput({ name, chunk: output }); + } + }); + + return { ...mockAgent, name }; + }), + human: vi.fn().mockReturnValue(mockHuman), + shutdown: vi.fn().mockResolvedValue(undefined), + onBrokerStderr: vi.fn().mockReturnValue(() => {}), + onWorkerOutput: null as ((frame: { name: string; chunk: string }) => void) | null, + onMessageReceived: null as any, + onAgentSpawned: null as any, + onAgentReleased: null as any, + onAgentExited: null as any, + onAgentIdle: null as any, + onDeliveryUpdate: null as any, + listAgentsRaw: vi.fn().mockResolvedValue([]), +}; + +vi.mock('../relay.js', () => ({ + AgentRelay: vi.fn().mockImplementation(() => mockRelayInstance), +})); + +// Import after mocking +const { WorkflowRunner } = await import('../workflows/runner.js'); +const { JsonFileWorkflowDb } = await import('../workflows/file-db.js'); + +// ── Helpers ────────────────────────────────────────────────────────────────── + +function makeDb(): WorkflowDb { + const runs = new Map(); + const steps = new Map(); + + return { + insertRun: vi.fn(async (run: WorkflowRunRow) => { + runs.set(run.id, { ...run }); + }), + updateRun: vi.fn(async (id: string, patch: Partial) => { + const existing = runs.get(id); + if (existing) runs.set(id, { ...existing, ...patch }); + }), + getRun: vi.fn(async (id: string) => { + const run = runs.get(id); + return run ? { ...run } : null; + }), + insertStep: vi.fn(async (step: WorkflowStepRow) => { + steps.set(step.id, { ...step }); + }), + updateStep: vi.fn(async (id: string, patch: Partial) => { + const existing = steps.get(id); + if (existing) steps.set(id, { ...existing, ...patch }); + }), + getStepsByRunId: vi.fn(async (runId: string) => { + return [...steps.values()].filter((s) => s.runId === runId); + }), + }; +} + +function makeResumeConfig(): RelayYamlConfig { + return { + version: '1', + name: 'test-resume-fallback', + swarm: { pattern: 'dag' }, + agents: [ + { name: 'agent-a', cli: 'claude' }, + ], + workflows: [ + { + name: 'default', + steps: [ + { name: 'step-a', agent: 'agent-a', task: 'Do step A' }, + { name: 'step-b', agent: 'agent-a', task: 'Do step B', dependsOn: ['step-a'] }, + { name: 'step-c', agent: 'agent-a', task: 'Do step C', dependsOn: ['step-b'] }, + ], + }, + ], + trajectories: false, + }; +} + +function makeTemplateConfig(): RelayYamlConfig { + return { + version: '1', + name: 'test-resume-template', + swarm: { pattern: 'dag' }, + agents: [ + { name: 'agent-a', cli: 'claude' }, + ], + workflows: [ + { + name: 'default', + steps: [ + { name: 'step-a', agent: 'agent-a', task: 'Generate input' }, + { + name: 'step-b', + agent: 'agent-a', + task: 'Use cached value: {{steps.step-a.output}}', + dependsOn: ['step-a'], + }, + ], + }, + ], + trajectories: false, + }; +} + +function makeRunRow(runId: string, config: RelayYamlConfig, status: WorkflowRunRow['status'] = 'failed'): WorkflowRunRow { + const now = new Date().toISOString(); + return { + id: runId, + workspaceId: 'ws-test', + workflowName: 'default', + pattern: config.swarm.pattern, + status, + config, + startedAt: now, + createdAt: now, + updatedAt: now, + }; +} + +function makeStepRow( + runId: string, + stepName: string, + task: string, + dependsOn: string[] = [], + status: WorkflowStepRow['status'] = 'pending', + output?: string +): WorkflowStepRow { + const now = new Date().toISOString(); + return { + id: `${runId}-${stepName}`, + runId, + stepName, + agentName: 'agent-a', + stepType: 'agent', + status, + task, + dependsOn, + output, + retryCount: 0, + createdAt: now, + updatedAt: now, + startedAt: status !== 'pending' ? now : undefined, + completedAt: status === 'completed' ? now : undefined, + }; +} + +function writeCachedOutput(tmpDir: string, runId: string, stepName: string, output: string): void { + const outputDir = path.join(tmpDir, '.agent-relay', 'step-outputs', runId); + mkdirSync(outputDir, { recursive: true }); + writeFileSync(path.join(outputDir, `${stepName}.md`), output); +} + +// ── Tests ──────────────────────────────────────────────────────────────────── + +describe('resume fallback to step-output cache', () => { + let db: WorkflowDb; + let runner: InstanceType; + let tmpDir: string; + + beforeEach(() => { + vi.clearAllMocks(); + waitForExitFn = vi.fn().mockResolvedValue('exited'); + mockRelayInstance.onWorkerOutput = null; + tmpDir = mkdtempSync(path.join(os.tmpdir(), 'resume-fallback-')); + db = makeDb(); + runner = new WorkflowRunner({ db, workspaceId: 'ws-test', cwd: tmpDir }); + }); + + afterEach(() => { + try { rmSync(tmpDir, { recursive: true, force: true }); } catch {} + }); + + it('should reconstruct run from step-output cache when JSONL missing', async () => { + const runId = 'resume-cache-run'; + const config = makeResumeConfig(); + writeCachedOutput(tmpDir, runId, 'step-a', 'cached-a'); + writeCachedOutput(tmpDir, runId, 'step-b', 'cached-b'); + + const events: Array<{ type: string; stepName?: string }> = []; + runner.on((event) => { + if ('stepName' in event) { + events.push({ type: event.type, stepName: event.stepName }); + } + }); + + const run = await (runner as any).resume(runId, undefined, config); + expect(run.status, run.error).toBe('completed'); + + const startedSteps = events.filter((e) => e.type === 'step:started').map((e) => e.stepName); + expect(startedSteps).not.toContain('step-a'); + expect(startedSteps).not.toContain('step-b'); + expect(startedSteps).toContain('step-c'); + }); + + it('should throw "not found" when neither JSONL nor cache exists', async () => { + const config = makeResumeConfig(); + + await expect((runner as any).resume('nonexistent-id', undefined, config)).rejects.toThrow('not found'); + }); + + it('should prefer JSONL database over step-output cache', async () => { + const runId = 'resume-db-run'; + const config = makeResumeConfig(); + const dbPath = path.join(tmpDir, '.agent-relay', 'workflow-runs.jsonl'); + const fileDb = new JsonFileWorkflowDb(dbPath); + const dbRunner = new WorkflowRunner({ db: fileDb, workspaceId: 'ws-test', cwd: tmpDir }); + + await fileDb.insertRun(makeRunRow(runId, config)); + await fileDb.insertStep(makeStepRow(runId, 'step-a', 'Do step A', [], 'failed')); + await fileDb.insertStep(makeStepRow(runId, 'step-b', 'Do step B', ['step-a'], 'pending')); + await fileDb.insertStep(makeStepRow(runId, 'step-c', 'Do step C', ['step-b'], 'pending')); + + writeCachedOutput(tmpDir, runId, 'step-a', 'cached-a-from-fallback'); + + const events: Array<{ type: string; stepName?: string }> = []; + dbRunner.on((event) => { + if ('stepName' in event) { + events.push({ type: event.type, stepName: event.stepName }); + } + }); + + const run = await dbRunner.resume(runId); + expect(run.status, run.error).toBe('completed'); + + const startedSteps = events.filter((e) => e.type === 'step:started').map((e) => e.stepName); + expect(startedSteps).toContain('step-a'); + expect(startedSteps).toContain('step-b'); + expect(startedSteps).toContain('step-c'); + }); + + it('should handle empty step-output directory gracefully', async () => { + const runId = 'resume-empty-cache'; + const config = makeResumeConfig(); + mkdirSync(path.join(tmpDir, '.agent-relay', 'step-outputs', runId), { recursive: true }); + + const events: Array<{ type: string; stepName?: string }> = []; + runner.on((event) => { + if ('stepName' in event) { + events.push({ type: event.type, stepName: event.stepName }); + } + }); + + const run = await (runner as any).resume(runId, undefined, config); + expect(run.status, run.error).toBe('completed'); + + const startedSteps = events.filter((e) => e.type === 'step:started').map((e) => e.stepName); + expect(startedSteps).toContain('step-a'); + expect(startedSteps).toContain('step-b'); + expect(startedSteps).toContain('step-c'); + }); + + it('should load cached output into step template variables', async () => { + const runId = 'resume-template-cache'; + const config = makeTemplateConfig(); + writeCachedOutput(tmpDir, runId, 'step-a', 'hello world'); + + const run = await (runner as any).resume(runId, undefined, config); + expect(run.status, run.error).toBe('completed'); + + const spawnedTasks = mockRelayInstance.spawnPty.mock.calls.map( + ([args]) => (args as { task?: string }).task ?? '' + ); + expect(spawnedTasks.some((task) => task.includes('Use cached value: hello world'))).toBe(true); + }); + + it('should skip .report.json files when scanning step outputs', async () => { + const runId = 'resume-report-cache'; + const config = makeResumeConfig(); + const outputDir = path.join(tmpDir, '.agent-relay', 'step-outputs', runId); + mkdirSync(outputDir, { recursive: true }); + writeFileSync(path.join(outputDir, 'step-a.md'), 'cached-a'); + writeFileSync(path.join(outputDir, 'step-a.report.json'), '{"summary":"done"}'); + writeFileSync(path.join(outputDir, 'step-b.report.json'), '{"summary":"metadata only"}'); + + const events: Array<{ type: string; stepName?: string }> = []; + runner.on((event) => { + if ('stepName' in event) { + events.push({ type: event.type, stepName: event.stepName }); + } + }); + + const run = await (runner as any).resume(runId, undefined, config); + expect(run.status, run.error).toBe('completed'); + + const startedSteps = events.filter((e) => e.type === 'step:started').map((e) => e.stepName); + expect(startedSteps).not.toContain('step-a'); + expect(startedSteps).toContain('step-b'); + expect(startedSteps).toContain('step-c'); + }); +}); + +describe('file-db append diagnostics', () => { + let tmpDir: string; + + beforeEach(() => { + vi.clearAllMocks(); + tmpDir = mkdtempSync(path.join(os.tmpdir(), 'file-db-warn-')); + }); + + afterEach(() => { + try { + chmodSync(path.join(tmpDir, 'readonly'), 0o755); + } catch {} + try { rmSync(tmpDir, { recursive: true, force: true }); } catch {} + }); + + it('should warn once when append fails', async () => { + const readonlyDir = path.join(tmpDir, 'readonly'); + mkdirSync(readonlyDir, { recursive: true }); + chmodSync(readonlyDir, 0o555); + + const dbPath = path.join(readonlyDir, 'workflow-runs.jsonl'); + const fileDb = new JsonFileWorkflowDb(dbPath); + const warnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {}); + const config = makeResumeConfig(); + + await fileDb.insertRun(makeRunRow('warn-run-1', config)); + await fileDb.insertRun(makeRunRow('warn-run-2', config)); + + expect(warnSpy).toHaveBeenCalledTimes(1); + + warnSpy.mockRestore(); + }); +}); diff --git a/packages/sdk/src/workflows/cli.ts b/packages/sdk/src/workflows/cli.ts index 7a33e77ef..5d0077a51 100644 --- a/packages/sdk/src/workflows/cli.ts +++ b/packages/sdk/src/workflows/cli.ts @@ -52,6 +52,21 @@ type ExecuteOptions = { previousRunId?: string; }; +/** Flags that consume the next argument as their value. Single source of truth for CLI parsing. */ +const FLAGS_WITH_VALUES = new Set(['--resume', '--workflow', '--start-from', '--previous-run-id']); + +function getYamlPathArg(args: string[]): string | undefined { + for (let i = 0; i < args.length; i += 1) { + const arg = args[i]; + if (arg.startsWith('--')) { + if (FLAGS_WITH_VALUES.has(arg)) i += 1; + continue; + } + return arg; + } + return undefined; +} + interface RenderableTask { output?: string; title: string; @@ -302,6 +317,7 @@ async function runWithListr( async function main(): Promise { const args = process.argv.slice(2); + const yamlPath = getYamlPathArg(args); if (args.length === 0 || args.includes('--help')) { printUsage(); @@ -358,7 +374,37 @@ async function main(): Promise { break; } }); - const result = await runner.resume(runId); + let result: RunnerResult; + try { + const resumeConfig = yamlPath ? await runner.parseYamlFile(yamlPath) : undefined; + if (resumeConfig) { + console.warn( + chalk.yellow( + '[workflow] warning: resuming with current config from disk — ' + + 'if the workflow YAML changed since the original run, behaviour may differ' + ) + ); + } + result = await runner.resume(runId, undefined, resumeConfig); + } catch (err) { + const message = err instanceof Error ? err.message : String(err); + const isRunNotFound = message.startsWith(`Run "${runId}" not found`); + if (isRunNotFound) { + if (fileDb.hasStepOutputs(runId)) { + console.error( + chalk.red( + `Error: ${message}. Step outputs exist for this run, but persisted run state is missing from ${dbPath}. ` + + `Use --start-from with --previous-run-id ${runId} to recover from the cached step outputs instead.` + ) + ); + } else { + console.error(chalk.red(`Error: ${message}`)); + } + } else { + console.error(chalk.red(`Error: ${message}`)); + } + process.exit(1); + } if (result.status === 'completed') { console.log(chalk.green('\nWorkflow completed successfully.')); @@ -371,7 +417,6 @@ async function main(): Promise { } // ── Normal / validate / dry-run mode ────────────────────────────────────── - const yamlPath = args[0]; let workflowName: string | undefined; const workflowIdx = args.indexOf('--workflow'); @@ -391,6 +436,12 @@ async function main(): Promise { previousRunId = args[prevRunIdx + 1]; } + if (!yamlPath) { + console.error(chalk.red('Error: workflow YAML path is required')); + printUsage(); + process.exit(1); + } + const isValidate = args.includes('--validate'); const isDryRun = !!process.env.DRY_RUN; diff --git a/packages/sdk/src/workflows/file-db.ts b/packages/sdk/src/workflows/file-db.ts index dc59bc630..35f2a3bdd 100644 --- a/packages/sdk/src/workflows/file-db.ts +++ b/packages/sdk/src/workflows/file-db.ts @@ -1,4 +1,4 @@ -import { appendFileSync, mkdirSync, readFileSync } from 'node:fs'; +import { appendFileSync, existsSync, mkdirSync, readdirSync, readFileSync } from 'node:fs'; import path from 'node:path'; import type { WorkflowRunRow, WorkflowStepRow } from './types.js'; @@ -24,6 +24,7 @@ export class JsonFileWorkflowDb implements WorkflowDb { /** Whether the storage directory is writable. False = silent no-op mode. */ private readonly writable: boolean; + private appendFailedOnce = false; constructor(filePath: string) { this.filePath = filePath; @@ -43,14 +44,32 @@ export class JsonFileWorkflowDb implements WorkflowDb { return this.writable; } + hasStepOutputs(runId: string): boolean { + try { + const dir = path.join(path.dirname(this.filePath), 'step-outputs', runId); + return existsSync(dir) && readdirSync(dir).length > 0; + } catch { + return false; + } + } + // ── Private helpers ───────────────────────────────────────────────────── private append(entry: DbEntry): void { if (!this.writable) return; try { appendFileSync(this.filePath, JSON.stringify(entry) + '\n', 'utf8'); - } catch { - // Non-critical — workflow execution continues; resume won't be available. + } catch (err) { + if (!this.appendFailedOnce) { + this.appendFailedOnce = true; + console.warn( + '[workflow] warning: failed to write run state to ' + + this.filePath + + ' — --resume will not be available for this run. Use --start-from instead. ' + + 'Error: ' + + (err instanceof Error ? err.message : String(err)) + ); + } } } diff --git a/packages/sdk/src/workflows/runner.ts b/packages/sdk/src/workflows/runner.ts index 8d96c102b..4b2846ef0 100644 --- a/packages/sdk/src/workflows/runner.ts +++ b/packages/sdk/src/workflows/runner.ts @@ -1952,14 +1952,25 @@ export class WorkflowRunner { } /** Resume a previously paused or partially completed run. */ - async resume(runId: string, vars?: VariableContext): Promise { + async resume(runId: string, vars?: VariableContext, config?: RelayYamlConfig): Promise { // Set up abort controller early so callers can abort() even during setup this.abortController = new AbortController(); this.paused = false; - const run = await this.db.getRun(runId); + let run = await this.db.getRun(runId); + let stepStates = new Map(); if (!run) { - throw new Error(`Run "${runId}" not found`); + const reconstructed = this.reconstructRunFromCache(runId, config); + if (!reconstructed) { + throw new Error(`Run "${runId}" not found (no database entry or cached step outputs)`); + } + this.log('[resume] Reconstructing run from cached step outputs (workflow-runs.jsonl missing)'); + run = reconstructed.run; + stepStates = reconstructed.stepStates; + await this.db.insertRun(run); + for (const [, state] of stepStates) { + await this.db.insertStep(state.row); + } } this.persistRunIdHint(runId); @@ -1967,25 +1978,26 @@ export class WorkflowRunner { throw new Error(`Run "${runId}" is in status "${run.status}" and cannot be resumed`); } - const config = vars ? this.resolveVariables(run.config, vars) : run.config; + const resolvedConfig = vars ? this.resolveVariables(run.config, vars) : run.config; // Resolve path definitions (same as execute()) so workdir lookups work on resume - const pathResult = this.resolvePathDefinitions(config.paths, this.cwd); + const pathResult = this.resolvePathDefinitions(resolvedConfig.paths, this.cwd); if (pathResult.errors.length > 0) { throw new Error(`Path validation failed:\n ${pathResult.errors.join('\n ')}`); } this.resolvedPaths = pathResult.resolved; - const workflows = config.workflows ?? []; + const workflows = resolvedConfig.workflows ?? []; const workflow = workflows.find((w) => w.name === run.workflowName); if (!workflow) { throw new Error(`Workflow "${run.workflowName}" not found in stored config`); } - const existingSteps = await this.db.getStepsByRunId(runId); - const stepStates = new Map(); - for (const stepRow of existingSteps) { - stepStates.set(stepRow.stepName, { row: stepRow }); + if (stepStates.size === 0) { + const existingSteps = await this.db.getStepsByRunId(runId); + for (const stepRow of existingSteps) { + stepStates.set(stepRow.stepName, { row: stepRow }); + } } // Reset failed steps to pending for retry @@ -2006,7 +2018,7 @@ export class WorkflowRunner { return this.runWorkflowCore({ run, workflow, - config, + config: resolvedConfig, stepStates, isResume: true, }); @@ -6547,8 +6559,16 @@ export class WorkflowRunner { .slice(0, 32); } + /** Validate that a runId is safe for use in file paths (no traversal). */ + private validateRunId(runId: string): void { + if (/[/\\]|^\.\.?$/.test(runId) || runId.includes('..')) { + throw new Error(`Invalid runId: "${runId}" contains path traversal characters`); + } + } + /** Directory for persisted step outputs: .agent-relay/step-outputs/{runId}/ */ private getStepOutputDir(runId: string): string { + this.validateRunId(runId); return path.join(this.cwd, '.agent-relay', 'step-outputs', runId); } @@ -6638,6 +6658,153 @@ export class WorkflowRunner { } } + /** Match the best workflow from config given a set of cached step names. */ + private matchWorkflowFromCache( + workflows: WorkflowDefinition[], + cachedStepNames: Set + ): WorkflowDefinition | null { + if (workflows.length === 1) return workflows[0]; + + if (cachedStepNames.size === 0) { + // No cached steps to disambiguate — ambiguous when multiple workflows exist + this.log('[resume] Multiple workflows in config with empty cache — cannot disambiguate'); + return null; + } + + // Score each workflow by how many cached steps match, excluding those with unknown steps + const scored = workflows + .map((candidate) => ({ + workflow: candidate, + matchedSteps: candidate.steps.filter((step) => cachedStepNames.has(step.name)).length, + unknownSteps: [...cachedStepNames].filter( + (name) => !candidate.steps.some((step) => step.name === name) + ).length, + })) + .filter((candidate) => candidate.unknownSteps === 0) + .sort((a, b) => b.matchedSteps - a.matchedSteps); + + return scored[0]?.workflow ?? null; + } + + private reconstructRunFromCache( + runId: string, + config?: RelayYamlConfig + ): { run: WorkflowRunRow; stepStates: Map } | null { + const stepOutputDir = this.getStepOutputDir(runId); + if (!existsSync(stepOutputDir)) return null; + + let resumeConfig = config ?? this.currentConfig; + if (!resumeConfig) { + // Attempt to load config from relay.yaml on disk (resume() may call before runWorkflowCore sets currentConfig) + const yamlPath = path.join(this.cwd, 'relay.yaml'); + if (existsSync(yamlPath)) { + try { + const raw = readFileSync(yamlPath, 'utf-8'); + resumeConfig = this.parseYamlString(raw, yamlPath); + } catch { + return null; + } + } else { + return null; + } + } + + let entries: Dirent[]; + try { + entries = readdirSync(stepOutputDir, { withFileTypes: true }); + } catch { + return null; + } + + const cachedStepNames = new Set( + entries + .filter((entry) => entry.isFile() && entry.name.endsWith('.md')) + .map((entry) => entry.name.slice(0, -3)) + .filter(Boolean) + ); + const workflows = resumeConfig.workflows ?? []; + if (workflows.length === 0) return null; + + // Empty cache directory is valid — all steps will be re-run + const workflow = this.matchWorkflowFromCache(workflows, cachedStepNames); + if (!workflow) return null; + + // Use actual file modification times from cached outputs instead of synthetic timestamps + const stepMtimes = new Map(); + let earliestMtime = Date.now(); + for (const stepName of cachedStepNames) { + try { + const mdPath = path.join(stepOutputDir, `${stepName}.md`); + const reportPath = path.join(stepOutputDir, `${stepName}.report.json`); + const mdStat = existsSync(mdPath) ? statSync(mdPath) : null; + const reportStat = existsSync(reportPath) ? statSync(reportPath) : null; + // Use the latest mtime between .md and .report.json + const mtime = Math.max(mdStat?.mtimeMs ?? 0, reportStat?.mtimeMs ?? 0); + if (mtime > 0) { + stepMtimes.set(stepName, new Date(mtime).toISOString()); + if (mtime < earliestMtime) earliestMtime = mtime; + } + } catch { + // Fall back to current time if stat fails + } + } + const fallbackTime = new Date().toISOString(); + + const completedSteps = new Set(workflow.steps.filter((step) => cachedStepNames.has(step.name)).map((step) => step.name)); + // Heuristic: mark the first eligible non-completed step as failed (the likely failure point) + const failedStepName = workflow.steps.find( + (step) => !completedSteps.has(step.name) && (step.dependsOn ?? []).every((dep) => completedSteps.has(dep)) + )?.name; + + const runStartedAt = new Date(earliestMtime).toISOString(); + const run: WorkflowRunRow = { + id: runId, + workspaceId: this.workspaceId, + workflowName: workflow.name, + pattern: resumeConfig.swarm.pattern, + status: 'failed', + config: resumeConfig, + startedAt: runStartedAt, + createdAt: runStartedAt, + updatedAt: fallbackTime, + }; + + const stepStates = new Map(); + for (const step of workflow.steps) { + const isNonAgent = step.type === 'deterministic' || step.type === 'worktree' || step.type === 'integration'; + const cachedOutput = completedSteps.has(step.name) ? this.loadStepOutput(runId, step.name) : undefined; + const status: WorkflowStepStatus = + completedSteps.has(step.name) ? 'completed' : step.name === failedStepName ? 'failed' : 'pending'; + + const stepRow: WorkflowStepRow = { + id: this.generateId(), + runId, + stepName: step.name, + agentName: isNonAgent ? null : (step.agent ?? null), + stepType: isNonAgent ? (step.type as 'deterministic' | 'worktree' | 'integration') : 'agent', + status, + task: + step.type === 'deterministic' + ? (step.command ?? '') + : step.type === 'worktree' + ? (step.branch ?? '') + : step.type === 'integration' + ? (`${step.integration}.${step.action}`) + : (step.task ?? ''), + dependsOn: step.dependsOn ?? [], + output: cachedOutput, + error: status === 'failed' ? 'Recovered from cached step outputs' : undefined, + completedAt: status === 'completed' ? (stepMtimes.get(step.name) ?? fallbackTime) : undefined, + retryCount: 0, + createdAt: stepMtimes.get(step.name) ?? fallbackTime, + updatedAt: stepMtimes.get(step.name) ?? fallbackTime, + }; + stepStates.set(step.name, { row: stepRow }); + } + + return { run, stepStates }; + } + /** Get or create the worker logs directory (.agent-relay/team/worker-logs) */ private getWorkerLogsDir(): string { const logsDir = path.join(this.cwd, '.agent-relay', 'team', 'worker-logs');