Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,820 changes: 1,820 additions & 0 deletions packages/sdk/src/__tests__/completion-pipeline.test.ts

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions packages/sdk/src/__tests__/e2e-owner-review.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -762,12 +762,12 @@ describe('PR #511 E2E: Auto Step Owner + Review Gating', () => {
// ── Scenario 9: Owner completion marker validation ─────────────────────

describe('Scenario 9: Owner completion marker', () => {
it('should fail when owner does not produce STEP_COMPLETE marker', async () => {
it('should fail when owner does not provide a marker, decision, or evidence', async () => {
mockSpawnOutputs = ['The work is done but I forgot the sentinel.\n'];

const run = await runner.execute(makeConfig(), 'default');
expect(run.status).toBe('failed');
expect(run.error).toContain('owner completion marker');
expect(run.error).toContain('owner completion decision missing');
}, 15000);

it('should succeed when owner produces correct STEP_COMPLETE:step-name', async () => {
Expand Down
68 changes: 68 additions & 0 deletions packages/sdk/src/__tests__/idle-nudge.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,43 @@ describe('Idle Nudge Detection', () => {
expect(run.status).toBe('failed');
expect(run.error).toContain('timed out');
});

it('keeps a supervising lead alive after idle nudges are exhausted', async () => {
let exitCallCount = 0;
waitForExitFn = vi.fn().mockImplementation(() => {
exitCallCount++;
return Promise.resolve(exitCallCount < 3 ? 'timeout' : 'exited');
});

const config = makeConfig({
swarm: {
pattern: 'hub-spoke',
idleNudge: { nudgeAfterMs: 50, escalateAfterMs: 50, maxNudges: 1 },
channel: 'lead-supervision',
},
});
const agentDef = { name: 'team-lead', cli: 'claude', role: 'Lead coordinator' };
const step = {
name: 'step-1',
agent: 'team-lead',
task: 'Monitor #lead-supervision for WORKER_DONE, wait for the handoff, then exit.',
};

(runner as any).currentConfig = config;
expect((runner as any).shouldPreserveIdleSupervisor(agentDef, step)).toBe(true);

const result = await (runner as any).waitForExitWithIdleNudging(
mockAgent,
agentDef,
step,
500,
true
);

expect(result).toBe('exited');
expect(waitForExitFn).toHaveBeenCalledTimes(3);
expect(mockRelease).not.toHaveBeenCalled();
});
});

describe('Idle = done (no idleNudge config)', () => {
Expand Down Expand Up @@ -369,6 +406,37 @@ describe('Idle Nudge Detection', () => {
expect(mockRelease).not.toHaveBeenCalled();
});

it('does not treat supervisory lead idleness as completion', async () => {
waitForExitFn = vi.fn().mockResolvedValue('exited');
waitForIdleFn = vi.fn().mockResolvedValue('idle');

const config = makeConfig({
swarm: { pattern: 'hub-spoke', channel: 'lead-supervision' },
});
const agentDef = { name: 'team-lead', cli: 'claude', role: 'Lead coordinator' };
const step = {
name: 'step-1',
agent: 'team-lead',
task: 'Wait on #lead-supervision for WORKER_DONE before handing off.',
};

(runner as any).currentConfig = config;
expect((runner as any).shouldPreserveIdleSupervisor(agentDef, step)).toBe(true);

const result = await (runner as any).waitForExitWithIdleNudging(
mockAgent,
agentDef,
step,
500,
true
);

expect(result).toBe('exited');
expect(waitForExitFn).toHaveBeenCalledTimes(1);
expect(waitForIdleFn).not.toHaveBeenCalled();
expect(mockRelease).not.toHaveBeenCalled();
});

it('both timeout: fails step with timeout error', async () => {
waitForExitFn = vi.fn().mockResolvedValue('timeout');
waitForIdleFn = vi.fn().mockResolvedValue('timeout');
Expand Down
117 changes: 113 additions & 4 deletions packages/sdk/src/__tests__/workflow-runner.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -543,11 +543,11 @@ agents:
expect(run.status, run.error).toBe('completed');
});

it('should fail when owner response does not include completion marker', async () => {
it('should fail when owner response provides no decision, marker, or evidence', async () => {
mockSpawnOutputs = ['Owner completed work but forgot sentinel\n'];
const run = await runner.execute(makeConfig(), 'default');
expect(run.status).toBe('failed');
expect(run.error).toContain('owner completion marker');
expect(run.error).toContain('owner completion decision missing');
});

it('should run specialist work in a separate process and mirror worker output to the channel', async () => {
Expand All @@ -564,8 +564,11 @@ agents:
expect(spawnCalls[0][0].name).toContain('step-1-worker');
expect(spawnCalls[1][0].name).toContain('step-1-owner');
expect(spawnCalls[0][0].task).not.toContain('STEP_COMPLETE:step-1');
expect(spawnCalls[0][0].task).toContain('WORKER COMPLETION CONTRACT');
expect(spawnCalls[0][0].task).toContain('WORKER_DONE: <brief summary>');
expect(spawnCalls[1][0].task).toContain('You are the step owner/supervisor for step "step-1".');
expect(spawnCalls[1][0].task).toContain('runtime: step-1-worker');
expect(spawnCalls[1][0].task).toContain('LEAD_DONE: <brief summary>');

const channelMessages = (mockRelaycastAgent.send as any).mock.calls.map(
([, text]: [string, string]) => text
Expand All @@ -574,6 +577,112 @@ agents:
expect(channelMessages.some((text: string) => text.includes('worker finished'))).toBe(true);
});

it('should apply verification fallback for self-owned interactive steps', async () => {
mockSpawnOutputs = [
'LEAD_DONE\n',
'REVIEW_DECISION: APPROVE\nREVIEW_REASON: verified\n',
];

const run = await runner.execute(
makeConfig({
agents: [{ name: 'team-lead', cli: 'claude', role: 'Lead coordinator' }],
workflows: [
{
name: 'default',
steps: [
{
name: 'lead-step',
agent: 'team-lead',
task: 'Output exactly:\nLEAD_DONE\n/exit',
verification: { type: 'exit_code', value: 0 },
},
],
},
],
}),
'default'
);

expect(run.status, run.error).toBe('completed');
const steps = await db.getStepsByRunId(run.id);
expect(steps[0]?.completionReason).toBe('completed_verified');
});

it('should keep explicit interactive workers self-owned without extra supervisor/reviewer spawns', async () => {
const ownerAssignments: Array<{ owner: string; specialist: string }> = [];
runner.on((event) => {
if (event.type === 'step:owner-assigned') {
ownerAssignments.push({ owner: event.ownerName, specialist: event.specialistName });
}
});

mockSpawnOutputs = ['STEP_COMPLETE:worker-step\nWORKER_DONE_LOCAL\n'];

const run = await runner.execute(
makeConfig({
agents: [
{ name: 'team-lead', cli: 'claude', role: 'Lead coordinator', preset: 'lead' },
{ name: 'relay-worker', cli: 'codex', preset: 'worker', interactive: true },
],
workflows: [
{
name: 'default',
steps: [
{
name: 'worker-step',
agent: 'relay-worker',
task: 'Output exactly:\nWORKER_DONE_LOCAL\n/exit',
verification: { type: 'output_contains', value: 'WORKER_DONE_LOCAL' },
},
],
},
],
}),
'default'
);

expect(ownerAssignments).toContainEqual({ owner: 'relay-worker', specialist: 'relay-worker' });
expect(run.status).toBe('failed');
expect(run.error).toContain('verification failed');

const spawnCalls = (mockRelayInstance.spawnPty as any).mock.calls;
expect(spawnCalls).toHaveLength(1);
expect(spawnCalls[0][0].task).toContain('STEP OWNER CONTRACT');
expect(spawnCalls[0][0].name).not.toContain('-owner-');
expect(spawnCalls[0][0].name).not.toContain('-review-');
});

it('should pass canonical bypass args to interactive codex PTY spawns', async () => {
mockSpawnOutputs = [
'LEAD_DONE\n',
'REVIEW_DECISION: APPROVE\nREVIEW_REASON: verified\n',
];

const run = await runner.execute(
makeConfig({
agents: [{ name: 'lead', cli: 'codex', role: 'Lead coordinator' }],
workflows: [
{
name: 'default',
steps: [
{
name: 'lead-step',
agent: 'lead',
task: 'Output exactly:\nLEAD_DONE\n/exit',
verification: { type: 'exit_code', value: 0 },
},
],
},
],
}),
'default'
);

expect(run.status, run.error).toBe('completed');
const spawnCalls = (mockRelayInstance.spawnPty as any).mock.calls;
expect(spawnCalls[0][0].args).toEqual(['--dangerously-bypass-approvals-and-sandbox']);
});

it('should let the owner complete after checking file-based artifacts', async () => {
const tmpDir = mkdtempSync(path.join(os.tmpdir(), 'relay-owner-file-'));
const artifact = path.join(tmpDir, 'artifact.txt');
Expand Down Expand Up @@ -615,8 +724,8 @@ agents:
expect(stepRows[0].output).not.toContain('Worker already exited; artifacts look correct');
});

it('should fail closed when review response is malformed', async () => {
mockSpawnOutputs = ['STEP_COMPLETE:step-1\n', 'REVIEW_REASON: looks fine\n'];
it('should fail when review response lacks any usable decision signal', async () => {
mockSpawnOutputs = ['STEP_COMPLETE:step-1\n', 'I need more context before deciding.\n'];
const run = await runner.execute(makeConfig(), 'default');
expect(run.status).toBe('failed');
expect(run.error).toContain('review response malformed');
Expand Down
54 changes: 43 additions & 11 deletions packages/sdk/src/workflows/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -104,8 +104,8 @@ workflows:
agent: backend
task: "Build the REST API endpoints for user management"
verification:
type: output_contains
value: "BUILD_COMPLETE"
type: file_exists
value: "src/api/users.ts"
retries: 1

- name: write-tests
Expand Down Expand Up @@ -154,22 +154,50 @@ await runWorkflow("workflow.yaml", {

### Verification Checks

Each step can include a verification check that must pass for the step to be considered complete:
Each step can include a verification check. Verification is one input to the runner's **completion decision pipeline** — when verification passes, the step completes even without a sentinel marker.

| Type | Description |
|------|-------------|
| `output_contains` | Step output must contain the specified string |
| `exit_code` | Agent must exit with the specified code |
| `exit_code` | Agent must exit with the specified code (preferred for code-editing steps) |
| `file_exists` | A file must exist at the specified path after the step |
| `output_contains` | Step output must contain the specified string (optional accelerator) |
| `custom` | No-op in the runner; handled by external callers |

```yaml
# Preferred — deterministic verification
verification:
type: exit_code
value: "0"
description: "Process exited successfully"

# Also valid — output_contains as an optional accelerator
verification:
type: output_contains
value: "IMPLEMENTATION_COMPLETE"
description: "Agent must confirm completion"
description: "Agent confirms completion (optional fast-path)"
```

### Completion Decision Pipeline

The runner uses a multi-signal pipeline to decide step completion:

1. **Deterministic verification** — if a verification check passes, the step completes immediately (`completed_verified`)
2. **Owner decision** — the step owner can issue `OWNER_DECISION: COMPLETE|INCOMPLETE_RETRY|INCOMPLETE_FAIL` (`completed_by_owner_decision`)
3. **Evidence-based completion** — channel messages, file artifacts, and exit codes are collected as evidence (`completed_by_evidence`)
4. **Marker fast-path** — `STEP_COMPLETE:<step-name>` still works as an accelerator but is never required

| Completion State | Meaning |
|---|---|
| `completed_verified` | Deterministic verification passed |
| `completed_by_owner_decision` | Owner approved the step |
| `completed_by_evidence` | Evidence-based completion |
| `retry_requested_by_owner` | Owner requested retry |
| `failed_verification` | Verification explicitly failed |
| `failed_owner_decision` | Owner rejected the step |
| `failed_no_evidence` | No verification, no owner decision, no evidence |

**Review parsing is tolerant:** The runner accepts semantically equivalent outputs like "Approved", "Complete", "LGTM" — not just exact `REVIEW_DECISION: APPROVE` strings.

## Swarm Patterns

The `swarm.pattern` field controls how agents are coordinated:
Expand Down Expand Up @@ -642,12 +670,16 @@ The runner emits two new events for idle nudging:

## Automatic Step Owner and Review

For interactive agent steps, the runner now hardens handoffs automatically:
For interactive agent steps, the runner uses a point-person-led completion model:

1. **Elects a step owner** (prefers lead/coordinator-style agents, falls back to the step agent)
2. **Runs a completion decision pipeline** — checks deterministic verification first, then owner judgment, then evidence
3. **Owner can issue structured decisions** via `OWNER_DECISION: COMPLETE|INCOMPLETE_RETRY|INCOMPLETE_FAIL|NEEDS_CLARIFICATION` with optional `REASON: <text>`
4. **Review parsing is tolerant** — accepts "Approved", "Complete", "LGTM", not just exact `REVIEW_DECISION: APPROVE`
5. **Markers are optional accelerators** — `STEP_COMPLETE:<step-name>` still works as a fast-path but is never required
6. Stores primary output plus review output in the step artifact

1. Elects a step owner (prefers lead/coordinator-style agents, falls back to the step agent)
2. Requires the owner to provide an explicit completion signal (`STEP_COMPLETE:<step-name>`)
3. Runs a review pass before marking the step complete (prefers reviewer-style agents when present)
4. Stores primary output plus review output in the step artifact
**Evidence-based completion:** The runner collects channel messages, file artifacts, process exit codes, and coordination signals (e.g., WORKER_DONE posted in channel) as completion evidence. When sufficient evidence exists, the step completes without requiring any sentinel marker.

Deterministic and worktree steps are unchanged and do not require owner/review delegation.

Expand Down
Loading
Loading