AgentWorkforce · khaliqgant · Mar 13, 2026 · Mar 12, 2026 · Mar 12, 2026 · Mar 12, 2026
@@ -762,12 +762,12 @@ describe('PR #511 E2E: Auto Step Owner + Review Gating', () => {
   // ── Scenario 9: Owner completion marker validation ─────────────────────
 
   describe('Scenario 9: Owner completion marker', () => {
-    it('should fail when owner does not produce STEP_COMPLETE marker', async () => {
+    it('should fail when owner does not provide a marker, decision, or evidence', async () => {
       mockSpawnOutputs = ['The work is done but I forgot the sentinel.\n'];
 
       const run = await runner.execute(makeConfig(), 'default');
       expect(run.status).toBe('failed');
-      expect(run.error).toContain('owner completion marker');
+      expect(run.error).toContain('owner completion decision missing');
     }, 15000);
 
     it('should succeed when owner produces correct STEP_COMPLETE:step-name', async () => {

@@ -340,6 +340,43 @@ describe('Idle Nudge Detection', () => {
       expect(run.status).toBe('failed');
       expect(run.error).toContain('timed out');
     });
+
+    it('keeps a supervising lead alive after idle nudges are exhausted', async () => {
+      let exitCallCount = 0;
+      waitForExitFn = vi.fn().mockImplementation(() => {
+        exitCallCount++;
+        return Promise.resolve(exitCallCount < 3 ? 'timeout' : 'exited');
+      });
+
+      const config = makeConfig({
+        swarm: {
+          pattern: 'hub-spoke',
+          idleNudge: { nudgeAfterMs: 50, escalateAfterMs: 50, maxNudges: 1 },
+          channel: 'lead-supervision',
+        },
+      });
+      const agentDef = { name: 'team-lead', cli: 'claude', role: 'Lead coordinator' };
+      const step = {
+        name: 'step-1',
+        agent: 'team-lead',
+        task: 'Monitor #lead-supervision for WORKER_DONE, wait for the handoff, then exit.',
+      };
+
+      (runner as any).currentConfig = config;
+      expect((runner as any).shouldPreserveIdleSupervisor(agentDef, step)).toBe(true);
+
+      const result = await (runner as any).waitForExitWithIdleNudging(
+        mockAgent,
+        agentDef,
+        step,
+        500,
+        true
+      );
+
+      expect(result).toBe('exited');
+      expect(waitForExitFn).toHaveBeenCalledTimes(3);
+      expect(mockRelease).not.toHaveBeenCalled();
+    });
   });
 
   describe('Idle = done (no idleNudge config)', () => {
@@ -369,6 +406,37 @@ describe('Idle Nudge Detection', () => {
       expect(mockRelease).not.toHaveBeenCalled();
     });
 
+    it('does not treat supervisory lead idleness as completion', async () => {
+      waitForExitFn = vi.fn().mockResolvedValue('exited');
+      waitForIdleFn = vi.fn().mockResolvedValue('idle');
+
+      const config = makeConfig({
+        swarm: { pattern: 'hub-spoke', channel: 'lead-supervision' },
+      });
+      const agentDef = { name: 'team-lead', cli: 'claude', role: 'Lead coordinator' };
+      const step = {
+        name: 'step-1',
+        agent: 'team-lead',
+        task: 'Wait on #lead-supervision for WORKER_DONE before handing off.',
+      };
+
+      (runner as any).currentConfig = config;
+      expect((runner as any).shouldPreserveIdleSupervisor(agentDef, step)).toBe(true);
+
+      const result = await (runner as any).waitForExitWithIdleNudging(
+        mockAgent,
+        agentDef,
+        step,
+        500,
+        true
+      );
+
+      expect(result).toBe('exited');
+      expect(waitForExitFn).toHaveBeenCalledTimes(1);
+      expect(waitForIdleFn).not.toHaveBeenCalled();
+      expect(mockRelease).not.toHaveBeenCalled();
+    });
+
     it('both timeout: fails step with timeout error', async () => {
       waitForExitFn = vi.fn().mockResolvedValue('timeout');
       waitForIdleFn = vi.fn().mockResolvedValue('timeout');

@@ -543,11 +543,11 @@ agents:
       expect(run.status, run.error).toBe('completed');
     });
 
-    it('should fail when owner response does not include completion marker', async () => {
+    it('should fail when owner response provides no decision, marker, or evidence', async () => {
       mockSpawnOutputs = ['Owner completed work but forgot sentinel\n'];
       const run = await runner.execute(makeConfig(), 'default');
       expect(run.status).toBe('failed');
-      expect(run.error).toContain('owner completion marker');
+      expect(run.error).toContain('owner completion decision missing');
     });
 
     it('should run specialist work in a separate process and mirror worker output to the channel', async () => {
@@ -564,8 +564,11 @@ agents:
       expect(spawnCalls[0][0].name).toContain('step-1-worker');
       expect(spawnCalls[1][0].name).toContain('step-1-owner');
       expect(spawnCalls[0][0].task).not.toContain('STEP_COMPLETE:step-1');
+      expect(spawnCalls[0][0].task).toContain('WORKER COMPLETION CONTRACT');
+      expect(spawnCalls[0][0].task).toContain('WORKER_DONE: <brief summary>');
       expect(spawnCalls[1][0].task).toContain('You are the step owner/supervisor for step "step-1".');
       expect(spawnCalls[1][0].task).toContain('runtime: step-1-worker');
+      expect(spawnCalls[1][0].task).toContain('LEAD_DONE: <brief summary>');
 
       const channelMessages = (mockRelaycastAgent.send as any).mock.calls.map(
         ([, text]: [string, string]) => text
@@ -574,6 +577,112 @@ agents:
       expect(channelMessages.some((text: string) => text.includes('worker finished'))).toBe(true);
     });
 
+    it('should apply verification fallback for self-owned interactive steps', async () => {
+      mockSpawnOutputs = [
+        'LEAD_DONE\n',
+        'REVIEW_DECISION: APPROVE\nREVIEW_REASON: verified\n',
+      ];
+
+      const run = await runner.execute(
+        makeConfig({
+          agents: [{ name: 'team-lead', cli: 'claude', role: 'Lead coordinator' }],
+          workflows: [
+            {
+              name: 'default',
+              steps: [
+                {
+                  name: 'lead-step',
+                  agent: 'team-lead',
+                  task: 'Output exactly:\nLEAD_DONE\n/exit',
+                  verification: { type: 'exit_code', value: 0 },
+                },
+              ],
+            },
+          ],
+        }),
+        'default'
+      );
+
+      expect(run.status, run.error).toBe('completed');
+      const steps = await db.getStepsByRunId(run.id);
+      expect(steps[0]?.completionReason).toBe('completed_verified');
+    });
+
+    it('should keep explicit interactive workers self-owned without extra supervisor/reviewer spawns', async () => {
+      const ownerAssignments: Array<{ owner: string; specialist: string }> = [];
+      runner.on((event) => {
+        if (event.type === 'step:owner-assigned') {
+          ownerAssignments.push({ owner: event.ownerName, specialist: event.specialistName });
+        }
+      });
+
+      mockSpawnOutputs = ['STEP_COMPLETE:worker-step\nWORKER_DONE_LOCAL\n'];
+
+      const run = await runner.execute(
+        makeConfig({
+          agents: [
+            { name: 'team-lead', cli: 'claude', role: 'Lead coordinator', preset: 'lead' },
+            { name: 'relay-worker', cli: 'codex', preset: 'worker', interactive: true },
+          ],
+          workflows: [
+            {
+              name: 'default',
+              steps: [
+                {
+                  name: 'worker-step',
+                  agent: 'relay-worker',
+                  task: 'Output exactly:\nWORKER_DONE_LOCAL\n/exit',
+                  verification: { type: 'output_contains', value: 'WORKER_DONE_LOCAL' },
+                },
+              ],
+            },
+          ],
+        }),
+        'default'
+      );
+
+      expect(ownerAssignments).toContainEqual({ owner: 'relay-worker', specialist: 'relay-worker' });
+      expect(run.status).toBe('failed');
+      expect(run.error).toContain('verification failed');
+
+      const spawnCalls = (mockRelayInstance.spawnPty as any).mock.calls;
+      expect(spawnCalls).toHaveLength(1);
+      expect(spawnCalls[0][0].task).toContain('STEP OWNER CONTRACT');
+      expect(spawnCalls[0][0].name).not.toContain('-owner-');
+      expect(spawnCalls[0][0].name).not.toContain('-review-');
+    });
+
+    it('should pass canonical bypass args to interactive codex PTY spawns', async () => {
+      mockSpawnOutputs = [
+        'LEAD_DONE\n',
+        'REVIEW_DECISION: APPROVE\nREVIEW_REASON: verified\n',
+      ];
+
+      const run = await runner.execute(
+        makeConfig({
+          agents: [{ name: 'lead', cli: 'codex', role: 'Lead coordinator' }],
+          workflows: [
+            {
+              name: 'default',
+              steps: [
+                {
+                  name: 'lead-step',
+                  agent: 'lead',
+                  task: 'Output exactly:\nLEAD_DONE\n/exit',
+                  verification: { type: 'exit_code', value: 0 },
+                },
+              ],
+            },
+          ],
+        }),
+        'default'
+      );
+
+      expect(run.status, run.error).toBe('completed');
+      const spawnCalls = (mockRelayInstance.spawnPty as any).mock.calls;
+      expect(spawnCalls[0][0].args).toEqual(['--dangerously-bypass-approvals-and-sandbox']);
+    });
+
     it('should let the owner complete after checking file-based artifacts', async () => {
       const tmpDir = mkdtempSync(path.join(os.tmpdir(), 'relay-owner-file-'));
       const artifact = path.join(tmpDir, 'artifact.txt');
@@ -615,8 +724,8 @@ agents:
       expect(stepRows[0].output).not.toContain('Worker already exited; artifacts look correct');
     });
 
-    it('should fail closed when review response is malformed', async () => {
-      mockSpawnOutputs = ['STEP_COMPLETE:step-1\n', 'REVIEW_REASON: looks fine\n'];
+    it('should fail when review response lacks any usable decision signal', async () => {
+      mockSpawnOutputs = ['STEP_COMPLETE:step-1\n', 'I need more context before deciding.\n'];
       const run = await runner.execute(makeConfig(), 'default');
       expect(run.status).toBe('failed');
       expect(run.error).toContain('review response malformed');

@@ -104,8 +104,8 @@ workflows:
         agent: backend
         task: "Build the REST API endpoints for user management"
         verification:
-          type: output_contains
-          value: "BUILD_COMPLETE"
+          type: file_exists
+          value: "src/api/users.ts"
         retries: 1
 
       - name: write-tests
@@ -154,22 +154,50 @@ await runWorkflow("workflow.yaml", {
 
 ### Verification Checks
 
-Each step can include a verification check that must pass for the step to be considered complete:
+Each step can include a verification check. Verification is one input to the runner's **completion decision pipeline** — when verification passes, the step completes even without a sentinel marker.
 
 | Type | Description |
 |------|-------------|
-| `output_contains` | Step output must contain the specified string |
-| `exit_code` | Agent must exit with the specified code |
+| `exit_code` | Agent must exit with the specified code (preferred for code-editing steps) |
 | `file_exists` | A file must exist at the specified path after the step |
+| `output_contains` | Step output must contain the specified string (optional accelerator) |
 | `custom` | No-op in the runner; handled by external callers |
 
 ```yaml
+# Preferred — deterministic verification
+verification:
+  type: exit_code
+  value: "0"
+  description: "Process exited successfully"
+
+# Also valid — output_contains as an optional accelerator
 verification:
   type: output_contains
   value: "IMPLEMENTATION_COMPLETE"
-  description: "Agent must confirm completion"
+  description: "Agent confirms completion (optional fast-path)"
 ```
 
+### Completion Decision Pipeline
+
+The runner uses a multi-signal pipeline to decide step completion:
+
+1. **Deterministic verification** — if a verification check passes, the step completes immediately (`completed_verified`)
+2. **Owner decision** — the step owner can issue `OWNER_DECISION: COMPLETE|INCOMPLETE_RETRY|INCOMPLETE_FAIL` (`completed_by_owner_decision`)
+3. **Evidence-based completion** — channel messages, file artifacts, and exit codes are collected as evidence (`completed_by_evidence`)
+4. **Marker fast-path** — `STEP_COMPLETE:<step-name>` still works as an accelerator but is never required
+
+| Completion State | Meaning |
+|---|---|
+| `completed_verified` | Deterministic verification passed |
+| `completed_by_owner_decision` | Owner approved the step |
+| `completed_by_evidence` | Evidence-based completion |
+| `retry_requested_by_owner` | Owner requested retry |
+| `failed_verification` | Verification explicitly failed |
+| `failed_owner_decision` | Owner rejected the step |
+| `failed_no_evidence` | No verification, no owner decision, no evidence |
+
+**Review parsing is tolerant:** The runner accepts semantically equivalent outputs like "Approved", "Complete", "LGTM" — not just exact `REVIEW_DECISION: APPROVE` strings.
+
 ## Swarm Patterns
 
 The `swarm.pattern` field controls how agents are coordinated:
@@ -642,12 +670,16 @@ The runner emits two new events for idle nudging:
 
 ## Automatic Step Owner and Review
 
-For interactive agent steps, the runner now hardens handoffs automatically:
+For interactive agent steps, the runner uses a point-person-led completion model:
+
+1. **Elects a step owner** (prefers lead/coordinator-style agents, falls back to the step agent)
+2. **Runs a completion decision pipeline** — checks deterministic verification first, then owner judgment, then evidence
+3. **Owner can issue structured decisions** via `OWNER_DECISION: COMPLETE|INCOMPLETE_RETRY|INCOMPLETE_FAIL|NEEDS_CLARIFICATION` with optional `REASON: <text>`
+4. **Review parsing is tolerant** — accepts "Approved", "Complete", "LGTM", not just exact `REVIEW_DECISION: APPROVE`
+5. **Markers are optional accelerators** — `STEP_COMPLETE:<step-name>` still works as a fast-path but is never required
+6. Stores primary output plus review output in the step artifact
 
-1. Elects a step owner (prefers lead/coordinator-style agents, falls back to the step agent)
-2. Requires the owner to provide an explicit completion signal (`STEP_COMPLETE:<step-name>`)
-3. Runs a review pass before marking the step complete (prefers reviewer-style agents when present)
-4. Stores primary output plus review output in the step artifact
+**Evidence-based completion:** The runner collects channel messages, file artifacts, process exit codes, and coordination signals (e.g., WORKER_DONE posted in channel) as completion evidence. When sufficient evidence exists, the step completes without requiring any sentinel marker.
 
 Deterministic and worktree steps are unchanged and do not require owner/review delegation.