From 9435b71f33879eb18c58a77058fcf1efb9b82d6a Mon Sep 17 00:00:00 2001
From: Khaliq <khaliqgant@gmail.com>
Date: Fri, 15 May 2026 23:03:11 +0200
Subject: [PATCH] fix(auto-fix): don't resume from a gate whose inputs come
 from skipped agent steps
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a generated child workflow failed at a deterministic gate (e.g.
`final-review-pass-gate`, which greps an artifact written by upstream
`final-fix-claude`/`final-fix-codex` agent steps), the auto-fix loop
resumed `--start-from <gate> --previous-run-id <id>`. That skips the
14 upstream steps including the agent producers, so the gate's input
artifact is never regenerated and the gate fails identically on every
attempt. The child exhausts its 7 auto-fix attempts, the master
orchestrator retries the whole child, and it loops ~37 min/cycle
forever — observed on the workspace-primitives PR-39 master run
(child 10 `update-last-week`).

Add `resolveSafeResumeAnchor`: parse the workflow step graph from the
artifact via the TypeScript AST (per AGENTS.md grammar-aware-parser
rule), and when the failed step is a non-regenerating step (type
deterministic/worktree/integration, no `agent:`) whose transitive
dependency chain contains an agent/regenerating step, move the resume
anchor to the *nearest* upstream producer so the gate's input is
regenerated. Earlier artifacts persist on disk between runs so they
need not be recomputed.

Gate the downgrade on `failedStepRepaired`: when a repair rewrote the
failed step's own config (e.g. the `lead-plan-gate` marker-append
logic — the gate itself was the bug), the fix lives in that step and
resuming from it is correct, so the anchor is left unchanged. The
downgrade only fires when the repair left the failed step untouched.

Applies at the persona/deterministic-repair retry site (carries
old+new artifact content) and the repair-provider-failure site. The
direct-repair path (V1 env/binary blockers) has no artifact content in
scope and is unaffected.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/local/auto-fix-loop.test.ts |  56 +++++++-
 src/local/auto-fix-loop.ts      | 242 +++++++++++++++++++++++++++++++-
 2 files changed, 291 insertions(+), 7 deletions(-)
diff --git a/src/local/auto-fix-loop.test.ts b/src/local/auto-fix-loop.test.ts
index 3a341d59..e4444665 100644
--- a/src/local/auto-fix-loop.test.ts
+++ b/src/local/auto-fix-loop.test.ts
@@ -4,7 +4,7 @@ import { join } from 'node:path';
 
 import { describe, expect, it, vi } from 'vitest';
 
-import { repairWorkflowDeterministically, runWithAutoFix } from './auto-fix-loop.js';
+import { repairWorkflowDeterministically, resolveSafeResumeAnchor, runWithAutoFix } from './auto-fix-loop.js';
 import type { LocalClassifiedBlocker, LocalResponse } from './entrypoint.js';
 import type { LocalInvocationRequest } from './request-normalizer.js';
 import type { FailureClassification } from '../runtime/failure/types.js';
@@ -1074,6 +1074,60 @@ describe('isSyntheticStageId', () => {
   });
 });
 
+describe('resolveSafeResumeAnchor', () => {
+  // Mirrors the generated child-workflow shape that caused the
+  // final-review-pass-gate infinite-retry loop: agent steps produce review
+  // artifacts, then a deterministic gate greps them.
+  const reviewGateWorkflow = `import { workflow } from '@agent-relay/sdk/workflows';
+const wf = workflow('child')
+  .step("prepare-context", { type: "deterministic", command: "echo prep" })
+  .step("implement-slice", { agent: "impl-codex", dependsOn: ["prepare-context"], task: "do work" })
+  .step("review-claude", { agent: "reviewer-claude", dependsOn: ["implement-slice"], task: "review" })
+  .step("final-fix-codex", { agent: "validator-codex", dependsOn: ["review-claude"], task: "final fix" })
+  .step("final-review-pass-gate", { type: "deterministic", dependsOn: ["final-fix-codex"], command: "grep -F MARKER artifact.md" })
+  .step("final-signoff", { type: "deterministic", dependsOn: ["final-review-pass-gate"], command: "echo done" });
+`;
+
+  it('moves the anchor off a deterministic gate to its nearest upstream agent producer', () => {
+    expect(resolveSafeResumeAnchor('final-review-pass-gate', reviewGateWorkflow)).toBe('final-fix-codex');
+  });
+
+  it('leaves an agent step as its own resume anchor', () => {
+    expect(resolveSafeResumeAnchor('review-claude', reviewGateWorkflow)).toBe('review-claude');
+  });
+
+  it('keeps the anchor when the failed step is the gate but the chain is purely deterministic', () => {
+    const deterministicOnly = `import { workflow } from '@agent-relay/sdk/workflows';
+const wf = workflow('det')
+  .step("build", { type: "deterministic", command: "make" })
+  .step("verify", { type: "deterministic", dependsOn: ["build"], command: "test -f out" });
+`;
+    expect(resolveSafeResumeAnchor('verify', deterministicOnly)).toBe('verify');
+  });
+
+  it('returns the failed step unchanged when content is missing or unparseable', () => {
+    expect(resolveSafeResumeAnchor('final-review-pass-gate', undefined)).toBe('final-review-pass-gate');
+    expect(resolveSafeResumeAnchor('final-review-pass-gate', '')).toBe('final-review-pass-gate');
+    expect(resolveSafeResumeAnchor(undefined, reviewGateWorkflow)).toBeUndefined();
+  });
+
+  it('returns the failed step unchanged when it is not a known step', () => {
+    expect(resolveSafeResumeAnchor('does-not-exist', reviewGateWorkflow)).toBe('does-not-exist');
+  });
+
+  it('does not treat step ids embedded in shell command HEREDOCs as real steps', () => {
+    // The grep below references "final-review-pass-gate" inside a command
+    // string; the AST keeps string-literal contents inert so it is not
+    // mistaken for a step definition.
+    const withHeredoc = `import { workflow } from '@agent-relay/sdk/workflows';
+const wf = workflow('child')
+  .step("implement", { agent: "impl-codex", task: "work" })
+  .step("gate", { type: "deterministic", dependsOn: ["implement"], command: "echo .step(\\"final-review-pass-gate\\", {}) && grep MARKER f" });
+`;
+    expect(resolveSafeResumeAnchor('gate', withHeredoc)).toBe('implement');
+  });
+});
+
 function successResponse(runId: string): LocalResponse {
   return {
     ok: true,
diff --git a/src/local/auto-fix-loop.ts b/src/local/auto-fix-loop.ts
index 0e6d51b0..e0840a2e 100644
--- a/src/local/auto-fix-loop.ts
+++ b/src/local/auto-fix-loop.ts
@@ -159,6 +159,199 @@ function safeToRetryWithoutStartFromStep(
   return !workflowDidExpensiveWork(evidence);
 }
 
+interface WorkflowStepNode {
+  id: string;
+  /**
+   * True when the step regenerates its own output on every run (agent steps,
+   * and any non-deterministic step type). False only for step types the SDK
+   * treats as broker-free and pure: `deterministic`, `worktree`, `integration`
+   * (see `@agent-relay/sdk` runner `requiresBroker`). Deterministic gates that
+   * validate artifacts written by upstream agent steps are the failure mode
+   * this guards: resuming `--start-from <gate>` skips the agent producers, so
+   * the gate's input is never regenerated and it fails forever.
+   */
+  regeneratesOutput: boolean;
+  dependsOn: string[];
+  order: number;
+}
+
+/**
+ * Parse `workflow.step("id", { ... })` builder calls from a generated workflow
+ * artifact into a step-dependency graph. Uses the TypeScript AST (per AGENTS.md
+ * "Source-Text Analysis: Use Grammar-Aware Parsers, Not Regex") so step ids
+ * embedded in shell `command` HEREDOCs or `task` prose don't get mistaken for
+ * real steps. Returns null when the artifact has no recognizable steps.
+ */
+function parseWorkflowStepGraph(content: string): Map<string, WorkflowStepNode> | null {
+  let sourceFile: ts.SourceFile;
+  try {
+    sourceFile = ts.createSourceFile(
+      'ricky-workflow-artifact.ts',
+      content,
+      ts.ScriptTarget.Latest,
+      false,
+      ts.ScriptKind.TS,
+    );
+  } catch {
+    return null;
+  }
+
+  const steps = new Map<string, WorkflowStepNode>();
+  let order = 0;
+  const nonRegeneratingTypes = new Set(['deterministic', 'worktree', 'integration']);
+
+  const visit = (node: ts.Node): void => {
+    if (
+      ts.isCallExpression(node)
+      && ts.isPropertyAccessExpression(node.expression)
+      && node.expression.name.text === 'step'
+      && node.arguments.length >= 2
+      && ts.isStringLiteralLike(node.arguments[0])
+      && ts.isObjectLiteralExpression(node.arguments[1])
+    ) {
+      const id = node.arguments[0].text;
+      const config = node.arguments[1];
+      let hasAgent = false;
+      let stepType: string | undefined;
+      let dependsOn: string[] = [];
+      for (const prop of config.properties) {
+        if (!ts.isPropertyAssignment(prop) || !prop.name) continue;
+        const key = ts.isIdentifier(prop.name) || ts.isStringLiteralLike(prop.name)
+          ? prop.name.text
+          : undefined;
+        if (key === 'agent') hasAgent = true;
+        if (key === 'type' && ts.isStringLiteralLike(prop.initializer)) {
+          stepType = prop.initializer.text;
+        }
+        if (key === 'dependsOn' && ts.isArrayLiteralExpression(prop.initializer)) {
+          dependsOn = prop.initializer.elements
+            .filter(ts.isStringLiteralLike)
+            .map((element) => element.text);
+        }
+      }
+      // A step regenerates output unless it is explicitly one of the pure
+      // SDK step types AND not an agent step. Default-unknown → treat as
+      // regenerating (safe: resuming there re-runs it).
+      const regeneratesOutput = hasAgent
+        || stepType === undefined
+        || !nonRegeneratingTypes.has(stepType);
+      if (!steps.has(id)) {
+        steps.set(id, { id, regeneratesOutput, dependsOn, order: order++ });
+      }
+    }
+    ts.forEachChild(node, visit);
+  };
+  visit(sourceFile);
+  return steps.size > 0 ? steps : null;
+}
+
+/**
+ * Choose a safe `--start-from` anchor for an auto-fix retry.
+ *
+ * Resuming directly from `failedStep` is correct only when re-running that
+ * step alone can succeed. It cannot when `failedStep` is a deterministic gate
+ * that validates an artifact produced by an upstream agent step: resuming
+ * from the gate skips the agent producer, the artifact is never regenerated,
+ * and the gate fails identically every attempt (the
+ * `final-review-pass-gate` infinite-retry loop). In that case, anchor the
+ * resume at the *nearest* upstream agent step the gate transitively depends
+ * on so the producing step re-runs and regenerates the gate's input; earlier
+ * artifacts persist on disk between runs so they need not be recomputed.
+ *
+ * Returns `failedStep` unchanged when it is itself a regenerating step, when
+ * its whole dependency chain is pure/deterministic (recompute is safe), or
+ * when the workflow graph cannot be parsed.
+ */
+/**
+ * Extract the literal source text of a step's config object —
+ * `.step("<id>", { ... })` → the `{ ... }` span — via the TypeScript AST.
+ * Used to detect whether a repair actually rewrote the failed step.
+ */
+function extractStepConfigText(content: string, stepId: string): string | null {
+  let sourceFile: ts.SourceFile;
+  try {
+    sourceFile = ts.createSourceFile(
+      'ricky-workflow-artifact.ts',
+      content,
+      ts.ScriptTarget.Latest,
+      false,
+      ts.ScriptKind.TS,
+    );
+  } catch {
+    return null;
+  }
+  let found: string | null = null;
+  const visit = (node: ts.Node): void => {
+    if (found !== null) return;
+    if (
+      ts.isCallExpression(node)
+      && ts.isPropertyAccessExpression(node.expression)
+      && node.expression.name.text === 'step'
+      && node.arguments.length >= 2
+      && ts.isStringLiteralLike(node.arguments[0])
+      && node.arguments[0].text === stepId
+      && ts.isObjectLiteralExpression(node.arguments[1])
+    ) {
+      found = node.arguments[1].getText(sourceFile);
+      return;
+    }
+    ts.forEachChild(node, visit);
+  };
+  visit(sourceFile);
+  return found;
+}
+
+/**
+ * True when a repair rewrote the failed step's own definition. When the gate
+ * step itself was the bug (e.g. the `lead-plan-gate` marker-append logic) the
+ * repair fixes it in place and resuming `--start-from <gate>` is correct —
+ * the resume-anchor downgrade must NOT fire in that case.
+ */
+function failedStepRepaired(
+  failedStep: string,
+  oldContent: string | undefined,
+  newContent: string | undefined,
+): boolean {
+  if (!oldContent || !newContent) return false;
+  const before = extractStepConfigText(oldContent, failedStep);
+  const after = extractStepConfigText(newContent, failedStep);
+  if (before === null || after === null) return false;
+  return before !== after;
+}
+
+export function resolveSafeResumeAnchor(
+  failedStep: string | undefined,
+  workflowContent: string | undefined,
+): string | undefined {
+  if (!failedStep || !workflowContent) return failedStep;
+  const graph = parseWorkflowStepGraph(workflowContent);
+  if (!graph) return failedStep;
+  const failed = graph.get(failedStep);
+  if (!failed) return failedStep;
+  if (failed.regeneratesOutput) return failedStep;
+
+  const seen = new Set<string>([failedStep]);
+  let frontier = [...failed.dependsOn];
+  while (frontier.length > 0) {
+    const nearestAgent = frontier
+      .map((id) => graph.get(id))
+      .filter((node): node is WorkflowStepNode => Boolean(node) && node!.regeneratesOutput)
+      // Prefer the most-recently-declared producer when several sit at the
+      // same dependency distance — it is the closest to the gate.
+      .sort((a, b) => b.order - a.order)[0];
+    if (nearestAgent) return nearestAgent.id;
+    const next: string[] = [];
+    for (const id of frontier) {
+      if (seen.has(id)) continue;
+      seen.add(id);
+      const node = graph.get(id);
+      if (node) next.push(...node.dependsOn);
+    }
+    frontier = next;
+  }
+  return failedStep;
+}
+
 /**
  * Reason string explaining why we refused to retry from scratch. Kept in one
  * place so escalation messages and warnings stay aligned.
@@ -468,6 +661,22 @@ export async function runWithAutoFix(
           retryOfRunId = runId;
         }
 
+        // If the repair rewrote the failed step itself, the fix lives in that
+        // step — resume from it. Only when the repair left the failed step
+        // untouched (e.g. it made downstream validation non-terminal but did
+        // not fix a gate whose input comes from a skipped upstream agent
+        // step) do we downgrade the anchor to the artifact's producer.
+        const personaResumeAnchor =
+          failedStep && failedStepRepaired(failedStep, repairTarget.artifactContent, repair.content)
+            ? failedStep
+            : resolveSafeResumeAnchor(failedStep, repair.content);
+        if (personaResumeAnchor !== failedStep && failedStep) {
+          warnings.push(
+            `Auto-fix resume anchor moved from "${failedStep}" to "${personaResumeAnchor}": ` +
+            `"${failedStep}" validates artifacts produced by an upstream agent step that a ` +
+            `direct --start-from would skip.`,
+          );
+        }
         currentRequest = {
           ...retryBaseRequest(currentRequest, response, repairedArtifactPath, repair.content),
           autoFix: undefined,
@@ -475,11 +684,11 @@ export async function runWithAutoFix(
             attempt: attempt + 1,
             maxAttempts,
             ...(runId ? { previousRunId: runId, retryOfRunId: retryOfRunId ?? runId } : {}),
-            ...(failedStep ? { startFromStep: failedStep } : {}),
+            ...(personaResumeAnchor ? { startFromStep: personaResumeAnchor } : {}),
             reason: `auto-fix retry after Workforce workflow persona repair for ${blockerCode ?? 'local failure'}`,
           },
         };
-        onProgress?.(`Retrying workflow${failedStep ? ` from ${failedStep}` : ''}...`);
+        onProgress?.(`Retrying workflow${personaResumeAnchor ? ` from ${personaResumeAnchor}` : ''}...`);
         continue;
       } catch (error) {
         attemptSummary.fix_error = error instanceof Error ? error.message : String(error);
@@ -493,6 +702,14 @@ export async function runWithAutoFix(
           } else if (!retryOfRunId) {
             retryOfRunId = runId;
           }
+          const providerFailResumeAnchor = resolveSafeResumeAnchor(failedStep, repairTarget.artifactContent);
+          if (providerFailResumeAnchor !== failedStep && failedStep) {
+            warnings.push(
+              `Auto-fix resume anchor moved from "${failedStep}" to "${providerFailResumeAnchor}": ` +
+              `"${failedStep}" validates artifacts produced by an upstream agent step that a ` +
+              `direct --start-from would skip.`,
+            );
+          }
           currentRequest = {
             ...retryBaseRequest(currentRequest, response),
             autoFix: undefined,
@@ -500,11 +717,11 @@ export async function runWithAutoFix(
               attempt: attempt + 1,
               maxAttempts,
               ...(runId ? { previousRunId: runId, retryOfRunId: retryOfRunId ?? runId } : {}),
-              ...(failedStep ? { startFromStep: failedStep } : {}),
+              ...(providerFailResumeAnchor ? { startFromStep: providerFailResumeAnchor } : {}),
               reason: `auto-fix retry after workflow repair provider failure for ${blockerCode ?? 'local failure'}`,
             },
           };
-          onProgress?.(`Workflow repair provider failed; retrying workflow${failedStep ? ` from ${failedStep}` : ''}...`);
+          onProgress?.(`Workflow repair provider failed; retrying workflow${providerFailResumeAnchor ? ` from ${providerFailResumeAnchor}` : ''}...`);
           continue;
         }
         if (!safeToRetryWithoutStartFromStep(failedStep, evidence)) {
@@ -605,6 +822,19 @@ export async function runWithAutoFix(
       retryOfRunId = runId;
     }
 
+    // The direct-repair path is only reached for V1 environment/binary
+    // blockers, after the workflow-repair branch has already returned. No
+    // workflow artifact content is in scope here, so the anchor stays as the
+    // failed step (the review-gate skip-loop is handled by the workflow-repair
+    // sites above, which do carry artifact content).
+    const directResumeAnchor = resolveSafeResumeAnchor(failedStep, undefined);
+    if (directResumeAnchor !== failedStep && failedStep) {
+      warnings.push(
+        `Auto-fix resume anchor moved from "${failedStep}" to "${directResumeAnchor}": ` +
+        `"${failedStep}" validates artifacts produced by an upstream agent step that a ` +
+        `direct --start-from would skip.`,
+      );
+    }
     currentRequest = {
       ...retryBaseRequest(currentRequest, response),
       autoFix: undefined,
@@ -612,11 +842,11 @@ export async function runWithAutoFix(
         attempt: attempt + 1,
         maxAttempts,
         ...(runId ? { previousRunId: runId, retryOfRunId: retryOfRunId ?? runId } : {}),
-        ...(failedStep ? { startFromStep: failedStep } : {}),
+        ...(directResumeAnchor ? { startFromStep: directResumeAnchor } : {}),
         reason: `auto-fix retry after ${blockerCode ?? 'local failure'}`,
       },
     };
-    onProgress?.(`Retrying workflow${failedStep ? ` from ${failedStep}` : ''}...`);
+    onProgress?.(`Retrying workflow${directResumeAnchor ? ` from ${directResumeAnchor}` : ''}...`);
   }
 
   return withAutoFix(lastResponse ?? failedBeforeAttempt(request), maxAttempts, attempts, 'error', warnings, trackingRunId);