From c4b65582bfdcbbce8db77228f2230c2b5415c4b6 Mon Sep 17 00:00:00 2001 From: Fsocietyhhh <1211904451@qq.com> Date: Sat, 30 May 2026 17:41:22 -0700 Subject: [PATCH 1/2] fix(agent): stop a 6% gateway blip from killing the whole session (3.23.1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Audited 2026-05-28 from telemetry: 28/468 paid-model calls (~6%) return a PaymentRejected from the Solana gateway intermittently — identical prompts succeed 5 s apart. Three client-side defects amplified that blip into 'totally unusable, restart doesn't help': 1. error-classifier: 'payment_rejected' was non-transient with maxRetries=0. A single blip surfaced as a hard error. Fixed: mark transient with maxRetries=3. Each retry re-signs with a fresh nonce (llm.ts), so it's not a replay; deterministic failures (clock skew, wrong chain) still exhaust the budget quickly and fall through. 2. loop.ts: 'payment_rejected' was treated identically to 'payment' (insufficient funds) — added to paymentFailedModels for the whole session. One blip permanently demoted the user to free models. Fixed: split the two. 'payment' stays session-permanent (wallet won't refill mid-session). 'payment_rejected' only falls back FOR THIS TURN; next turn resets to baseModel and tries the paid model again. 3. start.ts: disconnectMcpServers() was fire-and-forget and there was no explicit process.exit(). Lingering handles (keep-alive sockets from the panel HTTP server and gateway clients, MCP child processes, the FRANKLIN_EXTRACT_ON_EXIT background promise) pinned the event loop. User saw 'Goodbye.' but `ps` still showed the process; a follow-up `franklin` raced with the zombie. Fixed: bounded MCP shutdown race (2 s cap) followed by explicit process.exit() in both Ink and basic UIs. #1 + #2 + #3 together turn a 6% transient into 'session ruined and restart doesn't help'. Fixing #1 and #2 caps the blast radius at one turn even when the gateway hiccups; fixing #3 ensures that quitting actually ends the process, so a follow-up `franklin` no longer races with a zombie. 
Gateway-side root cause (Solana nonce-cache race + missing RETRYABLE entries) is tracked separately in BlockRunAI/blockrun-sol. --- package.json | 2 +- src/agent/error-classifier.ts | 15 ++++++++++++-- src/agent/loop.ts | 38 ++++++++++++++++++++++++++++------- src/commands/start.ts | 27 ++++++++++++++++++++++++- test/local.mjs | 20 ++++++++++++------ 5 files changed, 85 insertions(+), 17 deletions(-) diff --git a/package.json b/package.json index 52946550..d77dd7a3 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@blockrun/franklin", - "version": "3.23.0", + "version": "3.23.1", "description": "Franklin Agent — The AI agent with a wallet. Spends USDC autonomously to get real work done. Pay per action, no subscriptions.", "type": "module", "exports": { diff --git a/src/agent/error-classifier.ts b/src/agent/error-classifier.ts index 1c6ca855..341f839d 100644 --- a/src/agent/error-classifier.ts +++ b/src/agent/error-classifier.ts @@ -65,6 +65,17 @@ export function classifyAgentError(message: string): AgentErrorInfo { // `Exa /v1/exa/search failed (402): {"error":"Payment verification failed",...}`. // Classify BEFORE the generic 'payment' branch below since the body // contains both 'payment' and 'verification failed'. + // + // Treated as transient with a small retry budget: real-world telemetry + // (2026-05-28 audit) shows the gateway intermittently rejects valid + // signed payments under burst load — identical prompts succeed 5s + // later. Most plausible root cause is a nonce-cache race in the + // gateway's replay protection. Retrying re-signs with a fresh nonce on + // each attempt (llm.ts derives a new nonce per request), so a retry + // is NOT a replay. Three attempts is enough to ride out the blip + // without burning tokens on a model whose wallet is genuinely + // misconfigured (clock skew, wrong chain) — those failure modes are + // deterministic and will exhaust the budget quickly. 
if (includesAny(err, [ 'verification failed', 'payment verification', @@ -75,8 +86,8 @@ export function classifyAgentError(message: string): AgentErrorInfo { 'replay protection', ])) { return { - category: 'payment_rejected', label: 'PaymentRejected', isTransient: false, maxRetries: 0, - suggestion: 'The gateway rejected your signed payment. Run `franklin balance` to confirm funds + chain. Common causes: clock skew (resync system clock), wrong chain selected (use `/chain` to switch), or stale nonce (the same retry will fail). Switch to a free model with `/model free` to keep working.', + category: 'payment_rejected', label: 'PaymentRejected', isTransient: true, maxRetries: 3, + suggestion: 'The gateway rejected your signed payment. If this keeps happening: run `franklin balance` to confirm funds + chain. Common causes: clock skew (resync system clock), wrong chain selected (use `/chain` to switch). Transient blips are auto-retried.', }; } diff --git a/src/agent/loop.ts b/src/agent/loop.ts index 4ee6a963..12e5df0c 100644 --- a/src/agent/loop.ts +++ b/src/agent/loop.ts @@ -1710,13 +1710,21 @@ export async function interactiveSession( } // ── Payment failure: auto-fallback to free models ── - // Track payment-failed models for the entire session — unlike transient errors, - // 402s will keep failing until the user adds funds. Also handles - // payment_rejected (signature verified-and-rejected by gateway): - // same fallback path, but the suggestion text in classifier guides - // the user toward clock-skew / chain-mismatch fixes rather than - // "add funds." - if (classified.category === 'payment' || classified.category === 'payment_rejected') { + // 'payment' (insufficient funds / 402): session-permanent blacklist — + // the wallet won't refill mid-session, so retrying the same model + // just wastes a turn. Record to elo so the router learns to avoid it. 
+ // + // 'payment_rejected' (signed payment rejected by gateway): only + // fall back FOR THIS TURN — do NOT add to paymentFailedModels and + // do NOT record to elo. The retry budget from the transient path + // above (3 attempts) has already been exhausted at this point; + // this fallback just lets the user keep working. The next user + // turn resets to baseModel (see top of outer loop) so a single + // gateway nonce-race blip can't permanently demote the user to + // free models for the whole session — that's the bug audited + // 2026-05-28 from telemetry showing 28/468 PaymentRejected with + // identical prompts succeeding 5s apart. + if (classified.category === 'payment') { turnFailedModels.add(config.model); paymentFailedModels.set(config.model, Date.now()); // Bound the Map so long sessions don't leak. LRU-evict oldest by timestamp. @@ -1742,6 +1750,22 @@ export async function interactiveSession( } } + if (classified.category === 'payment_rejected') { + turnFailedModels.add(config.model); + const nextFree = pickFreeFallback(lastRoutedCategory, turnFailedModels); + if (nextFree) { + const oldModel = config.model; + config.model = nextFree; + config.onModelChange?.(nextFree, 'system'); + const reason = `gateway rejected payment [${classified.label}] — will retry ${oldModel} next turn`; + onEvent({ + kind: 'text_delta', + text: `\n*${formatModelSwitch(oldModel, resolvedModel, reason, nextFree)}*\n`, + }); + continue; // Retry with next model + } + } + // ── Rate-limit / quota: auto-fallback to a different provider ── // Per-day TPM caps (Anthropic) won't clear in this session; per-second // limits already had their backoff retry above and still failed. 
In diff --git a/src/commands/start.ts b/src/commands/start.ts index 6b124c72..0b15dbdd 100644 --- a/src/commands/start.ts +++ b/src/commands/start.ts @@ -576,7 +576,15 @@ async function runWithInkUI( runExitBackgroundTasks(sessionHistory, agentConfig).catch(() => {}); } - disconnectMcpServers().catch(() => {}); + // Await MCP shutdown with a bounded timeout — previously fire-and-forget, + // which left stdio child processes alive and (combined with no explicit + // process.exit() below) was the root cause of the "I quit but the + // process is still running" report (audited 2026-05-28). A misbehaving + // MCP server must not be able to pin shutdown, so cap the wait at 2s. + await Promise.race([ + disconnectMcpServers().catch(() => {}), + new Promise((r) => setTimeout(r, 2000)), + ]); // Session summary — delta vs. snapshot at session start try { @@ -607,6 +615,15 @@ async function runWithInkUI( } console.log(chalk.dim('\nGoodbye.\n')); + + // Explicit exit. Without this, lingering keep-alive sockets (bootstrap + // learnings importer, panel HTTP server, gateway client agents) and any + // FRANKLIN_EXTRACT_ON_EXIT background promise can hold the event loop + // open for seconds-to-minutes after the UI tears down — the user sees + // "Goodbye." but `ps` still shows the process, and a subsequent + // `franklin` invocation races with the zombie. Force a clean exit. Any + // explicit error paths above set process.exitCode = 1 — preserve it. + process.exit(process.exitCode ?? 0); } async function runExitBackgroundTasks( @@ -703,6 +720,14 @@ async function runWithBasicUI( ui.printGoodbye(); flushStats(); + + // Same explicit-exit reasoning as runWithInkUI — bounded MCP shutdown + // then hard exit so background promises can't pin the process alive. + await Promise.race([ + disconnectMcpServers().catch(() => {}), + new Promise((r) => setTimeout(r, 2000)), + ]); + process.exit(process.exitCode ?? 
0); } // ─── Panel auto-start ────────────────────────────────────────────────────── diff --git a/test/local.mjs b/test/local.mjs index 487981ab..b3dc315c 100644 --- a/test/local.mjs +++ b/test/local.mjs @@ -2677,20 +2677,28 @@ test('streamCompletion: 429 response with Retry-After header tags the error mess // Verified 2026-05-04 in a screenshot: ExaSearch failed with // `(402): {"error":"Payment verification failed","details":"Ver…}`. Same // HTTP status as a "payment required" challenge but a different remedy: -// the user's signed payment was rejected, not absent. Same retry won't -// help — must fix clock skew / chain / nonce. - -test('classifier: Payment verification failed → payment_rejected with chain/clock-skew tip', async () => { +// the user's signed payment was rejected, not absent. +// +// Audited 2026-05-28: empirically intermittent — telemetry showed 28/468 +// PaymentRejected with identical prompts succeeding 5s later. Most +// plausible root cause is a nonce-cache race in the gateway's replay +// protection under burst load. Each retry re-signs with a fresh nonce +// (llm.ts derives a new nonce per request), so a retry is NOT a replay. +// Hence transient with a small retry budget. Deterministic failure +// modes (clock skew, wrong chain) exhaust the budget quickly and fall +// through to the same fallback path. + +test('classifier: Payment verification failed → payment_rejected, transient with small retry budget', async () => { const { classifyAgentError } = await import('../dist/agent/error-classifier.js'); // Gateway-shape body, exact match for the live failure. 
const live = classifyAgentError('Exa /v1/exa/search failed (402): {"error":"Payment verification failed","details":"Ver..."}'); assert.equal(live.category, 'payment_rejected'); assert.equal(live.label, 'PaymentRejected'); - assert.equal(live.maxRetries, 0, 'must not auto-retry — same signature stays rejected'); + assert.equal(live.isTransient, true, 'must auto-retry — gateway nonce-race blips need a fresh-nonce retry'); + assert.equal(live.maxRetries, 3, 'small budget — enough to ride out a burst-load blip, not enough to thrash on a real misconfig'); assert.match(live.suggestion ?? '', /clock skew/i, 'suggestion should mention clock skew'); assert.match(live.suggestion ?? '', /chain/i, 'suggestion should mention chain'); - assert.match(live.suggestion ?? '', /\/model free/i, 'suggestion should offer free-model escape'); // Other variant phrasings the gateway might use. for (const msg of ['signature mismatch', 'invalid x-payment header', 'nonce reuse detected']) { From c222e3b9c35cb9bfc4e0f2ea87ff221d33244b74 Mon Sep 17 00:00:00 2001 From: 1bcMax <195689928+1bcMax@users.noreply.github.com> Date: Sat, 30 May 2026 21:29:32 -0400 Subject: [PATCH 2/2] fix(loop): reset retry budget when payment_rejected falls back to free model MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The payment_rejected per-turn fallback switched to a free model without resetting recoveryAttempts. By that point the transient path above has exhausted this turn's maxRetries:3 budget, so the free fallback model inherited recoveryAttempts==3 and got zero retries — a single transient blip on the fallback model failed the whole turn, the exact outcome this PR set out to prevent. Reset the counter on switch, mirroring the rate_limit fallback's 'new model gets its own retry budget' behavior. 
--- src/agent/loop.ts | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/agent/loop.ts b/src/agent/loop.ts index 12e5df0c..e9ff4e31 100644 --- a/src/agent/loop.ts +++ b/src/agent/loop.ts @@ -1758,6 +1758,10 @@ export async function interactiveSession( config.model = nextFree; config.onModelChange?.(nextFree, 'system'); const reason = `gateway rejected payment [${classified.label}] — will retry ${oldModel} next turn`; + // Reset retry counter — the transient path above already burned + // this turn's budget on the rejected model; the free fallback + // model gets its own (mirrors the rate_limit fallback below). + recoveryAttempts = 0; onEvent({ kind: 'text_delta', text: `\n*${formatModelSwitch(oldModel, resolvedModel, reason, nextFree)}*\n`,