diff --git a/package.json b/package.json index 5294655..d77dd7a 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@blockrun/franklin", - "version": "3.23.0", + "version": "3.23.1", "description": "Franklin Agent — The AI agent with a wallet. Spends USDC autonomously to get real work done. Pay per action, no subscriptions.", "type": "module", "exports": { diff --git a/src/agent/error-classifier.ts b/src/agent/error-classifier.ts index 1c6ca85..341f839 100644 --- a/src/agent/error-classifier.ts +++ b/src/agent/error-classifier.ts @@ -65,6 +65,17 @@ export function classifyAgentError(message: string): AgentErrorInfo { // `Exa /v1/exa/search failed (402): {"error":"Payment verification failed",...}`. // Classify BEFORE the generic 'payment' branch below since the body // contains both 'payment' and 'verification failed'. + // + // Treated as transient with a small retry budget: real-world telemetry + // (2026-05-28 audit) shows the gateway intermittently rejects valid + // signed payments under burst load — identical prompts succeed 5s + // later. Most plausible root cause is a nonce-cache race in the + // gateway's replay protection. Retrying re-signs with a fresh nonce on + // each attempt (llm.ts derives a new nonce per request), so a retry + // is NOT a replay. Three attempts is enough to ride out the blip + // without burning tokens on a model whose wallet is genuinely + // misconfigured (clock skew, wrong chain) — those failure modes are + // deterministic and will exhaust the budget quickly. if (includesAny(err, [ 'verification failed', 'payment verification', @@ -75,8 +86,8 @@ export function classifyAgentError(message: string): AgentErrorInfo { 'replay protection', ])) { return { - category: 'payment_rejected', label: 'PaymentRejected', isTransient: false, maxRetries: 0, - suggestion: 'The gateway rejected your signed payment. Run `franklin balance` to confirm funds + chain. 
Common causes: clock skew (resync system clock), wrong chain selected (use `/chain` to switch), or stale nonce (the same retry will fail). Switch to a free model with `/model free` to keep working.', + category: 'payment_rejected', label: 'PaymentRejected', isTransient: true, maxRetries: 3, + suggestion: 'The gateway rejected your signed payment. If this keeps happening: run `franklin balance` to confirm funds + chain. Common causes: clock skew (resync system clock), wrong chain selected (use `/chain` to switch). Transient blips are auto-retried.', }; } diff --git a/src/agent/loop.ts b/src/agent/loop.ts index 4ee6a96..e9ff4e3 100644 --- a/src/agent/loop.ts +++ b/src/agent/loop.ts @@ -1710,13 +1710,21 @@ export async function interactiveSession( } // ── Payment failure: auto-fallback to free models ── - // Track payment-failed models for the entire session — unlike transient errors, - // 402s will keep failing until the user adds funds. Also handles - // payment_rejected (signature verified-and-rejected by gateway): - // same fallback path, but the suggestion text in classifier guides - // the user toward clock-skew / chain-mismatch fixes rather than - // "add funds." - if (classified.category === 'payment' || classified.category === 'payment_rejected') { + // 'payment' (insufficient funds / 402): session-permanent blacklist — + // the wallet won't refill mid-session, so retrying the same model + // just wastes a turn. Record to elo so the router learns to avoid it. + // + // 'payment_rejected' (signed payment rejected by gateway): only + // fall back FOR THIS TURN — do NOT add to paymentFailedModels and + // do NOT record to elo. The retry budget from the transient path + // above (3 attempts) has already been exhausted at this point; + // this fallback just lets the user keep working. 
The next user + // turn resets to baseModel (see top of outer loop) so a single + // gateway nonce-race blip can't permanently demote the user to + // free models for the whole session — that's the bug audited + // 2026-05-28 from telemetry showing 28/468 PaymentRejected with + // identical prompts succeeding 5s apart. + if (classified.category === 'payment') { turnFailedModels.add(config.model); paymentFailedModels.set(config.model, Date.now()); // Bound the Map so long sessions don't leak. LRU-evict oldest by timestamp. @@ -1742,6 +1750,26 @@ export async function interactiveSession( } } + if (classified.category === 'payment_rejected') { + turnFailedModels.add(config.model); + const nextFree = pickFreeFallback(lastRoutedCategory, turnFailedModels); + if (nextFree) { + const oldModel = config.model; + config.model = nextFree; + config.onModelChange?.(nextFree, 'system'); + const reason = `gateway rejected payment [${classified.label}] — will retry ${oldModel} next turn`; + // Reset retry counter — the transient path above already burned + // this turn's budget on the rejected model; the free fallback + // model gets its own (mirrors the rate_limit fallback below). + recoveryAttempts = 0; + onEvent({ + kind: 'text_delta', + text: `\n*${formatModelSwitch(oldModel, resolvedModel, reason, nextFree)}*\n`, + }); + continue; // Retry with next model + } + } + // ── Rate-limit / quota: auto-fallback to a different provider ── // Per-day TPM caps (Anthropic) won't clear in this session; per-second // limits already had their backoff retry above and still failed. 
In diff --git a/src/commands/start.ts b/src/commands/start.ts index 6b124c7..0b15dbd 100644 --- a/src/commands/start.ts +++ b/src/commands/start.ts @@ -576,7 +576,15 @@ async function runWithInkUI( runExitBackgroundTasks(sessionHistory, agentConfig).catch(() => {}); } - disconnectMcpServers().catch(() => {}); + // Await MCP shutdown with a bounded timeout — previously fire-and-forget, + // which left stdio child processes alive and (combined with no explicit + // process.exit() below) was the root cause of the "I quit but the + // process is still running" report (audited 2026-05-28). A misbehaving + // MCP server must not be able to pin shutdown, so cap the wait at 2s. + await Promise.race([ + disconnectMcpServers().catch(() => {}), + new Promise((r) => setTimeout(r, 2000)), + ]); // Session summary — delta vs. snapshot at session start try { @@ -607,6 +615,15 @@ async function runWithInkUI( } console.log(chalk.dim('\nGoodbye.\n')); + + // Explicit exit. Without this, lingering keep-alive sockets (bootstrap + // learnings importer, panel HTTP server, gateway client agents) and any + // FRANKLIN_EXTRACT_ON_EXIT background promise can hold the event loop + // open for seconds-to-minutes after the UI tears down — the user sees + // "Goodbye." but `ps` still shows the process, and a subsequent + // `franklin` invocation races with the zombie. Force a clean exit. Any + // explicit error paths above set process.exitCode = 1 — preserve it. + process.exit(process.exitCode ?? 0); } async function runExitBackgroundTasks( @@ -703,6 +720,14 @@ async function runWithBasicUI( ui.printGoodbye(); flushStats(); + + // Same explicit-exit reasoning as runWithInkUI — bounded MCP shutdown + // then hard exit so background promises can't pin the process alive. + await Promise.race([ + disconnectMcpServers().catch(() => {}), + new Promise((r) => setTimeout(r, 2000)), + ]); + process.exit(process.exitCode ?? 
0); } // ─── Panel auto-start ────────────────────────────────────────────────────── diff --git a/test/local.mjs b/test/local.mjs index 487981a..b3dc315 100644 --- a/test/local.mjs +++ b/test/local.mjs @@ -2677,20 +2677,28 @@ test('streamCompletion: 429 response with Retry-After header tags the error mess // Verified 2026-05-04 in a screenshot: ExaSearch failed with // `(402): {"error":"Payment verification failed","details":"Ver…}`. Same // HTTP status as a "payment required" challenge but a different remedy: -// the user's signed payment was rejected, not absent. Same retry won't -// help — must fix clock skew / chain / nonce. - -test('classifier: Payment verification failed → payment_rejected with chain/clock-skew tip', async () => { +// the user's signed payment was rejected, not absent. +// +// Audited 2026-05-28: empirically intermittent — telemetry showed 28/468 +// PaymentRejected with identical prompts succeeding 5s later. Most +// plausible root cause is a nonce-cache race in the gateway's replay +// protection under burst load. Each retry re-signs with a fresh nonce +// (llm.ts derives a new nonce per request), so a retry is NOT a replay. +// Hence transient with a small retry budget. Deterministic failure +// modes (clock skew, wrong chain) exhaust the budget quickly and then +// fall through to the per-turn free-model fallback in loop.ts. + +test('classifier: Payment verification failed → payment_rejected, transient with small retry budget', async () => { const { classifyAgentError } = await import('../dist/agent/error-classifier.js'); // Gateway-shape body, exact match for the live failure. 
const live = classifyAgentError('Exa /v1/exa/search failed (402): {"error":"Payment verification failed","details":"Ver..."}'); assert.equal(live.category, 'payment_rejected'); assert.equal(live.label, 'PaymentRejected'); - assert.equal(live.maxRetries, 0, 'must not auto-retry — same signature stays rejected'); + assert.equal(live.isTransient, true, 'must auto-retry — gateway nonce-race blips need a fresh-nonce retry'); + assert.equal(live.maxRetries, 3, 'small budget — enough to ride out a burst-load blip, not enough to thrash on a real misconfig'); assert.match(live.suggestion ?? '', /clock skew/i, 'suggestion should mention clock skew'); assert.match(live.suggestion ?? '', /chain/i, 'suggestion should mention chain'); - assert.match(live.suggestion ?? '', /\/model free/i, 'suggestion should offer free-model escape'); // Other variant phrasings the gateway might use. for (const msg of ['signature mismatch', 'invalid x-payment header', 'nonce reuse detected']) {