Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "@blockrun/franklin",
- "version": "3.23.0",
+ "version": "3.23.1",
"description": "Franklin Agent — The AI agent with a wallet. Spends USDC autonomously to get real work done. Pay per action, no subscriptions.",
"type": "module",
"exports": {
Expand Down
15 changes: 13 additions & 2 deletions src/agent/error-classifier.ts
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,17 @@ export function classifyAgentError(message: string): AgentErrorInfo {
// `Exa /v1/exa/search failed (402): {"error":"Payment verification failed",...}`.
// Classify BEFORE the generic 'payment' branch below since the body
// contains both 'payment' and 'verification failed'.
+ //
+ // Treated as transient with a small retry budget: real-world telemetry
+ // (2026-05-28 audit) shows the gateway intermittently rejects valid
+ // signed payments under burst load — identical prompts succeed 5s
+ // later. Most plausible root cause is a nonce-cache race in the
+ // gateway's replay protection. Retrying re-signs with a fresh nonce on
+ // each attempt (llm.ts derives a new nonce per request), so a retry
+ // is NOT a replay. Three attempts is enough to ride out the blip
+ // without burning tokens on a model whose wallet is genuinely
+ // misconfigured (clock skew, wrong chain) — those failure modes are
+ // deterministic and will exhaust the budget quickly.
if (includesAny(err, [
'verification failed',
'payment verification',
Expand All @@ -75,8 +86,8 @@ export function classifyAgentError(message: string): AgentErrorInfo {
'replay protection',
])) {
return {
- category: 'payment_rejected', label: 'PaymentRejected', isTransient: false, maxRetries: 0,
- suggestion: 'The gateway rejected your signed payment. Run `franklin balance` to confirm funds + chain. Common causes: clock skew (resync system clock), wrong chain selected (use `/chain` to switch), or stale nonce (the same retry will fail). Switch to a free model with `/model free` to keep working.',
+ category: 'payment_rejected', label: 'PaymentRejected', isTransient: true, maxRetries: 3,
+ suggestion: 'The gateway rejected your signed payment. If this keeps happening: run `franklin balance` to confirm funds + chain. Common causes: clock skew (resync system clock), wrong chain selected (use `/chain` to switch). Transient blips are auto-retried.',
};
}

Expand Down
42 changes: 35 additions & 7 deletions src/agent/loop.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1710,13 +1710,21 @@ export async function interactiveSession(
}

// ── Payment failure: auto-fallback to free models ──
- // Track payment-failed models for the entire session — unlike transient errors,
- // 402s will keep failing until the user adds funds. Also handles
- // payment_rejected (signature verified-and-rejected by gateway):
- // same fallback path, but the suggestion text in classifier guides
- // the user toward clock-skew / chain-mismatch fixes rather than
- // "add funds."
- if (classified.category === 'payment' || classified.category === 'payment_rejected') {
+ // 'payment' (insufficient funds / 402): session-permanent blacklist —
+ // the wallet won't refill mid-session, so retrying the same model
+ // just wastes a turn. Record to elo so the router learns to avoid it.
+ //
+ // 'payment_rejected' (signed payment rejected by gateway): only
+ // fall back FOR THIS TURN — do NOT add to paymentFailedModels and
+ // do NOT record to elo. The retry budget from the transient path
+ // above (3 attempts) has already been exhausted at this point;
+ // this fallback just lets the user keep working. The next user
+ // turn resets to baseModel (see top of outer loop) so a single
+ // gateway nonce-race blip can't permanently demote the user to
+ // free models for the whole session — that's the bug audited
+ // 2026-05-28 from telemetry showing 28/468 PaymentRejected with
+ // identical prompts succeeding 5s apart.
+ if (classified.category === 'payment') {
turnFailedModels.add(config.model);
paymentFailedModels.set(config.model, Date.now());
// Bound the Map so long sessions don't leak. LRU-evict oldest by timestamp.
Expand All @@ -1742,6 +1750,26 @@ export async function interactiveSession(
}
}

+ if (classified.category === 'payment_rejected') {
+ turnFailedModels.add(config.model);
+ const nextFree = pickFreeFallback(lastRoutedCategory, turnFailedModels);
+ if (nextFree) {
+ const oldModel = config.model;
+ config.model = nextFree;
+ config.onModelChange?.(nextFree, 'system');
+ const reason = `gateway rejected payment [${classified.label}] — will retry ${oldModel} next turn`;
+ // Reset retry counter — the transient path above already burned
+ // this turn's budget on the rejected model; the free fallback
+ // model gets its own (mirrors the rate_limit fallback below).
+ recoveryAttempts = 0;
+ onEvent({
+ kind: 'text_delta',
+ text: `\n*${formatModelSwitch(oldModel, resolvedModel, reason, nextFree)}*\n`,
+ });
+ continue; // Retry with next model
+ }
+ }

// ── Rate-limit / quota: auto-fallback to a different provider ──
// Per-day TPM caps (Anthropic) won't clear in this session; per-second
// limits already had their backoff retry above and still failed. In
Expand Down
27 changes: 26 additions & 1 deletion src/commands/start.ts
Original file line number Diff line number Diff line change
Expand Up @@ -576,7 +576,15 @@ async function runWithInkUI(
runExitBackgroundTasks(sessionHistory, agentConfig).catch(() => {});
}

- disconnectMcpServers().catch(() => {});
+ // Await MCP shutdown with a bounded timeout — previously fire-and-forget,
+ // which left stdio child processes alive and (combined with no explicit
+ // process.exit() below) was the root cause of the "I quit but the
+ // process is still running" report (audited 2026-05-28). A misbehaving
+ // MCP server must not be able to pin shutdown, so cap the wait at 2s.
+ await Promise.race([
+ disconnectMcpServers().catch(() => {}),
+ new Promise<void>((r) => setTimeout(r, 2000)),
+ ]);

// Session summary — delta vs. snapshot at session start
try {
Expand Down Expand Up @@ -607,6 +615,15 @@ async function runWithInkUI(
}

console.log(chalk.dim('\nGoodbye.\n'));

+ // Explicit exit. Without this, lingering keep-alive sockets (bootstrap
+ // learnings importer, panel HTTP server, gateway client agents) and any
+ // FRANKLIN_EXTRACT_ON_EXIT background promise can hold the event loop
+ // open for seconds-to-minutes after the UI tears down — the user sees
+ // "Goodbye." but `ps` still shows the process, and a subsequent
+ // `franklin` invocation races with the zombie. Force a clean exit. Any
+ // explicit error paths above set process.exitCode = 1 — preserve it.
+ process.exit(process.exitCode ?? 0);
}

async function runExitBackgroundTasks(
Expand Down Expand Up @@ -703,6 +720,14 @@ async function runWithBasicUI(

ui.printGoodbye();
flushStats();

+ // Same explicit-exit reasoning as runWithInkUI — bounded MCP shutdown
+ // then hard exit so background promises can't pin the process alive.
+ await Promise.race([
+ disconnectMcpServers().catch(() => {}),
+ new Promise<void>((r) => setTimeout(r, 2000)),
+ ]);
+ process.exit(process.exitCode ?? 0);
}

// ─── Panel auto-start ──────────────────────────────────────────────────────
Expand Down
20 changes: 14 additions & 6 deletions test/local.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -2677,20 +2677,28 @@ test('streamCompletion: 429 response with Retry-After header tags the error mess
// Verified 2026-05-04 in a screenshot: ExaSearch failed with
// `(402): {"error":"Payment verification failed","details":"Ver…}`. Same
// HTTP status as a "payment required" challenge but a different remedy:
- // the user's signed payment was rejected, not absent. Same retry won't
- // help — must fix clock skew / chain / nonce.
-
- test('classifier: Payment verification failed → payment_rejected with chain/clock-skew tip', async () => {
+ // the user's signed payment was rejected, not absent.
+ //
+ // Audited 2026-05-28: empirically intermittent — telemetry showed 28/468
+ // PaymentRejected with identical prompts succeeding 5s later. Most
+ // plausible root cause is a nonce-cache race in the gateway's replay
+ // protection under burst load. Each retry re-signs with a fresh nonce
+ // (llm.ts derives a new nonce per request), so a retry is NOT a replay.
+ // Hence transient with a small retry budget. Deterministic failure
+ // modes (clock skew, wrong chain) exhaust the budget quickly and fall
+ // through to the same fallback path.
+
+ test('classifier: Payment verification failed → payment_rejected, transient with small retry budget', async () => {
const { classifyAgentError } = await import('../dist/agent/error-classifier.js');

// Gateway-shape body, exact match for the live failure.
const live = classifyAgentError('Exa /v1/exa/search failed (402): {"error":"Payment verification failed","details":"Ver..."}');
assert.equal(live.category, 'payment_rejected');
assert.equal(live.label, 'PaymentRejected');
- assert.equal(live.maxRetries, 0, 'must not auto-retry — same signature stays rejected');
+ assert.equal(live.isTransient, true, 'must auto-retry — gateway nonce-race blips need a fresh-nonce retry');
+ assert.equal(live.maxRetries, 3, 'small budget — enough to ride out a burst-load blip, not enough to thrash on a real misconfig');
assert.match(live.suggestion ?? '', /clock skew/i, 'suggestion should mention clock skew');
assert.match(live.suggestion ?? '', /chain/i, 'suggestion should mention chain');
- assert.match(live.suggestion ?? '', /\/model free/i, 'suggestion should offer free-model escape');

// Other variant phrasings the gateway might use.
for (const msg of ['signature mismatch', 'invalid x-payment header', 'nonce reuse detected']) {
Expand Down