diff --git a/CHANGELOG.md b/CHANGELOG.md index 9eede0a..5606547 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,18 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), ## [Unreleased] +### Added + +- **Quality signals: outcome inference + one-shot rate.** Two orthogonal per-session signals for the "was this work good enough that a cheaper model could have done it" question. Closes [#6](https://github.com/AgentWorkforce/burn/issues/6). [cli, analyze] + - `@relayburn/analyze` — new `computeQuality(turns, opts)` returning `SessionOutcome[]` (classifies sessions as `completed` / `abandoned` / `errored` / `unknown` with explicit confidence and a reason code) and `OneShotMetrics[]` (edit turns / one-shot edit turns / retry volume, excluding sidechain subagent turns). Give-up phrase detection on the last assistant text downgrades confidence when the content sidecar is available, but is never required. + - `burn summary --quality` — appends a quality rollup (outcome counts + weighted one-shot rate) to summary output. Content sidecar reads run with a concurrency cap of 8 so large ledgers don't serialize I/O. + - Both signals are computed lazily at query time (never persisted) so future rule changes don't require a rebuild. + - Sources that don't record `stopReason` (e.g. Codex) are classified `completed/low` with reason `unknown-ending` rather than being swept into `abandoned`. 
+ +### PRs in this release + +- [#53](https://github.com/AgentWorkforce/burn/pull/53) — Add quality signals (outcome inference + one-shot rate) + ## 2026-04-23 — `burn waste`: per-tool-call cost attribution **Versions:** `@relayburn/reader@0.4.0`, `@relayburn/ledger@0.4.0`, `@relayburn/analyze@0.4.0`, `@relayburn/cli@0.4.0` diff --git a/packages/analyze/CHANGELOG.md b/packages/analyze/CHANGELOG.md index 28b5974..b7309d9 100644 --- a/packages/analyze/CHANGELOG.md +++ b/packages/analyze/CHANGELOG.md @@ -7,6 +7,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- **Quality signals module.** `computeQuality(turns, opts)` returns two orthogonal per-session signals — `SessionOutcome` (outcome inference) and `OneShotMetrics` (one-shot rate) — for answering "was this work good enough that a cheaper model could have done it." Closes [#6](https://github.com/AgentWorkforce/burn/issues/6). Also exported individually as `inferOutcome` and `computeOneShotRate`. + - **Outcome inference** classifies each session as `completed` / `abandoned` / `errored` / `unknown` with explicit `high` / `medium` / `low` confidence and a reason code (`single-exchange`, `too-short`, `recent`, `user-ended`, `user-ended-long`, `failure-streak`, `give-up`, `assistant-ended`, `unknown-ending`, `empty`). Works from turn metadata alone; an optional `contentBySession` map downgrades `assistant-ended` to `give-up/low` when the last assistant text matches known give-up phrases (e.g. `"i'm unable to"`, `"i cannot access"`, `"doesn't appear to exist"`). + - **One-shot rate** is `oneShotTurns / editTurns` per session, where a one-shot turn is an edit turn with zero retries. Sidechain (subagent) turns are excluded from the denominator so their retry counts don't poison the parent session's rate. Also returns `totalRetries` as a raw volume signal. 
+ - Computed lazily at query time, never persisted to the ledger — upgrading the rules later does not require a rebuild. Requires no prompt storage; the give-up downgrade runs opportunistically when content is available. + - Handles sources that don't record `stopReason` (e.g. Codex): the final-turn ending role is reported as `'unknown'` and the session is classified `completed/low` with reason `unknown-ending` rather than being swept into `abandoned`. + ## [0.4.0] - 2026-04-23 ### Added diff --git a/packages/analyze/src/index.ts b/packages/analyze/src/index.ts index b83b497..0127110 100644 --- a/packages/analyze/src/index.ts +++ b/packages/analyze/src/index.ts @@ -29,6 +29,15 @@ export type { RetryLoop, SessionPatternSummary, } from './patterns.js'; +export { computeQuality, computeOneShotRate, inferOutcome } from './quality.js'; +export type { + ComputeQualityOptions, + OneShotMetrics, + OutcomeConfidence, + OutcomeLabel, + QualityResult, + SessionOutcome, +} from './quality.js'; export { attributeClaudeMd, buildAdviseRecommendations, diff --git a/packages/analyze/src/quality.test.ts b/packages/analyze/src/quality.test.ts new file mode 100644 index 0000000..96e61fc --- /dev/null +++ b/packages/analyze/src/quality.test.ts @@ -0,0 +1,259 @@ +import { strict as assert } from 'node:assert'; +import { describe, it } from 'node:test'; + +import type { ContentRecord, ToolCall, TurnRecord } from '@relayburn/reader'; + +import { computeOneShotRate, computeQuality, inferOutcome } from './quality.js'; + +function tc(id: string, name: string, opts: Partial = {}): ToolCall { + return { id, name, argsHash: `${name}:${id}`, ...opts }; +} + +function turn(overrides: Partial & { messageId: string; turnIndex: number }): TurnRecord { + return { + v: 1, + source: 'claude-code', + sessionId: 's', + model: 'claude-sonnet-4-6', + ts: '2026-04-20T00:00:00.000Z', + usage: { input: 10, output: 5, reasoning: 0, cacheRead: 0, cacheCreate5m: 0, cacheCreate1h: 0 }, + toolCalls: [], + retries: 
0, + hasEdits: false, + ...overrides, + }; +} + +const FIXED_NOW = Date.parse('2026-04-21T00:00:00.000Z'); + +describe('inferOutcome', () => { + it('returns empty/low for a session with zero turns', () => { + const o = inferOutcome('s', [], undefined, FIXED_NOW); + assert.equal(o.outcome, 'unknown'); + assert.equal(o.confidence, 'low'); + assert.equal(o.reason, 'empty'); + }); + + it('marks a single-exchange assistant-ended session as completed/medium', () => { + const turns = [ + turn({ messageId: 'm1', turnIndex: 0, stopReason: 'tool_use' }), + turn({ messageId: 'm2', turnIndex: 1, stopReason: 'end_turn' }), + ]; + const o = inferOutcome('s', turns, undefined, FIXED_NOW); + assert.equal(o.outcome, 'completed'); + assert.equal(o.confidence, 'medium'); + assert.equal(o.reason, 'single-exchange'); + }); + + it('marks a one-turn assistant-ended session (user asks → assistant answers) as completed/medium', () => { + const turns = [turn({ messageId: 'm1', turnIndex: 0, stopReason: 'end_turn' })]; + const o = inferOutcome('s', turns, undefined, FIXED_NOW); + assert.equal(o.outcome, 'completed'); + assert.equal(o.confidence, 'medium'); + assert.equal(o.reason, 'single-exchange'); + }); + + it('marks a very-short session as unknown/low', () => { + const turns = [turn({ messageId: 'm1', turnIndex: 0, stopReason: 'tool_use' })]; + const o = inferOutcome('s', turns, undefined, FIXED_NOW); + assert.equal(o.outcome, 'unknown'); + assert.equal(o.reason, 'too-short'); + }); + + it('marks a still-active (recent) session as unknown/low with isRecent=true', () => { + const now = Date.parse('2026-04-20T00:05:00.000Z'); + const turns = [ + turn({ messageId: 'm1', turnIndex: 0, ts: '2026-04-20T00:00:00.000Z', stopReason: 'end_turn' }), + turn({ messageId: 'm2', turnIndex: 1, ts: '2026-04-20T00:01:00.000Z', stopReason: 'end_turn' }), + turn({ messageId: 'm3', turnIndex: 2, ts: '2026-04-20T00:02:00.000Z', stopReason: 'end_turn' }), + ]; + const o = inferOutcome('s', turns, undefined, 
now); + assert.equal(o.outcome, 'unknown'); + assert.equal(o.isRecent, true); + assert.equal(o.reason, 'recent'); + }); + + it('marks a user-ended long session as abandoned/high', () => { + const turns = Array.from({ length: 10 }, (_, i) => + turn({ messageId: `m${i}`, turnIndex: i, stopReason: i === 9 ? 'tool_use' : 'end_turn' }), + ); + const o = inferOutcome('s', turns, undefined, FIXED_NOW); + assert.equal(o.outcome, 'abandoned'); + assert.equal(o.confidence, 'high'); + assert.equal(o.reason, 'user-ended-long'); + }); + + it('marks a user-ended short-medium session as abandoned/medium', () => { + const turns = [ + turn({ messageId: 'm1', turnIndex: 0, stopReason: 'end_turn' }), + turn({ messageId: 'm2', turnIndex: 1, stopReason: 'end_turn' }), + turn({ messageId: 'm3', turnIndex: 2, stopReason: 'tool_use' }), + ]; + const o = inferOutcome('s', turns, undefined, FIXED_NOW); + assert.equal(o.outcome, 'abandoned'); + assert.equal(o.confidence, 'medium'); + assert.equal(o.reason, 'user-ended'); + }); + + it('marks a trailing-failure-streak session as errored/medium', () => { + const turns = [ + turn({ messageId: 'm1', turnIndex: 0, stopReason: 'end_turn' }), + turn({ + messageId: 'm2', + turnIndex: 1, + stopReason: 'end_turn', + toolCalls: [tc('u1', 'Bash', { isError: true })], + }), + turn({ + messageId: 'm3', + turnIndex: 2, + stopReason: 'end_turn', + toolCalls: [tc('u2', 'Bash', { isError: true })], + }), + turn({ + messageId: 'm4', + turnIndex: 3, + stopReason: 'end_turn', + toolCalls: [tc('u3', 'Bash', { isError: true })], + }), + ]; + const o = inferOutcome('s', turns, undefined, FIXED_NOW); + assert.equal(o.outcome, 'errored'); + assert.equal(o.reason, 'failure-streak'); + }); + + it('marks an assistant-ended session as completed/medium by default', () => { + const turns = [ + turn({ messageId: 'm1', turnIndex: 0, stopReason: 'end_turn' }), + turn({ messageId: 'm2', turnIndex: 1, stopReason: 'end_turn' }), + turn({ messageId: 'm3', turnIndex: 2, stopReason: 
'end_turn' }), + ]; + const o = inferOutcome('s', turns, undefined, FIXED_NOW); + assert.equal(o.outcome, 'completed'); + assert.equal(o.confidence, 'medium'); + assert.equal(o.reason, 'assistant-ended'); + }); + + it('classifies sessions with no stopReason (e.g. Codex) as completed/low/unknown-ending', () => { + // Codex parser never sets stopReason; without the fallback, the default + // classifier would mark these as abandoned/medium (false negative). + const turns = [ + turn({ messageId: 'm1', turnIndex: 0, source: 'codex' }), + turn({ messageId: 'm2', turnIndex: 1, source: 'codex' }), + turn({ messageId: 'm3', turnIndex: 2, source: 'codex' }), + ]; + const o = inferOutcome('s', turns, undefined, FIXED_NOW); + assert.equal(o.outcome, 'completed'); + assert.equal(o.confidence, 'low'); + assert.equal(o.reason, 'unknown-ending'); + }); + + it('still detects trailing failure streak for sources without stopReason', () => { + const turns = [ + turn({ messageId: 'm1', turnIndex: 0, source: 'codex' }), + turn({ + messageId: 'm2', + turnIndex: 1, + source: 'codex', + toolCalls: [tc('u1', 'Bash', { isError: true })], + }), + turn({ + messageId: 'm3', + turnIndex: 2, + source: 'codex', + toolCalls: [tc('u2', 'Bash', { isError: true })], + }), + turn({ + messageId: 'm4', + turnIndex: 3, + source: 'codex', + toolCalls: [tc('u3', 'Bash', { isError: true })], + }), + ]; + const o = inferOutcome('s', turns, undefined, FIXED_NOW); + assert.equal(o.outcome, 'errored'); + assert.equal(o.reason, 'failure-streak'); + }); + + it('downgrades assistant-ended to completed/low when last assistant text has a give-up phrase', () => { + const turns = [ + turn({ messageId: 'm1', turnIndex: 0, stopReason: 'end_turn' }), + turn({ messageId: 'm2', turnIndex: 1, stopReason: 'end_turn' }), + turn({ messageId: 'm3', turnIndex: 2, stopReason: 'end_turn' }), + ]; + const content: ContentRecord[] = [ + { + v: 1, + source: 'claude-code', + sessionId: 's', + messageId: 'm3', + ts: 
'2026-04-20T00:00:00.000Z', + role: 'assistant', + kind: 'text', + text: "I'm unable to access the file, so I will stop here.", + }, + ]; + const map = new Map([['s', content]]); + const o = inferOutcome('s', turns, map, FIXED_NOW); + assert.equal(o.outcome, 'completed'); + assert.equal(o.confidence, 'low'); + assert.equal(o.reason, 'give-up'); + }); +}); + +describe('computeOneShotRate', () => { + it('counts edit turns with zero retries as one-shot', () => { + const turns = [ + turn({ messageId: 'm1', turnIndex: 0, hasEdits: true, retries: 0 }), + turn({ messageId: 'm2', turnIndex: 1, hasEdits: true, retries: 2 }), + turn({ messageId: 'm3', turnIndex: 2, hasEdits: true, retries: 0 }), + turn({ messageId: 'm4', turnIndex: 3, hasEdits: false, retries: 5 }), // non-edit, ignored + ]; + const m = computeOneShotRate('s', turns); + assert.equal(m.editTurns, 3); + assert.equal(m.oneShotTurns, 2); + assert.equal(m.oneShotRate, 2 / 3); + assert.equal(m.totalRetries, 2); + }); + + it('returns undefined rate when there are no edit turns', () => { + const turns = [turn({ messageId: 'm1', turnIndex: 0, hasEdits: false })]; + const m = computeOneShotRate('s', turns); + assert.equal(m.editTurns, 0); + assert.equal(m.oneShotRate, undefined); + }); + + it('excludes sidechain (subagent) turns from the denominator', () => { + const turns = [ + turn({ messageId: 'm1', turnIndex: 0, hasEdits: true, retries: 0 }), + turn({ + messageId: 'm2', + turnIndex: 1, + hasEdits: true, + retries: 5, + subagent: { isSidechain: true }, + }), + ]; + const m = computeOneShotRate('s', turns); + assert.equal(m.editTurns, 1); + assert.equal(m.oneShotRate, 1); + }); +}); + +describe('computeQuality — pairing', () => { + it('emits outcome + one-shot for each session in the input', () => { + const turns = [ + turn({ messageId: 'a1', turnIndex: 0, sessionId: 'A', hasEdits: true, stopReason: 'end_turn' }), + turn({ messageId: 'a2', turnIndex: 1, sessionId: 'A', hasEdits: true, stopReason: 'end_turn' }), + 
import type { ContentRecord, TurnRecord } from '@relayburn/reader';

// Quality signals for the "was this work good enough that a cheaper model
// could have done it" question. Two orthogonal detectors, per the decision
// in issue #6: outcome inference (agentsview) + one-shot rate (codeburn).
//
// Design choices that stuck from the issue discussion:
//   - No prompt storage required — both signals work from session metadata and
//     tool-call patterns alone. Content (last assistant text) is used *only*
//     to downgrade confidence; never required.
//   - Computed lazily at query time, not persisted in the ledger. Upgrading
//     the rules later doesn't require a rebuild.
//   - Confidence is explicit on every classification so downstream consumers
//     can filter out low-confidence signals rather than treat them as noise.

/** Coarse per-session verdict produced by outcome inference. */
export type OutcomeLabel = 'completed' | 'abandoned' | 'errored' | 'unknown';

/** How much weight a consumer should put on an `OutcomeLabel`. */
export type OutcomeConfidence = 'high' | 'medium' | 'low';

/** Outcome classification for a single session. */
export interface SessionOutcome {
  sessionId: string;
  outcome: OutcomeLabel;
  confidence: OutcomeConfidence;
  // Whether the session's last turn is recent enough that the session may
  // still be in progress.
  isRecent: boolean;
  // Why this classification fired. Short identifier so callers can filter/
  // aggregate by reason without re-parsing strings.
  // NOTE(review): 'automated' is part of the union but no rule in this module
  // appears to emit it — confirm whether it is reserved for a future rule.
  reason:
    | 'automated'
    | 'single-exchange'
    | 'too-short'
    | 'recent'
    | 'user-ended'
    | 'user-ended-long'
    | 'failure-streak'
    | 'give-up'
    | 'assistant-ended'
    | 'unknown-ending'
    | 'empty';
}

/** One-shot edit statistics for a single session. */
export interface OneShotMetrics {
  sessionId: string;
  // Number of counted edit turns (sidechain/subagent turns are excluded).
  editTurns: number;
  // Counted edit turns that needed zero retries.
  oneShotTurns: number;
  // `oneShotTurns / editTurns` when editTurns > 0, else undefined. Callers
  // decide what to display for zero-edit sessions (NaN vs "—").
  oneShotRate: number | undefined;
  // Retries summed over the counted edit turns only — retries on non-edit
  // turns and on sidechain turns are NOT included, mirroring `editTurns`.
  // Useful alongside the rate as a raw volume signal — a rate of 0.5 with
  // 2 edits reads very different from a rate of 0.5 with 40 edits.
  totalRetries: number;
}

/** Both quality signals, one entry per session present in the input. */
export interface QualityResult {
  outcomes: SessionOutcome[];
  oneShot: OneShotMetrics[];
}

/** Options accepted by `computeQuality`. */
export interface ComputeQualityOptions {
  // Optional: content sidecar records keyed by session id. When provided,
  // give-up phrase matching on the last assistant text downgrades
  // assistant-ended sessions from 'completed/medium' to 'completed/low'.
  // Without content, the give-up downgrade is skipped — the classifier
  // still runs.
  contentBySession?: Map<string, ContentRecord[]>;
  // Clock override (epoch milliseconds) for tests. Defaults to `Date.now()`.
  now?: number;
}
// Phrases observed in agentsview's give-up heuristic plus additions from
// real Claude/Codex sessions. Entries are lowercase and use the ASCII
// apostrophe; detectGiveUp lowercases and apostrophe-normalizes the text it
// searches so matching is case- and quote-style-insensitive.
const GIVE_UP_PATTERNS: readonly string[] = [
  "i'm unable to",
  'i am unable to',
  "i can't proceed",
  'i cannot proceed',
  "i don't have access",
  'i cannot access',
  'unable to verify',
  "doesn't appear to exist",
];

// Typographic apostrophes (U+2018, U+2019, U+02BC) that are folded to the
// ASCII apostrophe before phrase matching.
const TYPOGRAPHIC_APOSTROPHES = /[\u2018\u2019\u02BC]/g;

// A session whose last turn is younger than this may still be in progress.
const RECENT_WINDOW_MS = 10 * 60 * 1000;
// Fewer turns than this is too little signal to classify.
const SHORT_CONVERSATION_THRESHOLD = 3;
// User-ended sessions at or above this length are abandoned with high confidence.
const LONG_CONVERSATION_THRESHOLD = 10;
// Trailing errored tool calls needed to classify a session as errored.
const FAILURE_STREAK_THRESHOLD = 3;

/**
 * Computes both quality signals for every session present in `turns`.
 *
 * Turns are grouped by `sessionId` and each group is ordered by `turnIndex`
 * before classification; the caller's array is not reordered (grouping copies
 * the references into fresh per-session arrays).
 *
 * @param turns Turn records, possibly spanning many sessions, in any order.
 * @param opts  Optional content sidecars and clock override.
 * @returns One `SessionOutcome` and one `OneShotMetrics` per distinct session.
 */
export function computeQuality(
  turns: TurnRecord[],
  opts: ComputeQualityOptions = {},
): QualityResult {
  const bySession = new Map<string, TurnRecord[]>();
  for (const t of turns) {
    let list = bySession.get(t.sessionId);
    if (!list) {
      list = [];
      bySession.set(t.sessionId, list);
    }
    list.push(t);
  }

  const outcomes: SessionOutcome[] = [];
  const oneShot: OneShotMetrics[] = [];
  const now = opts.now ?? Date.now();

  for (const [sessionId, sessionTurns] of bySession) {
    sessionTurns.sort((a, b) => a.turnIndex - b.turnIndex);
    outcomes.push(inferOutcome(sessionId, sessionTurns, opts.contentBySession, now));
    oneShot.push(computeOneShotRate(sessionId, sessionTurns));
  }

  return { outcomes, oneShot };
}

/**
 * Classifies a single session. Rules are evaluated in this order and the
 * first match wins:
 *
 *  1. no turns                                  → unknown/low   (`empty`)
 *  2. ≤ 2 turns and assistant-ended             → completed/medium (`single-exchange`)
 *  3. fewer than 3 turns                        → unknown/low   (`too-short`)
 *  4. last turn inside the recency window       → unknown/low   (`recent`)
 *  5. ≥ 3 trailing errored tool calls           → errored/medium (`failure-streak`)
 *  6. user-ended                                → abandoned/medium, or /high at ≥ 10 turns
 *  7. ending role unknown (no stop reason)      → completed/low (`unknown-ending`)
 *  8. otherwise assistant-ended                 → completed/medium, or
 *                                                 completed/low (`give-up`) on a give-up phrase
 *
 * @param sessionId        Session being classified.
 * @param turns            That session's turns, already ordered oldest → newest.
 * @param contentBySession Optional content sidecars; only consulted for rule 8.
 * @param nowMs            Current time in epoch milliseconds.
 */
export function inferOutcome(
  sessionId: string,
  turns: TurnRecord[],
  contentBySession: Map<string, ContentRecord[]> | undefined,
  nowMs: number,
): SessionOutcome {
  const last = turns[turns.length - 1];
  if (last === undefined) {
    return {
      sessionId,
      outcome: 'unknown',
      confidence: 'low',
      isRecent: false,
      reason: 'empty',
    };
  }

  // Recency: classifier should not mark a still-active session as abandoned.
  // An unparseable timestamp yields NaN and is treated as "not recent".
  const lastMs = Date.parse(last.ts);
  const isRecent = Number.isFinite(lastMs) && nowMs - lastMs < RECENT_WINDOW_MS;

  const messageCount = turns.length;
  const endedRole = endingRole(turns);

  // A single assistant turn that reached end_turn is almost always an
  // intentional one-shot exchange (user asked, assistant answered — e.g.
  // "hi → hello", or a single tool-mediated round trip that produces two
  // assistant turns). Treat these as completed at medium confidence rather
  // than falling through to "too-short/unknown". TurnRecord counts assistant
  // turns only, so messageCount <= 2 covers both shapes.
  if (messageCount <= 2 && endedRole === 'assistant') {
    return {
      sessionId,
      outcome: 'completed',
      confidence: 'medium',
      isRecent,
      reason: 'single-exchange',
    };
  }
  if (messageCount < SHORT_CONVERSATION_THRESHOLD) {
    return {
      sessionId,
      outcome: 'unknown',
      confidence: 'low',
      isRecent,
      reason: 'too-short',
    };
  }
  if (isRecent) {
    return {
      sessionId,
      outcome: 'unknown',
      confidence: 'low',
      isRecent: true,
      reason: 'recent',
    };
  }

  // Checked BEFORE the user-ended rule: a non-zero trailing streak requires
  // the final turn to carry tool calls, and on sources that record stop
  // reasons such a turn stops with a non-'end_turn' reason — i.e. it is
  // always "user-ended". Testing user-ended first would make this rule
  // unreachable for those sources and report stuck sessions as abandoned.
  if (trailingFailureStreak(turns) >= FAILURE_STREAK_THRESHOLD) {
    return {
      sessionId,
      outcome: 'errored',
      confidence: 'medium',
      isRecent: false,
      reason: 'failure-streak',
    };
  }

  if (endedRole === 'user') {
    // Long user-ended sessions are overwhelmingly abandoned (user walked
    // away mid-reply); short ones are ambiguous enough to keep at medium.
    const high = messageCount >= LONG_CONVERSATION_THRESHOLD;
    return {
      sessionId,
      outcome: 'abandoned',
      confidence: high ? 'high' : 'medium',
      isRecent: false,
      reason: high ? 'user-ended-long' : 'user-ended',
    };
  }

  if (endedRole === 'unknown') {
    // Source doesn't record stop reason (e.g. Codex) — we can't distinguish
    // a natural stop from a mid-tool-call abandonment. Default to completed
    // at low confidence rather than misclassifying every such session as
    // abandoned.
    return {
      sessionId,
      outcome: 'completed',
      confidence: 'low',
      isRecent: false,
      reason: 'unknown-ending',
    };
  }

  // Assistant-ended successfully — default completed. Give-up phrase in the
  // last assistant text downgrades confidence (but doesn't change the label;
  // we still don't know if the user would have agreed it was done).
  const gaveUp = contentBySession ? detectGiveUp(contentBySession.get(sessionId)) : false;
  return {
    sessionId,
    outcome: 'completed',
    confidence: gaveUp ? 'low' : 'medium',
    isRecent: false,
    reason: gaveUp ? 'give-up' : 'assistant-ended',
  };
}

/**
 * Computes one-shot edit statistics for a single session.
 *
 * Only non-sidechain turns with `hasEdits` are counted. `totalRetries` is the
 * sum of `retries` over those counted turns only; retries on non-edit or
 * sidechain turns do not contribute.
 *
 * @param sessionId Session the metrics are reported for.
 * @param turns     That session's turns (order does not matter here).
 * @returns Counts plus `oneShotRate`, which is `undefined` when no edit turn was counted.
 */
export function computeOneShotRate(
  sessionId: string,
  turns: TurnRecord[],
): OneShotMetrics {
  let editTurns = 0;
  let oneShotTurns = 0;
  let totalRetries = 0;
  for (const t of turns) {
    // Sidechain (subagent) turns are a different cost-attribution universe;
    // their retry counts don't belong in the parent session's rate.
    if (t.subagent?.isSidechain) continue;
    if (!t.hasEdits) continue;
    const retries = t.retries ?? 0;
    editTurns++;
    totalRetries += retries;
    if (retries === 0) oneShotTurns++;
  }
  return {
    sessionId,
    editTurns,
    oneShotTurns,
    oneShotRate: editTurns > 0 ? oneShotTurns / editTurns : undefined,
    totalRetries,
  };
}

// Infers who "had the last word" from the final turn's stop reason.
// Callers must pass a non-empty array.
function endingRole(turns: TurnRecord[]): 'user' | 'assistant' | 'unknown' {
  // TurnRecord represents assistant turns; a ToolUse turn is followed by a
  // user tool_result (which may or may not prompt another assistant turn).
  // We infer "ended-with-assistant" when the final turn reached a natural
  // stop (`end_turn`) — i.e. it wasn't still waiting for a tool_result.
  // A non-'end_turn' stop reason means user-ended (session died after a
  // tool_use, before the assistant had a chance to respond). When the
  // source doesn't record stopReason at all (e.g. Codex), return 'unknown'
  // so the caller can avoid the false-negative "abandoned" classification.
  const last = turns[turns.length - 1]!;
  if (last.stopReason === undefined) return 'unknown';
  return last.stopReason === 'end_turn' ? 'assistant' : 'user';
}

// Counts errored tool calls at the tail of the session, walking turns from
// newest to oldest. The result is a number of tool CALLS, not turns.
function trailingFailureStreak(turns: TurnRecord[]): number {
  // Mirrors the detectPatterns consecutive-failure signal but scoped to the
  // tail of the session: a session can recover from mid-session failures
  // (→ still completed) and only the trailing state matters for outcome.
  let streak = 0;
  for (let i = turns.length - 1; i >= 0; i--) {
    const calls = turns[i]!.toolCalls;
    // A turn without tool calls ends the streak.
    if (calls.length === 0) break;
    // All tool calls in this turn must be errored to count toward the
    // streak. A single success in the trailing turn breaks it — the
    // agent is not strictly stuck.
    const allErrored = calls.every((c) => c.isError === true);
    if (!allErrored) break;
    streak += calls.length;
  }
  return streak;
}

// Returns true when the LAST assistant text record contains a give-up phrase.
// Earlier assistant text is deliberately ignored: only the closing message
// says whether the assistant stopped because it could not proceed.
function detectGiveUp(records: ContentRecord[] | undefined): boolean {
  if (!records || records.length === 0) return false;
  for (let i = records.length - 1; i >= 0; i--) {
    const r = records[i]!;
    if (r.role === 'assistant' && r.kind === 'text' && typeof r.text === 'string') {
      // Fold typographic apostrophes ("I’m") to ASCII so they match the
      // patterns, which are written with a plain "'".
      const haystack = r.text.toLowerCase().replace(TYPOGRAPHIC_APOSTROPHES, "'");
      return GIVE_UP_PATTERNS.some((p) => haystack.includes(p));
    }
  }
  return false;
}
/**
 * Best-effort load of content sidecars for every distinct session in `turns`.
 *
 * Content is only used to refine confidence (give-up phrase detection), so a
 * sidecar that cannot be read must not abort the summary: the session is
 * simply left out of the returned map and a single warning is written to
 * stderr after all reads finish.
 *
 * @param turns Turns whose `sessionId`s determine which sidecars to read.
 * @returns Map of session id → content records; sessions with no records
 *          (or an unreadable sidecar) are absent.
 */
async function loadContentForQuality(
  turns: EnrichedTurn[],
): Promise<Map<string, ContentRecord[]>> {
  const sessionIds = [...new Set(turns.map((t) => t.sessionId))];
  const bySession = new Map<string, ContentRecord[]>();
  // Sequential reads across thousands of sessions (many with no sidecar at
  // all → ENOENT path) dominate runtime on large summaries. Cap concurrency
  // so we don't fan out unboundedly on huge ledgers but still overlap I/O.
  const concurrency = Math.min(8, sessionIds.length);
  let next = 0;
  let failed = 0;
  let firstError: unknown;

  // Each worker pulls the next unclaimed session id until the list is drained.
  // `next++` is evaluated synchronously, so two workers never claim the same id.
  async function worker(): Promise<void> {
    while (next < sessionIds.length) {
      const sessionId = sessionIds[next++]!;
      try {
        const records = await readContent({ sessionId });
        if (records.length > 0) bySession.set(sessionId, records);
      } catch (err: unknown) {
        if (failed === 0) firstError = err;
        failed++;
      }
    }
  }

  await Promise.all(Array.from({ length: concurrency }, () => worker()));

  if (failed > 0) {
    const detail = firstError instanceof Error ? firstError.message : String(firstError);
    process.stderr.write(
      `burn: could not read content for ${failed} session(s); give-up detection skipped for them (${detail})\n`,
    );
  }
  return bySession;
}

/**
 * Renders the quality rollup appended by `burn summary --quality`: session
 * count, outcome counts, and the edit-turn-weighted one-shot rate.
 */
function renderQuality(q: QualityResult): string {
  if (q.outcomes.length === 0) return 'quality: (no sessions)';
  const counts = outcomeCounts(q);
  const oneShotOverall = weightedOneShotRate(q);
  // NOTE(review): the leading indent inside these literals is reproduced as
  // seen in a whitespace-collapsed diff — confirm the intended width.
  const summary = [
    `quality — sessions: ${q.outcomes.length}`,
    ` outcomes: ${counts.completed} completed / ${counts.abandoned} abandoned / ${counts.errored} errored / ${counts.unknown} unknown`,
    oneShotOverall === undefined
      ? ' one-shot rate: n/a (no edit turns)'
      : ` one-shot rate: ${(oneShotOverall * 100).toFixed(1)}% across ${counts.editTurns} edit turns`,
  ];
  return summary.join('\n');
}

// Tallies sessions per outcome label and sums edit turns across sessions.
function outcomeCounts(q: QualityResult): Record<OutcomeLabel, number> & {
  editTurns: number;
} {
  const counts: Record<OutcomeLabel, number> & { editTurns: number } = {
    completed: 0,
    abandoned: 0,
    errored: 0,
    unknown: 0,
    editTurns: 0,
  };
  for (const o of q.outcomes) counts[o.outcome]++;
  for (const m of q.oneShot) counts.editTurns += m.editTurns;
  return counts;
}

// Overall one-shot rate weighted by edit turns (sum of one-shot turns over
// sum of edit turns), so sessions with many edits count proportionally more
// than a plain average of per-session rates would allow. Undefined when no
// session has any edit turn.
function weightedOneShotRate(q: QualityResult): number | undefined {
  let edit = 0;
  let oneShot = 0;
  for (const m of q.oneShot) {
    edit += m.editTurns;
    oneShot += m.oneShotTurns;
  }
  return edit > 0 ? oneShot / edit : undefined;
}