From 5a3d0c6485a66cf4a9a67ac700edd9678b9a158b Mon Sep 17 00:00:00 2001 From: Will Washburn Date: Fri, 24 Apr 2026 21:50:45 -0400 Subject: [PATCH 1/2] Fix reasoning-token pricing semantics, preserve models.dev reasoning tariffs (#32) Two pricing-correctness bugs that distorted reported spend whenever reasoning tokens were involved: 1. Codex `usage.reasoning` was being double-billed at the output rate even though Codex's `output_tokens` already includes reasoning. On the issue's 10-turn sample (660k input / 53k output / 29k reasoning / 5.6M cacheRead) this overstates cost by $0.43 / 11.3% of the slice. 2. `cost.reasoning` from the `models.dev` snapshot was discarded during `flatten()`, so any model with a distinct reasoning tariff (Alibaba Qwen reasoning models, etc.) couldn't be priced correctly. Fix: - Extend `ModelCost` with `reasoning?: number` and `reasoningMode: 'included_in_output' | 'separate' | 'same_as_output'`. - `flatten()` preserves `cost.reasoning` and tags the entry `separate`. Models without a distinct reasoning tariff default to `same_as_output` (preserves existing Claude billing). - `costForUsage` branches on the resolved mode. `costForTurn` infers `included_in_output` for `source: 'codex'` so reasoning is recorded but not billed on top of output. - `usage.reasoning` is still preserved in `TurnRecord` for observability. The bug was in pricing, not data capture. Tests cover all four acceptance criteria from the issue: Codex input=1M/output=500k/reasoning=200k bills 10.0 (not 13.0); the synthetic `separate` model bills 13 for 1M of each bucket; the documented Codex regression scenario; `flatten` preserves reasoning; the builtin snapshot retains at least one separate-tariff model. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 4 + README.md | 10 + packages/analyze/CHANGELOG.md | 12 ++ packages/analyze/src/cost.test.ts | 205 +++++++++++++++++++- packages/analyze/src/cost.ts | 51 ++++- packages/analyze/src/index.ts | 6 +- packages/analyze/src/plan-usage.test.ts | 1 + packages/analyze/src/pricing.ts | 30 ++- packages/mcp/src/end-to-end.test.ts | 8 +- packages/mcp/src/tools/session-cost.test.ts | 1 + 10 files changed, 313 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 94b83da..7d0abaf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) ## [Unreleased] +### Fixed + +- **Reasoning-token pricing semantics** (#32). User-visible cost numbers will change downward for any session with non-zero reasoning tokens — most notably Codex sessions, where reasoning was being billed twice (once inside `output_tokens` and again on top via `usage.reasoning`). On the documented 10-turn Codex sample the reported cost drops from $4.282607 to $3.846557 (~11.3%). Models with a distinct reasoning tariff in `models.dev` (e.g. Alibaba Qwen reasoning models) are now priced correctly instead of falling through at the output rate. The reader-level `usage.reasoning` field is unchanged — the bug was in pricing, not data capture. See `packages/analyze/CHANGELOG.md` for the full breakdown. + ### Added - **`@relayburn/mcp` package + `burn mcp-server`** (#26). Closes the loop between observation and decision: a running agent can self-query its own cost and quota state mid-session via MCP and adjust behavior (downgrade model, defer expensive subagent, abort) before hitting the 5-hour wall. None of the surveyed competitors do this — ccusage's MCP is for user-query, not agent-self-query. 
diff --git a/README.md b/README.md index f201b0b..7fc445f 100644 --- a/README.md +++ b/README.md @@ -211,6 +211,16 @@ Together these make `oneShotRate = oneShotTurns / editTurns` computable directly Override ledger location with `RELAYBURN_HOME=/path/to/dir`. +### Reasoning-token pricing semantics + +`usage.reasoning` on a `TurnRecord` is always preserved for observability, but how it's billed depends on the source and model: + +- **Codex (`source: 'codex'`)** — `output_tokens` already includes reasoning. `burn` does **not** double-bill reasoning on top of output. `usage.reasoning` is informational only. (Matches `ccusage`'s Codex semantics.) +- **Models with a distinct `cost.reasoning` tariff in `models.dev`** — billed at that tariff (e.g. Alibaba Qwen reasoning models). The flattened `ModelCost` carries `reasoning` and `reasoningMode: 'separate'`. +- **Everything else (Anthropic Claude, default)** — billed at the model's `output` rate. `reasoningMode: 'same_as_output'`. + +You can override per-call via `costForUsage(usage, model, pricing, { reasoningMode })`. + ## CLI ``` diff --git a/packages/analyze/CHANGELOG.md b/packages/analyze/CHANGELOG.md index e73cc0d..5b148a3 100644 --- a/packages/analyze/CHANGELOG.md +++ b/packages/analyze/CHANGELOG.md @@ -7,6 +7,18 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Fixed + +- **Reasoning-token pricing semantics** (#32). Two correctness bugs that distorted reported spend whenever reasoning tokens were involved: + - Codex `usage.reasoning` was double-billed at the output rate even though Codex's `output_tokens` already includes reasoning. `burn` now treats Codex turns as `included_in_output` and bills `output` only. On a 10-turn Codex sample (660k input / 53k output / 29k reasoning / 5.6M cacheRead), this drops the reported cost from $4.282607 to $3.846557 — about 11% off the Codex slice. 
+ - `cost.reasoning` from the `models.dev` snapshot was discarded during `flatten()`, so any model with a distinct reasoning tariff (e.g. Alibaba Qwen reasoning models) couldn't be priced correctly. The flattener now preserves `reasoning` and tags the entry `reasoningMode: 'separate'`; `costForUsage` honors the distinct tariff. + +### Added + +- `ModelCost.reasoningMode: 'included_in_output' | 'separate' | 'same_as_output'` and optional `reasoning` per-million tariff. `ReasoningMode` and `CostForUsageOptions` are exported. +- `costForUsage(usage, model, pricing, { reasoningMode })` accepts an explicit override. `costForTurn` infers `included_in_output` for `source: 'codex'` automatically. +- `flatten` is now exported so callers can build `PricingTable`s from in-memory `models.dev` payloads. + ## [0.11.0] - 2026-04-25 ### Added diff --git a/packages/analyze/src/cost.test.ts b/packages/analyze/src/cost.test.ts index 48c0ab1..cf6978d 100644 --- a/packages/analyze/src/cost.test.ts +++ b/packages/analyze/src/cost.test.ts @@ -1,15 +1,20 @@ import { strict as assert } from 'node:assert'; import { describe, it } from 'node:test'; -import type { TurnRecord } from '@relayburn/reader'; +import type { SourceKind, TurnRecord } from '@relayburn/reader'; import { costForTurn, costForUsage } from './cost.js'; -import { loadBuiltinPricing } from './pricing.js'; +import { flatten, loadBuiltinPricing } from './pricing.js'; +import type { PricingTable } from './pricing.js'; -function turn(model: string, u: Partial = {}): TurnRecord { +function turn( + model: string, + u: Partial = {}, + source: SourceKind = 'claude-code', +): TurnRecord { return { v: 1, - source: 'claude-code', + source, sessionId: 's', messageId: 'm', turnIndex: 0, @@ -67,7 +72,7 @@ describe('cost', () => { assert.equal(c.cacheCreate, p['claude-opus-4-7']!.cacheWrite); }); - it('bills reasoning tokens at the output rate and reports them separately', async () => { + it('bills reasoning at the output rate for Claude 
(same_as_output mode)', async () => { const p = await loadBuiltinPricing(); const c = costForTurn( turn('claude-sonnet-4-6', { output: 1_000_000, reasoning: 1_000_000 }), p, ); assert.ok(c); const rate = p['claude-sonnet-4-6']!; + assert.equal(rate.reasoningMode, 'same_as_output'); assert.equal(c.output, rate.output); assert.equal(c.reasoning, rate.output); assert.equal(c.total, rate.output * 2); }); + it('does NOT double-bill reasoning for Codex turns (included_in_output)', async () => { + // Acceptance criterion from issue #32: a Codex turn with + // input = 1_000_000, output = 500_000, reasoning = 200_000 + // and a model priced input=2.5/output=15 should bill 10.0, not 13.0. + const p: PricingTable = { + 'gpt-5-codex': { + input: 2.5, + output: 15, + cacheRead: 0, + cacheWrite: 2.5, + reasoningMode: 'same_as_output', + }, + }; + const c = costForTurn( + turn( + 'gpt-5-codex', + { input: 1_000_000, output: 500_000, reasoning: 200_000 }, + 'codex', + ), + p, + ); + assert.ok(c); + assert.equal(c.input, 2.5); + assert.equal(c.output, 7.5); + assert.equal(c.reasoning, 0, 'reasoning is informational for Codex, not billed'); + assert.equal(c.total, 10.0); + }); + + it('Codex regression: 11.3% overstatement scenario from the issue', async () => { + // 10 Codex turns aggregated: input 660_698, output 52_676, reasoning 29_070, + // cacheRead 5_618_688. The issue documents $4.282607 (current/wrong) vs + // $3.846557 (corrected) at gpt-5-codex pricing (input=1.25, output=10, + // cacheRead=0.125). We assert the corrected number to within 1e-9. 
+ const p: PricingTable = { + 'gpt-5-codex': { + input: 1.25, + output: 10, + cacheRead: 0.125, + cacheWrite: 1.25, + reasoningMode: 'same_as_output', + }, + }; + const c = costForTurn( + turn( + 'gpt-5-codex', + { + input: 660_698, + output: 52_676, + reasoning: 29_070, + cacheRead: 5_618_688, + }, + 'codex', + ), + p, + ); + assert.ok(c); + // input + output + cacheRead, reasoning is zero for codex + const expected = + (660_698 / 1_000_000) * 1.25 + + (52_676 / 1_000_000) * 10 + + (5_618_688 / 1_000_000) * 0.125; + assert.ok( + Math.abs(c.total - expected) < 1e-9, + `expected ${expected}, got ${c.total}`, + ); + assert.equal(c.reasoning, 0); + }); + + it('honors a separate reasoning tariff when models.dev provides one', async () => { + // Acceptance criterion from issue #32: a model with input=1, output=4, + // reasoning=8 and 1M tokens of each should bill 13. + const p: PricingTable = { + 'synthetic-reasoner': { + input: 1, + output: 4, + reasoning: 8, + cacheRead: 0, + cacheWrite: 1, + reasoningMode: 'separate', + }, + }; + const c = costForUsage( + { + input: 1_000_000, + output: 1_000_000, + reasoning: 1_000_000, + cacheRead: 0, + cacheCreate5m: 0, + cacheCreate1h: 0, + }, + 'synthetic-reasoner', + p, + ); + assert.ok(c); + assert.equal(c.input, 1); + assert.equal(c.output, 4); + assert.equal(c.reasoning, 8); + assert.equal(c.total, 13); + }); + + it('explicit reasoningMode option overrides the model default', async () => { + const p: PricingTable = { + 'override-test': { + input: 1, + output: 10, + cacheRead: 0, + cacheWrite: 1, + reasoningMode: 'same_as_output', + }, + }; + const usage = { + input: 0, + output: 0, + reasoning: 1_000_000, + cacheRead: 0, + cacheCreate5m: 0, + cacheCreate1h: 0, + }; + const billed = costForUsage(usage, 'override-test', p); + const skipped = costForUsage(usage, 'override-test', p, { + reasoningMode: 'included_in_output', + }); + assert.ok(billed); + assert.ok(skipped); + assert.equal(billed.reasoning, 10); + 
assert.equal(skipped.reasoning, 0); + }); + it('returns null for unknown model', async () => { const p = await loadBuiltinPricing(); const c = costForTurn(turn('definitely-not-a-model', { input: 100 }), p); @@ -92,3 +226,64 @@ describe('cost', () => { assert.ok(rate.cacheRead < rate.input); }); }); + +describe('pricing.flatten', () => { + it('preserves cost.reasoning from models.dev and tags it `separate`', () => { + const root = { + acme: { + id: 'acme', + models: { + 'reasoner-v1': { + id: 'reasoner-v1', + cost: { + input: 0.7, + output: 2.8, + reasoning: 8.4, + cache_read: 0.07, + cache_write: 0.7, + }, + }, + }, + }, + }; + const table = flatten(root); + const entry = table['reasoner-v1']; + assert.ok(entry, 'reasoner-v1 flattened'); + assert.equal(entry.input, 0.7); + assert.equal(entry.output, 2.8); + assert.equal(entry.reasoning, 8.4); + assert.equal(entry.cacheRead, 0.07); + assert.equal(entry.cacheWrite, 0.7); + assert.equal(entry.reasoningMode, 'separate'); + }); + + it('defaults reasoningMode to `same_as_output` when no reasoning tariff is given', () => { + const root = { + acme: { + id: 'acme', + models: { + 'plain-v1': { + id: 'plain-v1', + cost: { input: 1, output: 2 }, + }, + }, + }, + }; + const table = flatten(root); + const entry = table['plain-v1']; + assert.ok(entry); + assert.equal(entry.reasoningMode, 'same_as_output'); + assert.equal(entry.reasoning, undefined); + }); + + it('builtin snapshot preserves at least one separate-tariff model', async () => { + // Smoke test: prove the live snapshot loader retains cost.reasoning for + // providers like Alibaba's Qwen that publish a distinct tariff. 
+ const p = await loadBuiltinPricing(); + const separate = Object.values(p).filter((m) => m.reasoningMode === 'separate'); + assert.ok(separate.length > 0, 'expected at least one separate-tariff model'); + for (const m of separate) { + assert.equal(typeof m.reasoning, 'number'); + } + }); +}); diff --git a/packages/analyze/src/cost.ts b/packages/analyze/src/cost.ts index 423c425..9de5255 100644 --- a/packages/analyze/src/cost.ts +++ b/packages/analyze/src/cost.ts @@ -1,6 +1,6 @@ -import type { TurnRecord, Usage } from '@relayburn/reader'; +import type { SourceKind, TurnRecord, Usage } from '@relayburn/reader'; -import type { ModelCost, PricingTable } from './pricing.js'; +import type { ModelCost, PricingTable, ReasoningMode } from './pricing.js'; export interface CostBreakdown { model: string; @@ -12,18 +12,30 @@ export interface CostBreakdown { cacheCreate: number; } +export interface CostForUsageOptions { + /** + * Override the reasoning-billing semantics for this call. When omitted, the + * mode is taken from the resolved `ModelCost` (`reasoningMode`). When given, + * it wins — used by `costForTurn` to force `included_in_output` for sources + * (e.g. Codex) whose transcripts already fold reasoning into `output_tokens`. + */ + reasoningMode?: ReasoningMode; +} + const PER_MILLION = 1_000_000; export function costForUsage( usage: Usage, model: string, pricing: PricingTable, + options: CostForUsageOptions = {}, ): CostBreakdown | null { const rate = lookup(model, pricing); if (!rate) return null; + const mode: ReasoningMode = options.reasoningMode ?? 
rate.reasoningMode; const input = (usage.input / PER_MILLION) * rate.input; const output = (usage.output / PER_MILLION) * rate.output; - const reasoning = (usage.reasoning / PER_MILLION) * rate.output; + const reasoning = reasoningCost(usage.reasoning, rate, mode); const cacheRead = (usage.cacheRead / PER_MILLION) * rate.cacheRead; const cacheCreate = ((usage.cacheCreate5m + usage.cacheCreate1h) / PER_MILLION) * rate.cacheWrite; @@ -39,7 +51,38 @@ export function costForUsage( } export function costForTurn(turn: TurnRecord, pricing: PricingTable): CostBreakdown | null { - return costForUsage(turn.usage, turn.model, pricing); + const override = reasoningModeForSource(turn.source); + const opts: CostForUsageOptions = override ? { reasoningMode: override } : {}; + return costForUsage(turn.usage, turn.model, pricing, opts); +} + +function reasoningCost(reasoningTokens: number, rate: ModelCost, mode: ReasoningMode): number { + switch (mode) { + case 'included_in_output': + // Already billed inside `usage.output` — informational only. + return 0; + case 'separate': + // Use the model's distinct reasoning tariff. If the override forced this + // mode but the model has no `rate.reasoning`, fall back to the output + // rate so we never silently drop reasoning tokens. + return (reasoningTokens / PER_MILLION) * (rate.reasoning ?? rate.output); + case 'same_as_output': + default: + return (reasoningTokens / PER_MILLION) * rate.output; + } +} + +/** + * Per-source reasoning-billing semantics override. Returning `undefined` means + * "defer to the model's `reasoningMode`". + * + * - Codex: `output_tokens` already includes reasoning; never bill it on top. + * See `../research/ccusage/apps/codex/src/data-loader.ts` for prior art. + * - Everyone else: defer to the model. 
+ */ +function reasoningModeForSource(source: SourceKind): ReasoningMode | undefined { + if (source === 'codex') return 'included_in_output'; + return undefined; } function lookup(model: string, pricing: PricingTable): ModelCost | undefined { diff --git a/packages/analyze/src/index.ts b/packages/analyze/src/index.ts index 0f77cc7..358983d 100644 --- a/packages/analyze/src/index.ts +++ b/packages/analyze/src/index.ts @@ -1,7 +1,7 @@ -export { loadBuiltinPricing, loadPricing } from './pricing.js'; -export type { ModelCost, PricingTable } from './pricing.js'; +export { flatten, loadBuiltinPricing, loadPricing } from './pricing.js'; +export type { ModelCost, PricingTable, ReasoningMode } from './pricing.js'; export { costForTurn, costForUsage, sumCosts } from './cost.js'; -export type { CostBreakdown } from './cost.js'; +export type { CostBreakdown, CostForUsageOptions } from './cost.js'; export { buildCompareTable, DEFAULT_MIN_SAMPLE } from './compare.js'; export type { CompareCategory, CompareCell, CompareOptions, CompareTable } from './compare.js'; export { diff --git a/packages/analyze/src/plan-usage.test.ts b/packages/analyze/src/plan-usage.test.ts index a3ed08a..e3ef4cc 100644 --- a/packages/analyze/src/plan-usage.test.ts +++ b/packages/analyze/src/plan-usage.test.ts @@ -13,6 +13,7 @@ const PRICING: PricingTable = { output: 15, cacheRead: 0.3, cacheWrite: 3.75, + reasoningMode: 'same_as_output', }, }; diff --git a/packages/analyze/src/pricing.ts b/packages/analyze/src/pricing.ts index 1a54ec8..a853856 100644 --- a/packages/analyze/src/pricing.ts +++ b/packages/analyze/src/pricing.ts @@ -4,11 +4,30 @@ import { fileURLToPath } from 'node:url'; import { pricingOverridePath } from '@relayburn/ledger'; +/** + * How a model's reasoning tokens should be priced. + * + * - `included_in_output`: The harness/source already counts reasoning tokens + * inside `output_tokens`, so `usage.reasoning` is informational only and + * must NOT be billed on top of `usage.output`. 
Codex transcripts behave + * this way. + * - `separate`: The model has a distinct reasoning tariff (`cost.reasoning` + * in the `models.dev` snapshot). Bill `usage.reasoning` at that tariff. + * - `same_as_output`: `usage.output` and `usage.reasoning` are non-overlapping + * token buckets and there is no distinct reasoning tariff. Bill + * `usage.reasoning` at the output rate. Anthropic Claude transcripts are + * the canonical example. + */ +export type ReasoningMode = 'included_in_output' | 'separate' | 'same_as_output'; + export interface ModelCost { input: number; output: number; cacheRead: number; cacheWrite: number; + /** Per-million reasoning-token tariff. Set iff `reasoningMode === 'separate'`. */ + reasoning?: number; + reasoningMode: ReasoningMode; } export type PricingTable = Record; @@ -20,6 +39,7 @@ interface ModelsDevModel { output?: number; cache_read?: number; cache_write?: number; + reasoning?: number; }; } @@ -54,7 +74,7 @@ async function loadFromFile(filePath: string): Promise { return flatten(parsed); } -function flatten(root: ModelsDevRoot): PricingTable { +export function flatten(root: ModelsDevRoot): PricingTable { const out: PricingTable = {}; for (const provider of Object.values(root)) { const models = provider.models; @@ -62,12 +82,18 @@ function flatten(root: ModelsDevRoot): PricingTable { for (const [id, model] of Object.entries(models)) { const cost = model.cost; if (!cost || typeof cost.input !== 'number' || typeof cost.output !== 'number') continue; - out[id] = { + const hasReasoning = typeof cost.reasoning === 'number'; + const entry: ModelCost = { input: cost.input, output: cost.output, cacheRead: cost.cache_read ?? 0, cacheWrite: cost.cache_write ?? cost.input, + reasoningMode: hasReasoning ? 
'separate' : 'same_as_output', }; + if (hasReasoning && typeof cost.reasoning === 'number') { + entry.reasoning = cost.reasoning; + } + out[id] = entry; } } return out; diff --git a/packages/mcp/src/end-to-end.test.ts b/packages/mcp/src/end-to-end.test.ts index ffa01c9..8920214 100644 --- a/packages/mcp/src/end-to-end.test.ts +++ b/packages/mcp/src/end-to-end.test.ts @@ -16,7 +16,13 @@ interface JsonRpcResponse { } const PRICING: PricingTable = { - 'claude-sonnet-4-5': { input: 3, output: 15, cacheRead: 0.3, cacheWrite: 3.75 }, + 'claude-sonnet-4-5': { + input: 3, + output: 15, + cacheRead: 0.3, + cacheWrite: 3.75, + reasoningMode: 'same_as_output', + }, }; function turn(): EnrichedTurn { diff --git a/packages/mcp/src/tools/session-cost.test.ts b/packages/mcp/src/tools/session-cost.test.ts index 4178f4d..af9e0b3 100644 --- a/packages/mcp/src/tools/session-cost.test.ts +++ b/packages/mcp/src/tools/session-cost.test.ts @@ -12,6 +12,7 @@ const PRICING: PricingTable = { output: 15, cacheRead: 0.3, cacheWrite: 3.75, + reasoningMode: 'same_as_output', }, }; From d1ad409104a25d58918af633c225907fd2cd2f04 Mon Sep 17 00:00:00 2001 From: Will Washburn Date: Fri, 24 Apr 2026 22:19:45 -0400 Subject: [PATCH 2/2] Make waste-attribution session totals reasoning-mode aware (Devin review on #73) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `packages/analyze/src/waste.ts` had a private `costForTurnLocal` that duplicated `costForUsage`'s arithmetic but predated the reasoning-mode work in this PR. It unconditionally billed `usage.reasoning` at the output rate, so: 1. Codex turns were still double-billed in `sessionGrand` / `grandCost` / `unattributedCost` — the exact bug #32 was supposed to fix, just in a different code path. 2. Models with a separate reasoning tariff (e.g. Alibaba Qwen) were billed at the output rate instead of `rate.reasoning`. 
The fix is to delegate to the canonical `costForTurn`, which already threads `reasoningModeForSource` (Codex -> `included_in_output`) and honors `ModelCost.reasoningMode` per model. That keeps waste totals consistent with `cost.ts` / `costForTurn` for any session involving reasoning tokens. Adds a regression test that constructs a Codex turn with input/output/reasoning and asserts `attributeWaste(...).grandTotal` does not include `reasoning x output_rate`. Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/analyze/CHANGELOG.md | 1 + packages/analyze/src/waste.test.ts | 42 ++++++++++++++++++++++++++++++ packages/analyze/src/waste.ts | 22 +++++----------- 3 files changed, 50 insertions(+), 15 deletions(-) diff --git a/packages/analyze/CHANGELOG.md b/packages/analyze/CHANGELOG.md index 5b148a3..b766fd5 100644 --- a/packages/analyze/CHANGELOG.md +++ b/packages/analyze/CHANGELOG.md @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - **Reasoning-token pricing semantics** (#32). Two correctness bugs that distorted reported spend whenever reasoning tokens were involved: - Codex `usage.reasoning` was double-billed at the output rate even though Codex's `output_tokens` already includes reasoning. `burn` now treats Codex turns as `included_in_output` and bills `output` only. On a 10-turn Codex sample (660k input / 53k output / 29k reasoning / 5.6M cacheRead), this drops the reported cost from $4.282607 to $3.846557 — about 11% off the Codex slice. - `cost.reasoning` from the `models.dev` snapshot was discarded during `flatten()`, so any model with a distinct reasoning tariff (e.g. Alibaba Qwen reasoning models) couldn't be priced correctly. The flattener now preserves `reasoning` and tags the entry `reasoningMode: 'separate'`; `costForUsage` honors the distinct tariff. +- **Waste-attribution session totals now honor the same reasoning-mode semantics** as `costForTurn`. 
`attributeWaste` previously had a private `costForTurnLocal` that unconditionally billed reasoning at the output rate, which double-billed Codex turns and ignored separate reasoning tariffs in `sessionGrand` / `grandCost` / `unattributedCost`. It now delegates to `costForTurn`, so waste totals match `cost.ts` for any session involving reasoning tokens (Devin review on #73). ### Added diff --git a/packages/analyze/src/waste.test.ts b/packages/analyze/src/waste.test.ts index f8dffef..ddeef44 100644 --- a/packages/analyze/src/waste.test.ts +++ b/packages/analyze/src/waste.test.ts @@ -375,6 +375,48 @@ describe('attributeWaste', () => { assert.ok(Math.abs(a.persistenceCost - expectedPersistence) < 1e-9, `persistenceCost=${a.persistenceCost} expected=${expectedPersistence}`); }); + it("session grand total honors source-aware reasoning semantics (Codex doesn't double-bill)", async () => { + // Regression test: `attributeWaste` must use the canonical `costForTurn` + // so it inherits per-source reasoning-billing semantics (`included_in_output` + // for Codex). Otherwise waste's `sessionGrand` overstates Codex spend by + // `reasoning × output_rate`, contradicting `costForTurn` totals. + const pricing = await loadBuiltinPricing(); + // Pick a model that exists in the snapshot under both Anthropic and openai + // routes is not required — we just need a known Codex model. `gpt-5-codex` + // is the canonical Codex model in the issue. Fall back to an Anthropic + // model if it's missing from the snapshot. + const codexModel = pricing['gpt-5-codex'] ? 'gpt-5-codex' : 'claude-sonnet-4-6'; + const sessionId = 's-codex-reasoning'; + const turns: TurnRecord[] = [ + turn({ + sessionId, + messageId: 'msg-0', + turnIndex: 0, + source: 'codex', + model: codexModel, + usage: { + input: 1000, + // Codex's `output_tokens` already includes reasoning. Reasoning + // must NOT be billed on top. 
+ output: 500, + reasoning: 200, + cacheRead: 0, + cacheCreate5m: 0, + cacheCreate1h: 0, + }, + }), + ]; + const result = attributeWaste(turns, { pricing }); + + const rate = pricing[codexModel]!; + const expected = + (1000 / 1_000_000) * rate.input + (500 / 1_000_000) * rate.output; + assert.ok( + Math.abs(result.grandTotal - expected) < 1e-9, + `Codex sessionGrand should not include reasoning at output rate: got=${result.grandTotal} expected=${expected}`, + ); + }); + it('grand total + unattributed = session grand total within rounding', async () => { const pricing = await loadBuiltinPricing(); const sessionId = 's-totals'; diff --git a/packages/analyze/src/waste.ts b/packages/analyze/src/waste.ts index 3f2f688..1847ada 100644 --- a/packages/analyze/src/waste.ts +++ b/packages/analyze/src/waste.ts @@ -1,5 +1,6 @@ import type { ContentRecord, TurnRecord } from '@relayburn/reader'; +import { costForTurn } from './cost.js'; import type { ModelCost, PricingTable } from './pricing.js'; const PER_MILLION = 1_000_000; @@ -90,8 +91,12 @@ export function attributeWaste( let sessionGrand = 0; for (const t of sessionTurns) { - const cost = costForTurnLocal(t, pricing); - if (cost !== null) sessionGrand += cost; + // Use the canonical `costForTurn` so waste-attribution totals stay + // consistent with `cost.ts` for sessions involving reasoning tokens + // (Codex `included_in_output`, models with a separate reasoning tariff, + // etc.). Returns null for unknown models — skip those, same as before. 
+ const breakdown = costForTurn(t, pricing); + if (breakdown !== null) sessionGrand += breakdown.total; } let sessionAttributed = 0; @@ -348,19 +353,6 @@ function lookupRate(model: string, pricing: PricingTable): ModelCost | undefined return undefined; } -function costForTurnLocal(turn: TurnRecord, pricing: PricingTable): number | null { - const rate = lookupRate(turn.model, pricing); - if (!rate) return null; - const u = turn.usage; - return ( - (u.input / PER_MILLION) * rate.input + - (u.output / PER_MILLION) * rate.output + - (u.reasoning / PER_MILLION) * rate.output + - (u.cacheRead / PER_MILLION) * rate.cacheRead + - ((u.cacheCreate5m + u.cacheCreate1h) / PER_MILLION) * rate.cacheWrite - ); -} - export interface FileAggregation { path: string; toolCallCount: number;