From 5a3d0c6485a66cf4a9a67ac700edd9678b9a158b Mon Sep 17 00:00:00 2001 From: Will Washburn Date: Fri, 24 Apr 2026 21:50:45 -0400 Subject: [PATCH 1/2] Fix reasoning-token pricing semantics, preserve models.dev reasoning tariffs (#32) Two pricing-correctness bugs that distorted reported spend whenever reasoning tokens were involved: 1. Codex `usage.reasoning` was being double-billed at the output rate even though Codex's `output_tokens` already includes reasoning. On the issue's 10-turn sample (660k input / 53k output / 29k reasoning / 5.6M cacheRead) this overstates cost by $0.43 / 11.3% of the slice. 2. `cost.reasoning` from the `models.dev` snapshot was discarded during `flatten()`, so any model with a distinct reasoning tariff (Alibaba Qwen reasoning models, etc.) couldn't be priced correctly. Fix: - Extend `ModelCost` with `reasoning?: number` and `reasoningMode: 'included_in_output' | 'separate' | 'same_as_output'`. - `flatten()` preserves `cost.reasoning` and tags the entry `separate`. Models without a distinct reasoning tariff default to `same_as_output` (preserves existing Claude billing). - `costForUsage` branches on the resolved mode. `costForTurn` infers `included_in_output` for `source: 'codex'` so reasoning is recorded but not billed on top of output. - `usage.reasoning` is still preserved in `TurnRecord` for observability. The bug was in pricing, not data capture. Tests cover all four acceptance criteria from the issue: Codex input=1M/output=500k/reasoning=200k bills 10.0 (not 13.0); the synthetic `separate` model bills 13 for 1M of each bucket; the documented Codex regression scenario; `flatten` preserves reasoning; the builtin snapshot retains at least one separate-tariff model. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 4 + README.md | 10 + packages/analyze/CHANGELOG.md | 12 ++ packages/analyze/src/cost.test.ts | 205 +++++++++++++++++++- packages/analyze/src/cost.ts | 51 ++++- packages/analyze/src/index.ts | 6 +- packages/analyze/src/plan-usage.test.ts | 1 + packages/analyze/src/pricing.ts | 30 ++- packages/mcp/src/end-to-end.test.ts | 8 +- packages/mcp/src/tools/session-cost.test.ts | 1 + 10 files changed, 313 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 94b83da..7d0abaf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) ## [Unreleased] +### Fixed + +- **Reasoning-token pricing semantics** (#32). User-visible cost numbers will change downward for any session with non-zero reasoning tokens — most notably Codex sessions, where reasoning was being billed twice (once inside `output_tokens` and again on top via `usage.reasoning`). On the documented 10-turn Codex sample the reported cost drops from $4.282607 to $3.846557 (~11.3%). Models with a distinct reasoning tariff in `models.dev` (e.g. Alibaba Qwen reasoning models) are now priced correctly instead of falling through at the output rate. The reader-level `usage.reasoning` field is unchanged — the bug was in pricing, not data capture. See `packages/analyze/CHANGELOG.md` for the full breakdown. + ### Added - **`@relayburn/mcp` package + `burn mcp-server`** (#26). Closes the loop between observation and decision: a running agent can self-query its own cost and quota state mid-session via MCP and adjust behavior (downgrade model, defer expensive subagent, abort) before hitting the 5-hour wall. None of the surveyed competitors do this — ccusage's MCP is for user-query, not agent-self-query. 
diff --git a/README.md b/README.md index f201b0b..7fc445f 100644 --- a/README.md +++ b/README.md @@ -211,6 +211,16 @@ Together these make `oneShotRate = oneShotTurns / editTurns` computable directly Override ledger location with `RELAYBURN_HOME=/path/to/dir`. +### Reasoning-token pricing semantics + +`usage.reasoning` on a `TurnRecord` is always preserved for observability, but how it's billed depends on the source and model: + +- **Codex (`source: 'codex'`)** — `output_tokens` already includes reasoning. `burn` does **not** double-bill reasoning on top of output. `usage.reasoning` is informational only. (Matches `ccusage`'s Codex semantics.) +- **Models with a distinct `cost.reasoning` tariff in `models.dev`** — billed at that tariff (e.g. Alibaba Qwen reasoning models). The flattened `ModelCost` carries `reasoning` and `reasoningMode: 'separate'`. +- **Everything else (Anthropic Claude, default)** — billed at the model's `output` rate. `reasoningMode: 'same_as_output'`. + +You can override per-call via `costForUsage(usage, model, pricing, { reasoningMode })`. + ## CLI ``` diff --git a/packages/analyze/CHANGELOG.md b/packages/analyze/CHANGELOG.md index e73cc0d..5b148a3 100644 --- a/packages/analyze/CHANGELOG.md +++ b/packages/analyze/CHANGELOG.md @@ -7,6 +7,18 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Fixed + +- **Reasoning-token pricing semantics** (#32). Two correctness bugs that distorted reported spend whenever reasoning tokens were involved: + - Codex `usage.reasoning` was double-billed at the output rate even though Codex's `output_tokens` already includes reasoning. `burn` now treats Codex turns as `included_in_output` and bills `output` only. On a 10-turn Codex sample (660k input / 53k output / 29k reasoning / 5.6M cacheRead), this drops the reported cost from $4.282607 to $3.846557 — about 11% off the Codex slice. 
+ - `cost.reasoning` from the `models.dev` snapshot was discarded during `flatten()`, so any model with a distinct reasoning tariff (e.g. Alibaba Qwen reasoning models) couldn't be priced correctly. The flattener now preserves `reasoning` and tags the entry `reasoningMode: 'separate'`; `costForUsage` honors the distinct tariff. + +### Added + +- `ModelCost.reasoningMode: 'included_in_output' | 'separate' | 'same_as_output'` and optional `reasoning` per-million tariff. `ReasoningMode` and `CostForUsageOptions` are exported. +- `costForUsage(usage, model, pricing, { reasoningMode })` accepts an explicit override. `costForTurn` infers `included_in_output` for `source: 'codex'` automatically. +- `flatten` is now exported so callers can build `PricingTable`s from in-memory `models.dev` payloads. + ## [0.11.0] - 2026-04-25 ### Added diff --git a/packages/analyze/src/cost.test.ts b/packages/analyze/src/cost.test.ts index 48c0ab1..cf6978d 100644 --- a/packages/analyze/src/cost.test.ts +++ b/packages/analyze/src/cost.test.ts @@ -1,15 +1,20 @@ import { strict as assert } from 'node:assert'; import { describe, it } from 'node:test'; -import type { TurnRecord } from '@relayburn/reader'; +import type { SourceKind, TurnRecord } from '@relayburn/reader'; import { costForTurn, costForUsage } from './cost.js'; -import { loadBuiltinPricing } from './pricing.js'; +import { flatten, loadBuiltinPricing } from './pricing.js'; +import type { PricingTable } from './pricing.js'; -function turn(model: string, u: Partial = {}): TurnRecord { +function turn( + model: string, + u: Partial = {}, + source: SourceKind = 'claude-code', +): TurnRecord { return { v: 1, - source: 'claude-code', + source, sessionId: 's', messageId: 'm', turnIndex: 0, @@ -67,7 +72,7 @@ describe('cost', () => { assert.equal(c.cacheCreate, p['claude-opus-4-7']!.cacheWrite); }); - it('bills reasoning tokens at the output rate and reports them separately', async () => { + it('bills reasoning at the output rate for Claude 
(same_as_output mode)', async () => { const p = await loadBuiltinPricing(); const c = costForTurn( turn('claude-sonnet-4-6', { output: 1_000_000, reasoning: 1_000_000 }), p, ); assert.ok(c); const rate = p['claude-sonnet-4-6']!; + assert.equal(rate.reasoningMode, 'same_as_output'); assert.equal(c.output, rate.output); assert.equal(c.reasoning, rate.output); assert.equal(c.total, rate.output * 2); }); + it('does NOT double-bill reasoning for Codex turns (included_in_output)', async () => { + // Acceptance criterion from issue #32: a Codex turn with + // input = 1_000_000, output = 500_000, reasoning = 200_000 + // and a model priced input=2.5/output=15 should bill 10.0, not 13.0. + const p: PricingTable = { + 'gpt-5-codex': { + input: 2.5, + output: 15, + cacheRead: 0, + cacheWrite: 2.5, + reasoningMode: 'same_as_output', + }, + }; + const c = costForTurn( + turn( + 'gpt-5-codex', + { input: 1_000_000, output: 500_000, reasoning: 200_000 }, + 'codex', + ), + p, + ); + assert.ok(c); + assert.equal(c.input, 2.5); + assert.equal(c.output, 7.5); + assert.equal(c.reasoning, 0, 'reasoning is informational for Codex, not billed'); + assert.equal(c.total, 10.0); + }); + + it('Codex regression: 11.3% overstatement scenario from the issue', async () => { + // 10 Codex turns aggregated: input 660_698, output 52_676, reasoning 29_070, + // cacheRead 5_618_688. The issue documents $4.282607 (current/wrong) vs + // $3.846557 (corrected) at gpt-5-codex pricing (input=1.25, output=10, + // cacheRead=0.125). We assert the corrected number to within 1e-9. 
+ const p: PricingTable = { + 'gpt-5-codex': { + input: 1.25, + output: 10, + cacheRead: 0.125, + cacheWrite: 1.25, + reasoningMode: 'same_as_output', + }, + }; + const c = costForTurn( + turn( + 'gpt-5-codex', + { + input: 660_698, + output: 52_676, + reasoning: 29_070, + cacheRead: 5_618_688, + }, + 'codex', + ), + p, + ); + assert.ok(c); + // input + output + cacheRead, reasoning is zero for codex + const expected = + (660_698 / 1_000_000) * 1.25 + + (52_676 / 1_000_000) * 10 + + (5_618_688 / 1_000_000) * 0.125; + assert.ok( + Math.abs(c.total - expected) < 1e-9, + `expected ${expected}, got ${c.total}`, + ); + assert.equal(c.reasoning, 0); + }); + + it('honors a separate reasoning tariff when models.dev provides one', async () => { + // Acceptance criterion from issue #32: a model with input=1, output=4, + // reasoning=8 and 1M tokens of each should bill 13. + const p: PricingTable = { + 'synthetic-reasoner': { + input: 1, + output: 4, + reasoning: 8, + cacheRead: 0, + cacheWrite: 1, + reasoningMode: 'separate', + }, + }; + const c = costForUsage( + { + input: 1_000_000, + output: 1_000_000, + reasoning: 1_000_000, + cacheRead: 0, + cacheCreate5m: 0, + cacheCreate1h: 0, + }, + 'synthetic-reasoner', + p, + ); + assert.ok(c); + assert.equal(c.input, 1); + assert.equal(c.output, 4); + assert.equal(c.reasoning, 8); + assert.equal(c.total, 13); + }); + + it('explicit reasoningMode option overrides the model default', async () => { + const p: PricingTable = { + 'override-test': { + input: 1, + output: 10, + cacheRead: 0, + cacheWrite: 1, + reasoningMode: 'same_as_output', + }, + }; + const usage = { + input: 0, + output: 0, + reasoning: 1_000_000, + cacheRead: 0, + cacheCreate5m: 0, + cacheCreate1h: 0, + }; + const billed = costForUsage(usage, 'override-test', p); + const skipped = costForUsage(usage, 'override-test', p, { + reasoningMode: 'included_in_output', + }); + assert.ok(billed); + assert.ok(skipped); + assert.equal(billed.reasoning, 10); + 
assert.equal(skipped.reasoning, 0); + }); + it('returns null for unknown model', async () => { const p = await loadBuiltinPricing(); const c = costForTurn(turn('definitely-not-a-model', { input: 100 }), p); @@ -92,3 +226,64 @@ describe('cost', () => { assert.ok(rate.cacheRead < rate.input); }); }); + +describe('pricing.flatten', () => { + it('preserves cost.reasoning from models.dev and tags it `separate`', () => { + const root = { + acme: { + id: 'acme', + models: { + 'reasoner-v1': { + id: 'reasoner-v1', + cost: { + input: 0.7, + output: 2.8, + reasoning: 8.4, + cache_read: 0.07, + cache_write: 0.7, + }, + }, + }, + }, + }; + const table = flatten(root); + const entry = table['reasoner-v1']; + assert.ok(entry, 'reasoner-v1 flattened'); + assert.equal(entry.input, 0.7); + assert.equal(entry.output, 2.8); + assert.equal(entry.reasoning, 8.4); + assert.equal(entry.cacheRead, 0.07); + assert.equal(entry.cacheWrite, 0.7); + assert.equal(entry.reasoningMode, 'separate'); + }); + + it('defaults reasoningMode to `same_as_output` when no reasoning tariff is given', () => { + const root = { + acme: { + id: 'acme', + models: { + 'plain-v1': { + id: 'plain-v1', + cost: { input: 1, output: 2 }, + }, + }, + }, + }; + const table = flatten(root); + const entry = table['plain-v1']; + assert.ok(entry); + assert.equal(entry.reasoningMode, 'same_as_output'); + assert.equal(entry.reasoning, undefined); + }); + + it('builtin snapshot preserves at least one separate-tariff model', async () => { + // Smoke test: prove the live snapshot loader retains cost.reasoning for + // providers like Alibaba's Qwen that publish a distinct tariff. 
+ const p = await loadBuiltinPricing(); + const separate = Object.values(p).filter((m) => m.reasoningMode === 'separate'); + assert.ok(separate.length > 0, 'expected at least one separate-tariff model'); + for (const m of separate) { + assert.equal(typeof m.reasoning, 'number'); + } + }); +}); diff --git a/packages/analyze/src/cost.ts b/packages/analyze/src/cost.ts index 423c425..9de5255 100644 --- a/packages/analyze/src/cost.ts +++ b/packages/analyze/src/cost.ts @@ -1,6 +1,6 @@ -import type { TurnRecord, Usage } from '@relayburn/reader'; +import type { SourceKind, TurnRecord, Usage } from '@relayburn/reader'; -import type { ModelCost, PricingTable } from './pricing.js'; +import type { ModelCost, PricingTable, ReasoningMode } from './pricing.js'; export interface CostBreakdown { model: string; @@ -12,18 +12,30 @@ export interface CostBreakdown { cacheCreate: number; } +export interface CostForUsageOptions { + /** + * Override the reasoning-billing semantics for this call. When omitted, the + * mode is taken from the resolved `ModelCost` (`reasoningMode`). When given, + * it wins — used by `costForTurn` to force `included_in_output` for sources + * (e.g. Codex) whose transcripts already fold reasoning into `output_tokens`. + */ + reasoningMode?: ReasoningMode; +} + const PER_MILLION = 1_000_000; export function costForUsage( usage: Usage, model: string, pricing: PricingTable, + options: CostForUsageOptions = {}, ): CostBreakdown | null { const rate = lookup(model, pricing); if (!rate) return null; + const mode: ReasoningMode = options.reasoningMode ?? 
rate.reasoningMode; const input = (usage.input / PER_MILLION) * rate.input; const output = (usage.output / PER_MILLION) * rate.output; - const reasoning = (usage.reasoning / PER_MILLION) * rate.output; + const reasoning = reasoningCost(usage.reasoning, rate, mode); const cacheRead = (usage.cacheRead / PER_MILLION) * rate.cacheRead; const cacheCreate = ((usage.cacheCreate5m + usage.cacheCreate1h) / PER_MILLION) * rate.cacheWrite; @@ -39,7 +51,38 @@ export function costForUsage( } export function costForTurn(turn: TurnRecord, pricing: PricingTable): CostBreakdown | null { - return costForUsage(turn.usage, turn.model, pricing); + const override = reasoningModeForSource(turn.source); + const opts: CostForUsageOptions = override ? { reasoningMode: override } : {}; + return costForUsage(turn.usage, turn.model, pricing, opts); +} + +function reasoningCost(reasoningTokens: number, rate: ModelCost, mode: ReasoningMode): number { + switch (mode) { + case 'included_in_output': + // Already billed inside `usage.output` — informational only. + return 0; + case 'separate': + // Use the model's distinct reasoning tariff. If the override forced this + // mode but the model has no `rate.reasoning`, fall back to the output + // rate so we never silently drop reasoning tokens. + return (reasoningTokens / PER_MILLION) * (rate.reasoning ?? rate.output); + case 'same_as_output': + default: + return (reasoningTokens / PER_MILLION) * rate.output; + } +} + +/** + * Per-source reasoning-billing semantics override. Returning `undefined` means + * "defer to the model's `reasoningMode`". + * + * - Codex: `output_tokens` already includes reasoning; never bill it on top. + * See `../research/ccusage/apps/codex/src/data-loader.ts` for prior art. + * - Everyone else: defer to the model. 
+ */ +function reasoningModeForSource(source: SourceKind): ReasoningMode | undefined { + if (source === 'codex') return 'included_in_output'; + return undefined; } function lookup(model: string, pricing: PricingTable): ModelCost | undefined { diff --git a/packages/analyze/src/index.ts b/packages/analyze/src/index.ts index 0f77cc7..358983d 100644 --- a/packages/analyze/src/index.ts +++ b/packages/analyze/src/index.ts @@ -1,7 +1,7 @@ -export { loadBuiltinPricing, loadPricing } from './pricing.js'; -export type { ModelCost, PricingTable } from './pricing.js'; +export { flatten, loadBuiltinPricing, loadPricing } from './pricing.js'; +export type { ModelCost, PricingTable, ReasoningMode } from './pricing.js'; export { costForTurn, costForUsage, sumCosts } from './cost.js'; -export type { CostBreakdown } from './cost.js'; +export type { CostBreakdown, CostForUsageOptions } from './cost.js'; export { buildCompareTable, DEFAULT_MIN_SAMPLE } from './compare.js'; export type { CompareCategory, CompareCell, CompareOptions, CompareTable } from './compare.js'; export { diff --git a/packages/analyze/src/plan-usage.test.ts b/packages/analyze/src/plan-usage.test.ts index a3ed08a..e3ef4cc 100644 --- a/packages/analyze/src/plan-usage.test.ts +++ b/packages/analyze/src/plan-usage.test.ts @@ -13,6 +13,7 @@ const PRICING: PricingTable = { output: 15, cacheRead: 0.3, cacheWrite: 3.75, + reasoningMode: 'same_as_output', }, }; diff --git a/packages/analyze/src/pricing.ts b/packages/analyze/src/pricing.ts index 1a54ec8..a853856 100644 --- a/packages/analyze/src/pricing.ts +++ b/packages/analyze/src/pricing.ts @@ -4,11 +4,30 @@ import { fileURLToPath } from 'node:url'; import { pricingOverridePath } from '@relayburn/ledger'; +/** + * How a model's reasoning tokens should be priced. + * + * - `included_in_output`: The harness/source already counts reasoning tokens + * inside `output_tokens`, so `usage.reasoning` is informational only and + * must NOT be billed on top of `usage.output`. 
Codex transcripts behave + * this way. + * - `separate`: The model has a distinct reasoning tariff (`cost.reasoning` + * in the `models.dev` snapshot). Bill `usage.reasoning` at that tariff. + * - `same_as_output`: `usage.output` and `usage.reasoning` are non-overlapping + * token buckets and there is no distinct reasoning tariff. Bill + * `usage.reasoning` at the output rate. Anthropic Claude transcripts are + * the canonical example. + */ +export type ReasoningMode = 'included_in_output' | 'separate' | 'same_as_output'; + export interface ModelCost { input: number; output: number; cacheRead: number; cacheWrite: number; + /** Per-million reasoning-token tariff. Set iff `reasoningMode === 'separate'`. */ + reasoning?: number; + reasoningMode: ReasoningMode; } export type PricingTable = Record; @@ -20,6 +39,7 @@ interface ModelsDevModel { output?: number; cache_read?: number; cache_write?: number; + reasoning?: number; }; } @@ -54,7 +74,7 @@ async function loadFromFile(filePath: string): Promise { return flatten(parsed); } -function flatten(root: ModelsDevRoot): PricingTable { +export function flatten(root: ModelsDevRoot): PricingTable { const out: PricingTable = {}; for (const provider of Object.values(root)) { const models = provider.models; @@ -62,12 +82,18 @@ function flatten(root: ModelsDevRoot): PricingTable { for (const [id, model] of Object.entries(models)) { const cost = model.cost; if (!cost || typeof cost.input !== 'number' || typeof cost.output !== 'number') continue; - out[id] = { + const hasReasoning = typeof cost.reasoning === 'number'; + const entry: ModelCost = { input: cost.input, output: cost.output, cacheRead: cost.cache_read ?? 0, cacheWrite: cost.cache_write ?? cost.input, + reasoningMode: hasReasoning ? 
'separate' : 'same_as_output', }; + if (hasReasoning && typeof cost.reasoning === 'number') { + entry.reasoning = cost.reasoning; + } + out[id] = entry; } } return out; diff --git a/packages/mcp/src/end-to-end.test.ts b/packages/mcp/src/end-to-end.test.ts index ffa01c9..8920214 100644 --- a/packages/mcp/src/end-to-end.test.ts +++ b/packages/mcp/src/end-to-end.test.ts @@ -16,7 +16,13 @@ interface JsonRpcResponse { } const PRICING: PricingTable = { - 'claude-sonnet-4-5': { input: 3, output: 15, cacheRead: 0.3, cacheWrite: 3.75 }, + 'claude-sonnet-4-5': { + input: 3, + output: 15, + cacheRead: 0.3, + cacheWrite: 3.75, + reasoningMode: 'same_as_output', + }, }; function turn(): EnrichedTurn { diff --git a/packages/mcp/src/tools/session-cost.test.ts b/packages/mcp/src/tools/session-cost.test.ts index 4178f4d..af9e0b3 100644 --- a/packages/mcp/src/tools/session-cost.test.ts +++ b/packages/mcp/src/tools/session-cost.test.ts @@ -12,6 +12,7 @@ const PRICING: PricingTable = { output: 15, cacheRead: 0.3, cacheWrite: 3.75, + reasoningMode: 'same_as_output', }, }; From d1ad409104a25d58918af633c225907fd2cd2f04 Mon Sep 17 00:00:00 2001 From: Will Washburn Date: Fri, 24 Apr 2026 22:19:45 -0400 Subject: [PATCH 2/2] Make waste-attribution session totals reasoning-mode aware (Devin review on #73) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `packages/analyze/src/waste.ts` had a private `costForTurnLocal` that duplicated `costForUsage`'s arithmetic but predated the reasoning-mode work in this PR. It unconditionally billed `usage.reasoning` at the output rate, so: 1. Codex turns were still double-billed in `sessionGrand` / `grandCost` / `unattributedCost` — the exact bug #32 was supposed to fix, just in a different code path. 2. Models with a separate reasoning tariff (e.g. Alibaba Qwen) were billed at the output rate instead of `rate.reasoning`. 
The fix is to delegate to the canonical `costForTurn`, which already threads `reasoningModeForSource` (Codex -> `included_in_output`) and honors `ModelCost.reasoningMode` per model. That keeps waste totals consistent with `cost.ts` / `costForTurn` for any session involving reasoning tokens. Adds a regression test that constructs a Codex turn with input/output/reasoning and asserts `attributeWaste(...).grandTotal` does not include `reasoning x output_rate`. Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/analyze/CHANGELOG.md | 1 + packages/analyze/src/waste.test.ts | 42 ++++++++++++++++++++++++++++++ packages/analyze/src/waste.ts | 22 +++++----------- 3 files changed, 50 insertions(+), 15 deletions(-) diff --git a/packages/analyze/CHANGELOG.md b/packages/analyze/CHANGELOG.md index 5b148a3..b766fd5 100644 --- a/packages/analyze/CHANGELOG.md +++ b/packages/analyze/CHANGELOG.md @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - **Reasoning-token pricing semantics** (#32). Two correctness bugs that distorted reported spend whenever reasoning tokens were involved: - Codex `usage.reasoning` was double-billed at the output rate even though Codex's `output_tokens` already includes reasoning. `burn` now treats Codex turns as `included_in_output` and bills `output` only. On a 10-turn Codex sample (660k input / 53k output / 29k reasoning / 5.6M cacheRead), this drops the reported cost from $4.282607 to $3.846557 — about 11% off the Codex slice. - `cost.reasoning` from the `models.dev` snapshot was discarded during `flatten()`, so any model with a distinct reasoning tariff (e.g. Alibaba Qwen reasoning models) couldn't be priced correctly. The flattener now preserves `reasoning` and tags the entry `reasoningMode: 'separate'`; `costForUsage` honors the distinct tariff. +- **Waste-attribution session totals now honor the same reasoning-mode semantics** as `costForTurn`. 
`attributeWaste` previously had a private `costForTurnLocal` that unconditionally billed reasoning at the output rate, which double-billed Codex turns and ignored separate reasoning tariffs in `sessionGrand` / `grandCost` / `unattributedCost`. It now delegates to `costForTurn`, so waste totals match `cost.ts` for any session involving reasoning tokens (Devin review on #73). ### Added diff --git a/packages/analyze/src/waste.test.ts b/packages/analyze/src/waste.test.ts index f8dffef..ddeef44 100644 --- a/packages/analyze/src/waste.test.ts +++ b/packages/analyze/src/waste.test.ts @@ -375,6 +375,48 @@ describe('attributeWaste', () => { assert.ok(Math.abs(a.persistenceCost - expectedPersistence) < 1e-9, `persistenceCost=${a.persistenceCost} expected=${expectedPersistence}`); }); + it("session grand total honors source-aware reasoning semantics (Codex doesn't double-bill)", async () => { + // Regression test: `attributeWaste` must use the canonical `costForTurn` + // so it inherits per-source reasoning-billing semantics (`included_in_output` + // for Codex). Otherwise waste's `sessionGrand` overstates Codex spend by + // `reasoning × output_rate`, contradicting `costForTurn` totals. + const pricing = await loadBuiltinPricing(); + // Pick a model that exists in the snapshot under both Anthropic and openai + // routes is not required — we just need a known Codex model. `gpt-5-codex` + // is the canonical Codex model in the issue. Fall back to an Anthropic + // model if it's missing from the snapshot. + const codexModel = pricing['gpt-5-codex'] ? 'gpt-5-codex' : 'claude-sonnet-4-6'; + const sessionId = 's-codex-reasoning'; + const turns: TurnRecord[] = [ + turn({ + sessionId, + messageId: 'msg-0', + turnIndex: 0, + source: 'codex', + model: codexModel, + usage: { + input: 1000, + // Codex's `output_tokens` already includes reasoning. Reasoning + // must NOT be billed on top. 
+ output: 500, + reasoning: 200, + cacheRead: 0, + cacheCreate5m: 0, + cacheCreate1h: 0, + }, + }), + ]; + const result = attributeWaste(turns, { pricing }); + + const rate = pricing[codexModel]!; + const expected = + (1000 / 1_000_000) * rate.input + (500 / 1_000_000) * rate.output; + assert.ok( + Math.abs(result.grandTotal - expected) < 1e-9, + `Codex sessionGrand should not include reasoning at output rate: got=${result.grandTotal} expected=${expected}`, + ); + }); + it('grand total + unattributed = session grand total within rounding', async () => { const pricing = await loadBuiltinPricing(); const sessionId = 's-totals'; diff --git a/packages/analyze/src/waste.ts b/packages/analyze/src/waste.ts index 3f2f688..1847ada 100644 --- a/packages/analyze/src/waste.ts +++ b/packages/analyze/src/waste.ts @@ -1,5 +1,6 @@ import type { ContentRecord, TurnRecord } from '@relayburn/reader'; +import { costForTurn } from './cost.js'; import type { ModelCost, PricingTable } from './pricing.js'; const PER_MILLION = 1_000_000; @@ -90,8 +91,12 @@ export function attributeWaste( let sessionGrand = 0; for (const t of sessionTurns) { - const cost = costForTurnLocal(t, pricing); - if (cost !== null) sessionGrand += cost; + // Use the canonical `costForTurn` so waste-attribution totals stay + // consistent with `cost.ts` for sessions involving reasoning tokens + // (Codex `included_in_output`, models with a separate reasoning tariff, + // etc.). Returns null for unknown models — skip those, same as before. 
+ const breakdown = costForTurn(t, pricing); + if (breakdown !== null) sessionGrand += breakdown.total; } let sessionAttributed = 0; @@ -348,19 +353,6 @@ function lookupRate(model: string, pricing: PricingTable): ModelCost | undefined return undefined; } -function costForTurnLocal(turn: TurnRecord, pricing: PricingTable): number | null { - const rate = lookupRate(turn.model, pricing); - if (!rate) return null; - const u = turn.usage; - return ( - (u.input / PER_MILLION) * rate.input + - (u.output / PER_MILLION) * rate.output + - (u.reasoning / PER_MILLION) * rate.output + - (u.cacheRead / PER_MILLION) * rate.cacheRead + - ((u.cacheCreate5m + u.cacheCreate1h) / PER_MILLION) * rate.cacheWrite - ); -} - export interface FileAggregation { path: string; toolCallCount: number;