AgentWorkforce · willwashburn · Apr 26, 2026 · Apr 25, 2026 · Apr 25, 2026 · Apr 25, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 
 ## [Unreleased]
 
+### Fixed
+
+- **Reasoning-token pricing semantics** (#32). User-visible cost numbers will change downward for any session with non-zero reasoning tokens — most notably Codex sessions, where reasoning was being billed twice (once inside `output_tokens` and again on top via `usage.reasoning`). On the documented 10-turn Codex sample the reported cost drops from $4.282607 to $3.846557, a ~10.2% reduction (equivalently, the old figure overstated the true cost by ~11.3%). Models with a distinct reasoning tariff in `models.dev` (e.g. Alibaba Qwen reasoning models) are now priced at that tariff instead of falling through to the output rate. The reader-level `usage.reasoning` field is unchanged — the bug was in pricing, not data capture. See `packages/analyze/CHANGELOG.md` for the full breakdown.
+
 ## [0.13.1] - 2026-04-25
 
 ### Added

diff --git a/README.md b/README.md
@@ -242,6 +242,16 @@ Together these make `oneShotRate = oneShotTurns / editTurns` computable directly
 
 Override ledger location with `RELAYBURN_HOME=/path/to/dir`.
 
+### Reasoning-token pricing semantics
+
+`usage.reasoning` on a `TurnRecord` is always preserved for observability, but how it's billed depends on the source and model:
+
+- **Codex (`source: 'codex'`)** — `output_tokens` already includes reasoning. `burn` does **not** double-bill reasoning on top of output. `usage.reasoning` is informational only. (Matches `ccusage`'s Codex semantics.)
+- **Models with a distinct `cost.reasoning` tariff in `models.dev`** — billed at that tariff (e.g. Alibaba Qwen reasoning models). The flattened `ModelCost` carries `reasoning` and `reasoningMode: 'separate'`.
+- **Everything else (Anthropic Claude, default)** — billed at the model's `output` rate. `reasoningMode: 'same_as_output'`.
+
+You can override per-call via `costForUsage(usage, model, pricing, { reasoningMode })`.
+
 ## CLI
 
 ```

diff --git a/packages/analyze/CHANGELOG.md b/packages/analyze/CHANGELOG.md
@@ -7,6 +7,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Fixed
+
+- **Reasoning-token pricing semantics** (#32). Two correctness bugs that distorted reported spend whenever reasoning tokens were involved:
+  - Codex `usage.reasoning` was double-billed at the output rate even though Codex's `output_tokens` already includes reasoning. `burn` now treats Codex turns as `included_in_output` and bills `output` only. On a 10-turn Codex sample (660k input / 53k output / 29k reasoning / 5.6M cacheRead), this drops the reported cost from $4.282607 to $3.846557 — a ~10.2% reduction; the old figure overstated the Codex slice by ~11.3%.
+  - `cost.reasoning` from the `models.dev` snapshot was discarded during `flatten()`, so any model with a distinct reasoning tariff (e.g. Alibaba Qwen reasoning models) couldn't be priced correctly. The flattener now preserves `reasoning` and tags the entry `reasoningMode: 'separate'`; `costForUsage` honors the distinct tariff.
+- **Waste-attribution session totals now honor the same reasoning-mode semantics** as `costForTurn`. `attributeWaste` previously had a private `costForTurnLocal` that unconditionally billed reasoning at the output rate, which double-billed Codex turns and ignored separate reasoning tariffs in `sessionGrand` / `grandCost` / `unattributedCost`. It now delegates to `costForTurn`, so waste totals match `cost.ts` for any session involving reasoning tokens (Devin review on #73).
+
+### Added
+
+- `ModelCost.reasoningMode: 'included_in_output' | 'separate' | 'same_as_output'` and optional `reasoning` per-million tariff. `ReasoningMode` and `CostForUsageOptions` are exported.
+- `costForUsage(usage, model, pricing, { reasoningMode })` accepts an explicit override. `costForTurn` infers `included_in_output` for `source: 'codex'` automatically.
+- `flatten` is now exported so callers can build `PricingTable`s from in-memory `models.dev` payloads.
+
 ## [0.14.0] - 2026-04-25
 
 ### Added

diff --git a/packages/analyze/src/cost.test.ts b/packages/analyze/src/cost.test.ts
@@ -1,15 +1,20 @@
 import { strict as assert } from 'node:assert';
 import { describe, it } from 'node:test';
 
-import type { TurnRecord } from '@relayburn/reader';
+import type { SourceKind, TurnRecord } from '@relayburn/reader';
 
 import { costForTurn, costForUsage } from './cost.js';
-import { loadBuiltinPricing } from './pricing.js';
+import { flatten, loadBuiltinPricing } from './pricing.js';
+import type { PricingTable } from './pricing.js';
 
-function turn(model: string, u: Partial<TurnRecord['usage']> = {}): TurnRecord {
+function turn(
+  model: string,
+  u: Partial<TurnRecord['usage']> = {},
+  source: SourceKind = 'claude-code',
+): TurnRecord {
   return {
     v: 1,
-    source: 'claude-code',
+    source,
     sessionId: 's',
     messageId: 'm',
     turnIndex: 0,
@@ -67,19 +72,148 @@ describe('cost', () => {
     assert.equal(c.cacheCreate, p['claude-opus-4-7']!.cacheWrite);
   });
 
-  it('bills reasoning tokens at the output rate and reports them separately', async () => {
+  it('bills reasoning at the output rate for Claude (same_as_output mode)', async () => {
     const p = await loadBuiltinPricing();
     const c = costForTurn(
       turn('claude-sonnet-4-6', { output: 1_000_000, reasoning: 1_000_000 }),
       p,
     );
     assert.ok(c);
     const rate = p['claude-sonnet-4-6']!;
+    assert.equal(rate.reasoningMode, 'same_as_output');
     assert.equal(c.output, rate.output);
     assert.equal(c.reasoning, rate.output);
     assert.equal(c.total, rate.output * 2);
   });
 
+  it('does NOT double-bill reasoning for Codex turns (included_in_output)', async () => {
+    // Acceptance criterion from issue #32: a Codex turn with
+    //   input = 1_000_000, output = 500_000, reasoning = 200_000
+    // and a model priced input=2.5/output=15 should bill 10.0, not 13.0.
+    const p: PricingTable = {
+      'gpt-5-codex': {
+        input: 2.5,
+        output: 15,
+        cacheRead: 0,
+        cacheWrite: 2.5,
+        reasoningMode: 'same_as_output',
+      },
+    };
+    const c = costForTurn(
+      turn(
+        'gpt-5-codex',
+        { input: 1_000_000, output: 500_000, reasoning: 200_000 },
+        'codex',
+      ),
+      p,
+    );
+    assert.ok(c);
+    assert.equal(c.input, 2.5);
+    assert.equal(c.output, 7.5);
+    assert.equal(c.reasoning, 0, 'reasoning is informational for Codex, not billed');
+    assert.equal(c.total, 10.0);
+  });
+
+  it('Codex regression: 11.3% overstatement scenario from the issue', async () => {
+    // 10 Codex turns aggregated: input 660_698, output 52_676, reasoning 29_070,
+    // cacheRead 5_618_688. The issue documents $4.282607 (double-billed) vs
+    // $3.846557 (corrected). Those dollar figures correspond to input=2.5,
+    // output=15, cacheRead=0.25, so we price at those rates and pin them.
+    const p: PricingTable = {
+      'gpt-5-codex': {
+        input: 2.5,
+        output: 15,
+        cacheRead: 0.25,
+        cacheWrite: 2.5,
+        reasoningMode: 'same_as_output',
+      },
+    };
+    const c = costForTurn(
+      turn(
+        'gpt-5-codex',
+        {
+          input: 660_698,
+          output: 52_676,
+          reasoning: 29_070,
+          cacheRead: 5_618_688,
+        },
+        'codex',
+      ),
+      p,
+    );
+    assert.ok(c);
+    // Corrected total = input + output + cacheRead; reasoning bills as zero.
+    // Adding 29_070 reasoning tokens back at the output rate must reproduce
+    // the old double-billed figure, which ties both documented numbers down.
+    const doubleBilled = c.total + (29_070 / 1_000_000) * 15;
+    assert.ok(
+      Math.abs(c.total - 3.846557) < 1e-6,
+      `expected 3.846557, got ${c.total}`,
+    );
+    assert.ok(Math.abs(doubleBilled - 4.282607) < 1e-6, `got ${doubleBilled}`);
+    assert.equal(c.reasoning, 0);
+  });
+
+  it('honors a separate reasoning tariff when models.dev provides one', async () => {
+    // Acceptance criterion from issue #32: a model with input=1, output=4,
+    // reasoning=8 and 1M tokens of each should bill 13.
+    const p: PricingTable = {
+      'synthetic-reasoner': {
+        input: 1,
+        output: 4,
+        reasoning: 8,
+        cacheRead: 0,
+        cacheWrite: 1,
+        reasoningMode: 'separate',
+      },
+    };
+    const c = costForUsage(
+      {
+        input: 1_000_000,
+        output: 1_000_000,
+        reasoning: 1_000_000,
+        cacheRead: 0,
+        cacheCreate5m: 0,
+        cacheCreate1h: 0,
+      },
+      'synthetic-reasoner',
+      p,
+    );
+    assert.ok(c);
+    assert.equal(c.input, 1);
+    assert.equal(c.output, 4);
+    assert.equal(c.reasoning, 8);
+    assert.equal(c.total, 13);
+  });
+
+  it('explicit reasoningMode option overrides the model default', async () => {
+    const p: PricingTable = {
+      'override-test': {
+        input: 1,
+        output: 10,
+        cacheRead: 0,
+        cacheWrite: 1,
+        reasoningMode: 'same_as_output',
+      },
+    };
+    const usage = {
+      input: 0,
+      output: 0,
+      reasoning: 1_000_000,
+      cacheRead: 0,
+      cacheCreate5m: 0,
+      cacheCreate1h: 0,
+    };
+    const billed = costForUsage(usage, 'override-test', p);
+    const skipped = costForUsage(usage, 'override-test', p, {
+      reasoningMode: 'included_in_output',
+    });
+    assert.ok(billed);
+    assert.ok(skipped);
+    assert.equal(billed.reasoning, 10);
+    assert.equal(skipped.reasoning, 0);
+  });
+
   it('returns null for unknown model', async () => {
     const p = await loadBuiltinPricing();
     const c = costForTurn(turn('definitely-not-a-model', { input: 100 }), p);
@@ -92,3 +226,64 @@ describe('cost', () => {
     assert.ok(rate.cacheRead < rate.input);
   });
 });
+
+describe('pricing.flatten', () => {
+  it('preserves cost.reasoning from models.dev and tags it `separate`', () => {
+    const root = {
+      acme: {
+        id: 'acme',
+        models: {
+          'reasoner-v1': {
+            id: 'reasoner-v1',
+            cost: {
+              input: 0.7,
+              output: 2.8,
+              reasoning: 8.4,
+              cache_read: 0.07,
+              cache_write: 0.7,
+            },
+          },
+        },
+      },
+    };
+    const table = flatten(root);
+    const entry = table['reasoner-v1'];
+    assert.ok(entry, 'reasoner-v1 flattened');
+    assert.equal(entry.input, 0.7);
+    assert.equal(entry.output, 2.8);
+    assert.equal(entry.reasoning, 8.4);
+    assert.equal(entry.cacheRead, 0.07);
+    assert.equal(entry.cacheWrite, 0.7);
+    assert.equal(entry.reasoningMode, 'separate');
+  });
+
+  it('defaults reasoningMode to `same_as_output` when no reasoning tariff is given', () => {
+    const root = {
+      acme: {
+        id: 'acme',
+        models: {
+          'plain-v1': {
+            id: 'plain-v1',
+            cost: { input: 1, output: 2 },
+          },
+        },
+      },
+    };
+    const table = flatten(root);
+    const entry = table['plain-v1'];
+    assert.ok(entry);
+    assert.equal(entry.reasoningMode, 'same_as_output');
+    assert.equal(entry.reasoning, undefined);
+  });
+
+  it('builtin snapshot preserves at least one separate-tariff model', async () => {
+    // Smoke test: prove the live snapshot loader retains cost.reasoning for
+    // providers like Alibaba's Qwen that publish a distinct tariff.
+    const p = await loadBuiltinPricing();
+    const separate = Object.values(p).filter((m) => m.reasoningMode === 'separate');
+    assert.ok(separate.length > 0, 'expected at least one separate-tariff model');
+    for (const m of separate) {
+      assert.equal(typeof m.reasoning, 'number');
+    }
+  });
+});
diff --git a/packages/analyze/src/cost.ts b/packages/analyze/src/cost.ts
@@ -1,6 +1,6 @@
-import type { TurnRecord, Usage } from '@relayburn/reader';
+import type { SourceKind, TurnRecord, Usage } from '@relayburn/reader';
 
-import type { ModelCost, PricingTable } from './pricing.js';
+import type { ModelCost, PricingTable, ReasoningMode } from './pricing.js';
 import { resolveProvider } from './provider-reattribution.js';
 
 export interface CostBreakdown {
@@ -13,18 +13,30 @@ export interface CostBreakdown {
   cacheCreate: number;
 }
 
+export interface CostForUsageOptions {
+  /**
+   * Override the reasoning-billing semantics for this call. When omitted, the
+   * mode is taken from the resolved `ModelCost` (`reasoningMode`). When given,
+   * it wins — used by `costForTurn` to force `included_in_output` for sources
+   * (e.g. Codex) whose transcripts already fold reasoning into `output_tokens`.
+   */
+  reasoningMode?: ReasoningMode;
+}
+
 const PER_MILLION = 1_000_000;
 
 export function costForUsage(
   usage: Usage,
   model: string,
   pricing: PricingTable,
+  options: CostForUsageOptions = {},
 ): CostBreakdown | null {
   const rate = lookupModelRate(model, pricing);
   if (!rate) return null;
+  const mode: ReasoningMode = options.reasoningMode ?? rate.reasoningMode;
   const input = (usage.input / PER_MILLION) * rate.input;
   const output = (usage.output / PER_MILLION) * rate.output;
-  const reasoning = (usage.reasoning / PER_MILLION) * rate.output;
+  const reasoning = reasoningCost(usage.reasoning, rate, mode);
   const cacheRead = (usage.cacheRead / PER_MILLION) * rate.cacheRead;
   const cacheCreate =
     ((usage.cacheCreate5m + usage.cacheCreate1h) / PER_MILLION) * rate.cacheWrite;
@@ -40,7 +52,38 @@ export function costForUsage(
 }
 
 export function costForTurn(turn: TurnRecord, pricing: PricingTable): CostBreakdown | null {
-  return costForUsage(turn.usage, turn.model, pricing);
+  const override = reasoningModeForSource(turn.source);
+  const opts: CostForUsageOptions = override ? { reasoningMode: override } : {};
+  return costForUsage(turn.usage, turn.model, pricing, opts);
+}
+
+function reasoningCost(reasoningTokens: number, rate: ModelCost, mode: ReasoningMode): number {
+  switch (mode) {
+    case 'included_in_output':
+      // Already billed inside `usage.output` — informational only.
+      return 0;
+    case 'separate':
+      // Use the model's distinct reasoning tariff. If the override forced this
+      // mode but the model has no `rate.reasoning`, fall back to the output
+      // rate so we never silently drop reasoning tokens.
+      return (reasoningTokens / PER_MILLION) * (rate.reasoning ?? rate.output);
+    case 'same_as_output':
+    default:
+      return (reasoningTokens / PER_MILLION) * rate.output;
+  }
+}
+
+/**
+ * Per-source reasoning-billing semantics override. Returning `undefined` means
+ * "defer to the model's `reasoningMode`".
+ *
+ * - Codex: `output_tokens` already includes reasoning; never bill it on top.
+ *   See `../research/ccusage/apps/codex/src/data-loader.ts` for prior art.
+ * - Everyone else: defer to the model.
+ */
+function reasoningModeForSource(source: SourceKind): ReasoningMode | undefined {
+  if (source === 'codex') return 'included_in_output';
+  return undefined;
 }
 
 // Shared lookup: direct match → synthetic reattribution (issue #31, e.g.

diff --git a/packages/analyze/src/index.ts b/packages/analyze/src/index.ts
@@ -1,7 +1,7 @@
-export { loadBuiltinPricing, loadPricing } from './pricing.js';
-export type { ModelCost, PricingTable } from './pricing.js';
+export { flatten, loadBuiltinPricing, loadPricing } from './pricing.js';
+export type { ModelCost, PricingTable, ReasoningMode } from './pricing.js';
 export { costForTurn, costForUsage, sumCosts } from './cost.js';
-export type { CostBreakdown } from './cost.js';
+export type { CostBreakdown, CostForUsageOptions } from './cost.js';
 export { buildCompareTable, DEFAULT_MIN_SAMPLE } from './compare.js';
 export type { CompareCategory, CompareCell, CompareOptions, CompareTable } from './compare.js';
 export {

diff --git a/packages/analyze/src/plan-usage.test.ts b/packages/analyze/src/plan-usage.test.ts
@@ -13,6 +13,7 @@ const PRICING: PricingTable = {
     output: 15,
     cacheRead: 0.3,
     cacheWrite: 3.75,
+    reasoningMode: 'same_as_output',
   },
 };