From e343ef4f9239af9b1b6a5b099dbe396aa62d21b8 Mon Sep 17 00:00:00 2001
From: Mickael Farina <farina.mickael@gmail.com>
Date: Fri, 22 May 2026 15:30:12 +0200
Subject: [PATCH] refactor(llm): codec_llm raise_on_error mode + migrate
 fail-loud sites (A-12 tranche 2c)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PR-3E-2c. Adds the raise-on-failure contract that tranche 2 deferred, then
migrates the 4 sites that MUST fail loud.

New: codec_llm.LLMError + codec_llm.call(raise_on_error=True). When True, call()
raises LLMError on EVERY non-success path — non-200 (after retries), request
exception (after retries), and a 200 with empty/unparseable content. Default
stays False (never-raise -> ""), so the existing streaming/best-effort callers
(codec.py, qwen_call, compaction, dictate) are untouched — pinned by a
regression guard test.

Migrated:
- codec_textassist.call_qwen -> call(raise_on_error=True). Fixes a real bug: on
  LLM failure the never-raise path would pbcopy "" + Cmd-V, pasting EMPTY over
  the user's selection and showing "Text replaced!". Now the caller's except
  shows the Error overlay (also on empty-200). FINAL-ANSWER strip kept at the
  call site; <think> strip now handled by codec_llm.
- scripts/regen_skill_descriptions._llm -> call(raise_on_error=True). Fail-loud
  preserved (LLMError propagates like the old raise_for_status; empty-200 now
  raises instead of writing an empty description).
- codec_agent_plan._qwen_chat + codec_agent_runner._qwen_chat -> call(
  raise_on_error=True) behind a thin adapter that maps LLMError onto their
  PUBLIC QwenUnavailableError, so the daemon's `except QwenUnavailableError`
  retry/abort/resume logic is unchanged. Added a parallel _qwen_base() resolver
  (call-time config). These also gain <think> strip + enable_thinking=False ->
  more robust JSON parsing downstream.

Tests: tests/test_llm_raise_mode.py (14 — raise-mode success/non-200/exception/
empty-200, default-still-never-raises regression guard, agent adapters map to
QwenUnavailableError + pass content through, source invariants). 109 agent
tests (test_agent_plan/runner/chat_plan_persistence) still green. Full suite
1423 passing, 23 known-baseline failures, zero new. Zero net-new ruff (per-file
delta vs origin/main = 0). No skills/ touched -> no manifest regen.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 AGENTS.md                                  |   2 +-
 codec_agent_plan.py                        |  51 ++++---
 codec_agent_runner.py                      |  46 +++---
 codec_llm.py                               |  33 ++++-
 codec_textassist.py                        |  22 +--
 docs/PR3E2C-RAISE-MODE-DESIGN.md           |  71 +++++++++
 docs/audits/PHASE-1-CODE-QUALITY.md        |   4 +-
 docs/audits/PHASE-1-CONSOLIDATED-TRIAGE.md |   1 +
 scripts/regen_skill_descriptions.py        |  22 ++-
 tests/test_llm_raise_mode.py               | 165 +++++++++++++++++++++
 10 files changed, 345 insertions(+), 72 deletions(-)
 create mode 100644 docs/PR3E2C-RAISE-MODE-DESIGN.md
 create mode 100644 tests/test_llm_raise_mode.py
diff --git a/AGENTS.md b/AGENTS.md
index e95227d..83f5de1 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -56,7 +56,7 @@ docs/                        API.md, MCP_HTTP_SETUP.md, CONTEXT_REPORT.md, desig
 
 Other engine modules (`codec_overlays`, `codec_metrics`, `codec_logging`, `codec_gdocs`, `codec_google_auth`, `codec_cdp`, `codec_llm_proxy`, `codec_retry`, `codec_alerts`, `codec_search`, `codec_textassist`, `codec_watcher`, `codec_watchdog`) are internal helpers — read them when you need them, but they're not part of the navigation surface for an agent making structural changes. (Keyboard handling — wake word, F13 toggle, F18 voice, double-tap — lives **inline in `codec.py`** in the `codec` PM2 process; the old standalone `codec_keyboard.py` was deleted as a dead duplicate per A-8.)
 
-**Canonical LLM + vision helpers (PR-3E, A-11/A-12).** `codec_vision.py` is the SINGLE source for screen-vision (`describe_sync` / `describe_async`, Gemini-flash → local-Qwen-VL fallback, config read live from `codec_config`) — used by `codec.py`, `codec_voice`, `codec_session`. `codec_llm.py` is the canonical chat/completions caller (`call()` + `strip_think`/`extract_content` — headers, Bearer auth, `enable_thinking`, `<think>` strip, `choices/reasoning` parse, retry+backoff, never-raises). NOTE: `codec_llm_proxy.py` is a priority *queue* (semaphore), NOT an HTTP caller — don't confuse the two. A-12 is migrating the ~45 inline `chat/completions` sites onto `codec_llm` in phased tranches. Done: `codec_llm.call()` (non-stream) + `stream()` (sync SSE generator, yields raw deltas, never-raises); migrated sites = codec.py voice-reply, `codec_session.qwen_call` + `qwen_stream`, `codec_compaction`, `codec_dictate`. Pending tranches: 2c raise-mode (`codec_llm.call(raise_on_error=True)` for agent_plan/runner + textassist + the regen script — they MUST fail loud, never-raise would silently paste empty / write empty), an async `astream()` for voice `_stream_qwen` + agents (queue stays at the call site — `codec_llm` never owns the semaphore), dashboard (4 non-stream + the `[SKILL:…]` stream tag-machine, which keeps its own parser and consumes only `stream()`'s raw tokens), bridges, and a skills tranche.
+**Canonical LLM + vision helpers (PR-3E, A-11/A-12).** `codec_vision.py` is the SINGLE source for screen-vision (`describe_sync` / `describe_async`, Gemini-flash → local-Qwen-VL fallback, config read live from `codec_config`) — used by `codec.py`, `codec_voice`, `codec_session`. `codec_llm.py` is the canonical chat/completions caller (`call()` + `strip_think`/`extract_content` — headers, Bearer auth, `enable_thinking`, `<think>` strip, `choices/reasoning` parse, retry+backoff, never-raises). NOTE: `codec_llm_proxy.py` is a priority *queue* (semaphore), NOT an HTTP caller — don't confuse the two. A-12 is migrating the ~45 inline `chat/completions` sites onto `codec_llm` in phased tranches. Done: `codec_llm.call()` (non-stream; + `raise_on_error=True` raising `codec_llm.LLMError` for fail-loud callers) + `stream()` (sync SSE generator, yields raw deltas, never-raises); migrated sites = codec.py voice-reply, `codec_session.qwen_call` + `qwen_stream`, `codec_compaction`, `codec_dictate`, `codec_textassist`, the regen script, and `codec_agent_plan`/`codec_agent_runner` `_qwen_chat` (adapter maps `LLMError` → their public `QwenUnavailableError`). Pending tranches: an async `astream()` for voice `_stream_qwen` + agents `Agent.run` (queue stays at the call site — `codec_llm` never owns the semaphore), dashboard (4 non-stream + the `[SKILL:…]` stream tag-machine, which keeps its own parser and consumes only `stream()`'s raw tokens), bridges, and a skills tranche.
 
 ## 3. Agent + Crew runtime
 
diff --git a/codec_agent_plan.py b/codec_agent_plan.py
index 18d5d32..bb46c47 100644
--- a/codec_agent_plan.py
+++ b/codec_agent_plan.py
@@ -249,6 +249,15 @@ def _qwen_model() -> str:
         return "mlx-community/Qwen3.6-35B-A3B-4bit"
 
 
+def _qwen_base() -> str:
+    """Base URL (no /chat/completions) for codec_llm.call — call-time resolved."""
+    try:
+        from codec_config import QWEN_BASE_URL
+        return QWEN_BASE_URL
+    except Exception:
+        return "http://localhost:8083/v1"
+
+
 QWEN_URL = _qwen_url()       # back-compat — module-level constant for tests
 QWEN_MODEL = _qwen_model()   # back-compat
 QWEN_TIMEOUT = 60  # seconds
@@ -268,33 +277,27 @@ def _qwen_chat(user_prompt: str, system_prompt: str = "",
     assistant's content string. Raises QwenUnavailableError on
     network failure or non-2xx response.
 
-    URL + model resolved at call time via _qwen_url() / _qwen_model()
+    URL + model resolved at call time via _qwen_base() / _qwen_model()
     so they pick up ~/.codec/config.json:llm_base_url + :llm_model
     rather than the deploy-time hardcoded values."""
-    import requests  # lazy import — avoid forcing requests on test machines without it
-
-    payload = {
-        "model": _qwen_model(),
-        "messages": [
-            {"role": "system", "content": system_prompt or ""},
-            {"role": "user",   "content": user_prompt},
-        ],
-        "max_tokens": max_tokens,
-        "temperature": 0.2,
-    }
+    # A-12 (PR-3E-2c): canonical codec_llm.call(raise_on_error=True) replaces the
+    # inline POST + per-failure raises. The adapter maps codec_llm.LLMError onto
+    # the public QwenUnavailableError, so callers' `except QwenUnavailableError`
+    # is unchanged. (Now also strips <think> + enable_thinking=False — the
+    # downstream JSON parse is more robust for it.)
+    import codec_llm
     try:
-        r = requests.post(_qwen_url(), json=payload, timeout=QWEN_TIMEOUT)
-    except requests.exceptions.ConnectionError as e:
-        raise QwenUnavailableError(f"qwen3.6 unreachable: {e}")
-    except requests.exceptions.Timeout:
-        raise QwenUnavailableError("qwen3.6 request timed out")
-    if r.status_code != 200:
-        raise QwenUnavailableError(f"qwen3.6 returned {r.status_code}: {r.text[:200]}")
-    try:
-        data = r.json()
-        return data["choices"][0]["message"]["content"]
-    except (KeyError, json.JSONDecodeError) as e:
-        raise QwenUnavailableError(f"qwen3.6 returned malformed response: {e}")
+        return codec_llm.call(
+            [
+                {"role": "system", "content": system_prompt or ""},
+                {"role": "user",   "content": user_prompt},
+            ],
+            base_url=_qwen_base(), model=_qwen_model(),
+            max_tokens=max_tokens, temperature=0.2,
+            timeout=QWEN_TIMEOUT, raise_on_error=True,
+        )
+    except codec_llm.LLMError as e:
+        raise QwenUnavailableError(f"qwen3.6 unavailable: {e}") from e
 
 
 # ── Plan drafting ─────────────────────────────────────────────────────────────
diff --git a/codec_agent_runner.py b/codec_agent_runner.py
index 8f61f12..73e5a1f 100644
--- a/codec_agent_runner.py
+++ b/codec_agent_runner.py
@@ -239,6 +239,15 @@ def _qwen_model() -> str:
         return "mlx-community/Qwen3.6-35B-A3B-4bit"
 
 
+def _qwen_base() -> str:
+    """Base URL (no /chat/completions) for codec_llm.call — call-time resolved."""
+    try:
+        from codec_config import QWEN_BASE_URL
+        return QWEN_BASE_URL
+    except Exception:
+        return "http://localhost:8083/v1"
+
+
 QWEN_URL = _qwen_url()
 QWEN_MODEL = _qwen_model()
 QWEN_TIMEOUT = 60
@@ -306,28 +315,23 @@ def _qwen_chat(user_prompt: str, system_prompt: str = "",
 
     URL + model resolved at call time so config.json changes are picked
     up without a process restart."""
-    import requests
-    payload = {
-        "model": _qwen_model(),
-        "messages": [
-            {"role": "system", "content": system_prompt or ""},
-            {"role": "user",   "content": user_prompt},
-        ],
-        "max_tokens": max_tokens,
-        "temperature": 0.2,
-    }
-    try:
-        r = requests.post(_qwen_url(), json=payload, timeout=QWEN_TIMEOUT)
-    except requests.exceptions.ConnectionError as e:
-        raise QwenUnavailableError(f"qwen3.6 unreachable: {e}")
-    except requests.exceptions.Timeout:
-        raise QwenUnavailableError("qwen3.6 request timed out")
-    if r.status_code != 200:
-        raise QwenUnavailableError(f"qwen3.6 returned {r.status_code}")
+    # A-12 (PR-3E-2c): canonical codec_llm.call(raise_on_error=True). Adapter
+    # maps codec_llm.LLMError -> the public QwenUnavailableError so the daemon's
+    # retry/abort logic (except QwenUnavailableError) is unchanged. Kept parallel
+    # with codec_agent_plan._qwen_chat.
+    import codec_llm
     try:
-        return r.json()["choices"][0]["message"]["content"]
-    except (KeyError, json.JSONDecodeError) as e:
-        raise QwenUnavailableError(f"qwen3.6 returned malformed response: {e}")
+        return codec_llm.call(
+            [
+                {"role": "system", "content": system_prompt or ""},
+                {"role": "user",   "content": user_prompt},
+            ],
+            base_url=_qwen_base(), model=_qwen_model(),
+            max_tokens=max_tokens, temperature=0.2,
+            timeout=QWEN_TIMEOUT, raise_on_error=True,
+        )
+    except codec_llm.LLMError as e:
+        raise QwenUnavailableError(f"qwen3.6 unavailable: {e}") from e
 
 
 def _qwen_next_action(plan_dict: Dict[str, Any], checkpoint: Dict[str, Any],
diff --git a/codec_llm.py b/codec_llm.py
index d0c3a63..7f1f80c 100644
--- a/codec_llm.py
+++ b/codec_llm.py
@@ -27,6 +27,16 @@
 
 log = logging.getLogger("codec.llm")
 
+
+class LLMError(Exception):
+    """Raised by ``call(raise_on_error=True)`` on any non-success outcome —
+    non-200 (after retries), a request exception (after retries), or a 200 with
+    empty/unparseable content. The default ``raise_on_error=False`` keeps the
+    never-raise → "" contract that the streaming/best-effort callers rely on.
+    Fail-loud callers (agent_plan/runner, textassist, the regen script) opt in
+    and map this onto their own error handling."""
+
+
 _THINK_RE = re.compile(r"<think>.*?</think>", re.DOTALL)
 
 
@@ -98,13 +108,21 @@ def call(
     retries: int = 1,
     enable_thinking: bool = False,
     extra_kwargs: Optional[Dict[str, Any]] = None,
+    raise_on_error: bool = False,
 ) -> str:
     """POST `messages` to `<base_url>/chat/completions` and return the parsed,
-    `<think>`-stripped assistant text (or "" on failure).
+    `<think>`-stripped assistant text.
 
     `retries` includes the first attempt (retries=3 → up to 3 tries with
     exponential 2**n backoff between them, matching codec_session.qwen_call).
-    Never raises — network/parse errors are logged and yield "".
+
+    Error contract:
+    - `raise_on_error=False` (default): never raises — network/parse errors and
+      empty/unparseable 200s are logged and yield "".
+    - `raise_on_error=True`: raises `LLMError` on EVERY non-success outcome
+      (non-200 after retries, request exception after retries, or a 200 with
+      empty/unparseable content). For fail-loud callers that must not silently
+      proceed on an empty answer.
     """
     import requests
     headers, payload = _build_request(
@@ -115,6 +133,7 @@ def call(
 
     attempts = max(1, retries)
     url = base_url.rstrip("/") + "/chat/completions"
+    last_error: Optional[Exception] = None
     for attempt in range(attempts):
         try:
             r = requests.post(url, json=payload, headers=headers, timeout=timeout)
@@ -122,13 +141,21 @@ def call(
                 resp = extract_content(r.json())
                 if resp:
                     return resp
-                # 200 but empty/odd shape — don't retry, nothing more to get.
+                # 200 but empty/odd shape — nothing more to get; don't retry.
+                if raise_on_error:
+                    raise LLMError("LLM returned empty or unparseable content")
                 return ""
+            last_error = LLMError(f"LLM call returned {r.status_code}: {r.text[:200]}")
             log.warning("LLM call %s returned %s: %s", url, r.status_code, r.text[:200])
+        except LLMError:
+            raise  # empty-200 in raise mode — propagate, don't swallow as a retry
         except Exception as e:
+            last_error = e
             log.warning("LLM call attempt %d/%d failed: %s", attempt + 1, attempts, e)
             if attempt < attempts - 1:
                 time.sleep(2 ** attempt)
+    if raise_on_error:
+        raise LLMError(f"LLM call failed after {attempts} attempt(s): {last_error}")
     return ""
 
 
diff --git a/codec_textassist.py b/codec_textassist.py
index 5d116fe..f881373 100755
--- a/codec_textassist.py
+++ b/codec_textassist.py
@@ -24,15 +24,19 @@ def call_qwen(text, mode):
         "translate": "You are a translator. Translate the following text into English. No matter what language the input is — Ukrainian, Spanish, French, Russian, Chinese, Arabic, anything — always translate to English. Output ONLY the translated English text, nothing else.",
         "prompt": "You are a prompt engineer. Rewrite the following text to be a clear, optimized prompt for an AI language model. Make it specific, structured, and effective. Remove ambiguity, add context where helpful, and ensure the intent is crystal clear. Output ONLY the optimized prompt, nothing else."
     }
-    payload = {"model": model, "messages": [
-        {"role": "system", "content": prompts.get(mode, prompts["proofread"])},
-        {"role": "user", "content": text}
-    ], "max_tokens": 4000, "temperature": 0.3, "stream": False,
-    "chat_template_kwargs": {"enable_thinking": False}}
-    payload.update(kwargs)
-    r = requests.post(f"{base}/chat/completions", json=payload, timeout=60)
-    result = r.json()["choices"][0]["message"]["content"].strip()
-    result = re.sub(r'<think>[\s\S]*?</think>', '', result).strip()
+    # A-12 (PR-3E-2c): canonical codec_llm.call(raise_on_error=True). Fail-loud
+    # is required here — the caller's except shows an Error overlay; never-raise
+    # would paste an empty result over the user's selection. codec_llm strips
+    # <think>; the `### FINAL ANSWER:` marker is textassist-specific so it stays.
+    import codec_llm
+    result = codec_llm.call(
+        [
+            {"role": "system", "content": prompts.get(mode, prompts["proofread"])},
+            {"role": "user", "content": text},
+        ],
+        base_url=base, model=model, max_tokens=4000, temperature=0.3,
+        extra_kwargs=kwargs, timeout=60, raise_on_error=True,
+    )
     return re.sub(r'###\s*FINAL ANSWER:\s*', '', result).strip()
 
 def overlay(text, color, duration):
diff --git a/docs/PR3E2C-RAISE-MODE-DESIGN.md b/docs/PR3E2C-RAISE-MODE-DESIGN.md
new file mode 100644
index 0000000..df9b265
--- /dev/null
+++ b/docs/PR3E2C-RAISE-MODE-DESIGN.md
@@ -0,0 +1,71 @@
+# PR-3E-2c — A-12 tranche 2c: `codec_llm.call(raise_on_error=True)` + migrate the raise-on-failure sites (DESIGN)
+
+**Status:** IMPLEMENTED. Added `codec_llm.LLMError` + `codec_llm.call(raise_on_error=True)`; migrated all 4 sites (textassist, regen, agent_plan/_runner via a `QwenUnavailableError` adapter + new `_qwen_base()` helper). 14 new tests (`tests/test_llm_raise_mode.py`); 109 agent tests still green; full suite zero new failures; zero net-new ruff.
+**Finding:** A-12 (continuation). Tranche 2 deferred the 4 sites whose contract is *fail-loud* (they must NOT silently return empty). This adds the `raise_on_error` mode and migrates them.
+**Wave:** 3. Touches the autonomous-agent runtime → design-first + parity tests.
+
+---
+
+## 1. The 4 sites + why they were deferred
+
+All 4 **raise (or fail-loud) on LLM failure** — the opposite of `codec_llm.call`'s never-raise. Migrating them with the current never-raise → `""` would silently degrade:
+
+| Site | Current failure behavior | never-raise would… |
+|---|---|---|
+| `codec_textassist.call_qwen` | raises on non-200/malformed; caller's `except` shows an Error overlay | `pbcopy ""` + ⌘V → **paste empty over the user's selection** + "Text replaced!" |
+| `scripts/regen_skill_descriptions._llm` | `raise_for_status()` (fail-loud dev script) | silently **write empty descriptions** |
+| `codec_agent_plan._qwen_chat` | raises `QwenUnavailableError` on conn/timeout/non-200/malformed | swallow failure → plan drafting parses `""` |
+| `codec_agent_runner._qwen_chat` | identical to agent_plan (`QwenUnavailableError`) | swallow failure → daemon never retries / parses `""` |
+
+## 2. Contract — `raise_on_error: bool = False`
+
+Add to `codec_llm.call`. **Default False → existing behavior unchanged** (codec.py / qwen_call / compaction / dictate are untouched).
+
+When `True`: raise a new typed **`codec_llm.LLMError(Exception)`** whenever `call()` would otherwise return `""` — i.e. on **every** non-success path:
+- non-200 (after exhausting `retries`)
+- connection / timeout / other request exception (after exhausting `retries`)
+- malformed response JSON or shape (`extract_content` → "")
+- **200 but empty content** (no usable answer)
+
+Rationale for raising on empty-200 too: all 4 sites treat an empty/unusable answer as a failure (textassist would paste nothing, regen would write nothing, the agents would parse nothing). "Never silently returns empty" is exactly the contract they want — so empty-200 → raise is a *strict improvement*, not just parity.
+
+`retries` still applies before raising (the agents use the default `retries=1` = single attempt = raise immediately on first failure, matching their current single-POST shape).
+
+## 3. Per-site migration
+
+- **`textassist.call_qwen`** → `codec_llm.call(..., raise_on_error=True)`, keep the `### FINAL ANSWER:` strip at the call site (codec_llm already strips `<think>`). The caller's existing `try/except` (shows the Error overlay) now also fires on empty-200 — **fixing the destructive empty-paste**.
+- **`regen._llm`** → `codec_llm.call(..., raise_on_error=True).strip('"').strip()`. Fail-loud preserved (LLMError propagates like `raise_for_status` did); gains `<think>` strip.
+- **`agent_plan._qwen_chat` / `agent_runner._qwen_chat`** → thin adapter that **preserves the public `QwenUnavailableError`**:
+  ```python
+  import codec_llm
+  try:
+      return codec_llm.call(
+          [{"role": "system", "content": system_prompt or ""},
+           {"role": "user", "content": user_prompt}],
+          base_url=_qwen_url_base(), model=_qwen_model(),
+          max_tokens=max_tokens, temperature=0.2,
+          timeout=QWEN_TIMEOUT, raise_on_error=True,
+      )
+  except codec_llm.LLMError as e:
+      raise QwenUnavailableError(str(e)) from e
+  ```
+  (`_qwen_url()` returns the full `.../chat/completions`; codec_llm wants the base, so the adapter strips the suffix or uses the base helper — see implementation.)
+
+## 4. Behavior deltas (documented)
+
+1. **agent_plan/runner now strip `<think>` + send `enable_thinking=False` + have a `reasoning` fallback.** Their downstream parses the content as JSON — suppressing/stripping think makes that *more* robust (original passed raw content through). Improvement, not regression.
+2. **All 4: empty-200 now raises** (was: `""` → empty paste / empty desc / parse-`""`). Strict improvement — fail-loud is the intent.
+3. **agent_plan/runner: exception _message_ changes** (e.g. "qwen3.6 returned 500" → wrapped `LLMError` text) but the **TYPE `QwenUnavailableError` is preserved** via the adapter — the daemon's `except QwenUnavailableError` retry/abort logic is unaffected.
+4. **agent_plan/runner: no added retries** — `retries=1` (default) keeps the single-attempt shape; the daemon owns retry/backoff at a higher level.
+
+## 5. Test plan
+- `tests/test_llm_raise_mode.py`: `call(raise_on_error=True)` raises `LLMError` on non-200 / connection-exception / empty-200; `raise_on_error=False` (default) still returns `""` for all three (regression guard on the existing contract); `LLMError` is an `Exception` subclass.
+- Migration tests: monkeypatch `codec_llm.call` → assert `agent_plan._qwen_chat` / `agent_runner._qwen_chat` raise `QwenUnavailableError` (not `LLMError`) when codec_llm raises, and pass content through on success. Source-invariants: the 4 sites call `codec_llm.call(`, inline `requests.post(...chat/completions...)` / `raise_for_status` gone.
+- Full suite: expect the 23 known-baseline failures, **zero new**. `regen_skill_descriptions` is a script (no manifest); textassist is not a skill module → **no manifest regen**.
+
+## 6. Risk + rollback
+- **Blast radius:** `codec_llm.py` (+`raise_on_error`/`LLMError`) + 4 call sites. The agent runtime is the sensitive part — covered by parity tests asserting `QwenUnavailableError` is still what propagates.
+- **Rollback:** single-commit revert restores the inline impls. `raise_on_error` defaults False so no other caller can be affected.
+
+## 7. After 2c — remaining A-12
+Bridges (telegram/imessage), dashboard (4 non-stream + the `[SKILL:…]` stream tag-machine), voice `_stream_qwen` + agents (async `astream()` + queue at call site), skills tranche (translate/fact_extract/create_skill/skill_forge).
diff --git a/docs/audits/PHASE-1-CODE-QUALITY.md b/docs/audits/PHASE-1-CODE-QUALITY.md
index c17ad1a..a2e58f7 100644
--- a/docs/audits/PHASE-1-CODE-QUALITY.md
+++ b/docs/audits/PHASE-1-CODE-QUALITY.md
@@ -124,7 +124,9 @@ Both scan `SKILLS_DIR` independently, so a skill file is loaded twice in differe
 
 > **First tranche closed by PR-3E** (Option 2); remainder phased. The audit's premise that `codec_llm_proxy.py` already has a `call()`/`stream()` to reuse was **inaccurate** — that module is a priority *queue*, not an HTTP caller. PR-3E instead built the genuinely-new canonical **`codec_llm.py`** (`call()` non-streaming, plus `strip_think`/`extract_content`: headers, `Bearer` auth, `chat_template_kwargs.enable_thinking`, `<think>` strip, `choices/reasoning` parse, retry+backoff, never-raises). **Migrated by PR-3E (tranche 1):** `codec.py` voice-reply chat + `codec_session.qwen_call`. Pinned by `tests/test_llm_vision_dedup.py`. See `docs/PR3E-LLM-VISION-DEDUP-DESIGN.md` §3 (phased plan) + §8 (shipped).
 >
-> **Tranche 2 (PR-3E-2):** built the streaming keystone **`codec_llm.stream()`** (sync generator, yields raw deltas, never-raises) + extracted a shared `_build_request`; migrated `codec_session.qwen_stream`, `codec_compaction`, `codec_dictate`. Read-the-source moved `codec_textassist` + `scripts/regen_skill_descriptions` to **tranche 2c** (they have a raise-on-failure contract — never-raise would paste empty over the user's selection / write empty descriptions). Pinned by `tests/test_llm_stream.py`. See `docs/PR3E2-LLM-STREAM-TRANCHE2-DESIGN.md`. **Still pending (own PRs):** dashboard (4 non-stream + 1 stream tag-machine), voice `_stream_qwen` + agents (async + queue → need `astream()`), agent_plan/runner + textassist + regen (2c raise-mode), bridges telegram/imessage, self_improve/watcher, skills tranche (translate/fact_extract/create_skill/skill_forge).
+> **Tranche 2 (PR-3E-2):** built the streaming keystone **`codec_llm.stream()`** (sync generator, yields raw deltas, never-raises) + extracted a shared `_build_request`; migrated `codec_session.qwen_stream`, `codec_compaction`, `codec_dictate`. Read-the-source moved `codec_textassist` + `scripts/regen_skill_descriptions` to **tranche 2c** (they have a raise-on-failure contract — never-raise would paste empty over the user's selection / write empty descriptions). Pinned by `tests/test_llm_stream.py`. See `docs/PR3E2-LLM-STREAM-TRANCHE2-DESIGN.md`.
+>
+> **Tranche 2c (PR-3E-2c):** added `codec_llm.LLMError` + `codec_llm.call(raise_on_error=True)` (raises on every non-success — non-200/exception/empty-200; default stays never-raise). Migrated the 4 fail-loud sites: `codec_textassist` + `scripts/regen_skill_descriptions` (LLMError propagates — fixes the destructive empty-paste / empty-description), and `codec_agent_plan._qwen_chat` + `codec_agent_runner._qwen_chat` via a thin adapter that maps `LLMError` → their public `QwenUnavailableError` (+ a parallel `_qwen_base()` helper). Pinned by `tests/test_llm_raise_mode.py` (109 agent tests still green). See `docs/PR3E2C-RAISE-MODE-DESIGN.md`. **Still pending (own PRs):** dashboard (4 non-stream + 1 stream tag-machine), voice `_stream_qwen` + agents `Agent.run` (async + queue → need `astream()`), bridges telegram/imessage, self_improve/watcher, skills tranche (translate/fact_extract/create_skill/skill_forge).
 **Location:** Sample sites: `codec.py:702`, `codec_dashboard.py:980,1076,1215`, `codec_voice.py:180,196,208,213`, `codec_session.py:215,278,307`, `codec_agents.py:51`, `codec_agent_plan.py:239`, `codec_agent_runner.py:148`, `codec_compaction.py:78`, `codec_self_improve.py:238`, `codec_telegram.py:471,508`, `codec_imessage.py:341,391`, `codec_textassist.py:33`, `codec_dictate.py:492`, `codec_watcher.py:86,182`. Total: 51 occurrences via `grep -rn "chat/completions"`.
 **Impact:** Each site repeats: headers build, `Authorization: Bearer {api_key}` formatting, `{Content-Type: application/json}`, payload assembly with `chat_template_kwargs.enable_thinking=False`, `<think>` stripping, `try/except` for `r.json()` shape (`choices[0].message.content` or `.reasoning` fallback). Many also re-implement streaming SSE parsing. When the Qwen-3.6 upgrade landed, this is exactly the kind of change that needs to be applied in 20+ places.
 **Recommended fix:** Add `codec_llm_proxy.call(messages, **kwargs)` and `codec_llm_proxy.stream(messages, **kwargs)` as the single canonical API (the module already exists at `codec_llm_proxy.py`, only 130 LOC, only used by codec_voice + codec_agents). Migrate all 51 sites over the course of the Phase 1 hardening. As a first step, just covering the 5 sites in codec.py + codec_dashboard.py + codec_session.py would remove ~80% of the most-edited duplication.
diff --git a/docs/audits/PHASE-1-CONSOLIDATED-TRIAGE.md b/docs/audits/PHASE-1-CONSOLIDATED-TRIAGE.md
index f38d180..d8538af 100644
--- a/docs/audits/PHASE-1-CONSOLIDATED-TRIAGE.md
+++ b/docs/audits/PHASE-1-CONSOLIDATED-TRIAGE.md
@@ -244,6 +244,7 @@ Mirror the Intake Phase 3 wave pattern. 7 waves planned; sizes are PR-counts, NO
 - PR-3D: A-5 + A-6 + A-7 — extract helpers from the 3 monolithic functions (`_dispatch_inner`, `chat_completion`, `Agent.run`)
 - PR-3E: A-11 + A-12 — unify vision + `chat/completions` ✅ (branch `fix/pr3e-llm-vision-dedup`, design-first per §11 → `docs/PR3E-LLM-VISION-DEDUP-DESIGN.md`; **Option 2** chosen by Mickael). **A-11 fully closed**: new `codec_vision.py` (sync+async, Gemini→local fallback, live config); all 3 consumers (codec.py/voice/session) delegate; session gains a Gemini fallback it lacked. **A-12 first tranche**: discovered `codec_llm_proxy` is a *queue*, not an HTTP caller — built genuinely-new `codec_llm.py` (`call()` + `strip_think`/`extract_content`, retry, never-raises) and migrated codec.py voice-reply chat + `codec_session.qwen_call`. **Deferred to phased follow-ons**: `qwen_stream` SSE (needs `codec_llm.stream()`) + ~40 remaining sites (dashboard/voice/agents/bridges/misc), each its own tranche. 19 tests (`tests/test_llm_vision_dedup.py`); full suite zero new failures.
 - PR-3E-2: A-12 tranche 2 ✅ (branch `fix/pr3-a12-tranche2-stream`, design-first → `docs/PR3E2-LLM-STREAM-TRANCHE2-DESIGN.md`; **Option 1** chosen). Built streaming keystone `codec_llm.stream()` (sync generator, raw deltas, never-raises) + shared `_build_request`; migrated `codec_session.qwen_stream` (proof) + non-streaming trivials `codec_compaction` + `codec_dictate`. Read-the-source moved `codec_textassist` + `regen_skill_descriptions` to **2c** (raise-on-failure contract — never-raise would paste empty over the user's selection / write empty descriptions). 14 tests (`tests/test_llm_stream.py`); zero net-new ruff; full suite 1409 passing, zero new failures. **Remaining A-12 tranches:** 2c (raise-mode: textassist/regen/agent_plan/agent_runner), bridges (telegram/imessage), dashboard (non-stream + the stream tag-machine), voice `_stream_qwen` + agents (async `astream()` + queue), skills tranche.
+- PR-3E-2c: A-12 tranche 2c ✅ (branch `fix/pr3-a12-tranche2c-raise-mode`, design-first → `docs/PR3E2C-RAISE-MODE-DESIGN.md`). Added `codec_llm.LLMError` + `codec_llm.call(raise_on_error=True)` (raises on every non-success path; default unchanged never-raise). Migrated the 4 fail-loud sites: `codec_textassist` (was pasting empty over the user's selection on failure) + `scripts/regen_skill_descriptions` (LLMError propagates like the old `raise_for_status`), and `codec_agent_plan`/`codec_agent_runner` `_qwen_chat` via an adapter mapping `LLMError` → their public `QwenUnavailableError` (+ parallel `_qwen_base()`). 14 tests (`tests/test_llm_raise_mode.py`); 109 agent tests still green; zero net-new ruff; full suite 1423 passing, zero new failures. **Remaining A-12:** bridges (telegram/imessage), dashboard (4 non-stream + stream tag-machine), voice `_stream_qwen` + agents `Agent.run` (async `astream()` + queue), skills tranche.
 - PR-3F (optional, large): A-19 — bridge unification (iMessage + Telegram → `BridgeRouter`)
 - PR-3G: small misc ✅ (branch `fix/pr3g-small-misc-cleanup`) — closed A-9 (DISABLED overlay, ~90 LOC), A-10 (run_session_module, 33 LOC + orphan `import sys`), A-14 (close_session shadow import), A-18 (9 unused Pydantic models + dead typing import). A-13 (dashboard pattern blocker) verified **already closed by PR-2C**. 6 regression tests; zero net-new ruff (net −); full suite 1344 passing. **Deferred from this batch (each needs its own focused PR):** A-8 (codec_keyboard.py 398 LOC — verify-first delete-or-migrate decision), A-15 (config_version — additive migration feature touching `load_config`), A-20 (inline sqlite in the live dispatch path — reliability fix needing a CodecMemory method).
 - A-15: config schema versioning ✅ (branch `fix/pr3-a15-config-versioning`; `CONFIG_SCHEMA_VERSION=1` + migration ladder + idempotent atomic write-back in `load_config`; never creates-on-missing or overwrites-corrupt; 12 tests; zero net-new ruff; full suite 1356 passing).
diff --git a/scripts/regen_skill_descriptions.py b/scripts/regen_skill_descriptions.py
index d867aee..8d0b0ea 100644
--- a/scripts/regen_skill_descriptions.py
+++ b/scripts/regen_skill_descriptions.py
@@ -53,19 +53,15 @@
 
 
 def _llm(prompt: str) -> str:
-    import requests
-    r = requests.post(
-        f"{QWEN_BASE_URL}/chat/completions",
-        json={
-            "model": QWEN_MODEL,
-            "messages": [{"role": "user", "content": prompt}],
-            "temperature": 0.2,
-            "max_tokens": 80,
-        },
-        timeout=60,
-    )
-    r.raise_for_status()
-    return r.json()["choices"][0]["message"]["content"].strip().strip('"').strip()
+    # A-12 (PR-3E-2c): canonical codec_llm.call(raise_on_error=True). Fail-loud
+    # preserved (LLMError propagates like the old raise_for_status, and an empty
+    # 200 now raises instead of writing an empty description). Gains <think> strip.
+    import codec_llm
+    return codec_llm.call(
+        [{"role": "user", "content": prompt}],
+        base_url=QWEN_BASE_URL, model=QWEN_MODEL,
+        max_tokens=80, temperature=0.2, timeout=60, raise_on_error=True,
+    ).strip('"').strip()
 
 
 def _extract_meta(text: str) -> dict:
diff --git a/tests/test_llm_raise_mode.py b/tests/test_llm_raise_mode.py
new file mode 100644
index 0000000..555cd51
--- /dev/null
+++ b/tests/test_llm_raise_mode.py
@@ -0,0 +1,165 @@
+"""Tests for PR-3E-2c — A-12 tranche 2c: codec_llm.call(raise_on_error=True).
+
+- raise_on_error=True raises codec_llm.LLMError on EVERY non-success path
+  (non-200, request exception, empty/unparseable 200). Default False keeps the
+  existing never-raise -> "" contract (regression guard).
+- The 4 fail-loud sites migrate: codec_agent_plan/_runner._qwen_chat adapt
+  LLMError -> their public QwenUnavailableError; textassist + regen propagate.
+
+Reference: docs/PR3E2C-RAISE-MODE-DESIGN.md.
+"""
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+import pytest
+
+REPO = Path(__file__).resolve().parent.parent
+sys.path.insert(0, str(REPO))
+
+import codec_llm  # noqa: E402
+
+
+class _Resp:
+    def __init__(self, status, payload=None, text=""):
+        self.status_code = status
+        self._payload = payload or {}
+        self.text = text
+
+    def json(self):
+        return self._payload
+
+
+def _ok(content):
+    return {"choices": [{"message": {"content": content}}]}
+
+
+# ── codec_llm.LLMError + raise_on_error ───────────────────────────────────────
+
+
+def test_llmerror_is_exception():
+    assert issubclass(codec_llm.LLMError, Exception)
+
+
+def test_raise_mode_success_returns_content(monkeypatch):
+    import requests
+    monkeypatch.setattr(requests, "post", lambda *a, **k: _Resp(200, _ok("answer")))
+    out = codec_llm.call([{"role": "user", "content": "q"}], base_url="http://x/v1",
+                         model="m", raise_on_error=True)
+    assert out == "answer"
+
+
+def test_raise_mode_non_200_raises(monkeypatch):
+    import requests
+    monkeypatch.setattr(requests, "post", lambda *a, **k: _Resp(500, text="boom"))
+    monkeypatch.setattr(codec_llm.time, "sleep", lambda *_: None)
+    with pytest.raises(codec_llm.LLMError):
+        codec_llm.call([{"role": "user", "content": "q"}], base_url="http://x/v1",
+                       model="m", retries=2, raise_on_error=True)
+
+
+def test_raise_mode_exception_raises(monkeypatch):
+    def boom(*a, **k):
+        raise ConnectionError("down")
+
+    import requests
+    monkeypatch.setattr(requests, "post", boom)
+    monkeypatch.setattr(codec_llm.time, "sleep", lambda *_: None)
+    with pytest.raises(codec_llm.LLMError):
+        codec_llm.call([{"role": "user", "content": "q"}], base_url="http://x/v1",
+                       model="m", raise_on_error=True)
+
+
+def test_raise_mode_empty_200_raises(monkeypatch):
+    import requests
+    monkeypatch.setattr(requests, "post", lambda *a, **k: _Resp(200, _ok("")))
+    with pytest.raises(codec_llm.LLMError):
+        codec_llm.call([{"role": "user", "content": "q"}], base_url="http://x/v1",
+                       model="m", raise_on_error=True)
+
+
+def test_default_mode_never_raises(monkeypatch):
+    """Regression guard: raise_on_error defaults False -> "" on all failures."""
+    import requests
+    monkeypatch.setattr(codec_llm.time, "sleep", lambda *_: None)
+
+    monkeypatch.setattr(requests, "post", lambda *a, **k: _Resp(500, text="x"))
+    assert codec_llm.call([{"role": "user", "content": "q"}], base_url="http://x/v1",
+                          model="m", retries=2) == ""
+
+    monkeypatch.setattr(requests, "post", lambda *a, **k: _Resp(200, _ok("")))
+    assert codec_llm.call([{"role": "user", "content": "q"}], base_url="http://x/v1",
+                          model="m") == ""
+
+    def boom(*a, **k):
+        raise ConnectionError("down")
+    monkeypatch.setattr(requests, "post", boom)
+    assert codec_llm.call([{"role": "user", "content": "q"}], base_url="http://x/v1",
+                          model="m") == ""
+
+
+# ── agent_plan / agent_runner adapters (LLMError -> QwenUnavailableError) ──────
+
+
+def test_agent_plan_qwen_chat_success(monkeypatch):
+    import codec_agent_plan
+    monkeypatch.setattr(codec_llm, "call", lambda *a, **k: "plan-text")
+    assert codec_agent_plan._qwen_chat("hi", "sys") == "plan-text"
+
+
+def test_agent_plan_qwen_chat_adapts_to_qwen_unavailable(monkeypatch):
+    import codec_agent_plan
+
+    def raise_llm(*a, **k):
+        raise codec_llm.LLMError("boom-from-codec-llm")
+
+    monkeypatch.setattr(codec_llm, "call", raise_llm)
+    with pytest.raises(codec_agent_plan.QwenUnavailableError) as ei:
+        codec_agent_plan._qwen_chat("hi", "sys")
+    assert "boom-from-codec-llm" in str(ei.value)   # went through the adapter
+
+
+def test_agent_runner_qwen_chat_success(monkeypatch):
+    import codec_agent_runner
+    monkeypatch.setattr(codec_llm, "call", lambda *a, **k: "action-json")
+    assert codec_agent_runner._qwen_chat("hi", "sys") == "action-json"
+
+
+def test_agent_runner_qwen_chat_adapts_to_qwen_unavailable(monkeypatch):
+    import codec_agent_runner
+
+    def raise_llm(*a, **k):
+        raise codec_llm.LLMError("boom-runner")
+
+    monkeypatch.setattr(codec_llm, "call", raise_llm)
+    with pytest.raises(codec_agent_runner.QwenUnavailableError) as ei:
+        codec_agent_runner._qwen_chat("hi", "sys")
+    assert "boom-runner" in str(ei.value)
+
+
+# ── source-level migration invariants ─────────────────────────────────────────
+
+
+def test_textassist_uses_codec_llm():
+    src = (REPO / "codec_textassist.py").read_text()
+    assert "codec_llm.call(" in src
+    assert "/chat/completions" not in src       # inline POST gone
+
+
+def test_regen_uses_codec_llm():
+    src = (REPO / "scripts" / "regen_skill_descriptions.py").read_text()
+    assert "codec_llm.call(" in src
+    assert ".raise_for_status(" not in src      # fail-loud now via LLMError (call, not prose)
+
+
+def test_agent_plan_uses_codec_llm():
+    src = (REPO / "codec_agent_plan.py").read_text()
+    assert "codec_llm.call(" in src
+    assert "raise_on_error=True" in src
+
+
+def test_agent_runner_uses_codec_llm():
+    src = (REPO / "codec_agent_runner.py").read_text()
+    assert "codec_llm.call(" in src
+    assert "raise_on_error=True" in src