AVADSA25 · AVADSA25 · May 22, 2026 · May 22, 2026
diff --git a/AGENTS.md b/AGENTS.md
@@ -56,7 +56,7 @@ docs/                        API.md, MCP_HTTP_SETUP.md, CONTEXT_REPORT.md, desig
 
 Other engine modules (`codec_overlays`, `codec_metrics`, `codec_logging`, `codec_gdocs`, `codec_google_auth`, `codec_cdp`, `codec_llm_proxy`, `codec_retry`, `codec_alerts`, `codec_search`, `codec_textassist`, `codec_watcher`, `codec_watchdog`) are internal helpers — read them when you need them, but they're not part of the navigation surface for an agent making structural changes. (Keyboard handling — wake word, F13 toggle, F18 voice, double-tap — lives **inline in `codec.py`** in the `codec` PM2 process; the old standalone `codec_keyboard.py` was deleted as a dead duplicate per A-8.)
 
-**Canonical LLM + vision helpers (PR-3E, A-11/A-12).** `codec_vision.py` is the SINGLE source for screen-vision (`describe_sync` / `describe_async`, Gemini-flash → local-Qwen-VL fallback, config read live from `codec_config`) — used by `codec.py`, `codec_voice`, `codec_session`. `codec_llm.py` is the canonical chat/completions caller (`call()` + `strip_think`/`extract_content` — headers, Bearer auth, `enable_thinking`, `<think>` strip, `choices/reasoning` parse, retry+backoff, never-raises). NOTE: `codec_llm_proxy.py` is a priority *queue* (semaphore), NOT an HTTP caller — don't confuse the two. A-12 is migrating the ~45 inline `chat/completions` sites onto `codec_llm` in phased tranches. Done: `codec_llm.call()` (non-stream) + `stream()` (sync SSE generator, yields raw deltas, never-raises); migrated sites = codec.py voice-reply, `codec_session.qwen_call` + `qwen_stream`, `codec_compaction`, `codec_dictate`. Pending tranches: 2c raise-mode (`codec_llm.call(raise_on_error=True)` for agent_plan/runner + textassist + the regen script — they MUST fail loud, never-raise would silently paste empty / write empty), an async `astream()` for voice `_stream_qwen` + agents (queue stays at the call site — `codec_llm` never owns the semaphore), dashboard (4 non-stream + the `[SKILL:…]` stream tag-machine, which keeps its own parser and consumes only `stream()`'s raw tokens), bridges, and a skills tranche.
+**Canonical LLM + vision helpers (PR-3E, A-11/A-12).** `codec_vision.py` is the SINGLE source for screen-vision (`describe_sync` / `describe_async`, Gemini-flash → local-Qwen-VL fallback, config read live from `codec_config`) — used by `codec.py`, `codec_voice`, `codec_session`. `codec_llm.py` is the canonical chat/completions caller (`call()` + `strip_think`/`extract_content` — headers, Bearer auth, `enable_thinking`, `<think>` strip, `choices/reasoning` parse, retry+backoff, never-raises by default). NOTE: `codec_llm_proxy.py` is a priority *queue* (semaphore), NOT an HTTP caller — don't confuse the two. A-12 is migrating the ~45 inline `chat/completions` sites onto `codec_llm` in phased tranches. Done: `codec_llm.call()` (non-stream; opt-in `raise_on_error=True` raises `codec_llm.LLMError` for fail-loud callers) + `stream()` (sync SSE generator, yields raw deltas, never-raises); migrated sites = codec.py voice-reply, `codec_session.qwen_call` + `qwen_stream`, `codec_compaction`, `codec_dictate`, `codec_textassist`, the regen script, and `codec_agent_plan`/`codec_agent_runner` `_qwen_chat` (adapter maps `LLMError` → their public `QwenUnavailableError`). Pending tranches: an async `astream()` for voice `_stream_qwen` + agents `Agent.run` (queue stays at the call site — `codec_llm` never owns the semaphore), dashboard (4 non-stream + the `[SKILL:…]` stream tag-machine, which keeps its own parser and consumes only `stream()`'s raw tokens), bridges, and a skills tranche.
 
 ## 3. Agent + Crew runtime
 

diff --git a/codec_agent_plan.py b/codec_agent_plan.py
@@ -249,6 +249,15 @@ def _qwen_model() -> str:
         return "mlx-community/Qwen3.6-35B-A3B-4bit"
 
 
+def _qwen_base() -> str:
+    """Return the LLM base URL (no /chat/completions suffix) for codec_llm.call."""
+    try:
+        from codec_config import QWEN_BASE_URL  # import runs per call, not at module load
+        return QWEN_BASE_URL
+    except Exception:  # import failed (e.g. codec_config missing) — use local default
+        return "http://localhost:8083/v1"
+
+
 QWEN_URL = _qwen_url()       # back-compat — module-level constant for tests
 QWEN_MODEL = _qwen_model()   # back-compat
 QWEN_TIMEOUT = 60  # seconds
@@ -268,33 +277,27 @@ def _qwen_chat(user_prompt: str, system_prompt: str = "",
     assistant's content string. Raises QwenUnavailableError on
     network failure or non-2xx response.
 
-    URL + model resolved at call time via _qwen_url() / _qwen_model()
+    URL + model resolved at call time via _qwen_base() / _qwen_model()
     so they pick up ~/.codec/config.json:llm_base_url + :llm_model
     rather than the deploy-time hardcoded values."""
-    import requests  # lazy import — avoid forcing requests on test machines without it
-
-    payload = {
-        "model": _qwen_model(),
-        "messages": [
-            {"role": "system", "content": system_prompt or ""},
-            {"role": "user",   "content": user_prompt},
-        ],
-        "max_tokens": max_tokens,
-        "temperature": 0.2,
-    }
+    # A-12 (PR-3E-2c): canonical codec_llm.call(raise_on_error=True) replaces the
+    # inline POST + per-failure raises. The adapter maps codec_llm.LLMError onto
+    # the public QwenUnavailableError, so callers' `except QwenUnavailableError`
+    # is unchanged. (Now also strips <think> + enable_thinking=False — the
+    # downstream JSON parse is more robust for it.)
+    import codec_llm
     try:
-        r = requests.post(_qwen_url(), json=payload, timeout=QWEN_TIMEOUT)
-    except requests.exceptions.ConnectionError as e:
-        raise QwenUnavailableError(f"qwen3.6 unreachable: {e}")
-    except requests.exceptions.Timeout:
-        raise QwenUnavailableError("qwen3.6 request timed out")
-    if r.status_code != 200:
-        raise QwenUnavailableError(f"qwen3.6 returned {r.status_code}: {r.text[:200]}")
-    try:
-        data = r.json()
-        return data["choices"][0]["message"]["content"]
-    except (KeyError, json.JSONDecodeError) as e:
-        raise QwenUnavailableError(f"qwen3.6 returned malformed response: {e}")
+        return codec_llm.call(
+            [
+                {"role": "system", "content": system_prompt or ""},
+                {"role": "user",   "content": user_prompt},
+            ],
+            base_url=_qwen_base(), model=_qwen_model(),
+            max_tokens=max_tokens, temperature=0.2,
+            timeout=QWEN_TIMEOUT, raise_on_error=True,
+        )
+    except codec_llm.LLMError as e:
+        raise QwenUnavailableError(f"qwen3.6 unavailable: {e}") from e
 
 
 # ── Plan drafting ─────────────────────────────────────────────────────────────

diff --git a/codec_agent_runner.py b/codec_agent_runner.py
@@ -239,6 +239,15 @@ def _qwen_model() -> str:
         return "mlx-community/Qwen3.6-35B-A3B-4bit"
 
 
+def _qwen_base() -> str:
+    """Return the LLM base URL (no /chat/completions suffix) for codec_llm.call."""
+    try:
+        from codec_config import QWEN_BASE_URL  # import runs per call, not at module load
+        return QWEN_BASE_URL
+    except Exception:  # import failed (e.g. codec_config missing) — use local default
+        return "http://localhost:8083/v1"
+
+
 QWEN_URL = _qwen_url()
 QWEN_MODEL = _qwen_model()
 QWEN_TIMEOUT = 60
@@ -306,28 +315,23 @@ def _qwen_chat(user_prompt: str, system_prompt: str = "",
 
     URL + model resolved at call time so config.json changes are picked
     up without a process restart."""
-    import requests
-    payload = {
-        "model": _qwen_model(),
-        "messages": [
-            {"role": "system", "content": system_prompt or ""},
-            {"role": "user",   "content": user_prompt},
-        ],
-        "max_tokens": max_tokens,
-        "temperature": 0.2,
-    }
-    try:
-        r = requests.post(_qwen_url(), json=payload, timeout=QWEN_TIMEOUT)
-    except requests.exceptions.ConnectionError as e:
-        raise QwenUnavailableError(f"qwen3.6 unreachable: {e}")
-    except requests.exceptions.Timeout:
-        raise QwenUnavailableError("qwen3.6 request timed out")
-    if r.status_code != 200:
-        raise QwenUnavailableError(f"qwen3.6 returned {r.status_code}")
+    # A-12 (PR-3E-2c): canonical codec_llm.call(raise_on_error=True). Adapter
+    # maps codec_llm.LLMError -> the public QwenUnavailableError so the daemon's
+    # retry/abort logic (except QwenUnavailableError) is unchanged. Kept parallel
+    # with codec_agent_plan._qwen_chat.
+    import codec_llm
     try:
-        return r.json()["choices"][0]["message"]["content"]
-    except (KeyError, json.JSONDecodeError) as e:
-        raise QwenUnavailableError(f"qwen3.6 returned malformed response: {e}")
+        return codec_llm.call(
+            [
+                {"role": "system", "content": system_prompt or ""},
+                {"role": "user",   "content": user_prompt},
+            ],
+            base_url=_qwen_base(), model=_qwen_model(),
+            max_tokens=max_tokens, temperature=0.2,
+            timeout=QWEN_TIMEOUT, raise_on_error=True,
+        )
+    except codec_llm.LLMError as e:
+        raise QwenUnavailableError(f"qwen3.6 unavailable: {e}") from e
 
 
 def _qwen_next_action(plan_dict: Dict[str, Any], checkpoint: Dict[str, Any],

diff --git a/codec_llm.py b/codec_llm.py
@@ -27,6 +27,16 @@
 
 log = logging.getLogger("codec.llm")
 
+
+class LLMError(Exception):
+    """Failure raised by ``call(raise_on_error=True)`` on any non-success outcome.
+
+    That is: non-200 (after retries), a request exception (after retries), or a
+    200 with empty/unparseable content. The default ``raise_on_error=False`` keeps
+    the never-raise → "" contract that the streaming/best-effort callers rely on.
+    Fail-loud callers (agent_plan/runner, textassist, the regen script) opt in."""
+
+
 _THINK_RE = re.compile(r"<think>.*?</think>", re.DOTALL)
 
 
@@ -98,13 +108,21 @@ def call(
     retries: int = 1,
     enable_thinking: bool = False,
     extra_kwargs: Optional[Dict[str, Any]] = None,
+    raise_on_error: bool = False,
 ) -> str:
     """POST `messages` to `<base_url>/chat/completions` and return the parsed,
-    `<think>`-stripped assistant text (or "" on failure).
+    `<think>`-stripped assistant text.
 
     `retries` includes the first attempt (retries=3 → up to 3 tries with
     exponential 2**n backoff between them, matching codec_session.qwen_call).
-    Never raises — network/parse errors are logged and yield "".
+
+    Error contract:
+    - `raise_on_error=False` (default): never raises — network/parse errors and
+      empty/unparseable 200s are logged and yield "".
+    - `raise_on_error=True`: raises `LLMError` on EVERY non-success outcome
+      (non-200 after retries, request exception after retries, or a 200 with
+      empty/unparseable content). For fail-loud callers that must not silently
+      proceed on an empty answer.
     """
     import requests
     headers, payload = _build_request(
@@ -115,20 +133,29 @@ def call(
 
     attempts = max(1, retries)
     url = base_url.rstrip("/") + "/chat/completions"
+    last_error: Optional[Exception] = None
     for attempt in range(attempts):
         try:
             r = requests.post(url, json=payload, headers=headers, timeout=timeout)
             if r.status_code == 200:
                 resp = extract_content(r.json())
                 if resp:
                     return resp
-                # 200 but empty/odd shape — don't retry, nothing more to get.
+                # 200 but empty/odd shape — nothing more to get; don't retry.
+                if raise_on_error:
+                    raise LLMError("LLM returned empty or unparseable content")
                 return ""
+            last_error = LLMError(f"LLM call returned {r.status_code}: {r.text[:200]}")
             log.warning("LLM call %s returned %s: %s", url, r.status_code, r.text[:200])
+        except LLMError:
+            raise  # empty-200 in raise mode — propagate, don't swallow as a retry
         except Exception as e:
+            last_error = e
             log.warning("LLM call attempt %d/%d failed: %s", attempt + 1, attempts, e)
             if attempt < attempts - 1:
                 time.sleep(2 ** attempt)
+    if raise_on_error:
+        raise LLMError(f"LLM call failed after {attempts} attempt(s): {last_error}")
     return ""
 
 

diff --git a/codec_textassist.py b/codec_textassist.py
@@ -24,15 +24,19 @@ def call_qwen(text, mode):
         "translate": "You are a translator. Translate the following text into English. No matter what language the input is — Ukrainian, Spanish, French, Russian, Chinese, Arabic, anything — always translate to English. Output ONLY the translated English text, nothing else.",
         "prompt": "You are a prompt engineer. Rewrite the following text to be a clear, optimized prompt for an AI language model. Make it specific, structured, and effective. Remove ambiguity, add context where helpful, and ensure the intent is crystal clear. Output ONLY the optimized prompt, nothing else."
     }
-    payload = {"model": model, "messages": [
-        {"role": "system", "content": prompts.get(mode, prompts["proofread"])},
-        {"role": "user", "content": text}
-    ], "max_tokens": 4000, "temperature": 0.3, "stream": False,
-    "chat_template_kwargs": {"enable_thinking": False}}
-    payload.update(kwargs)
-    r = requests.post(f"{base}/chat/completions", json=payload, timeout=60)
-    result = r.json()["choices"][0]["message"]["content"].strip()
-    result = re.sub(r'<think>[\s\S]*?</think>', '', result).strip()
+    # A-12 (PR-3E-2c): canonical codec_llm.call(raise_on_error=True). Fail-loud
+    # is required here — the caller's except shows an Error overlay; never-raise
+    # would paste an empty result over the user's selection. codec_llm strips
+    # <think>; the `### FINAL ANSWER:` marker is textassist-specific so it stays.
+    import codec_llm
+    result = codec_llm.call(
+        [
+            {"role": "system", "content": prompts.get(mode, prompts["proofread"])},
+            {"role": "user", "content": text},
+        ],
+        base_url=base, model=model, max_tokens=4000, temperature=0.3,
+        extra_kwargs=kwargs, timeout=60, raise_on_error=True,
+    )
     return re.sub(r'###\s*FINAL ANSWER:\s*', '', result).strip()
 
 def overlay(text, color, duration):