DataHackIL · shaypal5 · May 3, 2026 · May 3, 2026 · May 3, 2026
diff --git a/synthbanshee/tts/ssml_builder.py b/synthbanshee/tts/ssml_builder.py
@@ -15,11 +15,15 @@
 
 from __future__ import annotations
 
+import logging
+import re
 import xml.etree.ElementTree as ET
 from dataclasses import dataclass, field
 
 from synthbanshee.tts.ssml_types import PhraseProsody
 
+_log = logging.getLogger(__name__)
+
 _AZURE_XMLNS = "http://www.w3.org/2001/10/synthesis"
 _MSTTS_XMLNS = "http://www.w3.org/2001/mstts"
 _SPEAK_LANG = "he-IL"
@@ -30,6 +34,28 @@
 # output — it may need per-provider tuning if engines respond differently.
 _WORD_BREAK_MS = 50
 
+# Azure prosody attribute limits (documented ranges).
+_AZURE_RATE_MIN_PCT = -50  # rate="-50%" → 0.5x
+_AZURE_RATE_MAX_PCT = 200  # rate="+200%" → 3.0x
+_AZURE_PITCH_MIN_PCT = -50
+_AZURE_PITCH_MAX_PCT = 50
+_AZURE_VOLUME_MIN_PCT = -50
+_AZURE_VOLUME_MAX_PCT = 50
+
+# Characters invalid in XML 1.0: U+0000–U+0008, U+000B, U+000C, U+000E–U+001F.
+# These must be stripped before embedding text in SSML.
+_XML_INVALID_CHARS_RE = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]")
+
+
+def _sanitize_text(text: str) -> str:
+    """Remove characters that are invalid in XML 1.0 from *text*.
+
+    Defense-in-depth: ideally invalid chars should be rejected at the LLM
+    response parsing boundary (script/generator.py).  This guard ensures the
+    SSML builder never produces unparseable XML regardless of upstream bugs.
+    """
+    return _XML_INVALID_CHARS_RE.sub("", text)
+
 
 def _inject_word_breaks(
     parent: ET.Element,
@@ -113,23 +139,44 @@ class UtteranceSpec:
 
 
 def _semitones_to_percent(st: float) -> str:
-    """Convert a semitone shift to the Azure pitch % format (e.g. '+5%' / '-10%')."""
+    """Convert a semitone shift to the Azure pitch % format (e.g. '+5%' / '-10%').
+
+    Values are clamped to Azure's documented ±50% range.  A warning is logged
+    when clamping activates — this indicates a speaker config or state-drift bug.
+    """
     # Approximation: 1 semitone ≈ 5.946% pitch change
     pct = round(st * 5.946)
-    return f"+{pct}%" if pct >= 0 else f"{pct}%"
+    clamped = max(_AZURE_PITCH_MIN_PCT, min(_AZURE_PITCH_MAX_PCT, pct))
+    if clamped != pct:
+        _log.warning("Pitch %+d%% exceeds Azure range; clamped to %+d%%", pct, clamped)
+    return f"+{clamped}%" if clamped >= 0 else f"{clamped}%"
 
 
 def _rate_to_string(rate: float) -> str:
-    """Format a rate multiplier as a percentage string for <prosody rate=...>."""
+    """Format a rate multiplier as a percentage string for <prosody rate=...>.
+
+    Values are clamped to Azure's documented -50% to +200% range.  A warning is
+    logged when clamping activates.
+    """
     pct = round((rate - 1.0) * 100)
-    return f"+{pct}%" if pct >= 0 else f"{pct}%"
+    clamped = max(_AZURE_RATE_MIN_PCT, min(_AZURE_RATE_MAX_PCT, pct))
+    if clamped != pct:
+        _log.warning("Rate %+d%% exceeds Azure range; clamped to %+d%%", pct, clamped)
+    return f"+{clamped}%" if clamped >= 0 else f"{clamped}%"
 
 
 def _volume_to_string(db: float) -> str:
-    """Format a dB volume offset as a percentage for <prosody volume=...>."""
+    """Format a dB volume offset as a percentage for <prosody volume=...>.
+
+    Values are clamped to Azure's documented ±50% range.  A warning is logged
+    when clamping activates.
+    """
     # Azure volume is 0–100; default is 100. Map dB linearly (rough).
     pct = round(db * 1.0)
-    return f"+{pct}%" if pct >= 0 else f"{pct}%"
+    clamped = max(_AZURE_VOLUME_MIN_PCT, min(_AZURE_VOLUME_MAX_PCT, pct))
+    if clamped != pct:
+        _log.warning("Volume %+d%% exceeds Azure range; clamped to %+d%%", pct, clamped)
+    return f"+{clamped}%" if clamped >= 0 else f"{clamped}%"
 
 
 def _apply_phrase_prosody(
@@ -180,9 +227,23 @@ def _append_break(ms: int) -> ET.Element:
         if phrase.char_start > cursor:
             _append_text(text[cursor : phrase.char_start])
 
-        # Optional break before the phrase.
+        # Optional break before the phrase.  Merge with the preceding
+        # <break> element (if any) to avoid adjacent breaks that Azure's SSML
+        # parser rejects with error 0x80045003 (#67).
         if phrase.break_before_ms > 0:
-            _append_break(phrase.break_before_ms)
+            if prev is not None and prev.tag == "break":
+                prev_time = prev.attrib.get("time", "0ms")
+                prev_ms = int(prev_time.replace("ms", ""))
+                if prev_ms == _WORD_BREAK_MS:
+                    # Word-boundary break: replace with the phrase break
+                    # (the phrase break subsumes the word-boundary intent).
+                    prev.attrib["time"] = f"{phrase.break_before_ms}ms"
+                else:
+                    # Semantic break (e.g. break_after from prior phrase):
+                    # sum durations to preserve both intents.
+                    prev.attrib["time"] = f"{prev_ms + phrase.break_before_ms}ms"
+            else:
+                _append_break(phrase.break_before_ms)
 
         # Phrase-level prosody wrapper (omitted when only breaks are requested).
         phrase_attrs: dict[str, str] = {}
@@ -256,6 +317,8 @@ def build_multi(
         speak = ET.Element("speak", attrib=speak_attribs)
 
         for utt in utterances:
+            # Sanitize text: strip characters invalid in XML 1.0 (#67).
+            utt_text = _sanitize_text(utt.text)
             voice = ET.SubElement(speak, "voice", attrib={"name": utt.voice_id})
 
             # Add express-as only when a non-default style is requested AND
@@ -285,9 +348,9 @@ def build_multi(
                 inner = parent
 
             if utt.phrase_prosody:
-                _apply_phrase_prosody(inner, utt.text, utt.phrase_prosody)
+                _apply_phrase_prosody(inner, utt_text, utt.phrase_prosody)
             else:
-                _inject_word_breaks(inner, utt.text)
+                _inject_word_breaks(inner, utt_text)
 
         raw = ET.tostring(speak, encoding="unicode", xml_declaration=False)
         return '<?xml version="1.0" encoding="UTF-8"?>\n' + raw