From 99eeb84fde3840c4e4e2cf6ffe1f74eeb6d12cdc Mon Sep 17 00:00:00 2001
From: Shay Palachy <shaypal5@users.noreply.github.com>
Date: Tue, 5 May 2026 23:29:29 +0300
Subject: [PATCH 1/3] Revert "fix(tts): Azure SSML parsing error on adjacent
 break elements (#67) (#71)"

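This reverts commit 0bdb217fc5e83a11a9c4161c0178bbb4f8c3ebc2.

Besides the adjacent-<break> merging named in the subject, the reverted
commit also added XML 1.0 control-character stripping (_sanitize_text) and
clamping of the prosody rate/pitch/volume percentages to the Azure limits
defined in ssml_builder.py. All three are removed here, together with the
TestSSMLParseErrorFix67 regression tests.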
---
 synthbanshee/tts/ssml_builder.py |  83 ++-------
 tests/unit/test_tts.py           | 290 -------------------------------
 2 files changed, 10 insertions(+), 363 deletions(-)
diff --git a/synthbanshee/tts/ssml_builder.py b/synthbanshee/tts/ssml_builder.py
index 57de60d..ed07039 100644
--- a/synthbanshee/tts/ssml_builder.py
+++ b/synthbanshee/tts/ssml_builder.py
@@ -15,15 +15,11 @@
 
 from __future__ import annotations
 
-import logging
-import re
 import xml.etree.ElementTree as ET
 from dataclasses import dataclass, field
 
 from synthbanshee.tts.ssml_types import PhraseProsody
 
-_log = logging.getLogger(__name__)
-
 _AZURE_XMLNS = "http://www.w3.org/2001/10/synthesis"
 _MSTTS_XMLNS = "http://www.w3.org/2001/mstts"
 _SPEAK_LANG = "he-IL"
@@ -34,28 +30,6 @@
 # output — it may need per-provider tuning if engines respond differently.
 _WORD_BREAK_MS = 50
 
-# Azure prosody attribute limits (documented ranges).
-_AZURE_RATE_MIN_PCT = -50  # rate="-50%" → 0.5x
-_AZURE_RATE_MAX_PCT = 200  # rate="+200%" → 3.0x
-_AZURE_PITCH_MIN_PCT = -50
-_AZURE_PITCH_MAX_PCT = 50
-_AZURE_VOLUME_MIN_PCT = -50
-_AZURE_VOLUME_MAX_PCT = 50
-
-# Characters invalid in XML 1.0: U+0000–U+0008, U+000B, U+000C, U+000E–U+001F.
-# These must be stripped before embedding text in SSML.
-_XML_INVALID_CHARS_RE = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]")
-
-
-def _sanitize_text(text: str) -> str:
-    """Remove characters that are invalid in XML 1.0 from *text*.
-
-    Defense-in-depth: ideally invalid chars should be rejected at the LLM
-    response parsing boundary (script/generator.py).  This guard ensures the
-    SSML builder never produces unparseable XML regardless of upstream bugs.
-    """
-    return _XML_INVALID_CHARS_RE.sub("", text)
-
 
 def _inject_word_breaks(
     parent: ET.Element,
@@ -139,44 +113,23 @@ class UtteranceSpec:
 
 
 def _semitones_to_percent(st: float) -> str:
-    """Convert a semitone shift to the Azure pitch % format (e.g. '+5%' / '-10%').
-
-    Values are clamped to Azure's documented ±50% range.  A warning is logged
-    when clamping activates — this indicates a speaker config or state-drift bug.
-    """
+    """Convert a semitone shift to the Azure pitch % format (e.g. '+5%' / '-10%')."""
     # Approximation: 1 semitone ≈ 5.946% pitch change
     pct = round(st * 5.946)
-    clamped = max(_AZURE_PITCH_MIN_PCT, min(_AZURE_PITCH_MAX_PCT, pct))
-    if clamped != pct:
-        _log.warning("Pitch %+d%% exceeds Azure range; clamped to %+d%%", pct, clamped)
-    return f"+{clamped}%" if clamped >= 0 else f"{clamped}%"
+    return f"+{pct}%" if pct >= 0 else f"{pct}%"
 
 
 def _rate_to_string(rate: float) -> str:
-    """Format a rate multiplier as a percentage string for <prosody rate=...>.
-
-    Values are clamped to Azure's documented -50% to +200% range.  A warning is
-    logged when clamping activates.
-    """
+    """Format a rate multiplier as a percentage string for <prosody rate=...>."""
     pct = round((rate - 1.0) * 100)
-    clamped = max(_AZURE_RATE_MIN_PCT, min(_AZURE_RATE_MAX_PCT, pct))
-    if clamped != pct:
-        _log.warning("Rate %+d%% exceeds Azure range; clamped to %+d%%", pct, clamped)
-    return f"+{clamped}%" if clamped >= 0 else f"{clamped}%"
+    return f"+{pct}%" if pct >= 0 else f"{pct}%"
 
 
 def _volume_to_string(db: float) -> str:
-    """Format a dB volume offset as a percentage for <prosody volume=...>.
-
-    Values are clamped to Azure's documented ±50% range.  A warning is logged
-    when clamping activates.
-    """
+    """Format a dB volume offset as a percentage for <prosody volume=...>."""
     # Azure volume is 0–100; default is 100. Map dB linearly (rough).
     pct = round(db * 1.0)
-    clamped = max(_AZURE_VOLUME_MIN_PCT, min(_AZURE_VOLUME_MAX_PCT, pct))
-    if clamped != pct:
-        _log.warning("Volume %+d%% exceeds Azure range; clamped to %+d%%", pct, clamped)
-    return f"+{clamped}%" if clamped >= 0 else f"{clamped}%"
+    return f"+{pct}%" if pct >= 0 else f"{pct}%"
 
 
 def _apply_phrase_prosody(
@@ -227,23 +180,9 @@ def _append_break(ms: int) -> ET.Element:
         if phrase.char_start > cursor:
             _append_text(text[cursor : phrase.char_start])
 
-        # Optional break before the phrase.  Merge with the preceding
-        # <break> element (if any) to avoid adjacent breaks that Azure's SSML
-        # parser rejects with error 0x80045003 (#67).
+        # Optional break before the phrase.
         if phrase.break_before_ms > 0:
-            if prev is not None and prev.tag == "break":
-                prev_time = prev.attrib.get("time", "0ms")
-                prev_ms = int(prev_time.replace("ms", ""))
-                if prev_ms == _WORD_BREAK_MS:
-                    # Word-boundary break: replace with the phrase break
-                    # (the phrase break subsumes the word-boundary intent).
-                    prev.attrib["time"] = f"{phrase.break_before_ms}ms"
-                else:
-                    # Semantic break (e.g. break_after from prior phrase):
-                    # sum durations to preserve both intents.
-                    prev.attrib["time"] = f"{prev_ms + phrase.break_before_ms}ms"
-            else:
-                _append_break(phrase.break_before_ms)
+            _append_break(phrase.break_before_ms)
 
         # Phrase-level prosody wrapper (omitted when only breaks are requested).
         phrase_attrs: dict[str, str] = {}
@@ -317,8 +256,6 @@ def build_multi(
         speak = ET.Element("speak", attrib=speak_attribs)
 
         for utt in utterances:
-            # Sanitize text: strip characters invalid in XML 1.0 (#67).
-            utt_text = _sanitize_text(utt.text)
             voice = ET.SubElement(speak, "voice", attrib={"name": utt.voice_id})
 
             # Add express-as only when a non-default style is requested AND
@@ -348,9 +285,9 @@ def build_multi(
                 inner = parent
 
             if utt.phrase_prosody:
-                _apply_phrase_prosody(inner, utt_text, utt.phrase_prosody)
+                _apply_phrase_prosody(inner, utt.text, utt.phrase_prosody)
             else:
-                _inject_word_breaks(inner, utt_text)
+                _inject_word_breaks(inner, utt.text)
 
         raw = ET.tostring(speak, encoding="unicode", xml_declaration=False)
         return '<?xml version="1.0" encoding="UTF-8"?>\n' + raw
diff --git a/tests/unit/test_tts.py b/tests/unit/test_tts.py
index f74cad8..29c11c7 100644
--- a/tests/unit/test_tts.py
+++ b/tests/unit/test_tts.py
@@ -330,296 +330,6 @@ def test_inject_word_breaks_whitespace_only(self):
         assert parent.text == "  "
 
 
-# ---------------------------------------------------------------------------
-# SSML parse-error regression tests (#67)
-# ---------------------------------------------------------------------------
-
-
-class TestSSMLParseErrorFix67:
-    """Regression tests for Azure SSML parsing error 0x80045003.
-
-    The root cause is adjacent <break> elements created when inter-word breaks
-    (PR #70) interact with phrase prosody break_before/break_after attributes.
-    Azure's SSML parser rejects adjacent breaks as malformed.
-    """
-
-    def setup_method(self):
-        self.builder = SSMLBuilder()
-
-    def _body(self, ssml: str) -> str:
-        return ssml.split("\n", 1)[1] if ssml.startswith("<?xml") else ssml
-
-    def test_no_adjacent_breaks_with_break_before(self):
-        """Phrase break_before must merge with preceding inter-word break."""
-        import re
-
-        from synthbanshee.tts.ssml_types import PhraseProsody
-
-        # "word1 word2 phrase1 phrase2" — phrase starts at word boundary
-        text = "word1 word2 phrase1 phrase2"
-        phrase = PhraseProsody(
-            phrase_id="p0",
-            char_start=12,  # "phrase1 phrase2"
-            char_end=27,
-            rate="-25%",
-            pitch="-1st",
-            break_before_ms=300,  # "menace" hint
-        )
-        utt = UtteranceSpec(
-            text=text,
-            voice_id="he-IL-AvriNeural",
-            phrase_prosody=[phrase],
-        )
-        ssml = self.builder.build_single(utt, supports_style_tags=False)
-
-        # No two <break .../> elements should be adjacent (only whitespace between).
-        adjacent_breaks = re.findall(r"<break[^/]*/>\s*<break", ssml)
-        assert len(adjacent_breaks) == 0, f"Adjacent breaks found in SSML: {ssml}"
-
-    def test_merged_break_uses_max_duration(self):
-        """When merging breaks, the longer duration wins."""
-        from synthbanshee.tts.ssml_types import PhraseProsody
-
-        text = "before phrase_word"
-        phrase = PhraseProsody(
-            phrase_id="p0",
-            char_start=7,  # "phrase_word"
-            char_end=18,
-            rate="-20%",
-            break_before_ms=150,  # "slow" hint — longer than 50ms word break
-        )
-        utt = UtteranceSpec(
-            text=text,
-            voice_id="he-IL-AvriNeural",
-            phrase_prosody=[phrase],
-        )
-        ssml = self.builder.build_single(utt, supports_style_tags=False)
-
-        # The inter-word break (50ms) between "before" and phrase should be
-        # merged to 150ms (the phrase break_before duration wins).
-        assert 'time="150ms"' in ssml
-        # The original 50ms word break should NOT appear as a separate element.
-        assert ssml.count('time="50ms"') == 0
-
-    def test_menace_hint_no_adjacent_breaks(self):
-        """Menace hint (break_before=300ms) must not create adjacent breaks."""
-        import re
-
-        from synthbanshee.tts.ssml_types import PhraseProsody
-
-        # Simulate a failing pattern: multi-word text with menace-annotated phrase.
-        text = "word1 word2 word3 threat_word1 threat_word2"
-        phrase = PhraseProsody(
-            phrase_id="p0",
-            char_start=18,
-            char_end=44,
-            rate="-25%",
-            pitch="-1st",
-            break_before_ms=300,
-        )
-        utt = UtteranceSpec(
-            text=text,
-            voice_id="he-IL-AvriNeural",
-            rate_multiplier=1.14,
-            pitch_delta_st=2.0,
-            volume_delta_db=13.0,
-            phrase_prosody=[phrase],
-        )
-        ssml = self.builder.build_single(utt, supports_style_tags=False)
-
-        # Must be valid XML.
-        import xml.etree.ElementTree as ET
-
-        ET.fromstring(self._body(ssml))
-
-        # No adjacent breaks.
-        adjacent_breaks = re.findall(r"<break[^/]*/>\s*<break", ssml)
-        assert len(adjacent_breaks) == 0, f"Adjacent breaks found in SSML: {ssml}"
-
-        # The 300ms menace break should appear (replaced the 50ms word break).
-        assert 'time="300ms"' in ssml
-
-    def test_semantic_break_after_then_break_before_sums(self):
-        """break_after + break_before of consecutive phrases must sum durations."""
-        import re
-
-        from synthbanshee.tts.ssml_types import PhraseProsody
-
-        # Two phrases with break_after on first and break_before on second.
-        # Space between them is " " (no words → no word breaks created).
-        text = "intro phrase_a end_a mid phrase_b outro"
-        phrases = [
-            PhraseProsody(
-                phrase_id="p0",
-                char_start=6,
-                char_end=20,  # "phrase_a end_a"
-                rate="-20%",
-                break_after_ms=250,
-            ),
-            PhraseProsody(
-                phrase_id="p1",
-                char_start=25,
-                char_end=33,  # "phrase_b"
-                rate="-25%",
-                break_before_ms=300,
-            ),
-        ]
-        utt = UtteranceSpec(
-            text=text,
-            voice_id="he-IL-AvriNeural",
-            phrase_prosody=phrases,
-        )
-        ssml = self.builder.build_single(utt, supports_style_tags=False)
-
-        # No adjacent breaks.
-        adjacent_breaks = re.findall(r"<break[^/]*/>\s*<break", ssml)
-        assert len(adjacent_breaks) == 0, f"Adjacent breaks found: {ssml}"
-
-        # The 250ms break_after and 300ms break_before should be summed (550ms)
-        # because the preceding break is semantic (not a word-boundary break).
-        assert 'time="550ms"' in ssml
-
-    def test_text_with_invalid_xml_chars_sanitized(self):
-        """Characters invalid in XML 1.0 must be stripped before SSML building."""
-        # Simulate LLM output with control characters.
-        text = "hello\x00world\x0bfoo\x1fbar"
-        utt = UtteranceSpec(text=text, voice_id="he-IL-AvriNeural")
-        ssml = self.builder.build_single(utt, supports_style_tags=False)
-        # Must not contain invalid chars.
-        assert "\x00" not in ssml
-        assert "\x0b" not in ssml
-        assert "\x1f" not in ssml
-        # Words must still be present.
-        assert "hello" in ssml
-        assert "world" in ssml
-        assert "foo" in ssml
-        assert "bar" in ssml
-
-    def test_hebrew_high_intensity_with_menace_hint(self):
-        """Regression: Hebrew I5 turn with menace hint must produce valid SSML.
-
-        Simulates the exact pattern from failing scene sp_sv_a_0001: AGG speaker
-        at intensity 5 with accumulated state drift, multi-word Hebrew text,
-        and a "menace" phrase hint with break_before=300ms.
-        """
-        import re
-        import xml.etree.ElementTree as ET
-
-        from synthbanshee.tts.ssml_types import PhraseProsody
-
-        # Hebrew text simulating an I5 assault-scene turn (3 sentences).
-        # Contains niqqud on one word to test PR #69 interaction.
-        text = (
-            "\u05d0\u05ea \u05dc\u05d0 \u05ea\u05e2\u05e9\u05d4"  # "you don't do"
-            " \u05de\u05d4 \u05e9\u05d0\u05de\u05e8\u05ea\u05d9"  # " what I said"
-            " \u05dc\u05da. "  # " to you. "
-            "\u05ea\u05b4\u05e9\u05b0\u05de\u05e2\u05d9"  # "listen" (with niqqud)
-            " \u05d8\u05d5\u05d1!"  # " well!"
-        )
-        # "menace" hint on the imperative word (with niqqud): char_start/end
-        # covering the niqqud-bearing word.
-        imperative_start = text.index("\u05ea\u05b4\u05e9\u05b0\u05de\u05e2\u05d9")
-        imperative_end = imperative_start + len("\u05ea\u05b4\u05e9\u05b0\u05de\u05e2\u05d9")
-        phrase = PhraseProsody(
-            phrase_id="t5_p0",
-            char_start=imperative_start,
-            char_end=imperative_end,
-            rate="-25%",
-            pitch="-1st",
-            break_before_ms=300,
-        )
-        # AGG_M_30-45_001 at I5 with state drift: rate=1.14*1.05*~1.15=1.38,
-        # pitch=2+~1.5=3.5 st, volume=13+0+~4=17 dB.
-        utt = UtteranceSpec(
-            text=text,
-            voice_id="he-IL-AvriNeural",
-            rate_multiplier=1.38,
-            pitch_delta_st=3.5,
-            volume_delta_db=17.0,
-            phrase_prosody=[phrase],
-        )
-        ssml = self.builder.build_single(utt, supports_style_tags=False)
-
-        # Must be valid XML.
-        ET.fromstring(self._body(ssml))
-
-        # No adjacent breaks.
-        adjacent_breaks = re.findall(r"<break[^/]*/>\s*<break", ssml)
-        assert len(adjacent_breaks) == 0, f"Adjacent breaks in Hebrew SSML: {ssml}"
-
-        # Hebrew text must survive (spot-check key words).
-        assert "\u05ea\u05e2\u05e9\u05d4" in ssml  # "do"
-        assert "\u05ea\u05b4\u05e9\u05b0\u05de\u05e2\u05d9" in ssml  # "listen" with niqqud
-        assert "\u05d8\u05d5\u05d1" in ssml  # "well"
-
-        # The 300ms menace break must appear (merged with preceding word break).
-        assert 'time="300ms"' in ssml
-
-    def test_prosody_pitch_clamped_to_azure_range(self):
-        """Extreme pitch values must be clamped to ±50%."""
-        # 12 semitones → 71% unclamped, should be clamped to 50%
-        assert _semitones_to_percent(12.0) == "+50%"
-        assert _semitones_to_percent(-12.0) == "-50%"
-        # Moderate values remain unchanged
-        assert _semitones_to_percent(3.0) == "+18%"
-        assert _semitones_to_percent(-2.0) == "-12%"
-
-    def test_prosody_rate_clamped_to_azure_range(self):
-        """Extreme rate values must be clamped to Azure's -50% to +200% range."""
-        # rate=4.0 → +300% unclamped, should be clamped to +200%
-        assert _rate_to_string(4.0) == "+200%"
-        # rate=0.3 → -70% unclamped, should be clamped to -50%
-        assert _rate_to_string(0.3) == "-50%"
-        # Normal values pass through
-        assert _rate_to_string(1.1) == "+10%"
-
-    def test_well_formed_ssml_with_phrase_prosody_and_breaks(self):
-        """Full integration: high-intensity turn with phrase prosody must be valid XML."""
-        import xml.etree.ElementTree as ET
-
-        from synthbanshee.tts.ssml_types import PhraseProsody
-
-        # Simulate a high-intensity assault-scene turn with multiple hints.
-        text = "word1 word2 word3 word4 phrase_a1 phrase_a2 word5 word6 phrase_b1 word7"
-        # Offsets: phrase_a1 phrase_a2 = 24:43, phrase_b1 = 56:65
-        phrases = [
-            PhraseProsody(
-                phrase_id="p0",
-                char_start=24,
-                char_end=43,
-                rate="+15%",
-                volume="+3dB",
-                pitch="+1st",
-                break_before_ms=0,  # "stress" — no break_before
-            ),
-            PhraseProsody(
-                phrase_id="p1",
-                char_start=56,
-                char_end=65,
-                rate="-25%",
-                pitch="-1st",
-                break_before_ms=300,  # "menace" — has break_before
-            ),
-        ]
-        utt = UtteranceSpec(
-            text=text,
-            voice_id="he-IL-AvriNeural",
-            rate_multiplier=1.14,
-            pitch_delta_st=3.5,
-            volume_delta_db=17.0,
-            phrase_prosody=phrases,
-        )
-        ssml = self.builder.build_single(utt, supports_style_tags=False)
-        body = self._body(ssml)
-
-        # Must parse as valid XML.
-        ET.fromstring(body)
-
-        # All words must be present in the output.
-        for word in text.split():
-            assert word in ssml
-
-
 # ---------------------------------------------------------------------------
 # AzureProvider tests (mocked)
 # ---------------------------------------------------------------------------

From d07e14428acd150442379333898ddf36fd78e8ae Mon Sep 17 00:00:00 2001
From: Shay Palachy <shaypal5@users.noreply.github.com>
Date: Tue, 5 May 2026 23:29:29 +0300
Subject: [PATCH 2/3] Revert "fix(tts): insert inter-word <break> tags to
 prevent Hebrew word merging (#70)"

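This reverts commit d0c273bbc431a49a3818b973b41745c9a4a51cec.

Removes _inject_word_breaks and _WORD_BREAK_MS, so utterance and phrase
text is once again assigned to the SSML elements verbatim, with no
inter-word <break time="50ms"/> elements. The word-boundary break tests
(TestWordBoundaryBreaks) are removed and the test_phrase_prosody
expectations are restored to plain-text assertions.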
---
 synthbanshee/tts/ssml_builder.py  |  95 ++-------------
 tests/unit/test_phrase_prosody.py |  17 +--
 tests/unit/test_tts.py            | 196 ------------------------------
 3 files changed, 13 insertions(+), 295 deletions(-)

diff --git a/synthbanshee/tts/ssml_builder.py b/synthbanshee/tts/ssml_builder.py
index ed07039..9d987e2 100644
--- a/synthbanshee/tts/ssml_builder.py
+++ b/synthbanshee/tts/ssml_builder.py
@@ -6,8 +6,6 @@
     ``supports_style_tags=False``)
   - <prosody> elements for rate, pitch, and volume control
   - Nested per-phrase <prosody> + <break> elements (M2b)
-  - Inter-word ``<break time="50ms"/>`` elements to prevent Hebrew word
-    merging (#62)
 
 Azure SSML reference:
 https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice
@@ -24,80 +22,6 @@
 _MSTTS_XMLNS = "http://www.w3.org/2001/mstts"
 _SPEAK_LANG = "he-IL"
 
-# Inter-word break duration in milliseconds.  50 ms is the initial estimate
-# for signalling a word boundary to Azure / Google he-IL without introducing
-# an audible pause.  This value needs empirical validation with real TTS
-# output — it may need per-provider tuning if engines respond differently.
-_WORD_BREAK_MS = 50
-
-
-def _inject_word_breaks(
-    parent: ET.Element,
-    text: str,
-    after: ET.Element | None = None,
-) -> ET.Element | None:
-    """Insert *text* into *parent* with ``<break>`` tags between words.
-
-    Hebrew TTS engines (Azure he-IL, Google Chirp) merge adjacent words into
-    unintelligible speech when no explicit boundary cue exists.  This function
-    splits *text* on whitespace and inserts a short ``<break>`` element between
-    every pair of consecutive words.
-
-    Args:
-        parent: The XML element to add content to.
-        text: The text to inject (may contain multi-word Hebrew).
-        after: If provided, the first text chunk is appended to
-            ``after.tail`` instead of ``parent.text``.
-
-    Returns:
-        The last child element added to *parent*, or *after* if no ``<break>``
-        elements were created (single word or empty text).
-    """
-    if not text or not text.strip():
-        # Pure whitespace or empty — append as-is without inserting breaks.
-        if text:
-            if after is None:
-                parent.text = (parent.text or "") + text
-            else:
-                after.tail = (after.tail or "") + text
-        return after
-
-    words = text.split()
-    last = after
-
-    # Preserve any leading whitespace (e.g. space before a text fragment
-    # that follows a <break> or <prosody> element).
-    leading = text[: len(text) - len(text.lstrip())]
-    if leading:
-        if last is None:
-            parent.text = (parent.text or "") + leading
-        else:
-            last.tail = (last.tail or "") + leading
-
-    for i, word in enumerate(words):
-        if i == 0:
-            # First word: append directly (no break needed before it).
-            if last is None:
-                parent.text = (parent.text or "") + word
-            else:
-                last.tail = (last.tail or "") + word
-        else:
-            # Subsequent words: insert <break/> then the word with a
-            # leading space in the tail to preserve normal spacing.
-            brk = ET.SubElement(parent, "break", attrib={"time": f"{_WORD_BREAK_MS}ms"})
-            brk.tail = " " + word
-            last = brk
-
-    # Preserve any trailing whitespace.
-    trailing = text[len(text.rstrip()) :]
-    if trailing:
-        if last is None:
-            parent.text = (parent.text or "") + trailing
-        else:
-            last.tail = (last.tail or "") + trailing
-
-    return last
-
 
 @dataclass
 class UtteranceSpec:
@@ -141,10 +65,9 @@ def _apply_phrase_prosody(
 
     Splits *text* around phrase spans and wraps each phrase in a nested
     ``<prosody>`` element with optional ``<break time="…"/>`` elements
-    inserted before and/or after.  Inter-word ``<break>`` elements are
-    inserted within each text fragment to prevent Hebrew word merging (#62).
-    Overlapping spans are skipped (the span whose ``char_start`` falls
-    before the previous span's ``char_end`` is silently dropped).
+    inserted before and/or after.  Overlapping spans are skipped (the span
+    whose ``char_start`` falls before the previous span's ``char_end`` is
+    silently dropped).
 
     Args:
         parent: The XML element that will receive the mixed text/element content.
@@ -155,13 +78,13 @@ def _apply_phrase_prosody(
     prev: ET.Element | None = None
 
     def _append_text(s: str) -> None:
-        """Append *s* with inter-word ``<break>`` elements."""
         nonlocal prev
         if not s:
             return
-        result = _inject_word_breaks(parent, s, after=prev)
-        if result is not None:
-            prev = result
+        if prev is None:
+            parent.text = (parent.text or "") + s
+        else:
+            prev.tail = (prev.tail or "") + s
 
     def _append_break(ms: int) -> ET.Element:
         nonlocal prev
@@ -196,7 +119,7 @@ def _append_break(ms: int) -> ET.Element:
         phrase_text = text[phrase.char_start : phrase.char_end]
         if phrase_attrs:
             pe = ET.SubElement(parent, "prosody", attrib=phrase_attrs)
-            _inject_word_breaks(pe, phrase_text)
+            pe.text = phrase_text
             prev = pe
         else:
             _append_text(phrase_text)
@@ -287,7 +210,7 @@ def build_multi(
             if utt.phrase_prosody:
                 _apply_phrase_prosody(inner, utt.text, utt.phrase_prosody)
             else:
-                _inject_word_breaks(inner, utt.text)
+                inner.text = utt.text
 
         raw = ET.tostring(speak, encoding="unicode", xml_declaration=False)
         return '<?xml version="1.0" encoding="UTF-8"?>\n' + raw
diff --git a/tests/unit/test_phrase_prosody.py b/tests/unit/test_phrase_prosody.py
index d730dd1..ce61794 100644
--- a/tests/unit/test_phrase_prosody.py
+++ b/tests/unit/test_phrase_prosody.py
@@ -333,13 +333,8 @@ class TestApplyPhraseProsody:
     def test_no_phrases_sets_text(self) -> None:
         parent = _make_parent()
         _apply_phrase_prosody(parent, "hello world", [])
-        # Inter-word <break> splits text: parent.text = "hello",
-        # then <break time="50ms"/> with tail " world".
-        assert parent.text == "hello"
-        children = list(parent)
-        assert len(children) == 1
-        assert children[0].tag == "break"
-        assert children[0].tail == " world"
+        assert parent.text == "hello world"
+        assert len(list(parent)) == 0  # no children
 
     def test_single_phrase_mid_text(self) -> None:
         parent = _make_parent()
@@ -377,12 +372,8 @@ def test_zero_length_phrase_skipped(self) -> None:
         parent = _make_parent()
         phrase = PhraseProsody("p0", 3, 3)  # zero-length span
         _apply_phrase_prosody(parent, "hello world", [phrase])
-        # Zero-length phrase skipped → text set with inter-word breaks
-        assert parent.text == "hello"
-        children = list(parent)
-        assert len(children) == 1
-        assert children[0].tag == "break"
-        assert children[0].tail == " world"
+        # Zero-length phrase skipped → text is set as-is
+        assert parent.text == "hello world"
 
     def test_phrase_with_pitch_attribute(self) -> None:
         # Covers `if phrase.pitch is not None: phrase_attrs["pitch"] = ...` (line 113-114).
diff --git a/tests/unit/test_tts.py b/tests/unit/test_tts.py
index 29c11c7..0f1ed74 100644
--- a/tests/unit/test_tts.py
+++ b/tests/unit/test_tts.py
@@ -16,10 +16,8 @@
 from synthbanshee.tts.azure_provider import AzureProvider
 from synthbanshee.tts.renderer import TTSRenderer
 from synthbanshee.tts.ssml_builder import (
-    _WORD_BREAK_MS,
     SSMLBuilder,
     UtteranceSpec,
-    _inject_word_breaks,
     _rate_to_string,
     _semitones_to_percent,
 )
@@ -136,200 +134,6 @@ def test_xml_is_well_formed(self):
         ET.fromstring(ssml_body)  # Should not raise
 
 
-# ---------------------------------------------------------------------------
-# Word-boundary break tests (#62)
-# ---------------------------------------------------------------------------
-
-
-class TestWordBoundaryBreaks:
-    """Verify that multi-word Hebrew text produces inter-word <break> tags."""
-
-    def setup_method(self):
-        self.builder = SSMLBuilder()
-
-    def _body(self, ssml: str) -> str:
-        """Strip XML declaration to get parseable SSML body."""
-        return ssml.split("\n", 1)[1] if ssml.startswith("<?xml") else ssml
-
-    def test_multi_word_text_has_breaks(self):
-        """Multi-word text must contain <break> elements between words."""
-        utt = UtteranceSpec(
-            text="word1 word2 word3",
-            voice_id="he-IL-AvriNeural",
-        )
-        ssml = self.builder.build_single(utt)
-        # Two word boundaries → two <break> elements
-        assert ssml.count(f'time="{_WORD_BREAK_MS}ms"') == 2
-
-    def test_single_word_no_breaks(self):
-        """Single-word text must not contain any <break> elements."""
-        utt = UtteranceSpec(
-            text="hello",
-            voice_id="he-IL-AvriNeural",
-        )
-        ssml = self.builder.build_single(utt)
-        assert f'time="{_WORD_BREAK_MS}ms"' not in ssml
-
-    def test_text_preserved_after_breaks(self):
-        """All original words must appear in the serialised SSML."""
-        utt = UtteranceSpec(
-            text="word1 word2 word3",
-            voice_id="he-IL-AvriNeural",
-        )
-        ssml = self.builder.build_single(utt)
-        assert "word1" in ssml
-        assert "word2" in ssml
-        assert "word3" in ssml
-
-    def test_breaks_inside_prosody(self):
-        """Word breaks must also appear when a <prosody> wrapper is present."""
-        utt = UtteranceSpec(
-            text="word1 word2",
-            voice_id="he-IL-AvriNeural",
-            rate_multiplier=1.2,
-        )
-        ssml = self.builder.build_single(utt)
-        # One word boundary inside the prosody wrapper
-        assert ssml.count(f'time="{_WORD_BREAK_MS}ms"') == 1
-        assert "prosody" in ssml
-
-    def test_breaks_with_phrase_prosody(self):
-        """Word breaks must appear in text fragments around phrase prosody spans."""
-        from synthbanshee.tts.ssml_types import PhraseProsody
-
-        # "before1 before2 phrase1 phrase2 after1 after2"
-        #  0123456789...
-        # "before1"=0:7, " "=7, "before2"=8:15, " "=15,
-        # "phrase1"=16:23, " "=23, "phrase2"=24:31, " "=31,
-        # "after1"=32:38, " "=38, "after2"=39:45
-        text = "before1 before2 phrase1 phrase2 after1 after2"
-        phrase = PhraseProsody(
-            phrase_id="p0",
-            char_start=16,  # "phrase1 phrase2"
-            char_end=31,
-            rate="+10%",
-        )
-        utt = UtteranceSpec(
-            text=text,
-            voice_id="he-IL-AvriNeural",
-            phrase_prosody=[phrase],
-        )
-        ssml = self.builder.build_single(utt)
-        # All words must appear
-        for word in text.split():
-            assert word in ssml, f"word {word!r} missing from SSML"
-        # Exact break count: 1 (before1↔before2) + 1 (phrase1↔phrase2
-        # inside <prosody>) + 1 (after1↔after2) = 3 word-boundary breaks.
-        assert ssml.count(f'time="{_WORD_BREAK_MS}ms"') == 3
-
-    def test_hebrew_multi_word_text_has_breaks(self):
-        """Hebrew multi-word text must produce inter-word <break> elements."""
-        # Reproduces the core scenario from issue #62.
-        utt = UtteranceSpec(
-            text="\u05d4\u05d9\u05d9, \u05d7\u05e9\u05d1\u05ea\u05d9",
-            voice_id="he-IL-AvriNeural",
-        )
-        ssml = self.builder.build_single(utt)
-        assert ssml.count(f'time="{_WORD_BREAK_MS}ms"') == 1
-        # Both tokens must survive serialization intact.
-        assert "\u05d4\u05d9\u05d9," in ssml
-        assert "\u05d7\u05e9\u05d1\u05ea\u05d9" in ssml
-
-    def test_hebrew_with_niqqud_preserved(self):
-        """Niqqud-bearing Hebrew words must not be corrupted by break injection."""
-        # Two words, the first with niqqud (shin + shva + lamed + dagesh).
-        text = "\u05e9\u05b0\u05dc\u05d5\u05bc\u05dd \u05e2\u05d5\u05dc\u05dd"
-        utt = UtteranceSpec(text=text, voice_id="he-IL-AvriNeural")
-        ssml = self.builder.build_single(utt)
-        assert ssml.count(f'time="{_WORD_BREAK_MS}ms"') == 1
-        assert "\u05e9\u05b0\u05dc\u05d5\u05bc\u05dd" in ssml
-        assert "\u05e2\u05d5\u05dc\u05dd" in ssml
-
-    def test_text_roundtrip_preserves_content(self):
-        """Serialise → parse → extract text: all content must match the input."""
-        import xml.etree.ElementTree as ET
-
-        text = "one two three four"
-        utt = UtteranceSpec(
-            text=text,
-            voice_id="he-IL-AvriNeural",
-        )
-        ssml = self.builder.build_single(utt)
-        root = ET.fromstring(self._body(ssml))
-
-        # Walk the tree and collect every text/tail fragment.
-        fragments: list[str] = []
-
-        def _collect(el: ET.Element) -> None:
-            if el.text:
-                fragments.append(el.text)
-            for child in el:
-                _collect(child)
-                if child.tail:
-                    fragments.append(child.tail)
-
-        _collect(root)
-        recovered = "".join(fragments)
-        # Recovered text (ignoring break elements) must equal the original.
-        assert recovered == text
-
-    def test_hebrew_text_roundtrip(self):
-        """Hebrew content must survive the SSML serialise → parse roundtrip."""
-        import xml.etree.ElementTree as ET
-
-        text = (
-            "\u05d0\u05d2\u05d1 \u05e0\u05d9\u05e1\u05d9\u05ea\u05d9"
-            " \u05dc\u05d3\u05d1\u05e8 \u05d0\u05d9\u05ea\u05da"
-        )
-        utt = UtteranceSpec(text=text, voice_id="he-IL-HilaNeural")
-        ssml = self.builder.build_single(utt)
-        root = ET.fromstring(self._body(ssml))
-
-        fragments: list[str] = []
-
-        def _collect(el: ET.Element) -> None:
-            if el.text:
-                fragments.append(el.text)
-            for child in el:
-                _collect(child)
-                if child.tail:
-                    fragments.append(child.tail)
-
-        _collect(root)
-        assert "".join(fragments) == text
-
-    def test_xml_well_formed_with_breaks(self):
-        """SSML with word breaks must remain valid XML."""
-        import xml.etree.ElementTree as ET
-
-        utt = UtteranceSpec(
-            text="word1 word2 word3 word4",
-            voice_id="he-IL-AvriNeural",
-            style="angry",
-            rate_multiplier=1.1,
-        )
-        ssml = self.builder.build_single(utt)
-        ET.fromstring(self._body(ssml))  # Should not raise
-
-    def test_inject_word_breaks_empty_text(self):
-        """_inject_word_breaks with empty text should not create elements."""
-        import xml.etree.ElementTree as ET
-
-        parent = ET.Element("test")
-        result = _inject_word_breaks(parent, "")
-        assert result is None
-        assert len(list(parent)) == 0
-
-    def test_inject_word_breaks_whitespace_only(self):
-        """_inject_word_breaks with whitespace-only text should preserve it."""
-        import xml.etree.ElementTree as ET
-
-        parent = ET.Element("test")
-        result = _inject_word_breaks(parent, "  ")
-        assert result is None
-        assert parent.text == "  "
-
-
 # ---------------------------------------------------------------------------
 # AzureProvider tests (mocked)
 # ---------------------------------------------------------------------------

From df651981211de9d2d8db65caf992a31dac8f58a4 Mon Sep 17 00:00:00 2001
From: Shay Palachy <shaypal5@users.noreply.github.com>
Date: Wed, 6 May 2026 00:19:12 +0300
Subject: [PATCH 3/3] restore(tts): reinstate hardenings from #71 unrelated to
 #70 + add #83 regression test
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The initial revert in this branch (reverting PR #70 + PR #71 wholesale)
was too aggressive: PR #71 bundled three independent hardenings, only
one of which was made necessary by PR #70.  This commit restores the two
hardenings that have nothing to do with inter-word break injection, and
narrows the third to the residual case that survives #70's revert.  It
also adds the #83 regression test that pins the bisect finding.

Restored from #71 (and explicitly verified to NOT re-introduce #83):

- `_sanitize_text` + `_XML_INVALID_CHARS_RE` regex.  Defends against an
  LLM emitting XML 1.0 control characters that otherwise make the SSML
  unparseable by Azure.  Independent bug class from per-word breaks.
- Azure-range prosody clamping in `_semitones_to_percent`,
  `_rate_to_string`, `_volume_to_string`, plus warning logs on clamp
  activation.  `speaker_BYS_F_6-10_001.yaml` ships `pitch_delta_st=+9`
  → +54% unclamped, which Azure rejects.  Independent bug class.
- Adjacent `<break>` merging in `_apply_phrase_prosody`, narrowed to the
  case where one phrase's `break_after_ms` is followed by the next
  phrase's `break_before_ms` (the only adjacent-break source that
  survives #70's revert).  The original #71 logic also had a word-break
  branch that is no longer reachable.

Added:

- `test_no_per_word_breaks_in_default_ssml` regression test pinned to
  #83.  The default multi-word SSML must not contain `<break>` tags;
  per-word break injection (PR #70) tripped Whisper's silence-detection
  heuristic and produced the WER regression.  Any future Hebrew word-
  merge mitigation (#62) must not re-introduce per-word breaks.
- `test_text_with_invalid_xml_chars_sanitized`,
  `test_prosody_pitch_clamped_to_azure_range`,
  `test_prosody_rate_clamped_to_azure_range`,
  `test_prosody_volume_clamped_to_azure_range`,
  `test_adjacent_phrase_breaks_are_merged` — pin the restored hardenings.

All three restored hardenings were independently flagged by Copilot's
review on this PR (this commit resolves those three review threads).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 synthbanshee/tts/ssml_builder.py |  85 +++++++++++++++++++---
 tests/unit/test_tts.py           | 118 +++++++++++++++++++++++++++++++
 2 files changed, 192 insertions(+), 11 deletions(-)

diff --git a/synthbanshee/tts/ssml_builder.py b/synthbanshee/tts/ssml_builder.py
index 9d987e2..5df230a 100644
--- a/synthbanshee/tts/ssml_builder.py
+++ b/synthbanshee/tts/ssml_builder.py
@@ -13,15 +13,41 @@
 
 from __future__ import annotations
 
+import logging
+import re
 import xml.etree.ElementTree as ET
 from dataclasses import dataclass, field
 
 from synthbanshee.tts.ssml_types import PhraseProsody
 
+_log = logging.getLogger(__name__)
+
 _AZURE_XMLNS = "http://www.w3.org/2001/10/synthesis"
 _MSTTS_XMLNS = "http://www.w3.org/2001/mstts"
 _SPEAK_LANG = "he-IL"
 
+# Azure prosody attribute limits (documented ranges).
+_AZURE_RATE_MIN_PCT = -50  # rate="-50%" → 0.5x
+_AZURE_RATE_MAX_PCT = 200  # rate="+200%" → 3.0x
+_AZURE_PITCH_MIN_PCT = -50
+_AZURE_PITCH_MAX_PCT = 50
+_AZURE_VOLUME_MIN_PCT = -50
+_AZURE_VOLUME_MAX_PCT = 50
+
+# Characters invalid in XML 1.0: U+0000–U+0008, U+000B, U+000C, U+000E–U+001F.
+# These must be stripped before embedding text in SSML.
+_XML_INVALID_CHARS_RE = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]")
+
+
+def _sanitize_text(text: str) -> str:
+    """Remove characters that are invalid in XML 1.0 from *text*.
+
+    Defense-in-depth: ideally invalid chars should be rejected at the LLM
+    response parsing boundary (script/generator.py).  This guard ensures the
+    SSML builder never produces unparseable XML regardless of upstream bugs.
+    """
+    return _XML_INVALID_CHARS_RE.sub("", text)
+
 
 @dataclass
 class UtteranceSpec:
@@ -37,23 +63,44 @@ class UtteranceSpec:
 
 
 def _semitones_to_percent(st: float) -> str:
-    """Convert a semitone shift to the Azure pitch % format (e.g. '+5%' / '-10%')."""
+    """Convert a semitone shift to the Azure pitch % format (e.g. '+5%' / '-10%').
+
+    Values are clamped to Azure's documented ±50% range.  A warning is logged
+    when clamping activates — this indicates a speaker config or state-drift bug.
+    """
     # Approximation: 1 semitone ≈ 5.946% pitch change
     pct = round(st * 5.946)
-    return f"+{pct}%" if pct >= 0 else f"{pct}%"
+    clamped = max(_AZURE_PITCH_MIN_PCT, min(_AZURE_PITCH_MAX_PCT, pct))
+    if clamped != pct:
+        _log.warning("Pitch %+d%% exceeds Azure range; clamped to %+d%%", pct, clamped)
+    return f"+{clamped}%" if clamped >= 0 else f"{clamped}%"
 
 
 def _rate_to_string(rate: float) -> str:
-    """Format a rate multiplier as a percentage string for <prosody rate=...>."""
+    """Format a rate multiplier as a percentage string for <prosody rate=...>.
+
+    Values are clamped to Azure's documented -50% to +200% range.  A warning is
+    logged when clamping activates.
+    """
     pct = round((rate - 1.0) * 100)
-    return f"+{pct}%" if pct >= 0 else f"{pct}%"
+    clamped = max(_AZURE_RATE_MIN_PCT, min(_AZURE_RATE_MAX_PCT, pct))
+    if clamped != pct:
+        _log.warning("Rate %+d%% exceeds Azure range; clamped to %+d%%", pct, clamped)
+    return f"+{clamped}%" if clamped >= 0 else f"{clamped}%"
 
 
 def _volume_to_string(db: float) -> str:
-    """Format a dB volume offset as a percentage for <prosody volume=...>."""
+    """Format a dB volume offset as a percentage for <prosody volume=...>.
+
+    Values are clamped to Azure's documented ±50% range.  A warning is logged
+    when clamping activates.
+    """
     # Azure volume is 0–100; default is 100. Map dB linearly (rough).
     pct = round(db * 1.0)
-    return f"+{pct}%" if pct >= 0 else f"{pct}%"
+    clamped = max(_AZURE_VOLUME_MIN_PCT, min(_AZURE_VOLUME_MAX_PCT, pct))
+    if clamped != pct:
+        _log.warning("Volume %+d%% exceeds Azure range; clamped to %+d%%", pct, clamped)
+    return f"+{clamped}%" if clamped >= 0 else f"{clamped}%"
 
 
 def _apply_phrase_prosody(
@@ -65,9 +112,13 @@ def _apply_phrase_prosody(
 
     Splits *text* around phrase spans and wraps each phrase in a nested
     ``<prosody>`` element with optional ``<break time="…"/>`` elements
-    inserted before and/or after.  Overlapping spans are skipped (the span
-    whose ``char_start`` falls before the previous span's ``char_end`` is
-    silently dropped).
+    inserted before and/or after.  Adjacent ``<break>`` elements are merged
+    (durations summed) to avoid Azure SSML parser error 0x80045003 (#67):
+    when a phrase carries ``break_after_ms`` and the next phrase carries
+    ``break_before_ms``, two ``<break>`` siblings would otherwise appear
+    back-to-back.  Overlapping spans are skipped (the span whose
+    ``char_start`` falls before the previous span's ``char_end`` is silently
+    dropped).
 
     Args:
         parent: The XML element that will receive the mixed text/element content.
@@ -87,7 +138,17 @@ def _append_text(s: str) -> None:
             prev.tail = (prev.tail or "") + s
 
     def _append_break(ms: int) -> ET.Element:
+        """Add a ``<break time="{ms}ms"/>`` or merge into the preceding break.
+
+        Azure rejects adjacent ``<break>`` siblings (#67); when the most
+        recently appended element is itself a ``<break>``, we sum the
+        durations into the existing element instead of creating a new one.
+        """
         nonlocal prev
+        if prev is not None and prev.tag == "break":
+            prev_ms = int(prev.attrib.get("time", "0ms").replace("ms", ""))
+            prev.attrib["time"] = f"{prev_ms + ms}ms"
+            return prev
         el = ET.SubElement(parent, "break", attrib={"time": f"{ms}ms"})
         prev = el
         return el
@@ -179,6 +240,8 @@ def build_multi(
         speak = ET.Element("speak", attrib=speak_attribs)
 
         for utt in utterances:
+            # Sanitize text: strip characters invalid in XML 1.0 (#67).
+            utt_text = _sanitize_text(utt.text)
             voice = ET.SubElement(speak, "voice", attrib={"name": utt.voice_id})
 
             # Add express-as only when a non-default style is requested AND
@@ -208,9 +271,9 @@ def build_multi(
                 inner = parent
 
             if utt.phrase_prosody:
-                _apply_phrase_prosody(inner, utt.text, utt.phrase_prosody)
+                _apply_phrase_prosody(inner, utt_text, utt.phrase_prosody)
             else:
-                inner.text = utt.text
+                inner.text = utt_text
 
         raw = ET.tostring(speak, encoding="unicode", xml_declaration=False)
         return '<?xml version="1.0" encoding="UTF-8"?>\n' + raw
diff --git a/tests/unit/test_tts.py b/tests/unit/test_tts.py
index 0f1ed74..1add25d 100644
--- a/tests/unit/test_tts.py
+++ b/tests/unit/test_tts.py
@@ -9,6 +9,7 @@
 import struct
 import sys
 import wave
+import xml.etree.ElementTree as ET
 from pathlib import Path
 from unittest.mock import MagicMock
 
@@ -20,7 +21,9 @@
     UtteranceSpec,
     _rate_to_string,
     _semitones_to_percent,
+    _volume_to_string,
 )
+from synthbanshee.tts.ssml_types import PhraseProsody
 
 EXAMPLES_DIR = Path(__file__).parent.parent.parent / "configs" / "examples"
 
@@ -134,6 +137,121 @@ def test_xml_is_well_formed(self):
         ET.fromstring(ssml_body)  # Should not raise
 
 
+# ---------------------------------------------------------------------------
+# SSML invariants — sanitization, clamping, regression guard for #83
+# ---------------------------------------------------------------------------
+
+
+class TestSSMLInvariants:
+    """Defense-in-depth invariants the SSML builder must hold.
+
+    These assertions catch regressions in three orthogonal hardenings that
+    were lost when PR #70 + #71 were initially reverted as a bundle for
+    #83, then restored individually, plus the #83 regression guard itself:
+
+      - XML 1.0 control-character sanitization (originally from #71).
+      - Azure-range prosody clamping (originally from #71).
+      - Adjacent ``<break>`` merging in ``_apply_phrase_prosody`` to avoid
+        Azure error 0x80045003 (originally from #71, narrowed to the
+        phrase-after / phrase-before case after #70 was reverted).
+      - The ``no per-word <break> tags`` rule pinned to #83.
+    """
+
+    def setup_method(self):
+        self.builder = SSMLBuilder()
+
+    def _body(self, ssml: str) -> str:
+        return ssml.split("\n", 1)[1] if ssml.startswith("<?xml") else ssml
+
+    def test_text_with_invalid_xml_chars_sanitized(self):
+        # Simulate LLM output with control characters (#67 root cause).
+        text = "hello\x00world\x0bfoo\x1fbar"
+        utt = UtteranceSpec(text=text, voice_id="he-IL-AvriNeural")
+        ssml = self.builder.build_single(utt, supports_style_tags=False)
+        for ch in ("\x00", "\x0b", "\x1f"):
+            assert ch not in ssml
+        for word in ("hello", "world", "foo", "bar"):
+            assert word in ssml
+
+    def test_prosody_pitch_clamped_to_azure_range(self):
+        # 12 st → +71% unclamped; clamps to +50%.
+        assert _semitones_to_percent(12.0) == "+50%"
+        assert _semitones_to_percent(-12.0) == "-50%"
+        # In-range values pass through.
+        assert _semitones_to_percent(3.0) == "+18%"
+        assert _semitones_to_percent(-2.0) == "-12%"
+
+    def test_prosody_rate_clamped_to_azure_range(self):
+        # rate=4.0 → +300% unclamped; clamps to +200%.
+        assert _rate_to_string(4.0) == "+200%"
+        # rate=0.3 → -70% unclamped; clamps to -50%.
+        assert _rate_to_string(0.3) == "-50%"
+        assert _rate_to_string(1.1) == "+10%"
+
+    def test_prosody_volume_clamped_to_azure_range(self):
+        assert _volume_to_string(80.0) == "+50%"
+        assert _volume_to_string(-80.0) == "-50%"
+        assert _volume_to_string(5.0) == "+5%"
+
+    def test_no_per_word_breaks_in_default_ssml(self):
+        """Default multi-word SSML must not contain per-word ``<break>`` tags.
+
+        Pinned to #83: per-word ``<break time="50ms"/>`` insertion (PR #70)
+        tripped Whisper's silence-detection / segmentation heuristic and
+        produced a 6× WER regression on Tier A clips.  Any future Hebrew
+        word-merge mitigation (#62) must not re-introduce per-word breaks.
+        """
+        utt = UtteranceSpec(
+            text="one two three four five six seven eight",
+            voice_id="he-IL-AvriNeural",
+        )
+        ssml = self.builder.build_single(utt, supports_style_tags=False)
+        assert "<break" not in ssml, "Default SSML must not emit per-word <break> tags — see #83."
+
+    def test_adjacent_phrase_breaks_are_merged(self):
+        """break_after of one phrase + break_before of the next must merge.
+
+        Without merging, Azure rejects the SSML with parse error
+        0x80045003 (#67).  With #70 reverted, the only remaining adjacent-
+        break risk is between two consecutive phrases.
+        """
+        text = "intro phrase_a end_a mid phrase_b outro"
+        phrases = [
+            PhraseProsody(
+                phrase_id="p0",
+                char_start=6,
+                char_end=20,  # "phrase_a end_a"
+                rate="-20%",
+                break_after_ms=250,
+            ),
+            PhraseProsody(
+                phrase_id="p1",
+                char_start=25,
+                char_end=33,  # "phrase_b"
+                rate="-25%",
+                break_before_ms=300,
+            ),
+        ]
+        utt = UtteranceSpec(
+            text=text,
+            voice_id="he-IL-AvriNeural",
+            phrase_prosody=phrases,
+        )
+        ssml = self.builder.build_single(utt, supports_style_tags=False)
+
+        # Must remain valid XML.
+        ET.fromstring(self._body(ssml))
+
+        # No two <break .../> elements separated only by whitespace.
+        import re
+
+        adjacent = re.findall(r"<break[^/]*/>\s*<break", ssml)
+        assert adjacent == [], f"Adjacent breaks found in SSML: {ssml}"
+
+        # The two break durations must be summed into the surviving element.
+        assert 'time="550ms"' in ssml
+
+
 # ---------------------------------------------------------------------------
 # AzureProvider tests (mocked)
 # ---------------------------------------------------------------------------