DataHackIL · shaypal5 · May 5, 2026 · May 5, 2026 · May 5, 2026 · May 5, 2026
diff --git a/synthbanshee/tts/ssml_builder.py b/synthbanshee/tts/ssml_builder.py
@@ -6,8 +6,6 @@
     ``supports_style_tags=False``)
   - <prosody> elements for rate, pitch, and volume control
   - Nested per-phrase <prosody> + <break> elements (M2b)
-  - Inter-word ``<break time="50ms"/>`` elements to prevent Hebrew word
-    merging (#62)
 
 Azure SSML reference:
 https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice
@@ -28,12 +26,6 @@
 _MSTTS_XMLNS = "http://www.w3.org/2001/mstts"
 _SPEAK_LANG = "he-IL"
 
-# Inter-word break duration in milliseconds.  50 ms is the initial estimate
-# for signalling a word boundary to Azure / Google he-IL without introducing
-# an audible pause.  This value needs empirical validation with real TTS
-# output — it may need per-provider tuning if engines respond differently.
-_WORD_BREAK_MS = 50
-
 # Azure prosody attribute limits (documented ranges).
 _AZURE_RATE_MIN_PCT = -50  # rate="-50%" → 0.5x
 _AZURE_RATE_MAX_PCT = 200  # rate="+200%" → 3.0x
@@ -57,74 +49,6 @@ def _sanitize_text(text: str) -> str:
     return _XML_INVALID_CHARS_RE.sub("", text)
 
 
-def _inject_word_breaks(
-    parent: ET.Element,
-    text: str,
-    after: ET.Element | None = None,
-) -> ET.Element | None:
-    """Insert *text* into *parent* with ``<break>`` tags between words.
-
-    Hebrew TTS engines (Azure he-IL, Google Chirp) merge adjacent words into
-    unintelligible speech when no explicit boundary cue exists.  This function
-    splits *text* on whitespace and inserts a short ``<break>`` element between
-    every pair of consecutive words.
-
-    Args:
-        parent: The XML element to add content to.
-        text: The text to inject (may contain multi-word Hebrew).
-        after: If provided, the first text chunk is appended to
-            ``after.tail`` instead of ``parent.text``.
-
-    Returns:
-        The last child element added to *parent*, or *after* if no ``<break>``
-        elements were created (single word or empty text).
-    """
-    if not text or not text.strip():
-        # Pure whitespace or empty — append as-is without inserting breaks.
-        if text:
-            if after is None:
-                parent.text = (parent.text or "") + text
-            else:
-                after.tail = (after.tail or "") + text
-        return after
-
-    words = text.split()
-    last = after
-
-    # Preserve any leading whitespace (e.g. space before a text fragment
-    # that follows a <break> or <prosody> element).
-    leading = text[: len(text) - len(text.lstrip())]
-    if leading:
-        if last is None:
-            parent.text = (parent.text or "") + leading
-        else:
-            last.tail = (last.tail or "") + leading
-
-    for i, word in enumerate(words):
-        if i == 0:
-            # First word: append directly (no break needed before it).
-            if last is None:
-                parent.text = (parent.text or "") + word
-            else:
-                last.tail = (last.tail or "") + word
-        else:
-            # Subsequent words: insert <break/> then the word with a
-            # leading space in the tail to preserve normal spacing.
-            brk = ET.SubElement(parent, "break", attrib={"time": f"{_WORD_BREAK_MS}ms"})
-            brk.tail = " " + word
-            last = brk
-
-    # Preserve any trailing whitespace.
-    trailing = text[len(text.rstrip()) :]
-    if trailing:
-        if last is None:
-            parent.text = (parent.text or "") + trailing
-        else:
-            last.tail = (last.tail or "") + trailing
-
-    return last
-
-
 @dataclass
 class UtteranceSpec:
     """Specifies one TTS utterance to be included in an SSML document."""
@@ -188,10 +112,13 @@ def _apply_phrase_prosody(
 
     Splits *text* around phrase spans and wraps each phrase in a nested
     ``<prosody>`` element with optional ``<break time="…"/>`` elements
-    inserted before and/or after.  Inter-word ``<break>`` elements are
-    inserted within each text fragment to prevent Hebrew word merging (#62).
-    Overlapping spans are skipped (the span whose ``char_start`` falls
-    before the previous span's ``char_end`` is silently dropped).
+    inserted before and/or after.  Adjacent ``<break>`` elements are merged
+    (durations summed) to avoid Azure SSML parser error 0x80045003 (#67):
+    when a phrase carries ``break_after_ms`` and the next phrase carries
+    ``break_before_ms``, two ``<break>`` siblings would otherwise appear
+    back-to-back.  Overlapping spans are skipped (the span whose
+    ``char_start`` falls before the previous span's ``char_end`` is silently
+    dropped).
 
     Args:
         parent: The XML element that will receive the mixed text/element content.
@@ -202,16 +129,26 @@ def _apply_phrase_prosody(
     prev: ET.Element | None = None
 
     def _append_text(s: str) -> None:
-        """Append *s* with inter-word ``<break>`` elements."""
         nonlocal prev
         if not s:
             return
-        result = _inject_word_breaks(parent, s, after=prev)
-        if result is not None:
-            prev = result
+        if prev is None:
+            parent.text = (parent.text or "") + s  # no child yet: leading text of parent
+        else:
+            prev.tail = (prev.tail or "") + s  # text follows the last appended child
 
     def _append_break(ms: int) -> ET.Element:
+        """Add a ``<break time="{ms}ms"/>`` or merge into the preceding break.
+
+        Azure rejects adjacent ``<break>`` siblings (#67): if the last appended
+        element is a ``<break>`` followed by no non-blank text, *ms* is summed
+        into it.  A break already followed by text is not adjacent; add a new one.
+        """
         nonlocal prev
+        if prev is not None and prev.tag == "break" and not (prev.tail or "").strip():
+            prev_ms = int(prev.attrib.get("time", "0ms").replace("ms", ""))
+            prev.attrib["time"] = f"{prev_ms + ms}ms"
+            return prev
         el = ET.SubElement(parent, "break", attrib={"time": f"{ms}ms"})
         prev = el
         return el
@@ -227,23 +164,9 @@ def _append_break(ms: int) -> ET.Element:
         if phrase.char_start > cursor:
             _append_text(text[cursor : phrase.char_start])
 
-        # Optional break before the phrase.  Merge with the preceding
-        # <break> element (if any) to avoid adjacent breaks that Azure's SSML
-        # parser rejects with error 0x80045003 (#67).
+        # Optional break before the phrase (_append_break merges adjacent breaks, #67).
         if phrase.break_before_ms > 0:
-            if prev is not None and prev.tag == "break":
-                prev_time = prev.attrib.get("time", "0ms")
-                prev_ms = int(prev_time.replace("ms", ""))
-                if prev_ms == _WORD_BREAK_MS:
-                    # Word-boundary break: replace with the phrase break
-                    # (the phrase break subsumes the word-boundary intent).
-                    prev.attrib["time"] = f"{phrase.break_before_ms}ms"
-                else:
-                    # Semantic break (e.g. break_after from prior phrase):
-                    # sum durations to preserve both intents.
-                    prev.attrib["time"] = f"{prev_ms + phrase.break_before_ms}ms"
-            else:
-                _append_break(phrase.break_before_ms)
+            _append_break(phrase.break_before_ms)
 
         # Phrase-level prosody wrapper (omitted when only breaks are requested).
         phrase_attrs: dict[str, str] = {}
@@ -257,7 +180,7 @@ def _append_break(ms: int) -> ET.Element:
         phrase_text = text[phrase.char_start : phrase.char_end]
         if phrase_attrs:
             pe = ET.SubElement(parent, "prosody", attrib=phrase_attrs)
-            _inject_word_breaks(pe, phrase_text)
+            pe.text = phrase_text
             prev = pe
         else:
             _append_text(phrase_text)
@@ -350,7 +273,7 @@ def build_multi(
             if utt.phrase_prosody:
                 _apply_phrase_prosody(inner, utt_text, utt.phrase_prosody)
             else:
-                _inject_word_breaks(inner, utt_text)
+                inner.text = utt_text
 
         raw = ET.tostring(speak, encoding="unicode", xml_declaration=False)
         return '<?xml version="1.0" encoding="UTF-8"?>\n' + raw

diff --git a/tests/unit/test_phrase_prosody.py b/tests/unit/test_phrase_prosody.py
@@ -333,13 +333,8 @@ class TestApplyPhraseProsody:
     def test_no_phrases_sets_text(self) -> None:
         parent = _make_parent()
         _apply_phrase_prosody(parent, "hello world", [])
-        # Inter-word <break> splits text: parent.text = "hello",
-        # then <break time="50ms"/> with tail " world".
-        assert parent.text == "hello"
-        children = list(parent)
-        assert len(children) == 1
-        assert children[0].tag == "break"
-        assert children[0].tail == " world"
+        assert parent.text == "hello world"
+        assert len(list(parent)) == 0  # no children
 
     def test_single_phrase_mid_text(self) -> None:
         parent = _make_parent()
@@ -377,12 +372,8 @@ def test_zero_length_phrase_skipped(self) -> None:
         parent = _make_parent()
         phrase = PhraseProsody("p0", 3, 3)  # zero-length span
         _apply_phrase_prosody(parent, "hello world", [phrase])
-        # Zero-length phrase skipped → text set with inter-word breaks
-        assert parent.text == "hello"
-        children = list(parent)
-        assert len(children) == 1
-        assert children[0].tag == "break"
-        assert children[0].tail == " world"
+        # Zero-length phrase skipped → text is set as-is
+        assert parent.text == "hello world"
 
     def test_phrase_with_pitch_attribute(self) -> None:
         # Covers `if phrase.pitch is not None: phrase_attrs["pitch"] = ...` (line 113-114).