diff --git a/synthbanshee/tts/ssml_builder.py b/synthbanshee/tts/ssml_builder.py index 57de60d..5df230a 100644 --- a/synthbanshee/tts/ssml_builder.py +++ b/synthbanshee/tts/ssml_builder.py @@ -6,8 +6,6 @@ ``supports_style_tags=False``) - elements for rate, pitch, and volume control - Nested per-phrase + elements (M2b) - - Inter-word ```` elements to prevent Hebrew word - merging (#62) Azure SSML reference: https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice @@ -28,12 +26,6 @@ _MSTTS_XMLNS = "http://www.w3.org/2001/mstts" _SPEAK_LANG = "he-IL" -# Inter-word break duration in milliseconds. 50 ms is the initial estimate -# for signalling a word boundary to Azure / Google he-IL without introducing -# an audible pause. This value needs empirical validation with real TTS -# output — it may need per-provider tuning if engines respond differently. -_WORD_BREAK_MS = 50 - # Azure prosody attribute limits (documented ranges). _AZURE_RATE_MIN_PCT = -50 # rate="-50%" → 0.5x _AZURE_RATE_MAX_PCT = 200 # rate="+200%" → 3.0x @@ -57,74 +49,6 @@ def _sanitize_text(text: str) -> str: return _XML_INVALID_CHARS_RE.sub("", text) -def _inject_word_breaks( - parent: ET.Element, - text: str, - after: ET.Element | None = None, -) -> ET.Element | None: - """Insert *text* into *parent* with ```` tags between words. - - Hebrew TTS engines (Azure he-IL, Google Chirp) merge adjacent words into - unintelligible speech when no explicit boundary cue exists. This function - splits *text* on whitespace and inserts a short ```` element between - every pair of consecutive words. - - Args: - parent: The XML element to add content to. - text: The text to inject (may contain multi-word Hebrew). - after: If provided, the first text chunk is appended to - ``after.tail`` instead of ``parent.text``. - - Returns: - The last child element added to *parent*, or *after* if no ```` - elements were created (single word or empty text). - """ - if not text or not text.strip(): - # Pure whitespace or empty — append as-is without inserting breaks. - if text: - if after is None: - parent.text = (parent.text or "") + text - else: - after.tail = (after.tail or "") + text - return after - - words = text.split() - last = after - - # Preserve any leading whitespace (e.g. space before a text fragment - # that follows a or element). - leading = text[: len(text) - len(text.lstrip())] - if leading: - if last is None: - parent.text = (parent.text or "") + leading - else: - last.tail = (last.tail or "") + leading - - for i, word in enumerate(words): - if i == 0: - # First word: append directly (no break needed before it). - if last is None: - parent.text = (parent.text or "") + word - else: - last.tail = (last.tail or "") + word - else: - # Subsequent words: insert then the word with a - # leading space in the tail to preserve normal spacing. - brk = ET.SubElement(parent, "break", attrib={"time": f"{_WORD_BREAK_MS}ms"}) - brk.tail = " " + word - last = brk - - # Preserve any trailing whitespace. - trailing = text[len(text.rstrip()) :] - if trailing: - if last is None: - parent.text = (parent.text or "") + trailing - else: - last.tail = (last.tail or "") + trailing - - return last - - @dataclass class UtteranceSpec: """Specifies one TTS utterance to be included in an SSML document.""" @@ -188,10 +112,13 @@ def _apply_phrase_prosody( Splits *text* around phrase spans and wraps each phrase in a nested ```` element with optional ```` elements - inserted before and/or after. Inter-word ```` elements are - inserted within each text fragment to prevent Hebrew word merging (#62). - Overlapping spans are skipped (the span whose ``char_start`` falls - before the previous span's ``char_end`` is silently dropped). + inserted before and/or after. Adjacent ```` elements are merged + (durations summed) to avoid Azure SSML parser error 0x80045003 (#67): + when a phrase carries ``break_after_ms`` and the next phrase carries + ``break_before_ms``, two ```` siblings would otherwise appear + back-to-back. Overlapping spans are skipped (the span whose + ``char_start`` falls before the previous span's ``char_end`` is silently + dropped). Args: parent: The XML element that will receive the mixed text/element content. @@ -202,16 +129,26 @@ def _apply_phrase_prosody( prev: ET.Element | None = None def _append_text(s: str) -> None: - """Append *s* with inter-word ```` elements.""" nonlocal prev if not s: return - result = _inject_word_breaks(parent, s, after=prev) - if result is not None: - prev = result + if prev is None: + parent.text = (parent.text or "") + s + else: + prev.tail = (prev.tail or "") + s def _append_break(ms: int) -> ET.Element: + """Add a ```` or merge into the preceding break. + + Azure rejects adjacent ```` siblings (#67); when the most + recently appended element is itself a ````, we sum the + durations into the existing element instead of creating a new one. + """ nonlocal prev + if prev is not None and prev.tag == "break": + prev_ms = int(prev.attrib.get("time", "0ms").replace("ms", "")) + prev.attrib["time"] = f"{prev_ms + ms}ms" + return prev el = ET.SubElement(parent, "break", attrib={"time": f"{ms}ms"}) prev = el return el @@ -227,23 +164,9 @@ def _append_break(ms: int) -> ET.Element: if phrase.char_start > cursor: _append_text(text[cursor : phrase.char_start]) - # Optional break before the phrase. Merge with the preceding - # element (if any) to avoid adjacent breaks that Azure's SSML - # parser rejects with error 0x80045003 (#67). + # Optional break before the phrase. if phrase.break_before_ms > 0: - if prev is not None and prev.tag == "break": - prev_time = prev.attrib.get("time", "0ms") - prev_ms = int(prev_time.replace("ms", "")) - if prev_ms == _WORD_BREAK_MS: - # Word-boundary break: replace with the phrase break - # (the phrase break subsumes the word-boundary intent). - prev.attrib["time"] = f"{phrase.break_before_ms}ms" - else: - # Semantic break (e.g. break_after from prior phrase): - # sum durations to preserve both intents. - prev.attrib["time"] = f"{prev_ms + phrase.break_before_ms}ms" - else: - _append_break(phrase.break_before_ms) + _append_break(phrase.break_before_ms) # Phrase-level prosody wrapper (omitted when only breaks are requested). phrase_attrs: dict[str, str] = {} @@ -257,7 +180,7 @@ def _append_break(ms: int) -> ET.Element: phrase_text = text[phrase.char_start : phrase.char_end] if phrase_attrs: pe = ET.SubElement(parent, "prosody", attrib=phrase_attrs) - _inject_word_breaks(pe, phrase_text) + pe.text = phrase_text prev = pe else: _append_text(phrase_text) @@ -350,7 +273,7 @@ def build_multi( if utt.phrase_prosody: _apply_phrase_prosody(inner, utt_text, utt.phrase_prosody) else: - _inject_word_breaks(inner, utt_text) + inner.text = utt_text raw = ET.tostring(speak, encoding="unicode", xml_declaration=False) return '\n' + raw diff --git a/tests/unit/test_phrase_prosody.py b/tests/unit/test_phrase_prosody.py index d730dd1..ce61794 100644 --- a/tests/unit/test_phrase_prosody.py +++ b/tests/unit/test_phrase_prosody.py @@ -333,13 +333,8 @@ class TestApplyPhraseProsody: def test_no_phrases_sets_text(self) -> None: parent = _make_parent() _apply_phrase_prosody(parent, "hello world", []) - # Inter-word splits text: parent.text = "hello", - # then with tail " world". - assert parent.text == "hello" - children = list(parent) - assert len(children) == 1 - assert children[0].tag == "break" - assert children[0].tail == " world" + assert parent.text == "hello world" + assert len(list(parent)) == 0 # no children def test_single_phrase_mid_text(self) -> None: parent = _make_parent() @@ -377,12 +372,8 @@ def test_zero_length_phrase_skipped(self) -> None: parent = _make_parent() phrase = PhraseProsody("p0", 3, 3) # zero-length span _apply_phrase_prosody(parent, "hello world", [phrase]) - # Zero-length phrase skipped → text set with inter-word breaks - assert parent.text == "hello" - children = list(parent) - assert len(children) == 1 - assert children[0].tag == "break" - assert children[0].tail == " world" + # Zero-length phrase skipped → text is set as-is + assert parent.text == "hello world" def test_phrase_with_pitch_attribute(self) -> None: # Covers `if phrase.pitch is not None: phrase_attrs["pitch"] = ...` (line 113-114). diff --git a/tests/unit/test_tts.py b/tests/unit/test_tts.py index f74cad8..1add25d 100644 --- a/tests/unit/test_tts.py +++ b/tests/unit/test_tts.py @@ -9,6 +9,7 @@ import struct import sys import wave +import xml.etree.ElementTree as ET from pathlib import Path from unittest.mock import MagicMock @@ -16,13 +17,13 @@ from synthbanshee.tts.azure_provider import AzureProvider from synthbanshee.tts.renderer import TTSRenderer from synthbanshee.tts.ssml_builder import ( - _WORD_BREAK_MS, SSMLBuilder, UtteranceSpec, - _inject_word_breaks, _rate_to_string, _semitones_to_percent, + _volume_to_string, ) +from synthbanshee.tts.ssml_types import PhraseProsody EXAMPLES_DIR = Path(__file__).parent.parent.parent / "configs" / "examples" @@ -137,210 +138,23 @@ def test_xml_is_well_formed(self): # --------------------------------------------------------------------------- -# Word-boundary break tests (#62) +# SSML invariants — sanitization, clamping, regression guard for #83 # --------------------------------------------------------------------------- -class TestWordBoundaryBreaks: - """Verify that multi-word Hebrew text produces inter-word tags.""" +class TestSSMLInvariants: + """Defense-in-depth invariants the SSML builder must hold. - def setup_method(self): - self.builder = SSMLBuilder() - - def _body(self, ssml: str) -> str: - """Strip XML declaration to get parseable SSML body.""" - return ssml.split("\n", 1)[1] if ssml.startswith(" elements between words.""" - utt = UtteranceSpec( - text="word1 word2 word3", - voice_id="he-IL-AvriNeural", - ) - ssml = self.builder.build_single(utt) - # Two word boundaries → two elements - assert ssml.count(f'time="{_WORD_BREAK_MS}ms"') == 2 - - def test_single_word_no_breaks(self): - """Single-word text must not contain any elements.""" - utt = UtteranceSpec( - text="hello", - voice_id="he-IL-AvriNeural", - ) - ssml = self.builder.build_single(utt) - assert f'time="{_WORD_BREAK_MS}ms"' not in ssml - - def test_text_preserved_after_breaks(self): - """All original words must appear in the serialised SSML.""" - utt = UtteranceSpec( - text="word1 word2 word3", - voice_id="he-IL-AvriNeural", - ) - ssml = self.builder.build_single(utt) - assert "word1" in ssml - assert "word2" in ssml - assert "word3" in ssml - - def test_breaks_inside_prosody(self): - """Word breaks must also appear when a wrapper is present.""" - utt = UtteranceSpec( - text="word1 word2", - voice_id="he-IL-AvriNeural", - rate_multiplier=1.2, - ) - ssml = self.builder.build_single(utt) - # One word boundary inside the prosody wrapper - assert ssml.count(f'time="{_WORD_BREAK_MS}ms"') == 1 - assert "prosody" in ssml - - def test_breaks_with_phrase_prosody(self): - """Word breaks must appear in text fragments around phrase prosody spans.""" - from synthbanshee.tts.ssml_types import PhraseProsody - - # "before1 before2 phrase1 phrase2 after1 after2" - # 0123456789... - # "before1"=0:7, " "=7, "before2"=8:15, " "=15, - # "phrase1"=16:23, " "=23, "phrase2"=24:31, " "=31, - # "after1"=32:38, " "=38, "after2"=39:45 - text = "before1 before2 phrase1 phrase2 after1 after2" - phrase = PhraseProsody( - phrase_id="p0", - char_start=16, # "phrase1 phrase2" - char_end=31, - rate="+10%", - ) - utt = UtteranceSpec( - text=text, - voice_id="he-IL-AvriNeural", - phrase_prosody=[phrase], - ) - ssml = self.builder.build_single(utt) - # All words must appear - for word in text.split(): - assert word in ssml, f"word {word!r} missing from SSML" - # Exact break count: 1 (before1↔before2) + 1 (phrase1↔phrase2 - # inside ) + 1 (after1↔after2) = 3 word-boundary breaks. - assert ssml.count(f'time="{_WORD_BREAK_MS}ms"') == 3 - - def test_hebrew_multi_word_text_has_breaks(self): - """Hebrew multi-word text must produce inter-word elements.""" - # Reproduces the core scenario from issue #62. - utt = UtteranceSpec( - text="\u05d4\u05d9\u05d9, \u05d7\u05e9\u05d1\u05ea\u05d9", - voice_id="he-IL-AvriNeural", - ) - ssml = self.builder.build_single(utt) - assert ssml.count(f'time="{_WORD_BREAK_MS}ms"') == 1 - # Both tokens must survive serialization intact. - assert "\u05d4\u05d9\u05d9," in ssml - assert "\u05d7\u05e9\u05d1\u05ea\u05d9" in ssml - - def test_hebrew_with_niqqud_preserved(self): - """Niqqud-bearing Hebrew words must not be corrupted by break injection.""" - # Two words, the first with niqqud (shin + shva + lamed + dagesh). - text = "\u05e9\u05b0\u05dc\u05d5\u05bc\u05dd \u05e2\u05d5\u05dc\u05dd" - utt = UtteranceSpec(text=text, voice_id="he-IL-AvriNeural") - ssml = self.builder.build_single(utt) - assert ssml.count(f'time="{_WORD_BREAK_MS}ms"') == 1 - assert "\u05e9\u05b0\u05dc\u05d5\u05bc\u05dd" in ssml - assert "\u05e2\u05d5\u05dc\u05dd" in ssml - - def test_text_roundtrip_preserves_content(self): - """Serialise → parse → extract text: all content must match the input.""" - import xml.etree.ElementTree as ET + These assertions catch regressions in three orthogonal hardenings that + were lost when PR #70 + #71 were initially reverted as a bundle for + #83, then restored individually: - text = "one two three four" - utt = UtteranceSpec( - text=text, - voice_id="he-IL-AvriNeural", - ) - ssml = self.builder.build_single(utt) - root = ET.fromstring(self._body(ssml)) - - # Walk the tree and collect every text/tail fragment. - fragments: list[str] = [] - - def _collect(el: ET.Element) -> None: - if el.text: - fragments.append(el.text) - for child in el: - _collect(child) - if child.tail: - fragments.append(child.tail) - - _collect(root) - recovered = "".join(fragments) - # Recovered text (ignoring break elements) must equal the original. - assert recovered == text - - def test_hebrew_text_roundtrip(self): - """Hebrew content must survive the SSML serialise → parse roundtrip.""" - import xml.etree.ElementTree as ET - - text = ( - "\u05d0\u05d2\u05d1 \u05e0\u05d9\u05e1\u05d9\u05ea\u05d9" - " \u05dc\u05d3\u05d1\u05e8 \u05d0\u05d9\u05ea\u05da" - ) - utt = UtteranceSpec(text=text, voice_id="he-IL-HilaNeural") - ssml = self.builder.build_single(utt) - root = ET.fromstring(self._body(ssml)) - - fragments: list[str] = [] - - def _collect(el: ET.Element) -> None: - if el.text: - fragments.append(el.text) - for child in el: - _collect(child) - if child.tail: - fragments.append(child.tail) - - _collect(root) - assert "".join(fragments) == text - - def test_xml_well_formed_with_breaks(self): - """SSML with word breaks must remain valid XML.""" - import xml.etree.ElementTree as ET - - utt = UtteranceSpec( - text="word1 word2 word3 word4", - voice_id="he-IL-AvriNeural", - style="angry", - rate_multiplier=1.1, - ) - ssml = self.builder.build_single(utt) - ET.fromstring(self._body(ssml)) # Should not raise - - def test_inject_word_breaks_empty_text(self): - """_inject_word_breaks with empty text should not create elements.""" - import xml.etree.ElementTree as ET - - parent = ET.Element("test") - result = _inject_word_breaks(parent, "") - assert result is None - assert len(list(parent)) == 0 - - def test_inject_word_breaks_whitespace_only(self): - """_inject_word_breaks with whitespace-only text should preserve it.""" - import xml.etree.ElementTree as ET - - parent = ET.Element("test") - result = _inject_word_breaks(parent, " ") - assert result is None - assert parent.text == " " - - -# --------------------------------------------------------------------------- -# SSML parse-error regression tests (#67) -# --------------------------------------------------------------------------- - - -class TestSSMLParseErrorFix67: - """Regression tests for Azure SSML parsing error 0x80045003. - - The root cause is adjacent elements created when inter-word breaks - (PR #70) interact with phrase prosody break_before/break_after attributes. - Azure's SSML parser rejects adjacent breaks as malformed. + - XML 1.0 control-character sanitization (originally from #71). + - Azure-range prosody clamping (originally from #71). + - Adjacent ```` merging in ``_apply_phrase_prosody`` to avoid + Azure error 0x80045003 (originally from #71, narrowed to the + phrase-after / phrase-before case after #70 was reverted). + - The ``no per-word tags`` rule pinned to #83. """ def setup_method(self): @@ -349,104 +163,58 @@ def setup_method(self): def _body(self, ssml: str) -> str: return ssml.split("\n", 1)[1] if ssml.startswith(" elements should be adjacent (only whitespace between). - adjacent_breaks = re.findall(r"\s*`` tags. + + Pinned to #83: per-word ```` insertion (PR #70) + tripped Whisper's silence-detection / segmentation heuristic and + produced a 6× WER regression on Tier A clips. Any future Hebrew + word-merge mitigation (#62) must not re-introduce per-word breaks. + """ utt = UtteranceSpec( - text=text, + text="one two three four five six seven eight", voice_id="he-IL-AvriNeural", - rate_multiplier=1.14, - pitch_delta_st=2.0, - volume_delta_db=13.0, - phrase_prosody=[phrase], ) ssml = self.builder.build_single(utt, supports_style_tags=False) + assert " tags — see #83." - # Must be valid XML. - import xml.etree.ElementTree as ET - - ET.fromstring(self._body(ssml)) + def test_adjacent_phrase_breaks_are_merged(self): + """break_after of one phrase + break_before of the next must merge. - # No adjacent breaks. - adjacent_breaks = re.findall(r"\s*\s*\s* elements separated only by whitespace. + import re - # Must parse as valid XML. - ET.fromstring(body) + adjacent = re.findall(r"\s*