From 99eeb84fde3840c4e4e2cf6ffe1f74eeb6d12cdc Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Tue, 5 May 2026 23:29:29 +0300 Subject: [PATCH 1/3] Revert "fix(tts): Azure SSML parsing error on adjacent break elements (#67) (#71)" This reverts commit 0bdb217fc5e83a11a9c4161c0178bbb4f8c3ebc2. --- synthbanshee/tts/ssml_builder.py | 83 ++------- tests/unit/test_tts.py | 290 ------------------------------- 2 files changed, 10 insertions(+), 363 deletions(-) diff --git a/synthbanshee/tts/ssml_builder.py b/synthbanshee/tts/ssml_builder.py index 57de60d..ed07039 100644 --- a/synthbanshee/tts/ssml_builder.py +++ b/synthbanshee/tts/ssml_builder.py @@ -15,15 +15,11 @@ from __future__ import annotations -import logging -import re import xml.etree.ElementTree as ET from dataclasses import dataclass, field from synthbanshee.tts.ssml_types import PhraseProsody -_log = logging.getLogger(__name__) - _AZURE_XMLNS = "http://www.w3.org/2001/10/synthesis" _MSTTS_XMLNS = "http://www.w3.org/2001/mstts" _SPEAK_LANG = "he-IL" @@ -34,28 +30,6 @@ # output — it may need per-provider tuning if engines respond differently. _WORD_BREAK_MS = 50 -# Azure prosody attribute limits (documented ranges). -_AZURE_RATE_MIN_PCT = -50 # rate="-50%" → 0.5x -_AZURE_RATE_MAX_PCT = 200 # rate="+200%" → 3.0x -_AZURE_PITCH_MIN_PCT = -50 -_AZURE_PITCH_MAX_PCT = 50 -_AZURE_VOLUME_MIN_PCT = -50 -_AZURE_VOLUME_MAX_PCT = 50 - -# Characters invalid in XML 1.0: U+0000–U+0008, U+000B, U+000C, U+000E–U+001F. -# These must be stripped before embedding text in SSML. -_XML_INVALID_CHARS_RE = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]") - - -def _sanitize_text(text: str) -> str: - """Remove characters that are invalid in XML 1.0 from *text*. - - Defense-in-depth: ideally invalid chars should be rejected at the LLM - response parsing boundary (script/generator.py). This guard ensures the - SSML builder never produces unparseable XML regardless of upstream bugs. 
- """ - return _XML_INVALID_CHARS_RE.sub("", text) - def _inject_word_breaks( parent: ET.Element, @@ -139,44 +113,23 @@ class UtteranceSpec: def _semitones_to_percent(st: float) -> str: - """Convert a semitone shift to the Azure pitch % format (e.g. '+5%' / '-10%'). - - Values are clamped to Azure's documented ±50% range. A warning is logged - when clamping activates — this indicates a speaker config or state-drift bug. - """ + """Convert a semitone shift to the Azure pitch % format (e.g. '+5%' / '-10%').""" # Approximation: 1 semitone ≈ 5.946% pitch change pct = round(st * 5.946) - clamped = max(_AZURE_PITCH_MIN_PCT, min(_AZURE_PITCH_MAX_PCT, pct)) - if clamped != pct: - _log.warning("Pitch %+d%% exceeds Azure range; clamped to %+d%%", pct, clamped) - return f"+{clamped}%" if clamped >= 0 else f"{clamped}%" + return f"+{pct}%" if pct >= 0 else f"{pct}%" def _rate_to_string(rate: float) -> str: - """Format a rate multiplier as a percentage string for . - - Values are clamped to Azure's documented -50% to +200% range. A warning is - logged when clamping activates. - """ + """Format a rate multiplier as a percentage string for .""" pct = round((rate - 1.0) * 100) - clamped = max(_AZURE_RATE_MIN_PCT, min(_AZURE_RATE_MAX_PCT, pct)) - if clamped != pct: - _log.warning("Rate %+d%% exceeds Azure range; clamped to %+d%%", pct, clamped) - return f"+{clamped}%" if clamped >= 0 else f"{clamped}%" + return f"+{pct}%" if pct >= 0 else f"{pct}%" def _volume_to_string(db: float) -> str: - """Format a dB volume offset as a percentage for . - - Values are clamped to Azure's documented ±50% range. A warning is logged - when clamping activates. - """ + """Format a dB volume offset as a percentage for .""" # Azure volume is 0–100; default is 100. Map dB linearly (rough). 
pct = round(db * 1.0) - clamped = max(_AZURE_VOLUME_MIN_PCT, min(_AZURE_VOLUME_MAX_PCT, pct)) - if clamped != pct: - _log.warning("Volume %+d%% exceeds Azure range; clamped to %+d%%", pct, clamped) - return f"+{clamped}%" if clamped >= 0 else f"{clamped}%" + return f"+{pct}%" if pct >= 0 else f"{pct}%" def _apply_phrase_prosody( @@ -227,23 +180,9 @@ def _append_break(ms: int) -> ET.Element: if phrase.char_start > cursor: _append_text(text[cursor : phrase.char_start]) - # Optional break before the phrase. Merge with the preceding - # element (if any) to avoid adjacent breaks that Azure's SSML - # parser rejects with error 0x80045003 (#67). + # Optional break before the phrase. if phrase.break_before_ms > 0: - if prev is not None and prev.tag == "break": - prev_time = prev.attrib.get("time", "0ms") - prev_ms = int(prev_time.replace("ms", "")) - if prev_ms == _WORD_BREAK_MS: - # Word-boundary break: replace with the phrase break - # (the phrase break subsumes the word-boundary intent). - prev.attrib["time"] = f"{phrase.break_before_ms}ms" - else: - # Semantic break (e.g. break_after from prior phrase): - # sum durations to preserve both intents. - prev.attrib["time"] = f"{prev_ms + phrase.break_before_ms}ms" - else: - _append_break(phrase.break_before_ms) + _append_break(phrase.break_before_ms) # Phrase-level prosody wrapper (omitted when only breaks are requested). phrase_attrs: dict[str, str] = {} @@ -317,8 +256,6 @@ def build_multi( speak = ET.Element("speak", attrib=speak_attribs) for utt in utterances: - # Sanitize text: strip characters invalid in XML 1.0 (#67). 
- utt_text = _sanitize_text(utt.text) voice = ET.SubElement(speak, "voice", attrib={"name": utt.voice_id}) # Add express-as only when a non-default style is requested AND @@ -348,9 +285,9 @@ def build_multi( inner = parent if utt.phrase_prosody: - _apply_phrase_prosody(inner, utt_text, utt.phrase_prosody) + _apply_phrase_prosody(inner, utt.text, utt.phrase_prosody) else: - _inject_word_breaks(inner, utt_text) + _inject_word_breaks(inner, utt.text) raw = ET.tostring(speak, encoding="unicode", xml_declaration=False) return '\n' + raw diff --git a/tests/unit/test_tts.py b/tests/unit/test_tts.py index f74cad8..29c11c7 100644 --- a/tests/unit/test_tts.py +++ b/tests/unit/test_tts.py @@ -330,296 +330,6 @@ def test_inject_word_breaks_whitespace_only(self): assert parent.text == " " -# --------------------------------------------------------------------------- -# SSML parse-error regression tests (#67) -# --------------------------------------------------------------------------- - - -class TestSSMLParseErrorFix67: - """Regression tests for Azure SSML parsing error 0x80045003. - - The root cause is adjacent elements created when inter-word breaks - (PR #70) interact with phrase prosody break_before/break_after attributes. - Azure's SSML parser rejects adjacent breaks as malformed. - """ - - def setup_method(self): - self.builder = SSMLBuilder() - - def _body(self, ssml: str) -> str: - return ssml.split("\n", 1)[1] if ssml.startswith(" elements should be adjacent (only whitespace between). - adjacent_breaks = re.findall(r"\s*\s*\s*\s* Date: Tue, 5 May 2026 23:29:29 +0300 Subject: [PATCH 2/3] Revert "fix(tts): insert inter-word tags to prevent Hebrew word merging (#70)" This reverts commit d0c273bbc431a49a3818b973b41745c9a4a51cec. 
--- synthbanshee/tts/ssml_builder.py | 95 ++------------- tests/unit/test_phrase_prosody.py | 17 +-- tests/unit/test_tts.py | 196 ------------------------------ 3 files changed, 13 insertions(+), 295 deletions(-) diff --git a/synthbanshee/tts/ssml_builder.py b/synthbanshee/tts/ssml_builder.py index ed07039..9d987e2 100644 --- a/synthbanshee/tts/ssml_builder.py +++ b/synthbanshee/tts/ssml_builder.py @@ -6,8 +6,6 @@ ``supports_style_tags=False``) - elements for rate, pitch, and volume control - Nested per-phrase + elements (M2b) - - Inter-word ```` elements to prevent Hebrew word - merging (#62) Azure SSML reference: https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice @@ -24,80 +22,6 @@ _MSTTS_XMLNS = "http://www.w3.org/2001/mstts" _SPEAK_LANG = "he-IL" -# Inter-word break duration in milliseconds. 50 ms is the initial estimate -# for signalling a word boundary to Azure / Google he-IL without introducing -# an audible pause. This value needs empirical validation with real TTS -# output — it may need per-provider tuning if engines respond differently. -_WORD_BREAK_MS = 50 - - -def _inject_word_breaks( - parent: ET.Element, - text: str, - after: ET.Element | None = None, -) -> ET.Element | None: - """Insert *text* into *parent* with ```` tags between words. - - Hebrew TTS engines (Azure he-IL, Google Chirp) merge adjacent words into - unintelligible speech when no explicit boundary cue exists. This function - splits *text* on whitespace and inserts a short ```` element between - every pair of consecutive words. - - Args: - parent: The XML element to add content to. - text: The text to inject (may contain multi-word Hebrew). - after: If provided, the first text chunk is appended to - ``after.tail`` instead of ``parent.text``. - - Returns: - The last child element added to *parent*, or *after* if no ```` - elements were created (single word or empty text). 
- """ - if not text or not text.strip(): - # Pure whitespace or empty — append as-is without inserting breaks. - if text: - if after is None: - parent.text = (parent.text or "") + text - else: - after.tail = (after.tail or "") + text - return after - - words = text.split() - last = after - - # Preserve any leading whitespace (e.g. space before a text fragment - # that follows a or element). - leading = text[: len(text) - len(text.lstrip())] - if leading: - if last is None: - parent.text = (parent.text or "") + leading - else: - last.tail = (last.tail or "") + leading - - for i, word in enumerate(words): - if i == 0: - # First word: append directly (no break needed before it). - if last is None: - parent.text = (parent.text or "") + word - else: - last.tail = (last.tail or "") + word - else: - # Subsequent words: insert then the word with a - # leading space in the tail to preserve normal spacing. - brk = ET.SubElement(parent, "break", attrib={"time": f"{_WORD_BREAK_MS}ms"}) - brk.tail = " " + word - last = brk - - # Preserve any trailing whitespace. - trailing = text[len(text.rstrip()) :] - if trailing: - if last is None: - parent.text = (parent.text or "") + trailing - else: - last.tail = (last.tail or "") + trailing - - return last - @dataclass class UtteranceSpec: @@ -141,10 +65,9 @@ def _apply_phrase_prosody( Splits *text* around phrase spans and wraps each phrase in a nested ```` element with optional ```` elements - inserted before and/or after. Inter-word ```` elements are - inserted within each text fragment to prevent Hebrew word merging (#62). - Overlapping spans are skipped (the span whose ``char_start`` falls - before the previous span's ``char_end`` is silently dropped). + inserted before and/or after. Overlapping spans are skipped (the span + whose ``char_start`` falls before the previous span's ``char_end`` is + silently dropped). Args: parent: The XML element that will receive the mixed text/element content. 
@@ -155,13 +78,13 @@ def _apply_phrase_prosody( prev: ET.Element | None = None def _append_text(s: str) -> None: - """Append *s* with inter-word ```` elements.""" nonlocal prev if not s: return - result = _inject_word_breaks(parent, s, after=prev) - if result is not None: - prev = result + if prev is None: + parent.text = (parent.text or "") + s + else: + prev.tail = (prev.tail or "") + s def _append_break(ms: int) -> ET.Element: nonlocal prev @@ -196,7 +119,7 @@ def _append_break(ms: int) -> ET.Element: phrase_text = text[phrase.char_start : phrase.char_end] if phrase_attrs: pe = ET.SubElement(parent, "prosody", attrib=phrase_attrs) - _inject_word_breaks(pe, phrase_text) + pe.text = phrase_text prev = pe else: _append_text(phrase_text) @@ -287,7 +210,7 @@ def build_multi( if utt.phrase_prosody: _apply_phrase_prosody(inner, utt.text, utt.phrase_prosody) else: - _inject_word_breaks(inner, utt.text) + inner.text = utt.text raw = ET.tostring(speak, encoding="unicode", xml_declaration=False) return '\n' + raw diff --git a/tests/unit/test_phrase_prosody.py b/tests/unit/test_phrase_prosody.py index d730dd1..ce61794 100644 --- a/tests/unit/test_phrase_prosody.py +++ b/tests/unit/test_phrase_prosody.py @@ -333,13 +333,8 @@ class TestApplyPhraseProsody: def test_no_phrases_sets_text(self) -> None: parent = _make_parent() _apply_phrase_prosody(parent, "hello world", []) - # Inter-word splits text: parent.text = "hello", - # then with tail " world". 
- assert parent.text == "hello" - children = list(parent) - assert len(children) == 1 - assert children[0].tag == "break" - assert children[0].tail == " world" + assert parent.text == "hello world" + assert len(list(parent)) == 0 # no children def test_single_phrase_mid_text(self) -> None: parent = _make_parent() @@ -377,12 +372,8 @@ def test_zero_length_phrase_skipped(self) -> None: parent = _make_parent() phrase = PhraseProsody("p0", 3, 3) # zero-length span _apply_phrase_prosody(parent, "hello world", [phrase]) - # Zero-length phrase skipped → text set with inter-word breaks - assert parent.text == "hello" - children = list(parent) - assert len(children) == 1 - assert children[0].tag == "break" - assert children[0].tail == " world" + # Zero-length phrase skipped → text is set as-is + assert parent.text == "hello world" def test_phrase_with_pitch_attribute(self) -> None: # Covers `if phrase.pitch is not None: phrase_attrs["pitch"] = ...` (line 113-114). diff --git a/tests/unit/test_tts.py b/tests/unit/test_tts.py index 29c11c7..0f1ed74 100644 --- a/tests/unit/test_tts.py +++ b/tests/unit/test_tts.py @@ -16,10 +16,8 @@ from synthbanshee.tts.azure_provider import AzureProvider from synthbanshee.tts.renderer import TTSRenderer from synthbanshee.tts.ssml_builder import ( - _WORD_BREAK_MS, SSMLBuilder, UtteranceSpec, - _inject_word_breaks, _rate_to_string, _semitones_to_percent, ) @@ -136,200 +134,6 @@ def test_xml_is_well_formed(self): ET.fromstring(ssml_body) # Should not raise -# --------------------------------------------------------------------------- -# Word-boundary break tests (#62) -# --------------------------------------------------------------------------- - - -class TestWordBoundaryBreaks: - """Verify that multi-word Hebrew text produces inter-word tags.""" - - def setup_method(self): - self.builder = SSMLBuilder() - - def _body(self, ssml: str) -> str: - """Strip XML declaration to get parseable SSML body.""" - return ssml.split("\n", 1)[1] if 
ssml.startswith(" elements between words.""" - utt = UtteranceSpec( - text="word1 word2 word3", - voice_id="he-IL-AvriNeural", - ) - ssml = self.builder.build_single(utt) - # Two word boundaries → two elements - assert ssml.count(f'time="{_WORD_BREAK_MS}ms"') == 2 - - def test_single_word_no_breaks(self): - """Single-word text must not contain any elements.""" - utt = UtteranceSpec( - text="hello", - voice_id="he-IL-AvriNeural", - ) - ssml = self.builder.build_single(utt) - assert f'time="{_WORD_BREAK_MS}ms"' not in ssml - - def test_text_preserved_after_breaks(self): - """All original words must appear in the serialised SSML.""" - utt = UtteranceSpec( - text="word1 word2 word3", - voice_id="he-IL-AvriNeural", - ) - ssml = self.builder.build_single(utt) - assert "word1" in ssml - assert "word2" in ssml - assert "word3" in ssml - - def test_breaks_inside_prosody(self): - """Word breaks must also appear when a wrapper is present.""" - utt = UtteranceSpec( - text="word1 word2", - voice_id="he-IL-AvriNeural", - rate_multiplier=1.2, - ) - ssml = self.builder.build_single(utt) - # One word boundary inside the prosody wrapper - assert ssml.count(f'time="{_WORD_BREAK_MS}ms"') == 1 - assert "prosody" in ssml - - def test_breaks_with_phrase_prosody(self): - """Word breaks must appear in text fragments around phrase prosody spans.""" - from synthbanshee.tts.ssml_types import PhraseProsody - - # "before1 before2 phrase1 phrase2 after1 after2" - # 0123456789... 
- # "before1"=0:7, " "=7, "before2"=8:15, " "=15, - # "phrase1"=16:23, " "=23, "phrase2"=24:31, " "=31, - # "after1"=32:38, " "=38, "after2"=39:45 - text = "before1 before2 phrase1 phrase2 after1 after2" - phrase = PhraseProsody( - phrase_id="p0", - char_start=16, # "phrase1 phrase2" - char_end=31, - rate="+10%", - ) - utt = UtteranceSpec( - text=text, - voice_id="he-IL-AvriNeural", - phrase_prosody=[phrase], - ) - ssml = self.builder.build_single(utt) - # All words must appear - for word in text.split(): - assert word in ssml, f"word {word!r} missing from SSML" - # Exact break count: 1 (before1↔before2) + 1 (phrase1↔phrase2 - # inside ) + 1 (after1↔after2) = 3 word-boundary breaks. - assert ssml.count(f'time="{_WORD_BREAK_MS}ms"') == 3 - - def test_hebrew_multi_word_text_has_breaks(self): - """Hebrew multi-word text must produce inter-word elements.""" - # Reproduces the core scenario from issue #62. - utt = UtteranceSpec( - text="\u05d4\u05d9\u05d9, \u05d7\u05e9\u05d1\u05ea\u05d9", - voice_id="he-IL-AvriNeural", - ) - ssml = self.builder.build_single(utt) - assert ssml.count(f'time="{_WORD_BREAK_MS}ms"') == 1 - # Both tokens must survive serialization intact. - assert "\u05d4\u05d9\u05d9," in ssml - assert "\u05d7\u05e9\u05d1\u05ea\u05d9" in ssml - - def test_hebrew_with_niqqud_preserved(self): - """Niqqud-bearing Hebrew words must not be corrupted by break injection.""" - # Two words, the first with niqqud (shin + shva + lamed + dagesh). 
- text = "\u05e9\u05b0\u05dc\u05d5\u05bc\u05dd \u05e2\u05d5\u05dc\u05dd" - utt = UtteranceSpec(text=text, voice_id="he-IL-AvriNeural") - ssml = self.builder.build_single(utt) - assert ssml.count(f'time="{_WORD_BREAK_MS}ms"') == 1 - assert "\u05e9\u05b0\u05dc\u05d5\u05bc\u05dd" in ssml - assert "\u05e2\u05d5\u05dc\u05dd" in ssml - - def test_text_roundtrip_preserves_content(self): - """Serialise → parse → extract text: all content must match the input.""" - import xml.etree.ElementTree as ET - - text = "one two three four" - utt = UtteranceSpec( - text=text, - voice_id="he-IL-AvriNeural", - ) - ssml = self.builder.build_single(utt) - root = ET.fromstring(self._body(ssml)) - - # Walk the tree and collect every text/tail fragment. - fragments: list[str] = [] - - def _collect(el: ET.Element) -> None: - if el.text: - fragments.append(el.text) - for child in el: - _collect(child) - if child.tail: - fragments.append(child.tail) - - _collect(root) - recovered = "".join(fragments) - # Recovered text (ignoring break elements) must equal the original. 
- assert recovered == text - - def test_hebrew_text_roundtrip(self): - """Hebrew content must survive the SSML serialise → parse roundtrip.""" - import xml.etree.ElementTree as ET - - text = ( - "\u05d0\u05d2\u05d1 \u05e0\u05d9\u05e1\u05d9\u05ea\u05d9" - " \u05dc\u05d3\u05d1\u05e8 \u05d0\u05d9\u05ea\u05da" - ) - utt = UtteranceSpec(text=text, voice_id="he-IL-HilaNeural") - ssml = self.builder.build_single(utt) - root = ET.fromstring(self._body(ssml)) - - fragments: list[str] = [] - - def _collect(el: ET.Element) -> None: - if el.text: - fragments.append(el.text) - for child in el: - _collect(child) - if child.tail: - fragments.append(child.tail) - - _collect(root) - assert "".join(fragments) == text - - def test_xml_well_formed_with_breaks(self): - """SSML with word breaks must remain valid XML.""" - import xml.etree.ElementTree as ET - - utt = UtteranceSpec( - text="word1 word2 word3 word4", - voice_id="he-IL-AvriNeural", - style="angry", - rate_multiplier=1.1, - ) - ssml = self.builder.build_single(utt) - ET.fromstring(self._body(ssml)) # Should not raise - - def test_inject_word_breaks_empty_text(self): - """_inject_word_breaks with empty text should not create elements.""" - import xml.etree.ElementTree as ET - - parent = ET.Element("test") - result = _inject_word_breaks(parent, "") - assert result is None - assert len(list(parent)) == 0 - - def test_inject_word_breaks_whitespace_only(self): - """_inject_word_breaks with whitespace-only text should preserve it.""" - import xml.etree.ElementTree as ET - - parent = ET.Element("test") - result = _inject_word_breaks(parent, " ") - assert result is None - assert parent.text == " " - - # --------------------------------------------------------------------------- # AzureProvider tests (mocked) # --------------------------------------------------------------------------- From df651981211de9d2d8db65caf992a31dac8f58a4 Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Wed, 6 May 2026 00:19:12 +0300 Subject: [PATCH 3/3] 
restore(tts): reinstate hardenings from #71 unrelated to #70 + add #83 regression test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The initial revert in this branch (revert PR #70 + PR #71 wholesale) was too aggressive: PR #71 bundled three independent hardenings, only one of which was caused by PR #70. This commit restores the two hardenings that have nothing to do with inter-word break injection, and narrows the third to the residual case that survives #70's revert. Also adds the #83 regression test that pins the bisect finding. Restored from #71 (and explicitly verified to NOT re-introduce #83): - `_sanitize_text` + `_XML_INVALID_CHARS_RE` regex. Defends against an LLM emitting XML 1.0 control characters that otherwise make the SSML unparseable by Azure. Independent bug class from per-word breaks. - Azure-range prosody clamping in `_semitones_to_percent`, `_rate_to_string`, `_volume_to_string`, plus warning logs on clamp activation. `speaker_BYS_F_6-10_001.yaml` ships `pitch_delta_st=+9` → +54% unclamped, which Azure rejects. Independent bug class. - Adjacent `<break>` merging in `_apply_phrase_prosody`, narrowed to the phrase-after / phrase-before case (the only adjacent-break source that survives #70's revert). The original #71 logic also had a word-break branch that is no longer reachable. Added: - `test_no_per_word_breaks_in_default_ssml` regression test pinned to #83. The default multi-word SSML must not contain `<break>` tags; per-word break injection (PR #70) tripped Whisper's silence-detection heuristic and produced the WER regression. Any future Hebrew word-merge mitigation (#62) must not re-introduce per-word breaks. - `test_text_with_invalid_xml_chars_sanitized`, `test_prosody_pitch_clamped_to_azure_range`, `test_prosody_rate_clamped_to_azure_range`, `test_prosody_volume_clamped_to_azure_range`, `test_adjacent_phrase_breaks_are_merged` — pin the restored hardenings.
All three of these were independently flagged by Copilot's review on this PR (resolves three Copilot review threads). Co-Authored-By: Claude Opus 4.7 --- synthbanshee/tts/ssml_builder.py | 85 +++++++++++++++++++--- tests/unit/test_tts.py | 118 +++++++++++++++++++++++++++++++ 2 files changed, 192 insertions(+), 11 deletions(-) diff --git a/synthbanshee/tts/ssml_builder.py b/synthbanshee/tts/ssml_builder.py index 9d987e2..5df230a 100644 --- a/synthbanshee/tts/ssml_builder.py +++ b/synthbanshee/tts/ssml_builder.py @@ -13,15 +13,41 @@ from __future__ import annotations +import logging +import re import xml.etree.ElementTree as ET from dataclasses import dataclass, field from synthbanshee.tts.ssml_types import PhraseProsody +_log = logging.getLogger(__name__) + _AZURE_XMLNS = "http://www.w3.org/2001/10/synthesis" _MSTTS_XMLNS = "http://www.w3.org/2001/mstts" _SPEAK_LANG = "he-IL" +# Azure prosody attribute limits (documented ranges). +_AZURE_RATE_MIN_PCT = -50 # rate="-50%" → 0.5x +_AZURE_RATE_MAX_PCT = 200 # rate="+200%" → 3.0x +_AZURE_PITCH_MIN_PCT = -50 +_AZURE_PITCH_MAX_PCT = 50 +_AZURE_VOLUME_MIN_PCT = -50 +_AZURE_VOLUME_MAX_PCT = 50 + +# Characters invalid in XML 1.0: U+0000–U+0008, U+000B, U+000C, U+000E–U+001F. +# These must be stripped before embedding text in SSML. +_XML_INVALID_CHARS_RE = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]") + + +def _sanitize_text(text: str) -> str: + """Remove characters that are invalid in XML 1.0 from *text*. + + Defense-in-depth: ideally invalid chars should be rejected at the LLM + response parsing boundary (script/generator.py). This guard ensures the + SSML builder never produces unparseable XML regardless of upstream bugs. + """ + return _XML_INVALID_CHARS_RE.sub("", text) + @dataclass class UtteranceSpec: @@ -37,23 +63,44 @@ class UtteranceSpec: def _semitones_to_percent(st: float) -> str: - """Convert a semitone shift to the Azure pitch % format (e.g. 
'+5%' / '-10%').""" + """Convert a semitone shift to the Azure pitch % format (e.g. '+5%' / '-10%'). + + Values are clamped to Azure's documented ±50% range. A warning is logged + when clamping activates — this indicates a speaker config or state-drift bug. + """ # Approximation: 1 semitone ≈ 5.946% pitch change pct = round(st * 5.946) - return f"+{pct}%" if pct >= 0 else f"{pct}%" + clamped = max(_AZURE_PITCH_MIN_PCT, min(_AZURE_PITCH_MAX_PCT, pct)) + if clamped != pct: + _log.warning("Pitch %+d%% exceeds Azure range; clamped to %+d%%", pct, clamped) + return f"+{clamped}%" if clamped >= 0 else f"{clamped}%" def _rate_to_string(rate: float) -> str: - """Format a rate multiplier as a percentage string for .""" + """Format a rate multiplier as a percentage string for . + + Values are clamped to Azure's documented -50% to +200% range. A warning is + logged when clamping activates. + """ pct = round((rate - 1.0) * 100) - return f"+{pct}%" if pct >= 0 else f"{pct}%" + clamped = max(_AZURE_RATE_MIN_PCT, min(_AZURE_RATE_MAX_PCT, pct)) + if clamped != pct: + _log.warning("Rate %+d%% exceeds Azure range; clamped to %+d%%", pct, clamped) + return f"+{clamped}%" if clamped >= 0 else f"{clamped}%" def _volume_to_string(db: float) -> str: - """Format a dB volume offset as a percentage for .""" + """Format a dB volume offset as a percentage for . + + Values are clamped to Azure's documented ±50% range. A warning is logged + when clamping activates. + """ # Azure volume is 0–100; default is 100. Map dB linearly (rough). 
pct = round(db * 1.0) - return f"+{pct}%" if pct >= 0 else f"{pct}%" + clamped = max(_AZURE_VOLUME_MIN_PCT, min(_AZURE_VOLUME_MAX_PCT, pct)) + if clamped != pct: + _log.warning("Volume %+d%% exceeds Azure range; clamped to %+d%%", pct, clamped) + return f"+{clamped}%" if clamped >= 0 else f"{clamped}%" def _apply_phrase_prosody( @@ -65,9 +112,13 @@ def _apply_phrase_prosody( Splits *text* around phrase spans and wraps each phrase in a nested ```` element with optional ```` elements - inserted before and/or after. Overlapping spans are skipped (the span - whose ``char_start`` falls before the previous span's ``char_end`` is - silently dropped). + inserted before and/or after. Adjacent ```` elements are merged + (durations summed) to avoid Azure SSML parser error 0x80045003 (#67): + when a phrase carries ``break_after_ms`` and the next phrase carries + ``break_before_ms``, two ```` siblings would otherwise appear + back-to-back. Overlapping spans are skipped (the span whose + ``char_start`` falls before the previous span's ``char_end`` is silently + dropped). Args: parent: The XML element that will receive the mixed text/element content. @@ -87,7 +138,17 @@ def _append_text(s: str) -> None: prev.tail = (prev.tail or "") + s def _append_break(ms: int) -> ET.Element: + """Add a ```` or merge into the preceding break. + + Azure rejects adjacent ```` siblings (#67); when the most + recently appended element is itself a ````, we sum the + durations into the existing element instead of creating a new one. + """ nonlocal prev + if prev is not None and prev.tag == "break": + prev_ms = int(prev.attrib.get("time", "0ms").replace("ms", "")) + prev.attrib["time"] = f"{prev_ms + ms}ms" + return prev el = ET.SubElement(parent, "break", attrib={"time": f"{ms}ms"}) prev = el return el @@ -179,6 +240,8 @@ def build_multi( speak = ET.Element("speak", attrib=speak_attribs) for utt in utterances: + # Sanitize text: strip characters invalid in XML 1.0 (#67). 
+ utt_text = _sanitize_text(utt.text) voice = ET.SubElement(speak, "voice", attrib={"name": utt.voice_id}) # Add express-as only when a non-default style is requested AND @@ -208,9 +271,9 @@ def build_multi( inner = parent if utt.phrase_prosody: - _apply_phrase_prosody(inner, utt.text, utt.phrase_prosody) + _apply_phrase_prosody(inner, utt_text, utt.phrase_prosody) else: - inner.text = utt.text + inner.text = utt_text raw = ET.tostring(speak, encoding="unicode", xml_declaration=False) return '\n' + raw diff --git a/tests/unit/test_tts.py b/tests/unit/test_tts.py index 0f1ed74..1add25d 100644 --- a/tests/unit/test_tts.py +++ b/tests/unit/test_tts.py @@ -9,6 +9,7 @@ import struct import sys import wave +import xml.etree.ElementTree as ET from pathlib import Path from unittest.mock import MagicMock @@ -20,7 +21,9 @@ UtteranceSpec, _rate_to_string, _semitones_to_percent, + _volume_to_string, ) +from synthbanshee.tts.ssml_types import PhraseProsody EXAMPLES_DIR = Path(__file__).parent.parent.parent / "configs" / "examples" @@ -134,6 +137,121 @@ def test_xml_is_well_formed(self): ET.fromstring(ssml_body) # Should not raise +# --------------------------------------------------------------------------- +# SSML invariants — sanitization, clamping, regression guard for #83 +# --------------------------------------------------------------------------- + + +class TestSSMLInvariants: + """Defense-in-depth invariants the SSML builder must hold. + + These assertions catch regressions in three orthogonal hardenings that + were lost when PR #70 + #71 were initially reverted as a bundle for + #83, then restored individually: + + - XML 1.0 control-character sanitization (originally from #71). + - Azure-range prosody clamping (originally from #71). + - Adjacent ```` merging in ``_apply_phrase_prosody`` to avoid + Azure error 0x80045003 (originally from #71, narrowed to the + phrase-after / phrase-before case after #70 was reverted). + - The ``no per-word tags`` rule pinned to #83. 
+ """ + + def setup_method(self): + self.builder = SSMLBuilder() + + def _body(self, ssml: str) -> str: + return ssml.split("\n", 1)[1] if ssml.startswith("`` tags. + + Pinned to #83: per-word ```` insertion (PR #70) + tripped Whisper's silence-detection / segmentation heuristic and + produced a 6× WER regression on Tier A clips. Any future Hebrew + word-merge mitigation (#62) must not re-introduce per-word breaks. + """ + utt = UtteranceSpec( + text="one two three four five six seven eight", + voice_id="he-IL-AvriNeural", + ) + ssml = self.builder.build_single(utt, supports_style_tags=False) + assert " tags — see #83." + + def test_adjacent_phrase_breaks_are_merged(self): + """break_after of one phrase + break_before of the next must merge. + + Without merging, Azure rejects the SSML with parse error + 0x80045003 (#67). With #70 reverted, the only remaining adjacent- + break risk is between two consecutive phrases. + """ + text = "intro phrase_a end_a mid phrase_b outro" + phrases = [ + PhraseProsody( + phrase_id="p0", + char_start=6, + char_end=20, # "phrase_a end_a" + rate="-20%", + break_after_ms=250, + ), + PhraseProsody( + phrase_id="p1", + char_start=25, + char_end=33, # "phrase_b" + rate="-25%", + break_before_ms=300, + ), + ] + utt = UtteranceSpec( + text=text, + voice_id="he-IL-AvriNeural", + phrase_prosody=phrases, + ) + ssml = self.builder.build_single(utt, supports_style_tags=False) + + # Must remain valid XML. + ET.fromstring(self._body(ssml)) + + # No two elements separated only by whitespace. + import re + + adjacent = re.findall(r"\s*