Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
127 changes: 25 additions & 102 deletions synthbanshee/tts/ssml_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@
``supports_style_tags=False``)
- <prosody> elements for rate, pitch, and volume control
- Nested per-phrase <prosody> + <break> elements (M2b)
- Inter-word ``<break time="50ms"/>`` elements to prevent Hebrew word
merging (#62)

Azure SSML reference:
https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice
Expand All @@ -28,12 +26,6 @@
_MSTTS_XMLNS = "http://www.w3.org/2001/mstts"
_SPEAK_LANG = "he-IL"

# Inter-word break duration in milliseconds. 50 ms is the initial estimate
# for signalling a word boundary to Azure / Google he-IL without introducing
# an audible pause. This value needs empirical validation with real TTS
# output — it may need per-provider tuning if engines respond differently.
_WORD_BREAK_MS = 50

# Azure prosody attribute limits (documented ranges).
_AZURE_RATE_MIN_PCT = -50 # rate="-50%" → 0.5x
_AZURE_RATE_MAX_PCT = 200 # rate="+200%" → 3.0x
Expand All @@ -57,74 +49,6 @@ def _sanitize_text(text: str) -> str:
return _XML_INVALID_CHARS_RE.sub("", text)


def _inject_word_breaks(
parent: ET.Element,
text: str,
after: ET.Element | None = None,
) -> ET.Element | None:
"""Insert *text* into *parent* with ``<break>`` tags between words.

Hebrew TTS engines (Azure he-IL, Google Chirp) merge adjacent words into
unintelligible speech when no explicit boundary cue exists. This function
splits *text* on whitespace and inserts a short ``<break>`` element between
every pair of consecutive words.

Args:
parent: The XML element to add content to.
text: The text to inject (may contain multi-word Hebrew).
after: If provided, the first text chunk is appended to
``after.tail`` instead of ``parent.text``.

Returns:
The last child element added to *parent*, or *after* if no ``<break>``
elements were created (single word or empty text).
"""
if not text or not text.strip():
# Pure whitespace or empty — append as-is without inserting breaks.
if text:
if after is None:
parent.text = (parent.text or "") + text
else:
after.tail = (after.tail or "") + text
return after

words = text.split()
last = after

# Preserve any leading whitespace (e.g. space before a text fragment
# that follows a <break> or <prosody> element).
leading = text[: len(text) - len(text.lstrip())]
if leading:
if last is None:
parent.text = (parent.text or "") + leading
else:
last.tail = (last.tail or "") + leading

for i, word in enumerate(words):
if i == 0:
# First word: append directly (no break needed before it).
if last is None:
parent.text = (parent.text or "") + word
else:
last.tail = (last.tail or "") + word
else:
# Subsequent words: insert <break/> then the word with a
# leading space in the tail to preserve normal spacing.
brk = ET.SubElement(parent, "break", attrib={"time": f"{_WORD_BREAK_MS}ms"})
brk.tail = " " + word
last = brk

# Preserve any trailing whitespace.
trailing = text[len(text.rstrip()) :]
if trailing:
if last is None:
parent.text = (parent.text or "") + trailing
else:
last.tail = (last.tail or "") + trailing

return last


@dataclass
class UtteranceSpec:
"""Specifies one TTS utterance to be included in an SSML document."""
Expand Down Expand Up @@ -188,10 +112,13 @@ def _apply_phrase_prosody(

Splits *text* around phrase spans and wraps each phrase in a nested
``<prosody>`` element with optional ``<break time="…"/>`` elements
inserted before and/or after. Inter-word ``<break>`` elements are
inserted within each text fragment to prevent Hebrew word merging (#62).
Overlapping spans are skipped (the span whose ``char_start`` falls
before the previous span's ``char_end`` is silently dropped).
inserted before and/or after. Adjacent ``<break>`` elements are merged
(durations summed) to avoid Azure SSML parser error 0x80045003 (#67):
when a phrase carries ``break_after_ms`` and the next phrase carries
``break_before_ms``, two ``<break>`` siblings would otherwise appear
back-to-back. Overlapping spans are skipped (the span whose
``char_start`` falls before the previous span's ``char_end`` is silently
dropped).

Args:
parent: The XML element that will receive the mixed text/element content.
Expand All @@ -202,16 +129,26 @@ def _apply_phrase_prosody(
prev: ET.Element | None = None

def _append_text(s: str) -> None:
"""Append *s* with inter-word ``<break>`` elements."""
nonlocal prev
if not s:
return
result = _inject_word_breaks(parent, s, after=prev)
if result is not None:
prev = result
if prev is None:
parent.text = (parent.text or "") + s
else:
prev.tail = (prev.tail or "") + s

def _append_break(ms: int) -> ET.Element:
"""Add a ``<break time="{ms}ms"/>`` or merge into the preceding break.

Azure rejects adjacent ``<break>`` siblings (#67); when the most
recently appended element is itself a ``<break>``, we sum the
durations into the existing element instead of creating a new one.
"""
nonlocal prev
if prev is not None and prev.tag == "break":
prev_ms = int(prev.attrib.get("time", "0ms").replace("ms", ""))
prev.attrib["time"] = f"{prev_ms + ms}ms"
return prev
el = ET.SubElement(parent, "break", attrib={"time": f"{ms}ms"})
prev = el
return el
Expand All @@ -227,23 +164,9 @@ def _append_break(ms: int) -> ET.Element:
if phrase.char_start > cursor:
_append_text(text[cursor : phrase.char_start])

# Optional break before the phrase. Merge with the preceding
# <break> element (if any) to avoid adjacent breaks that Azure's SSML
# parser rejects with error 0x80045003 (#67).
# Optional break before the phrase.
if phrase.break_before_ms > 0:
if prev is not None and prev.tag == "break":
prev_time = prev.attrib.get("time", "0ms")
prev_ms = int(prev_time.replace("ms", ""))
if prev_ms == _WORD_BREAK_MS:
# Word-boundary break: replace with the phrase break
# (the phrase break subsumes the word-boundary intent).
prev.attrib["time"] = f"{phrase.break_before_ms}ms"
else:
# Semantic break (e.g. break_after from prior phrase):
# sum durations to preserve both intents.
prev.attrib["time"] = f"{prev_ms + phrase.break_before_ms}ms"
else:
_append_break(phrase.break_before_ms)
_append_break(phrase.break_before_ms)

# Phrase-level prosody wrapper (omitted when only breaks are requested).
phrase_attrs: dict[str, str] = {}
Expand All @@ -257,7 +180,7 @@ def _append_break(ms: int) -> ET.Element:
phrase_text = text[phrase.char_start : phrase.char_end]
if phrase_attrs:
pe = ET.SubElement(parent, "prosody", attrib=phrase_attrs)
_inject_word_breaks(pe, phrase_text)
pe.text = phrase_text
prev = pe
else:
_append_text(phrase_text)
Expand Down Expand Up @@ -350,7 +273,7 @@ def build_multi(
if utt.phrase_prosody:
_apply_phrase_prosody(inner, utt_text, utt.phrase_prosody)
else:
_inject_word_breaks(inner, utt_text)
inner.text = utt_text

raw = ET.tostring(speak, encoding="unicode", xml_declaration=False)
return '<?xml version="1.0" encoding="UTF-8"?>\n' + raw
Expand Down
17 changes: 4 additions & 13 deletions tests/unit/test_phrase_prosody.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,13 +333,8 @@ class TestApplyPhraseProsody:
def test_no_phrases_sets_text(self) -> None:
parent = _make_parent()
_apply_phrase_prosody(parent, "hello world", [])
# Inter-word <break> splits text: parent.text = "hello",
# then <break time="50ms"/> with tail " world".
assert parent.text == "hello"
children = list(parent)
assert len(children) == 1
assert children[0].tag == "break"
assert children[0].tail == " world"
assert parent.text == "hello world"
assert len(list(parent)) == 0 # no children

def test_single_phrase_mid_text(self) -> None:
parent = _make_parent()
Expand Down Expand Up @@ -377,12 +372,8 @@ def test_zero_length_phrase_skipped(self) -> None:
parent = _make_parent()
phrase = PhraseProsody("p0", 3, 3) # zero-length span
_apply_phrase_prosody(parent, "hello world", [phrase])
# Zero-length phrase skipped → text set with inter-word breaks
assert parent.text == "hello"
children = list(parent)
assert len(children) == 1
assert children[0].tag == "break"
assert children[0].tail == " world"
# Zero-length phrase skipped → text is set as-is
assert parent.text == "hello world"

def test_phrase_with_pitch_attribute(self) -> None:
# Covers `if phrase.pitch is not None: phrase_attrs["pitch"] = ...` (line 113-114).
Expand Down
Loading
Loading