Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 73 additions & 10 deletions synthbanshee/tts/ssml_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,15 @@

from __future__ import annotations

import logging
import re
import xml.etree.ElementTree as ET
from dataclasses import dataclass, field

from synthbanshee.tts.ssml_types import PhraseProsody

_log = logging.getLogger(__name__)

_AZURE_XMLNS = "http://www.w3.org/2001/10/synthesis"
_MSTTS_XMLNS = "http://www.w3.org/2001/mstts"
_SPEAK_LANG = "he-IL"
Expand All @@ -30,6 +34,28 @@
# output — it may need per-provider tuning if engines respond differently.
_WORD_BREAK_MS = 50

# Azure prosody attribute limits (documented ranges).
_AZURE_RATE_MIN_PCT = -50 # rate="-50%" → 0.5x
_AZURE_RATE_MAX_PCT = 200 # rate="+200%" → 3.0x
_AZURE_PITCH_MIN_PCT = -50
_AZURE_PITCH_MAX_PCT = 50
_AZURE_VOLUME_MIN_PCT = -50
_AZURE_VOLUME_MAX_PCT = 50

# Characters invalid in XML 1.0: U+0000–U+0008, U+000B, U+000C, U+000E–U+001F.
# These must be stripped before embedding text in SSML.
_XML_INVALID_CHARS_RE = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]")


def _sanitize_text(text: str) -> str:
"""Remove characters that are invalid in XML 1.0 from *text*.

Defense-in-depth: ideally invalid chars should be rejected at the LLM
response parsing boundary (script/generator.py). This guard ensures the
SSML builder never produces unparseable XML regardless of upstream bugs.
"""
return _XML_INVALID_CHARS_RE.sub("", text)


def _inject_word_breaks(
parent: ET.Element,
Expand Down Expand Up @@ -113,23 +139,44 @@ class UtteranceSpec:


def _semitones_to_percent(st: float) -> str:
"""Convert a semitone shift to the Azure pitch % format (e.g. '+5%' / '-10%')."""
"""Convert a semitone shift to the Azure pitch % format (e.g. '+5%' / '-10%').

Values are clamped to Azure's documented ±50% range. A warning is logged
when clamping activates — this indicates a speaker config or state-drift bug.
"""
# Approximation: 1 semitone ≈ 5.946% pitch change
pct = round(st * 5.946)
return f"+{pct}%" if pct >= 0 else f"{pct}%"
clamped = max(_AZURE_PITCH_MIN_PCT, min(_AZURE_PITCH_MAX_PCT, pct))
if clamped != pct:
_log.warning("Pitch %+d%% exceeds Azure range; clamped to %+d%%", pct, clamped)
return f"+{clamped}%" if clamped >= 0 else f"{clamped}%"


def _rate_to_string(rate: float) -> str:
"""Format a rate multiplier as a percentage string for <prosody rate=...>."""
"""Format a rate multiplier as a percentage string for <prosody rate=...>.

Values are clamped to Azure's documented -50% to +200% range. A warning is
logged when clamping activates.
"""
pct = round((rate - 1.0) * 100)
return f"+{pct}%" if pct >= 0 else f"{pct}%"
clamped = max(_AZURE_RATE_MIN_PCT, min(_AZURE_RATE_MAX_PCT, pct))
if clamped != pct:
_log.warning("Rate %+d%% exceeds Azure range; clamped to %+d%%", pct, clamped)
return f"+{clamped}%" if clamped >= 0 else f"{clamped}%"


def _volume_to_string(db: float) -> str:
"""Format a dB volume offset as a percentage for <prosody volume=...>."""
"""Format a dB volume offset as a percentage for <prosody volume=...>.

Values are clamped to Azure's documented ±50% range. A warning is logged
when clamping activates.
"""
# Azure volume is 0–100; default is 100. Map dB linearly (rough).
pct = round(db * 1.0)
return f"+{pct}%" if pct >= 0 else f"{pct}%"
clamped = max(_AZURE_VOLUME_MIN_PCT, min(_AZURE_VOLUME_MAX_PCT, pct))
if clamped != pct:
_log.warning("Volume %+d%% exceeds Azure range; clamped to %+d%%", pct, clamped)
return f"+{clamped}%" if clamped >= 0 else f"{clamped}%"


def _apply_phrase_prosody(
Expand Down Expand Up @@ -180,9 +227,23 @@ def _append_break(ms: int) -> ET.Element:
if phrase.char_start > cursor:
_append_text(text[cursor : phrase.char_start])

# Optional break before the phrase.
# Optional break before the phrase. Merge with the preceding
# <break> element (if any) to avoid adjacent breaks that Azure's SSML
# parser rejects with error 0x80045003 (#67).
if phrase.break_before_ms > 0:
_append_break(phrase.break_before_ms)
if prev is not None and prev.tag == "break":
prev_time = prev.attrib.get("time", "0ms")
prev_ms = int(prev_time.replace("ms", ""))
if prev_ms == _WORD_BREAK_MS:
# Word-boundary break: replace with the phrase break
# (the phrase break subsumes the word-boundary intent).
prev.attrib["time"] = f"{phrase.break_before_ms}ms"
else:
# Semantic break (e.g. break_after from prior phrase):
# sum durations to preserve both intents.
prev.attrib["time"] = f"{prev_ms + phrase.break_before_ms}ms"
else:
_append_break(phrase.break_before_ms)

# Phrase-level prosody wrapper (omitted when only breaks are requested).
phrase_attrs: dict[str, str] = {}
Expand Down Expand Up @@ -256,6 +317,8 @@ def build_multi(
speak = ET.Element("speak", attrib=speak_attribs)

for utt in utterances:
# Sanitize text: strip characters invalid in XML 1.0 (#67).
utt_text = _sanitize_text(utt.text)
voice = ET.SubElement(speak, "voice", attrib={"name": utt.voice_id})

# Add express-as only when a non-default style is requested AND
Expand Down Expand Up @@ -285,9 +348,9 @@ def build_multi(
inner = parent

if utt.phrase_prosody:
_apply_phrase_prosody(inner, utt.text, utt.phrase_prosody)
_apply_phrase_prosody(inner, utt_text, utt.phrase_prosody)
else:
_inject_word_breaks(inner, utt.text)
_inject_word_breaks(inner, utt_text)

raw = ET.tostring(speak, encoding="unicode", xml_declaration=False)
return '<?xml version="1.0" encoding="UTF-8"?>\n' + raw
Expand Down
Loading
Loading