diff --git a/CHANGELOG.md b/CHANGELOG.md
index a967665..177d224 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,11 @@ All notable changes to this project will be documented in this file. The format
 
 ## [Unreleased]
 
+### Added
+
+- **Word-level Media Overlay sync** (M6.5). When `--transcribe` runs and the cleanup path is active, dpub now extracts per-token timestamps from whisper.cpp, coalesces BPE pieces back into whole words via a leading-space rule (with punctuation attachment and degenerate-timing clamping), wraps each word in a `<span id="w-NNN-MMM-KKK">` inside the cleaned `<p id="tx-NNN-MMM">`, and emits one SMIL `<par>` per word, with each paragraph's `<par>`s grouped in a nested `<seq epub:textref="...#tx-...">`. The result is karaoke-style highlight-along-with-audio in compatible reading systems (Thorium, Readium). Default-on; pass `--no-word-sync` to fall back to per-paragraph sync. Workspace EPUBCheck assertions were extended to gate the new overlay shape; the reference book stays 0/0/0.
+- `dpub-whisper` exposes a public `Word { start_seconds, end_seconds, text }` struct and `Segment.words: Vec<Word>` populated by the new BPE coalescer (`crates/dpub-whisper/src/words.rs`). Eight unit tests cover the BPE coalescing rules.
+
 ## [0.5.0] - 2026-05-06
 
 First tagged release. Feature-complete for the v1 candidate: DAISY 2.02 → EPUB 3 conversion with Media Overlays, EPUBCheck-clean output, ACE accessibility validation, MP3 → Opus audio recompression, local Whisper transcription with prose-shaped paragraph cleanup, automatic and explicit cover lookup, parallel batch conversion, JSON output for CI/pipeline use. No API stability commitment yet — that comes with 1.0.
diff --git a/README.md b/README.md
index 74e2a70..706c35e 100644
--- a/README.md
+++ b/README.md
@@ -113,6 +113,7 @@ Walks the input directory for every `ncc.html`, converts each book in parallel v
 | **M4** | Built-in validation (EPUBCheck + ACE) — `dpub validate`, `dpub a11y`. ✅ |
 | **M5** | Audio recompression (MP3 → Opus) — `dpub convert --audio opus --bitrate <kbps>`. ✅ |
 | **M6** | Whisper transcription for audio-only books — `dpub convert --transcribe <lang> --whisper-model <path>`. ✅ (segments are merged into prose-shaped paragraphs by default; pass `--no-text-cleanup` for raw output) |
+| **M6.5** | Word-level Media Overlay sync — karaoke-style highlight-along-with-audio in reading systems that honour Media Overlays. Default-on with `--transcribe` when text cleanup is active (i.e. without `--no-text-cleanup`); pass `--no-word-sync` to fall back to per-paragraph sync. ✅ |
 | **Tier 1 polish** | Whisper model caching, cover lookup (`--cover` and `--auto-cover`), parallel batch mode, JSON output for validators. ✅ |
 | **M7** | WASM build for browser-based conversion (planned scope: `info` + `validate` only — Whisper / ffmpeg are too heavy for a browser tab). |
 | **M8** | 1.0 release: signed binaries for macOS / Linux / Windows. |
diff --git a/crates/dpub-cli/src/main.rs b/crates/dpub-cli/src/main.rs
index 732da8f..8195717 100644
--- a/crates/dpub-cli/src/main.rs
+++ b/crates/dpub-cli/src/main.rs
@@ -52,6 +52,14 @@ enum Command {
         /// output; not recommended for distribution.
         #[arg(long)]
         no_text_cleanup: bool,
+        /// Skip per-word Media Overlay sync. Word-level sync (the
+        /// default for transcribed books) drives karaoke-style
+        /// highlight-along-with-audio in compatible reading systems
+        /// (Thorium, Readium). Pass this flag to fall back to
+        /// per-paragraph sync — produces a smaller SMIL at the cost
+        /// of a coarser reading experience.
+        #[arg(long)]
+        no_word_sync: bool,
         /// Path to a JPEG or PNG image to embed as the EPUB cover.
         #[arg(long, value_name = "PATH", conflicts_with = "auto_cover")]
         cover: Option<PathBuf>,
@@ -150,6 +158,7 @@ fn main() -> Result<()> {
             transcribe,
             whisper_model,
             no_text_cleanup,
+            no_word_sync,
             cover,
             auto_cover,
             rights,
@@ -163,6 +172,7 @@ fn main() -> Result<()> {
             transcribe,
             whisper_model,
             no_text_cleanup,
+            no_word_sync,
             cover,
             auto_cover,
             rights,
@@ -190,6 +200,7 @@ fn cmd_convert(
     transcribe: Option<String>,
     whisper_model: Option<PathBuf>,
     no_text_cleanup: bool,
+    no_word_sync: bool,
     cover: Option<PathBuf>,
     auto_cover: bool,
     rights: Option<String>,
@@ -255,6 +266,7 @@ fn cmd_convert(
         cover,
         auto_cover,
         rights,
+        no_word_sync,
     };
     let start = std::time::Instant::now();
     dpub_convert::convert_to_file(&book, output, &opts)
@@ -545,6 +557,7 @@ fn cmd_batch(
         cover: None,
         auto_cover: false,
         rights: None,
+        no_word_sync: false,
     };
     let start = std::time::Instant::now();
     let entries: Vec<BatchEntry> = books
diff --git a/crates/dpub-convert/src/lib.rs b/crates/dpub-convert/src/lib.rs
index 21d7852..138f130 100644
--- a/crates/dpub-convert/src/lib.rs
+++ b/crates/dpub-convert/src/lib.rs
@@ -564,6 +564,13 @@ pub struct ConvertOptions {
     /// is the simplest way to assert rights when the source
     /// doesn't carry one.
     pub rights: Option<String>,
+    /// When `true`, skip per-word Media Overlay sync and fall back
+    /// to per-paragraph sync. Default `false` — the cleanup path
+    /// emits one SMIL `<par>` per word, anchored to per-word
+    /// `<span>`s in the content XHTML, for karaoke-style
+    /// highlight-along-with-audio. Set this to keep SMIL files
+    /// small at the cost of a coarser reading experience.
+    pub no_word_sync: bool,
 }
 
 /// Convert and write a DAISY 2.02 publication to an EPUB 3 file in one call.
@@ -596,6 +603,7 @@ pub fn convert_to_file(book: &Book, output: &Path, opts: &ConvertOptions) -> Res
             &mut publication,
             transcribe,
             opts.raw_transcript_segments,
+            opts.no_word_sync,
         )?;
     }
 
@@ -713,6 +721,7 @@ fn inject_transcripts(
     publication: &mut Publication,
     opts: &TranscribeOptions,
     raw_segments: bool,
+    no_word_sync: bool,
 ) -> Result<()> {
     let whisper_opts = dpub_whisper::TranscribeOptions {
         model_path: opts.model_path.clone(),
@@ -746,8 +755,11 @@ fn inject_transcripts(
             cache.insert(audio_basename.clone(), segments);
         }
 
-        // Collect the in-range segments in document order.
+        // Collect the in-range segments in document order, paired with
+        // the audio basename each came from so the cleanup state machine
+        // can flush at audio-file boundaries.
         let mut section_segments: Vec<dpub_whisper::Segment> = Vec::new();
+        let mut section_audio_srcs: Vec<String> = Vec::new();
         for (audio_basename, t0, t1) in &audio_ranges {
             let Some(segments) = cache.get(audio_basename) else {
                 continue;
@@ -756,6 +768,7 @@ fn inject_transcripts(
                 let mid = (seg.start_seconds + seg.end_seconds) * 0.5;
                 if mid >= *t0 && mid <= *t1 && !seg.text.is_empty() {
                     section_segments.push(seg.clone());
+                    section_audio_srcs.push(audio_basename.clone());
                 }
             }
         }
@@ -765,9 +778,27 @@ fn inject_transcripts(
         } else {
             let cleaned = text_cleanup::merge_into_paragraphs(
                 &section_segments,
+                &section_audio_srcs,
                 &text_cleanup::CleanupOpts::default(),
             );
-            render_cleaned_paragraphs(idx, &cleaned)
+            let html = render_cleaned_paragraphs(idx, &cleaned);
+            // Word-level Media Overlay sync: rebuild this section's
+            // overlay from the cleaned paragraphs, replacing the
+            // heading-level shell that build_sections constructed.
+            // The new overlay drives karaoke-style highlighting in
+            // reading systems (Thorium etc.) that honour Media Overlays.
+            if !no_word_sync
+                && cleaned.iter().any(|p| !p.words.is_empty())
+                && let Some(overlay) = section_part.overlay.as_mut()
+            {
+                let new_root = build_word_overlay_seq(
+                    &section_part.content.href,
+                    idx,
+                    &cleaned,
+                );
+                overlay.root = new_root;
+            }
+            html
         };
         if !new_paragraphs.is_empty() {
             section_part.content.body_xhtml.push_str(&new_paragraphs);
@@ -791,19 +822,92 @@ fn render_raw_paragraphs(section_idx: usize, segments: &[dpub_whisper::Segment])
     out
 }
 
+/// Build a fresh `OverlaySeq` for a section from the cleaned paragraphs
+/// produced by `text_cleanup::merge_into_paragraphs`.
+///
+/// Shape: an outer `<seq epub:textref="../content/sNN.xhtml">` wrapping
+/// one inner `<seq epub:textref="...#tx-NNN-MMM">` per paragraph, each
+/// wrapping one `<par id="w-NNN-MMM-KKK">` per word. This gives reading
+/// systems (Thorium, Readium) a structural place to scope highlight to
+/// "current paragraph" while still tracking the spoken word.
+///
+/// The `content_href` is EPUB-relative to the OPF (e.g.
+/// `content/section-001.xhtml`); SMIL lives in `media-overlays/` so
+/// every emitted `src` starts with `../`.
+fn build_word_overlay_seq(
+    content_href: &str,
+    section_idx: usize,
+    paragraphs: &[text_cleanup::Paragraph],
+) -> OverlaySeq {
+    let mut top_children: Vec<OverlayItem> = Vec::with_capacity(paragraphs.len());
+    for (para_idx, para) in paragraphs.iter().enumerate() {
+        if para.words.is_empty() {
+            continue;
+        }
+        let para_anchor = format!("tx-{section_idx:03}-{para_idx:03}");
+        let para_textref = format!("../{content_href}#{para_anchor}");
+        let audio_src = format!("../audio/{}", para.audio_src);
+
+        let mut word_children: Vec<OverlayItem> = Vec::with_capacity(para.words.len());
+        for (word_idx, word) in para.words.iter().enumerate() {
+            let word_id = format!("w-{section_idx:03}-{para_idx:03}-{word_idx:03}");
+            word_children.push(OverlayItem::Par(OverlayPar {
+                id: Some(word_id.clone()),
+                text_src: format!("../{content_href}#{word_id}"),
+                audio_src: audio_src.clone(),
+                clip_begin_seconds: word.start_seconds,
+                clip_end_seconds: word.end_seconds,
+            }));
+        }
+        top_children.push(OverlayItem::Seq(OverlaySeq {
+            textref: Some(para_textref),
+            children: word_children,
+        }));
+    }
+    OverlaySeq {
+        textref: Some(format!("../{content_href}")),
+        children: top_children,
+    }
+}
+
 fn render_cleaned_paragraphs(
     section_idx: usize,
     paragraphs: &[text_cleanup::Paragraph],
 ) -> String {
     let mut out = String::new();
     for (para_idx, para) in paragraphs.iter().enumerate() {
+        if para.words.is_empty() {
+            // Whisper run with no per-word data (or test fixture
+            // without words). Fall back to a flat <p> with the
+            // paragraph text.
+            let _ = std::fmt::Write::write_fmt(
+                &mut out,
+                format_args!(
+                    "  <p id=\"tx-{section_idx:03}-{para_idx:03}\">{}</p>\n",
+                    escape_text(&para.text)
+                ),
+            );
+            continue;
+        }
+        // Emit one <span id="w-..."> per word, separated by single ASCII
+        // spaces so reading systems render natural word spacing.
         let _ = std::fmt::Write::write_fmt(
             &mut out,
-            format_args!(
-                "  <p id=\"tx-{section_idx:03}-{para_idx:03}\">{}</p>\n",
-                escape_text(&para.text)
-            ),
+            format_args!("  <p id=\"tx-{section_idx:03}-{para_idx:03}\">"),
         );
+        for (word_idx, word) in para.words.iter().enumerate() {
+            if word_idx > 0 {
+                out.push(' ');
+            }
+            let _ = std::fmt::Write::write_fmt(
+                &mut out,
+                format_args!(
+                    "<span id=\"w-{section_idx:03}-{para_idx:03}-{word_idx:03}\">{}</span>",
+                    escape_text(&word.text)
+                ),
+            );
+        }
+        out.push_str("</p>\n");
     }
     out
 }
@@ -948,6 +1052,169 @@ mod tests {
         assert_eq!(media_type_for(""), "application/octet-stream");
     }
 
+    #[test]
+    fn render_cleaned_paragraphs_emits_word_spans_when_words_present() {
+        let para = text_cleanup::Paragraph {
+            start_seconds: 0.0,
+            end_seconds: 1.5,
+            text: "Hallo wereld.".into(),
+            words: vec![
+                dpub_whisper::Word {
+                    start_seconds: 0.0,
+                    end_seconds: 0.5,
+                    text: "Hallo".into(),
+                },
+                dpub_whisper::Word {
+                    start_seconds: 0.5,
+                    end_seconds: 1.5,
+                    text: "wereld.".into(),
+                },
+            ],
+            audio_src: "a.mp3".into(),
+        };
+        let html = render_cleaned_paragraphs(7, &[para]);
+        assert!(
+            html.contains(r#"<p id="tx-007-000">"#),
+            "missing paragraph id: {html}"
+        );
+        assert!(
+            html.contains(r#"<span id="w-007-000-000">Hallo</span>"#),
+            "missing first word span: {html}"
+        );
+        assert!(
+            html.contains(r#"<span id="w-007-000-001">wereld.</span>"#),
+            "missing second word span: {html}"
+        );
+        // Single ASCII space between word spans.
+        assert!(
+            html.contains("</span> <span"),
+            "missing inter-span space: {html}"
+        );
+    }
+
+    #[test]
+    fn render_cleaned_paragraphs_falls_back_to_flat_p_without_words() {
+        // Defensive: when a Paragraph has no per-word data (test fixture
+        // without words, or a Whisper run that returned empty word list)
+        // the renderer must still emit something readable.
+        let para = text_cleanup::Paragraph {
+            start_seconds: 0.0,
+            end_seconds: 1.0,
+            text: "Hallo wereld.".into(),
+            words: vec![],
+            audio_src: "a.mp3".into(),
+        };
+        let html = render_cleaned_paragraphs(0, &[para]);
+        assert!(
+            html.contains(r#"<p id="tx-000-000">Hallo wereld.</p>"#),
+            "expected flat <p> fallback, got: {html}"
+        );
+        assert!(!html.contains("<span"), "no spans in fallback: {html}");
+    }
+
+    #[test]
+    fn build_word_overlay_seq_produces_nested_seqs_per_paragraph() {
+        let para = text_cleanup::Paragraph {
+            start_seconds: 0.0,
+            end_seconds: 2.0,
+            text: "Hallo wereld.".into(),
+            words: vec![
+                dpub_whisper::Word {
+                    start_seconds: 0.0,
+                    end_seconds: 0.5,
+                    text: "Hallo".into(),
+                },
+                dpub_whisper::Word {
+                    start_seconds: 0.5,
+                    end_seconds: 2.0,
+                    text: "wereld.".into(),
+                },
+            ],
+            audio_src: "07_Inleiding.mp3".into(),
+        };
+        let root = build_word_overlay_seq("content/sec.xhtml", 7, &[para]);
+        assert_eq!(root.textref.as_deref(), Some("../content/sec.xhtml"));
+        assert_eq!(root.children.len(), 1);
+
+        let OverlayItem::Seq(para_seq) = &root.children[0] else {
+            panic!("expected paragraph-level <seq>, got {:?}", root.children[0]);
+        };
+        assert_eq!(
+            para_seq.textref.as_deref(),
+            Some("../content/sec.xhtml#tx-007-000")
+        );
+        assert_eq!(para_seq.children.len(), 2);
+
+        let OverlayItem::Par(first_par) = &para_seq.children[0] else {
+            panic!("expected per-word <par>, got {:?}", para_seq.children[0]);
+        };
+        assert_eq!(first_par.id.as_deref(), Some("w-007-000-000"));
+        assert_eq!(first_par.text_src, "../content/sec.xhtml#w-007-000-000");
+        assert_eq!(first_par.audio_src, "../audio/07_Inleiding.mp3");
+        assert!((first_par.clip_begin_seconds - 0.0).abs() < 1e-9);
+        assert!((first_par.clip_end_seconds - 0.5).abs() < 1e-9);
+
+        let OverlayItem::Par(second_par) = &para_seq.children[1] else {
+            panic!("expected per-word <par>, got {:?}", para_seq.children[1]);
+        };
+        assert_eq!(second_par.id.as_deref(), Some("w-007-000-001"));
+        assert!((second_par.clip_end_seconds - 2.0).abs() < 1e-9);
+    }
+
+    #[test]
+    fn build_word_overlay_seq_skips_paragraphs_without_words() {
+        let p1 = text_cleanup::Paragraph {
+            start_seconds: 0.0,
+            end_seconds: 1.0,
+            text: "no words here".into(),
+            words: vec![],
+            audio_src: "a.mp3".into(),
+        };
+        let p2 = text_cleanup::Paragraph {
+            start_seconds: 1.0,
+            end_seconds: 2.0,
+            text: "Hi".into(),
+            words: vec![dpub_whisper::Word {
+                start_seconds: 1.0,
+                end_seconds: 2.0,
+                text: "Hi".into(),
+            }],
+            audio_src: "a.mp3".into(),
+        };
+        let root = build_word_overlay_seq("content/x.xhtml", 0, &[p1, p2]);
+        // Paragraph 0 had no words and was skipped; paragraph 1 produced
+        // one inner <seq>.
+        assert_eq!(root.children.len(), 1);
+        let OverlayItem::Seq(inner) = &root.children[0] else {
+            panic!("expected one inner seq");
+        };
+        assert_eq!(
+            inner.textref.as_deref(),
+            Some("../content/x.xhtml#tx-000-001")
+        );
+    }
+
+    #[test]
+    fn render_cleaned_paragraphs_escapes_word_text() {
+        let para = text_cleanup::Paragraph {
+            start_seconds: 0.0,
+            end_seconds: 0.5,
+            text: r#"a<b&c"d"#.into(),
+            words: vec![dpub_whisper::Word {
+                start_seconds: 0.0,
+                end_seconds: 0.5,
+                text: r#"a<b&c"d"#.into(),
+            }],
+            audio_src: "a.mp3".into(),
+        };
+        let html = render_cleaned_paragraphs(0, &[para]);
+        assert!(
+            html.contains("a&lt;b&amp;c"),
+            "expected XML escaping in word text: {html}"
+        );
+        assert!(!html.contains("a<b&c"), "raw < and & must not leak");
+    }
+
     #[test]
     fn swap_extension_preserves_directory_with_forward_slashes() {
         assert_eq!(swap_extension("audio/foo.mp3", "opus"), "audio/foo.opus");
diff --git a/crates/dpub-convert/src/text_cleanup.rs b/crates/dpub-convert/src/text_cleanup.rs
index d52b408..e12b5da 100644
--- a/crates/dpub-convert/src/text_cleanup.rs
+++ b/crates/dpub-convert/src/text_cleanup.rs
@@ -11,15 +11,28 @@
 //! non-overlapping and chronological, so this is the trivially-correct
 //! span for any future per-paragraph Media Overlay sync.
 
-use dpub_whisper::Segment;
+use dpub_whisper::{Segment, Word};
 
 /// One paragraph of cleaned-up transcript text and the audio time range
 /// it spans.
+///
+/// `words` carries per-word timings for SMIL Media Overlay sync. It is
+/// non-empty whenever `text` is non-empty, *provided* the input
+/// segments came from a real Whisper run (the test helper builds
+/// segments without per-word data, in which case `words` is empty —
+/// callers using it for SMIL emission should fall back gracefully).
+///
+/// `audio_src` is the basename of the audio file these words came from
+/// (e.g. `"07_Inleiding.mp3"`). Invariant: every word in a paragraph
+/// comes from the same audio file. The cleanup state machine doesn't
+/// merge across audio-file boundaries.
 #[derive(Debug, Clone, PartialEq)]
 pub(crate) struct Paragraph {
     pub start_seconds: f64,
     pub end_seconds: f64,
     pub text: String,
+    pub words: Vec<Word>,
+    pub audio_src: String,
 }
 
 /// Tunable bounds for [`merge_into_paragraphs`]. See module docs for the
@@ -49,19 +62,42 @@ impl Default for CleanupOpts {
 /// sentence terminator AND the paragraph has accumulated enough sentences
 /// or characters; force-flush at the upper caps so a hallucinated run
 /// without punctuation can't grow unbounded.
-pub(crate) fn merge_into_paragraphs(segments: &[Segment], opts: &CleanupOpts) -> Vec<Paragraph> {
+///
+/// `audio_srcs` is a parallel slice giving the audio basename for each
+/// segment. The cleanup state machine flushes the current paragraph
+/// when the audio file changes, preserving the invariant that every
+/// `Paragraph.words[i]` came from the same audio file.
+pub(crate) fn merge_into_paragraphs(
+    segments: &[Segment],
+    audio_srcs: &[String],
+    opts: &CleanupOpts,
+) -> Vec<Paragraph> {
+    debug_assert_eq!(
+        segments.len(),
+        audio_srcs.len(),
+        "segments and audio_srcs must be parallel slices",
+    );
+
     let mut out = Vec::new();
     let mut current = Builder::default();
 
-    for seg in segments {
+    for (seg, audio_src) in segments.iter().zip(audio_srcs.iter()) {
         let text = seg.text.trim();
         if text.is_empty() {
             continue;
         }
+        // Audio-file boundary forces a flush so the resulting paragraph
+        // doesn't span two audio files: every per-word SMIL `<par>` in a
+        // paragraph takes its `<audio src=...>` from one `audio_src`.
+        if !current.is_empty() && current.audio_src != *audio_src {
+            out.push(current.finalize());
+            current = Builder::default();
+        }
         if current.is_empty() {
             current.start = seg.start_seconds;
+            current.audio_src.clone_from(audio_src);
         }
-        current.append(text, seg.end_seconds);
+        current.append(text, &seg.words, seg.end_seconds);
 
         let terminator = current.ends_at_sentence_terminator();
         if terminator {
@@ -87,6 +123,8 @@ struct Builder {
     start: f64,
     end: f64,
     buf: String,
+    words: Vec<Word>,
+    audio_src: String,
     sentences: usize,
 }
 
@@ -95,11 +133,12 @@ impl Builder {
         self.buf.is_empty()
     }
 
-    fn append(&mut self, text: &str, end: f64) {
+    fn append(&mut self, text: &str, words: &[Word], end: f64) {
         if !self.buf.is_empty() {
             self.buf.push(' ');
         }
         self.buf.push_str(text);
+        self.words.extend_from_slice(words);
         self.end = end;
     }
 
@@ -142,10 +181,15 @@ impl Builder {
 
     fn finalize(mut self) -> Paragraph {
         capitalise_first(&mut self.buf);
+        if let Some(first_word) = self.words.first_mut() {
+            capitalise_first(&mut first_word.text);
+        }
         Paragraph {
             start_seconds: self.start,
             end_seconds: self.end,
             text: self.buf,
+            words: self.words,
+            audio_src: self.audio_src,
         }
     }
 }
@@ -178,9 +222,26 @@ mod tests {
             start_seconds: start,
             end_seconds: end,
             text: text.into(),
+            words: Vec::new(),
         }
     }
 
+    /// Test helper: build a parallel `audio_srcs` slice that pairs every
+    /// segment with the same audio file name. Most cleanup tests don't
+    /// care about audio-file boundaries; the dedicated test
+    /// `audio_file_boundary_forces_flush` exercises the multi-file path.
+    fn srcs(n: usize) -> Vec<String> {
+        vec!["audio.mp3".to_owned(); n]
+    }
+
+    /// Test helper: call `merge_into_paragraphs` against a slice of
+    /// segments that all came from the same audio file. Saves every
+    /// existing test from threading parallel audio basenames.
+    fn merge(segs: &[Segment]) -> Vec<Paragraph> {
+        let audio = srcs(segs.len());
+        merge_into_paragraphs(segs, &audio, &CleanupOpts::default())
+    }
+
     #[test]
     fn three_sentences_merge_into_one_paragraph() {
         let segs = vec![
@@ -188,7 +249,7 @@ mod tests {
             seg(2.0, 4.0, "Hij keek naar de lucht."),
             seg(4.0, 6.0, "Het regende zachtjes."),
         ];
-        let out = merge_into_paragraphs(&segs, &CleanupOpts::default());
+        let out = merge(&segs);
         assert_eq!(out.len(), 1);
         assert!(out[0].text.starts_with("De man liep"));
         assert!(out[0].text.ends_with("zachtjes."));
@@ -204,7 +265,7 @@ mod tests {
             seg(1.0, 2.0, "Nee."),
             seg(2.0, 3.0, "Misschien."),
         ];
-        let out = merge_into_paragraphs(&segs, &CleanupOpts::default());
+        let out = merge(&segs);
         assert_eq!(out.len(), 1, "short sentences should not split");
     }
 
@@ -220,7 +281,7 @@ mod tests {
                 )
             })
             .collect();
-        let out = merge_into_paragraphs(&segs, &CleanupOpts::default());
+        let out = merge(&segs);
         assert_eq!(out.len(), 2);
         // First paragraph should hold the first 6 sentences.
         assert_eq!(out[0].text.matches('.').count(), 6);
@@ -232,7 +293,7 @@ mod tests {
         // an unbounded paragraph.
         let long = "a ".repeat(400);
         let segs = vec![seg(0.0, 30.0, long.trim())];
-        let out = merge_into_paragraphs(&segs, &CleanupOpts::default());
+        let out = merge(&segs);
         assert_eq!(out.len(), 1);
         assert!(out[0].text.len() <= 800);
     }
@@ -244,7 +305,7 @@ mod tests {
             seg(0.0, 1.0, "Een korte zin."),
             seg(1.0, 2.0, "En nog een."),
         ];
-        let out = merge_into_paragraphs(&segs, &CleanupOpts::default());
+        let out = merge(&segs);
         assert_eq!(out.len(), 1);
         assert!((out[0].end_seconds - 2.0).abs() < 1e-9);
     }
@@ -259,7 +320,7 @@ mod tests {
             seg(5.0, 10.0, "Hij was 3.14 keer ouder dan zij."),
             seg(10.0, 15.0, "Toen vertrok hij naar het volgende dorp."),
         ];
-        let out = merge_into_paragraphs(&segs, &CleanupOpts::default());
+        let out = merge(&segs);
         // Three real sentences (".", ".", ".") in the input; the decimal
         // is not counted, so we get exactly one paragraph.
         assert_eq!(out.len(), 1);
@@ -273,7 +334,7 @@ mod tests {
             seg(5.0, 10.0, "die hij allemaal had gelezen en zorgvuldig had bewaard."),
             seg(10.0, 15.0, "Hij was er trots op."),
         ];
-        let out = merge_into_paragraphs(&segs, &CleanupOpts::default());
+        let out = merge(&segs);
         // "enz." should not be treated as terminator, so this reads as
         // one or two real sentences depending on what does terminate.
         // Specifically, only ". " on "bewaard." and "trots op." count.
@@ -288,7 +349,7 @@ mod tests {
             5.0,
             "and then he came back home after a long day at the office.",
         )];
-        let out = merge_into_paragraphs(&segs, &CleanupOpts::default());
+        let out = merge(&segs);
         assert_eq!(out.len(), 1);
         assert!(out[0].text.starts_with("And then"));
     }
@@ -300,7 +361,7 @@ mod tests {
             seg(1.0, 2.0, "   "),
             seg(2.0, 3.0, "Hallo wereld."),
         ];
-        let out = merge_into_paragraphs(&segs, &CleanupOpts::default());
+        let out = merge(&segs);
         assert_eq!(out.len(), 1);
         assert!((out[0].start_seconds - 2.0).abs() < 1e-9);
     }
@@ -312,7 +373,7 @@ mod tests {
             seg(12.0, 15.0, "Het was een lange weg."),
             seg(15.0, 18.7, "Maar hij gaf niet op."),
         ];
-        let out = merge_into_paragraphs(&segs, &CleanupOpts::default());
+        let out = merge(&segs);
         assert_eq!(out.len(), 1);
         assert!((out[0].start_seconds - 10.5).abs() < 1e-9);
         assert!((out[0].end_seconds - 18.7).abs() < 1e-9);
@@ -325,14 +386,96 @@ mod tests {
             seg(5.0, 10.0, "Dr. Jansen kwam binnen en groette iedereen vriendelijk."),
             seg(10.0, 15.0, "Hij ging zitten en de zitting begon."),
         ];
-        let out = merge_into_paragraphs(&segs, &CleanupOpts::default());
+        let out = merge(&segs);
         assert_eq!(out.len(), 1);
         assert!(out[0].text.contains("Dr. Jansen"));
     }
 
     #[test]
     fn empty_input_yields_no_paragraphs() {
-        let out = merge_into_paragraphs(&[], &CleanupOpts::default());
+        let out = merge_into_paragraphs(&[], &[], &CleanupOpts::default());
         assert!(out.is_empty());
     }
+
+    fn word(start: f64, end: f64, text: &str) -> Word {
+        Word {
+            start_seconds: start,
+            end_seconds: end,
+            text: text.into(),
+        }
+    }
+
+    fn seg_with_words(start: f64, end: f64, text: &str, words: Vec<Word>) -> Segment {
+        Segment {
+            start_seconds: start,
+            end_seconds: end,
+            text: text.into(),
+            words,
+        }
+    }
+
+    #[test]
+    fn words_thread_through_to_paragraph() {
+        // Two segments with synthetic per-word data; merged paragraph
+        // should preserve every word in document order.
+        let segs = vec![
+            seg_with_words(
+                0.0,
+                1.5,
+                "Hallo wereld.",
+                vec![word(0.0, 0.5, "Hallo"), word(0.5, 1.5, "wereld.")],
+            ),
+            seg_with_words(
+                1.5,
+                3.0,
+                "Goedemorgen.",
+                vec![word(1.5, 3.0, "Goedemorgen.")],
+            ),
+        ];
+        let out = merge(&segs);
+        assert_eq!(out.len(), 1);
+        assert_eq!(out[0].words.len(), 3);
+        assert_eq!(out[0].words[0].text, "Hallo");
+        assert_eq!(out[0].words[1].text, "wereld.");
+        assert_eq!(out[0].words[2].text, "Goedemorgen.");
+        assert_eq!(out[0].audio_src, "audio.mp3");
+    }
+
+    #[test]
+    fn capitalisation_propagates_to_first_word() {
+        // Paragraph starts mid-sentence with a lowercase word; the
+        // capitalisation fix must update both the rendered text AND
+        // the first word's text so the visible <span> reads "And".
+        let segs = vec![seg_with_words(
+            0.0,
+            2.0,
+            "and then he ran.",
+            vec![
+                word(0.0, 0.3, "and"),
+                word(0.3, 0.6, "then"),
+                word(0.6, 0.9, "he"),
+                word(0.9, 2.0, "ran."),
+            ],
+        )];
+        let out = merge(&segs);
+        assert_eq!(out.len(), 1);
+        assert!(out[0].text.starts_with("And"));
+        assert_eq!(out[0].words[0].text, "And");
+    }
+
+    #[test]
+    fn audio_file_boundary_forces_flush() {
+        // Two short segments from different audio files that would merge
+        // if they shared one. The cleanup must flush at the file boundary
+        // (even mid-sentence) so each paragraph references one audio file.
+        let segs = vec![
+            seg(0.0, 1.0, "Eerste deel."),
+            seg(0.0, 1.0, "Tweede deel."),
+        ];
+        let audio = vec!["a.mp3".to_owned(), "b.mp3".to_owned()];
+        let out = merge_into_paragraphs(&segs, &audio, &CleanupOpts::default());
+        assert_eq!(out.len(), 2, "audio-file boundary must flush");
+        assert_eq!(out[0].audio_src, "a.mp3");
+        assert_eq!(out[1].audio_src, "b.mp3");
+    }
 }
diff --git a/crates/dpub-convert/tests/real_conversion.rs b/crates/dpub-convert/tests/real_conversion.rs
index 069d723..f007f7e 100644
--- a/crates/dpub-convert/tests/real_conversion.rs
+++ b/crates/dpub-convert/tests/real_conversion.rs
@@ -122,6 +122,7 @@ fn opus_recompression_shrinks_real_book() {
             cover: None,
             auto_cover: false,
             rights: None,
+            no_word_sync: false,
         },
     )
     .expect("write opus");
diff --git a/crates/dpub-whisper/src/lib.rs b/crates/dpub-whisper/src/lib.rs
index 86e1342..8fd6f14 100644
--- a/crates/dpub-whisper/src/lib.rs
+++ b/crates/dpub-whisper/src/lib.rs
@@ -36,6 +36,7 @@
 
 mod decode;
 mod error;
+mod words;
 
 pub use error::{Error, Result};
 
@@ -46,11 +47,32 @@ use whisper_rs::{FullParams, SamplingStrategy, WhisperContext, WhisperContextPar
 /// One transcribed time-range with the text Whisper produced for it.
 ///
 /// Times are in seconds (whisper.cpp returns centiseconds; we convert).
-#[derive(Debug, Clone, serde::Serialize)]
+#[derive(Debug, Clone, PartialEq, serde::Serialize)]
 pub struct Segment {
     pub start_seconds: f64,
     pub end_seconds: f64,
     pub text: String,
+    /// Per-word timings derived from whisper.cpp's per-token data, with
+    /// BPE subword pieces coalesced back into whole words. Empty when
+    /// `text` is empty; otherwise one entry per visible word in the
+    /// segment, in chronological order.
+    pub words: Vec<Word>,
+}
+
+/// One transcribed word with its audio time range.
+///
+/// Used to drive per-word SMIL Media Overlay sync (`<par>` per word in
+/// the produced EPUB). Times are in seconds; whisper.cpp's token
+/// timestamps are notoriously approximate (~100–300 ms tolerance), so
+/// callers should not rely on word boundaries being lip-sync-accurate.
+#[derive(Debug, Clone, PartialEq, serde::Serialize)]
+pub struct Word {
+    pub start_seconds: f64,
+    pub end_seconds: f64,
+    /// The visible word text, with no leading whitespace. Trailing
+    /// punctuation that the whisper tokenizer emitted as a separate
+    /// token is attached here (e.g. `"wereld."`).
+    pub text: String,
 }
 
 /// Knobs for [`transcribe`].
@@ -138,6 +160,33 @@ impl Transcriber {
                 .trim()
                 .to_owned();
 
+            // Walk every token in this segment and collect the raw
+            // (text, t0, t1) triples for the coalescer. whisper.cpp
+            // emits BPE tokens; the coalescer turns them back into
+            // visible words with sensible audio time ranges.
+            let n_tokens = seg.n_tokens();
+            #[allow(clippy::cast_sign_loss)]
+            let tok_cap = n_tokens.max(0) as usize;
+            let mut raw_tokens: Vec<words::RawToken<'_>> = Vec::with_capacity(tok_cap);
+            for j in 0..n_tokens {
+                let Some(tok) = seg.get_token(j) else {
+                    continue;
+                };
+                // Defensive: skip tokens whose text isn't valid UTF-8.
+                // Whisper occasionally emits partial multibyte sequences
+                // mid-word; we'd rather drop a token than poison the segment.
+                let Ok(token_text) = tok.to_str() else {
+                    continue;
+                };
+                let data = tok.token_data();
+                raw_tokens.push(words::RawToken {
+                    text: token_text,
+                    t0_cs: data.t0,
+                    t1_cs: data.t1,
+                });
+            }
+            let words_vec = words::coalesce(&raw_tokens, t0, t1);
+
             // whisper.cpp returns time in centiseconds (10 ms units).
             #[allow(clippy::cast_precision_loss)]
             let start = (t0 as f64) / 100.0;
@@ -147,6 +196,7 @@ impl Transcriber {
                 start_seconds: start,
                 end_seconds: end,
                 text,
+                words: words_vec,
             });
         }
         Ok(out)
diff --git a/crates/dpub-whisper/src/words.rs b/crates/dpub-whisper/src/words.rs
new file mode 100644
index 0000000..79eb33a
--- /dev/null
+++ b/crates/dpub-whisper/src/words.rs
@@ -0,0 +1,299 @@
+//! BPE-token → word coalescer.
+//!
+//! whisper.cpp emits BPE tokens, not words. `"translate"` comes back as
+//! roughly `[" trans", "late"]`; punctuation arrives as its own zero-or-
+//! near-zero-duration token; special tokens like `[_BEG_]` and
+//! `<|notimestamps|>` are interleaved.
+//!
+//! This module coalesces a flat token stream back into whole words with
+//! sensible audio time ranges, suitable for driving per-word SMIL Media
+//! Overlay sync. ASCII-only string ops; deliberately no
+//! `unicode-segmentation` dep — the same stance as `text_cleanup`.
+//!
+//! Word-boundary rule: a token whose text starts with ASCII space is
+//! the start of a new word. A token without leading space, or one whose
+//! text after trimming consists solely of attaching punctuation (see
+//! `is_attaching_punct`), attaches to the current word. The first
+//! non-special token of the input always starts a new word (defends
+//! against whisper occasionally dropping the leading space at segment start).
+//!
+//! Defensive case: if the previous word ends in a sentence terminator
+//! (`. ! ? …`, optionally followed by a closing quote/bracket) and the
+//! next token's trimmed text starts with an alphabetic character, treat
+//! it as a new word even without a leading space — recovers from whisper
+//! occasionally omitting the space after sentence-final punctuation.
+
+use crate::Word;
+
+/// One whisper.cpp token, with its raw centisecond timing as returned
+/// by `whisper_full_get_token_data`. Caller is responsible for
+/// constructing these from the FFI; the coalescer takes them by slice
+/// so it stays unit-testable with literal data.
+pub(crate) struct RawToken<'a> {
+    pub text: &'a str,
+    pub t0_cs: i64,
+    pub t1_cs: i64,
+}
+
+/// Coalesce a token stream into words.
+///
+/// `seg_t0_cs` / `seg_t1_cs` are the segment's outer time bounds, also
+/// in centiseconds. Both ends of every word are clamped into
+/// `[seg_t0, seg_t1]`, so a token timed past the segment boundary can
+/// neither overlap the next segment nor end before it starts.
+pub(crate) fn coalesce(tokens: &[RawToken<'_>], seg_t0_cs: i64, seg_t1_cs: i64) -> Vec<Word> {
+    let mut out: Vec<Word> = Vec::new();
+    let mut current: Option<WordBuf> = None;
+
+    for tok in tokens {
+        // Control tokens carry no visible text.
+        if is_special(tok.text) {
+            continue;
+        }
+        let starts_with_space = tok.text.starts_with(' ');
+        let trimmed = tok.text.trim_start_matches(' ');
+        if trimmed.is_empty() {
+            // Empty or pure-whitespace token — drop.
+            continue;
+        }
+        let pure_punct = trimmed.chars().all(is_attaching_punct);
+
+        let starts_new = current.is_none()
+            || (starts_with_space && !pure_punct)
+            || (!pure_punct
+                && trimmed.chars().next().is_some_and(char::is_alphabetic)
+                && current.as_ref().is_some_and(WordBuf::ends_in_terminator));
+
+        if starts_new {
+            if let Some(w) = current.take() {
+                out.push(w.finish());
+            }
+            current = Some(WordBuf::start(trimmed, tok.t0_cs, tok.t1_cs));
+        } else {
+            current
+                .as_mut()
+                .expect("starts_new is false implies current is Some")
+                .extend(trimmed, tok.t1_cs);
+        }
+    }
+    if let Some(w) = current {
+        out.push(w.finish());
+    }
+
+    let seg_lo = cs_to_seconds(seg_t0_cs);
+    let seg_hi = cs_to_seconds(seg_t1_cs).max(seg_lo);
+    for w in &mut out {
+        w.start_seconds = w.start_seconds.clamp(seg_lo, seg_hi);
+        w.end_seconds = w.end_seconds.clamp(seg_lo, seg_hi);
+        if w.end_seconds <= w.start_seconds {
+            // Whisper sometimes emits tokens with t1 == t0; give the
+            // word a 50 ms minimum so reading systems can animate it.
+            w.end_seconds = (w.start_seconds + 0.05).min(seg_hi);
+        }
+        if w.end_seconds <= w.start_seconds {
+            // No room after the start: take the 50 ms before the end.
+            w.start_seconds = (w.end_seconds - 0.05).max(seg_lo);
+        }
+    }
+
+    out
+}
+
+/// Convert whisper.cpp centiseconds (10 ms ticks) to seconds.
+#[allow(clippy::cast_precision_loss)]
+fn cs_to_seconds(cs: i64) -> f64 {
+    let ticks = cs as f64;
+    ticks / 100.0
+}
+
+/// Detect whisper.cpp control tokens (`[_BEG_]`, `<|notimestamps|>`,
+/// `<|0.00|>`, …) by their first character: an opening `[` or `<`.
+fn is_special(text: &str) -> bool {
+    matches!(text.as_bytes().first(), Some(b'[' | b'<'))
+}
+
+/// Is `c` a punctuation char — ASCII or typographic (`…`, `»`, curly
+/// quotes) — that attaches to the previous word instead of starting a new
+/// one? Whitespace is *not* attaching; it sits between the rendered spans.
+fn is_attaching_punct(c: char) -> bool {
+    matches!(
+        c,
+        '.' | ','
+            | '!'
+            | '?'
+            | ';'
+            | ':'
+            | '…'
+            | ')'
+            | ']'
+            | '}'
+            | '"'
+            | '\''
+            | '»'
+            | '“'
+            | '”'
+            | '‘'
+            | '’'
+    )
+}
+
+/// Sentence-terminal characters, as consulted by the defensive
+/// "no-leading-space-after-terminator" branch of the coalescer.
+fn is_terminator(c: char) -> bool {
+    ['.', '!', '?', '…'].contains(&c)
+}
+
+struct WordBuf {
+    text: String,
+    t0_cs: i64,
+    t1_cs: i64,
+}
+
+impl WordBuf {
+    /// Open a new word from its first token (text already trimmed).
+    fn start(text: &str, t0_cs: i64, t1_cs: i64) -> Self {
+        Self {
+            text: text.to_owned(),
+            t0_cs,
+            t1_cs,
+        }
+    }
+
+    /// Append a continuation token. The end time only ever moves
+    /// forward, so an out-of-order `t1` cannot shrink the word.
+    fn extend(&mut self, text: &str, t1_cs: i64) {
+        self.text.push_str(text);
+        self.t1_cs = self.t1_cs.max(t1_cs);
+    }
+
+    /// Strip trailing closing quotes/brackets and check whether the
+    /// remaining last char is sentence-terminal. Besides the ASCII
+    /// closers, `}`, `»`, `”` and `’` are stripped, since
+    /// `is_attaching_punct` can glue those onto a word too.
+    fn ends_in_terminator(&self) -> bool {
+        let closers = ['"', '\'', ')', ']', '}', '»', '”', '’'];
+        self.text.trim_end_matches(closers).ends_with(is_terminator)
+    }
+
+    /// Convert the accumulated centisecond range into a [`Word`].
+    fn finish(self) -> Word {
+        Word {
+            start_seconds: cs_to_seconds(self.t0_cs),
+            end_seconds: cs_to_seconds(self.t1_cs),
+            text: self.text,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn tok(text: &str, t0_cs: i64, t1_cs: i64) -> RawToken<'_> {
+        RawToken { text, t0_cs, t1_cs }
+    }
+
+    #[test]
+    fn coalesces_simple_sentence() {
+        // "Het was een test." emitted with leading-space tokens.
+        let tokens = vec![
+            tok(" Het", 0, 30),
+            tok(" was", 30, 60),
+            tok(" een", 60, 90),
+            tok(" test", 90, 130),
+            tok(".", 130, 130),
+        ];
+        let out = coalesce(&tokens, 0, 200);
+        assert_eq!(out.len(), 4);
+        assert_eq!(out[0].text, "Het");
+        assert_eq!(out[1].text, "was");
+        assert_eq!(out[2].text, "een");
+        assert_eq!(out[3].text, "test.");
+        assert!((out[3].start_seconds - 0.90).abs() < 1e-9);
+        // The "." token had t0==t1==130; the host word's end is the
+        // attached punctuation's t1 (still 130 = 1.30s).
+        assert!(out[3].end_seconds >= 1.30);
+    }
+
+    #[test]
+    fn coalesces_subword_pieces() {
+        // " trans" + "late" → "translate" (one word, span first.t0 → last.t1)
+        let tokens = vec![tok(" trans", 100, 130), tok("late", 130, 180)];
+        let out = coalesce(&tokens, 0, 200);
+        assert_eq!(out.len(), 1);
+        assert_eq!(out[0].text, "translate");
+        assert!((out[0].start_seconds - 1.00).abs() < 1e-9);
+        assert!((out[0].end_seconds - 1.80).abs() < 1e-9);
+    }
+
+    #[test]
+    fn drops_special_tokens() {
+        let tokens = vec![
+            tok("[_BEG_]", 0, 0),
+            tok(" hi", 10, 30),
+            tok("<|notimestamps|>", 30, 30),
+            tok(" there", 30, 60),
+        ];
+        let out = coalesce(&tokens, 0, 100);
+        assert_eq!(out.len(), 2);
+        assert_eq!(out[0].text, "hi");
+        assert_eq!(out[1].text, "there");
+    }
+
+    #[test]
+    fn clamps_zero_duration_to_50ms_minimum() {
+        // A token whose t0 == t1, attached as the only token of a word.
+        let tokens = vec![tok(" ok", 50, 50)];
+        let out = coalesce(&tokens, 0, 200);
+        assert_eq!(out.len(), 1);
+        assert!((out[0].start_seconds - 0.50).abs() < 1e-9);
+        assert!(
+            out[0].end_seconds >= out[0].start_seconds + 0.04,
+            "expected ≥50 ms duration, got start={} end={}",
+            out[0].start_seconds,
+            out[0].end_seconds,
+        );
+    }
+
+    #[test]
+    fn clamps_to_segment_bounds() {
+        // Token t1 runs past segment t1; should clamp.
+        let tokens = vec![tok(" word", 50, 250)];
+        let out = coalesce(&tokens, 0, 200);
+        assert_eq!(out.len(), 1);
+        assert!(
+            (out[0].end_seconds - 2.00).abs() < 1e-9,
+            "expected clamp to segment t1 (2.00s), got {}",
+            out[0].end_seconds,
+        );
+    }
+
+    #[test]
+    fn handles_no_leading_space_after_punctuation() {
+        // " Hi" + "." + "Bye" — the third token has no leading space
+        // but the previous word ends in a terminator and the new token
+        // is alphabetic, so it should start a new word.
+        let tokens = vec![tok(" Hi", 0, 30), tok(".", 30, 30), tok("Bye", 40, 80)];
+        let out = coalesce(&tokens, 0, 100);
+        assert_eq!(out.len(), 2);
+        assert_eq!(out[0].text, "Hi.");
+        assert_eq!(out[1].text, "Bye");
+    }
+
+    #[test]
+    fn first_token_without_leading_space_still_starts_a_word() {
+        // Whisper sometimes drops the segment-initial space.
+        let tokens = vec![tok("Hello", 10, 50), tok(" world", 50, 90)];
+        let out = coalesce(&tokens, 0, 100);
+        assert_eq!(out.len(), 2);
+        assert_eq!(out[0].text, "Hello");
+        assert_eq!(out[1].text, "world");
+    }
+
+    #[test]
+    fn empty_input_yields_empty_output() {
+        let out = coalesce(&[], 0, 100);
+        assert!(out.is_empty());
+    }
+}
diff --git a/crates/dpub-whisper/tests/smoke.rs b/crates/dpub-whisper/tests/smoke.rs
index 83d69b4..b8c8601 100644
--- a/crates/dpub-whisper/tests/smoke.rs
+++ b/crates/dpub-whisper/tests/smoke.rs
@@ -39,7 +39,24 @@ fn transcribes_when_model_and_audio_are_provided() {
             "  [{:>6.2}s – {:>6.2}s] {}",
             s.start_seconds, s.end_seconds, s.text
         );
+        for w in s.words.iter().take(8) {
+            eprintln!(
+                "      [{:>6.2}s – {:>6.2}s] {}",
+                w.start_seconds, w.end_seconds, w.text
+            );
+        }
     }
     // Allow zero segments for pure silence/sine input — that's not a bug,
     // it's whisper correctly recognising "no speech".
+    // For non-empty segments, the per-word coalescer should always
+    // produce at least one word.
+    for s in &segments {
+        if !s.text.is_empty() {
+            assert!(
+                !s.words.is_empty(),
+                "segment with non-empty text {:?} has empty words",
+                s.text,
+            );
+        }
+    }
 }
diff --git a/crates/epub3-writer/tests/minimal_book.rs b/crates/epub3-writer/tests/minimal_book.rs
index fad8ac0..c3fb09e 100644
--- a/crates/epub3-writer/tests/minimal_book.rs
+++ b/crates/epub3-writer/tests/minimal_book.rs
@@ -296,6 +296,113 @@ fn epubcheck_clean_with_cover_when_available() {
     );
 }
 
+/// Build a minimal publication whose Media Overlay uses **word-level**
+/// sync — nested `<seq epub:textref="...#tx-...">` per paragraph
+/// wrapping per-word `<par>` entries. Mirrors what `dpub-convert`
+/// produces with M6.5 word-level sync enabled. Asserts EPUBCheck stays
+/// clean for the new structure.
+#[test]
+fn epubcheck_clean_with_word_level_overlay_when_available() {
+    let Ok(epubcheck) = which("epubcheck") else {
+        eprintln!("epubcheck not on PATH — skipping");
+        return;
+    };
+
+    let dir = tempfile::tempdir().expect("tempdir");
+    let audio_path = dir.path().join("tiny.mp3");
+    std::fs::write(&audio_path, TINY_MP3).expect("write audio fixture");
+
+    let title = "Word-level sync";
+    // Two-word paragraph anchored as `tx-000-000`. The XHTML body
+    // wraps each word in a `<span id="w-...">` so the overlay's
+    // `<text src="...#w-...">` resolves.
+    let body = r#"<h1 id="h1">Word-level sync</h1>
+  <p id="tx-000-000"><span id="w-000-000-000">Hallo</span> <span id="w-000-000-001">wereld.</span></p>"#;
+    let publication = Publication {
+        metadata: PackageMetadata {
+            identifier: "urn:uuid:00000000-0000-4000-8000-000000000002".into(),
+            title: title.into(),
+            language: "nl".into(),
+            modified: "2026-05-06T00:00:00Z".into(),
+            duration_seconds: Some(0.6),
+            access_modes: vec![AccessMode::Auditory, AccessMode::Textual],
+            ..Default::default()
+        },
+        nav: Nav {
+            toc: vec![NavListItem {
+                label: title.into(),
+                href: "section-001.xhtml".into(),
+                children: vec![],
+            }],
+            page_list: None,
+        },
+        sections: vec![SectionPart {
+            id: "section-001".into(),
+            content: ContentDocument {
+                href: "section-001.xhtml".into(),
+                title: title.into(),
+                language: "nl".into(),
+                body_xhtml: body.into(),
+            },
+            overlay: Some(MediaOverlay {
+                href: "media-overlays/section-001.smil".into(),
+                duration_seconds: 0.6,
+                root: OverlaySeq {
+                    textref: Some("../section-001.xhtml".into()),
+                    children: vec![OverlayItem::Seq(OverlaySeq {
+                        textref: Some("../section-001.xhtml#tx-000-000".into()),
+                        children: vec![
+                            OverlayItem::Par(OverlayPar {
+                                id: Some("w-000-000-000".into()),
+                                text_src: "../section-001.xhtml#w-000-000-000".into(),
+                                audio_src: "../audio/tiny.mp3".into(),
+                                clip_begin_seconds: 0.0,
+                                clip_end_seconds: 0.3,
+                            }),
+                            OverlayItem::Par(OverlayPar {
+                                id: Some("w-000-000-001".into()),
+                                text_src: "../section-001.xhtml#w-000-000-001".into(),
+                                audio_src: "../audio/tiny.mp3".into(),
+                                clip_begin_seconds: 0.3,
+                                clip_end_seconds: 0.6,
+                            }),
+                        ],
+                    })],
+                },
+            }),
+        }],
+        audio_files: vec![AudioFile {
+            id: "audio-tiny".into(),
+            href: "audio/tiny.mp3".into(),
+            source_path: audio_path.clone(),
+            media_type: "audio/mpeg".into(),
+        }],
+        cover: None,
+    };
+
+    let epub_path = dir.path().join("word-sync.epub");
+    let mut out = File::create(&epub_path).expect("create");
+    publication.write_zip(&mut out).expect("write");
+
+    let output = Command::new(epubcheck)
+        .arg(&epub_path)
+        .output()
+        .expect("run epubcheck");
+    let combined = format!(
+        "{}\n{}",
+        String::from_utf8_lossy(&output.stdout),
+        String::from_utf8_lossy(&output.stderr),
+    );
+    assert!(
+        output.status.success(),
+        "epubcheck reported errors on word-level overlay:\n{combined}",
+    );
+    assert!(
+        !combined.contains("WARNING"),
+        "epubcheck emitted warnings on word-level overlay:\n{combined}",
+    );
+}
+
 fn which(name: &str) -> std::io::Result<std::path::PathBuf> {
     let path = std::env::var_os("PATH").unwrap_or_default();
     for entry in std::env::split_paths(&path) {