diff --git a/CHANGELOG.md b/CHANGELOG.md index a967665..177d224 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,11 @@ All notable changes to this project will be documented in this file. The format ## [Unreleased] +### Added + +- **Word-level Media Overlay sync** (M6.5). When `--transcribe` runs and the cleanup path is active, dpub now extracts per-token timestamps from whisper.cpp, coalesces BPE pieces back into whole words via a leading-space rule (with punctuation attachment and degenerate-timing clamping), wraps each word in a `<span>` inside the cleaned `<p>`, and emits one SMIL `<par>` per word — wrapped in nested `<seq>` per paragraph. The result is karaoke-style highlight-along-with-audio in compatible reading systems (Thorium, Readium). Default-on; pass `--no-word-sync` to fall back to per-paragraph sync. Workspace EPUBCheck assertions extended to gate the new overlay shape; reference book stays 0/0/0. +- `dpub-whisper` exposes a public `Word { start_seconds, end_seconds, text }` struct and `Segment.words: Vec<Word>` populated by the new BPE coalescer (`crates/dpub-whisper/src/words.rs`). Eight unit tests cover the BPE coalescing rules. + ## [0.5.0] - 2026-05-06 First tagged release. Feature-complete for the v1 candidate: DAISY 2.02 → EPUB 3 conversion with Media Overlays, EPUBCheck-clean output, ACE accessibility validation, MP3 → Opus audio recompression, local Whisper transcription with prose-shaped paragraph cleanup, automatic and explicit cover lookup, parallel batch conversion, JSON output for CI/pipeline use. No API stability commitment yet — that comes with 1.0. diff --git a/README.md b/README.md index 74e2a70..706c35e 100644 --- a/README.md +++ b/README.md @@ -113,6 +113,7 @@ Walks the input directory for every `ncc.html`, converts each book in parallel v | **M4** | Built-in validation (EPUBCheck + ACE) — `dpub validate`, `dpub a11y`. ✅ | | **M5** | Audio recompression (MP3 → Opus) — `dpub convert --audio opus --bitrate `. ✅ | | **M6** | Whisper transcription for audio-only books — `dpub convert --transcribe --whisper-model `. ✅ (segments are merged into prose-shaped paragraphs by default; pass `--no-text-cleanup` for raw output) | +| **M6.5** | Word-level Media Overlay sync — karaoke-style highlight-along-with-audio in reading systems that honour Media Overlays. Default-on with `--transcribe`; pass `--no-word-sync` to fall back to per-paragraph sync. ✅ | | **Tier 1 polish** | Whisper model caching, cover lookup (`--cover` and `--auto-cover`), parallel batch mode, JSON output for validators. 
✅ | | **M7** | WASM build for browser-based conversion (planned scope: `info` + `validate` only — Whisper / ffmpeg are too heavy for a browser tab). | | **M8** | 1.0 release: signed binaries for macOS / Linux / Windows. | diff --git a/crates/dpub-cli/src/main.rs b/crates/dpub-cli/src/main.rs index 732da8f..8195717 100644 --- a/crates/dpub-cli/src/main.rs +++ b/crates/dpub-cli/src/main.rs @@ -52,6 +52,14 @@ enum Command { /// output; not recommended for distribution. #[arg(long)] no_text_cleanup: bool, + /// Skip per-word Media Overlay sync. Word-level sync (the + /// default for transcribed books) drives karaoke-style + /// highlight-along-with-audio in compatible reading systems + /// (Thorium, Readium). Pass this flag to fall back to + /// per-paragraph sync — produces a smaller SMIL at the cost + /// of a coarser reading experience. + #[arg(long)] + no_word_sync: bool, /// Path to a JPEG or PNG image to embed as the EPUB cover. #[arg(long, value_name = "PATH", conflicts_with = "auto_cover")] cover: Option, @@ -150,6 +158,7 @@ fn main() -> Result<()> { transcribe, whisper_model, no_text_cleanup, + no_word_sync, cover, auto_cover, rights, @@ -163,6 +172,7 @@ fn main() -> Result<()> { transcribe, whisper_model, no_text_cleanup, + no_word_sync, cover, auto_cover, rights, @@ -190,6 +200,7 @@ fn cmd_convert( transcribe: Option, whisper_model: Option, no_text_cleanup: bool, + no_word_sync: bool, cover: Option, auto_cover: bool, rights: Option, @@ -255,6 +266,7 @@ fn cmd_convert( cover, auto_cover, rights, + no_word_sync, }; let start = std::time::Instant::now(); dpub_convert::convert_to_file(&book, output, &opts) @@ -545,6 +557,7 @@ fn cmd_batch( cover: None, auto_cover: false, rights: None, + no_word_sync: false, }; let start = std::time::Instant::now(); let entries: Vec = books diff --git a/crates/dpub-convert/src/lib.rs b/crates/dpub-convert/src/lib.rs index 21d7852..138f130 100644 --- a/crates/dpub-convert/src/lib.rs +++ b/crates/dpub-convert/src/lib.rs @@ -564,6 
+564,13 @@ pub struct ConvertOptions { /// is the simplest way to assert rights when the source /// doesn't carry one. pub rights: Option, + /// When `true`, skip per-word Media Overlay sync and fall back + /// to per-paragraph sync. Default `false` — the cleanup path + /// emits one SMIL `` per word, anchored to per-word + /// ``s in the content XHTML, for karaoke-style + /// highlight-along-with-audio. Set this to keep SMIL files + /// small at the cost of a coarser reading experience. + pub no_word_sync: bool, } /// Convert and write a DAISY 2.02 publication to an EPUB 3 file in one call. @@ -596,6 +603,7 @@ pub fn convert_to_file(book: &Book, output: &Path, opts: &ConvertOptions) -> Res &mut publication, transcribe, opts.raw_transcript_segments, + opts.no_word_sync, )?; } @@ -713,6 +721,7 @@ fn inject_transcripts( publication: &mut Publication, opts: &TranscribeOptions, raw_segments: bool, + no_word_sync: bool, ) -> Result<()> { let whisper_opts = dpub_whisper::TranscribeOptions { model_path: opts.model_path.clone(), @@ -746,8 +755,11 @@ fn inject_transcripts( cache.insert(audio_basename.clone(), segments); } - // Collect the in-range segments in document order. + // Collect the in-range segments in document order, paired with + // the audio basename each came from so the cleanup state machine + // can flush at audio-file boundaries. 
let mut section_segments: Vec = Vec::new(); + let mut section_audio_srcs: Vec = Vec::new(); for (audio_basename, t0, t1) in &audio_ranges { let Some(segments) = cache.get(audio_basename) else { continue; @@ -756,6 +768,7 @@ fn inject_transcripts( let mid = (seg.start_seconds + seg.end_seconds) * 0.5; if mid >= *t0 && mid <= *t1 && !seg.text.is_empty() { section_segments.push(seg.clone()); + section_audio_srcs.push(audio_basename.clone()); } } } @@ -765,9 +778,27 @@ fn inject_transcripts( } else { let cleaned = text_cleanup::merge_into_paragraphs( §ion_segments, + §ion_audio_srcs, &text_cleanup::CleanupOpts::default(), ); - render_cleaned_paragraphs(idx, &cleaned) + let html = render_cleaned_paragraphs(idx, &cleaned); + // Word-level Media Overlay sync: rebuild this section's + // overlay from the cleaned paragraphs, replacing the + // heading-level shell that build_sections constructed. + // The new overlay drives karaoke-style highlighting in + // reading systems (Thorium etc.) that honour Media Overlays. + if !no_word_sync + && cleaned.iter().any(|p| !p.words.is_empty()) + && let Some(overlay) = section_part.overlay.as_mut() + { + let new_root = build_word_overlay_seq( + §ion_part.content.href, + idx, + &cleaned, + ); + overlay.root = new_root; + } + html }; if !new_paragraphs.is_empty() { section_part.content.body_xhtml.push_str(&new_paragraphs); @@ -791,19 +822,92 @@ fn render_raw_paragraphs(section_idx: usize, segments: &[dpub_whisper::Segment]) out } +/// Build a fresh `OverlaySeq` for a section from the cleaned paragraphs +/// produced by `text_cleanup::merge_into_paragraphs`. +/// +/// Shape: an outer `` wrapping +/// one inner `` per paragraph, each +/// wrapping one `` per word. This gives reading +/// systems (Thorium, Readium) a structural place to scope highlight to +/// "current paragraph" while still tracking the spoken word. +/// +/// The `content_href` is EPUB-relative to the OPF (e.g. 
+/// `content/section-001.xhtml`); SMIL lives in `media-overlays/` so +/// every emitted `src` starts with `../`. +fn build_word_overlay_seq( + content_href: &str, + section_idx: usize, + paragraphs: &[text_cleanup::Paragraph], +) -> OverlaySeq { + let mut top_children: Vec = Vec::with_capacity(paragraphs.len()); + for (para_idx, para) in paragraphs.iter().enumerate() { + if para.words.is_empty() { + continue; + } + let para_anchor = format!("tx-{section_idx:03}-{para_idx:03}"); + let para_textref = format!("../{content_href}#{para_anchor}"); + let audio_src = format!("../audio/{}", para.audio_src); + + let mut word_children: Vec = Vec::with_capacity(para.words.len()); + for (word_idx, word) in para.words.iter().enumerate() { + let word_id = format!("w-{section_idx:03}-{para_idx:03}-{word_idx:03}"); + word_children.push(OverlayItem::Par(OverlayPar { + id: Some(word_id.clone()), + text_src: format!("../{content_href}#{word_id}"), + audio_src: audio_src.clone(), + clip_begin_seconds: word.start_seconds, + clip_end_seconds: word.end_seconds, + })); + } + top_children.push(OverlayItem::Seq(OverlaySeq { + textref: Some(para_textref), + children: word_children, + })); + } + OverlaySeq { + textref: Some(format!("../{content_href}")), + children: top_children, + } +} + fn render_cleaned_paragraphs( section_idx: usize, paragraphs: &[text_cleanup::Paragraph], ) -> String { let mut out = String::new(); for (para_idx, para) in paragraphs.iter().enumerate() { + if para.words.is_empty() { + // Whisper run with no per-word data (or test fixture + // without words). Fall back to a flat

with the + // paragraph text. + let _ = std::fmt::Write::write_fmt( + &mut out, + format_args!( + "

{}

\n", + escape_text(¶.text) + ), + ); + continue; + } + // Emit one per word, separated by single ASCII + // spaces so reading systems render natural word spacing. let _ = std::fmt::Write::write_fmt( &mut out, - format_args!( - "

{}

\n", - escape_text(¶.text) - ), + format_args!("

"), ); + for (word_idx, word) in para.words.iter().enumerate() { + if word_idx > 0 { + out.push(' '); + } + let _ = std::fmt::Write::write_fmt( + &mut out, + format_args!( + "{}", + escape_text(&word.text) + ), + ); + } + out.push_str("

\n"); } out } @@ -948,6 +1052,169 @@ mod tests { assert_eq!(media_type_for(""), "application/octet-stream"); } + #[test] + fn render_cleaned_paragraphs_emits_word_spans_when_words_present() { + let para = text_cleanup::Paragraph { + start_seconds: 0.0, + end_seconds: 1.5, + text: "Hallo wereld.".into(), + words: vec![ + dpub_whisper::Word { + start_seconds: 0.0, + end_seconds: 0.5, + text: "Hallo".into(), + }, + dpub_whisper::Word { + start_seconds: 0.5, + end_seconds: 1.5, + text: "wereld.".into(), + }, + ], + audio_src: "a.mp3".into(), + }; + let html = render_cleaned_paragraphs(7, &[para]); + assert!( + html.contains(r#"

"#), + "missing paragraph id: {html}" + ); + assert!( + html.contains(r#"Hallo"#), + "missing first word span: {html}" + ); + assert!( + html.contains(r#"wereld."#), + "missing second word span: {html}" + ); + // Single ASCII space between word spans. + assert!( + html.contains(" Hallo wereld.

"#), + "expected flat

fallback, got: {html}" + ); + assert!(!html.contains(", got {:?}", root.children[0]); + }; + assert_eq!( + para_seq.textref.as_deref(), + Some("../content/sec.xhtml#tx-007-000") + ); + assert_eq!(para_seq.children.len(), 2); + + let OverlayItem::Par(first_par) = ¶_seq.children[0] else { + panic!("expected per-word , got {:?}", para_seq.children[0]); + }; + assert_eq!(first_par.id.as_deref(), Some("w-007-000-000")); + assert_eq!(first_par.text_src, "../content/sec.xhtml#w-007-000-000"); + assert_eq!(first_par.audio_src, "../audio/07_Inleiding.mp3"); + assert!((first_par.clip_begin_seconds - 0.0).abs() < 1e-9); + assert!((first_par.clip_end_seconds - 0.5).abs() < 1e-9); + + let OverlayItem::Par(second_par) = ¶_seq.children[1] else { + panic!("expected per-word , got {:?}", para_seq.children[1]); + }; + assert_eq!(second_par.id.as_deref(), Some("w-007-000-001")); + assert!((second_par.clip_end_seconds - 2.0).abs() < 1e-9); + } + + #[test] + fn build_word_overlay_seq_skips_paragraphs_without_words() { + let p1 = text_cleanup::Paragraph { + start_seconds: 0.0, + end_seconds: 1.0, + text: "no words here".into(), + words: vec![], + audio_src: "a.mp3".into(), + }; + let p2 = text_cleanup::Paragraph { + start_seconds: 1.0, + end_seconds: 2.0, + text: "Hi".into(), + words: vec![dpub_whisper::Word { + start_seconds: 1.0, + end_seconds: 2.0, + text: "Hi".into(), + }], + audio_src: "a.mp3".into(), + }; + let root = build_word_overlay_seq("content/x.xhtml", 0, &[p1, p2]); + // Paragraph 0 had no words and was skipped; paragraph 1 produced + // one inner . 
+ assert_eq!(root.children.len(), 1); + let OverlayItem::Seq(inner) = &root.children[0] else { + panic!("expected one inner seq"); + }; + assert_eq!( + inner.textref.as_deref(), + Some("../content/x.xhtml#tx-000-001") + ); + } + + #[test] + fn render_cleaned_paragraphs_escapes_word_text() { + let para = text_cleanup::Paragraph { + start_seconds: 0.0, + end_seconds: 0.5, + text: r#"a, + pub audio_src: String, } /// Tunable bounds for [`merge_into_paragraphs`]. See module docs for the @@ -49,19 +62,42 @@ impl Default for CleanupOpts { /// sentence terminator AND the paragraph has accumulated enough sentences /// or characters; force-flush at the upper caps so a hallucinated run /// without punctuation can't grow unbounded. -pub(crate) fn merge_into_paragraphs(segments: &[Segment], opts: &CleanupOpts) -> Vec { +/// +/// `audio_srcs` is a parallel slice giving the audio basename for each +/// segment. The cleanup state machine flushes the current paragraph +/// when the audio file changes, preserving the invariant that every +/// `Paragraph.words[i]` came from the same audio file. +pub(crate) fn merge_into_paragraphs( + segments: &[Segment], + audio_srcs: &[String], + opts: &CleanupOpts, +) -> Vec { + debug_assert_eq!( + segments.len(), + audio_srcs.len(), + "segments and audio_srcs must be parallel slices", + ); + let mut out = Vec::new(); let mut current = Builder::default(); - for seg in segments { + for (seg, audio_src) in segments.iter().zip(audio_srcs.iter()) { let text = seg.text.trim(); if text.is_empty() { continue; } + // Audio-file boundary forces a flush so the resulting paragraph + // doesn't span two audio files (would break per-word SMIL since + // each `` carries one `

Word-level sync

+

Hallo wereld.

"#; + let publication = Publication { + metadata: PackageMetadata { + identifier: "urn:uuid:00000000-0000-4000-8000-000000000002".into(), + title: title.into(), + language: "nl".into(), + modified: "2026-05-06T00:00:00Z".into(), + duration_seconds: Some(0.6), + access_modes: vec![AccessMode::Auditory, AccessMode::Textual], + ..Default::default() + }, + nav: Nav { + toc: vec![NavListItem { + label: title.into(), + href: "section-001.xhtml".into(), + children: vec![], + }], + page_list: None, + }, + sections: vec![SectionPart { + id: "section-001".into(), + content: ContentDocument { + href: "section-001.xhtml".into(), + title: title.into(), + language: "nl".into(), + body_xhtml: body.into(), + }, + overlay: Some(MediaOverlay { + href: "media-overlays/section-001.smil".into(), + duration_seconds: 0.6, + root: OverlaySeq { + textref: Some("../section-001.xhtml".into()), + children: vec![OverlayItem::Seq(OverlaySeq { + textref: Some("../section-001.xhtml#tx-000-000".into()), + children: vec![ + OverlayItem::Par(OverlayPar { + id: Some("w-000-000-000".into()), + text_src: "../section-001.xhtml#w-000-000-000".into(), + audio_src: "../audio/tiny.mp3".into(), + clip_begin_seconds: 0.0, + clip_end_seconds: 0.3, + }), + OverlayItem::Par(OverlayPar { + id: Some("w-000-000-001".into()), + text_src: "../section-001.xhtml#w-000-000-001".into(), + audio_src: "../audio/tiny.mp3".into(), + clip_begin_seconds: 0.3, + clip_end_seconds: 0.6, + }), + ], + })], + }, + }), + }], + audio_files: vec![AudioFile { + id: "audio-tiny".into(), + href: "audio/tiny.mp3".into(), + source_path: audio_path.clone(), + media_type: "audio/mpeg".into(), + }], + cover: None, + }; + + let epub_path = dir.path().join("word-sync.epub"); + let mut out = File::create(&epub_path).expect("create"); + publication.write_zip(&mut out).expect("write"); + + let output = Command::new(epubcheck) + .arg(&epub_path) + .output() + .expect("run epubcheck"); + let combined = format!( + "{}\n{}", + 
String::from_utf8_lossy(&output.stdout), + String::from_utf8_lossy(&output.stderr), + ); + assert!( + output.status.success(), + "epubcheck reported errors on word-level overlay:\n{combined}", + ); + assert!( + !combined.contains("WARNING"), + "epubcheck emitted warnings on word-level overlay:\n{combined}", + ); +} + fn which(name: &str) -> std::io::Result { let path = std::env::var_os("PATH").unwrap_or_default(); for entry in std::env::split_paths(&path) {