11ways · roelvangils · May 5, 2026 · May 5, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,7 @@ All notable changes to this project will be documented in this file. The format
 
 ### Added
 
+- Whisper transcripts are now post-processed into prose-shaped paragraphs (~3–6 sentences each) before being injected into the EPUB content XHTML documents, instead of one `<p>` per Whisper segment. The merge is a single-pass greedy state machine with sentence-terminator detection, decimal-number / Dutch-abbreviation false-positive guards, and a max-character safety valve for hallucinated unpunctuated runs. Each cleaned paragraph carries a stable `id="tx-<section>-<para>"` so a future per-paragraph Media Overlay sync milestone can reference it without re-rendering the XHTML. Pass `--no-text-cleanup` to keep the raw per-segment output for debugging.
 - `dpub-cli` and `dpub-convert` now expose `metal` and `cuda` Cargo features that forward to `dpub-whisper`. Build with `cargo build --release -p dpub-cli --features metal` on Apple Silicon to GPU-accelerate `--transcribe` runs (5–10× faster against medium / large-v3 models). Off by default so CI and no-GPU builds stay working.
 - `dpub info` and `dpub convert` now accept either an `ncc.html` file or the directory containing it. Spec-mandated `ncc.html` is tried first; legacy uppercase variants (`NCC.HTML`) resolve via a case-insensitive directory scan. Missing-NCC directories produce a clear error instead of `EISDIR`.
 - In-tree synthetic DAISY 2.02 fixture at `crates/dpub-convert/tests/fixtures/minimal_daisy/` (~10 KB total: NCC, master.smil, one section SMIL, one tiny MP3). Three integration tests exercise the full parse → convert → ZIP pipeline against it on every `cargo test` run, including CI. The optional EPUBCheck assertion fires when `epubcheck` is on PATH.

diff --git a/README.md b/README.md
@@ -59,7 +59,7 @@ SMIL:
 | **M3** | End-to-end: `dpub convert <ncc.html> -o out.epub`. ✅ |
 | **M4** | Built-in validation (EPUBCheck + ACE) — `dpub validate`. ✅ (EPUBCheck only; ACE deferred) |
 | **M5** | Audio recompression (MP3 → Opus) — `dpub convert --audio opus --bitrate <kbps>`. ✅ |
-| **M6** | Whisper transcription for audio-only books — `dpub convert --transcribe <lang> --whisper-model <path>`. ✅ |
+| **M6** | Whisper transcription for audio-only books — `dpub convert --transcribe <lang> --whisper-model <path>`. ✅ (segments are merged into prose-shaped paragraphs by default; pass `--no-text-cleanup` for raw output) |
 | **M7** | WASM build for browser-based conversion. |
 | **M8** | 1.0 release: macOS / Linux / Windows binaries. |
 

diff --git a/crates/dpub-cli/src/main.rs b/crates/dpub-cli/src/main.rs
@@ -43,6 +43,11 @@ enum Command {
         /// `--transcribe`.
         #[arg(long)]
         whisper_model: Option<PathBuf>,
+        /// Emit one `<p>` per Whisper segment instead of merging into
+        /// prose-shaped paragraphs. Useful for debugging the raw model
+        /// output; not recommended for distribution.
+        #[arg(long)]
+        no_text_cleanup: bool,
     },
     /// Validate an existing EPUB 3 publication with EPUBCheck.
     Validate {
@@ -89,6 +94,7 @@ fn main() -> Result<()> {
             bitrate,
             transcribe,
             whisper_model,
+            no_text_cleanup,
         } => cmd_convert(
             &ncc,
             &output,
@@ -97,11 +103,13 @@ fn main() -> Result<()> {
             bitrate,
             transcribe,
             whisper_model,
+            no_text_cleanup,
         ),
         Command::Validate { epub } => cmd_validate(&epub),
     }
 }
 
+#[allow(clippy::too_many_arguments)]
 fn cmd_convert(
     ncc: &std::path::Path,
     output: &std::path::Path,
@@ -110,6 +118,7 @@ fn cmd_convert(
     bitrate_kbps: u32,
     transcribe: Option<String>,
     whisper_model: Option<PathBuf>,
+    no_text_cleanup: bool,
 ) -> Result<()> {
     let ncc = resolve_ncc_path(ncc)?;
     let book = Book::from_ncc(&ncc).with_context(|| format!("loading {}", ncc.display()))?;
@@ -159,6 +168,7 @@ fn cmd_convert(
     let opts = dpub_convert::ConvertOptions {
         audio: audio.into_format(bitrate_kbps),
         transcribe: transcribe_opts,
+        raw_transcript_segments: no_text_cleanup,
     };
     let start = std::time::Instant::now();
     dpub_convert::convert_to_file(&book, output, &opts)

diff --git a/crates/dpub-convert/src/lib.rs b/crates/dpub-convert/src/lib.rs
@@ -19,6 +19,7 @@ use epub3_writer::{
 use rayon::prelude::*;
 
 mod error;
+mod text_cleanup;
 pub use error::{Error, Result};
 
 /// Convert a parsed DAISY 2.02 [`Book`] into an EPUB 3 [`Publication`].
@@ -543,6 +544,10 @@ pub struct TranscribeOptions {
 pub struct ConvertOptions {
     pub audio: AudioFormat,
     pub transcribe: Option<TranscribeOptions>,
+    /// When `true`, transcribed segments are emitted one `<p>` per
+    /// Whisper segment. Default `false` — segments are merged into
+    /// prose-shaped paragraphs of ~3–6 sentences each.
+    pub raw_transcript_segments: bool,
 }
 
 /// Convert and write a DAISY 2.02 publication to an EPUB 3 file in one call.
@@ -560,7 +565,12 @@ pub fn convert_to_file(book: &Book, output: &Path, opts: &ConvertOptions) -> Res
     // original (typically MP3) bytes, not a lossy Opus pass that throws away
     // information whisper.cpp's frontend re-discards anyway.
     if let Some(transcribe) = &opts.transcribe {
-        inject_transcripts(book, &mut publication, transcribe)?;
+        inject_transcripts(
+            book,
+            &mut publication,
+            transcribe,
+            opts.raw_transcript_segments,
+        )?;
     }
 
     // Recompression has to happen *before* the ZIP write because the writer
@@ -590,17 +600,21 @@ pub fn convert_to_file(book: &Book, output: &Path, opts: &ConvertOptions) -> Res
 
 /// For every section, find the audio files referenced by the section's
 /// Media Overlay, transcribe each of them (caching across sections that
-/// share an audio file), and append the time-ordered transcript as a flat
-/// list of `<p>` paragraphs to the section's content XHTML.
+/// share an audio file), and append the time-ordered transcript as a list
+/// of `<p>` paragraphs to the section's content XHTML.
 ///
-/// The Media Overlay structure is left untouched — sync stays at the
-/// original par-anchor granularity (matching the DAISY navigation), and
-/// the new paragraphs are pure prose for readers who want to read along
-/// with the audio. Per-paragraph audio sync is a future refinement (M6.5).
+/// Each paragraph gets a stable `id="tx-<section>-<para>"` so a future
+/// per-paragraph Media Overlay sync milestone (M6.5) can reference it
+/// without re-rendering the XHTML.
+///
+/// When `raw_segments` is `true`, the per-segment Whisper output is emitted
+/// directly (one `<p>` per ~10–30 s segment); when it is `false`, the segments
+/// go through `text_cleanup::merge_into_paragraphs` to produce prose-shaped output.
 fn inject_transcripts(
     book: &Book,
     publication: &mut Publication,
     opts: &TranscribeOptions,
+    raw_segments: bool,
 ) -> Result<()> {
     let whisper_opts = dpub_whisper::TranscribeOptions {
         model_path: opts.model_path.clone(),
@@ -630,25 +644,29 @@ fn inject_transcripts(
             cache.insert(audio_basename.clone(), segments);
         }
 
-        // Append paragraphs of transcribed text to this section's body in
-        // time-order. `audio_ranges` is already in document order, so we
-        // walk it and pick up segments whose mid-point falls inside each
-        // [t0, t1] range.
-        let mut new_paragraphs = String::new();
+        // Collect, in document order, non-empty segments whose mid-point lies in [t0, t1].
+        let mut section_segments: Vec<dpub_whisper::Segment> = Vec::new();
         for (audio_basename, t0, t1) in &audio_ranges {
             let Some(segments) = cache.get(audio_basename) else {
                 continue;
             };
             for seg in segments {
                 let mid = (seg.start_seconds + seg.end_seconds) * 0.5;
                 if mid >= *t0 && mid <= *t1 && !seg.text.is_empty() {
-                    let _ = std::fmt::Write::write_fmt(
-                        &mut new_paragraphs,
-                        format_args!("  <p>{}</p>\n", escape_text(&seg.text)),
-                    );
+                    section_segments.push(seg.clone());
                 }
             }
         }
+
+        let new_paragraphs = if raw_segments {
+            render_raw_paragraphs(idx, &section_segments)
+        } else {
+            let cleaned = text_cleanup::merge_into_paragraphs(
+                &section_segments,
+                &text_cleanup::CleanupOpts::default(),
+            );
+            render_cleaned_paragraphs(idx, &cleaned)
+        };
         if !new_paragraphs.is_empty() {
             section_part.content.body_xhtml.push_str(&new_paragraphs);
         }
@@ -657,6 +675,37 @@ fn inject_transcripts(
     Ok(())
 }
 
+fn render_raw_paragraphs(section_idx: usize, segments: &[dpub_whisper::Segment]) -> String {
+    let mut out = String::new();
+    for (para_idx, seg) in segments.iter().enumerate() {
+        let _ = std::fmt::Write::write_fmt(
+            &mut out,
+            format_args!(
+                "  <p id=\"tx-{section_idx:03}-{para_idx:03}\">{}</p>\n",
+                escape_text(&seg.text)
+            ),
+        );
+    }
+    out
+}
+
+fn render_cleaned_paragraphs(
+    section_idx: usize,
+    paragraphs: &[text_cleanup::Paragraph],
+) -> String {
+    let mut out = String::new();
+    for (para_idx, para) in paragraphs.iter().enumerate() {
+        let _ = std::fmt::Write::write_fmt(
+            &mut out,
+            format_args!(
+                "  <p id=\"tx-{section_idx:03}-{para_idx:03}\">{}</p>\n",
+                escape_text(&para.text)
+            ),
+        );
+    }
+    out
+}
+
 /// Walk a SectionSmil's `<seq>` tree collecting (audio basename, t0, t1)
 /// triples for every `<par>` that has an associated audio span. The
 /// audio basename is just the last path segment of the SMIL `audio src`