Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ All notable changes to this project will be documented in this file. The format

### Added

- Whisper transcripts are now post-processed into prose-shaped paragraphs (~3–6 sentences each) before being injected into the EPUB content XHTML documents, instead of emitting one `<p>` per Whisper segment. The merge is a single-pass greedy state machine with sentence-terminator detection, decimal-number / Dutch-abbreviation false-positive guards, and a max-character safety valve for hallucinated unpunctuated runs. Each emitted paragraph (cleaned or raw) carries a stable `id="tx-<section>-<para>"` so a future per-paragraph Media Overlay sync milestone can reference it without re-rendering the XHTML. Pass `--no-text-cleanup` to keep the raw per-segment output for debugging.
- `dpub-cli` and `dpub-convert` now expose `metal` and `cuda` Cargo features that forward to `dpub-whisper`. Build with `cargo build --release -p dpub-cli --features metal` on Apple Silicon to GPU-accelerate `--transcribe` runs (5–10× faster against medium / large-v3 models). Off by default so CI and no-GPU builds stay working.
- `dpub info` and `dpub convert` now accept either an `ncc.html` file or the directory containing it. Spec-mandated `ncc.html` is tried first; legacy uppercase variants (`NCC.HTML`) resolve via a case-insensitive directory scan. Missing-NCC directories produce a clear error instead of `EISDIR`.
- In-tree synthetic DAISY 2.02 fixture at `crates/dpub-convert/tests/fixtures/minimal_daisy/` (~10 KB total: NCC, master.smil, one section SMIL, one tiny MP3). Three integration tests exercise the full parse → convert → ZIP pipeline against it on every `cargo test` run, including CI. The optional EPUBCheck assertion fires when `epubcheck` is on PATH.
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ SMIL:
| **M3** | End-to-end: `dpub convert <ncc.html> -o out.epub`. ✅ |
| **M4** | Built-in validation (EPUBCheck + ACE) — `dpub validate`. ✅ (EPUBCheck only; ACE deferred) |
| **M5** | Audio recompression (MP3 → Opus) — `dpub convert --audio opus --bitrate <kbps>`. ✅ |
| **M6** | Whisper transcription for audio-only books — `dpub convert --transcribe <lang> --whisper-model <path>`. ✅ |
| **M6** | Whisper transcription for audio-only books — `dpub convert --transcribe <lang> --whisper-model <path>`. ✅ (segments are merged into prose-shaped paragraphs by default; pass `--no-text-cleanup` for raw output) |
| **M7** | WASM build for browser-based conversion. |
| **M8** | 1.0 release: macOS / Linux / Windows binaries. |

Expand Down
10 changes: 10 additions & 0 deletions crates/dpub-cli/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,11 @@ enum Command {
/// `--transcribe`.
#[arg(long)]
whisper_model: Option<PathBuf>,
/// Emit one `<p>` per Whisper segment instead of merging into
/// prose-shaped paragraphs. Useful for debugging the raw model
/// output; not recommended for distribution.
#[arg(long)]
no_text_cleanup: bool,
},
/// Validate an existing EPUB 3 publication with EPUBCheck.
Validate {
Expand Down Expand Up @@ -89,6 +94,7 @@ fn main() -> Result<()> {
bitrate,
transcribe,
whisper_model,
no_text_cleanup,
} => cmd_convert(
&ncc,
&output,
Expand All @@ -97,11 +103,13 @@ fn main() -> Result<()> {
bitrate,
transcribe,
whisper_model,
no_text_cleanup,
),
Command::Validate { epub } => cmd_validate(&epub),
}
}

#[allow(clippy::too_many_arguments)]
fn cmd_convert(
ncc: &std::path::Path,
output: &std::path::Path,
Expand All @@ -110,6 +118,7 @@ fn cmd_convert(
bitrate_kbps: u32,
transcribe: Option<String>,
whisper_model: Option<PathBuf>,
no_text_cleanup: bool,
) -> Result<()> {
let ncc = resolve_ncc_path(ncc)?;
let book = Book::from_ncc(&ncc).with_context(|| format!("loading {}", ncc.display()))?;
Expand Down Expand Up @@ -159,6 +168,7 @@ fn cmd_convert(
let opts = dpub_convert::ConvertOptions {
audio: audio.into_format(bitrate_kbps),
transcribe: transcribe_opts,
raw_transcript_segments: no_text_cleanup,
};
let start = std::time::Instant::now();
dpub_convert::convert_to_file(&book, output, &opts)
Expand Down
81 changes: 65 additions & 16 deletions crates/dpub-convert/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ use epub3_writer::{
use rayon::prelude::*;

mod error;
mod text_cleanup;
pub use error::{Error, Result};

/// Convert a parsed DAISY 2.02 [`Book`] into an EPUB 3 [`Publication`].
Expand Down Expand Up @@ -543,6 +544,10 @@ pub struct TranscribeOptions {
pub struct ConvertOptions {
/// Audio format for the converted publication. The CLI fills this in
/// from its audio selection and bitrate via `into_format`.
/// NOTE(review): presumably decides whether audio is recompressed
/// before the ZIP is written — confirm against `AudioFormat`.
pub audio: AudioFormat,
/// Transcription settings. When `Some`, `convert_to_file` calls
/// `inject_transcripts` with these options; when `None`, that step is
/// skipped.
pub transcribe: Option<TranscribeOptions>,
/// When `true`, transcribed segments are emitted one `<p>` per
/// Whisper segment. Default `false` — segments are merged into
/// prose-shaped paragraphs of ~3–6 sentences each.
pub raw_transcript_segments: bool,
}

/// Convert and write a DAISY 2.02 publication to an EPUB 3 file in one call.
Expand All @@ -560,7 +565,12 @@ pub fn convert_to_file(book: &Book, output: &Path, opts: &ConvertOptions) -> Res
// original (typically MP3) bytes, not a lossy Opus pass that throws away
// information whisper.cpp's frontend re-discards anyway.
if let Some(transcribe) = &opts.transcribe {
inject_transcripts(book, &mut publication, transcribe)?;
inject_transcripts(
book,
&mut publication,
transcribe,
opts.raw_transcript_segments,
)?;
}

// Recompression has to happen *before* the ZIP write because the writer
Expand Down Expand Up @@ -590,17 +600,21 @@ pub fn convert_to_file(book: &Book, output: &Path, opts: &ConvertOptions) -> Res

/// For every section, find the audio files referenced by the section's
/// Media Overlay, transcribe each of them (caching across sections that
/// share an audio file), and append the time-ordered transcript as a flat
/// list of `<p>` paragraphs to the section's content XHTML.
/// share an audio file), and append the time-ordered transcript as a list
/// of `<p>` paragraphs to the section's content XHTML.
///
/// The Media Overlay structure is left untouched — sync stays at the
/// original par-anchor granularity (matching the DAISY navigation), and
/// the new paragraphs are pure prose for readers who want to read along
/// with the audio. Per-paragraph audio sync is a future refinement (M6.5).
/// Each paragraph gets a stable `id="tx-<section>-<para>"` so a future
/// per-paragraph Media Overlay sync milestone (M6.5) can reference them
/// without re-rendering the XHTML.
///
/// When `raw_segments` is `true`, the per-segment Whisper output is emitted
/// directly (one `<p>` per ~10–30 s segment); the default `false` runs
/// `text_cleanup::merge_into_paragraphs` to produce prose-shaped output.
fn inject_transcripts(
book: &Book,
publication: &mut Publication,
opts: &TranscribeOptions,
raw_segments: bool,
) -> Result<()> {
let whisper_opts = dpub_whisper::TranscribeOptions {
model_path: opts.model_path.clone(),
Expand Down Expand Up @@ -630,25 +644,29 @@ fn inject_transcripts(
cache.insert(audio_basename.clone(), segments);
}

// Append paragraphs of transcribed text to this section's body in
// time-order. `audio_ranges` is already in document order, so we
// walk it and pick up segments whose mid-point falls inside each
// [t0, t1] range.
let mut new_paragraphs = String::new();
// Collect the in-range segments in document order.
let mut section_segments: Vec<dpub_whisper::Segment> = Vec::new();
for (audio_basename, t0, t1) in &audio_ranges {
let Some(segments) = cache.get(audio_basename) else {
continue;
};
for seg in segments {
let mid = (seg.start_seconds + seg.end_seconds) * 0.5;
if mid >= *t0 && mid <= *t1 && !seg.text.is_empty() {
let _ = std::fmt::Write::write_fmt(
&mut new_paragraphs,
format_args!(" <p>{}</p>\n", escape_text(&seg.text)),
);
section_segments.push(seg.clone());
}
}
}

let new_paragraphs = if raw_segments {
render_raw_paragraphs(idx, &section_segments)
} else {
let cleaned = text_cleanup::merge_into_paragraphs(
&section_segments,
&text_cleanup::CleanupOpts::default(),
);
render_cleaned_paragraphs(idx, &cleaned)
};
if !new_paragraphs.is_empty() {
section_part.content.body_xhtml.push_str(&new_paragraphs);
}
Expand All @@ -657,6 +675,37 @@ fn inject_transcripts(
Ok(())
}

/// Renders raw Whisper segments as XHTML, one `<p>` element per segment.
///
/// Each element is given the id `tx-<section>-<para>`: `<section>` is
/// `section_idx` and `<para>` is the segment's position within `segments`,
/// both zero-padded to at least three digits. Segment text is passed through
/// `escape_text` before being embedded. An empty slice yields an empty string.
fn render_raw_paragraphs(section_idx: usize, segments: &[dpub_whisper::Segment]) -> String {
    use std::fmt::Write as _;

    segments
        .iter()
        .enumerate()
        .fold(String::new(), |mut xhtml, (para_idx, segment)| {
            // The formatting result is deliberately discarded; the target is
            // an in-memory `String`.
            let _ = xhtml.write_fmt(format_args!(
                " <p id=\"tx-{section_idx:03}-{para_idx:03}\">{}</p>\n",
                escape_text(&segment.text)
            ));
            xhtml
        })
}

/// Renders merged (cleaned-up) transcript paragraphs as XHTML, one `<p>`
/// element per paragraph.
///
/// Each element is given the id `tx-<section>-<para>`: `<section>` is
/// `section_idx` and `<para>` is the paragraph's position within
/// `paragraphs`, both zero-padded to at least three digits. Paragraph text is
/// passed through `escape_text` before being embedded. An empty slice yields
/// an empty string.
fn render_cleaned_paragraphs(
    section_idx: usize,
    paragraphs: &[text_cleanup::Paragraph],
) -> String {
    use std::fmt::Write as _;

    paragraphs
        .iter()
        .enumerate()
        .fold(String::new(), |mut xhtml, (para_idx, paragraph)| {
            // The formatting result is deliberately discarded; the target is
            // an in-memory `String`.
            let _ = xhtml.write_fmt(format_args!(
                " <p id=\"tx-{section_idx:03}-{para_idx:03}\">{}</p>\n",
                escape_text(&paragraph.text)
            ));
            xhtml
        })
}

/// Walk a SectionSmil's `<seq>` tree collecting (audio basename, t0, t1)
/// triples for every `<par>` that has an associated audio span. The
/// audio basename is just the last path segment of the SMIL `audio src`
Expand Down
Loading
Loading