diff --git a/CHANGELOG.md b/CHANGELOG.md index d864cd8..f5c069e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,25 @@ All notable changes to this project will be documented in this file. The format ## [Unreleased] +### Fixed + +- **`--auto-cover` for Dutch (and other) books no longer silently misses.** Open Library tags docs with ISO 639-2/B (e.g. `"dut"` for Dutch), while DAISY 2.02 metadata uses ISO 639-1 (`"nl"`); the previous literal `eq_ignore_ascii_case` dropped every plausible match. `dpub-meta` now treats 639-1, 639-2/B and 639-2/T as equivalent (`nl`/`dut`/`nld`, `fr`/`fre`/`fra`, `de`/`ger`/`deu`, etc.). Real-world miss that surfaced this: "Het smelt" by Lize Spit. Regression test added. +- **ISBN search hits are now trusted unconditionally.** When DAISY's `dc:identifier` is ISBN-shaped, the search-by-ISBN already disambiguates the edition, so the language and author filters on the result are noise — and would (incorrectly) reject the cover when Open Library lists a translator under `author_name`. Title+author search remains filtered. +- **Open Library HTTP timeout raised from 8 s to 30 s.** `covers.openlibrary.org` redirects through archive.org and can take ~20 s on first hit for less-popular editions; 8 s caused spurious "lookup failed" misses. +- **Whisper model download no longer times out on slow connections.** The HTTP agent used a 60-second total-request timeout, which was insufficient for the 1.5 GB `ggml-medium.bin` download. Now uses per-read timeouts (60 s idle) so downloads can take as long as needed, provided data keeps flowing. Additionally, downloads are now attempted up to 3 times in total when they hit transient failures (CDN stalls, connection resets). + +### Changed + +- **Cover lookup now runs in the background** during conversion. The Open Library HTTP requests overlap with transcription and audio recompression instead of blocking the pipeline, so conversions with `--auto-cover` (now the default) no longer stall waiting for the network. 
+ +### Added + +- **`--transcribe` auto-detects language from book metadata.** Passing `--transcribe` without a language code now reads `dc:language` from the DAISY NCC metadata and normalises it to ISO 639-1 for Whisper. Explicit `--transcribe nl` still works. Config file supports `"transcribe": true` for auto-detect or `"transcribe": "nl"` for a fixed default. +- **Shared ISO 639 normaliser** (`dpub-util/lang`). Maps ISO 639-1, 639-2/B and 639-2/T codes to their canonical two-letter form. Used by both `dpub-meta` (cover lookup language filter) and `dpub-cli` (transcription auto-detect). +- **Persistent config file** (`~/.config/dpub/config.json` on Unix, `%APPDATA%\dpub\config.json` on Windows). Lets users set defaults for `audio`, `bitrate`, `auto_cover`, `no_word_sync`, `rights`, `whisper_model`, `transcribe`, `validate`, `a11y`, `jobs`, and `log_level`. CLI flags always override config values. +- **`dpub config` subcommand** — shows the config file path and contents. `--init` creates a starter file with all defaults; `--path` prints just the file path. +- **`--auto-cover` is now on by default** for both `convert` and `batch`. Pass `--no-auto-cover` to `convert` to opt out; `batch` has no opt-out flag yet. + ## [0.6.0] - 2026-05-07 Word-level Media Overlay sync (karaoke-style highlight-along-with-audio in compatible reading systems) and a major first-run UX overhaul (`dpub doctor`, `dpub setup --whisper-model <size>`, auto-discovery, `scripts/build.sh`, optional `--install` for missing tools). 
diff --git a/Cargo.lock b/Cargo.lock index ac5b10c..b85b78e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -374,6 +374,7 @@ dependencies = [ "dpub-convert", "dpub-core", "dpub-meta", + "dpub-util", "dpub-validate", "rayon", "serde", @@ -418,6 +419,7 @@ dependencies = [ name = "dpub-meta" version = "0.6.0" dependencies = [ + "dpub-util", "serde", "serde_json", "thiserror 1.0.69", diff --git a/crates/dpub-cli/Cargo.toml b/crates/dpub-cli/Cargo.toml index 9e94fe4..790b691 100644 --- a/crates/dpub-cli/Cargo.toml +++ b/crates/dpub-cli/Cargo.toml @@ -26,6 +26,7 @@ dpub-audio = { path = "../dpub-audio", version = "0.6.0" } dpub-core = { path = "../dpub-core", version = "0.6.0" } dpub-convert = { path = "../dpub-convert", version = "0.6.0" } dpub-meta = { path = "../dpub-meta", version = "0.6.0" } +dpub-util = { path = "../dpub-util", version = "0.6.0" } dpub-validate = { path = "../dpub-validate", version = "0.6.0" } sha2 = { workspace = true } clap = { workspace = true } diff --git a/crates/dpub-cli/src/config.rs b/crates/dpub-cli/src/config.rs new file mode 100644 index 0000000..6778277 --- /dev/null +++ b/crates/dpub-cli/src/config.rs @@ -0,0 +1,182 @@ +use std::path::{Path, PathBuf}; + +use serde::{Deserialize, Serialize}; + +/// Transcription setting in the config file. Accepts: +/// - `"nl"` / `"en"` / ... → always transcribe with this language +/// - `true` → transcribe, auto-detect language from book metadata +/// - `false` or `null` → do not transcribe +#[derive(Debug, Clone, Deserialize, Serialize)] +#[serde(untagged)] +pub enum TranscribeSetting { + /// Auto-detect language from book metadata when `true`. + Auto(bool), + /// Explicit language code. + Language(String), +} + +/// Persistent user defaults for dpub. Every field is optional — a missing +/// key in the JSON simply falls through to the hard-coded default. +/// +/// The canonical location is `~/.config/dpub/config.json` on Unix and +/// `%APPDATA%\dpub\config.json` on Windows. CLI flags always override. 
+#[derive(Debug, Default, Deserialize, Serialize)] +#[serde(default)] +pub struct DpubConfig { + /// Audio handling: `"original"` or `"opus"`. + pub audio: Option, + /// Opus bitrate in kbit/s (32–96 for speech). + pub bitrate: Option, + /// Enable automatic cover lookup via Open Library. + pub auto_cover: Option, + /// Skip per-word Media Overlay sync (fall back to per-paragraph). + pub no_word_sync: Option, + /// Default rights statement for ``. + pub rights: Option, + /// Path to a `ggml-*.bin` Whisper model file. + pub whisper_model: Option, + /// Transcription: `true` (auto-detect language), `"nl"` (explicit), or `null` (off). + pub transcribe: Option, + /// Run EPUBCheck after conversion. + pub validate: Option, + /// Run DAISY ACE after conversion. + pub a11y: Option, + /// Parallel batch job count (`0` = let rayon decide). + pub jobs: Option, + /// Default log level (`"error"`, `"warn"`, `"info"`, `"debug"`, `"trace"`). + pub log_level: Option, +} + +/// Return the platform-appropriate config directory for dpub. +/// +/// - Unix: `$HOME/.config/dpub/` +/// - Windows: `%APPDATA%\dpub\` +pub fn config_dir() -> PathBuf { + if cfg!(target_os = "windows") { + let base = std::env::var_os("APPDATA") + .map_or_else(|| PathBuf::from("."), PathBuf::from); + base.join("dpub") + } else { + let home = std::env::var_os("HOME") + .map_or_else(|| PathBuf::from("."), PathBuf::from); + home.join(".config").join("dpub") + } +} + +/// Full path to the config file. +pub fn config_path() -> PathBuf { + config_dir().join("config.json") +} + +/// Load the config file. Returns `Default` (all `None`) when the file is +/// absent or unparseable — dpub should never fail to start because of a +/// broken config. 
+pub fn load() -> DpubConfig { + load_from(&config_path()) +} + +fn load_from(path: &Path) -> DpubConfig { + let bytes = match std::fs::read(path) { + Ok(b) => b, + Err(e) if e.kind() == std::io::ErrorKind::NotFound => return DpubConfig::default(), + Err(e) => { + tracing::warn!("could not read {}: {e}", path.display()); + return DpubConfig::default(); + } + }; + match serde_json::from_slice(&bytes) { + Ok(cfg) => cfg, + Err(e) => { + tracing::warn!("ignoring {}: {e}", path.display()); + DpubConfig::default() + } + } +} + +/// Example JSON for `dpub config` output and `--init`. +pub fn example_json() -> &'static str { + r#"{ + "audio": "original", + "bitrate": 64, + "auto_cover": true, + "no_word_sync": false, + "rights": null, + "whisper_model": null, + "transcribe": null, + "validate": false, + "a11y": false, + "jobs": 0, + "log_level": "info" +}"# +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn missing_file_returns_default() { + let cfg = load_from(std::path::Path::new("/tmp/dpub-test-nonexistent/config.json")); + assert!(cfg.audio.is_none()); + assert!(cfg.bitrate.is_none()); + assert!(cfg.auto_cover.is_none()); + } + + #[test] + fn partial_json_works() { + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().join("config.json"); + std::fs::write(&path, r#"{"bitrate": 48}"#).unwrap(); + let cfg = load_from(&path); + assert_eq!(cfg.bitrate, Some(48)); + assert!(cfg.audio.is_none()); + assert!(cfg.auto_cover.is_none()); + } + + #[test] + fn invalid_json_returns_default() { + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().join("config.json"); + std::fs::write(&path, "not json at all").unwrap(); + let cfg = load_from(&path); + assert!(cfg.audio.is_none()); + } + + #[test] + fn full_json_round_trips() { + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().join("config.json"); + std::fs::write(&path, example_json()).unwrap(); + let cfg = load_from(&path); + assert_eq!(cfg.audio.as_deref(), Some("original")); + 
assert_eq!(cfg.bitrate, Some(64)); + assert_eq!(cfg.auto_cover, Some(true)); + assert_eq!(cfg.validate, Some(false)); + assert_eq!(cfg.log_level.as_deref(), Some("info")); + assert!(cfg.transcribe.is_none()); + } + + #[test] + fn transcribe_accepts_bool() { + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().join("config.json"); + std::fs::write(&path, r#"{"transcribe": true}"#).unwrap(); + let cfg = load_from(&path); + assert!(matches!(cfg.transcribe, Some(TranscribeSetting::Auto(true)))); + } + + #[test] + fn transcribe_accepts_string() { + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().join("config.json"); + std::fs::write(&path, r#"{"transcribe": "nl"}"#).unwrap(); + let cfg = load_from(&path); + assert!(matches!(cfg.transcribe, Some(TranscribeSetting::Language(ref s)) if s == "nl")); + } + + #[test] + fn config_dir_ends_with_dpub() { + let dir = config_dir(); + assert_eq!(dir.file_name().unwrap(), "dpub"); + } +} diff --git a/crates/dpub-cli/src/main.rs b/crates/dpub-cli/src/main.rs index 55aba4e..65d4d03 100644 --- a/crates/dpub-cli/src/main.rs +++ b/crates/dpub-cli/src/main.rs @@ -4,6 +4,7 @@ use anyhow::{Context, Result}; use clap::{Parser, Subcommand, ValueEnum}; use dpub_core::{Book, NavItem}; +mod config; mod doctor; mod install; mod setup; @@ -37,15 +38,16 @@ enum Command { #[arg(long)] a11y: bool, /// Audio handling: keep originals or recompress to Opus. - #[arg(long, value_enum, default_value_t = AudioOpt::Original)] - audio: AudioOpt, + #[arg(long, value_enum)] + audio: Option, /// Bitrate (kbit/s) when --audio=opus. Sensible range: 32–96 for speech. - #[arg(long, default_value_t = dpub_audio::DEFAULT_OPUS_BITRATE_KBPS)] - bitrate: u32, - /// Transcribe audio with local Whisper (e.g. `nl`, `en`). Requires - /// `--whisper-model`. The text gets injected into each section's - /// content document as a flat list of paragraphs. #[arg(long)] + bitrate: Option, + /// Transcribe audio with local Whisper. 
Pass a language code + /// (e.g. `--transcribe nl`) or omit the code to auto-detect + /// from the book's `dc:language` metadata. The text gets + /// injected into each section's content document. + #[arg(long, num_args = 0..=1, default_missing_value = "")] transcribe: Option, /// Path to a `ggml-*.bin` Whisper model file. Required with /// `--transcribe`. @@ -65,14 +67,14 @@ enum Command { #[arg(long)] no_word_sync: bool, /// Path to a JPEG or PNG image to embed as the EPUB cover. - #[arg(long, value_name = "PATH", conflicts_with = "auto_cover")] + #[arg(long, value_name = "PATH", conflicts_with = "no_auto_cover")] cover: Option, - /// Best-effort cover lookup via Open Library using the book's - /// title, author, and identifier. Opt-in: it sends those bits - /// of metadata to a third party (Open Library). A miss is - /// silent — the book ships without a cover. + /// Disable the automatic cover lookup via Open Library. + /// By default dpub tries to fetch a cover using the book's + /// title, author, and identifier. Pass this flag to skip + /// the lookup (no network request is made). #[arg(long)] - auto_cover: bool, + no_auto_cover: bool, /// Free-text rights statement to stamp into the EPUB's /// `` field. Overrides any rights string in the /// source DAISY metadata. @@ -122,6 +124,20 @@ enum Command { #[arg(long, value_name = "SIZE")] whisper_model: Option, }, + /// Show or initialise the dpub configuration file. + /// + /// Without flags, prints the config file path and its contents (or an + /// example if no config file exists yet). Persistent defaults set here + /// are overridden by CLI flags. + Config { + /// Print only the config file path. + #[arg(long)] + path: bool, + /// Create a starter config file with documented defaults. + /// Errors if the file already exists. + #[arg(long)] + init: bool, + }, /// Convert every DAISY 2.02 book under `` to EPUB 3, in /// parallel. A "book" is any directory containing an `ncc.html`. 
/// Writes a JSON summary to stdout when finished. Per-book errors @@ -135,14 +151,14 @@ enum Command { output: PathBuf, /// Number of conversions to run in parallel. `0` (default) lets /// rayon pick (typically the CPU count). - #[arg(short, long, default_value_t = 0)] - jobs: usize, + #[arg(short, long)] + jobs: Option, /// Audio handling: keep originals or recompress to Opus. - #[arg(long, value_enum, default_value_t = AudioOpt::Original)] - audio: AudioOpt, + #[arg(long, value_enum)] + audio: Option, /// Bitrate (kbit/s) when --audio=opus. Sensible range: 32–96 for speech. - #[arg(long, default_value_t = dpub_audio::DEFAULT_OPUS_BITRATE_KBPS)] - bitrate: u32, + #[arg(long)] + bitrate: Option, }, } @@ -164,10 +180,18 @@ impl AudioOpt { } fn main() -> Result<()> { + // Load config early so we can use log_level before tracing init. + let cfg = config::load(); + + let default_level = cfg + .log_level + .as_deref() + .unwrap_or("info") + .to_owned(); tracing_subscriber::fmt() .with_env_filter( tracing_subscriber::EnvFilter::try_from_default_env() - .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info")), + .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(&default_level)), ) .with_target(false) .compact() @@ -188,34 +212,63 @@ fn main() -> Result<()> { no_text_cleanup, no_word_sync, cover, - auto_cover, - rights, - } => cmd_convert( - &ncc, - &output, - validate, - a11y, - audio, - bitrate, - transcribe, - whisper_model, - no_text_cleanup, - no_word_sync, - cover, - auto_cover, + no_auto_cover, rights, - ), + } => { + let audio = audio.unwrap_or_else(|| parse_audio_opt(&cfg)); + let bitrate = bitrate.unwrap_or_else(|| { + cfg.bitrate.unwrap_or(dpub_audio::DEFAULT_OPUS_BITRATE_KBPS) + }); + let validate = validate || cfg.validate.unwrap_or(false); + let a11y = a11y || cfg.a11y.unwrap_or(false); + let no_word_sync = no_word_sync || cfg.no_word_sync.unwrap_or(false); + let auto_cover = if no_auto_cover { + false + } else { + cfg.auto_cover.unwrap_or(true) 
+ }; + let rights = rights.or_else(|| cfg.rights.clone()); + let whisper_model = whisper_model.or_else(|| cfg.whisper_model.clone()); + // Merge transcribe: CLI flag > config > none. + // "" (empty) = auto-detect language from book metadata. + let transcribe = transcribe.or_else(|| match &cfg.transcribe { + Some(config::TranscribeSetting::Auto(true)) => Some(String::new()), + Some(config::TranscribeSetting::Language(lang)) => Some(lang.clone()), + _ => None, + }); + cmd_convert( + &ncc, &output, validate, a11y, audio, bitrate, transcribe, + whisper_model, no_text_cleanup, no_word_sync, cover, + auto_cover, rights, + ) + } Command::Validate { epub, json } => cmd_validate(&epub, json), Command::A11y { epub, json } => cmd_a11y(&epub, json), Command::Doctor { json, install, yes } => cmd_doctor(json, install, yes), Command::Setup { whisper_model } => cmd_setup(whisper_model.as_deref()), + Command::Config { path, init } => cmd_config(path, init), Command::Batch { input, output, jobs, audio, bitrate, - } => cmd_batch(&input, &output, jobs, audio, bitrate), + } => { + let audio = audio.unwrap_or_else(|| parse_audio_opt(&cfg)); + let bitrate = bitrate.unwrap_or_else(|| { + cfg.bitrate.unwrap_or(dpub_audio::DEFAULT_OPUS_BITRATE_KBPS) + }); + let jobs = jobs.unwrap_or_else(|| cfg.jobs.unwrap_or(0)); + cmd_batch(&input, &output, jobs, audio, bitrate) + } + } +} + +/// Parse the `audio` field from config, falling back to `Original`. 
+fn parse_audio_opt(cfg: &config::DpubConfig) -> AudioOpt { + match cfg.audio.as_deref() { + Some("opus") => AudioOpt::Opus, + _ => AudioOpt::Original, } } @@ -255,36 +308,37 @@ fn cmd_convert( } let transcribe_opts = match (transcribe, whisper_model) { - (Some(language), Some(model_path)) => { - if !model_path.is_file() { - anyhow::bail!( - "Whisper model not found at {} (run `dpub setup --whisper-model medium` to download one)", - model_path.display() - ); - } - println!( - " Transcribe: lang={language} model={}", - model_path.display() - ); - Some(dpub_convert::TranscribeOptions { - model_path, - language, - }) - } - (Some(language), None) => { - // Auto-discover: pick the most-recently-modified ggml-*.bin - // in dpub's per-user cache. If none, prompt on TTY (B.1) - // or fail with a hint. - let Some(model_path) = resolve_or_prompt_for_model()? else { - anyhow::bail!( - "no Whisper model found in {}. \ - Run `dpub setup --whisper-model medium` to download one, \ - or pass `--whisper-model ` directly.", - setup::cache_dir().display(), - ); + (Some(language), model_path) => { + // Resolve language: empty string = auto-detect from book metadata. + let language = if language.is_empty() { + resolve_transcribe_language(&book)? + } else { + language + }; + let model_path = if let Some(p) = model_path { + if !p.is_file() { + anyhow::bail!( + "Whisper model not found at {} (run `dpub setup --whisper-model medium` to download one)", + p.display() + ); + } + p + } else { + // Auto-discover: pick the most-recently-modified ggml-*.bin + // in dpub's per-user cache. If none, prompt on TTY (B.1) + // or fail with a hint. + let Some(p) = resolve_or_prompt_for_model()? else { + anyhow::bail!( + "no Whisper model found in {}. 
\ + Run `dpub setup --whisper-model medium` to download one, \ + or pass `--whisper-model ` directly.", + setup::cache_dir().display(), + ); + }; + p }; println!( - " Transcribe: lang={language} model={} (auto-discovered)", + " Transcribe: lang={language} model={}", model_path.display() ); Some(dpub_convert::TranscribeOptions { @@ -383,6 +437,22 @@ fn cmd_a11y(epub: &std::path::Path, json: bool) -> Result<()> { Ok(()) } +/// Resolve the transcription language from the book's `dc:language` +/// metadata. Normalises ISO 639-2 codes (e.g. `"dut"`) to ISO 639-1 +/// (e.g. `"nl"`) which is what Whisper expects. +fn resolve_transcribe_language(book: &Book) -> Result { + let raw = book + .metadata() + .language + .as_deref() + .context("cannot auto-detect transcription language: the book has no dc:language metadata. Pass an explicit language code, e.g. --transcribe nl")?; + dpub_util::lang::iso639_to_part1(raw) + .map(String::from) + .with_context(|| format!( + "cannot auto-detect transcription language: dc:language \"{raw}\" is not a recognised ISO 639 code. Pass an explicit language code, e.g. --transcribe nl" + )) +} + /// Look for a Whisper model the user already downloaded via /// `dpub setup`. Returns `Some(path)` if a cached model exists, /// `None` otherwise. On a TTY with no cached model, prompts the @@ -471,6 +541,50 @@ fn cmd_setup(whisper_model: Option<&str>) -> Result<()> { Ok(()) } +fn cmd_config(path_only: bool, init: bool) -> Result<()> { + let path = config::config_path(); + if path_only { + println!("{}", path.display()); + return Ok(()); + } + if init { + if path.exists() { + anyhow::bail!( + "config file already exists at {}. 
Edit it directly or delete it first.", + path.display(), + ); + } + if let Some(parent) = path.parent() { + std::fs::create_dir_all(parent) + .with_context(|| format!("creating {}", parent.display()))?; + } + std::fs::write(&path, config::example_json()) + .with_context(|| format!("writing {}", path.display()))?; + println!("Created {}", path.display()); + return Ok(()); + } + // Default: show path + contents or example. + println!("Config file: {}", path.display()); + println!(); + if path.is_file() { + let contents = std::fs::read_to_string(&path) + .with_context(|| format!("reading {}", path.display()))?; + print!("{contents}"); + if !contents.ends_with('\n') { + println!(); + } + } else { + println!("No config file found. Create one with:"); + println!(); + println!(" dpub config --init"); + println!(); + println!("Or create {} manually:", path.display()); + println!(); + println!("{}", config::example_json()); + } + Ok(()) +} + /// Either print the human-readable summary, or serialise the report to /// stdout as pretty JSON. The JSON shape is the `Report` struct from /// `dpub-validate`; field names are stable as part of the 1.0 contract. @@ -691,7 +805,7 @@ fn cmd_batch( transcribe: None, raw_transcript_segments: false, cover: None, - auto_cover: false, + auto_cover: true, rights: None, no_word_sync: false, }; diff --git a/crates/dpub-cli/src/setup.rs b/crates/dpub-cli/src/setup.rs index 3d2549f..b11d3cd 100644 --- a/crates/dpub-cli/src/setup.rs +++ b/crates/dpub-cli/src/setup.rs @@ -163,9 +163,63 @@ pub fn install_model(spec: &ModelSpec) -> Result { spec.filename(), format_bytes(spec.bytes), ); + const MAX_ATTEMPTS: u32 = 3; + let mut last_err: Option = None; + for attempt in 1..=MAX_ATTEMPTS { + if attempt > 1 { + eprintln!(" Retrying (attempt {attempt}/{MAX_ATTEMPTS}) ..."); + std::thread::sleep(Duration::from_secs(2)); + } + match download_once(spec, &partial_path) { + Ok(hasher) => { + last_err = None; + // Verify SHA256 before promoting to final path. 
+ let actual = hex(hasher.finalize().as_slice()); + if actual != spec.sha256 { + fs::remove_file(&partial_path).ok(); + anyhow::bail!( + "SHA256 mismatch for {}: expected {}, got {}", + spec.filename(), + spec.sha256, + actual, + ); + } + break; + } + Err(e) => { + eprintln!(); // newline after stalled progress bar + eprintln!(" Download interrupted: {e:#}"); + fs::remove_file(&partial_path).ok(); + last_err = Some(e); + } + } + } + if let Some(e) = last_err { + return Err(e).with_context(|| { + format!( + "downloading {} failed after {MAX_ATTEMPTS} attempts", + spec.url() + ) + }); + } + + fs::rename(&partial_path, &final_path) + .with_context(|| format!("renaming {} → {}", partial_path.display(), final_path.display()))?; + eprintln!("Verified SHA256."); + eprintln!("Cached: {}", final_path.display()); + Ok(final_path) +} + +/// Single download attempt. Returns the SHA256 hasher on success so the +/// caller can verify the hash. On I/O or network error the partial file +/// is left on disk (the caller decides whether to retry or clean up). 
+fn download_once( + spec: &ModelSpec, + partial_path: &Path, +) -> Result { let agent = dpub_meta::agent(); let mut hasher = Sha256::new(); - let mut file = fs::File::create(&partial_path) + let mut file = fs::File::create(partial_path) .with_context(|| format!("creating {}", partial_path.display()))?; let mut last_tick = Instant::now(); let started = Instant::now(); @@ -189,22 +243,7 @@ pub fn install_model(spec: &ModelSpec) -> Result { eprintln!(); // newline after the in-place progress bar file.flush().ok(); drop(file); - - let actual = hex(hasher.finalize().as_slice()); - if actual != spec.sha256 { - fs::remove_file(&partial_path).ok(); - anyhow::bail!( - "SHA256 mismatch for {}: expected {}, got {}", - spec.filename(), - spec.sha256, - actual, - ); - } - fs::rename(&partial_path, &final_path) - .with_context(|| format!("renaming {} → {}", partial_path.display(), final_path.display()))?; - eprintln!("Verified SHA256."); - eprintln!("Cached: {}", final_path.display()); - Ok(final_path) + Ok(hasher) } /// Verify an existing file's SHA256 against `expected_hex` without diff --git a/crates/dpub-convert/src/lib.rs b/crates/dpub-convert/src/lib.rs index 138f130..a2751c4 100644 --- a/crates/dpub-convert/src/lib.rs +++ b/crates/dpub-convert/src/lib.rs @@ -584,10 +584,21 @@ pub struct ConvertOptions { pub fn convert_to_file(book: &Book, output: &Path, opts: &ConvertOptions) -> Result<()> { let mut publication = convert(book)?; + // Kick off the cover lookup on a background thread so it overlaps + // with transcription and audio recompression. The lookup only needs + // metadata (available now) and the result is just bytes we slot in + // before the ZIP write. 
+ let cover_handle = if opts.cover.is_some() { + None // explicit cover — no network lookup needed + } else if opts.auto_cover { + let metadata = book.metadata().clone(); + Some(std::thread::spawn(move || auto_lookup_cover_from_meta(&metadata))) + } else { + None + }; + if let Some(cover_path) = &opts.cover { publication.cover = Some(load_cover_image(cover_path)?); - } else if opts.auto_cover { - publication.cover = auto_lookup_cover(book); } if let Some(rights) = &opts.rights { @@ -616,6 +627,13 @@ pub fn convert_to_file(book: &Book, output: &Path, opts: &ConvertOptions) -> Res } }; + // Join the cover thread before writing the ZIP. + if let Some(handle) = cover_handle { + if let Ok(cover) = handle.join() { + publication.cover = cover; + } + } + if let Some(parent) = output.parent() && !parent.as_os_str().is_empty() { @@ -653,13 +671,15 @@ fn load_cover_image(path: &Path) -> Result { }) } -/// Best-effort cover lookup against Open Library. Logs a single line -/// describing the outcome — successful match, miss, or transport +/// Best-effort cover lookup against Open Library. Takes owned +/// [`Metadata`] so it can run on a background thread. Logs a single +/// line describing the outcome — successful match, miss, or transport /// error — and returns `None` for everything except a successful /// match. Network errors are deliberately swallowed: `--auto-cover` /// is best-effort and a missing cover is normal for older DAISY books. 
-fn auto_lookup_cover(book: &Book) -> Option { - let m = book.metadata(); +fn auto_lookup_cover_from_meta( + m: &dpub_core::Metadata, +) -> Option { let Some(title) = m.title.as_deref() else { eprintln!("--auto-cover: no title in metadata, skipping"); return None; diff --git a/crates/dpub-meta/Cargo.toml b/crates/dpub-meta/Cargo.toml index b9adde5..ed7915f 100644 --- a/crates/dpub-meta/Cargo.toml +++ b/crates/dpub-meta/Cargo.toml @@ -11,6 +11,7 @@ description = "Best-effort book-cover lookup against external metadata services workspace = true [dependencies] +dpub-util = { path = "../dpub-util" } serde = { workspace = true } serde_json = { workspace = true } thiserror = { workspace = true } diff --git a/crates/dpub-meta/src/lib.rs b/crates/dpub-meta/src/lib.rs index f74a607..1c1cfb5 100644 --- a/crates/dpub-meta/src/lib.rs +++ b/crates/dpub-meta/src/lib.rs @@ -32,16 +32,24 @@ pub use error::{Error, Result}; const SEARCH_URL: &str = "https://openlibrary.org/search.json"; const COVERS_URL: &str = "https://covers.openlibrary.org/b"; const USER_AGENT_BASE: &str = "dpub"; -const TIMEOUT: Duration = Duration::from_secs(8); +// covers.openlibrary.org occasionally takes ~20 s on the +// CDN-redirect chain, especially for less-popular editions, so 8 s +// was too tight in practice. 30 s leaves headroom without making a +// totally-down API hang the convert pipeline. +const TIMEOUT: Duration = Duration::from_secs(30); const MAX_COVER_BYTES: usize = 4 * 1024 * 1024; -/// Build a fresh `ureq::Agent` with dpub's standard configuration: -/// 8-second timeout, identifying User-Agent. Callers that need a -/// generic HTTP-download path (Whisper model fetch, etc.) can use -/// this directly via [`download_to_writer`]. +/// Build a fresh `ureq::Agent` for large downloads (Whisper models). +/// +/// Uses per-read/write timeouts instead of a total-request timeout so +/// that multi-gigabyte downloads don't time out as long as data keeps +/// flowing. 
The 60-second per-read timeout gives ample room for CDN +/// hiccups while still failing promptly on a truly stalled connection. pub fn agent() -> ureq::Agent { ureq::AgentBuilder::new() - .timeout(Duration::from_secs(60)) + .timeout_connect(Duration::from_secs(30)) + .timeout_read(Duration::from_secs(60)) + .timeout_write(Duration::from_secs(30)) .user_agent(&format!( "{USER_AGENT_BASE}/{} (+https://github.com/11ways/dpub)", env!("CARGO_PKG_VERSION") @@ -146,7 +154,11 @@ fn try_isbn(agent: &ureq::Agent, hints: &LookupHints<'_>) -> Result) -> Result> { @@ -158,25 +170,43 @@ fn try_title_creator(agent: &ureq::Agent, hints: &LookupHints<'_>) -> Result Option { + for doc in &resp.docs { + if let Some(cover_id) = doc.cover_i { + return Some(Candidate { + cover_id, + edition_key: doc.cover_edition_key.clone(), + title: doc.title.clone().unwrap_or_default(), + author: doc.author_name.first().cloned(), + by, + }); + } + } + None } /// Pick the first hit whose language matches the hints (when given) and -/// whose author overlaps the requested creator's last name. Returns -/// `None` if no hit clears the bar. -fn pick(resp: &SearchResponse, hints: &LookupHints<'_>, by: &'static str) -> Option { - let want_lang = hints.language.map(str::to_lowercase); - let want_lastname = hints - .creator - .map(last_name) - .map(str::to_lowercase); +/// whose author overlaps the requested creator's last name. Used for +/// title+author search, where false positives (popular English book +/// outranking the actual translation we want) are a real risk. 
+fn pick_filtered( + resp: &SearchResponse, + hints: &LookupHints<'_>, + by: &'static str, +) -> Option { + let want_lastname = hints.creator.map(last_name).map(str::to_lowercase); for doc in &resp.docs { let Some(cover_id) = doc.cover_i else { continue; }; - if let Some(lang) = &want_lang - && !doc.language.iter().any(|l| l.eq_ignore_ascii_case(lang)) + if let Some(lang) = hints.language + && !language_matches(lang, &doc.language) { continue; } @@ -199,6 +229,23 @@ fn pick(resp: &SearchResponse, hints: &LookupHints<'_>, by: &'static str) -> Opt None } +/// True if `want` (typically ISO 639-1, the form DAISY 2.02 metadata +/// uses) names the same language as any code in `doc_langs` (Open +/// Library typically returns ISO 639-2/B, e.g. `"dut"` for Dutch). +/// Treats 639-1 / 639-2/B / 639-2/T as equivalent via the shared +/// normaliser in `dpub_util::lang`. +fn language_matches(want: &str, doc_langs: &[String]) -> bool { + let Some(want_norm) = dpub_util::lang::iso639_to_part1(want) else { + // Unknown code — fall back to literal comparison. 
+ return doc_langs + .iter() + .any(|l| l.eq_ignore_ascii_case(want.trim())); + }; + doc_langs.iter().any(|l| { + dpub_util::lang::iso639_to_part1(l).is_some_and(|n| n == want_norm) + }) +} + fn fetch_cover(agent: &ureq::Agent, candidate: &Candidate) -> Result { let url = if let Some(olid) = &candidate.edition_key { format!("{COVERS_URL}/olid/{olid}-L.jpg") @@ -337,7 +384,7 @@ mod tests { language: Some("nl"), identifier: None, }; - assert!(pick(&resp, &hints, "test").is_none()); + assert!(pick_filtered(&resp, &hints, "test").is_none()); } #[test] @@ -357,7 +404,7 @@ mod tests { language: Some("nl"), identifier: None, }; - let got = pick(&resp, &hints, "test").expect("matched"); + let got = pick_filtered(&resp, &hints, "test").expect("matched"); assert_eq!(got.cover_id, 99); assert_eq!(got.edition_key.as_deref(), Some("OL12345M")); } @@ -388,6 +435,66 @@ mod tests { language: Some("nl"), identifier: None, }; - assert_eq!(pick(&resp, &hints, "test").map(|c| c.cover_id), Some(7)); + assert_eq!( + pick_filtered(&resp, &hints, "test").map(|c| c.cover_id), + Some(7) + ); + } + + /// Regression: Open Library tags Dutch books `"dut"` (ISO 639-2/B) + /// but DAISY metadata uses `"nl"` (ISO 639-1). The naive + /// `eq_ignore_ascii_case` we used before silently dropped every + /// Dutch book that had a perfectly fine cover. Real-world miss: + /// "Het smelt" by Lize Spit (cover_i 13303384). 
/// Regression guard for the ISO 639 mismatch between the two data
/// sources: Open Library labels Dutch books `"dut"` (ISO 639-2/B) while
/// DAISY metadata says `"nl"` (ISO 639-1). The earlier plain
/// `eq_ignore_ascii_case` comparison therefore discarded every Dutch
/// book, even when a usable cover existed. Observed in the wild with
/// "Het smelt" by Lize Spit (cover_i 13303384).
#[test]
fn pick_accepts_iso_639_2_b_against_iso_639_1() {
    let dutch_doc = SearchDoc {
        title: Some("Het smelt".into()),
        cover_i: Some(13_303_384),
        cover_edition_key: Some("OL46543686M".into()),
        language: vec!["dut".into()],
        author_name: vec!["Lize Spit".into()],
    };
    let resp = SearchResponse {
        docs: vec![dutch_doc],
    };
    let hints = LookupHints {
        title: "HET SMELT",
        creator: Some("Lize Spit"),
        language: Some("nl"),
        // Not ISBN-shaped, which forces the title+author path.
        identifier: Some("13247A"),
    };

    let chosen = pick_filtered(&resp, &hints, "test").expect("matched");
    assert_eq!(chosen.cover_id, 13_303_384);
}

/// `language_matches` must treat the 639-1, 639-2/B and 639-2/T spellings
/// of a language as interchangeable, in either direction and regardless
/// of letter case, and must reject genuinely different or missing codes.
#[test]
fn language_matches_handles_iso_639_variants() {
    // (hint, codes on the search doc, expected verdict)
    let cases: [(&str, &[&str], bool); 8] = [
        ("nl", &["dut"], true),
        ("nl", &["nld"], true),
        ("dut", &["nl"], true),
        ("fr", &["fre"], true),
        ("de", &["ger", "eng"], true),
        ("EN", &["eng"], true), // case-insensitive
        ("nl", &["eng"], false),
        ("nl", &[], false),
    ];

    for &(want, doc_codes, expected) in &cases {
        let doc_langs: Vec<String> = doc_codes.iter().map(|c| c.to_string()).collect();
        assert_eq!(
            language_matches(want, &doc_langs),
            expected,
            "want={want:?} doc_langs={doc_codes:?}"
        );
    }
}

/// An ISBN pins down one edition, so a hit from the ISBN search is taken
/// as-is. Open Library may list a translator under `author_name` or carry
/// an unexpected language tag; neither is a reason to discard the cover.
#[test]
fn isbn_path_skips_language_and_author_filters() {
    let mismatching_doc = SearchDoc {
        title: Some("Some Book".into()),
        cover_i: Some(42),
        cover_edition_key: None,
        // Neither of the next two fields would agree with a typical hint.
        language: vec!["eng".into()],
        author_name: vec!["Translator Name".into()],
    };
    let resp = SearchResponse {
        docs: vec![mismatching_doc],
    };

    let chosen = pick_first_with_cover(&resp, "isbn").expect("matched");
    assert_eq!(chosen.cover_id, 42);
}
/// Normalise an ISO 639-1, 639-2/B or 639-2/T language code to its
/// canonical ISO 639-1 (two-letter) form.
///
/// Background: DAISY 2.02 metadata typically uses ISO 639-1 two-letter
/// codes (`nl`, `en`), but real-world books occasionally carry ISO 639-2/B
/// (`dut`) or 639-2/T (`nld`) variants instead. Whisper (and most modern
/// tooling) expects ISO 639-1. This function is the single normaliser that
/// maps any recognised variant to its canonical two-letter form.
///
/// Matching rules:
///
/// * Case-insensitive; surrounding whitespace is ignored.
/// * Only the primary language subtag is considered: everything from the
///   first `-` or `_` onwards (a region or script qualifier, as in
///   `nl-BE`, `en_US`, `zh-Hant-TW`) is dropped before the lookup, so a
///   qualified tag resolves exactly like its bare code.
///
/// Returns `None` for codes that are not in the lookup table, including
/// the empty string.
///
/// ```
/// assert_eq!(dpub_util::lang::iso639_to_part1("dut"), Some("nl"));
/// assert_eq!(dpub_util::lang::iso639_to_part1("NL"), Some("nl"));
/// assert_eq!(dpub_util::lang::iso639_to_part1("nld"), Some("nl"));
/// assert_eq!(dpub_util::lang::iso639_to_part1("nl-BE"), Some("nl"));
/// assert_eq!(dpub_util::lang::iso639_to_part1("xyz"), None);
/// ```
pub fn iso639_to_part1(code: &str) -> Option<&'static str> {
    // Keep only the primary subtag; `split` always yields at least one
    // item, so the fallback is never taken in practice.
    let primary = code
        .trim()
        .split(|c: char| c == '-' || c == '_')
        .next()
        .unwrap_or("");

    // Every code in the table is two or three bytes long, so anything
    // shorter or longer (including the empty string) cannot match.
    if !(2..=3).contains(&primary.len()) {
        return None;
    }

    // Lowercase into a stack buffer so the lookup never allocates; the
    // match arms below are all-lowercase.
    let mut buf = [0u8; 3];
    let lowered = &mut buf[..primary.len()];
    lowered.copy_from_slice(primary.as_bytes());
    lowered.make_ascii_lowercase();
    // ASCII lowercasing leaves non-ASCII bytes untouched, so the copy is
    // still valid UTF-8; `.ok()?` is purely defensive.
    let lc = std::str::from_utf8(lowered).ok()?;

    match lc {
        // Whisper-supported languages that appear in DAISY books.
        // Each group: part-1 | part-2/B | part-2/T (where they differ).
        "af" | "afr" => Some("af"),
        "ar" | "ara" => Some("ar"),
        "be" | "bel" => Some("be"),
        "bg" | "bul" => Some("bg"),
        "bn" | "ben" => Some("bn"),
        "ca" | "cat" => Some("ca"),
        "cs" | "cze" | "ces" => Some("cs"),
        "cy" | "wel" | "cym" => Some("cy"),
        "da" | "dan" => Some("da"),
        "de" | "ger" | "deu" => Some("de"),
        "el" | "gre" | "ell" => Some("el"),
        "en" | "eng" => Some("en"),
        "es" | "spa" => Some("es"),
        "et" | "est" => Some("et"),
        "eu" | "baq" | "eus" => Some("eu"),
        "fa" | "per" | "fas" => Some("fa"),
        "fi" | "fin" => Some("fi"),
        "fr" | "fre" | "fra" => Some("fr"),
        "gl" | "glg" => Some("gl"),
        "he" | "heb" => Some("he"),
        "hi" | "hin" => Some("hi"),
        "hr" | "hrv" => Some("hr"),
        "hu" | "hun" => Some("hu"),
        "hy" | "arm" | "hye" => Some("hy"),
        "id" | "ind" => Some("id"),
        "is" | "ice" | "isl" => Some("is"),
        "it" | "ita" => Some("it"),
        "ja" | "jpn" => Some("ja"),
        "ka" | "geo" | "kat" => Some("ka"),
        "kk" | "kaz" => Some("kk"),
        "ko" | "kor" => Some("ko"),
        "lt" | "lit" => Some("lt"),
        "lv" | "lav" => Some("lv"),
        "mk" | "mac" | "mkd" => Some("mk"),
        "mr" | "mar" => Some("mr"),
        "ms" | "may" | "msa" => Some("ms"),
        "ne" | "nep" => Some("ne"),
        "nl" | "dut" | "nld" => Some("nl"),
        "no" | "nor" => Some("no"),
        "pl" | "pol" => Some("pl"),
        "pt" | "por" => Some("pt"),
        "ro" | "rum" | "ron" => Some("ro"),
        "ru" | "rus" => Some("ru"),
        "sk" | "slo" | "slk" => Some("sk"),
        "sl" | "slv" => Some("sl"),
        "sr" | "srp" => Some("sr"),
        "sv" | "swe" => Some("sv"),
        "sw" | "swa" => Some("sw"),
        "ta" | "tam" => Some("ta"),
        "th" | "tha" => Some("th"),
        "tl" | "tgl" => Some("tl"),
        "tr" | "tur" => Some("tr"),
        "uk" | "ukr" => Some("uk"),
        "ur" | "urd" => Some("ur"),
        "vi" | "vie" => Some("vi"),
        "zh" | "chi" | "zho" => Some("zh"),
        _ => None,
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn part1_passes_through() {
        assert_eq!(iso639_to_part1("nl"), Some("nl"));
        assert_eq!(iso639_to_part1("en"), Some("en"));
        assert_eq!(iso639_to_part1("fr"), Some("fr"));
    }

    #[test]
    fn part2b_normalises() {
        assert_eq!(iso639_to_part1("dut"), Some("nl"));
        assert_eq!(iso639_to_part1("fre"), Some("fr"));
        assert_eq!(iso639_to_part1("ger"), Some("de"));
        assert_eq!(iso639_to_part1("cze"), Some("cs"));
    }

    #[test]
    fn part2t_normalises() {
        assert_eq!(iso639_to_part1("nld"), Some("nl"));
        assert_eq!(iso639_to_part1("fra"), Some("fr"));
        assert_eq!(iso639_to_part1("deu"), Some("de"));
        assert_eq!(iso639_to_part1("ces"), Some("cs"));
    }

    #[test]
    fn case_insensitive() {
        assert_eq!(iso639_to_part1("NL"), Some("nl"));
        assert_eq!(iso639_to_part1("DUT"), Some("nl"));
        assert_eq!(iso639_to_part1("Eng"), Some("en"));
    }

    #[test]
    fn trims_whitespace() {
        assert_eq!(iso639_to_part1("  nl  "), Some("nl"));
    }

    /// Region- or script-qualified tags resolve via their primary subtag.
    #[test]
    fn strips_region_and_script_subtags() {
        assert_eq!(iso639_to_part1("nl-BE"), Some("nl"));
        assert_eq!(iso639_to_part1("en_US"), Some("en"));
        assert_eq!(iso639_to_part1("DUT-be"), Some("nl"));
        assert_eq!(iso639_to_part1("zh-Hant-TW"), Some("zh"));
        // An unknown primary subtag stays unknown, whatever follows it.
        assert_eq!(iso639_to_part1("xyz-NL"), None);
        // A leading separator leaves an empty primary subtag.
        assert_eq!(iso639_to_part1("-nl"), None);
    }

    #[test]
    fn unknown_returns_none() {
        assert_eq!(iso639_to_part1("xyz"), None);
        assert_eq!(iso639_to_part1(""), None);
        assert_eq!(iso639_to_part1("this-is-too-long-to-be-a-code"), None);
    }
}