
# Audio Duration Overview

Dieses Notebook lädt die bekannten Datasets (BAS RVG1, Switchboard Benchmark, Verbmobil Small) und berechnet die insgesamt vorliegende Audiomenge in Stunden. Für jedes Dataset werden zusätzlich die Anzahl der Segmente und der Anteil fehlender Dauerangaben ausgewiesen.


In [None]:

from __future__ import annotations

import contextlib
import math
import wave
from pathlib import Path
from typing import Iterable, Optional

import numpy as np
import pandas as pd

from dataset_loaders.BAS_RVG1 import load_sp1_dataframe
from dataset_loaders.switchboard_benchmark import load_switchboard_benchmark_dataframe
from dataset_loaders.verbmobil_small import load_verbmobil_small_dataframe
try:
    from audio_utils import InMemoryAudio
except ImportError:  # fallback when helper is unavailable
    InMemoryAudio = None

# Registry mapping a dataset key to its loader callable. summarize_dataset()
# calls each loader with no arguments and uses `.empty`, `.apply(..., axis=1)`
# and `len()` on the result (presumably a pandas DataFrame — confirm against
# the dataset_loaders package).
DATASET_LOADERS = {
    "bas_rvg1": load_sp1_dataframe,
    "switchboard_benchmark": load_switchboard_benchmark_dataframe,
    "verbmobil_small": load_verbmobil_small_dataframe,
}


In [None]:

def _safe_soundfile_duration(path: Path) -> Optional[float]:
    """Best-effort duration probe via the optional ``soundfile`` package.

    Args:
        path: Location of the audio file to inspect.

    Returns:
        Duration in seconds (frame count divided by sample rate), or ``None``
        when ``soundfile`` cannot be imported, the file cannot be inspected,
        or the reported frame count / sample rate is not positive.
    """
    try:
        # Imported lazily so the notebook still runs without the dependency.
        import soundfile  # type: ignore

        meta = soundfile.info(str(path))
        if not (meta.frames > 0 and meta.samplerate > 0):
            return None
        return meta.frames / float(meta.samplerate)
    except Exception:
        # Deliberate catch-all: any failure simply means "duration unknown".
        return None


def _wave_duration(path: Path) -> Optional[float]:
    """Return the duration of a WAV file in seconds using the stdlib ``wave`` module.

    Args:
        path: Location of the WAV file.

    Returns:
        Frame count divided by frame rate, or ``None`` when ``path`` is not a
        regular file, the file cannot be parsed as WAV, or the header reports
        a non-positive frame count or frame rate.
    """
    if not path.is_file():
        return None
    try:
        with wave.open(str(path), "rb") as handle:
            frames = handle.getnframes()
            framerate = handle.getframerate()
    except (wave.Error, EOFError, OSError):
        # wave.Error: not a RIFF/WAVE file or an unsupported encoding.
        # EOFError: empty or truncated header (raised while reading chunks).
        # OSError: the file could not be opened or read.
        # A single broken file must not abort the whole dataset summary.
        return None
    if frames > 0 and framerate > 0:
        return frames / float(framerate)
    return None


def estimate_duration_seconds(audio_path: Optional[str], *, raw_sample_rate: int = 16_000) -> Optional[float]:
    """Estimate the duration of an audio file in seconds from its path.

    The strategy depends on the file suffix (case-insensitive):

    * ``.wav``: ``soundfile`` if available, otherwise the stdlib ``wave`` module.
    * ``.flac``, ``.ogg``, ``.mp3``, ``.m4a``: ``soundfile`` only.
    * ``.raw``, ``.ssg`` or no suffix: derived from the file size, assuming
      headerless 16-bit mono PCM sampled at ``raw_sample_rate``.

    Args:
        audio_path: Path to the audio file. ``None``, an empty string and
            non-path values such as a pandas NaN are treated as "no path".
        raw_sample_rate: Sample rate in Hz assumed for headerless raw files.

    Returns:
        The duration in seconds, or ``None`` when no path is given, the path
        is not a regular file, the suffix is unknown, or the file could not
        be inspected.
    """
    # Missing cells in a DataFrame arrive as NaN/NA rather than None. NaN is
    # truthy, so a plain truthiness test would let it reach Path() and raise.
    if not (isinstance(audio_path, str) or hasattr(audio_path, "__fspath__")):
        return None
    if not audio_path:
        return None

    path = Path(audio_path)
    # is_file() instead of exists(): a directory without a suffix would
    # otherwise fall into the raw branch and be sized as if it were PCM data.
    if not path.is_file():
        return None

    suffix = path.suffix.lower()
    if suffix == ".wav":
        duration = _safe_soundfile_duration(path)
        return duration if duration is not None else _wave_duration(path)

    if suffix in {".flac", ".ogg", ".mp3", ".m4a"}:
        return _safe_soundfile_duration(path)

    if suffix in {".raw", ".ssg", ""}:
        try:
            size_bytes = path.stat().st_size
        except OSError:
            return None
        # 16-bit PCM mono, fallback assumption for Verbmobil raw segments
        bytes_per_sample = 2
        frames = size_bytes / bytes_per_sample
        return frames / raw_sample_rate

    return None


def compute_row_duration(row: pd.Series) -> Optional[float]:
    """Determine the duration in seconds of one dataset row.

    Sources are tried in order of preference:

    1. A non-missing ``duration_s`` value.
    2. An in-memory ``audio_source``: an ``InMemoryAudio`` instance (when that
       helper is available) or a NumPy array, whose ``sample_rate`` attribute
       is used if present and 16 kHz otherwise.
    3. The file referenced by ``audio_path``.

    Args:
        row: One row of a dataset DataFrame.

    Returns:
        The duration in seconds, or ``None`` when it cannot be determined.
    """
    if "duration_s" in row:
        explicit = row["duration_s"]
        if not pd.isna(explicit):
            return float(explicit)

    payload = row["audio_source"] if "audio_source" in row else None
    if payload is not None:
        if InMemoryAudio is not None and isinstance(payload, InMemoryAudio):
            return len(payload.samples) / float(payload.sample_rate)
        if isinstance(payload, np.ndarray):
            rate = getattr(payload, "sample_rate", 16_000)
            return len(payload) / float(rate)

    # Nothing usable in memory: fall back to inspecting the file on disk.
    return estimate_duration_seconds(row.get("audio_path"))


def summarize_dataset(name: str, loader) -> dict:
    """Load one dataset and aggregate its audio duration statistics.

    Args:
        name: Dataset key, copied into the result and used in the error message.
        loader: Zero-argument callable returning the dataset as a DataFrame.

    Returns:
        A dict with the keys ``dataset``, ``segments`` (row count), ``hours``
        (summed duration; rows without a duration count as zero),
        ``missing_segments`` (rows without a duration) and ``missing_percent``.

    Raises:
        RuntimeError: If the loader returns an empty DataFrame.
    """
    frame = loader()
    if frame.empty:
        raise RuntimeError(f"Dataset '{name}' returned an empty DataFrame")

    n_segments = len(frame)
    # None results become NaN after the float cast, so they can be counted.
    seconds_per_row = frame.apply(compute_row_duration, axis=1).astype(float)
    n_missing = seconds_per_row.isna().sum()
    total_seconds = seconds_per_row.fillna(0.0).sum()

    stats = {"dataset": name, "segments": n_segments}
    stats["hours"] = total_seconds / 3600.0
    stats["missing_segments"] = int(n_missing)
    stats["missing_percent"] = 100.0 * n_missing / n_segments
    return stats


In [None]:

# Run every registered loader and collect one summary dict per dataset.
# summarize_dataset() raises RuntimeError for an empty dataset, which stops
# the loop at that dataset.
summaries = []
for dataset_name, loader in DATASET_LOADERS.items():
    print(f"Loading {dataset_name} ...")  # progress marker before each load
    summary = summarize_dataset(dataset_name, loader)
    summaries.append(summary)

# One row per dataset, sorted by dataset key, with a fresh 0..n-1 index.
summary_df = pd.DataFrame(summaries).sort_values("dataset").reset_index(drop=True)
summary_df  # bare expression; presumably rendered as the notebook cell output


In [None]:

# Display copy of the summary with hours and missing_percent rounded to two
# decimals; summary_df itself keeps the unrounded values.
pretty_df = summary_df.assign(
    hours=summary_df["hours"].map(lambda hrs: round(hrs, 2)),
    missing_percent=summary_df["missing_percent"].map(lambda pct: round(pct, 2)),
)
pretty_df



## Hinweise

- Für Rohdaten ohne explizite Dauerangabe wird 16 kHz, 16-bit PCM Mono angenommen.
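- Für die Formate FLAC, OGG, MP3 und M4A wird das optionale Paket `soundfile` benötigt; ohne `soundfile` zählen diese Dateien als fehlend. WAV-Dateien werden ersatzweise mit dem Standardmodul `wave` gelesen.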
- Fehlende Dauerwerte (`missing_segments`) weisen auf Segmente hin, deren Datei nicht existiert, ein unbekanntes Format hat oder nicht eingelesen werden konnte.
