# Audio Extraction Review Pipeline

Stages 1-2: extraction plus local Whisper transcription with reusable JSON transcripts.


In [None]:
from __future__ import annotations
import json, os, shlex, subprocess, time, traceback
from datetime import datetime, timezone
from pathlib import Path
from typing import Any


In [None]:
# Pipeline settings read by the stage functions defined in this notebook.
CONFIG: dict[str, Any] = {
    # "single" processes only `single_input`; "batch" combines `batch_inputs`
    # with the `batch_glob` matches (see discover_inputs).
    "input_mode": "single",
    # Relative paths are resolved against the project root by discover_inputs.
    "single_input": "large-files/Doug and Twitch Chat TAKE OVER EUROPE-VpmmuHlLPM0.mkv",
    "batch_inputs": [],
    # Glob pattern applied inside <project_root>/large-files in batch mode.
    "batch_glob": "*.mkv",
    # When False, an existing audio file / transcript is reused instead of regenerated.
    "force_reextract": False,
    "force_retranscribe": False,
    # ffmpeg output options; "overwrite" selects the -y (True) or -n (False) flag.
    # NOTE(review): output_audio_path always names the file *.flac, whatever "audio_codec" says.
    "ffmpeg": {"audio_codec": "flac", "sample_rate": 16000, "channels": 1, "overwrite": True},
    # Forwarded to faster-whisper: model_name/device/compute_type go to WhisperModel(...),
    # the remaining keys go to model.transcribe(...).
    "transcription": {
        "model_name": "tiny.en",
        "device": "cpu",
        "compute_type": "int8",
        "beam_size": 5,
        "vad_filter": True,
        "word_timestamps": True,
    },
    # Best-effort speaker labelling; skipped with a recorded fallback reason when the
    # HUGGINGFACE_TOKEN environment variable is unset or whisperx cannot be imported.
    "diarization": {
        "enabled": True,
        # Only copied into the transcript metadata; the code always uses whisperx.
        "provider": "whisperx",
        "device": "cpu",
        # A segment whose best speaker overlap is below this many seconds is labelled "UNKNOWN".
        "min_overlap_seconds": 0.2,
    },
}


In [None]:
def now_iso() -> str:
    """Return the current UTC time as ISO-8601 text with second precision and a ``Z`` suffix."""
    utc_now = datetime.now(timezone.utc)
    whole_seconds = utc_now.replace(microsecond=0)
    stamp = whole_seconds.isoformat()
    # isoformat() spells UTC as "+00:00"; swap in the compact "Z" designator.
    return stamp.replace("+00:00", "Z")

def resolve_project_paths(start: Path | None = None) -> dict[str, Path]:
    """Locate the project via its ``00-supporting-files`` directory and prepare data folders.

    The search begins at ``start`` (the current working directory when omitted)
    and walks up through its parents; the first directory containing
    ``00-supporting-files`` is the project root.

    Returns:
        Mapping with ``project_root``, ``supporting_files``, ``data_root``,
        ``audio_dir``, ``logs_dir``, ``runs_dir`` and ``transcripts_dir``.
        The five data directories are created if missing.

    Raises:
        FileNotFoundError: no ancestor contains ``00-supporting-files``.
    """
    origin = (start or Path.cwd()).resolve()

    anchor: Path | None = None
    for directory in (origin, *origin.parents):
        marker = directory / "00-supporting-files"
        if marker.exists():
            anchor = marker
            break
    if anchor is None:
        raise FileNotFoundError("Could not locate 00-supporting-files anchor")

    data_root = anchor / "data" / "audio-extraction-review"
    # Directories owned by this pipeline; each is created on demand below.
    managed = {
        "data_root": data_root,
        "audio_dir": data_root / "audio",
        "logs_dir": data_root / "logs",
        "runs_dir": data_root / "runs",
        "transcripts_dir": data_root / "transcripts",
    }
    for directory in managed.values():
        directory.mkdir(parents=True, exist_ok=True)

    return {"project_root": anchor.parent, "supporting_files": anchor, **managed}

def as_project_relative(path: Path, project_root: Path) -> str:
    """Render ``path`` relative to ``project_root`` when it lies inside it.

    Both arguments are resolved before comparison, so a relative or symlinked
    ``project_root`` still matches paths underneath it.

    Returns:
        The project-relative path as a string, or the resolved absolute path
        when ``path`` is outside ``project_root``.
    """
    resolved = path.resolve()
    try:
        return str(resolved.relative_to(project_root.resolve()))
    except ValueError:
        # relative_to() signals "not inside project_root" with ValueError.
        return str(resolved)

def discover_inputs(config: dict[str, Any], project_root: Path) -> list[Path]:
    """Build the list of media files to process from ``config``.

    ``input_mode`` (case- and whitespace-insensitive) selects the strategy:

    * ``single`` - return only ``single_input``.
    * ``batch`` - combine ``batch_inputs`` with files matching ``batch_glob``
      inside ``<project_root>/large-files`` (when that directory exists), then
      return the resolved, de-duplicated, sorted result.

    Relative paths are anchored at ``project_root``.

    Raises:
        ValueError: ``input_mode`` is neither ``single`` nor ``batch``.
    """

    def anchored(raw: Any) -> Path:
        # Absolute paths pass through untouched; relative ones hang off the project root.
        candidate = Path(raw)
        return candidate if candidate.is_absolute() else (project_root / candidate).resolve()

    mode = config["input_mode"].strip().lower()
    if mode not in ("single", "batch"):
        raise ValueError("input_mode must be single or batch")

    if mode == "single":
        return [anchored(config["single_input"])]

    collected: list[Path] = [anchored(raw) for raw in config.get("batch_inputs", [])]
    pattern = config.get("batch_glob")
    if pattern:
        large_files_dir = project_root / "large-files"
        if large_files_dir.exists():
            collected += sorted(large_files_dir.glob(pattern))
    # The set removes files listed explicitly and also matched by the glob.
    return sorted({entry.resolve() for entry in collected})

def append_jsonl(path: Path, payload: dict[str, Any]) -> None:
    """Append ``payload`` to ``path`` as one ASCII-escaped JSON line, creating parent folders."""
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    with path.open(mode="a", encoding="utf-8") as sink:
        serialized = json.dumps(payload, ensure_ascii=True)
        sink.write(f"{serialized}\n")

def output_audio_path(input_media: Path, audio_dir: Path) -> Path:
    """Return the ``.flac`` path in ``audio_dir`` for ``input_media``, with spaces in the stem replaced by underscores."""
    safe_stem = "_".join(input_media.stem.split(" "))
    return audio_dir / (safe_stem + ".flac")

def ffprobe_duration_seconds(path: Path) -> float | None:
    """Return the container duration of ``path`` in seconds as reported by ffprobe.

    This is a best-effort probe: every failure mode yields ``None`` rather
    than an exception, so callers can store the value without guarding it.

    Returns:
        The duration as a float, or ``None`` when ffprobe cannot be launched,
        exits with a non-zero status, prints nothing, or prints a value that
        is not a number (for example ``N/A``).
    """
    cmd = ["ffprobe", "-v", "error", "-show_entries", "format=duration", "-of", "default=noprint_wrappers=1:nokey=1", str(path)]
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True)
    except OSError:
        # ffprobe is missing or not executable; treat it like any other probe failure.
        return None
    if proc.returncode != 0:
        return None
    raw = proc.stdout.strip()
    if not raw:
        return None
    try:
        return float(raw)
    except ValueError:
        return None

def extract_audio(input_media: Path, output_audio: Path, ffmpeg_cfg: dict[str, Any]) -> subprocess.CompletedProcess[str]:
    """Run ffmpeg to write the audio track of ``input_media`` to ``output_audio``.

    Args:
        input_media: Source media file.
        output_audio: Destination audio file.
        ffmpeg_cfg: Optional keys ``overwrite`` (default True, selects ``-y``
            or ``-n``), ``channels`` (default 1), ``sample_rate`` (default
            16000) and ``audio_codec`` (default ``flac``).

    Returns:
        The completed ffmpeg process with captured text output. When ffmpeg
        cannot be launched at all (missing or not executable), a synthetic
        result with ``returncode`` 127 and an explanatory ``stderr`` is
        returned, so callers handle it through the normal non-zero-exit path.
    """
    overwrite_flag = "-y" if ffmpeg_cfg.get("overwrite", True) else "-n"
    cmd = [
        "ffmpeg", "-v", "error", overwrite_flag, "-i", str(input_media),
        "-vn", "-ac", str(ffmpeg_cfg.get("channels", 1)), "-ar", str(ffmpeg_cfg.get("sample_rate", 16000)),
        "-c:a", str(ffmpeg_cfg.get("audio_codec", "flac")), str(output_audio),
    ]
    try:
        return subprocess.run(cmd, capture_output=True, text=True)
    except OSError as exc:
        # 127 mirrors the shell's "command not found" status.
        return subprocess.CompletedProcess(args=cmd, returncode=127, stdout="", stderr=f"ffmpeg could not be started: {exc}")

def run_extraction_stage(*, inputs: list[Path], paths: dict[str, Path], config: dict[str, Any], run_id: str) -> dict[str, Any]:
    """Extract audio for every input and write one JSONL log record per input.

    Each record ends with status ``failed`` (input missing or ffmpeg exited
    non-zero), ``reused`` (audio file already present and ``force_reextract``
    is off) or ``ok``. All records go to the per-run log; failures are also
    appended to the shared ``extraction-failures.jsonl``.

    Returns:
        Summary dict with ``stage``, ``duration_seconds``, ``records``,
        ``failures``, ``log_path`` and ``failure_log_path``.
    """
    clock_start = time.perf_counter()
    logs_dir = paths["logs_dir"]
    run_log = logs_dir / f"extraction-{run_id}.jsonl"
    failure_log = logs_dir / "extraction-failures.jsonl"
    records: list[dict[str, Any]] = []
    failures: list[dict[str, Any]] = []

    def _commit(entry: dict[str, Any], *, failed: bool = False) -> None:
        # Persist first, then collect in memory; failures are recorded in both places.
        append_jsonl(run_log, entry)
        if failed:
            append_jsonl(failure_log, entry)
        records.append(entry)
        if failed:
            failures.append(entry)

    for media in inputs:
        entry: dict[str, Any] = {"run_id": run_id, "timestamp": now_iso(), "stage": "extract", "input_media": str(media), "status": "pending"}

        if not media.exists():
            entry.update({"status": "failed", "error": "input_not_found"})
            _commit(entry, failed=True)
            continue

        target = output_audio_path(media, paths["audio_dir"])
        # NOTE(review): reuse is decided by existence alone; a partial file left by an
        # interrupted earlier run would presumably be treated as reusable - confirm.
        already_extracted = target.exists() and not config.get("force_reextract", False)
        if already_extracted:
            entry.update({"status": "reused", "audio_path": str(target), "resume_marker": True})
            _commit(entry)
            continue

        result = extract_audio(media, target, config["ffmpeg"])
        if result.returncode == 0:
            entry.update({"status": "ok", "audio_path": str(target), "audio_duration_seconds": ffprobe_duration_seconds(target), "resume_marker": False})
            _commit(entry)
        else:
            command_line = " ".join(map(shlex.quote, result.args))
            entry.update({"status": "failed", "error": "ffmpeg_failed", "stderr": result.stderr.strip(), "command": command_line})
            _commit(entry, failed=True)

    elapsed = round(time.perf_counter() - clock_start, 3)
    return {
        "stage": "extract",
        "duration_seconds": elapsed,
        "records": records,
        "failures": failures,
        "log_path": str(run_log),
        "failure_log_path": str(failure_log),
    }

def _segment_overlap_seconds(segment_start: float, segment_end: float, diar_start: float, diar_end: float) -> float:
    """Return the length of the intersection of the two intervals, or 0.0 when they do not overlap."""
    shared_start = max(segment_start, diar_start)
    shared_end = min(segment_end, diar_end)
    return max(0.0, shared_end - shared_start)

def best_effort_diarization(*, audio_path: Path, config: dict[str, Any]) -> tuple[list[dict[str, Any]], dict[str, Any]]:
    """Try to diarize ``audio_path`` with whisperx without ever raising on failure.

    Returns:
        ``(segments, meta)``. ``segments`` is a list of
        ``{"start", "end", "speaker"}`` dicts, empty whenever diarization was
        not performed. ``meta`` holds ``attempted``, ``provider``, ``used``
        and ``fallback_reason``; the reason is ``None`` on success, otherwise
        it names why diarization was skipped (disabled in config, missing
        ``HUGGINGFACE_TOKEN``, import failure, or a runtime error).
    """
    diar_cfg = config.get("diarization", {})
    provider = diar_cfg.get("provider", "whisperx")

    def _meta(*, attempted: bool, used: bool, reason: str | None) -> dict[str, Any]:
        return {"attempted": attempted, "provider": provider, "used": used, "fallback_reason": reason}

    def _fallback(reason: str, *, attempted: bool = True) -> tuple[list[dict[str, Any]], dict[str, Any]]:
        return [], _meta(attempted=attempted, used=False, reason=reason)

    if not diar_cfg.get("enabled", True):
        return _fallback("disabled_in_config", attempted=False)

    token = os.getenv("HUGGINGFACE_TOKEN")
    if not token:
        return _fallback("missing_huggingface_token")

    # The optional dependency is imported lazily so the pipeline still works without it.
    try:
        import whisperx
        from whisperx.diarize import DiarizationPipeline
    except Exception:
        return _fallback("whisperx_or_pyannote_not_installed")

    try:
        waveform = whisperx.load_audio(str(audio_path))
        pipeline = DiarizationPipeline(token=token, device=diar_cfg.get("device", "cpu"))
        turns = []
        for _, row in pipeline(waveform).iterrows():
            turns.append({"start": float(row["start"]), "end": float(row["end"]), "speaker": str(row["speaker"])})
    except Exception as exc:
        return _fallback(f"diarization_failed: {exc}")

    return turns, _meta(attempted=True, used=True, reason=None)

def pick_segment_speaker(*, segment_start: float, segment_end: float, diar_segments: list[dict[str, Any]], min_overlap_seconds: float) -> str:
    """Return the speaker whose diarization turn overlaps the segment the most.

    Ties keep the earliest turn in ``diar_segments``. ``"UNKNOWN"`` is
    returned when no turn overlaps the segment, or when the best overlap is
    shorter than ``min_overlap_seconds``.
    """
    matched = False
    top_overlap = 0.0
    top_speaker = "UNKNOWN"
    for turn in diar_segments:
        shared = _segment_overlap_seconds(segment_start, segment_end, turn["start"], turn["end"])
        # Starting from 0.0 means zero-length overlaps never qualify, and the
        # strict comparison keeps the first turn on a tie.
        if shared > top_overlap:
            top_overlap = shared
            top_speaker = turn["speaker"]
            matched = True

    if not matched or top_overlap < min_overlap_seconds:
        return "UNKNOWN"
    return top_speaker

def transcribe_audio_with_faster_whisper(audio_path: Path, config: dict[str, Any]) -> tuple[list[dict[str, Any]], dict[str, Any]]:
    """Transcribe ``audio_path`` with faster-whisper using ``config["transcription"]``.

    Returns:
        ``(segments, info)``. Each segment has a 1-based ``id``, ``start``,
        ``end``, stripped ``text`` and a ``words`` list (empty when the
        engine supplies no word timings). ``info`` carries ``language``,
        ``language_probability``, ``duration`` and ``duration_after_vad``,
        with missing numeric values reported as 0.0.

    Raises:
        RuntimeError: ``faster_whisper`` cannot be imported.
    """
    try:
        from faster_whisper import WhisperModel
    except Exception as exc:
        raise RuntimeError("faster_whisper_not_installed") from exc

    settings = config["transcription"]
    model = WhisperModel(
        settings.get("model_name", "tiny.en"),
        device=settings.get("device", "cpu"),
        compute_type=settings.get("compute_type", "int8"),
    )
    seg_iter, info = model.transcribe(
        str(audio_path),
        beam_size=settings.get("beam_size", 5),
        vad_filter=settings.get("vad_filter", True),
        word_timestamps=settings.get("word_timestamps", True),
    )

    def _segment_record(position: int, raw: Any) -> dict[str, Any]:
        timed_words = [
            {"start": float(item.start), "end": float(item.end), "word": item.word, "probability": float(item.probability)}
            for item in (raw.words or [])
        ]
        return {"id": position, "start": float(raw.start), "end": float(raw.end), "text": raw.text.strip(), "words": timed_words}

    def _numeric(attribute: str) -> float:
        # A missing attribute or a falsy value (None, 0) collapses to 0.0.
        return float(getattr(info, attribute, 0.0) or 0.0)

    # Iterating seg_iter is what pulls the segments out of the engine.
    segments = [_segment_record(position, raw) for position, raw in enumerate(seg_iter, start=1)]

    info_payload = {
        "language": getattr(info, "language", None),
        "language_probability": _numeric("language_probability"),
        "duration": _numeric("duration"),
        "duration_after_vad": _numeric("duration_after_vad"),
    }
    return segments, info_payload

def transcript_output_path(audio_path: Path, transcripts_dir: Path) -> Path:
    """Return the ``.json`` transcript path in ``transcripts_dir`` that shares the audio file's stem."""
    filename = audio_path.stem + ".json"
    return transcripts_dir.joinpath(filename)

def _write_text_atomically(target: Path, text: str) -> None:
    """Write ``text`` to ``target`` through a sibling temp file and an atomic rename."""
    scratch = target.with_name(f"{target.name}.tmp")
    try:
        scratch.write_text(text, encoding="utf-8")
        scratch.replace(target)
    finally:
        # After a successful replace the scratch file no longer exists; after a
        # failure this removes the leftover so it cannot accumulate.
        try:
            scratch.unlink()
        except OSError:
            pass

def run_transcription_stage(*, extraction_records: list[dict[str, Any]], paths: dict[str, Path], config: dict[str, Any], run_id: str) -> dict[str, Any]:
    """Transcribe every successfully extracted audio file into a JSON transcript.

    Only extraction records with status ``ok`` or ``reused`` are processed.
    An existing transcript is reused unless ``force_retranscribe`` is set.
    Because reuse is decided by file existence alone, the transcript is
    written through a temp file and an atomic rename: an interrupted run can
    never leave a truncated transcript that a later run would pick up.

    Any exception while handling one audio file is recorded on that file's
    log record (status ``failed``, with traceback) and processing continues
    with the next file.

    Returns:
        Summary dict with ``stage``, ``duration_seconds``, ``records``,
        ``failures`` and ``log_path``.
    """
    started = time.perf_counter()
    run_log = paths["logs_dir"] / f"transcription-{run_id}.jsonl"
    records: list[dict[str, Any]] = []
    failures: list[dict[str, Any]] = []

    for item in extraction_records:
        if item.get("status") not in {"ok", "reused"}:
            continue

        audio_path = Path(item["audio_path"])
        transcript_path = transcript_output_path(audio_path, paths["transcripts_dir"])
        record = {
            "run_id": run_id,
            "timestamp": now_iso(),
            "stage": "transcribe",
            "audio_path": str(audio_path),
            "transcript_path": str(transcript_path),
            "status": "pending",
        }

        if transcript_path.exists() and not config.get("force_retranscribe", False):
            record.update({"status": "reused", "resume_marker": True})
            append_jsonl(run_log, record)
            records.append(record)
            continue

        try:
            segments, info_payload = transcribe_audio_with_faster_whisper(audio_path, config)
            diar_segments, diar_meta = best_effort_diarization(audio_path=audio_path, config=config)
            # Parsed inside the try so a malformed config value is logged per file.
            min_overlap = float(config.get("diarization", {}).get("min_overlap_seconds", 0.2))

            # Attach a speaker label to each transcribed segment.
            normalized = [
                {
                    "id": seg["id"],
                    "start": seg["start"],
                    "end": seg["end"],
                    "speaker": pick_segment_speaker(segment_start=seg["start"], segment_end=seg["end"], diar_segments=diar_segments, min_overlap_seconds=min_overlap),
                    "text": seg["text"],
                    "words": seg["words"],
                }
                for seg in segments
            ]

            payload = {
                "schema_version": "1.0",
                "run_id": run_id,
                "created_at": now_iso(),
                "source": {
                    "media_path": item.get("input_media"),
                    "audio_path": str(audio_path),
                    "audio_duration_seconds": ffprobe_duration_seconds(audio_path),
                },
                "transcription": {
                    "engine": "faster-whisper",
                    "model_name": config["transcription"].get("model_name"),
                    "device": config["transcription"].get("device"),
                    "compute_type": config["transcription"].get("compute_type"),
                    "language": info_payload.get("language"),
                    "language_probability": info_payload.get("language_probability"),
                    "duration_seconds": info_payload.get("duration"),
                    "duration_after_vad_seconds": info_payload.get("duration_after_vad"),
                },
                "diarization": diar_meta,
                "segments": normalized,
            }
            _write_text_atomically(transcript_path, json.dumps(payload, indent=2, ensure_ascii=True))

            record.update({
                "status": "ok",
                "segment_count": len(normalized),
                "word_timestamp_count": sum(len(s["words"]) for s in normalized),
                "resume_marker": False,
                "diarization_fallback_reason": diar_meta.get("fallback_reason"),
            })
            append_jsonl(run_log, record)
            records.append(record)
        except Exception as exc:
            # Per-file boundary: record the failure and keep going with the rest of the batch.
            record.update({"status": "failed", "error": str(exc), "traceback": traceback.format_exc()})
            append_jsonl(run_log, record)
            records.append(record)
            failures.append(record)

    return {
        "stage": "transcribe",
        "duration_seconds": round(time.perf_counter() - started, 3),
        "records": records,
        "failures": failures,
        "log_path": str(run_log),
    }


In [None]:
# Locate the project from the current working directory; this also creates the
# data, audio, logs, runs and transcripts directories if they are missing.
paths = resolve_project_paths()
# Bare expression at the end of the cell: presumably kept so the notebook
# displays the resolved locations, shown relative to the project root.
{key: as_project_relative(value, paths["project_root"]) for key, value in paths.items()}


In [None]:
# Example:
# inputs = discover_inputs(CONFIG, paths["project_root"])
# extraction = run_extraction_stage(inputs=inputs, paths=paths, config=CONFIG, run_id="manual-run")
# transcription = run_transcription_stage(extraction_records=extraction["records"], paths=paths, config=CONFIG, run_id="manual-run")
# transcription
