# Audio Extraction Review Pipeline

Stage 1: deterministic input discovery and ffmpeg extraction with batch-safe failure logging.


In [None]:
from __future__ import annotations
import json, shlex, subprocess, time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any


In [None]:
# Stage configuration consumed by discover_inputs() and run_extraction_stage().
CONFIG: dict[str, Any] = {
    # "single" processes only `single_input`; "batch" combines `batch_inputs`
    # with every file matching `batch_glob` inside <project_root>/large-files.
    "input_mode": "single",
    # Relative paths are interpreted against the project root.
    "single_input": "large-files/Doug and Twitch Chat TAKE OVER EUROPE-VpmmuHlLPM0.mkv",
    "batch_inputs": [],
    "batch_glob": "*.mkv",
    # When False, an already-existing output audio file is reused as-is.
    "force_reextract": False,
    # Options forwarded to the ffmpeg command line by extract_audio().
    "ffmpeg": {
        "audio_codec": "flac",
        "sample_rate": 16000,
        "channels": 1,
        # True passes -y (overwrite output); False passes -n (never overwrite).
        "overwrite": True,
    },
}


In [None]:
def now_iso() -> str:
    """Return the current UTC time as an ISO-8601 string with a ``Z`` suffix.

    The value has whole-second resolution, e.g. ``2024-05-01T12:34:56Z``.
    """
    current_utc = datetime.now(timezone.utc)
    return current_utc.strftime("%Y-%m-%dT%H:%M:%SZ")

def resolve_project_paths(start: Path | None = None) -> dict[str, Path]:
    start_path = (start or Path.cwd()).resolve()
    candidates = [start_path, *start_path.parents]
    anchor = next((p / "00-supporting-files" for p in candidates if (p / "00-supporting-files").exists()), None)
    if anchor is None:
        raise FileNotFoundError("Could not locate 00-supporting-files anchor")
    project_root = anchor.parent
    data_root = anchor / "data" / "audio-extraction-review"
    paths = {
        "project_root": project_root,
        "supporting_files": anchor,
        "data_root": data_root,
        "audio_dir": data_root / "audio",
        "logs_dir": data_root / "logs",
        "runs_dir": data_root / "runs",
    }
    for key in ("data_root", "audio_dir", "logs_dir", "runs_dir"):
        paths[key].mkdir(parents=True, exist_ok=True)
    return paths

def as_project_relative(path: Path, project_root: Path) -> str:
    """Render ``path`` relative to ``project_root`` when it lies inside it.

    Both arguments are resolved before comparison so that a relative or
    symlinked ``project_root`` still matches paths located beneath it.

    Args:
        path: The path to render.
        project_root: The directory the result should be relative to.

    Returns:
        The project-relative path as a string, or the resolved absolute path
        when ``path`` is not located under ``project_root``.
    """
    resolved = path.resolve()
    try:
        return str(resolved.relative_to(project_root.resolve()))
    except ValueError:
        # relative_to() raises ValueError when the path is outside the root.
        return str(resolved)

def discover_inputs(config: dict[str, Any], project_root: Path) -> list[Path]:
    """Build the list of media files selected by ``config``.

    ``config["input_mode"]`` (case-insensitive, surrounding whitespace
    ignored) selects the strategy:

    * ``"single"``: return only ``config["single_input"]``.
    * ``"batch"``: combine ``config["batch_inputs"]`` with every match of
      ``config["batch_glob"]`` inside ``<project_root>/large-files`` (when that
      directory exists), then de-duplicate and sort the resolved paths.

    Relative entries are anchored at ``project_root``. Existence of the
    returned paths is not checked here.

    Raises:
        ValueError: ``input_mode`` is neither ``single`` nor ``batch``.
    """

    def anchored(raw: Any) -> Path:
        # Absolute entries are kept untouched; relative ones hang off the root.
        candidate = Path(raw)
        if candidate.is_absolute():
            return candidate
        return (project_root / candidate).resolve()

    selected_mode = config["input_mode"].strip().lower()
    if selected_mode not in ("single", "batch"):
        raise ValueError("input_mode must be single or batch")

    if selected_mode == "single":
        return [anchored(config["single_input"])]

    collected = [anchored(raw) for raw in config.get("batch_inputs", [])]
    pattern = config.get("batch_glob")
    media_dir = project_root / "large-files"
    if pattern and media_dir.exists():
        collected += sorted(media_dir.glob(pattern))

    # Resolving before de-duplication collapses different spellings of one file.
    unique_paths = {entry.resolve() for entry in collected}
    return sorted(unique_paths)

def append_jsonl(path: Path, payload: dict[str, Any]) -> None:
    """Append ``payload`` to ``path`` as one ASCII-escaped JSON line.

    Missing parent directories are created first. The file is opened in
    append mode with UTF-8 encoding, so existing lines are preserved.
    """
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    with open(path, mode="a", encoding="utf-8") as sink:
        serialized = json.dumps(payload, ensure_ascii=True)
        sink.write(f"{serialized}\n")

def output_audio_path(input_media: Path, audio_dir: Path) -> Path:
    """Return the ``.flac`` destination inside ``audio_dir`` for ``input_media``.

    The file name is the input's stem with every space replaced by an
    underscore; the input's directory plays no part in the result.
    """
    safe_stem = "_".join(input_media.stem.split(" "))
    return audio_dir.joinpath(safe_stem + ".flac")

def ffprobe_duration_seconds(path: Path) -> float | None:
    cmd = ["ffprobe", "-v", "error", "-show_entries", "format=duration", "-of", "default=noprint_wrappers=1:nokey=1", str(path)]
    proc = subprocess.run(cmd, capture_output=True, text=True)
    if proc.returncode != 0:
        return None
    try:
        return float(proc.stdout.strip()) if proc.stdout.strip() else None
    except ValueError:
        return None

def extract_audio(input_media: Path, output_audio: Path, ffmpeg_cfg: dict[str, Any]) -> subprocess.CompletedProcess[str]:
    """Run ffmpeg to write the audio track of ``input_media`` to ``output_audio``.

    Options read from ``ffmpeg_cfg`` (with their defaults):

    * ``overwrite`` (``True``): pass ``-y`` when truthy, otherwise ``-n``.
    * ``channels`` (``1``): value for ``-ac``.
    * ``sample_rate`` (``16000``): value for ``-ar``.
    * ``audio_codec`` (``"flac"``): value for ``-c:a``.

    Video is dropped with ``-vn``. The exit status is not checked here;
    callers inspect ``returncode`` and ``stderr`` on the returned object.
    """
    overwrite_flag = "-y" if ffmpeg_cfg.get("overwrite", True) else "-n"
    channel_count = str(ffmpeg_cfg.get("channels", 1))
    sample_rate = str(ffmpeg_cfg.get("sample_rate", 16000))
    codec_name = str(ffmpeg_cfg.get("audio_codec", "flac"))

    command = ["ffmpeg", "-v", "error", overwrite_flag]
    command += ["-i", str(input_media), "-vn"]
    command += ["-ac", channel_count, "-ar", sample_rate, "-c:a", codec_name]
    command.append(str(output_audio))
    return subprocess.run(command, capture_output=True, text=True)

def run_extraction_stage(*, inputs: list[Path], paths: dict[str, Path], config: dict[str, Any], run_id: str) -> dict[str, Any]:
    """Extract audio for every input, logging one JSONL record per file.

    A failure on one input never stops the batch: it is recorded and the loop
    moves on to the next file.

    Args:
        inputs: Media files to process, in order.
        paths: Project paths; ``logs_dir`` and ``audio_dir`` are used here.
        config: Stage configuration; ``force_reextract`` and ``ffmpeg`` are
            read.
        run_id: Identifier embedded in each record and in the per-run log
            file name (``extraction-<run_id>.jsonl``).

    Returns:
        A summary dict with ``stage``, ``duration_seconds``, ``records`` (all
        inputs), ``failures`` (failed inputs only), ``log_path`` and
        ``failure_log_path``.

    Each record's ``status`` is one of:

    * ``failed`` with ``error`` set to ``input_not_found`` or
      ``ffmpeg_failed``;
    * ``reused`` when the output already exists and ``force_reextract`` is
      falsy;
    * ``ok`` after a successful extraction.
    """
    started = time.perf_counter()
    run_log = paths["logs_dir"] / f"extraction-{run_id}.jsonl"
    failure_log = paths["logs_dir"] / "extraction-failures.jsonl"
    records: list[dict[str, Any]] = []
    failures: list[dict[str, Any]] = []

    def _commit(entry: dict[str, Any], *, failed: bool = False) -> None:
        # Every record goes to the per-run log; failures are additionally
        # mirrored into the shared failure log.
        append_jsonl(run_log, entry)
        if failed:
            append_jsonl(failure_log, entry)
        records.append(entry)
        if failed:
            failures.append(entry)

    for input_media in inputs:
        record: dict[str, Any] = {"run_id": run_id, "timestamp": now_iso(), "stage": "extract", "input_media": str(input_media), "status": "pending"}
        if not input_media.exists():
            record.update({"status": "failed", "error": "input_not_found"})
            _commit(record, failed=True)
            continue

        out_audio = output_audio_path(input_media, paths["audio_dir"])
        existed_before = out_audio.exists()
        if existed_before and not config.get("force_reextract", False):
            record.update({"status": "reused", "audio_path": str(out_audio), "resume_marker": True})
            _commit(record)
            continue

        try:
            proc = extract_audio(input_media, out_audio, config["ffmpeg"])
        except OSError as exc:
            # ffmpeg could not be started (e.g. not installed). Log it like
            # any other extraction failure instead of aborting the batch.
            record.update({"status": "failed", "error": "ffmpeg_failed", "stderr": str(exc)})
            _commit(record, failed=True)
            continue

        if proc.returncode != 0:
            if not existed_before:
                # A failed run may leave a partial output behind; remove it so
                # a later run does not mistake it for a finished extraction
                # and report "reused". Files that existed before this attempt
                # are left alone (ffmpeg -n fails without touching them).
                try:
                    out_audio.unlink()
                except OSError:
                    pass
            record.update({"status": "failed", "error": "ffmpeg_failed", "stderr": proc.stderr.strip(), "command": " ".join(shlex.quote(p) for p in proc.args)})
            _commit(record, failed=True)
            continue

        record.update({"status": "ok", "audio_path": str(out_audio), "audio_duration_seconds": ffprobe_duration_seconds(out_audio), "resume_marker": False})
        _commit(record)

    return {
        "stage": "extract",
        "duration_seconds": round(time.perf_counter() - started, 3),
        "records": records,
        "failures": failures,
        "log_path": str(run_log),
        "failure_log_path": str(failure_log),
    }


In [None]:
# Resolve the project layout (this also creates the data folders), then show
# each location relative to the project root as the cell's output.
paths = resolve_project_paths()
dict(
    (label, as_project_relative(location, paths["project_root"]))
    for label, location in paths.items()
)


In [None]:
# Example:
# inputs = discover_inputs(CONFIG, paths["project_root"])
# extraction = run_extraction_stage(inputs=inputs, paths=paths, config=CONFIG, run_id="manual-run")
# extraction
