In [1]:
# System dependency: ffmpeg (later cells invoke ffmpeg and ffprobe through subprocess).
!apt-get update -y && apt-get install -y ffmpeg
# Python packages for downloading (yt-dlp), transcription (faster-whisper) and audio I/O.
!pip -q install yt-dlp faster-whisper soundfile numpy tqdm
# TensorFlow and TF Hub, used to load YAMNet.
!pip -q install tensorflow tensorflow_hub


Hit:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:2 http://archive.ubuntu.com/ubuntu jammy InRelease                         
Hit:3 http://security.ubuntu.com/ubuntu jammy-security InRelease               
Hit:4 http://archive.ubuntu.com/ubuntu jammy-updates InRelease      
Hit:5 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:6 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Reading package lists... Done
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 126 not upgraded.
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
[0m
[1m[[0m[3

In [2]:
import os, json, shlex, subprocess, re
from pathlib import Path
from datetime import datetime
import numpy as np
import soundfile as sf
from tqdm import tqdm

# Root of the on-disk dataset; every pipeline stage writes into its own subfolder.
ROOT = Path("./standup_dataset")
DIRS = {
    stage: ROOT / stage
    for stage in ("manifest", "audio", "whisper", "yamnet", "json")
}

# Create the whole tree up front (idempotent thanks to exist_ok=True).
for stage_dir in DIRS.values():
    stage_dir.mkdir(parents=True, exist_ok=True)

# ===== Parameters =====
TARGET_N = 30                 # how many videos the final manifest should contain
MIN_DURATION_SEC = 40 * 60    # minimum audio length in seconds (40 minutes)

# Search queries (add more if needed)
QUERIES = [
    "stand up comedy full special",
    "standup comedy full special",
    "comedy special full show",
]

# Whisper: start with "small" to stabilise the pipeline, then move up to medium / large-v3
WHISPER_MODEL_SIZE = "small"
if os.path.exists("/dev/nvidia0"):
    DEVICE, COMPUTE = "cuda", "float16"
else:
    DEVICE, COMPUTE = "cpu", "int8"

# YAMNet: per-frame score threshold passed to the reaction-detection stage
YAMNET_THRESHOLD = 0.35

def run_live(cmd):
    """Run a command and echo its combined stdout/stderr line by line as it is produced.

    Meant for long-running tools (downloads, ffmpeg) whose progress should be visible.

    Args:
        cmd: A shell-style string (split with ``shlex.split``) or a sequence of
            arguments. Non-string items such as ``Path`` objects are converted
            with ``str``.

    Raises:
        RuntimeError: If the process exits with a non-zero return code.
    """
    if isinstance(cmd, str):
        cmd = shlex.split(cmd)
    cmd = [str(part) for part in cmd]
    shown = " ".join(cmd)
    print(">>", shown)
    # The context manager closes the pipe and waits for the child on exit, so
    # neither is left dangling if the streaming loop is interrupted.
    with subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        bufsize=1,
    ) as proc:
        for line in proc.stdout:
            print(line, end="")
        rc = proc.wait()
    if rc != 0:
        raise RuntimeError(f"Command failed (rc={rc}): {shown}")

def run_quiet(cmd):
    """Run a command without echoing anything and return its captured stdout.

    Args:
        cmd: A shell-style string (split with ``shlex.split``) or an argument list.

    Returns:
        The process's stdout as text.

    Raises:
        RuntimeError: If the process exits non-zero. The message is the first
            1200 characters of stderr, falling back to stdout, and finally to
            the return code so the error is never empty.
    """
    if isinstance(cmd, str):
        cmd = shlex.split(cmd)
    proc = subprocess.run(cmd, capture_output=True, text=True)
    if proc.returncode != 0:
        detail = proc.stderr[:1200] or proc.stdout[:1200]
        raise RuntimeError(detail or f"Command failed (rc={proc.returncode})")
    return proc.stdout


In [3]:
# Upgrade yt-dlp to the newest release and print the installed version.
!pip -q install -U yt-dlp
!yt-dlp --version


[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
2025.12.08


In [4]:
import json, subprocess
from datetime import datetime
from pathlib import Path

def ytdlp_has_option(opt: str) -> bool:
    """Report whether the installed yt-dlp mentions ``opt`` in its ``--help`` text.

    This is a plain substring check on the help output, i.e. a capability
    probe rather than a strict option parser.

    Args:
        opt: Option string to look for, e.g. ``"--flat-playlist"``.

    Returns:
        True if the help text contains ``opt``; False if it does not, or if
        yt-dlp cannot be executed at all.
    """
    try:
        proc = subprocess.run(["yt-dlp", "--help"], capture_output=True, text=True)
    except OSError:
        # yt-dlp is not on PATH (or not executable): nothing is supported.
        return False
    return opt in proc.stdout

def _parse_search_line(line: str):
    """Convert one ``--dump-json`` output line into a candidate dict, or None if unusable."""
    try:
        info = json.loads(line)
    except json.JSONDecodeError:
        return None
    if not isinstance(info, dict):
        return None

    vid = info.get("id")
    if not vid:
        return None
    try:
        dur = int(info.get("duration") or 0)  # may be 0 with --flat-playlist
    except (TypeError, ValueError, OverflowError):
        # A duration that cannot be converted makes the entry unusable.
        return None

    return {
        "video_id": vid,
        "duration_sec": dur,
        "title": info.get("title") or "",
        "channel": info.get("uploader") or info.get("channel") or "",
        "webpage_url": (
            info.get("webpage_url")
            or info.get("url")
            or f"https://www.youtube.com/watch?v={vid}"
        ),
    }


def ytdlp_search(query: str, max_results: int = 120):
    """Collect candidate videos through yt-dlp's ``ytsearchN:`` pseudo-URL.

    Entries that fail (age-restricted, removed, region-locked, ...) are skipped
    via ``--ignore-errors``. ``--flat-playlist`` is appended when the installed
    yt-dlp advertises it, so that only shallow metadata is extracted.

    Args:
        query: Search text.
        max_results: Number of results requested from the search.

    Returns:
        A list of dicts with the keys ``video_id``, ``duration_sec``, ``title``,
        ``channel`` and ``webpage_url``. ``duration_sec`` is 0 when yt-dlp did
        not report a duration. Output lines that are not usable JSON objects
        are skipped.
    """
    cmd = [
        "yt-dlp",
        f"ytsearch{max_results}:{query}",
        "--dump-json",
        "--skip-download",
        "--no-playlist",
        "--ignore-errors",
        "--no-warnings",
    ]

    # Use --flat-playlist (instead of --extract-flat), and only when supported.
    if ytdlp_has_option("--flat-playlist"):
        cmd.append("--flat-playlist")

    proc = subprocess.run(cmd, capture_output=True, text=True)

    # A non-zero exit can still leave partial results on stdout, so keep parsing.
    if proc.returncode != 0 and proc.stderr:
        print("[ytdlp_search warning]", proc.stderr.splitlines()[-1])

    items = []
    for line in proc.stdout.splitlines():
        entry = _parse_search_line(line)
        if entry is not None:
            items.append(entry)
    return items

def build_candidate_manifest(want=30, oversample=8, max_per_query=160):
    """Search every query in ``QUERIES`` and write the de-duplicated hits to a JSONL manifest.

    Collection stops as soon as ``want * oversample`` unique videos have been
    gathered. Each written record gets a ``picked_at`` UTC timestamp.

    Args:
        want: Number of videos ultimately needed.
        oversample: Multiplier applied to ``want`` to size the candidate pool.
        max_per_query: ``max_results`` passed to ``ytdlp_search`` for each query.

    Returns:
        Path of the written ``candidates.jsonl`` file.
    """
    quota = want * oversample
    known_ids = set()
    candidates = []

    for query in QUERIES:
        for entry in ytdlp_search(query, max_results=max_per_query):
            video_id = entry["video_id"]
            if video_id not in known_ids:
                known_ids.add(video_id)
                candidates.append(entry)
                if len(candidates) >= quota:
                    break
        if len(candidates) >= quota:
            break

    manifest_path = DIRS["manifest"] / "candidates.jsonl"
    with open(manifest_path, "w", encoding="utf-8") as fh:
        for entry in candidates:
            entry["picked_at"] = datetime.utcnow().isoformat() + "Z"
            record = json.dumps(entry, ensure_ascii=False)
            fh.write(record + "\n")

    print("candidate manifest saved:", manifest_path, "count:", len(candidates))
    return manifest_path

# Build the candidate pool: up to TARGET_N * 8 unique videos gathered across QUERIES.
candidate_path = build_candidate_manifest(want=TARGET_N, oversample=8, max_per_query=160)


candidate manifest saved: standup_dataset/manifest/candidates.jsonl count: 240


In [5]:
def get_audio_duration_sec(path: Path) -> float:
    """Return the duration of ``path`` in seconds as reported by ffprobe.

    Args:
        path: Media file to inspect.

    Returns:
        The ``format=duration`` value printed by ffprobe, or 0.0 when ffprobe
        exits with an error or prints something that is not a number.
    """
    cmd = [
        "ffprobe", "-v", "error",
        "-show_entries", "format=duration",
        "-of", "default=noprint_wrappers=1:nokey=1",
        str(path)
    ]
    proc = subprocess.run(cmd, capture_output=True, text=True)
    if proc.returncode != 0:
        return 0.0
    try:
        return float(proc.stdout.strip())
    except ValueError:
        # Empty or non-numeric output: treat the duration as unknown.
        return 0.0

def download_audio_16k(video_id: str):
    """Download a video's audio with yt-dlp and convert it to a 16 kHz mono WAV.

    The yt-dlp output (``<id>.src.<ext>``) and the final file (``<id>.16k.wav``)
    use different names so they cannot collide.

    Args:
        video_id: YouTube video id.

    Returns:
        Path of ``<id>.16k.wav``. If that file already exists and is non-empty
        it is returned immediately without downloading.

    Raises:
        RuntimeError: If yt-dlp or ffmpeg exits non-zero (raised by ``run_live``).
        FileNotFoundError: If yt-dlp produced no source file, or ffmpeg
            produced no (or an empty) output file.
    """
    url = f"https://www.youtube.com/watch?v={video_id}"
    src_tpl = str(DIRS["audio"] / f"{video_id}.src.%(ext)s")
    out_wav = DIRS["audio"] / f"{video_id}.16k.wav"
    tmp_wav = DIRS["audio"] / f"{video_id}.16k.tmp.wav"

    if out_wav.exists() and out_wav.stat().st_size > 0:
        return out_wav

    # 1) extract the source audio
    run_live([
        "yt-dlp", url,
        "-x", "--audio-format", "wav",
        "-o", src_tpl,
        "--no-playlist",
        "--ignore-errors",
        "--newline", "--progress"
    ])

    # Locate the source file. Prefer the .wav that "--audio-format wav" produces
    # and never pick partial-download leftovers from an interrupted run.
    src_any = DIRS["audio"] / f"{video_id}.src.wav"
    if not src_any.exists():
        leftovers = sorted(
            p for p in DIRS["audio"].glob(f"{video_id}.src.*")
            if p.suffix not in (".part", ".ytdl")
        )
        if not leftovers:
            raise FileNotFoundError(f"yt-dlp output not found for {video_id}")
        src_any = leftovers[0]

    try:
        # 2) convert to 16 kHz mono. ffmpeg writes to a temporary name that is
        #    renamed only after success, so an interrupted conversion cannot
        #    leave a truncated <id>.16k.wav that the cache check above accepts.
        run_live(["ffmpeg", "-y", "-i", str(src_any), "-ac", "1", "-ar", "16000", str(tmp_wav)])
        if not tmp_wav.exists() or tmp_wav.stat().st_size == 0:
            raise FileNotFoundError(f"ffmpeg failed: {out_wav}")
        tmp_wav.replace(out_wav)
    finally:
        # 3) best-effort cleanup of the source and of any leftover temp file,
        #    on success and on failure alike.
        for leftover in (src_any, tmp_wav):
            try:
                leftover.unlink()
            except OSError:
                pass

    return out_wav

def stage1_finalize(candidate_jsonl: Path, need=30, min_sec=40*60):
    """Walk the candidate manifest and keep the first ``need`` usable videos.

    A candidate is kept when its audio downloads successfully and the measured
    duration of the converted WAV is at least ``min_sec`` seconds. The kept
    records are written to ``final_videos.jsonl``.

    Args:
        candidate_jsonl: JSONL file with one candidate per line (needs ``video_id``).
        need: Stop once this many videos have been accepted.
        min_sec: Minimum audio duration in seconds.

    Returns:
        ``(final_path, ok, bad)``: the manifest path, the accepted records
        (with ``audio_path``, ``audio_duration_sec`` and ``finalized_at``
        added), and a list of ``(video_id, truncated error message)`` tuples.
    """
    final_path = DIRS["manifest"] / "final_videos.jsonl"
    ok = []
    bad = []
    too_short = 0

    with open(candidate_jsonl, "r", encoding="utf-8") as f:
        # Blank lines (e.g. from hand-editing the manifest) are skipped.
        cand = [json.loads(line) for line in f if line.strip()]

    for it in tqdm(cand, desc="Stage1 Finalize"):
        if len(ok) >= need:
            break
        vid = it["video_id"]
        try:
            wav_path = download_audio_16k(vid)
            dur = get_audio_duration_sec(wav_path)

            if dur < min_sec:
                # Too short: delete the audio and move on. A duration of 0.0
                # (returned when ffprobe fails) also ends up here.
                too_short += 1
                try:
                    wav_path.unlink()
                except OSError:
                    pass
                continue

            it2 = dict(it)
            it2["audio_path"] = str(wav_path)
            it2["audio_duration_sec"] = dur
            it2["finalized_at"] = datetime.utcnow().isoformat() + "Z"
            ok.append(it2)

        except Exception as e:
            # Keep going: one failed download must not abort the whole stage.
            bad.append((vid, str(e)[:300]))

    with open(final_path, "w", encoding="utf-8") as f:
        for it in ok:
            f.write(json.dumps(it, ensure_ascii=False) + "\n")

    print("final manifest saved:", final_path, "count:", len(ok))
    print("skipped (too short):", too_short)
    print("download errors:", len(bad))
    if bad:
        print("sample errors:", bad[:3])

    return final_path, ok, bad

# Stage 1: download candidates until TARGET_N videos with at least MIN_DURATION_SEC of audio are collected.
final_path, final_videos, stage1_errors = stage1_finalize(candidate_path, need=TARGET_N, min_sec=MIN_DURATION_SEC)


Stage1 Finalize:   0%|          | 0/240 [00:00<?, ?it/s]

>> yt-dlp https://www.youtube.com/watch?v=7SajHU6mMrM -x --audio-format wav -o standup_dataset/audio/7SajHU6mMrM.src.%(ext)s --no-playlist --ignore-errors --newline --progress
[youtube] Extracting URL: https://www.youtube.com/watch?v=7SajHU6mMrM
[youtube] 7SajHU6mMrM: Downloading webpage
[youtube] 7SajHU6mMrM: Downloading android sdkless player API JSON
[youtube] 7SajHU6mMrM: Downloading web safari player API JSON
[youtube] 7SajHU6mMrM: Downloading m3u8 information
[info] 7SajHU6mMrM: Downloading 1 format(s): 251
[download] Destination: standup_dataset/audio/7SajHU6mMrM.src.webm
[download]   0.0% of   24.28MiB at  343.91KiB/s ETA 01:12
[download]   0.0% of   24.28MiB at  897.75KiB/s ETA 00:27
[download]   0.0% of   24.28MiB at    1.89MiB/s ETA 00:12
[download]   0.1% of   24.28MiB at    3.77MiB/s ETA 00:06
[download]   0.1% of   24.28MiB at    7.20MiB/s ETA 00:03
[download]   0.3% of   24.28MiB at    3.34MiB/s ETA 00:07
[download]   0.5% of   24.28MiB at    3.72MiB/s ETA 00:06
[downloa

Stage1 Finalize:   1%|          | 2/240 [00:12<24:45,  6.24s/it]

size=   51200kB time=00:27:21.94 bitrate= 255.4kbits/s speed= 818x    
size=   65768kB time=00:35:04.56 bitrate= 256.0kbits/s speed= 848x    
video:0kB audio:65768kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.000116%
>> yt-dlp https://www.youtube.com/watch?v=3gkYzgWQY84 -x --audio-format wav -o standup_dataset/audio/3gkYzgWQY84.src.%(ext)s --no-playlist --ignore-errors --newline --progress
[youtube] Extracting URL: https://www.youtube.com/watch?v=3gkYzgWQY84
[youtube] 3gkYzgWQY84: Downloading webpage
[youtube] 3gkYzgWQY84: Downloading android sdkless player API JSON
[youtube] 3gkYzgWQY84: Downloading web safari player API JSON
[youtube] 3gkYzgWQY84: Downloading m3u8 information
[info] 3gkYzgWQY84: Downloading 1 format(s): 251
[download] Destination: standup_dataset/audio/3gkYzgWQY84.src.webm
[download]   0.0% of   72.55MiB at  328.63KiB/s ETA 03:47
[download]   0.0% of   72.55MiB at  803.61KiB/s ETA 01:32
[download]   0.0% of   72.55MiB at    1.60MiB/s ETA 00:

Stage1 Finalize:   7%|▋         | 16/240 [00:37<02:36,  1.43it/s]

>> yt-dlp https://www.youtube.com/watch?v=52n6-PGdvII -x --audio-format wav -o standup_dataset/audio/52n6-PGdvII.src.%(ext)s --no-playlist --ignore-errors --newline --progress
[youtube] Extracting URL: https://www.youtube.com/watch?v=52n6-PGdvII
[youtube] 52n6-PGdvII: Downloading webpage
[youtube] 52n6-PGdvII: Downloading android sdkless player API JSON
[youtube] 52n6-PGdvII: Downloading web safari player API JSON
[youtube] 52n6-PGdvII: Downloading m3u8 information
[info] 52n6-PGdvII: Downloading 1 format(s): 251
[download] Destination: standup_dataset/audio/52n6-PGdvII.src.webm
[download]   0.0% of   96.34MiB at  267.97KiB/s ETA 06:09
[download]   0.0% of   96.34MiB at  661.15KiB/s ETA 02:29
[download]   0.0% of   96.34MiB at    1.32MiB/s ETA 01:13
[download]   0.0% of   96.34MiB at    2.53MiB/s ETA 00:38
[download]   0.0% of   96.34MiB at    4.52MiB/s ETA 00:21
[download]   0.1% of   96.34MiB at    2.88MiB/s ETA 00:33
[download]   0.1% of   96.34MiB at    3.86MiB/s ETA 00:24
[downloa

Stage1 Finalize:   7%|▋         | 16/240 [00:49<02:36,  1.43it/s]

Deleting original file standup_dataset/audio/52n6-PGdvII.src.webm (pass -k to keep)
>> ffmpeg -y -i standup_dataset/audio/52n6-PGdvII.src.wav -ac 1 -ar 16000 standup_dataset/audio/52n6-PGdvII.16k.wav
ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --en

Stage1 Finalize:   8%|▊         | 18/240 [01:11<21:26,  5.80s/it]

>> yt-dlp https://www.youtube.com/watch?v=at-LN8PXQGE -x --audio-format wav -o standup_dataset/audio/at-LN8PXQGE.src.%(ext)s --no-playlist --ignore-errors --newline --progress
[youtube] Extracting URL: https://www.youtube.com/watch?v=at-LN8PXQGE
[youtube] at-LN8PXQGE: Downloading webpage
[youtube] at-LN8PXQGE: Downloading android sdkless player API JSON
[youtube] at-LN8PXQGE: Downloading web safari player API JSON
[youtube] at-LN8PXQGE: Downloading m3u8 information
[info] at-LN8PXQGE: Downloading 1 format(s): 251
[download] Destination: standup_dataset/audio/at-LN8PXQGE.src.webm
[download]   0.0% of   81.62MiB at  283.11KiB/s ETA 04:56
[download]   0.0% of   81.62MiB at  708.58KiB/s ETA 01:58
[download]   0.0% of   81.62MiB at    1.43MiB/s ETA 00:57
[download]   0.0% of   81.62MiB at    2.77MiB/s ETA 00:29
[download]   0.0% of   81.62MiB at    5.13MiB/s ETA 00:15
[download]   0.1% of   81.62MiB at    3.86MiB/s ETA 00:21
[download]   0.2% of   81.62MiB at    3.80MiB/s ETA 00:21
[downloa

Stage1 Finalize:   8%|▊         | 19/240 [01:39<36:17,  9.85s/it]

>> yt-dlp https://www.youtube.com/watch?v=1Lik3hSyhrY -x --audio-format wav -o standup_dataset/audio/1Lik3hSyhrY.src.%(ext)s --no-playlist --ignore-errors --newline --progress
[youtube] Extracting URL: https://www.youtube.com/watch?v=1Lik3hSyhrY
[youtube] 1Lik3hSyhrY: Downloading webpage
[youtube] 1Lik3hSyhrY: Downloading android sdkless player API JSON
[youtube] 1Lik3hSyhrY: Downloading web safari player API JSON
[youtube] 1Lik3hSyhrY: Downloading m3u8 information
[info] 1Lik3hSyhrY: Downloading 1 format(s): 251-12
[download] Destination: standup_dataset/audio/1Lik3hSyhrY.src.webm
[download]   0.0% of   37.56MiB at  163.67KiB/s ETA 03:55
[download]   0.0% of   37.56MiB at  449.07KiB/s ETA 01:25
[download]   0.0% of   37.56MiB at  984.31KiB/s ETA 00:39
[download]   0.0% of   37.56MiB at    1.95MiB/s ETA 00:19
[download]   0.1% of   37.56MiB at    3.81MiB/s ETA 00:09
[download]   0.2% of   37.56MiB at    4.22MiB/s ETA 00:08
[download]   0.3% of   37.56MiB at    4.10MiB/s ETA 00:09
[down

Stage1 Finalize:   8%|▊         | 20/240 [01:54<40:02, 10.92s/it]

>> yt-dlp https://www.youtube.com/watch?v=FSbE9EAHiGU -x --audio-format wav -o standup_dataset/audio/FSbE9EAHiGU.src.%(ext)s --no-playlist --ignore-errors --newline --progress
[youtube] Extracting URL: https://www.youtube.com/watch?v=FSbE9EAHiGU
[youtube] FSbE9EAHiGU: Downloading webpage
[youtube] FSbE9EAHiGU: Downloading android sdkless player API JSON
[youtube] FSbE9EAHiGU: Downloading web safari player API JSON
[youtube] FSbE9EAHiGU: Downloading m3u8 information
[info] FSbE9EAHiGU: Downloading 1 format(s): 251
[download] Destination: standup_dataset/audio/FSbE9EAHiGU.src.webm
[download]   0.0% of   54.54MiB at  279.99KiB/s ETA 03:20
[download]   0.0% of   54.54MiB at  675.67KiB/s ETA 01:22
[download]   0.0% of   54.54MiB at    1.33MiB/s ETA 00:41
[download]   0.0% of   54.54MiB at    2.52MiB/s ETA 00:21
[download]   0.1% of   54.54MiB at    4.65MiB/s ETA 00:11
[download]   0.1% of   54.54MiB at    4.34MiB/s ETA 00:12
[download]   0.2% of   54.54MiB at    4.16MiB/s ETA 00:13
[downloa

Stage1 Finalize:  12%|█▏        | 28/240 [02:15<08:36,  2.44s/it]

>> yt-dlp https://www.youtube.com/watch?v=PMGWVyM2NJo -x --audio-format wav -o standup_dataset/audio/PMGWVyM2NJo.src.%(ext)s --no-playlist --ignore-errors --newline --progress
[youtube] Extracting URL: https://www.youtube.com/watch?v=PMGWVyM2NJo
[youtube] PMGWVyM2NJo: Downloading webpage
[youtube] PMGWVyM2NJo: Downloading android sdkless player API JSON
[youtube] PMGWVyM2NJo: Downloading web safari player API JSON
[youtube] PMGWVyM2NJo: Downloading m3u8 information
[info] PMGWVyM2NJo: Downloading 1 format(s): 251
[download] Destination: standup_dataset/audio/PMGWVyM2NJo.src.webm
[download]   0.0% of   58.18MiB at  156.11KiB/s ETA 06:23
[download]   0.0% of   58.18MiB at  375.92KiB/s ETA 02:38
[download]   0.0% of   58.18MiB at  780.17KiB/s ETA 01:16
[download]   0.0% of   58.18MiB at    1.48MiB/s ETA 00:39
[download]   0.1% of   58.18MiB at    1.20MiB/s ETA 00:48
[download]   0.1% of   58.18MiB at    1.35MiB/s ETA 00:43
[download]   0.2% of   58.18MiB at    1.77MiB/s ETA 00:32
[downloa

Stage1 Finalize:  12%|█▏        | 28/240 [02:30<08:36,  2.44s/it]

Deleting original file standup_dataset/audio/PMGWVyM2NJo.src.webm (pass -k to keep)
>> ffmpeg -y -i standup_dataset/audio/PMGWVyM2NJo.src.wav -ac 1 -ar 16000 standup_dataset/audio/PMGWVyM2NJo.16k.wav
ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --en

Stage1 Finalize:  12%|█▎        | 30/240 [02:36<17:39,  5.05s/it]

>> yt-dlp https://www.youtube.com/watch?v=coC4t7nCGPs -x --audio-format wav -o standup_dataset/audio/coC4t7nCGPs.src.%(ext)s --no-playlist --ignore-errors --newline --progress
[youtube] Extracting URL: https://www.youtube.com/watch?v=coC4t7nCGPs
[youtube] coC4t7nCGPs: Downloading webpage
[youtube] coC4t7nCGPs: Downloading android sdkless player API JSON
[youtube] coC4t7nCGPs: Downloading web safari player API JSON
[youtube] coC4t7nCGPs: Downloading m3u8 information
[info] coC4t7nCGPs: Downloading 1 format(s): 251
[download] Destination: standup_dataset/audio/coC4t7nCGPs.src.webm
[download]   0.0% of   55.88MiB at  172.42KiB/s ETA 05:33
[download]   0.0% of   55.88MiB at  439.79KiB/s ETA 02:10
[download]   0.0% of   55.88MiB at  877.89KiB/s ETA 01:05
[download]   0.0% of   55.88MiB at    1.68MiB/s ETA 00:33
[download]   0.1% of   55.88MiB at    3.18MiB/s ETA 00:17
[download]   0.1% of   55.88MiB at    4.21MiB/s ETA 00:13
[download]   0.2% of   55.88MiB at    2.93MiB/s ETA 00:19
[downloa

Stage1 Finalize:  13%|█▎        | 32/240 [02:56<19:05,  5.51s/it]

final manifest saved: standup_dataset/manifest/final_videos.jsonl count: 30
download errors: 0





In [6]:
from faster_whisper import WhisperModel

# Load the faster-whisper model once; stage2_whisper_one uses this module-level instance.
whisper_model = WhisperModel(WHISPER_MODEL_SIZE, device=DEVICE, compute_type=COMPUTE)
print("Whisper model:", WHISPER_MODEL_SIZE, "| device:", DEVICE, "| compute:", COMPUTE)

def stage2_whisper_one(video_id: str, language=None):
    """Transcribe one video's 16 kHz WAV with the global ``whisper_model``.

    Args:
        video_id: Id used to locate ``<id>.16k.wav`` and to name the output.
        language: Language passed through to ``transcribe``.

    Returns:
        Path of the written ``whisper/<id>.json``. An existing non-empty file
        is returned as-is without re-transcribing.
    """
    in_wav = DIRS["audio"] / f"{video_id}.16k.wav"
    out_json = DIRS["whisper"] / f"{video_id}.json"

    if out_json.exists() and out_json.stat().st_size > 0:
        return out_json

    segments, _ = whisper_model.transcribe(
        str(in_wav),
        language=language,
        vad_filter=True,
        beam_size=5,
        temperature=0.0,
        condition_on_previous_text=True,
    )

    segs = []
    for seg in segments:
        txt = (seg.text or "").strip()
        if txt:  # drop segments with no text
            segs.append({"start": float(seg.start), "end": float(seg.end), "text": txt})

    payload = {
        "video_id": video_id,
        "created_at": datetime.utcnow().isoformat() + "Z",
        "model": WHISPER_MODEL_SIZE,
        "device": DEVICE,
        "segments": segs,
    }
    # Write to a temporary file and rename it into place, so an interrupted
    # write never leaves a partial JSON that the cache check above would
    # accept as a finished result.
    tmp_json = out_json.with_name(out_json.name + ".tmp")
    with open(tmp_json, "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)
    tmp_json.replace(out_json)

    return out_json

def load_final_manifest(final_jsonl: Path):
    """Read a JSON-Lines manifest into a list, one parsed object per non-blank line.

    Args:
        final_jsonl: Path of the JSONL file (UTF-8).

    Returns:
        The parsed records in file order. Blank or whitespace-only lines are
        skipped; an empty file yields an empty list.
    """
    with open(final_jsonl, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

# Re-read the final manifest from disk; this replaces the in-memory list returned by stage1_finalize.
final_videos = load_final_manifest(final_path)

def stage2_whisper_all(videos):
    """Transcribe every entry in ``videos``, collecting failures instead of stopping.

    Args:
        videos: Iterable of manifest records; each must have ``video_id``.

    Returns:
        List of ``(video_id, error message truncated to 300 chars)`` for the
        videos whose transcription raised.
    """
    failures = []
    succeeded = 0
    for entry in tqdm(videos, desc="Stage2 Whisper"):
        video_id = entry["video_id"]
        try:
            stage2_whisper_one(video_id, language=None)
        except Exception as exc:
            failures.append((video_id, str(exc)[:300]))
        else:
            succeeded += 1

    print("whisper ok:", succeeded, "bad:", len(failures))
    if failures:
        print("sample errors:", failures[:3])
    return failures

# Stage 2: transcribe every finalized video; failures are returned as a list, not raised.
stage2_errors = stage2_whisper_all(final_videos)


Whisper model: small | device: cpu | compute: int8


Stage2 Whisper: 100%|██████████| 30/30 [48:00<00:00, 96.03s/it]  

whisper ok: 30 bad: 0





In [7]:
# Reduce TensorFlow log output (optional); set before the TensorFlow import below.
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

import tensorflow_hub as hub
import tensorflow as tf

# Load YAMNet from TF Hub and read its class-index -> display-name table.
import csv

yamnet = hub.load("https://tfhub.dev/google/yamnet/1")
class_map_path = yamnet.class_map_path().numpy().decode("utf-8")
# Parse with the csv module: a display name that contains a comma is quoted in
# the class map, and a plain split(",") would cut it short. The context
# manager also closes the file handle.
with open(class_map_path, "r", encoding="utf-8", newline="") as f:
    _rows = csv.reader(f)
    next(_rows, None)  # skip the header row
    class_names = [row[2] for row in _rows if len(row) > 2]

# Reaction label -> YAMNet display names that count as that reaction.
TARGET_CLASSES = dict(
    laughter=["Laughter"],
    applause=["Applause"],
    cheering=["Cheering"],
)

def load_wav_16k(path: Path):
    """Read an audio file as a mono float32 array and insist on a 16 kHz sample rate.

    Multi-channel input is averaged across channels.

    Returns:
        ``(samples, sample_rate)``.

    Raises:
        ValueError: If the file's sample rate is not 16000.
    """
    samples, sample_rate = sf.read(str(path))
    mono = samples if samples.ndim <= 1 else samples.mean(axis=1)
    if sample_rate != 16000:
        raise ValueError("input must be 16k wav")
    return mono.astype(np.float32), sample_rate

def stage3_yamnet_one(video_id: str, threshold=0.35):
    """Detect audience reactions in one video's audio with the in-process YAMNet model.

    YAMNet is run over the whole waveform. For every entry of
    ``TARGET_CLASSES`` the per-frame score is thresholded, and each run of
    consecutive active frames becomes one event. The result is written to
    ``yamnet/<id>.json``.

    Args:
        video_id: Id used to locate ``<id>.16k.wav`` and to name the output.
        threshold: Minimum per-frame score for a frame to count as active.

    Returns:
        Path of the output JSON. An existing non-empty file is returned as-is.
    """
    wav_path = DIRS["audio"] / f"{video_id}.16k.wav"
    result_path = DIRS["yamnet"] / f"{video_id}.json"

    if result_path.exists() and result_path.stat().st_size > 0:
        return result_path

    samples, rate = load_wav_16k(wav_path)
    audio_len = len(samples) / rate

    frame_scores, _embeddings, _spectrogram = yamnet(samples)
    frame_scores = frame_scores.numpy()
    n_frames = frame_scores.shape[0]

    # Approximate YAMNet frame hop (usually ~0.48 s)
    hop = 0.48

    found = []
    for reaction, wanted in TARGET_CLASSES.items():
        cols = [k for k, label in enumerate(class_names) if label in wanted]
        if not cols:
            continue
        track = frame_scores[:, cols].max(axis=1)
        is_on = track >= threshold

        # Walk one step past the last frame so that a run still open at the
        # end of the audio is closed like any other.
        run_begin = None
        for k in range(n_frames + 1):
            frame_active = k < n_frames and is_on[k]
            if frame_active:
                if run_begin is None:
                    run_begin = k
                continue
            if run_begin is None:
                continue

            t0 = min(run_begin * hop, audio_len)
            t1 = min(k * hop, audio_len)
            if t1 - t0 >= 0.3:
                found.append({
                    "start": float(t0),
                    "end": float(t1),
                    "reaction_type": reaction,
                    "score": float(track[run_begin:k].max()),
                })
            run_begin = None

    found = sorted(found, key=lambda ev: ev["start"])

    payload = {
        "video_id": video_id,
        "created_at": datetime.utcnow().isoformat() + "Z",
        "threshold": threshold,
        "events": found
    }
    with open(result_path, "w", encoding="utf-8") as fh:
        json.dump(payload, fh, ensure_ascii=False, indent=2)

    return result_path

def stage3_yamnet_all(videos, threshold=0.35):
    """Run reaction detection on every entry in ``videos``, collecting failures.

    Args:
        videos: Iterable of manifest records; each must have ``video_id``.
        threshold: Forwarded to ``stage3_yamnet_one``.

    Returns:
        List of ``(video_id, error message truncated to 300 chars)``.
    """
    failures = []
    succeeded = 0
    for entry in tqdm(videos, desc="Stage3 YAMNet"):
        video_id = entry["video_id"]
        try:
            stage3_yamnet_one(video_id, threshold=threshold)
        except Exception as exc:
            failures.append((video_id, str(exc)[:300]))
        else:
            succeeded += 1

    print("yamnet ok:", succeeded, "bad:", len(failures))
    if failures:
        print("sample errors:", failures[:3])
    return failures

# Stage 3: detect audience reactions for every finalized video using the in-process YAMNet model.
stage3_errors = stage3_yamnet_all(final_videos, threshold=YAMNET_THRESHOLD)


  if not hasattr(np, "object"):
I0000 00:00:1766925557.984587    2234 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 43710 MB memory:  -> device: 0, name: NVIDIA A40, pci bus id: 0000:56:00.0, compute capability: 8.6
Stage3 YAMNet:   0%|          | 0/30 [00:00<?, ?it/s]2025-12-28 12:39:22.643216: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:466] Loaded runtime CuDNN library: 9.1.0 but source was compiled with: 9.3.0.  CuDNN library needs to have matching major version and equal or higher minor version. If using a binary install, upgrade your CuDNN library.  If building from sources, make sure the library loaded at runtime is compatible with the version specified during compile configuration.
Stage3 YAMNet:   3%|▎         | 1/30 [00:01<00:48,  1.69s/it]2025-12-28 12:39:23.606203: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:466] Loaded runtime CuDNN library: 9.1.0 but source was compiled with: 9.3.0.  CuDNN library needs to

yamnet ok: 0 bad: 30
sample errors: [('Zqv-yKgUbCA', 'Graph execution error:\n\nDetected at node yamnet_frames/layer1/conv/Conv2D defined at (most recent call last):\n<stack traces unavailable>\nNo DNN in stream executor.\n\t [[{{node yamnet_frames/layer1/conv/Conv2D}}]] [Op:__inference_restored_function_body_15213]'), ('zBlytYhNCKg', 'Graph execution error:\n\nDetected at node yamnet_frames/layer1/conv/Conv2D defined at (most recent call last):\n<stack traces unavailable>\nNo DNN in stream executor.\n\t [[{{node yamnet_frames/layer1/conv/Conv2D}}]] [Op:__inference_restored_function_body_15213]'), ('3gkYzgWQY84', 'Graph execution error:\n\nDetected at node yamnet_frames/layer1/conv/Conv2D defined at (most recent call last):\n<stack traces unavailable>\nNo DNN in stream executor.\n\t [[{{node yamnet_frames/layer1/conv/Conv2D}}]] [Op:__inference_restored_function_body_15213]')]





In [11]:
def build_final_timeline(video_id: str):
    """Combine a video's transcript and reaction events into one time-ordered timeline.

    Reads ``whisper/<id>.json`` and ``yamnet/<id>.json``, turns each speech
    segment into a "comedian" row and each reaction event into an "audience"
    row, sorts all rows by ``(start, end)`` and writes ``json/script_<id>.json``.

    Returns:
        Path of the written timeline file.
    """
    def _read_json(path):
        with open(path, "r", encoding="utf-8") as fh:
            return json.load(fh)

    def _speech_row(seg):
        return {
            "start": float(seg["start"]),
            "end": float(seg["end"]),
            "role": "comedian",
            "content": seg["text"],
            "event_type": "speech",
            "delivery_tag": None
        }

    def _reaction_row(ev):
        kind = ev["reaction_type"]
        return {
            "start": float(ev["start"]),
            "end": float(ev["end"]),
            "role": "audience",
            "content": f"[{kind}]",
            "reaction_type": kind
        }

    transcript = _read_json(DIRS["whisper"] / f"{video_id}.json")
    reactions = _read_json(DIRS["yamnet"] / f"{video_id}.json")
    target = DIRS["json"] / f"script_{video_id}.json"

    # comedian speech first, then audience reactions; the sort below interleaves them
    rows = [_speech_row(seg) for seg in transcript.get("segments", [])]
    rows += [_reaction_row(ev) for ev in reactions.get("events", [])]
    rows = sorted(rows, key=lambda row: (row["start"], row["end"]))

    with open(target, "w", encoding="utf-8") as fh:
        json.dump({"video_id": video_id, "timeline": rows}, fh, ensure_ascii=False, indent=2)

    return target

def stage4_merge_all(videos):
    """Build the merged timeline for every entry in ``videos``, collecting failures.

    Args:
        videos: Iterable of manifest records; each must have ``video_id``.

    Returns:
        List of ``(video_id, error message truncated to 300 chars)``.
    """
    failures = []
    succeeded = 0
    for entry in tqdm(videos, desc="Stage4 Merge"):
        video_id = entry["video_id"]
        try:
            build_final_timeline(video_id)
        except Exception as exc:
            failures.append((video_id, str(exc)[:300]))
        else:
            succeeded += 1

    print("merge ok:", succeeded, "bad:", len(failures))
    if failures:
        print("sample errors:", failures[:3])
    return failures

# Stage 4: merge each video's Whisper and YAMNet JSON into json/script_<id>.json.
# Both input files must already exist for a video, otherwise it is reported in the returned error list.
stage4_errors = stage4_merge_all(final_videos)


Stage4 Merge: 100%|██████████| 30/30 [00:03<00:00,  7.74it/s]

merge ok: 30 bad: 0





In [14]:
%%writefile yamnet_detect.py
import os
os.environ["CUDA_VISIBLE_DEVICES"] = ""          # ✅ keep TF off the GPU (works around the cuDNN problem)
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

import json, argparse
from pathlib import Path
import numpy as np
import soundfile as sf
import tensorflow as tf
import tensorflow_hub as hub

# Reaction label -> YAMNet display names that count as that reaction.
TARGET_CLASSES = dict(
    laughter=["Laughter"],
    applause=["Applause"],
    cheering=["Cheering"],
)

def load_wav_16k(path: Path):
    """Load ``path`` as mono float32 samples; only 16 kHz input is accepted.

    Input with more than one channel is reduced to mono by averaging.

    Returns:
        ``(samples, sample_rate)``.

    Raises:
        ValueError: If the sample rate is not 16000.
    """
    audio, rate = sf.read(str(path))
    has_channels = audio.ndim > 1
    if has_channels:
        audio = audio.mean(axis=1)
    if rate != 16000:
        raise ValueError("input must be 16k wav")
    as_float32 = audio.astype(np.float32)
    return as_float32, rate

def merge_events(events, gap=0.25):
    """Merge events of the same ``reaction_type`` that overlap or lie within ``gap`` seconds.

    Args:
        events: List of dicts with ``start``, ``end``, ``reaction_type`` and an
            optional ``score``.
        gap: Maximum distance in seconds between the end of one event and the
            start of the next for the two to be merged.

    Returns:
        A new list of merged events sorted by ``start``. A merged event spans
        from the earliest start to the latest end and carries the highest
        score (a missing score counts as 0.0). The input list and its dicts
        are not modified.
    """
    if not events:
        return []
    # sorted() rather than list.sort(): the caller's list keeps its order.
    ordered = sorted(events, key=lambda x: (x["reaction_type"], x["start"], x["end"]))
    merged = []
    cur = dict(ordered[0])
    for e in ordered[1:]:
        if e["reaction_type"] == cur["reaction_type"] and e["start"] <= cur["end"] + gap:
            cur["end"] = max(cur["end"], e["end"])
            cur["score"] = max(cur.get("score", 0.0), e.get("score", 0.0))
        else:
            merged.append(cur)
            cur = dict(e)
    merged.append(cur)
    merged.sort(key=lambda x: x["start"])
    return merged

def _load_class_names(class_map_path):
    """Return the display names (third column) of the class-map CSV, in row order."""
    import csv

    # csv handles quoted names that contain commas; the context manager closes the file.
    with open(class_map_path, "r", encoding="utf-8", newline="") as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row
        return [row[2] for row in reader if len(row) > 2]


def main():
    """CLI entry point: detect laughter/applause/cheering in a 16 kHz WAV with YAMNet.

    Arguments (command line):
        --input: Path of the 16 kHz WAV to analyse.
        --output: Path of the JSON file to write.
        --threshold: Minimum per-frame class score for a frame to count as active.
        --chunk_sec: Length in seconds of the pieces the audio is scored in.

    The audio is scored chunk by chunk. Runs of consecutive active frames
    become events, and nearby events of the same type are merged with
    ``merge_events``.
    """
    from datetime import datetime

    ap = argparse.ArgumentParser()
    ap.add_argument("--input", required=True)
    ap.add_argument("--output", required=True)
    ap.add_argument("--threshold", type=float, default=0.15)
    ap.add_argument("--chunk_sec", type=float, default=10.0)
    args = ap.parse_args()

    in_wav = Path(args.input)
    out_json = Path(args.output)

    yamnet = hub.load("https://tfhub.dev/google/yamnet/1")
    class_map_path = yamnet.class_map_path().numpy().decode("utf-8")
    class_names = _load_class_names(class_map_path)

    # Score-matrix columns for each reaction type. They do not depend on the
    # chunk, so they are resolved once instead of inside the chunk loop.
    class_cols = {}
    for rtype, names in TARGET_CLASSES.items():
        idxs = [i for i, nm in enumerate(class_names) if nm in names]
        if idxs:
            class_cols[rtype] = idxs

    wav, sr = load_wav_16k(in_wav)
    total_dur = len(wav) / sr

    # YAMNet's frame hop is usually about 0.48 s
    hop = 0.48

    all_events = []
    chunk_samples = int(args.chunk_sec * sr)
    if chunk_samples < 1:
        # range() below needs a positive step
        ap.error("--chunk_sec must be large enough to cover at least one sample")

    # NOTE(review): each chunk is scored independently, so a reaction that
    # spans a chunk boundary comes out as two events unless merge_events
    # joins them (gap=0.25) -- confirm this is acceptable.
    for start_samp in range(0, len(wav), chunk_samples):
        end_samp = min(start_samp + chunk_samples, len(wav))
        chunk = wav[start_samp:end_samp]
        offset_sec = start_samp / sr

        scores, embeddings, spectrogram = yamnet(chunk)
        scores = scores.numpy()
        frames = scores.shape[0]

        for rtype, idxs in class_cols.items():
            rscore = scores[:, idxs].max(axis=1)
            active = rscore >= args.threshold

            i = 0
            while i < frames:
                if not active[i]:
                    i += 1
                    continue
                # [i, j) is a maximal run of active frames; track its peak score
                j = i
                peak = float(rscore[i])
                while j < frames and active[j]:
                    peak = max(peak, float(rscore[j]))
                    j += 1

                # chunk-relative frame index -> absolute time, clamped to the audio length
                s = min(offset_sec + i * hop, total_dur)
                e = min(offset_sec + j * hop, total_dur)

                if e - s >= 0.3:
                    all_events.append({
                        "start": float(s),
                        "end": float(e),
                        "reaction_type": rtype,
                        "score": float(peak),
                    })
                i = j

    all_events = merge_events(all_events, gap=0.25)

    payload = {
        "video_id": in_wav.stem.replace(".16k", ""),
        "created_at": datetime.utcnow().isoformat() + "Z",
        "threshold": args.threshold,
        "events": all_events,
        "note": "CPU-only TF (CUDA_VISIBLE_DEVICES='')"
    }
    out_json.parent.mkdir(parents=True, exist_ok=True)
    # Write to a temporary file and rename it, so an interrupted write never
    # leaves a partial JSON at the final path.
    tmp_json = out_json.with_name(out_json.name + ".tmp")
    tmp_json.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
    tmp_json.replace(out_json)

if __name__ == "__main__":
    main()


Overwriting yamnet_detect.py


In [15]:
import subprocess

def stage3_yamnet_one(video_id: str, threshold=0.15):
    """Run ``yamnet_detect.py`` on one video's audio in a separate process.

    The script disables GPU use for TensorFlow, so detection runs on CPU.

    Args:
        video_id: Id used to locate ``<id>.16k.wav`` and to name the output.
        threshold: Passed to the script as ``--threshold``.

    Returns:
        Path of ``yamnet/<id>.json``. An existing non-empty file is returned
        as-is, whatever threshold it was produced with.

    Raises:
        RuntimeError: If the script exits with a non-zero status.
        FileNotFoundError: If the script exits cleanly but leaves no output.
    """
    import sys

    in_wav = DIRS["audio"] / f"{video_id}.16k.wav"
    out_json = DIRS["yamnet"] / f"{video_id}.json"

    if out_json.exists() and out_json.stat().st_size > 0:
        return out_json

    cmd = [
        # Use the kernel's own interpreter so the script sees the same
        # installed packages; a bare "python" may resolve to another one.
        sys.executable or "python", "yamnet_detect.py",
        "--input", str(in_wav),
        "--output", str(out_json),
        "--threshold", str(threshold),
        "--chunk_sec", "60"
    ]
    # Output is captured and only surfaced (truncated) if the script fails.
    p = subprocess.run(cmd, capture_output=True, text=True)
    if p.returncode != 0:
        raise RuntimeError(
            p.stderr[:1200] or p.stdout[:1200]
            or f"yamnet_detect.py failed (rc={p.returncode})"
        )
    if not out_json.exists() or out_json.stat().st_size == 0:
        raise FileNotFoundError(f"yamnet_detect.py produced no output: {out_json}")

    return out_json

def stage3_yamnet_all(videos, threshold=0.15):
    """Run subprocess-based reaction detection on every entry in ``videos``.

    Args:
        videos: Iterable of manifest records; each must have ``video_id``.
        threshold: Forwarded to ``stage3_yamnet_one``.

    Returns:
        List of ``(video_id, error message truncated to 300 chars)``.
    """
    failures = []
    succeeded = 0
    for entry in tqdm(videos, desc="Stage3 YAMNet (CPU subprocess)"):
        video_id = entry["video_id"]
        try:
            stage3_yamnet_one(video_id, threshold=threshold)
        except Exception as exc:
            failures.append((video_id, str(exc)[:300]))
        else:
            succeeded += 1

    print("yamnet ok:", succeeded, "bad:", len(failures))
    if failures:
        print("sample errors:", failures[:3])
    return failures

# NOTE: this passes YAMNET_THRESHOLD (0.35 in the config cell), not this cell's 0.15 default.
# Videos that already have a non-empty yamnet/<id>.json are skipped without being re-scored.
stage3_errors = stage3_yamnet_all(final_videos, threshold=YAMNET_THRESHOLD)


Stage3 YAMNet (CPU subprocess): 100%|██████████| 30/30 [00:00<00:00, 364.16it/s]

yamnet ok: 30 bad: 0



