In [1]:
# 🧱 BLOCK 1: Setup + Google Drive Connection
# (Run this block first in Google Colab before doing anything else!)

# ✅ Install dependencies
# Includes yt-dlp (for YouTube downloads), pydub (for audio handling), ffmpeg (for format conversion), and pyannote.audio (for speaker diarization)
# pyannote is optional but enables separating speakers — it takes ~2-3 mins to install in Colab
!pip install -q yt-dlp pydub pyannote.audio
!apt-get -qq install ffmpeg

# ✅ Mount Google Drive for saving outputs
from google.colab import drive
from pathlib import Path
import os

print("📂 Mounting Google Drive...")
drive.mount('/content/drive')

# Every later block writes its audio files into this Drive folder, so create
# it (and any missing parents) up front.
drive_export_path = Path("/content/drive/MyDrive/YouTubeAudio")
drive_export_path.mkdir(parents=True, exist_ok=True)

# Report whether the export folder is actually reachable after the mount.
if not drive_export_path.exists():
    print("❌ ERROR: Failed to access or create the export folder in Google Drive.")
else:
    print("✅ Google Drive mounted successfully!")
    print(f"📁 Folder for audio output is ready: {drive_export_path}\n")


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m172.2/172.2 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.0/117.0 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.6/59.6 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m82.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m898.7/898.7 kB[0m [31m45.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m818.9/818.9 kB[0m [31m45.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.5/79.5 kB[0m [31m6.7 MB/s[0m eta [36m

In [2]:
# 🧱 BLOCK 2: Audio Processing + Diarization + Upload
# (Run this block AFTER Block 1 has completed successfully!)

import shutil
import urllib.parse
import subprocess
import time
from pydub import AudioSegment
from contextlib import contextmanager
import signal
from pathlib import Path
import os

# ✅ Load Hugging Face Token Securely from User Input
# getpass keeps the token out of the saved notebook output. Whitespace picked
# up while pasting is stripped so the value stored in the environment is the
# token itself; an empty entry is reported immediately instead of surfacing
# later as an opaque authentication error.
from getpass import getpass
HF_TOKEN = getpass("🔐 Enter your Hugging Face token (read access): ").strip()
if not HF_TOKEN:
    print("⚠️ No Hugging Face token was entered. Steps that require authentication will fail.")
os.environ["HF_TOKEN"] = HF_TOKEN

# ✅ Import diarization pipeline with token
# Diarization is optional: if pyannote.audio cannot be imported or the model
# cannot be loaded, `diarization_pipeline` is set to None rather than raising.
try:
    from pyannote.audio import Pipeline
    diarization_pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=os.environ["HF_TOKEN"])
except Exception as e:
    print(f"⚠️ pyannote.audio not available or failed to initialize: {e}. Diarization disabled.")
    diarization_pipeline = None
else:
    # from_pretrained can hand back None without raising (e.g. when the model
    # could not be downloaded), so make that case just as visible.
    if diarization_pipeline is None:
        print("⚠️ pyannote.audio returned no pipeline (check your token and model access). Diarization disabled.")

# ⏱️ Timeout handler to prevent hangs
@contextmanager
def time_limit(seconds):
    """Abort the wrapped block with TimeoutError after `seconds` seconds.

    Implemented with SIGALRM / signal.alarm, which the Python `signal` module
    only provides on Unix and only allows to be configured from the main
    thread.

    Args:
        seconds: Whole number of seconds to allow (passed to signal.alarm).

    Raises:
        TimeoutError: Inside the `with` body once the alarm fires.
    """
    def signal_handler(signum, frame):
        raise TimeoutError("⏱️ Process exceeded maximum allowed time.")

    # Remember whatever handler was installed before so it can be put back;
    # otherwise this handler would stay active for the rest of the session.
    previous_handler = signal.signal(signal.SIGALRM, signal_handler)
    try:
        signal.alarm(seconds)
        yield
    finally:
        # Cancel any pending alarm first, then restore the old handler.
        signal.alarm(0)
        # signal.signal() returns None for handlers not installed from Python;
        # those cannot be re-installed, so leave ours in place in that case.
        if previous_handler is not None:
            signal.signal(signal.SIGALRM, previous_handler)


🔐 Enter your Hugging Face token (read access): ··········


config.yaml:   0%|          | 0.00/500 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

config.yaml:   0%|          | 0.00/318 [00:00<?, ?B/s]

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.1. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/torch/pyannote/models--pyannote--segmentation/snapshots/c4c8ceafcbb3a7a280c2d357aee9fbc9b0be7f9b/pytorch_model.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.6.0+cu124. Bad things might happen unless you revert torch to 1.x.


hyperparams.yaml:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

  wrapped_fwd = torch.cuda.amp.custom_fwd(fwd, cast_inputs=cast_inputs)


embedding_model.ckpt:   0%|          | 0.00/83.3M [00:00<?, ?B/s]

mean_var_norm_emb.ckpt:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

classifier.ckpt:   0%|          | 0.00/5.53M [00:00<?, ?B/s]

label_encoder.txt:   0%|          | 0.00/129k [00:00<?, ?B/s]

In [3]:
# 🧱 BLOCK 3: Pipeline Initialization + Runtime Prep
# Sets up diarization and overlapping speech detection models
# and defines all necessary helper functions for later processing.

# ✅ Import everything we need once
import os
import wave
import torch
import shutil
import signal
import tempfile
import subprocess
import torchaudio
from pydub import AudioSegment, effects
from huggingface_hub import login
from pyannote.audio import Pipeline
from pyannote.audio.pipelines import OverlappedSpeechDetection
from pyannote.core import Segment
from google.colab import drive
from pathlib import Path

# 🔐 Hugging Face login (reads token from Block 2)
# Block 2 stores the token in the HF_TOKEN environment variable. Fail with an
# actionable message instead of a bare KeyError when that block was skipped
# or the token was left empty.
hf_token = os.environ.get("HF_TOKEN", "").strip()
if not hf_token:
    raise RuntimeError("❌ HF_TOKEN is not set. Run Block 2 and enter your Hugging Face token first.")
login(token=hf_token, add_to_git_credential=False)

# ✅ Set up diarization pipeline
# On any failure `diarization_pipeline` is set to None instead of raising.
try:
    diarization_pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization",
        use_auth_token=os.environ["HF_TOKEN"]
    )
except Exception as e:
    print(f"⚠️ pyannote.audio diarization pipeline could not be initialized: {e}")
    diarization_pipeline = None
else:
    # from_pretrained can hand back None without raising (e.g. when the model
    # could not be downloaded), so report that case explicitly as well.
    if diarization_pipeline is None:
        print("⚠️ pyannote.audio diarization pipeline could not be initialized: no pipeline was returned (check your token and model access)")

# ✅ Set up overlapping speech detection (OSD) pipeline
# Builds an OverlappedSpeechDetection pipeline on top of the
# "pyannote/segmentation" model. Any exception raised while building or
# instantiating it is printed and leaves `osd_pipeline` set to None.
# NOTE(review): instantiate({}) passes no hyper-parameters — confirm the
# pipeline ends up with the intended thresholds when it is applied.
try:
    osd_pipeline = OverlappedSpeechDetection(segmentation="pyannote/segmentation")
    osd_pipeline.instantiate({})
except Exception as e:
    print(f"⚠️ OSD pipeline could not be initialized: {e}")
    osd_pipeline = None

# 📁 Google Drive export folder (already created in Block 1)
# Same location as `drive_export_path` in Block 1, kept here as a plain
# string; chunk_and_export() joins its output file names onto this folder.
export_folder = "/content/drive/MyDrive/YouTubeAudio"

# 🧠 Utility: Get YouTube video title
def get_youtube_title(url):
    """Return a filename-friendly version of the video title, or None.

    Runs `yt-dlp --get-title` for `url`, replaces spaces and slashes with
    underscores and truncates the result to 100 characters.

    Args:
        url: Video URL passed straight to yt-dlp.

    Returns:
        The sanitized title, or None when the title could not be obtained
        (yt-dlp missing, yt-dlp exiting with an error, or an empty title).
    """
    try:
        raw_title = subprocess.check_output(["yt-dlp", "--get-title", url], stderr=subprocess.DEVNULL)
    except (subprocess.CalledProcessError, OSError):
        # OSError covers the yt-dlp executable being absent or not runnable,
        # which would otherwise escape as an uncaught FileNotFoundError.
        print(f"❌ Failed to get title for {url}")
        return None

    # Never let an undecodable byte in the title abort the whole run.
    title = raw_title.decode("utf-8", errors="replace").strip()
    if not title:
        print(f"❌ Failed to get title for {url}")
        return None
    return title.replace(" ", "_").replace("/", "_")[:100]

# 🧠 Utility: Download best audio stream from YouTube
def download_audio(url, output_path):
    """Download the best WebM audio stream of `url` to `output_path` via yt-dlp.

    Any file already present at `output_path` is deleted first, so the
    existence check after the download can only be satisfied by a file that
    this call produced.

    Args:
        url: Video URL passed to yt-dlp.
        output_path: Destination file path (yt-dlp `-o` template).

    Raises:
        RuntimeError: If yt-dlp cannot be started, exits with a non-zero
            status, or does not produce `output_path`.
    """
    try:
        # A leftover file from an earlier run must not be mistaken for a
        # successful download of this URL.
        if os.path.exists(output_path):
            os.remove(output_path)

        completed = subprocess.run(
            ["yt-dlp", "-f", "bestaudio[ext=webm]", "-o", output_path, url],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        if completed.returncode != 0:
            raise RuntimeError(f"yt-dlp exited with status {completed.returncode}")
        if not os.path.exists(output_path):
            raise FileNotFoundError("Audio download failed")
    except Exception as e:
        raise RuntimeError(f"yt-dlp download failed: {e}") from e

# 🎧 Convert audio to WAV (48kHz, mono)
def convert_audio(in_path, out_path):
    """Convert `in_path` to a 48 kHz mono WAV at `out_path` using ffmpeg.

    An existing `out_path` is overwritten (`-y`).

    Args:
        in_path: Source audio file.
        out_path: Destination WAV file.

    Raises:
        RuntimeError: If ffmpeg cannot be started or exits with a non-zero
            status, instead of silently leaving no (or an old) output file.
    """
    command = ["ffmpeg", "-y", "-i", in_path, "-ar", "48000", "-ac", "1", out_path]
    try:
        completed = subprocess.run(command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    except OSError as e:
        raise RuntimeError(f"ffmpeg could not be started: {e}") from e
    if completed.returncode != 0:
        raise RuntimeError(f"ffmpeg failed to convert {in_path} (exit status {completed.returncode})")

# ✨ Trim silence + normalize
def trim_and_normalize(path):
    """Strip silence from the WAV at `path`, normalize it, and save it in place.

    Silence is removed with a 400 ms minimum length and a -40 threshold; the
    result is normalized and written back over the same file as WAV.
    """
    cleaned = effects.normalize(
        effects.strip_silence(
            AudioSegment.from_wav(path),
            silence_len=400,
            silence_thresh=-40,
        )
    )
    cleaned.export(path, format="wav")

# 📤 Chunk WAV into 10MB segments, tagged with speaker if given
def chunk_and_export(path, speaker_label=None):
    """Split the WAV at `path` into chunks of at most ~10MB and export them.

    Chunks are written to `export_folder` as `<stem>_chunk<N>.wav`, where
    `<stem>` is the input file name without extension and N starts at 1.
    When `speaker_label` is given it is prepended as `<speaker_label>_`.

    Args:
        path: Path of the WAV file to split.
        speaker_label: Optional label used as a file-name prefix.
    """
    file_base = Path(path).stem
    audio = AudioSegment.from_wav(path)

    # A zero-length file has no bytes-per-millisecond rate; without this
    # guard the division below raises ZeroDivisionError.
    if len(audio) == 0:
        print(f"⚠️ Skipping empty audio file: {path}")
        return

    max_chunk_size = 10 * 1024 * 1024  # 10MB
    bytes_per_ms = len(audio.raw_data) / len(audio)
    # Chunk length in whole milliseconds, computed once; never below 1 so the
    # range() step stays valid.
    chunk_ms = max(1, int(max_chunk_size / bytes_per_ms))

    for chunk_number, start_ms in enumerate(range(0, len(audio), chunk_ms), start=1):
        chunk = audio[start_ms:start_ms + chunk_ms]
        out_name = f"{file_base}_chunk{chunk_number}.wav"
        if speaker_label:
            out_name = f"{speaker_label}_{out_name}"
        out_path = os.path.join(export_folder, out_name)
        chunk.export(out_path, format="wav")
        print(f"📤 Uploaded chunk to Google Drive: {out_path}")


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.
INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.1. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/torch/pyannote/models--pyannote--segmentation/snapshots/c4c8ceafcbb3a7a280c2d357aee9fbc9b0be7f9b/pytorch_model.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.6.0+cu124. Bad things might happen unless you revert torch to 1.x.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.1. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/torch/pyannote/models--pyannote--segmentation/snapshots/660b9e20307a2b0cdb400d0f80aadc04a701fc54/pytorch_model.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.6.0+cu124. Bad things might happen unless you revert torch to 1.x.


In [None]:
# 🧱 BLOCK 4: Main Engine — Download, Diarize, and Export
# This block processes YouTube URLs end-to-end and saves diarized speaker chunks to Google Drive.

import os
import tempfile
import subprocess
from pathlib import Path
from pyannote.core import Segment

# 🎯 CONFIG: Middle segment to extract (in seconds)
# Length of the window, centered on the midpoint of the processed audio, from
# which speaker turns are exported by the loop below.
MIDDLE_SEGMENT_DURATION = 180  # 3 minutes

# 🎯 URL(s) to process
# Each entry is handled independently, in list order, by the loop below.
YOUTUBE_URLS = [
    "https://www.youtube.com/watch?v=dj2gAjNU1ro",  # 👈👈👈 REPLACE OR ADD LINKS HERE
]

# Speaker turns shorter than this many seconds are discarded.
MIN_TURN_DURATION = 3.0

# For every URL: download → convert → clean → diarize → export the clean
# single-speaker turns that fall inside the middle window of the audio.
# A failure in any stage is reported and only affects that URL.
for url_index, url in enumerate(YOUTUBE_URLS):
    print(f"\n📥 Processing URL {url_index+1}/{len(YOUTUBE_URLS)}")
    title = get_youtube_title(url)
    if not title:
        continue

    webm_path = f"audio_{url_index}.webm"
    wav_path = f"audio_{url_index}.wav"

    # The working file names only depend on the list position, so files left
    # over from an earlier run (possibly of a different video) must not be
    # picked up again by the steps below.
    for stale_path in (webm_path, wav_path):
        if os.path.exists(stale_path):
            os.remove(stale_path)

    print(f"📥 Downloading best audio from: {url}")
    try:
        download_audio(url, webm_path)
        print(f"✅ Audio downloaded: {webm_path}")
    except Exception as e:
        print(f"❌ Failed to download audio from: {url} — {e}")
        continue

    try:
        print(f"🎧 Converting {webm_path} to 48000Hz WAV")
        convert_audio(webm_path, wav_path)
        print("✨ Trimming silence and normalizing volume")
        trim_and_normalize(wav_path)
    except Exception as e:
        print(f"❌ Failed to convert or clean audio for: {url} — {e}")
        continue

    print("🧠 Running diarization and saving speaker-separated files...")
    clean_turns = None  # stays None when diarization is unavailable or fails
    try:
        if diarization_pipeline is None or osd_pipeline is None:
            raise RuntimeError("diarization or overlap-detection pipeline is not initialized")

        diarization = diarization_pipeline(wav_path)
        overlap_regions = list(osd_pipeline(wav_path).get_timeline())

        # 🧠 Keep only turns that are long enough and free of overlapped speech.
        # Segment.intersects() is the segment-vs-segment test; Segment.overlaps()
        # expects a single time in seconds, not another Segment.
        clean_turns = [
            (turn, speaker)
            for turn, _, speaker in diarization.itertracks(yield_label=True)
            if turn.duration > MIN_TURN_DURATION
            and not any(turn.intersects(region) for region in overlap_regions)
        ]
    except Exception as e:
        print(f"⚠️ Diarization failed: {e}. Proceeding with full audio.")

    # Export is handled separately from diarization so that a write error is
    # reported as such and does not trigger a second, full-audio export.
    try:
        if clean_turns is None:
            chunk_and_export(wav_path)
            continue

        # 🧠 Restrict the export to the middle MIDDLE_SEGMENT_DURATION seconds
        audio = AudioSegment.from_wav(wav_path)
        total_duration = len(audio) / 1000  # in seconds
        middle_start = max(0, (total_duration - MIDDLE_SEGMENT_DURATION) / 2)
        middle_end = middle_start + MIDDLE_SEGMENT_DURATION

        # chunk_and_export() names its output after the input file, so give
        # each segment a name built from the video title and a running number
        # (title shortened to keep file names at a manageable length).
        file_prefix = title[:60]
        exported_segments = 0

        # The temporary directory is removed even if an export fails midway.
        with tempfile.TemporaryDirectory() as segment_dir:
            for turn, speaker in clean_turns:
                # Clip the turn to the middle window
                start_ms = int(max(turn.start, middle_start) * 1000)
                end_ms = int(min(turn.end, middle_end) * 1000)
                if end_ms <= start_ms:
                    # Turn lies outside the window (or only touches its edge).
                    continue

                exported_segments += 1
                segment_path = os.path.join(
                    segment_dir, f"{file_prefix}_seg{exported_segments:03d}.wav"
                )
                audio[start_ms:end_ms].export(segment_path, format="wav")
                chunk_and_export(segment_path, speaker)

        if exported_segments == 0:
            print("⚠️ No clean speaker segments found in the selected window; nothing was exported.")
    except Exception as e:
        print(f"❌ Failed to export audio for: {url} — {e}")

print("✅ All YouTube URLs have been processed!")



📥 Processing URL 1/1
📥 Downloading best audio from: https://www.youtube.com/watch?v=dj2gAjNU1ro
✅ Audio downloaded: audio_0.webm
🎧 Converting audio_0.webm to 48000Hz WAV
✨ Trimming silence and normalizing volume
🧠 Running diarization and saving speaker-separated files...
