In [2]:
# Run this cell. If torch is missing it'll print recommended install commands.
try:
    import torch
    print("Torch version:", torch.__version__)
    print("CUDA available:", torch.cuda.is_available())
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print("Detected device ->", device)
except Exception as e:
    print("PyTorch not found or failed to import.")
    print()
    print("If you have a CUDA GPU (example for CUDA 11.8):")
    print("  pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118")
    print()
    print("If you want CPU-only (slower):")
    print("  pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu")
    raise

Torch version: 2.8.0+cpu
CUDA available: False
Detected device -> cpu


In [3]:
# Make sure you have a .env file in your project root or set HUGGINGFACE_TOKEN env var.
import os
from dotenv import load_dotenv
load_dotenv()  # loads .env into environment if present

HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN") or os.getenv("HF_TOKEN") or os.getenv("HF_ACCESS_TOKEN")

if not HUGGINGFACE_TOKEN:
    raise RuntimeError(
        "HUGGINGFACE_TOKEN not found in env or .env. "
        "Create a token at https://huggingface.co/settings/tokens and place it in your .env as HUGGINGFACE_TOKEN=\"hf_xxx\""
    )

print("Found HuggingFace token (hidden).")


Found HuggingFace token (hidden).


In [13]:
import os, json, math
from pathlib import Path
from faster_whisper import WhisperModel
from pyannote.audio import Pipeline
from tqdm.auto import tqdm

# Paths & model settings
PROJECT_ROOT = Path.cwd()
EMMA_DIR = Path("D:/Calling agent/S3 bucket Extraction/EMMA_ES")
OUTPUT_DIR = Path("D:/Calling agent/S3 bucket Extraction/whisper_diarization_out")
OUTPUT_DIR.mkdir(exist_ok=True)

# Model choices (tweak if needed)
MODEL_SIZE = "small"          # "medium" is a good T4 balance; use "small" if you're low on VRAM or CPU-only
COMPUTE_TYPE = "int8"          # memory-friendly; fallback handled when loading
DEVICE = "cuda" if ( __import__("torch").cuda.is_available() ) else "cpu"

# File extensions to process
AUDIO_EXTS = {".wav", ".mp3", ".m4a", ".flac", ".ogg", ".webm", ".aac"}

# Helpers: time formatting for SRT
from datetime import timedelta
def sec_to_srt_time(t: float) -> str:
    if t is None:
        t = 0.0
    total_seconds = int(t)
    hours = total_seconds // 3600
    minutes = (total_seconds % 3600) // 60
    seconds = total_seconds % 60
    millis = int((t - total_seconds) * 1000)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{millis:03d}"

In [5]:
# Load Faster-Whisper model with a safe fallback if compute_type fails
print("Device:", DEVICE)
print("Loading Faster-Whisper model:", MODEL_SIZE, "compute_type:", COMPUTE_TYPE)

try:
    asr_model = WhisperModel(MODEL_SIZE, device=DEVICE, compute_type=COMPUTE_TYPE)
except Exception as e:
    print("Failed to load with compute_type =", COMPUTE_TYPE, "-> trying fallback compute_type='int8_float16' then 'float32'")
    try:
        asr_model = WhisperModel(MODEL_SIZE, device=DEVICE, compute_type="int8_float16")
    except Exception:
        asr_model = WhisperModel(MODEL_SIZE, device=DEVICE, compute_type="float32")

print("ASR model loaded.")

# Load pyannote diarization pipeline
print("Loading pyannote speaker-diarization pipeline (this may download model weights)...")
diar_pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=HUGGINGFACE_TOKEN
)

# Move pipeline to GPU if available
if DEVICE == "cuda":
    try:
        diar_pipeline = diar_pipeline.to("cuda")
        print("Pyannote pipeline moved to CUDA.")
    except Exception:
        print("Could not move pyannote to CUDA. Proceeding on CPU.")
else:
    print("Pyannote running on CPU (this is slower).")


Device: cpu
Loading Faster-Whisper model: small compute_type: int8


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


ASR model loaded.
Loading pyannote speaker-diarization pipeline (this may download model weights)...

Could not download 'pyannote/speaker-diarization-3.1' pipeline.
It might be because the pipeline is private or gated so make
sure to authenticate. Visit https://hf.co/settings/tokens to
create your access token and retry with:

   >>> Pipeline.from_pretrained('pyannote/speaker-diarization-3.1',
   ...                          use_auth_token=YOUR_AUTH_TOKEN)

If this still does not work, it might be because the pipeline is gated:
visit https://hf.co/pyannote/speaker-diarization-3.1 to accept the user conditions.
Pyannote running on CPU (this is slower).


In [6]:
def assign_speakers_to_asr(asr_segments, diarization_annotation):
    """
    asr_segments: list of dicts with keys 'start','end','text'
    diarization_annotation: pyannote.annotation.Annotation (returned by pipeline)
    returns merged list of dicts with 'start','end','speaker','text'
    """
    # Build diarization segments list [(start,end,speaker_label), ...]
    diar_segments = []
    for turn, _, label in diarization_annotation.itertracks(yield_label=True):
        diar_segments.append({"start": turn.start, "end": turn.end, "label": label})
    # Map raw labels to friendly names (Speaker 1, Speaker 2, ...)
    unique_labels = sorted({d["label"] for d in diar_segments})
    label_map = {lab: f"Speaker {i+1}" for i, lab in enumerate(unique_labels)}

    merged = []
    for s in asr_segments:
        s_start, s_end = s["start"], s["end"]
        best_label = None
        best_overlap = 0.0
        # score by overlap duration
        for d in diar_segments:
            overlap = max(0.0, min(s_end, d["end"]) - max(s_start, d["start"]))
            if overlap > best_overlap:
                best_overlap = overlap
                best_label = d["label"]

        # fallback: if no overlap, pick diar segment containing midpoint
        if best_overlap == 0.0:
            midpoint = (s_start + s_end) / 2.0
            for d in diar_segments:
                if d["start"] <= midpoint < d["end"]:
                    best_label = d["label"]
                    break

        speaker = label_map.get(best_label, "Unknown")
        merged.append({
            "start": s_start,
            "end": s_end,
            "speaker": speaker,
            "text": s["text"]
        })
    return merged


In [7]:
def save_json(merged_segments, out_path):
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(merged_segments, f, ensure_ascii=False, indent=2)

def save_srt(merged_segments, out_path):
    lines = []
    for i, seg in enumerate(merged_segments, start=1):
        start_s = sec_to_srt_time(seg["start"])
        end_s = sec_to_srt_time(seg["end"])
        text = f"{seg['speaker']}: {seg['text']}"
        lines.append(str(i))
        lines.append(f"{start_s} --> {end_s}")
        lines.append(text)
        lines.append("")  # blank line
    with open(out_path, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))


In [14]:
from pathlib import Path
from tqdm.auto import tqdm

if not EMMA_DIR.exists():
    raise RuntimeError(f"EMMA_ES folder not found at {EMMA_DIR}. Update EMMA_DIR path and re-run.")

audio_files = sorted([p for p in EMMA_DIR.iterdir() if p.suffix.lower() in AUDIO_EXTS])
print("Files to process:", len(audio_files))

for audio_path in tqdm(audio_files, desc="Files"):
    print("\nProcessing:", audio_path.name)
    # 1) ASR (Faster-Whisper)
    asr_segments = []
    segments, info = asr_model.transcribe(str(audio_path), beam_size=5, vad_filter=True)  # adjust beam_size as needed
    for seg in segments:
        asr_segments.append({
            "start": seg.start,
            "end": seg.end,
            "text": seg.text.strip()
        })

    # 2) Diarization (pyannote)
    diar = diar_pipeline(str(audio_path))

    # 3) Merge
    merged = assign_speakers_to_asr(asr_segments, diar)

    # 4) Save
    base = OUTPUT_DIR / audio_path.stem
    save_json(merged, str(base.with_suffix(".json")))
    save_srt(merged, str(base.with_suffix(".srt")))
    print("Saved:", base.with_suffix(".json").name, "and", base.with_suffix(".srt").name)

Files to process: 16


Files:   0%|          | 0/16 [00:00<?, ?it/s]


Processing: 2020_12_21_LUFR_529988843072___EMMA_ES__.mp3


Files:   0%|          | 0/16 [00:11<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# Print a quick summary of all outputs
outs = sorted(OUTPUT_DIR.glob("*"))
print("Output files in", OUTPUT_DIR)
for f in outs:
    print("-", f.name)

In [19]:
%pip install -U openai-whisper

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip
