# **Voice Activity Detection (VAD) with Whisper Transcription**
This notebook detects speech segments with pyannote.audio voice activity detection, transcribes each segment with Whisper (via faster-whisper), and allows chunk-level re-transcription.

**You can access TEST-1.mp3 audio here.**

---

[Audio Link](https://drive.google.com/file/d/1ODHCbW7LjuTUxd0njylHHHObwQxQVK6D/view?usp=drive_link)

# **INSTALLING MODULES**


In [1]:
!pip install pyannote.audio
!pip install faster-whisper
!pip install torchaudio
!pip install ffmpeg-python pydub

Collecting pyannote.audio
  Downloading pyannote.audio-3.3.2-py2.py3-none-any.whl.metadata (11 kB)
Collecting asteroid-filterbanks>=0.4 (from pyannote.audio)
  Downloading asteroid_filterbanks-0.4.0-py3-none-any.whl.metadata (3.3 kB)
Collecting lightning>=2.0.1 (from pyannote.audio)
  Downloading lightning-2.5.1.post0-py3-none-any.whl.metadata (39 kB)
Collecting pyannote.core>=5.0.0 (from pyannote.audio)
  Downloading pyannote.core-5.0.0-py3-none-any.whl.metadata (1.4 kB)
Collecting pyannote.database>=5.0.1 (from pyannote.audio)
  Downloading pyannote.database-5.1.3-py3-none-any.whl.metadata (1.1 kB)
Collecting pyannote.metrics>=3.2 (from pyannote.audio)
  Downloading pyannote.metrics-3.2.1-py3-none-any.whl.metadata (1.3 kB)
Collecting pyannote.pipeline>=3.0.1 (from pyannote.audio)
  Downloading pyannote.pipeline-3.0.1-py3-none-any.whl.metadata (897 bytes)
Collecting pytorch-metric-learning>=2.1.0 (from pyannote.audio)
  Downloading pytorch_metric_learning-2.8.1-py3-none-any.whl.metada

# **IMPORT NECESSARY LIBRARIES**


In [6]:
import datetime
import getpass
import json
import os
import subprocess
from datetime import timedelta

import torch
import torchaudio
from faster_whisper import WhisperModel
from google.colab import files
from pyannote.audio import Pipeline
from pydub import AudioSegment

# **Upload file and .mp3 to .wav conversion**

*   Sample Rate - 16 kHz
*   Channel - 1 (mono)
*   Audio Codec - pcm_s16le

In [8]:
# Upload an audio file from the local machine (Colab file picker).
uploaded = files.upload()  # User selects a local file (e.g., .mp3)
if not uploaded:
    # A cancelled upload returns an empty mapping; fail with a clear message
    # instead of an IndexError on the lookup below.
    raise RuntimeError("No file was uploaded; re-run this cell and select an audio file.")

# Derive the extension and the target WAV path from the uploaded file name.
audio_path = list(uploaded.keys())[0]
base_name, dot_ext = os.path.splitext(audio_path)
file_ext = dot_ext.lstrip('.').lower()
wav_path = base_name + ".wav"

# Convert only if input is MP3
if file_ext == "mp3":
    print(f"Converting '{audio_path}' to WAV format...")
    # Pass the arguments as a list so the uploaded file name is never parsed by
    # a shell. -y overwrites a stale WAV when the cell is re-run, and -nostdin
    # stops ffmpeg from waiting on interactive input.
    ffmpeg_cmd = [
        "ffmpeg", "-nostdin", "-y",
        "-i", audio_path,
        "-ar", "16000",        # 16 kHz sample rate
        "-ac", "1",            # mono
        "-c:a", "pcm_s16le",   # 16-bit PCM
        wav_path,
    ]
    ffmpeg_run = subprocess.run(ffmpeg_cmd, capture_output=True, text=True)
    if ffmpeg_run.returncode != 0:
        # Surface ffmpeg's own diagnostics rather than reporting success.
        raise RuntimeError(
            f"ffmpeg failed to convert '{audio_path}' "
            f"(exit code {ffmpeg_run.returncode}):\n{ffmpeg_run.stderr}"
        )
    print(f"Conversion done: {wav_path}")
else:
    print("Uploaded file is not an MP3. Skipping conversion.")
    # No converted file exists, so point downstream cells at the upload itself
    # instead of a WAV path that was never created.
    wav_path = audio_path

Saving TEST-1.mp3 to TEST-1.mp3
Converting 'TEST-1.mp3' to WAV format...
ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable

# **Detecting Speech segments & saving audio chunks using pyannote.audio**

In [None]:
# Take the Hugging Face access token from the environment instead of hardcoding
# it in the notebook; prompt for it (without echoing) when it is not set.
hf_token = os.environ.get("HF_TOKEN") or getpass.getpass("Hugging Face access token: ")

pipeline = Pipeline.from_pretrained("pyannote/voice-activity-detection",
                                    use_auth_token=hf_token)
if pipeline is None:
    # NOTE(review): pyannote.audio 3.x appears to return None (after printing a
    # hint) when a gated model cannot be downloaded - confirm for your version.
    raise RuntimeError(
        "Could not load 'pyannote/voice-activity-detection'. Check that the "
        "token is valid and that the model's user conditions were accepted."
    )

# Run voice activity detection on the prepared audio file.
vad_result = pipeline(wav_path)

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.1.3 to v2.5.1.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/torch/pyannote/models--pyannote--segmentation/snapshots/059e96f964841d40f1a5e755bb7223f76666bba4/pytorch_model.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.7.1, yours is 2.6.0+cu124. Bad things might happen unless you revert torch to 1.x.


In [10]:
# Print the detected speech regions in text form (start --> end, label).
print(vad_result)

[ 00:00:00.030 -->  00:00:03.794] A SPEECH
[ 00:00:03.996 -->  00:00:26.777] B SPEECH
[ 00:00:26.929 -->  00:00:30.760] C SPEECH
[ 00:00:31.283 -->  00:00:33.224] D SPEECH
[ 00:00:33.342 -->  00:00:51.904] E SPEECH
[ 00:00:52.495 -->  00:00:54.132] F SPEECH
[ 00:00:54.368 -->  00:00:56.579] G SPEECH
[ 00:00:56.697 -->  00:01:21.199] H SPEECH
[ 00:01:21.638 -->  00:02:00.974] I SPEECH


In [11]:
# Cut the chunks from the same file the VAD pipeline analysed (wav_path), so
# the detected timestamps and the sliced audio refer to the same signal.
audio = AudioSegment.from_file(wav_path)
chunks = []

for i, speech_turn in enumerate(vad_result.get_timeline()):
    # Segment boundaries are in seconds; AudioSegment slicing is in milliseconds.
    start_ms = int(speech_turn.start * 1000)
    end_ms = int(speech_turn.end * 1000)
    chunk = audio[start_ms:end_ms]
    chunk_path = f"chunk_{i}.wav"
    chunk.export(chunk_path, format="wav")
    # Keep the unrounded start/end (seconds) next to the exported file path.
    chunks.append((chunk_path, speech_turn.start, speech_turn.end))


In [12]:
# Inspect the (chunk_path, start_seconds, end_seconds) tuples collected so far.
chunks

[('chunk_0.wav', 0.03096875, 3.79409375),
 ('chunk_1.wav', 3.99659375, 26.777843750000002),
 ('chunk_2.wav', 26.929718750000003, 30.76034375),
 ('chunk_3.wav', 31.28346875, 33.22409375),
 ('chunk_4.wav', 33.34221875, 51.90471875),
 ('chunk_5.wav', 52.49534375, 54.13221875),
 ('chunk_6.wav', 54.368468750000005, 56.579093750000006),
 ('chunk_7.wav', 56.697218750000005, 81.19971875),
 ('chunk_8.wav', 81.63846875, 120.97409375000001)]

# **Transcribing Audio Chunks with Whisper and Timestamp Formatting**

In [None]:
# Reuse a token that is already exported in the environment instead of
# overwriting it; otherwise fall back to the "HF_TOKEN" placeholder, which must
# be replaced locally (never commit a real token to the notebook).
HF_TOKEN = os.environ.get("HF_TOKEN", "HF_TOKEN")

os.environ["HF_TOKEN"] = HF_TOKEN

In [31]:
def convert_time(secs):
    """Format a number of seconds as the string form of a ``datetime.timedelta``.

    The value is rounded to whole seconds with the built-in ``round`` before
    formatting, so sub-second precision is dropped (e.g. ``3.794`` -> ``"0:00:04"``).
    """
    whole_seconds = round(secs)
    span = datetime.timedelta(seconds=whole_seconds)
    return str(span)

def transcribe_chunks_with_timestamps(chunks, whisper_model="base", compute_type="float16"):
    """Transcribe every audio chunk and attach formatted timing information.

    Args:
        chunks: Iterable of ``(path, start, end)`` tuples, where ``start`` and
            ``end`` are offsets in seconds.
        whisper_model: Model identifier handed to ``WhisperModel``.
        compute_type: Compute type handed to ``WhisperModel``.

    Returns:
        A list with one dict per chunk, in input order, with the keys
        ``"start"``, ``"end"`` and ``"duration"`` (whole-second strings produced
        by ``convert_time``) and ``"text"`` (the chunk's segment texts, stripped
        and joined with single spaces).
    """
    # One model instance is shared by all chunks of this call.
    model = WhisperModel(whisper_model, compute_type=compute_type)

    def _transcribe_one(path, start, end):
        # Build the record for a single chunk.
        segments, _ = model.transcribe(path)
        pieces = [segment.text.strip() for segment in segments]
        return {
            "start": convert_time(start),
            "end": convert_time(end),
            "duration": convert_time(end - start),
            "text": " ".join(pieces),
        }

    return [_transcribe_one(path, start, end) for path, start, end in chunks]


In [32]:
# Transcribe every exported chunk with the "base" Whisper model in float16.
transcripts = transcribe_chunks_with_timestamps(
    chunks,
    whisper_model="base",
    compute_type="float16",
)

# **Saving Transcription Results to Text File & metadata to JSON**

In [24]:
# Write the transcript as a fixed-width text table and offer it for download.

# Two header rows: column titles, then an 80-character rule.
header_lines = [
    f"{'START':<10} {'END':<10} Text\n",
    "=" * 80 + "\n",
]

with open("transcript.txt", "w", encoding="utf-8") as f:
    f.writelines(header_lines)
    # One row per segment, followed by a blank line for spacing.
    f.writelines(
        f"{segment['start']:<10} {segment['end']:<10} {segment['text']}\n\n"
        for segment in transcripts
    )

files.download("transcript.txt")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [33]:
# Metadata to JSON

# Write UTF-8 explicitly (matching transcript.txt) and keep non-ASCII
# characters as-is rather than \uXXXX escapes, so transcripts in any language
# stay readable; the parsed JSON content is unchanged.
with open("transcript.json", "w", encoding="utf-8") as f:
    json.dump(transcripts, f, indent=4, ensure_ascii=False)

files.download("transcript.json")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>