In [None]:
# Here we check whether a voice can be detected at a given instant of the song
import contextlib
import os
import shutil
import wave

import webrtcvad
from pydub import AudioSegment
# Use whichever ffmpeg is on PATH; if none is found, fall back to the original
# hard-coded Homebrew location (this fallback will be different for you).
AudioSegment.converter = shutil.which("ffmpeg") or "/opt/homebrew/bin/ffmpeg"

In [42]:
# A non-empty string is not enough: make sure the configured converter actually
# resolves to an executable. shutil.which accepts both bare command names
# (looked up on PATH) and explicit paths.
if not AudioSegment.converter or shutil.which(AudioSegment.converter) is None:
    raise EnvironmentError("ffmpeg not found")

In [43]:
# Directory of the original WAV files and directory for the converted copies
base_path = "../Music/wav_files"
output_path = "../Music/cleaned_wav_files"

# A single track name and its full path inside base_path
song = "stargazing-kygo.wav"
song_path = os.path.join(base_path, song)

In [44]:
# we need to clean the wav files first
def convert_to_vad_format(base_input_path, base_output_path):
    """Convert every ``.wav`` file in a directory to mono, 16 kHz, 16-bit PCM.

    Each matching file in ``base_input_path`` is loaded, reduced to one
    channel, resampled to 16000 Hz and exported with the ``pcm_s16le`` codec
    under the same filename in ``base_output_path`` (created if missing).
    Prints "done" once all files have been processed.

    Args:
        base_input_path: Directory holding the source ``.wav`` files.
        base_output_path: Directory that receives the converted files.
    """
    os.makedirs(base_output_path, exist_ok=True)
    for filename in os.listdir(base_input_path):
        if not filename.endswith('.wav'):
            continue

        source_file = os.path.join(base_input_path, filename)
        target_file = os.path.join(base_output_path, filename)

        sound = AudioSegment.from_file(source_file)
        sound = sound.set_channels(1)
        sound = sound.set_frame_rate(16000)
        # export() hands back the open output file; close it right away so a
        # large batch does not keep one file handle open per converted song.
        sound.export(target_file, format="wav", codec="pcm_s16le").close()

    print("done")

In [45]:
# Convert every .wav file found in base_path and write the results to output_path
convert_to_vad_format(base_path, output_path)

done


In [None]:
# the current music we have is not compatible with webrtcvad
def check_wav_format(path):
    """Print a WAV file's basic properties and say whether WebRTC VAD can use it.

    A file counts as compatible when it is mono, has 2-byte (16-bit) samples
    and a sample rate of 8000, 16000, 32000 or 48000 Hz.

    Args:
        path: Location of the WAV file to inspect.

    Returns:
        True if the file meets all three requirements, otherwise False.
    """
    supported_rates = (8000, 16000, 32000, 48000)

    # Only the header values are needed, so grab them and close the file.
    with wave.open(path, 'rb') as reader:
        params = reader.getparams()

    length_sec = params.nframes / float(params.framerate)

    report = (
        f"Channels: {params.nchannels}",
        f"Sample width: {params.sampwidth * 8} bits",
        f"Sample rate: {params.framerate} Hz",
        f"Duration: {length_sec:.2f} sec",
    )
    for line in report:
        print(line)

    if (
        params.nchannels != 1
        or params.sampwidth != 2
        or params.framerate not in supported_rates
    ):
        print("Not compatible. You may need to convert it.")
        return False

    print("This WAV file is compatible with WebRTC VAD.")
    return True


In [46]:
# Sanity-check one converted song. The path is built from the configured output
# folder and track name rather than repeated by hand, so it follows the config.
results = check_wav_format(os.path.join(output_path, song))
results

Channels: 1
Sample width: 16 bits
Sample rate: 16000 Hz
Duration: 236.85 sec
This WAV file is compatible with WebRTC VAD.


True

In [68]:
# now we can attempt to check time segments and see if a voice is present
def is_speech_at_timestamp(file, timestamp, frame_duration_ms=30, aggressiveness=3):
    """Report whether WebRTC VAD classifies the frame containing ``timestamp`` as speech.

    The file is treated as back-to-back frames of ``frame_duration_ms`` each.
    The frame that contains ``timestamp`` is read and handed to the detector,
    so the analysed window starts at the frame boundary at or before
    ``timestamp``.

    Args:
        file: Path to a mono, 16-bit PCM WAV file sampled at 8, 16, 32 or 48 kHz.
        timestamp: Position in the audio, in seconds.
        frame_duration_ms: Frame length in milliseconds (webrtcvad documents
            10, 20 and 30 ms as the accepted frame lengths).
        aggressiveness: Mode passed straight to ``webrtcvad.Vad``.

    Returns:
        The value returned by ``Vad.is_speech`` for that frame.

    Raises:
        AssertionError: If the file is not mono, not 16-bit, or uses an
            unsupported sample rate.
        ValueError: If less than one full frame of audio remains at the
            requested position.
    """
    vad = webrtcvad.Vad(aggressiveness)

    with wave.open(file, 'rb') as wf:
        sample_rate = wf.getframerate()
        channels = wf.getnchannels()
        width = wf.getsampwidth()

        assert channels == 1, "Audio must be mono"
        assert width == 2, "Must be 16-bit PCM"
        assert sample_rate in (8000, 16000, 32000, 48000), "Unsupported sample rate"

        # Samples per frame, computed once with integer arithmetic so float
        # rounding can never shave a sample off the frame length.
        samples_per_frame = int(sample_rate * frame_duration_ms // 1000)
        frame_bytes = samples_per_frame * width

        # Index of the fixed-size frame that contains the requested timestamp.
        frame_index = int(timestamp * 1000 // frame_duration_ms)

        wf.setpos(frame_index * samples_per_frame)
        frame = wf.readframes(samples_per_frame)

        # readframes() returns fewer bytes when the file ends inside the frame.
        if len(frame) < frame_bytes:
            raise ValueError("Timestamp is near the end of the file.")

        return vad.is_speech(frame, sample_rate)

In [None]:
# Spot-check a single timestamp (in seconds) in another converted song
is_speech_at_timestamp('../Music/cleaned_wav_files/wakemeup-avicii.wav', 3.22)


# NOTE(review): the result below is unreliable ("complete garbage") — presumably
# WebRTC VAD, which targets spoken voice, also fires on music; TODO confirm.

True