In [15]:
import os
from autovc.speaker_encoder.utils import *
import soundfile as sf
from autovc.utils.hparams import SpeakerEncoderParams as params
from autovc.utils.preprocess_wav import audio_to_melspectrogram
import numpy as np

In [None]:
walk = [w for w in os.walk("data/yang_test")][0]
root, dirs, files = [w for w in walk]

for file in files:
    waveform = preprocess_wav(os.path.join(root, file))
    sf.write(f"test_yang_silence/{file}", np.asarray(waveform), samplerate = 16000)

In [3]:
walk = [w for w in os.walk("test_yang_silence")][0]
_, _, files = [w for w in walk]

specs_org = []
specs_new = []

for file in files:
    specs_org.append(audio_to_melspectrogram(os.path.join("data/yang_test", file)))
    specs_new.append(audio_to_melspectrogram(os.path.join("test_yang_silence", file)))



In [13]:
lens_new = np.array([np.shape(s)[1] for s in specs_new])
lens_org = np.array([np.shape(s)[1] for s in specs_org])


# (lens_org-lens_new)/lens_org
lens_new.std()
lens_new.mean()


186.2

In [16]:
hparams = params()

def chop(wav):
    """
    Ensures that segments without voice in the waveform remain no longer than a 
    threshold determined by the VAD parameters in params.py.

    :param wav: the raw waveform as a numpy array of floats 
    :return: the same waveform with silences trimmed away (length <= original wav length)
    """
    # Compute the voice detection window size
    samples_per_window = (hparams.vad_window_length * hparams.sampling_rate) // 1000
    
    # Trim the end of the audio to have a multiple of the window size
    wav = wav[:len(wav) - (len(wav) % samples_per_window)]
    
    # Convert the float waveform to 16-bit mono PCM
    pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))
    
    # Perform voice activation detection
    voice_flags = []
    vad = webrtcvad.Vad(mode=3)
    for window_start in range(0, len(wav), samples_per_window):
        window_end = window_start + samples_per_window
        voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
                                         sample_rate=hparams.sampling_rate))
    voice_flags = np.array(voice_flags)
    
    # Smooth the voice detection with a moving average
    def moving_average(array, width):
        array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
        ret = np.cumsum(array_padded, dtype=float)
        ret[width:] = ret[width:] - ret[:-width]
        return ret[width - 1:] / width
    
    audio_mask = moving_average(voice_flags, hparams.vad_moving_average_width)
    audio_mask = np.round(audio_mask).astype(np.bool)
    
    # Dilate the voiced regions
    audio_mask = binary_dilation(audio_mask, np.ones(hparams.vad_max_silence_length + 1))
    audio_mask = np.repeat(audio_mask, samples_per_window)
    
    # return wav[audio_mask == True]
    return audio_mask

wav, source_sr = librosa.load("data/yang_test/aaa_z0030_011.wav", sr=None)
chop(wav)



Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  audio_mask = np.round(audio_mask).astype(np.bool)


array([False, False, False, ..., False, False, False])