<a href="https://colab.research.google.com/github/20wh1a1230/WEB-PROGRAMMIMG/blob/main/silero_vad.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install Dependencies

In [None]:
#@title Install and Import Dependencies

# this assumes that you have a relevant version of PyTorch installed
!pip install -q torchaudio

SAMPLING_RATE = 16000

import torch
torch.set_num_threads(1)

from IPython.display import Audio
from pprint import pprint
# download example
torch.hub.download_url_to_file('https://models.silero.ai/vad_models/en.wav', 'en_example.wav')

100%|██████████| 1.83M/1.83M [00:01<00:00, 1.72MB/s]


In [None]:
USE_ONNX = False # change this to True if you want to test onnx model
if USE_ONNX:
    !pip install -q onnxruntime

model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                              model='silero_vad',
                              force_reload=True,
                              onnx=USE_ONNX)

(get_speech_timestamps,
 save_audio,
 read_audio,
 VADIterator,
 collect_chunks) = utils

Downloading: "https://github.com/snakers4/silero-vad/zipball/master" to /root/.cache/torch/hub/master.zip


## Full Audio

**Speech timestapms from full audio**

In [None]:
wav = read_audio('en_example.wav', sampling_rate=SAMPLING_RATE)
# get speech timestamps from full audio file
speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=SAMPLING_RATE)
pprint(speech_timestamps)

[{'end': 31200, 'start': 1568},
 {'end': 73696, 'start': 42528},
 {'end': 108512, 'start': 79392},
 {'end': 163808, 'start': 149024},
 {'end': 181728, 'start': 166944},
 {'end': 211936, 'start': 183328},
 {'end': 227808, 'start': 216608},
 {'end': 241120, 'start': 229920},
 {'end': 252896, 'start': 245280},
 {'end': 285664, 'start': 260640},
 {'end': 301024, 'start': 294432},
 {'end': 311776, 'start': 303648},
 {'end': 420320, 'start': 325664},
 {'end': 455136, 'start': 422432},
 {'end': 490976, 'start': 458784},
 {'end': 520160, 'start': 493088},
 {'end': 566752, 'start': 523808},
 {'end': 601056, 'start': 572448},
 {'end': 621024, 'start': 607264},
 {'end': 669152, 'start': 638496},
 {'end': 691680, 'start': 671776},
 {'end': 712672, 'start': 697888},
 {'end': 748512, 'start': 720928},
 {'end': 798688, 'start': 781856},
 {'end': 853984, 'start': 817696},
 {'end': 865248, 'start': 856608},
 {'end': 903648, 'start': 871968},
 {'end': 916960, 'start': 906272},
 {'end': 952288, 'start': 

In [None]:
# merge all speech chunks to one audio
save_audio('only_speech.wav',
           collect_chunks(speech_timestamps, wav), sampling_rate=SAMPLING_RATE)
Audio('only_speech.wav')

## Stream imitation example

In [None]:
## using VADIterator class

vad_iterator = VADIterator(model)
wav = read_audio(f'en_example.wav', sampling_rate=SAMPLING_RATE)

window_size_samples = 1536 # number of samples in a single audio chunk
for i in range(0, len(wav), window_size_samples):
    chunk = wav[i: i+ window_size_samples]
    if len(chunk) < window_size_samples:
      break
    speech_dict = vad_iterator(chunk, return_seconds=True)
    if speech_dict:
        print(speech_dict, end=' ')
vad_iterator.reset_states() # reset model states after each audio

{'start': 0.3} {'end': 2.0} {'start': 2.8} {'end': 4.7} {'start': 5.1} {'end': 6.8} {'start': 9.4} {'end': 13.4} {'start': 13.6} {'end': 15.2} {'start': 15.4} {'end': 15.9} {'start': 16.4} {'end': 18.0} {'start': 18.5} {'end': 19.6} {'start': 20.4} {'end': 28.5} {'start': 28.8} {'end': 32.6} {'start': 32.8} {'end': 35.5} {'start': 35.9} {'end': 37.7} {'start': 38.1} {'end': 38.9} {'start': 40.0} {'end': 43.3} {'start': 43.7} {'end': 44.7} {'start': 45.2} {'end': 46.9} {'start': 48.9} {'end': 50.0} {'start': 51.2} {'end': 54.2} {'start': 54.6} {'end': 59.6} {'start': 60.0} 

In [7]:
## just probabilities

wav = read_audio('en_example.wav', sampling_rate=SAMPLING_RATE)
speech_probs = []
window_size_samples = 1536
for i in range(0, len(wav), window_size_samples):
    chunk = wav[i: i+ window_size_samples]
    if len(chunk) < window_size_samples:
      break
    speech_prob = model(chunk, SAMPLING_RATE).item()
    speech_probs.append(speech_prob)
vad_iterator.reset_states() # reset model states after each audio

print(speech_probs[:10]) # first 10 chunks predicts

[0.06508486717939377, 0.43142661452293396, 0.9363492131233215, 0.9912925362586975, 0.9910984635353088, 0.7554672360420227, 0.9901331067085266, 0.9961254000663757, 0.9948359131813049, 0.9947713017463684]
