# Microphone Streaming Examples

A simple notebook that uses Serial/Pyaudio to get the microphone audio and feeds this audio then to Silero VAD.

I created it as an example on how binary data from a stream could be feed into Silero VAD.

In [1]:
# !pip install numpy==1.20.2
# !pip install torch==1.9.0
# !pip install matplotlib==3.4.2
# !pip install torchaudio==0.9.0
# !pip install soundfile==0.10.3.post1
# !pip install pyaudio==0.2.11

In [3]:
# Imports 
import numpy as np
import matplotlib.pylab as plt
import torch
torch.set_num_threads(60)
import torchaudio
from time import time
torchaudio.set_audio_backend("soundfile")
import pyaudio

import serial
from jupyterplot import ProgressPlot
import threading

ENABLE_LAPTOP_MIC = True # Uses laptop microphone instead of arduino microphone
SAMPLE_250ms_audio = False # Reads the audio as 250ms chunks from the microphone, converts them to a Pytorch Tensor, and gets the probabilities/confidences if the model thinks the frame is voiced.

model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                              model='silero_vad',
                              force_reload=True)

(get_speech_timestamps,
 save_audio,
 read_audio,
 VADIterator,
 collect_chunks) = utils

# Helper Functions

# Provided by Alexander Veysov
def int2float(sound):
    abs_max = np.abs(sound).max()
    sound = sound.astype('float32')
    if abs_max > 0:
        sound *= 1/abs_max
    sound = sound.squeeze()  # depends on the use case
    return sound

if ENABLE_LAPTOP_MIC:
    # Pyaudio setup
    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    SAMPLE_RATE = 16000
    CHUNK = int(SAMPLE_RATE / 10)

    audio = pyaudio.PyAudio()
    num_samples = 1536

    if SAMPLE_250ms_audio:
        stream = audio.open(format=FORMAT,
                        channels=CHANNELS,
                        rate=SAMPLE_RATE,
                        input=True,
                        frames_per_buffer=CHUNK,
                    input_device_index=0)
        data = []
        voiced_confidences = []

        print("Started Recording")
        for i in range(0, 100):
            
            audio_chunk = stream.read(num_samples)
            
            # in case you want to save the audio later
            data.append(audio_chunk)
            
            audio_int16 = np.frombuffer(audio_chunk, np.int16);

            audio_float32 = int2float(audio_int16)
            
            # get the confidences and add them to the list to plot them later
            new_confidence = model(torch.from_numpy(audio_float32), 16000).item()
            voiced_confidences.append(new_confidence)
            
        print("Stopped the recording")

        # plot the confidences for the speech
        plt.figure(figsize=(20,6))
        plt.plot(voiced_confidences)
        plt.show()
    else:
        # Real Time Visualization
        # In contrast to the simeple one, this records the audio until to stop the recording by pressing enter.
        
        continue_recording = True

        def stop():
            input("Press Enter to stop the recording:")
            global continue_recording
            continue_recording = False

        def start_recording():
            
            stream = audio.open(format=FORMAT,
                            channels=CHANNELS,
                            rate=SAMPLE_RATE,
                            input=True,
                            frames_per_buffer=CHUNK)
            data = []
            voiced_confidences = []
            
            global continue_recording
            continue_recording = True
            
            pp = ProgressPlot(plot_names=["Silero VAD"],line_names=["speech probabilities"], x_label="audio chunks")
            
            stop_listener = threading.Thread(target=stop)
            stop_listener.start()
            
            while continue_recording:
            
                # 3072 bytes
                audio_chunk = stream.read(num_samples)
                # print("audio_chunk",audio_chunk)

                # 1536 samples 
                audio_int16 = np.frombuffer(audio_chunk, np.int16);
                # print("audio_int16",audio_int16)

                # 1536 samples 
                audio_float32 = int2float(audio_int16)
                # print("audio_float32",audio_float32)
            
                # get the confidences and add them to the list to plot them later
                new_confidence = model(torch.from_numpy(audio_float32), 16000).item()
                voiced_confidences.append(new_confidence)
            
                pp.update(new_confidence)

            pp.finalize()
        start_recording()

else:
    ser = serial.Serial('COM4', 115200)
    pp = ProgressPlot(plot_names=["Silero VAD"],line_names=["speech probabilities"], x_label="audio chunks")
    counter = 0 
    start = time()
    lst = []
    data = []
    voiced_confidences = []
    while True:
        sample = ser.readline()
        try:
            sample = sample.decode('utf-8') # decode from byte to string
            sample = sample.split(',')
            arr = np.array(sample)
            arr = arr.astype(np.int16)
            audio_float32 = int2float(arr)
            # get the confidences and add them to the list to plot them later
            new_confidence = model(torch.from_numpy(audio_float32), 16000).item()
            voiced_confidences.append(new_confidence)
            pp.update(new_confidence)
        except:
            pass
    pp.finalize()       

RuntimeError: Backend "soundfile" is not one of available backends: [].