In [5]:
import ipywidgets as widgets
from IPython.display import display
from threading import Thread
# Pass messages between threads
from queue import Queue

# Queues shared between the UI callbacks and the worker threads:
# `messages` holds a token for as long as recording should continue, and
# `recordings` carries captured audio from the recorder to the transcriber.
messages, recordings = Queue(), Queue()

# Button that starts a recording session.
record_button = widgets.Button(
    icon='microphone',
    button_style='success',
    description='Record',
    disabled=False,
)

# Button that ends the current recording session.
stop_button = widgets.Button(
    icon='stop',
    button_style='warning',
    description='Stop',
    disabled=False,
)

# Widget that collects status messages and transcripts.
output = widgets.Output()

def start_recording(data):
    """Click handler for the Record button.

    Puts a token on ``messages`` (the worker loops run while that queue is
    non-empty) and starts one recorder thread and one transcriber thread.

    A click while a token is already queued is ignored. Otherwise every extra
    click would stack another token and another pair of threads, and a single
    Stop click (which removes one token) would no longer stop the session.

    Args:
        data: The button instance passed by ipywidgets; unused.
    """
    with output:
        if not messages.empty():
            display('Already recording.')
            return

        # Keep running and recording the microphone
        messages.put(True)
        display('Starting...')

        record = Thread(target=record_microphone)
        record.start()

        transcribe = Thread(target=speech_recognition, args=(output,))
        transcribe.start()

def stop_recording(data):
    """Click handler for the Stop button.

    Empties ``messages`` so the worker loops, which run while that queue is
    non-empty, finish. The queue is drained without blocking: a plain
    ``messages.get()`` would wait forever when Stop is clicked while nothing
    is recording, freezing the callback.

    Args:
        data: The button instance passed by ipywidgets; unused.
    """
    # Local import: only `Queue` is imported at file level.
    from queue import Empty

    with output:
        was_recording = False
        while True:
            try:
                # Takes message off the queue
                messages.get_nowait()
            except Empty:
                break
            was_recording = True

        if was_recording:
            display('Stopped.')
        else:
            display('Not recording.')

# Attach the click handlers to their buttons.
stop_button.on_click(stop_recording)
record_button.on_click(start_recording)

# Render both buttons followed by the output area.
display(record_button, stop_button, output)

Button(button_style='success', description='Record', icon='microphone', style=ButtonStyle())



Output()

In [2]:
# Recording from microphone
!pip install pyaudio

Defaulting to user installation because normal site-packages is not writeable






In [2]:
import pyaudio

# Print the info dictionary of every audio device PyAudio reports.
p = pyaudio.PyAudio()

try:
    for i in range(p.get_device_count()):
        print(p.get_device_info_by_index(i))
finally:
    # Release the PyAudio instance even if a device query raises.
    p.terminate()

{'index': 0, 'structVersion': 2, 'name': 'Microsoft Sound Mapper - Input', 'hostApi': 0, 'maxInputChannels': 2, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.09, 'defaultLowOutputLatency': 0.09, 'defaultHighInputLatency': 0.18, 'defaultHighOutputLatency': 0.18, 'defaultSampleRate': 44100.0}
{'index': 1, 'structVersion': 2, 'name': 'Microphone Array (IntelÂ® Smart ', 'hostApi': 0, 'maxInputChannels': 4, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.09, 'defaultLowOutputLatency': 0.09, 'defaultHighInputLatency': 0.18, 'defaultHighOutputLatency': 0.18, 'defaultSampleRate': 44100.0}
{'index': 2, 'structVersion': 2, 'name': 'Microphone (IntelÂ® Smart Sound ', 'hostApi': 0, 'maxInputChannels': 4, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.09, 'defaultLowOutputLatency': 0.09, 'defaultHighInputLatency': 0.18, 'defaultHighOutputLatency': 0.18, 'defaultSampleRate': 44100.0}
{'index': 3, 'structVersion': 2, 'name': 'Microsoft Sound Mapper - Output', 'hostApi': 0, 'maxInputCha

In [4]:
# Audio capture settings; values that are optimal for speech recognition.
AUDIO_FORMAT = pyaudio.paInt16  # sample format passed when opening the input stream
SAMPLE_SIZE = 2                 # presumably bytes per sample for paInt16 -- TODO confirm
CHANNELS = 1                    # number of channels requested from the input device
FRAME_RATE = 16000              # sampling rate given to the stream and to the recognizer
RECORD_SECONDS = 20             # seconds of audio gathered before a transcript is generated

def record_microphone(chunk=1024, device_index=8):
    """Capture microphone audio and queue it in RECORD_SECONDS-long batches.

    Reads ``chunk`` frames at a time for as long as ``messages`` is non-empty.
    Each time about RECORD_SECONDS of audio has accumulated, the list of raw
    chunks is put on ``recordings`` and a new list is started. Audio gathered
    after the last full batch is discarded when the loop ends.

    Args:
        chunk: Number of frames read from the microphone per call; also used
            as the stream's ``frames_per_buffer``.
        device_index: Index of the input device to open. Defaults to 8, the
            value this notebook was written against; pass ``None`` to let
            PyAudio use the default input device.
    """
    p = pyaudio.PyAudio()
    try:
        stream = p.open(
            format=AUDIO_FORMAT,
            channels=CHANNELS,
            rate=FRAME_RATE,
            input=True,
            input_device_index=device_index,
            frames_per_buffer=chunk,
        )
        try:
            # Number of chunks that make up one RECORD_SECONDS batch.
            chunks_per_recording = (FRAME_RATE * RECORD_SECONDS) / chunk
            frames = []

            # Keep recording if message in queue
            while not messages.empty():
                frames.append(stream.read(chunk))

                # A full batch is ready: hand it over and start a new list
                # (rebinding, so the queued list is never mutated afterwards).
                if len(frames) >= chunks_per_recording:
                    recordings.put(frames)
                    frames = []
        finally:
            # Always release the input stream, even if open succeeded but a
            # read failed.
            stream.stop_stream()
            stream.close()
    finally:
        # Always release the PyAudio instance, even if opening the stream
        # failed.
        p.terminate()

In [5]:
!pip install vosk torch transformers

Defaulting to user installation because normal site-packages is not writeable




In [6]:
# Call punctuation model
import subprocess
import json
from vosk import Model, KaldiRecognizer
import time

# Load the Vosk model identified by name.
model = Model(model_name="vosk-model-en-us-0.22")
# Speech recognizer built on that model, given the microphone sampling rate.
recognizer = KaldiRecognizer(model, FRAME_RATE)
# Presumably makes results carry per-word details -- TODO confirm against the
# Vosk documentation; only the 'text' field is read in speech_recognition.
recognizer.SetWords(True)

def speech_recognition(output):
    """Transcribe queued recordings and append the text to ``output``.

    Runs for as long as ``messages`` is non-empty. Each batch taken from
    ``recordings`` is fed to the module-level ``recognizer``. The recognized
    text is piped through the recasepunc script, and that script's output is
    appended to the widget.

    The queue is read with a one-second timeout rather than a blocking
    ``get()``. A blocking ``get()`` would never return once recording stops
    and no further batches arrive, so the thread could never see the stop
    request and exit. The timeout also paces the loop, so no extra sleep is
    needed.

    Args:
        output: Object providing ``append_stdout(str)``; here the notebook's
            Output widget.

    Raises:
        subprocess.CalledProcessError: If the recasepunc command exits with a
            non-zero status.
    """
    # Local import: only `Queue` is imported at file level.
    from queue import Empty

    while not messages.empty():
        try:
            # Grabbing our microphone audio off the queue
            frames = recordings.get(timeout=1)
        except Empty:
            # Nothing recorded yet; re-check whether we should keep running.
            continue

        # Joining all chunks together into one single binary string
        recognizer.AcceptWaveform(b''.join(frames))
        # The result is a JSON document; the transcript is under 'text'.
        text = json.loads(recognizer.Result())['text']
        if not text:
            # Nothing was recognized; skip the punctuation subprocess.
            continue

        cased = subprocess.check_output(
            'python recasepunc/recasepunc.py predict recasepunc/checkpoint',
            shell=True,
            text=True,
            input=text,
        )
        # Add transcript to output widget
        output.append_stdout(cased)

vosk-model-en-us-0.22.zip: 100%|██████████████████████████████████████████████████| 1.78G/1.78G [19:52<00:00, 1.60MB/s]
