In [1]:
import os
import speech_recognition as sr
import soundfile as sf
import sounddevice as sd

In [2]:
# Audio cue played right before the recognizer starts listening.
CUE_IN_PATH = "./data/cue_intro.wav"
# Audio cue played after the recorded speech has been transcribed.
CUE_OUT_PATH = "./data/cue_outro.wav"
# NOTE(review): not referenced in the visible cells — confirm it is still needed.
SYNC_FILE = "./data/cloudsync.wav"

In [3]:
def play_sound(filename):
    """
    Load the audio file at ``filename`` with soundfile and play it with sounddevice.

    Args:
        filename: Path of the audio file to play.
    """
    # sf.read() yields (data, samplerate), which are exactly the first two
    # positional arguments of sd.play().
    sd.play(*sf.read(filename))

def recognize_command(rec_text, keywords):
    """
    Locate the first keyword in the recognized text and split the text around it.

    The text is tokenized on whitespace. Each token is reduced to its
    alphanumeric characters and lower-cased before it is compared against
    ``keywords``, so "Type," matches the keyword "type". The first matching
    token becomes the command; the tokens after it, left untouched and joined
    with single spaces, become the content.

    Args:
        rec_text: Text returned by the speech recognizer. ``None`` is handled
            like an empty string.
        keywords: Container of accepted command words. Tokens are lower-cased
            before the lookup, so the entries should be lower-case.

    Returns:
        tuple: ``(command, content)`` when a keyword was found, otherwise
        ``('none', '')``.
    """
    def normalize(token):
        # Drop punctuation and ignore case for the keyword lookup.
        return "".join(char for char in token if char.isalnum()).lower()

    # split() without a separator splits on any run of whitespace. Splitting on
    # a single " " glued tab/newline-separated words together (so keywords were
    # missed) and produced empty tokens that leaked stray spaces into content.
    tokens = (rec_text or "").split()
    command, content = "", ""

    if len(tokens) > 1:
        # enumerate gives the match position directly instead of a second
        # list.index() scan over the tokens.
        for position, token in enumerate(tokens):
            candidate = normalize(token)
            if candidate in keywords:
                command = candidate
                content = " ".join(tokens[position + 1:])
                print('Command: ', command)
                print('Content: ', content)
                break
    elif tokens:
        # A single word can only be the command itself; there is no content.
        command = normalize(tokens[0])

    if command in keywords:
        print(f"Command [{command}] successfully recognized !")
        return command, content

    print(f"Command [{command}] was not recognized")
    return 'none', ''

In [4]:
# Microphone object, later opened as the audio source (`with microphone as source`).
microphone = sr.Microphone()
# Recognizer used to calibrate on the source, record the audio and transcribe it.
recognizer = sr.Recognizer()
# Command words recognize_command() searches for; tokens are lower-cased before
# the lookup, so the entries are kept lower-case.
keywords = ["insert", "delete", "type", "select", "click", "strong", "italic"]

In [5]:
# Record one utterance from the microphone. The source is only held open for
# calibration and capture; transcription happens after the `with` block exits.
with microphone as source:

    # Calibrate the recognizer against the ambient noise heard on the source
    print('Calibrating...')
    recognizer.adjust_for_ambient_noise(source)
    print('...Done')

    # Audible + printed prompt telling the user to start speaking.
    # NOTE(review): play_sound() does not appear to wait for playback to finish,
    # so the cue may overlap with the start of listen() — confirm.
    play_sound(CUE_IN_PATH)
    print("Say something...")
    # NOTE(review): no timeout is passed, so this presumably waits until speech
    # is detected — confirm against the SpeechRecognition docs.
    audio = recognizer.listen(source)

# Transcribe the captured audio with Whisper, then extract the command keyword
# and its content from the transcript.
try:
    recognized_text = recognizer.recognize_whisper(audio, language='english')
    print(f"Whisper thinks you said: {recognized_text}")
    play_sound(CUE_OUT_PATH)
    # command is 'none' and content is '' when no keyword is found.
    command, content = recognize_command(recognized_text, keywords)
    print(command, content)
except sr.UnknownValueError:
    print('Speech recognition could not understand audio')
except sr.RequestError as e:
    print(f"Speech recognition error: {e}")

Calibrating...
...Done
Say something...
Whisper thinks you said:  Hello computer, please type Strip.
Command:  type
Content:  Strip.
Command [type] successfully recognized !
type Strip.
