In [None]:
import queue
import threading
import time
from collections import deque

import numpy as np
import sounddevice as sd
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

In [2]:
# Constants
# Prefer the Apple-silicon MPS backend, then CUDA, and fall back to the CPU so
# the notebook does not fail at `model.to(device)` on machines without MPS.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model_path = "Your model path"  # placeholder: replace with the model to load
sampling_rate = 16000  # samples per second requested from the input stream
duration = 1  # length of one microphone read, in seconds
audio_queue = queue.Queue()  # recorded clips, handed from recorder to consumer
# Half precision only on an accelerator; full precision when running on CPU.
torch_dtype = torch.float16 if device != "cpu" else torch.float32

# How far a chunk's level (np.linalg.norm(chunk) * 10) must deviate from the
# rolling ambient average before recording starts.
RECORD_THRESHOLD = 200

In [3]:
# Load the speech-to-text checkpoint and place it on the selected device
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_path,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
).to(device)

# The processor bundles the tokenizer and the feature extractor used below
processor = AutoProcessor.from_pretrained(model_path)

# Assemble the recognition pipeline from the pieces loaded above
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)

#### Function Definitions
 - audio_callback(indata)                     (Enqueue Audio Data)
 - record_audio(duration, sampling_rate)      (Automatic Audio Recording)
 - process_audio_stream(sampling_rate)        (Speech Recognition)
 - debug_player()                             (Play back what was just recorded)

In [4]:
def audio_callback(indata):
    """Hand a captured audio buffer to the consumer side via ``audio_queue``.

    Args:
        indata: Buffer exposing ``.copy()`` (a NumPy array in this notebook).
            A copy is enqueued, so later changes to ``indata`` do not affect
            the queued clip.
    """
    snapshot = indata.copy()  # detach from the caller's buffer first
    audio_queue.put(snapshot)

# Record audio chunks and enqueue each detected clip
def record_audio(duration, sampling_rate, stop_event=None):
    """Read the microphone in chunks and enqueue every loud segment.

    Each chunk's level is ``np.linalg.norm(chunk) * 10``. A rolling average of
    recent quiet chunks is the ambient baseline. When a chunk deviates from
    the baseline by more than ``RECORD_THRESHOLD`` recording starts; it ends
    once the rolling average of the recorded levels is back within
    ``RECORD_THRESHOLD`` of the baseline. The clip is then passed to
    ``audio_callback`` as a 1-D float64 array. The chunk that ends the
    recording is not part of the clip.

    Args:
        duration: Length of one read in seconds; ``sampling_rate * duration``
            is truncated to a whole number of frames.
        sampling_rate: Sample rate passed to ``sd.InputStream``.
        stop_event: Optional object with ``is_set()`` (e.g. a
            ``threading.Event``). When set, the loop ends after the current
            read; a clip being recorded at that moment is still enqueued.
            With the default ``None`` the function runs until an exception.
    """
    ambient_window = 15  # drop the oldest ambient level once more than this are held
    min_ambient = 3      # ambient levels required before detection starts
    recent_window = 3    # the stop check runs once more than this many levels are held

    # The stream needs an integer frame count; a fractional `duration` would
    # otherwise produce a float here.
    frames_per_chunk = int(sampling_rate * duration)
    ambient_levels = deque()

    def stop_requested():
        return stop_event is not None and stop_event.is_set()

    input_stream = sd.InputStream(channels=1, samplerate=sampling_rate)
    try:
        # Started inside the try so a failed start still closes the stream.
        input_stream.start()
        while not stop_requested():
            # Keep the ambient window bounded.
            if len(ambient_levels) > ambient_window:
                ambient_levels.popleft()

            data, _overflowed = input_stream.read(frames_per_chunk)
            volume_norm = np.linalg.norm(data) * 10

            # Warm-up: collect a few levels before judging anything, so the
            # baseline adapts to the current ambient noise.
            if len(ambient_levels) < min_ambient:
                ambient_levels.append(volume_norm)
                continue

            sound_amp = sum(ambient_levels) / len(ambient_levels)
            triggered = abs(volume_norm - sound_amp) > RECORD_THRESHOLD
            if not triggered:
                # Quiet chunk: it only updates the baseline.
                ambient_levels.append(volume_norm)
                continue

            print("Recording...")
            # Collect chunks in a list and join them once at the end instead
            # of re-copying the whole clip on every read.
            chunks = []
            recent_levels = deque()
            while True:
                # float64 copy: same dtype the clip had when built with np.append.
                chunks.append(np.array(data, dtype=np.float64).ravel())
                data, _overflowed = input_stream.read(frames_per_chunk)
                recent_levels.append(np.linalg.norm(data) * 10)

                finished = False
                if len(recent_levels) > recent_window:
                    record_amp = sum(recent_levels) / len(recent_levels)
                    # Done once the recent average is back near the baseline.
                    finished = abs(sound_amp - record_amp) < RECORD_THRESHOLD
                    recent_levels.popleft()

                if finished or stop_requested():
                    audio_callback(np.concatenate(chunks))
                    print("Recording stopped.")
                    break
    finally:
        input_stream.stop()
        input_stream.close()
    
def process_audio_stream(sampling_rate, stop_event=None):
    """Continuously take clips from ``audio_queue`` and print their transcription.

    Args:
        sampling_rate: Sample rate reported to ``pipe`` alongside each clip.
        stop_event: Optional object with ``is_set()`` (e.g. a
            ``threading.Event``). When set, the loop ends after the current
            clip or, if idle, within the 1-second queue timeout. With the
            default ``None`` the function runs until an exception.

    Raises:
        Whatever ``pipe`` raises; the clip is still marked done on the queue.
    """
    while stop_event is None or not stop_event.is_set():
        try:
            # The timed get already blocks while the queue is idle, so no
            # extra sleep is needed before polling again.
            audio_data = audio_queue.get(timeout=1)
        except queue.Empty:
            continue
        try:
            # construct the audio input
            audio_input = {"array": audio_data, "sampling_rate": sampling_rate}
            result = pipe(audio_input)
            # No newline is written, so flush to make the text show up promptly.
            print("    ----> Result: " + result["text"], end=" ", flush=True)
        finally:
            # Always balance the get(), even when transcription fails.
            audio_queue.task_done()

def debug_player(stop_event=None):
    """Debug helper: play back each clip taken from ``audio_queue``.

    It consumes from the same queue as ``process_audio_stream``, so a clip
    played here is not available for transcription.

    Args:
        stop_event: Optional object with ``is_set()`` (e.g. a
            ``threading.Event``). When set, the loop ends after the current
            clip or, if idle, within the 1-second queue timeout. With the
            default ``None`` the function runs until an exception.
    """
    while stop_event is None or not stop_event.is_set():
        try:
            # A timeout is required for queue.Empty to ever be raised; a bare
            # get() blocks forever and made the except branch unreachable.
            audio_data = audio_queue.get(timeout=1)
        except queue.Empty:
            continue
        try:
            print("Playing audio...")
            sd.play(audio_data, sampling_rate)
            # Wait for playback to end so the next clip does not cut this one off.
            sd.wait()
        finally:
            audio_queue.task_done()


#### Multi-threaded Recording & Speech Recognition
Demo: run the recorder and the transcriber concurrently, each in its own thread.

In [None]:
# Create threads for recording and processing.
# Both targets loop indefinitely, so the threads are marked as daemons: they
# then do not keep the interpreter alive when the kernel shuts down.
# (If this cell is exported to a plain script, join the threads to keep the
# process running.)
recording_thread = threading.Thread(
    target=record_audio, args=(duration, sampling_rate), daemon=True
)
processing_thread = threading.Thread(
    target=process_audio_stream, args=(sampling_rate,), daemon=True
)
# Debug alternative: debug_player reads from the same audio_queue as
# process_audio_stream, so enable it instead of processing_thread, not with it.
# debug_thread = threading.Thread(target=debug_player, daemon=True)
# Start the threads
recording_thread.start()
processing_thread.start()
# debug_thread.start()
