# ASR: capturing microphone audio with PyAudio and transcribing it with faster-whisper

- Create a PyAudio object and find the index of the host API we want

In [1]:
import numpy as np
import pyaudio
import time
import wave
import matplotlib.pyplot as plt
# import whisper
from faster_whisper import WhisperModel
from queue import Queue

- Load the ASR model

In [2]:
# Load the faster-whisper 'medium' model on the 'cuda' device with int8 compute type.
asr_model = WhisperModel('medium', device='cuda', compute_type="int8")

- Find mic

In [3]:
# PyAudio instance shared by the device lookup, the stream and the cleanup cells.
p = pyaudio.PyAudio()
def find_api(audio:pyaudio.PyAudio, name) -> int:
    """Return the index of the first host API whose 'name' entry equals `name`.

    Args:
        audio: PyAudio instance used to enumerate the host APIs.
        name: Host API name to look for (compared with ``==``).

    Returns:
        The host API index of the first match, or -1 when none matches.
    """
    for api_index in range(audio.get_host_api_count()):
        api_info = audio.get_host_api_info_by_index(api_index)
        # Entries without a 'name' key are skipped.
        if 'name' in api_info and api_info['name'] == name:
            return api_index
    return -1
# Look up the 'MME' host API; find_api returns -1 when no host API has that name.
apii = find_api(audio=p, name='MME')
if apii < 0:
    # Fail here with a clear message instead of passing -1 on as a host API index.
    raise RuntimeError("Host API 'MME' not found")
def find_mic_in_api(audio:pyaudio.PyAudio, apii, mic_in_words='麥克風')->"dict | None":
    """Find the first input device of a host API whose name contains a keyword.

    Args:
        audio: PyAudio instance used to query the devices.
        apii: Index of the host API whose devices are scanned.
        mic_in_words: Substring that must occur in the device name. Defaults
            to '麥克風' ("microphone"), the value that used to be hard-coded.

    Returns:
        A dict with the keys 'index', 'input channels', 'sample rate' and
        'name' describing the first matching device, or None when no device
        with at least one input channel matches.

        'index' is the device's own 'index' field from its info dict, not the
        position inside the host API. This is the value the notebook later
        passes to ``p.open(input_device_index=...)``.
    """
    device_count = audio.get_host_api_info_by_index(apii)['deviceCount']
    for host_api_device_index in range(device_count):
        dv = audio.get_device_info_by_host_api_device_index(apii, host_api_device_index)
        input_channels = int(dv['maxInputChannels'])
        name = dv['name']
        # Only devices that can record and whose name contains the keyword qualify.
        if input_channels > 0 and mic_in_words in name:
            return {
                'index': int(dv['index']),
                'input channels': input_channels,
                'sample rate': int(dv['defaultSampleRate']),
                'name': name,
            }
    return None
# Locate the microphone inside the selected host API.
devi = find_mic_in_api(audio=p, apii=apii)
if not devi:
    # Release PortAudio and raise instead of calling exit(): an exception stops
    # the run at this cell with a traceback.
    p.terminate()
    raise RuntimeError("Panic! Could not find mic!")

- Stream callback and VU calculation

In [4]:
def calculate_vu(data, floor_db=-100)->int:
    """Compute the level of a buffer of int16 samples in decibels.

    The level is ``20 * log10(rms)`` of the samples, truncated to an int.

    Args:
        data: Bytes-like buffer interpreted as int16 samples.
        floor_db: Value returned for an empty or all-zero buffer, where the
            logarithm is undefined.

    Returns:
        The level as an int, or ``int(floor_db)`` for empty or silent input.
    """
    samples = np.frombuffer(data, dtype=np.int16).astype('i4')  # widen so squaring cannot overflow
    if samples.size == 0:
        # The mean of an empty array is NaN and int(NaN) raises ValueError.
        return int(floor_db)
    rms = np.sqrt(np.mean(np.square(samples)))
    if rms <= 0:
        # log10(0) is -inf and int(-inf) raises OverflowError.
        return int(floor_db)
    return int(20 * np.log10(rms))
# Stream callback function
def process(data, fc, tim_inf, flag):
    """Stream callback: record the buffer's VU value and queue the raw buffer.

    Appends the VU value of `data` to the global `vus` list and puts `data`
    on the global `dat_q` queue. `fc`, `tim_inf` and `flag` are accepted but
    not used.

    Returns:
        ``(None, pyaudio.paContinue)`` while the global `to_abort` equals
        False, otherwise ``(None, pyaudio.paAbort)`` after printing a notice.
    """
    global to_abort, frms, vus, dat_q
    level = calculate_vu(data=data)
    vus.append(level)
    dat_q.put(data)
    keep_running = (to_abort == False)
    if not keep_running:
        print(f"To abort. ")
        return (None, pyaudio.paAbort)
    return (None, pyaudio.paContinue)

- Open stream

In [5]:
# State shared between the notebook cells and the stream callback `process`.
to_abort = False  # set to True to make the callback abort the stream
frms = []         # audio byte chunks collected by the capture loop
vus = []          # per-buffer VU values appended by the callback
dat_q = Queue()   # raw buffers handed over by the callback

# Open the input stream without starting it. One channel is requested rather
# than the device maximum: the transcription cell reads the samples as a single
# channel and the WAV cell declares 1 channel, so multi-channel (interleaved)
# capture would be misinterpreted by both.
stream = p.open(
    format=pyaudio.paInt16,
    channels=1,
    rate=16000,
    frames_per_buffer=1024,
    input=True,
    input_device_index=devi['index'],
    stream_callback=process,
    start=False,
)

- Loop

In [6]:
# Capture loop: every 1.5 s move the buffers queued by the callback into `frms`,
# and stop after 6 iterations (about 9 s) or on Ctrl+C.
try:
    stream.start_stream()
    print(f"Long time loop started, press ctrl + c to stop.")
    i = 0
    while stream.is_active():
        time.sleep(1.5)
        # Drain through the queue API. Joining and clearing `dat_q.queue`
        # directly bypasses the queue's lock, so a buffer the callback enqueues
        # between the join and the clear would be dropped.
        chunks = []
        while not dat_q.empty():
            chunks.append(dat_q.get_nowait())
        if chunks:
            frms.append(b''.join(chunks))
        i += 1
        if i > 5:
            to_abort = True
            time.sleep(1.0)
            break
except KeyboardInterrupt:
    to_abort = True
finally:
    # Give the callback time to see `to_abort` before stopping the stream.
    time.sleep(1.0)
    stream.stop_stream()
    # Collect the buffers that arrived after the last drain so the tail of the
    # recording is not lost.
    while not dat_q.empty():
        frms.append(dat_q.get_nowait())

Long time loop started, press ctrl + c to stop.
To abort. 


In [7]:
# x = np.array(vus)
# plt.plot(x)
# plt.show()

- Transcribe

In [8]:
# Join all captured chunks, reinterpret the bytes as int16 samples and scale
# them to float32 by dividing by 32768.
np_audio = np.frombuffer(b''.join(frms), dtype=np.int16).astype(np.float32) / 32768.0
# Transcribe the audio with the language set to 'zh'.
segments, info = asr_model.transcribe(np_audio, language='zh')
# Print the text of every returned segment.
for segment in segments:
    print(segment.text)

TypeError: WhisperModel.transcribe() got an unexpected keyword argument 'fp16'

- Write frames to wav file and close everything

In [7]:
# Write the captured frames to a WAV file (1 channel, paInt16 sample width,
# 16000 Hz), then release the stream and PortAudio. The `with` block closes the
# file and the `finally` clause releases the audio resources even if writing
# fails.
try:
    with wave.open(".\\to.wav", 'wb') as w:
        w.setnchannels(1)
        w.setsampwidth(p.get_sample_size(pyaudio.paInt16))
        w.setframerate(16000)
        w.writeframes(b''.join(frms))
finally:
    stream.close()
    p.terminate()

- More examples: https://github.com/davabase/whisper_real_time/blob/master/transcribe_demo.py