In [4]:
# internal imports
from LM import LMBackend
from TTS import TTSBackend
from SpeechRecognition import SpeechRecognitionBackend
from QuestionClassifier import QuestionClassifierBackend
from database_mgr import DatabaseQABackend

# external imports
import queue
import sounddevice as sd
import numpy as np
import time

In [5]:
# Global Variables
audio_queue = queue.Queue()
# Constants
RECORD_THRESHOLD = 200
SAMPLING_RATE = 16000
REC_DURATION = 1

# Start All Backend Services

In [6]:
# Load Database
db = DatabaseQABackend("Database/HistoricalQA_DB.sqlite3")

In [7]:
# Load Language Model Backend
lm_instance = LMBackend()
lm_instance.init("/Users/lipeihong/Desktop/IT Project/py3/Language_Model/LM/gemma-2-2b-it")

[+] initializing LMBackend
    -> Using device:  mps


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

    -> LMBackend loaded


In [8]:
# Load TTS Backend
tts_instance = TTSBackend()
tts_instance.init("espnet/fastspeech2_conformer")

[+] initializing TTS System
    -> Using device:  mps


Some weights of FastSpeech2ConformerHifiGan were not initialized from the model checkpoint at espnet/fastspeech2_conformer_hifigan and are newly initialized: ['mean', 'scale']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


    -> TTS System loaded


In [9]:
# Load Speech Recognition Backend
sr_instance = SpeechRecognitionBackend()
sr_instance.init("/Users/lipeihong/Downloads/whisper-small.en")

[+] initializing SpeechRecognitionBackend
    -> Using device:  mps
    -> SpeechRecognitionBackend loaded


In [19]:
# Load Question Classifier Backend
qc_instance = QuestionClassifierBackend()
qc_instance.init("q_classification_model")

qc_instance.classify("when did world war 2 end")

'historicalQuestion'

# Key Functions of Chatting Service

In [15]:
# Function definitions
""" These functions are from Speech-Recognition branch, with process_audio_stream modified to utlise all backends to generate voice response """
""" Different ways to coordinate these functions. """

def audio_callback(indata):
    audio_queue.put(indata.copy())  # Put the captured audio

def get_mic_amplitude(input_stream, duration):
    data, overflowed = input_stream.read(SAMPLING_RATE * duration)
    return np.linalg.norm(data) * 10

def process_audio_stream(audio_input:dict) -> None:
    """ from voice input to voice response """
    # recognize audio
    sr_result = sr_instance.recognize(audio_input)["text"]
    print("**** Debug ****: ", sr_result)
    # classify question
    question_type = qc_instance.classify(sr_result)
    print("**** Debug ****: ", question_type)
    
    if question_type == "historicalQuestion":
        # TODO: add historical question answering pipeline.
        # Logic: 1. Serch clue in database
        #        2. Generate prompt
        #        3. update sr_result, which is the prompt.
        db_result = db.get_answer(sr_result)
        print("**** Debug ****: ")
        for row in db_result:
            print("    " + row)
        print("**** Debug ****")
        exit(0)
        pass
    
    # generate response
    lm_result = lm_instance.generate_text(sr_result)
    
    # generate voice response
    tts_result = tts_instance.synthesize(lm_result)
    
    # play audio
    sd.play(tts_result["array"], samplerate=tts_result["sampling_rate"])
    sd.wait()

def debug_player(audio_data):
    """Debug function to play the audio from the queue."""
    print("Playing audio...")
    sd.play(audio_data, SAMPLING_RATE)

# Dedicated Debug Block For main function
cuz bugs really easy to 'be produced' in this block....

In [16]:
def main():
    """ This function controls when to record and when to stop recording """
    voice_input_stream = sd.InputStream(channels=1, samplerate=SAMPLING_RATE)
    voice_input_stream.start()
    
    sound_amp_queue = queue.Queue()
    can_record:bool = False
    
    try:
        while True:
            # detect sound amplitude to determine if we should record
            if (sound_amp_queue.qsize() > 15):
                sound_amp_queue.get()
            
            data, overflowed = voice_input_stream.read(SAMPLING_RATE * REC_DURATION)
            volume_norm = np.linalg.norm(data) * 10
            
            if sound_amp_queue.qsize() < 3:
                sound_amp_queue.put(volume_norm)
                continue
            
            # print("amplitude queue: ", sound_amp_queue.queue)
            avg_mic_amplitude = sum(sound_amp_queue.queue) / sound_amp_queue.qsize()
            # only collect background noise level, not outliers.
            if abs(volume_norm - avg_mic_amplitude) > RECORD_THRESHOLD:
                can_record = True
            else:
                sound_amp_queue.put(volume_norm)
            
            if can_record:
                print("[+] Recording...")
                audio_array = np.empty((0, 1)) 
                record_amp_queue = queue.Queue()
                while can_record:
                    audio_array = np.append(audio_array, data)
                    data, overflowed = voice_input_stream.read(SAMPLING_RATE * REC_DURATION)
                    rec_volume_norm = np.linalg.norm(data) * 10
                    # determine when to stop recording
                    record_amp_queue.put(rec_volume_norm)
                    if record_amp_queue.qsize() > 3:
                        record_amp = sum(record_amp_queue.queue) / record_amp_queue.qsize()
                        # terminate recording if the amplitude back to normal
                        if abs(avg_mic_amplitude - record_amp) < RECORD_THRESHOLD:
                            can_record = False
                            audio_data = {"array": audio_array, "sampling_rate": SAMPLING_RATE}
                            # process audio stream
                            # debug_player(audio_data["array"])
                            process_audio_stream(audio_data.copy())
                            print("Recording stopped.")
                            break
                        record_amp_queue.get()
    finally:
        voice_input_stream.stop()
        voice_input_stream.close()

# Start it UP!

In [20]:
main()

[+] Recording...
**** Debug ****:   When did World War 2 end?
**** Debug ****:  historicalQuestion
**** Debug ****:  ['The killing heightened tensions between the U.S. and Iran, leading to military confrontations, regional instability, and global concerns about escalation.', 'The killing heightened tensions between the U.S. and Iran, leading to military confrontations, regional instability, and global concerns about escalation.', 'Through military conquest, administrative reforms, and the establishment of a unified code of laws.', 'Through military conquest, administrative reforms, and the establishment of a unified code of laws.', 'She expanded trade networks, commissioned monumental architecture, and maintained peace during her reign.']
    -> Generated response:  1945! 💥  
Recording stopped.


KeyboardInterrupt: 

: 