In [1]:
# internal imports
from LM import LMBackend
from TTS import TTSBackend
from SpeechRecognition import SpeechRecognitionBackend
from QuestionClassifier import QuestionClassifierBackend
from database_mgr import DatabaseQABackend
from AnswerClassifier import Answer_Classifier

# external imports
import queue
import sounddevice as sd
import numpy as np
import time
import requests

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Global Variables
# Queue that audio_callback() fills with copies of captured audio chunks.
audio_queue = queue.Queue()

# Constants
# Amplitude gap (chunk vs. running background average) above which main()
# starts recording, and below which it stops again.
RECORD_THRESHOLD = 200
# Sample rate handed to the microphone input stream.
SAMPLING_RATE = 16000
# Multiplied with SAMPLING_RATE to size every microphone read in main().
REC_DURATION = 1
# NOTE(review): the model paths below are absolute and machine-specific.
GEMMA_PATH:str = "/home/aggddmm/Desktop/ITPJ_Aux/gemma-2-2b-it"
WHISPER_PATH:str = "/home/aggddmm/Desktop/ITPJ_Aux/whisper-small.en"
DB_PATH:str = "Database/HistoricalQA_DB.sqlite3"

# Switches
# Whether robot-related requests are made; re-assigned by the Py2 server
# connection check further down.
robot_enable:bool = True
# When False and the robot is enabled, no local TTS backend is loaded and
# replies are sent to the Py2 server's /robotSay endpoint instead.
USE_DEFAULT_TTS:bool = False
# When True (and the robot is enabled), process_audio_stream() starts and
# stops a posture behavior around answer generation.
ENABLE_POSTURE:bool = False

# Field names used in the JSON payloads exchanged with the Py2 server.
# NOTE(review): ERROR_TITLE and DURATION_TITLE are not referenced in the
# visible cells; presumably used by the server side - confirm.
IP_TITLE = "ip"
PORT_TITLE = "port"
MESSAGE_TITLE = "message"
ERROR_TITLE = "error"
DURATION_TITLE = "time"
# Method selectors understood by connect_server().
GET = "GET"
POST = "POST"
# Sentinel returned by connect_server() when a request fails.
ERROR = -1

# Address of the local Py2 bridge server.
PY2_SERVER_IP = "127.0.0.1"
PY2_SERVER_PORT = 26386

# Address of the robot, forwarded to the Py2 server via /setRobotIPPort.
ROBOT_IP = "192.168.1.113"
ROBOT_PORT = 9559

# Start All Backend Services

In [3]:
# Load Database
# Build the database backend from DB_PATH; process_audio_stream() calls
# db.get_answer() on it for questions classified as "historicalQuestion".
db = DatabaseQABackend(DB_PATH)

In [4]:
# Load Language Model Backend
# Two-step setup: construct the backend, then initialise it from GEMMA_PATH.
# process_audio_stream() later calls lm_instance.generate_text() on it.
lm_instance = LMBackend()
lm_instance.init(GEMMA_PATH)

[+] initializing LMBackend
    -> Using device:  cuda


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  7.89it/s]


    -> LMBackend loaded


In [5]:
# Load Speech Recognition Backend
# Two-step setup: construct the backend, then initialise it from WHISPER_PATH.
# process_audio_stream() later calls sr_instance.recognize() on it.
sr_instance = SpeechRecognitionBackend()
sr_instance.init(WHISPER_PATH)

[+] initializing SpeechRecognitionBackend
    -> Using device:  cuda
    -> SpeechRecognitionBackend loaded


In [6]:
# Load Question Classifier Backend
# Used by process_audio_stream() to label the recognised text
# (e.g. "historicalQuestion" or "otherQuestion").
qc_instance = QuestionClassifierBackend()
qc_instance.init("q_classification_model")
# Load Answer Classifier Backend
# NOTE(review): ac_instance is not referenced by any other visible cell.
ac_instance = Answer_Classifier()
ac_instance.init("a_classification_model")

In [7]:
# HTTP client helper used to talk to the Py2 server
def connect_server(ip, port, method, api_entry='/checkConnection', data=None, timeout=None):
    """Send one HTTP request and return the response, or ERROR on any failure.

    Args:
        ip: Host name or address of the server.
        port: TCP port of the server (converted with str()).
        method: Either the GET or the POST constant.
        api_entry: Path appended to "http://<ip>:<port>".
        data: Payload sent as JSON for POST requests; ignored for GET.
        timeout: Seconds handed to requests; None (the default) keeps the
            previous behaviour of waiting without a limit.

    Returns:
        The requests.Response when the server answered with status 200,
        otherwise the ERROR sentinel. Callers must compare against ERROR
        before using the result as a response.
    """
    url = "http://" + ip + ":" + str(port) + api_entry
    try:
        if method == GET:
            respond = requests.get(url, timeout=timeout)
        elif method == POST:
            respond = requests.post(url, json=data, timeout=timeout)
        else:
            # An unknown method used to fall through to an unbound `respond`
            # and crash with UnboundLocalError; report it like other failures.
            print("[-] Error: ", "unsupported method " + str(method))
            return ERROR
    except requests.exceptions.RequestException as e:
        print("[-] Error: ", e)
        return ERROR
    if respond.status_code != 200:
        print("[-] Error: ", respond.status_code)
        return ERROR
    return respond

# Connection to Py2 Server
# connect_server() returns the ERROR sentinel (an int) on failure, so the
# result must be compared against ERROR before it is used as a response;
# otherwise a failed connection crashes here instead of disabling the robot.
py2_connection = connect_server(PY2_SERVER_IP, PY2_SERVER_PORT, GET)
if py2_connection != ERROR:
    print("[+] Py2 - Py3 Server Connection Established")
    # Set Robot IP and Port
    robot_connection_data:dict = {IP_TITLE:ROBOT_IP, PORT_TITLE:ROBOT_PORT}
    set_robot_response = connect_server(ip=PY2_SERVER_IP, port=PY2_SERVER_PORT, method=POST, api_entry='/setRobotIPPort', data=robot_connection_data)
    if set_robot_response != ERROR:
        robot_ip_port = set_robot_response.json()
        robot_enable = True
    else:
        print("[-] Failed to set robot IP and port, frezzing robot related functions...")
        robot_enable = False
else:
    print("[-] Failed to connect to Py2 Server, frezzing robot related functions...")
    robot_enable = False

# List all postures, more like a connection test
if robot_enable:
    behavior_response = connect_server(ip=PY2_SERVER_IP, port=PY2_SERVER_PORT, method=GET, api_entry='/getAllAvailBehavior')
    if behavior_response != ERROR:
        avail_posture:list = behavior_response.json()[MESSAGE_TITLE]
        print("[+] Available Postures: ")
        for posture in avail_posture:
            print("    --> " + posture)
    else:
        # Keep the name defined so later cells can still reference it.
        avail_posture:list = []
        print("[-] Failed to list available postures")
    

[+] Py2 - Py3 Server Connection Established
[+] Available Postures: 
    --> animationMode
    --> animations/SitOnPod/BodyTalk/Listening/Listening_6
    --> animations/SitOnPod/BodyTalk/Listening/Listening_2
    --> animations/SitOnPod/BodyTalk/Listening/Listening_3
    --> animations/SitOnPod/BodyTalk/Listening/Listening_7
    --> animations/SitOnPod/BodyTalk/Listening/Listening_5
    --> animations/SitOnPod/BodyTalk/Listening/Listening_4
    --> animations/SitOnPod/BodyTalk/Listening/Listening_1
    --> animations/SitOnPod/BodyTalk/Listening/Listening_8
    --> animations/SitOnPod/BodyTalk/Thinking/Remember_3
    --> animations/SitOnPod/BodyTalk/Thinking/ThinkingLoop_2
    --> animations/SitOnPod/BodyTalk/Thinking/Remember_2
    --> animations/SitOnPod/BodyTalk/Thinking/Remember_1
    --> animations/SitOnPod/BodyTalk/Thinking/ThinkingLoop_1
    --> animations/SitOnPod/BodyTalk/Speaking/BodyTalk_12
    --> animations/SitOnPod/BodyTalk/Speaking/BodyTalk_4
    --> animations/SitOnPod/B

In [8]:
# defines posture name
# Behavior name sent to /startBehavior and /stopBehavior by
# process_audio_stream() when ENABLE_POSTURE is on.
# NOTE(review): "dummy" looks like a placeholder - presumably it should be one
# of the behaviors listed by /getAllAvailBehavior; confirm before enabling.
thinking_posture:str = "dummy"

In [9]:
# Load TTS Backend
# tts_instance stays None when the robot speaks for itself; the local backend
# is only loaded when the robot is unavailable or the default TTS is forced.
tts_instance = None
if not robot_enable or USE_DEFAULT_TTS:
    tts_instance = TTSBackend()
    tts_instance.init("/home/aggddmm/Desktop/ITPJ_Aux/fastspeech2_conformer")

In [10]:
# Quick manual check of the question classifier on a sample question.
qc_instance.classify("What is the capital of Italy?")

'otherQuestion'

# Key Functions of Chatting Service

In [11]:
# Function definitions
""" These functions are from the Speech-Recognition branch, with process_audio_stream modified to utilise all backends to generate a voice response """
""" Different ways to coordinate these functions. """

def audio_callback(indata, frames=None, time_info=None, status=None):
    """Store a copy of one captured audio chunk in the global audio_queue.

    The extra parameters are optional so the function can be called with just
    the chunk (as before) and can also be installed as a sounddevice stream
    callback, which is invoked with (indata, frames, time, status).

    Args:
        indata: Captured chunk; must provide a copy() method.
        frames: Frame count supplied by the stream; unused.
        time_info: Timing information supplied by the stream; unused.
        status: Status flags supplied by the stream; unused.
    """
    # Copy so the queued chunk does not alias the caller's buffer.
    audio_queue.put(indata.copy())

def get_mic_amplitude(input_stream, duration):
    """Read one chunk from input_stream and return its scaled amplitude.

    Args:
        input_stream: Object whose read(frames) returns (samples, overflowed).
        duration: Multiplier applied to SAMPLING_RATE to get the frame count.

    Returns:
        Ten times the norm of the samples that were read.
    """
    frame_count = SAMPLING_RATE * duration
    samples, _overflowed = input_stream.read(frame_count)
    amplitude = np.linalg.norm(samples)
    return amplitude * 10

def process_audio_stream(audio_input:dict) -> None:
    """Turn one recorded utterance into a spoken reply.

    Steps: speech recognition, question classification, an optional database
    lookup for historical questions, language-model generation, and finally
    speech output through the local TTS backend or the robot.

    Args:
        audio_input: Dict handed unchanged to sr_instance.recognize(); main()
            builds it with the keys "array" and "sampling_rate".

    Returns:
        None. The reply is played locally or sent to the /robotSay endpoint.
    """
    # Start Posture
    if robot_enable and ENABLE_POSTURE:
        connect_server(ip=PY2_SERVER_IP, port=PY2_SERVER_PORT, method=POST, api_entry='/startBehavior', data={MESSAGE_TITLE:thinking_posture})
    # recognize audio
    sr_result = sr_instance.recognize(audio_input)["text"]
    print("**** Debug ****: ", sr_result)
    # classify question
    question_type = qc_instance.classify(sr_result)
    print("**** Debug ****: ", question_type)

    # Historical Question Pipeline
    extra_prompt:str = ""
    if question_type == "historicalQuestion":
        # NOTE(review): the traceback saved in this notebook (ValueError:
        # Expected 2D array) appears right after the question-type print, so
        # it was presumably raised inside db.get_answer - confirm there.
        db_result = db.get_answer(sr_result)
        extra_prompt = "This question is a historical question. Here is the answer fetched from the database: \n"
        print("**** DB Fetched ****")
        # enumerate() numbers the rows; the previous manual counter was never
        # incremented, so every row was printed as "1.".
        for index, row in enumerate(db_result, start=1):
            print("    " + str(index) + '. ' + row)
            extra_prompt += row + "\n"
        extra_prompt += "If answers above are not relevant, **clearly** state answer not found in database.\n"
        extra_prompt += "\n"
        print("**** Debug ****")

    # generate response
    lm_result = lm_instance.generate_text(extra_prompt + sr_result)

    # End Posture
    if robot_enable and ENABLE_POSTURE:
        connect_server(ip=PY2_SERVER_IP, port=PY2_SERVER_PORT, method=POST, api_entry='/stopBehavior', data={MESSAGE_TITLE:thinking_posture})

    # generate voice response
    if tts_instance is not None:
        tts_result = tts_instance.synthesize(lm_result)
        # play audio and block until playback has finished
        sd.play(tts_result["array"], samplerate=tts_result["sampling_rate"])
        sd.wait()
    else:
        # Same payload key as the other Py2 requests (MESSAGE_TITLE == "message").
        connect_server(ip=PY2_SERVER_IP, port=PY2_SERVER_PORT, method=POST, api_entry='/robotSay', data={MESSAGE_TITLE:lm_result})

def debug_player(audio_data):
    """Debug helper: announce playback, then start playing audio_data at SAMPLING_RATE."""
    print("Playing audio...")
    # Playback is started without waiting for it to finish.
    sd.play(audio_data, samplerate=SAMPLING_RATE)

# Dedicated Debug Block For main function
because bugs are really easy to introduce in this block.

In [12]:
def main():
    """Listen on the microphone and pass each detected utterance to process_audio_stream().

    Opens a mono input stream at SAMPLING_RATE and reads it in chunks of
    SAMPLING_RATE * REC_DURATION frames. A rolling window of chunk amplitudes
    serves as the background level; a chunk whose amplitude differs from that
    level by more than RECORD_THRESHOLD starts a recording, and the recording
    ends once the recent average amplitude is back within RECORD_THRESHOLD of
    the background level.

    The outer loop has no exit condition, so the function only ends when an
    exception propagates; the stream is stopped and closed in that case.
    """
    voice_input_stream = sd.InputStream(channels=1, samplerate=SAMPLING_RATE)
    voice_input_stream.start()
    
    # Rolling window of amplitudes of chunks that did not trigger a recording.
    sound_amp_queue = queue.Queue()
    can_record:bool = False
    
    try:
        while True:
            # detect sound amplitude to determine if we should record
            # Bound the background window by dropping its oldest entry.
            if (sound_amp_queue.qsize() > 15):
                sound_amp_queue.get()
            
            data, overflowed = voice_input_stream.read(SAMPLING_RATE * REC_DURATION)
            volume_norm = np.linalg.norm(data) * 10
            
            # Warm-up: the first three chunks always count as background.
            if sound_amp_queue.qsize() < 3:
                sound_amp_queue.put(volume_norm)
                continue
            
            # print("amplitude queue: ", sound_amp_queue.queue)
            # .queue is the container holding the queued items; it is read
            # here without removing anything.
            avg_mic_amplitude = sum(sound_amp_queue.queue) / sound_amp_queue.qsize()
            # only collect background noise level, not outliers.
            if abs(volume_norm - avg_mic_amplitude) > RECORD_THRESHOLD:
                can_record = True
            else:
                sound_amp_queue.put(volume_norm)
            
            if can_record:
                print("[+] Recording...")
                # np.append without an axis flattens its inputs, so the
                # recording becomes a 1-D array after the first append.
                audio_array = np.empty((0, 1)) 
                # Sliding window of the most recent amplitudes while recording.
                record_amp_queue = queue.Queue()
                while can_record:
                    # The first chunk appended is the one that triggered the recording.
                    audio_array = np.append(audio_array, data)
                    data, overflowed = voice_input_stream.read(SAMPLING_RATE * REC_DURATION)
                    rec_volume_norm = np.linalg.norm(data) * 10
                    # determine when to stop recording
                    record_amp_queue.put(rec_volume_norm)
                    # Only judge once four amplitudes are available.
                    if record_amp_queue.qsize() > 3:
                        record_amp = sum(record_amp_queue.queue) / record_amp_queue.qsize()
                        # terminate recording if the amplitude back to normal
                        if abs(avg_mic_amplitude - record_amp) < RECORD_THRESHOLD:
                            can_record = False
                            # The chunk read just above is not part of audio_array.
                            audio_data = {"array": audio_array, "sampling_rate": SAMPLING_RATE}
                            # process audio stream
                            # debug_player(audio_data["array"])
                            # dict.copy() is shallow: the array itself is shared.
                            process_audio_stream(audio_data.copy())
                            # Printed only after the whole reply pipeline has returned.
                            print("Recording stopped.")
                            break
                        # Drop the oldest amplitude so the next average again covers four chunks.
                        record_amp_queue.get()
    finally:
        # Always release the audio device, including on interrupt or error.
        voice_input_stream.stop()
        voice_input_stream.close()

# Start it UP!

In [16]:
# Run the listen/answer loop; main() has no exit condition, so this cell only
# ends when an exception (for example a kernel interrupt) is raised.
main()

[+] Recording...


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


**** Debug ****:   What's her name mate?
**** Debug ****:  historicalQuestion




ValueError: Expected 2D array, got 1D array instead:
array=[-0.34581345].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.