In [None]:
import pyaudio
import wave
import requests
import json
import os
from ibm_watson_machine_learning.foundation_models import Model
from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams
from dotenv import load_dotenv
import numpy as np
import time

# Populate the process environment from the project's .env file.
load_dotenv()

# Speech-to-text and text-to-speech service credentials
STT_API_KEY = os.environ.get('STT_API_KEY')
STT_URL = os.environ.get('STT_URL')
TTS_API_KEY = os.environ.get('TTS_API_KEY')
TTS_URL = os.environ.get('TTS_URL')

# LLM credentials
API_KEY = os.environ.get('API_KEY')
PROJECT_ID = os.environ.get('PROJECT_ID')
IBM_CLOUD_URL = os.environ.get('IBM_CLOUD_URL')
MODEL_ID = os.environ.get('MODEL_ID')

# Fail fast at startup: collect the name of every setting that is unset or
# empty (in declaration order) so one error message reports all of them.
MISSING_ENV = [
    name
    for name, value in (
        ("STT_API_KEY", STT_API_KEY),
        ("STT_URL", STT_URL),
        ("TTS_API_KEY", TTS_API_KEY),
        ("TTS_URL", TTS_URL),
        ("API_KEY", API_KEY),
        ("PROJECT_ID", PROJECT_ID),
        ("IBM_CLOUD_URL", IBM_CLOUD_URL),
        ("MODEL_ID", MODEL_ID),
    )
    if not value
]

if MISSING_ENV:
    raise ValueError(f"Missing required environment variables: {', '.join(MISSING_ENV)}. Please check your .env file.")


# Audio recording settings
CHUNK = 1024  # Frames per stream read when recording and per write when playing back
FORMAT = pyaudio.paInt16  # Sample format passed to PyAudio; chunks are decoded as int16
CHANNELS = 1  # Channel count used for the input stream and the saved WAV
RATE = 44100  # Sample rate used for the input stream and the saved WAV
SILENCE_THRESHOLD = 500  # A chunk is silent when its mean absolute sample value is below this
SILENCE_DURATION = 2  # Seconds of consecutive silent chunks that end a recording

# Build the shared text-generation client used by generate_response().
# The only generation parameter set here is MAX_NEW_TOKENS.
generate_params = {GenParams.MAX_NEW_TOKENS: 900}
model = Model(
    model_id=MODEL_ID,
    project_id=PROJECT_ID,
    credentials=dict(apikey=API_KEY, url=IBM_CLOUD_URL),
    params=generate_params,
)

def record_audio():
    """Record from an input stream until sustained silence and save it as WAV.

    Audio is read in ``CHUNK``-frame pieces. A chunk counts as silent when
    the mean absolute value of its 16-bit samples is below
    ``SILENCE_THRESHOLD``. Recording stops once the number of consecutive
    silent chunks exceeds ``SILENCE_DURATION`` seconds' worth of chunks.
    The audio stream and the PyAudio instance are released even if reading
    fails or the user interrupts the recording.

    Returns:
        str: Path of the written file, always ``"input.wav"``.
    """
    # Loop-invariant: how many consecutive silent chunks must be exceeded.
    max_silent_chunks = SILENCE_DURATION * RATE / CHUNK

    frames = []
    p = pyaudio.PyAudio()
    try:
        # Query the sample width while the PyAudio instance is still alive
        # rather than after terminate().
        sample_width = p.get_sample_size(FORMAT)
        stream = p.open(
            format=FORMAT,
            channels=CHANNELS,
            rate=RATE,
            input=True,
            frames_per_buffer=CHUNK,
        )
        try:
            print("Recording... Speak now.")
            silent_chunks = 0
            while silent_chunks <= max_silent_chunks:
                data = stream.read(CHUNK)
                frames.append(data)

                # Widen to int32 before abs(): abs(-32768) overflows in int16
                # and would stay negative, dragging the mean down.
                samples = np.frombuffer(data, dtype=np.int16).astype(np.int32)
                amplitude = np.abs(samples).mean()

                # Any non-silent chunk resets the run of silence.
                if amplitude < SILENCE_THRESHOLD:
                    silent_chunks += 1
                else:
                    silent_chunks = 0
            print("Stopped recording.")
        finally:
            stream.stop_stream()
            stream.close()
    finally:
        p.terminate()

    # Save audio to WAV file; the context manager closes it on any error.
    with wave.open("input.wav", 'wb') as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(sample_width)
        wf.setframerate(RATE)
        wf.writeframes(b''.join(frames))

    return "input.wav"

def speech_to_text(audio_file, timeout=60):
    """Send a WAV file to the speech-to-text service and return the transcript.

    The response may contain several entries under ``"results"``; the top
    alternative of every entry is joined with single spaces so that longer
    utterances are not cut off after the first entry.

    Args:
        audio_file: Path of the WAV file to upload.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        str: The transcript, or the sentinel string
        ``"Error transcribing audio."`` when the request fails, the status
        is not 200, the body is not valid JSON, or no transcript is present.

    Raises:
        OSError: If ``audio_file`` cannot be opened.
    """
    error_message = "Error transcribing audio."
    headers = {"Content-Type": "audio/wav"}

    try:
        with open(audio_file, 'rb') as f:
            response = requests.post(
                STT_URL,
                headers=headers,
                data=f,
                auth=("apikey", STT_API_KEY),
                timeout=timeout,
            )
    except requests.RequestException as exc:
        # Network trouble should not crash the caller's loop.
        print(f"Speech-to-text request failed: {exc}")
        return error_message

    if response.status_code != 200:
        return error_message

    try:
        result = response.json()
    except ValueError:
        # Body was not valid JSON.
        return error_message

    if not isinstance(result, dict):
        return error_message

    pieces = []
    for segment in result.get("results") or []:
        alternatives = segment.get("alternatives") or []
        if not alternatives:
            continue
        transcript = (alternatives[0].get("transcript") or "").strip()
        if transcript:
            pieces.append(transcript)

    if pieces:
        return " ".join(pieces)
    return error_message

def generate_response(text):
    """Return the LLM's reply to ``text``.

    The user text is stripped and wrapped, together with a fixed system
    prompt, in ``<<SYS>>`` / ``[INST]`` markers before being sent to the
    module-level ``model``. The ``generated_text`` of the first result is
    returned with surrounding whitespace removed.
    """
    system_prompt = "You are a helpful assistant."

    # Assemble the prompt piece by piece: system section, blank line, then
    # the instruction section carrying the user's text.
    prompt_parts = [
        "<<SYS>>\n",
        system_prompt.strip(),
        "\n<</SYS>>\n\n",
        "[INST]",
        text.strip(),
        "[/INST]",
    ]
    formatted_prompt = "".join(prompt_parts)

    generation = model.generate(prompt=formatted_prompt)
    first_result = generation["results"][0]
    return first_result["generated_text"].strip()

def text_to_speech(text, voice="en-US_MichaelV3Voice", timeout=60):
    """Synthesize ``text`` to speech and save the audio as ``output.wav``.

    Args:
        text: Text to synthesize. Empty or whitespace-only text is rejected
            without contacting the service.
        voice: Voice identifier sent as the ``voice`` query parameter.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        str | None: ``"output.wav"`` on success; ``None`` if there is
        nothing to synthesize, the request fails, or the status is not 200.
    """
    if not text or not text.strip():
        return None

    headers = {
        "Content-Type": "application/json",
        "Accept": "audio/wav"
    }
    try:
        response = requests.post(
            TTS_URL,
            # Let requests build the query string so a TTS_URL that already
            # carries query parameters is still handled correctly.
            params={"voice": voice},
            headers=headers,
            data=json.dumps({"text": text}),
            auth=("apikey", TTS_API_KEY),
            timeout=timeout,
        )
    except requests.RequestException as exc:
        # Network trouble should not crash the caller's loop.
        print(f"Text-to-speech request failed: {exc}")
        return None

    if response.status_code != 200:
        print(f"Text-to-speech request returned status {response.status_code}")
        return None

    with open("output.wav", "wb") as f:
        f.write(response.content)
    return "output.wav"

def play_audio(audio_file):
    """Play a WAV file through an output stream, blocking until it finishes.

    The output stream is opened with the sample width, channel count and
    frame rate read from the file. The wave reader, the stream and the
    PyAudio instance are all released even if playback fails, so the file
    is no longer held open when this function returns.

    Args:
        audio_file: Path of the WAV file to play.
    """
    with wave.open(audio_file, 'rb') as wf:
        p = pyaudio.PyAudio()
        try:
            stream = p.open(
                format=p.get_format_from_width(wf.getsampwidth()),
                channels=wf.getnchannels(),
                rate=wf.getframerate(),
                output=True
            )
            try:
                # readframes() returns b'' at end of file, which ends the loop.
                for data in iter(lambda: wf.readframes(CHUNK), b''):
                    stream.write(data)
            finally:
                stream.stop_stream()
                stream.close()
        finally:
            p.terminate()

def main():
    """Run the voice-assistant loop until interrupted with Ctrl+C.

    Each round records audio, transcribes it, asks the LLM for a reply,
    synthesizes the reply and plays it. Temporary audio files are removed
    after every round, including rounds that fail. A failed transcription
    restarts the loop immediately; a completed round pauses for 5 seconds.
    """
    # Sentinel returned by speech_to_text() on failure. Compare for equality
    # so a real utterance that merely contains "Error" is not rejected.
    transcription_error = "Error transcribing audio."

    try:
        while True:
            audio_file = None
            response_audio = None
            try:
                # Step 1: Record audio
                audio_file = record_audio()

                # Step 2: Transcribe audio to text
                transcribed_text = speech_to_text(audio_file)
                print(f"User said: {transcribed_text}")

                if transcribed_text == transcription_error:
                    print("Failed to transcribe speech. Try again.")
                    continue

                # Step 3: Generate response with LLM
                response_text = generate_response(transcribed_text)
                print(f"AI response: {response_text}")

                # Step 4: Convert response to speech
                response_audio = text_to_speech(response_text)
                if response_audio:
                    # Step 5: Play the response
                    play_audio(response_audio)
            finally:
                # Clean up on every path, including `continue` and errors.
                for path in (audio_file, response_audio):
                    if path and os.path.exists(path):
                        os.remove(path)

            print("Pause for 5 seconds")
            time.sleep(5)
    except KeyboardInterrupt:
        # Ctrl+C is the only way out of the loop; exit without a traceback.
        print("\nExiting.")

# Start the assistant loop only when this file is run as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Recording... Speak now.
Stopped recording.
