# In [2]:
import pyaudio
import wave
import requests
import json
import os
from dotenv import load_dotenv
import numpy as np
import time
from ibm_watson_machine_learning.foundation_models import Model
from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams

# Load environment variables (python-dotenv) before reading any setting below.
load_dotenv()

# Watson Speech to Text / Text to Speech credentials
STT_API_KEY = os.environ.get('STT_API_KEY')
STT_URL = os.environ.get('STT_URL')
TTS_API_KEY = os.environ.get('TTS_API_KEY')
TTS_URL = os.environ.get('TTS_URL')

# LLM credentials
API_KEY = os.environ.get('API_KEY')
PROJECT_ID = os.environ.get('PROJECT_ID')
IBM_URL = os.environ.get('IBM_URL')
MODEL_ID = os.environ.get('MODEL_ID')

# Audio settings
CHUNK = 1024                 # frames read from / written to the audio stream per call
FORMAT = pyaudio.paInt16     # sample format passed to PyAudio
CHANNELS = 1                 # channel count used for recording
RATE = 44_100                # sample rate used for recording
SILENCE_THRESHOLD = 1_000    # a chunk whose mean absolute amplitude is below this counts as silent
SILENCE_DURATION = 2         # scaled by RATE / CHUNK to get the silent-chunk limit (i.e. seconds)

# LLM setup
# Fail fast with a readable message when a required setting is absent,
# instead of letting the client fail later with an opaque
# authentication error.
_missing_llm_settings = [
    name
    for name, value in (
        ("API_KEY", API_KEY),
        ("PROJECT_ID", PROJECT_ID),
        ("IBM_URL", IBM_URL),
        ("MODEL_ID", MODEL_ID),
    )
    if not value
]
if _missing_llm_settings:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_llm_settings)
    )

# Generation parameters: cap the number of newly generated tokens.
generate_params = {GenParams.MAX_NEW_TOKENS: 900}
model = Model(
    model_id=MODEL_ID,
    params=generate_params,
    credentials={"apikey": API_KEY, "url": IBM_URL},
    project_id=PROJECT_ID
)

def record_audio():
    """Record audio until sustained silence and save it as ``input.wav``.

    Audio is read in blocks of ``CHUNK`` frames using ``FORMAT``,
    ``CHANNELS`` and ``RATE``. A block counts as silent when its mean
    absolute amplitude is below ``SILENCE_THRESHOLD``; recording stops once
    more than ``SILENCE_DURATION * RATE / CHUNK`` consecutive blocks are
    silent.

    Returns:
        str: The path of the written WAV file (always ``"input.wav"``).
    """
    # Number of consecutive silent chunks that ends the recording.
    max_silent_chunks = SILENCE_DURATION * RATE / CHUNK

    p = pyaudio.PyAudio()
    try:
        # Query the sample width before the PyAudio instance is terminated.
        sample_width = p.get_sample_size(FORMAT)
        stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK)
        try:
            print("녹음중... 말을 해주세요.")

            frames = []
            silent_chunks = 0

            while True:
                data = stream.read(CHUNK)
                frames.append(data)
                # Widen to int32 first: abs(-32768) does not fit in int16
                # and would stay negative, skewing the mean.
                samples = np.frombuffer(data, dtype=np.int16).astype(np.int32)
                amplitude = np.abs(samples).mean()
                if amplitude < SILENCE_THRESHOLD:
                    silent_chunks += 1
                else:
                    silent_chunks = 0
                if silent_chunks > max_silent_chunks:
                    break
        finally:
            # Release the stream even when reading fails part-way through.
            stream.stop_stream()
            stream.close()
    finally:
        p.terminate()

    # The context manager closes the file even if writing raises.
    with wave.open("input.wav", 'wb') as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(sample_width)
        wf.setframerate(RATE)
        wf.writeframes(b''.join(frames))

    return "input.wav"

def speech_to_text(audio_file):
    """Transcribe a WAV file via the speech-to-text ``/v1/recognize`` endpoint.

    Args:
        audio_file: Path of the WAV file to upload.

    Returns:
        str: The transcripts of all recognised segments joined by spaces
        (top alternative of each segment). Returns ``"(음성 인식 결과 없음)"``
        when nothing was recognised, and ``"(STT 오류)"`` when the request
        fails, the service answers with a non-200 status, or the response
        body is not valid JSON.
    """
    endpoint = f"{STT_URL}/v1/recognize"
    headers = {"Content-Type": "audio/wav"}
    auth = ("apikey", STT_API_KEY)

    try:
        with open(audio_file, 'rb') as f:
            response = requests.post(
                endpoint,
                headers=headers,
                data=f,
                auth=auth,
                # (connect, read) timeouts so a stalled service cannot
                # hang the conversation loop forever.
                timeout=(10, 60),
            )
    except requests.RequestException as e:
        print("STT 오류:", str(e))
        return "(STT 오류)"

    if response.status_code != 200:
        print("STT 오류:", response.status_code)
        print(response.text)
        return "(STT 오류)"

    try:
        result = response.json()
    except ValueError as e:
        # requests' JSON decode errors derive from ValueError.
        print("STT 오류:", str(e))
        return "(STT 오류)"

    # The service may split one recording into several result segments;
    # keep the top alternative of every segment rather than only the first.
    transcripts = [
        segment["alternatives"][0]["transcript"].strip()
        for segment in result.get("results") or []
        if segment.get("alternatives")
    ]
    joined = " ".join(part for part in transcripts if part)
    if not joined:
        return "(음성 인식 결과 없음)"
    return joined

def generate_response(text):
    """Generate the assistant's reply to ``text`` with the module-level model.

    The stripped user text is wrapped, together with a fixed system prompt,
    in the ``<<SYS>>`` / ``[INST]`` template before being sent to the model.

    Args:
        text: The user's utterance.

    Returns:
        str: The stripped generated text, or ``"(LLM 오류)"`` if generation
        raised (the error is printed).
    """
    system_prompt = "You are a helpful assistant."
    formatted_prompt = (
        f"<<SYS>>\n{system_prompt.strip()}\n<</SYS>>\n\n"
        f"[INST]{text.strip()}[/INST]"
    )
    try:
        generation = model.generate(prompt=formatted_prompt)
        first_result = generation["results"][0]
        return first_result["generated_text"].strip()
    except Exception as e:
        # Best-effort boundary: report the failure and hand back a placeholder.
        print("LLM 오류:", str(e))
        return "(LLM 오류)"

def text_to_speech(text, voice="en-US_MichaelV3Voice"):
    """Synthesize ``text`` via the ``/v1/synthesize`` endpoint into ``output.wav``.

    Args:
        text: The text to speak.
        voice: Value sent as the ``voice`` query parameter. Defaults to the
            voice this function always used.

    Returns:
        str | None: ``"output.wav"`` on success; ``None`` when the request
        fails or the service answers with a non-200 status (the error is
        printed).
    """
    endpoint = f"{TTS_URL}/v1/synthesize"
    headers = {"Content-Type": "application/json", "Accept": "audio/wav"}
    payload = {
        "text": text
    }
    params = {"voice": voice}
    auth = ("apikey", TTS_API_KEY)

    try:
        # A streamed response keeps its connection open until it is closed,
        # so use it as a context manager.
        with requests.post(
            endpoint,
            headers=headers,
            params=params,
            json=payload,
            auth=auth,
            stream=True,
            # (connect, read) timeouts so a stalled service cannot hang
            # the conversation loop forever.
            timeout=(10, 60),
        ) as response:
            if response.status_code != 200:
                print("TTS 오류:", response.status_code)
                print(response.text)
                return None

            with open("output.wav", "wb") as f:
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
    except requests.RequestException as e:
        print("TTS 오류:", str(e))
        return None

    return "output.wav"

def play_audio(audio_file):
    """Play a WAV file through a PyAudio output stream.

    The stream is opened with the sample width, channel count and frame rate
    read from the file, and the audio is written in blocks of ``CHUNK``
    frames.

    Args:
        audio_file: Path of the WAV file to play.
    """
    # Closing the reader matters: the caller deletes the file afterwards,
    # which fails on some platforms while a handle is still open.
    with wave.open(audio_file, 'rb') as wf:
        p = pyaudio.PyAudio()
        try:
            stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),
                            channels=wf.getnchannels(),
                            rate=wf.getframerate(),
                            output=True)
            try:
                data = wf.readframes(CHUNK)
                while data:
                    stream.write(data)
                    data = wf.readframes(CHUNK)
            finally:
                # Release the stream even when playback fails part-way through.
                stream.stop_stream()
                stream.close()
        finally:
            p.terminate()

def main():
    """Run the voice-assistant loop: record, transcribe, answer, speak, repeat.

    Each turn records the user, transcribes the recording, asks the LLM for
    a reply, synthesizes and plays it, deletes the temporary audio files and
    then waits five seconds. The loop never returns on its own.
    """
    # Placeholder strings the pipeline stages return instead of real text.
    # They must not be forwarded to the next stage as if the user (or the
    # model) had actually said them.
    stt_failures = ("(음성 인식 결과 없음)", "(STT 오류)")
    llm_failure = "(LLM 오류)"

    while True:
        audio_file = None
        response_audio = None
        try:
            audio_file = record_audio()    # 1. Record
            transcribed = speech_to_text(audio_file)   # 2. Speech -> text
            print("사용자:", transcribed)

            if transcribed not in stt_failures:
                response_text = generate_response(transcribed)   # 3. LLM builds the answer
                print("AI:", response_text)

                if response_text and response_text != llm_failure:
                    response_audio = text_to_speech(response_text)   # 4. Answer -> speech
                    if response_audio:
                        play_audio(response_audio)  # 5. Play the spoken answer
        finally:
            # File cleanup: runs even when a stage raised, and a failure to
            # delete one file does not prevent deleting the other.
            for path in (audio_file, response_audio):
                if not path:
                    continue
                try:
                    os.remove(path)
                except OSError as e:
                    print("파일 삭제 오류:", str(e))

        print("5초 후 다음 입력을 대기합니다...")
        time.sleep(5)

if __name__ == "__main__":
    main()


# Output recorded from running the cell above:
#
# Error getting IAM Token.
# Reason: <Response [400]>
#
#
# WMLClientError: Error getting IAM Token.
# Reason: <Response [400]>