# Local Voice Agent

## Agent Functions

In [None]:
import os
import sys
import tempfile
from datetime import datetime

# Add CosyVoice paths
cosyvoice_base = r'C:\Users\MIDTOWER\Documents\Speech Synthesis Models\CosyVoice'
matcha_path = os.path.join(cosyvoice_base, 'third_party', 'Matcha-TTS')
sys.path.insert(0, cosyvoice_base)
sys.path.insert(0, matcha_path)

# Other dependencies
from fastapi import FastAPI, UploadFile, File
from fastapi.responses import FileResponse, JSONResponse
import uvicorn
import whisper
import ollama
from cosyvoice.cli.cosyvoice import CosyVoice2
from cosyvoice.utils.file_utils import load_wav
import torch
import torchaudio

# Define components
# The objects below are created once, when this cell/module is executed, and
# are then shared by every request the app handles.
app = FastAPI(title="Voice Agent", version="1.0.0")
# Whisper speech-recognition model ("small" checkpoint); used by transcribe_audio().
asr_model = whisper.load_model("small")
model_path = os.path.join(cosyvoice_base, 'pretrained_models', 'CosyVoice2-0.5B')
# CosyVoice2 synthesizer, constructed with the JIT / TensorRT / vLLM / fp16
# options all switched off.
cosyvoice = CosyVoice2(model_path, load_jit=False, load_trt=False, load_vllm=False, fp16=False)
# Reference recording handed to zero-shot synthesis as the voice prompt.
# NOTE(review): the 16000 argument is presumably the target sample rate in Hz
# -- confirm against cosyvoice.utils.file_utils.load_wav.
prompt_audio_path = os.path.join(cosyvoice_base, 'asset', 'zero_shot_prompt.wav')
prompt_speech_16k = load_wav(prompt_audio_path, 16000)

# Conversation history
# Single module-level list of {"role": ..., "content": ...} dicts that
# generate_response() appends to; because there is only one list, every
# caller of the API shares the same conversation.
conversation_history = []

# Speech to text functionality
def transcribe_audio(audio_bytes):
    """Transcribe an uploaded audio clip to text with the Whisper model.

    The bytes are written to a uniquely named temporary file, because the
    model is given a file path, and that file is removed again once
    transcription has finished (successfully or not).

    Args:
        audio_bytes: Raw contents of the uploaded audio file.

    Returns:
        The value stored under ``"text"`` in the transcription result.

    Raises:
        ValueError: If ``audio_bytes`` is empty.
    """
    if not audio_bytes:
        raise ValueError("No audio data received")

    # delete=False plus an explicit close: the file must be closed before
    # another reader opens it by name (required on Windows), so cleanup is
    # done by hand in the ``finally`` below.
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    try:
        with tmp:
            tmp.write(audio_bytes)
        result = asr_model.transcribe(tmp.name)
    finally:
        try:
            os.remove(tmp.name)
        except OSError:
            # Best effort: a leftover temp file must not mask the real outcome.
            pass

    return result["text"]

# Prompting ollama functionality
def generate_response(user_text, model='llama2'):
    """Ask the local Ollama model for a reply and record the exchange.

    The user message is appended to the shared ``conversation_history`` and
    the last 10 entries are sent as context. On success the assistant reply
    is appended as well. If the model call fails, the user message is removed
    again so the history never contains a user turn without a reply.

    Args:
        user_text: The user's message.
        model: Name of the Ollama model to query (defaults to ``'llama2'``).

    Returns:
        The assistant's reply text.

    Raises:
        Exception: Whatever the Ollama call raises is propagated after the
            history has been rolled back.
    """
    user_turn = {"role": "user", "content": user_text}
    conversation_history.append(user_turn)

    try:
        recent_history = conversation_history[-10:]
        response = ollama.chat(model=model, messages=recent_history)
        bot_response = response['message']['content']
    except Exception:
        # Roll back only the turn added above (identity check, so an equal
        # but older entry is never removed by mistake).
        if conversation_history and conversation_history[-1] is user_turn:
            conversation_history.pop()
        raise

    conversation_history.append({"role": "assistant", "content": bot_response})

    return bot_response

# Speech synthesis functionality
def synthesize_speech(text, output_path=None):
    """Synthesize ``text`` with CosyVoice zero-shot cloning and save it as WAV.

    Every segment yielded by the synthesizer is collected and joined into a
    single waveform, so replies that the synthesizer produces in several
    pieces are saved in full rather than cut off after the first piece.

    Args:
        text: The text to speak.
        output_path: Destination file. When ``None``, a timestamped
            ``response_<YYYYmmdd_HHMMSS_mmm>.wav`` name in the current
            working directory is used.

    Returns:
        The path of the written audio file.

    Raises:
        ValueError: If ``text`` is empty or only whitespace.
        RuntimeError: If no audio was produced, or the written file is
            missing or empty.
    """
    if not text or not text.strip():
        raise ValueError("Cannot synthesize speech from empty text")

    if output_path is None:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")[:-3]
        output_path = f"response_{timestamp}.wav"

    segments = [
        chunk['tts_speech']
        for chunk in cosyvoice.inference_zero_shot(
            text,
            "Remind yourself overconfidence is a slow and insidious killer.",
            prompt_speech_16k,
            stream=False,
        )
    ]
    if not segments:
        # Checked explicitly: a stale file at a caller-supplied output_path
        # must not be mistaken for fresh output.
        raise RuntimeError(f"No audio was generated for: {output_path}")

    # NOTE(review): assumes each segment is (channels, samples), which is the
    # layout torchaudio.save expects, so segments are joined along the last axis.
    speech = segments[0] if len(segments) == 1 else torch.cat(segments, dim=-1)
    torchaudio.save(output_path, speech, cosyvoice.sample_rate)
    print(f"Saved audio: {output_path}, size: {os.path.getsize(output_path)} bytes")

    if not os.path.exists(output_path):
        raise RuntimeError(f"Output file not created: {output_path}")

    if os.path.getsize(output_path) == 0:
        raise RuntimeError(f"Output file is empty: {output_path}")

    return output_path

# API definition
@app.post("/chat/")
async def chat_endpoint(file: UploadFile = File(...)):
    """Handle one voice turn: uploaded audio in, synthesized reply audio out.

    Pipeline: read the upload, transcribe it, generate a text reply, then
    synthesize that reply to a WAV file and return it.

    Args:
        file: The uploaded audio recording.

    Returns:
        A ``FileResponse`` with the reply audio (``audio/wav``) on success.
        On failure, a JSON body ``{"error": <message>}`` with HTTP status 400
        for an empty upload or 500 for any other error, so clients can detect
        failure from the status code.
    """
    # NOTE(review): the three processing steps are synchronous calls made
    # directly inside this coroutine, so other requests wait while one is
    # being processed.
    try:
        # Step 1: Receive and transcribe audio
        audio_bytes = await file.read()
        if not audio_bytes:
            return JSONResponse(
                status_code=400,
                content={"error": "Uploaded audio file is empty"},
            )
        user_text = transcribe_audio(audio_bytes)
        print(f"User said: {user_text}")

        # Step 2: Generate response with LLM
        bot_text = generate_response(user_text)
        print(f"Bot responds: {bot_text}")

        # Step 3: Convert response to speech
        audio_path = synthesize_speech(bot_text)
        print(f"Audio generated: {audio_path}")

        # Step 4: Return audio response
        return FileResponse(audio_path, media_type="audio/wav")

    except Exception as e:
        # Top-level boundary: report the failure to the client with an error
        # status instead of a 200 that looks like success.
        print(f"Error: {e}")
        return JSONResponse(status_code=500, content={"error": str(e)})

## Server

In [None]:
import threading

def run_server(host="0.0.0.0", port=8000):
    """Serve the FastAPI app with uvicorn.

    Args:
        host: Address to bind. The default ``"0.0.0.0"`` listens on all
            network interfaces; pass ``"127.0.0.1"`` to accept local
            connections only.
        port: TCP port to listen on.
    """
    uvicorn.run(app, host=host, port=port)

# Start the server on a background thread rather than calling run_server()
# directly here. daemon=True means this thread does not keep the interpreter
# alive when the main program exits.
server_thread = threading.Thread(target=run_server, daemon=True)
server_thread.start()