# CARA: Validation Prototype

This notebook implements the "Patient Actor" validation loop using:
- **Brain**: Google Gemini (`gemini-1.5-flash`)
- **Ears**: Local Faster-Whisper (via `cara.engines`)
- **Mouth**: Local Chatterbox TTS (via `cara.engines`)

### Dependencies
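The speech models (STT and TTS) run **locally** from the `cara-audio` source tree, imported from `../src`. The Gemini brain is a remote API call and requires `GOOGLE_API_KEY` to be set (for example in a `.env` file).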

In [None]:
# 1. Setup Environment & Path
import sys
import os
import asyncio
from pathlib import Path
from dotenv import load_dotenv

# Add ../src to the Python path so the local 'cara' package can be imported.
# The relative path is resolved against the kernel's current working directory.
project_root = Path("..").resolve()
src_path = project_root / "src"
if str(src_path) not in sys.path:
    sys.path.append(str(src_path))

# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()

# Gemini key: fail early with an actionable message. The previous
# `os.environ[...] = os.getenv(...)` self-assignment raised an opaque
# "TypeError: str expected, not NoneType" when the key was missing and was a
# no-op when it was present.
if not os.getenv("GOOGLE_API_KEY"):
    raise RuntimeError(
        "GOOGLE_API_KEY is not set. Add it to a .env file or export it "
        "in the environment before running this notebook."
    )

In [None]:
# 2. Initialize Local Models (STT/TTS)
from cara.models import model_manager
from cara.config import get_settings
from cara.engines.transcription import TranscriptionEngine
from cara.utils.audio import audio_to_wav_bytes
import torch

print("⏳ Loading Local Models (This may take a moment)...\n")
if not model_manager.is_loaded:
    await model_manager.load_all()
print("\n✅ Models Loaded! Ready to speak.")

## 3. Audio Tools (Local Wrapper)

In [None]:
import sounddevice as sd
from scipy.io.wavfile import write
import numpy as np

async def listen_local(duration: float = 5, fs: int = 44100, language: str = "it") -> str:
    """Record from the microphone and transcribe the clip with the local STT engine.

    Flow: microphone -> WAV file on disk -> ``TranscriptionEngine``. If any step
    raises, the error is printed and the user is asked to type the message
    instead, so a missing audio device does not end the session.

    Args:
        duration: Recording length in seconds.
        fs: Recording sample rate in Hz.
        language: Language code passed to the transcription engine.

    Returns:
        The transcript (or the typed fallback text), stripped of surrounding
        whitespace. May be an empty string.
    """
    print(f"🎤 Listening ({duration}s)...")

    try:
        # Record a mono clip; sd.wait() blocks until the recording has finished.
        recording = sd.rec(int(duration * fs), samplerate=fs, channels=1)
        sd.wait()

        # Save as 16-bit PCM. The scaling assumes float samples in [-1, 1]
        # (presumably sounddevice's default float format — confirm). Clipping
        # first makes an out-of-range sample saturate instead of wrapping
        # around in the int16 cast.
        temp_wav = "input_local.wav"
        pcm16 = (np.clip(recording, -1.0, 1.0) * 32767).astype(np.int16)
        write(temp_wav, fs, pcm16)

        # Transcribe using the local engine.
        print("👂 Transcribing (Local Whisper)...")
        settings = get_settings()
        engine = TranscriptionEngine(model_manager.stt, settings)

        # Awaited directly; per the original note this could be moved to a
        # thread pool if transcription ever turns out to be too slow.
        result = await engine.transcribe(temp_wav, language=language)

        text = result["text"].strip()
        print(f"👤 You: {text}")
        return text
    except Exception as e:
        # Deliberate best-effort: any recording/transcription failure falls
        # back to typed input rather than aborting the turn.
        print(f"\n⚠️ Audio Input Error: {e}")
        print("Falling back to text input interface.")
        print("-" * 30)
        # Strip so typed input is normalized the same way as a transcript.
        return input("Type your message to CARA: ").strip()

async def speak_local(text: str) -> None:
    """Synthesize ``text`` with the local TTS engine and play it in the notebook.

    Flow: text -> ``model_manager.tts.generate`` (run in the default executor)
    -> WAV bytes -> ``output_local.wav`` -> IPython audio widget with autoplay.

    Args:
        text: What CARA should say. Empty, ``None`` or whitespace-only input is
            ignored, so nothing is printed, generated or played.
    """
    if not text or not text.strip():
        return
    print(f"🤖 CARA: {text}")
    print("🗣️ Generating Speech (Local Chatterbox)...")

    # Default speaker is used. NOTE(review): the original note says a
    # 'speaker_wav' path can be passed for voice cloning — confirm against the
    # engine's actual signature before relying on it.
    tts_engine = model_manager.tts

    def _generate():
        return tts_engine.generate(
            text=text,
            language="it",  # Multilingual model supports IT
            temperature=0.7,
            exaggeration=0.5,  # Warm/Balanced
        )

    # Run generation in the default executor so the event loop is not blocked.
    # get_running_loop() is the supported way to obtain the loop from inside a
    # coroutine; get_event_loop() is discouraged there.
    loop = asyncio.get_running_loop()
    wav_tensor = await loop.run_in_executor(None, _generate)

    # Tensor -> NumPy -> WAV bytes at the engine's sample rate.
    audio_np = wav_tensor.squeeze().cpu().numpy()
    wav_bytes = audio_to_wav_bytes(audio_np.tolist(), sample_rate=tts_engine.sample_rate)

    # Save to disk, then hand the file to the notebook's audio player.
    with open("output_local.wav", "wb") as f:
        f.write(wav_bytes)

    from IPython.display import Audio, display
    display(Audio("output_local.wav", autoplay=True))

## 4. The Brain (LangGraph + Gemini)

In [None]:
from typing import Annotated, TypedDict
from langgraph.graph import StateGraph, START, END
from langgraph.graph.message import add_messages
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import SystemMessage, HumanMessage

# State
class AgentState(TypedDict):
    """State schema passed to ``StateGraph``: the conversation messages."""
    # List annotated with LangGraph's `add_messages`; presumably this makes a
    # node's returned messages get merged into the list instead of replacing
    # it — TODO confirm against the LangGraph docs.
    messages: Annotated[list, add_messages]

# Model
# Gemini chat model called by the chatbot node, sampled at temperature 0.7.
# NOTE(review): presumably reads its API key from the GOOGLE_API_KEY
# environment variable — confirm against the langchain-google-genai docs.
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0.7)

# Prompt
# System prompt, written in Italian because it is sent to the model verbatim.
# English gloss: "You are CARA, a loving voice assistant for an elderly lady.
# Speak IN ITALIAN. SHORT answers (at most 2 sentences). Tone: warm, caring, slow."
SYSTEM_PROMPT = """
Sei CARA, un'assistente vocale amorevole per un'anziana signora.
Parla IN ITALIANO. Risposte BREVI (massimo 2 frasi).
Tono: Caldo, premuroso, lento.
"""

# Graph
def chatbot_node(state: AgentState):
    """Call the LLM with the system prompt followed by the conversation so far.

    Returns a state update whose ``messages`` list holds only the model's reply.
    """
    system_message = SystemMessage(content=SYSTEM_PROMPT)
    conversation = [system_message] + state["messages"]
    reply = llm.invoke(conversation)
    return {"messages": [reply]}

# Single-node graph: START -> "chatbot" -> END, so each invocation runs
# chatbot_node exactly once.
graph_builder = StateGraph(AgentState)
graph_builder.add_node("chatbot", chatbot_node)
graph_builder.add_edge(START, "chatbot")
graph_builder.add_edge("chatbot", END)
# Compile the builder into `app`, the object the run loop calls via `ainvoke`.
app = graph_builder.compile()

In [None]:
# 5. Run The Loop

state = {"messages": []}

async def run_turn():
    # Listen
    user_input = await listen_local()
    if not user_input:
        return
        
    # Think
    print("🧠 Thinking...")
    state["messages"].append(HumanMessage(content=user_input))
    result = await app.ainvoke(state)
    ai_response = result["messages"][-1].content
    state["messages"].append(result["messages"][-1])
    
    # Speak
    await speak_local(ai_response)

# Run it!
await run_turn()