# CARA: Validation Prototype

This notebook implements the "Patient Actor" validation loop using:
- **Brain**: Google Gemini (`gemini-3-flash-preview`, called through LangChain/LangGraph)
- **Ears**: Local Faster-Whisper (via `cara.engines`)
- **Mouth**: Local Chatterbox TTS (via `cara.engines`)

### Dependencies
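Speech recognition and speech synthesis run **locally** using the `cara-audio` source code (imported from `../src`). The LLM is the only remote component: it calls the Gemini API and needs `GOOGLE_API_KEY` to be set in the environment or in a `.env` file.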

In [1]:
# 1. Setup Environment & Path
import sys
import os
import asyncio
from pathlib import Path
from dotenv import load_dotenv

# Make the local `cara` package importable: add ../src to sys.path (only once,
# so re-running this cell does not pile up duplicate entries).
project_root = Path("..").resolve()
src_path = project_root / "src"
if str(src_path) not in sys.path:
    sys.path.append(str(src_path))

# Pull variables from a local .env file into the process environment.
load_dotenv()

# Gemini key. Fail early with an actionable message: assigning None to
# os.environ would raise an opaque "TypeError: str expected, not NoneType".
google_api_key = os.getenv("GOOGLE_API_KEY")
if not google_api_key:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )
os.environ["GOOGLE_API_KEY"] = google_api_key

In [2]:
# 2. Initialize Local Models (STT/TTS)
from cara.models import model_manager
from cara.engines.streaming import StreamingTTSEngine
from cara.config import get_settings
from cara.engines.transcription import TranscriptionEngine
from cara.utils.audio import audio_to_wav_bytes
import torch

# Load the local models through model_manager. The is_loaded guard skips the
# load when the manager reports the models are already in memory, so this cell
# can be re-run without triggering a second load.
print("‚è≥ Loading Local Models (This may take a moment)...\n")
if not model_manager.is_loaded:
    await model_manager.load_all()
print("\n‚úÖ Models Loaded! Ready to speak.")

‚è≥ Loading Local Models (This may take a moment)...



  from pkg_resources import resource_filename
  from .autonotebook import tqdm as notebook_tqdm
Fetching 6 files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:00<00:00, 6741.45it/s]
  deprecate("LoRACompatibleLinear", "1.0.0", deprecation_message)


loaded PerthNet (Implicit) at step 250,000

‚úÖ Models Loaded! Ready to speak.


## 3. Audio Tools (Local Wrapper)

In [3]:
import sounddevice as sd
from scipy.io.wavfile import write
import numpy as np
import collections

# VAD (voice activity detection) parameters used by listen_local
SAMPLE_RATE = 16000 # Capture rate in Hz (Whisper likes 16k)
BLOCK_SIZE = 1024 # Frames read from the input stream per iteration
THRESHOLD = 0.02 # RMS level a block must exceed to start recording
SILENCE_THRESHOLD = 0.015 # Blocks with RMS below this count as silence
SILENCE_DURATION = 1.0 # Seconds of continuous silence that end the recording
MAX_DURATION = 30.0 # Safety cutoff: maximum recording length in seconds

async def listen_local() -> str:
    """Record one utterance from the microphone and transcribe it locally.

    Blocks of ``BLOCK_SIZE`` frames are read from an input stream until one
    exceeds ``THRESHOLD`` (RMS). Recording then continues until the signal
    stays below ``SILENCE_THRESHOLD`` for more than ``SILENCE_DURATION``
    seconds, or until the recording is longer than ``MAX_DURATION`` seconds.
    The audio is written to ``input_local.wav`` as 16-bit PCM and handed to
    the local transcription engine (language fixed to Italian).

    Note: ``stream.read`` is a blocking call, so the event loop is held for
    the whole time this function is waiting for or recording audio.

    Returns:
        The stripped transcription text, or ``""`` if any step raises
        (the error is printed, not propagated).
    """
    print("üé§ Listening... (Start speaking to activate)")

    recording = []

    try:
        with sd.InputStream(samplerate=SAMPLE_RATE, channels=1, callback=None, blocksize=BLOCK_SIZE, dtype='float32') as stream:
            # 1. Wait for speech: discard blocks until one is loud enough.
            print("   [Waiting for voice...]")
            while True:
                indata, _ = stream.read(BLOCK_SIZE)
                rms = np.sqrt(np.mean(indata**2))
                if rms > THRESHOLD:
                    print("   [Detected Voice! Recording...]")
                    recording.append(indata)
                    break

            # 2. Record until silence
            silence_start = None
            while True:
                indata, _ = stream.read(BLOCK_SIZE)
                recording.append(indata)
                rms = np.sqrt(np.mean(indata**2))

                if rms < SILENCE_THRESHOLD:
                    if silence_start is None:
                        # First quiet block: remember when the silence began.
                        silence_start = stream.time
                    elif stream.time - silence_start > SILENCE_DURATION:
                        print("   [Silence detected. Stopping.]")
                        break
                else:
                    # Any loud block resets the silence timer.
                    silence_start = None

                if len(recording) * BLOCK_SIZE / SAMPLE_RATE > MAX_DURATION:
                    print("   [Max duration reached.]")
                    break

        # 3. Save
        if not recording:
            return ""

        full_audio = np.concatenate(recording, axis=0)

        # Scale float32 (-1..1) to int16 for wavfile write. Clip first: samples
        # outside [-1, 1] would otherwise overflow the int16 range when cast.
        pcm16 = (np.clip(full_audio, -1.0, 1.0) * 32767).astype(np.int16)

        # Save temp file
        temp_wav = "input_local.wav"
        write(temp_wav, SAMPLE_RATE, pcm16)

        # Transcribe using Local Engine
        print("üëÇ Transcribing (Local Whisper)...")
        settings = get_settings()
        engine = TranscriptionEngine(model_manager.stt, settings)

        result = await engine.transcribe(temp_wav, language="it")

        text = result["text"].strip()
        print(f"üë§ You: {text}")
        return text

    except Exception as e:
        # Best effort: report the problem and let the caller treat it as "nothing heard".
        print(f"\n‚ö†Ô∏è Audio Input Error: {e}")
        return ""

async def speak_local(text: str):
    """Synthesize ``text`` with the local TTS engine and play it while it streams.

    Args:
        text: The sentence(s) to speak. Empty or falsy input is a no-op.

    Chunks produced by the streaming TTS engine (language fixed to Italian)
    are written to a raw int16 output stream opened at the TTS engine's sample
    rate. Errors during playback are printed, not propagated.
    """
    if not text:
        return
    print(f"ü§ñ CARA: {text}")
    print("üó£Ô∏è Speaking (Streaming)...")

    tts_engine = model_manager.tts
    settings = get_settings()
    stream_engine = StreamingTTSEngine(tts_engine, settings)

    try:
        # Raw stream for PCM16 data. Used as a context manager so the stream is
        # started on entry and always stopped and closed on exit, including
        # when the TTS engine or a write fails midway through playback.
        with sd.RawOutputStream(
            samplerate=tts_engine.sample_rate,
            channels=1,
            dtype='int16',  # PCM16
        ) as stream:
            async for chunk in stream_engine.stream(text, language="it"):
                stream.write(chunk)
    except Exception as e:
        print(f"Streaming Error: {e}")


## 4. The Brain (LangGraph + Gemini)

In [4]:
from typing import Annotated, TypedDict
from langgraph.graph import StateGraph, START, END
from langgraph.graph.message import add_messages
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import SystemMessage, HumanMessage

# State
class AgentState(TypedDict):
    """State carried through the graph: the conversation history."""
    # Message list annotated with LangGraph's add_messages, which the graph
    # uses as the reducer when a node returns a "messages" update.
    messages: Annotated[list, add_messages]

# Model: the Gemini chat model invoked by chatbot_node (temperature 0.7).
llm = ChatGoogleGenerativeAI(model="gemini-3-flash-preview", temperature=0.7)

# Prompt (Italian, sent as the system message on every call): CARA is a loving
# voice assistant for an elderly lady; it must answer in Italian, briefly
# (at most 2 sentences), in a warm, caring, slow tone.
SYSTEM_PROMPT = """
Sei CARA, un'assistente vocale amorevole per un'anziana signora.
Parla IN ITALIANO. Risposte BREVI (massimo 2 frasi).
Tono: Caldo, premuroso, lento.
"""

# Graph
def chatbot_node(state: AgentState):
    """Graph node that asks the LLM for the next assistant reply.

    The system prompt is placed in front of the conversation history on every
    call; the model's answer is returned as a one-element ``messages`` update.
    """
    system_message = SystemMessage(content=SYSTEM_PROMPT)
    conversation = [system_message] + state["messages"]
    reply = llm.invoke(conversation)
    return {"messages": [reply]}

# Minimal linear graph: START -> chatbot -> END, i.e. exactly one LLM call per
# invocation. `app` is the compiled graph that run_loop invokes.
graph_builder = StateGraph(AgentState)
graph_builder.add_node("chatbot", chatbot_node)
graph_builder.add_edge(START, "chatbot")
graph_builder.add_edge("chatbot", END)
app = graph_builder.compile()

In [5]:
# 5. Run The Loop

# Conversation history shared with run_loop, which appends to it on every turn.
# It lives in its own cell: re-running the loop cell keeps the existing
# history, re-running this cell starts a fresh conversation.
state = {"messages": []}

In [6]:
def _content_to_text(content) -> str:
    """Flatten a chat message's content into plain text.

    Content may be a plain string or a list of blocks. List blocks are either
    plain strings or dicts carrying a "text" key; blocks without text are
    skipped. Text parts are joined with single spaces.
    """
    if isinstance(content, str):
        return content
    if not isinstance(content, list):
        return "" if content is None else str(content)

    parts = []
    for block in content:
        if isinstance(block, str):
            # String blocks must not be indexed with ["text"] (TypeError).
            parts.append(block)
        elif isinstance(block, dict) and "text" in block:
            parts.append(block["text"])
    return " ".join(parts)


async def run_loop():
    """Run the listen -> think -> speak conversation loop until interrupted.

    Each turn records and transcribes one utterance, appends it to the shared
    ``state`` history, invokes the compiled graph ``app`` for a reply, stores
    that reply in the history and speaks it. Turns where nothing was
    transcribed (or audio input failed) are skipped.
    """
    print("üîÅ Starting Conversation Loop. Press Ctrl+C directly in the kernel to stop.")
    while True:
        # Yield to the event loop once per turn. listen_local performs blocking
        # reads and may return "" without ever suspending, so without this the
        # loop could spin and a pending cancellation would never be delivered.
        await asyncio.sleep(0)

        # Listen
        user_input = await listen_local()
        if not user_input:
            # No speech detected or error, just continue listening
            continue

        # Think
        print("üß† Thinking...")
        state["messages"].append(HumanMessage(content=user_input))
        result = await app.ainvoke(state)
        reply = result["messages"][-1]
        state["messages"].append(reply)

        # Speak
        await speak_local(_content_to_text(reply.content))

# Run it!
try:
    await run_loop()
except KeyboardInterrupt:
    print("\nüõë Loop Stopped by User.")