# CARA: Validation Prototype

This notebook implements the "Patient Actor" validation loop using:
- **Brain**: Google Gemini (`gemini-3-flash-preview`, via `langchain_google_genai`)
- **Ears**: Local Faster-Whisper (via `cara.engines`)
- **Mouth**: Local Chatterbox TTS (via `cara.engines`)

### Dependencies
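The speech models (STT/TTS) run **locally** using the `cara-audio` source code. The Gemini "brain" is a remote API and requires `GOOGLE_API_KEY` to be set in the environment or in a `.env` file.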

In [1]:
# 1. Setup Environment & Path
import sys
import os
import asyncio
from pathlib import Path
from dotenv import load_dotenv

# Add ../src to the Python path so the local 'cara' package can be imported.
project_root = Path("..").resolve()
src_path = project_root / "src"
if str(src_path) not in sys.path:
    sys.path.append(str(src_path))

# Pull variables from a .env file (if any) into os.environ.
load_dotenv()

# Gemini key: fail early with a readable message instead of the opaque
# "TypeError: str expected, not NoneType" that assigning a missing value
# back into os.environ would raise.
google_api_key = os.getenv("GOOGLE_API_KEY")
if not google_api_key:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set. Export it or add it to a .env file "
        "before running this notebook."
    )
os.environ["GOOGLE_API_KEY"] = google_api_key

In [2]:
# 2. Initialize Local Models (STT/TTS)
from cara.models import model_manager
from cara.config import get_settings
from cara.engines.transcription import TranscriptionEngine
from cara.utils.audio import audio_to_wav_bytes
import torch

print("‚è≥ Loading Local Models (This may take a moment)...\n")
if not model_manager.is_loaded:
    await model_manager.load_all()
print("\n‚úÖ Models Loaded! Ready to speak.")

‚è≥ Loading Local Models (This may take a moment)...



  from pkg_resources import resource_filename
  from .autonotebook import tqdm as notebook_tqdm
Fetching 6 files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:00<00:00, 72523.99it/s]
  deprecate("LoRACompatibleLinear", "1.0.0", deprecation_message)


loaded PerthNet (Implicit) at step 250,000

‚úÖ Models Loaded! Ready to speak.


## 3. Audio Tools (Local Wrapper)

In [3]:
import sounddevice as sd
from scipy.io.wavfile import write
import numpy as np

async def listen_local(duration: float = 5, fs: int = 44100) -> str:
    """Record from the microphone, save a WAV file, and transcribe it locally.

    Args:
        duration: Recording length in seconds. Defaults to 5.
        fs: Recording sample rate in Hz. Defaults to 44100.

    Returns:
        The stripped transcription text. If recording or transcription raises,
        the error is printed and the text typed at an ``input()`` prompt is
        returned instead.
    """
    print(f"üé§ Listening ({duration}s)...")

    try:
        # Record a mono clip and block until the capture has finished.
        recording = sd.rec(int(duration * fs), samplerate=fs, channels=1)
        sd.wait()

        # Save temp file as 16-bit PCM. Clip first so any sample outside
        # [-1, 1] saturates instead of wrapping around in the int16 cast.
        temp_wav = "input_local.wav"
        pcm16 = (np.clip(recording, -1.0, 1.0) * 32767).astype(np.int16)
        write(temp_wav, fs, pcm16)

        # Transcribe using Local Engine
        print("üëÇ Transcribing (Local Whisper)...")
        settings = get_settings()
        engine = TranscriptionEngine(model_manager.stt, settings)

        # Awaited directly rather than offloaded to a thread pool.
        result = await engine.transcribe(temp_wav, language="it")

        text = result["text"].strip()
        print(f"üë§ You: {text}")
        return text
    except Exception as e:
        # Deliberate best-effort: any failure (no audio device, STT error)
        # degrades to typed input so the loop can still be exercised.
        print(f"\n‚ö†Ô∏è Audio Input Error: {e}")
        print("Falling back to text input interface.")
        print("-" * 30)
        return input("Type your message to CARA: ")

async def speak_local(text: str):
    """Synthesize ``text`` with the local TTS engine and play it in the notebook.

    Args:
        text: The sentence to speak. Empty or falsy input is a no-op.

    Side effects:
        Prints progress messages, writes ``output_local.wav`` in the current
        working directory, and displays an autoplaying audio widget.
    """
    if not text:
        return
    print(f"ü§ñ CARA: {text}")
    print("üó£Ô∏è Generating Speech (Local Chatterbox)...")

    # Note: Using default speaker. You can pass 'speaker_wav' path here for cloning.
    tts_engine = model_manager.tts

    def _generate():
        return tts_engine.generate(
            text=text,
            language="it",  # Multilingual model supports IT
            temperature=0.7,
            exaggeration=0.5,  # Warm/Balanced
        )

    # Run generation in the default executor so the event loop is not blocked.
    # get_running_loop() is the recommended call inside a coroutine.
    loop = asyncio.get_running_loop()
    wav_tensor = await loop.run_in_executor(None, _generate)

    # Convert to WAV bytes.
    # NOTE(review): assumes generate() returns a torch tensor — confirm.
    audio_np = wav_tensor.squeeze().cpu().numpy()
    wav_bytes = audio_to_wav_bytes(audio_np.tolist(), sample_rate=tts_engine.sample_rate)

    # Save and Play
    with open("output_local.wav", "wb") as f:
        f.write(wav_bytes)

    # Playback
    from IPython.display import Audio, display
    display(Audio("output_local.wav", autoplay=True))

## 4. The Brain (LangGraph + Gemini)

In [4]:
from typing import Annotated, TypedDict
from langgraph.graph import StateGraph, START, END
from langgraph.graph.message import add_messages
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import SystemMessage, HumanMessage

# State
class AgentState(TypedDict):
    """State schema handed to `StateGraph`; it carries the conversation messages."""
    # Message list annotated with `add_messages` from `langgraph.graph.message`.
    # NOTE(review): presumably this makes node outputs merge into the list
    # rather than replace it — confirm against the LangGraph docs.
    messages: Annotated[list, add_messages]

# System prompt (Italian) sent ahead of the conversation on every model call.
SYSTEM_PROMPT = """
Sei CARA, un'assistente vocale amorevole per un'anziana signora.
Parla IN ITALIANO. Risposte BREVI (massimo 2 frasi).
Tono: Caldo, premuroso, lento.
"""

# Gemini chat model used by the chatbot node.
llm = ChatGoogleGenerativeAI(
    model="gemini-3-flash-preview",
    temperature=0.7,
)

# Graph
def chatbot_node(state: AgentState):
    """Call the LLM with the system prompt followed by the conversation so far.

    Returns a dict whose "messages" entry is a one-element list holding the
    model's reply.
    """
    system_message = SystemMessage(content=SYSTEM_PROMPT)
    conversation = [system_message] + state["messages"]
    reply = llm.invoke(conversation)
    return {"messages": [reply]}

# Single-node graph: START -> "chatbot" -> END.
graph_builder = StateGraph(AgentState)
graph_builder.add_node("chatbot", chatbot_node)
graph_builder.add_edge(START, "chatbot")
graph_builder.add_edge("chatbot", END)
# Compile the graph into `app`, which the conversation loop awaits via `app.ainvoke(...)`.
app = graph_builder.compile()

In [11]:
# 5. Run The Loop

# Conversation state shared across turns: `run_turn` appends each human and AI
# message to this list. Re-running this cell resets the history.
state = {"messages": []}

In [17]:


def _extract_text(content) -> str:
    """Return the plain text of a chat message's ``content``.

    ``content`` may be a plain string or a list of content blocks (strings or
    dicts such as ``{"type": "text", "text": ...}``). Text blocks are
    concatenated; non-text blocks are ignored.
    """
    if isinstance(content, str):
        return content
    parts = []
    for block in content or []:
        if isinstance(block, str):
            parts.append(block)
        elif isinstance(block, dict) and block.get("type", "text") == "text":
            parts.append(block.get("text", ""))
    return "".join(parts)


async def run_turn():
    """Run one listen -> think -> speak turn against the shared ``state``.

    Appends the user's message and the model's reply to ``state["messages"]``.
    Returns ``None``; does nothing if no user input was captured.
    """
    # Listen
    user_input = await listen_local()
    if not user_input:
        return

    # Think
    print("üß† Thinking...")
    state["messages"].append(HumanMessage(content=user_input))
    result = await app.ainvoke(state)
    ai_message = result["messages"][-1]
    state["messages"].append(ai_message)
    # Content can be a str or a list of blocks depending on the model, so
    # indexing content[0]['text'] directly is not safe.
    ai_response = _extract_text(ai_message.content)

    # Speak
    await speak_local(ai_response)

# Run it!
# Executes a single listen -> think -> speak turn; re-run this cell for each further turn.
# The top-level `await` relies on the notebook kernel running the cell in an event loop.
await run_turn()

üé§ Listening (5s)...

‚ö†Ô∏è Audio Input Error: Error querying device -1
Falling back to text input interface.
------------------------------
üß† Thinking...
ü§ñ CARA: Arriviamo a diciannove, cara. Sei proprio brava e veloce con i numeri.
üó£Ô∏è Generating Speech (Local Chatterbox)...


Sampling:  11%|‚ñà         | 107/1000 [00:02<00:23, 37.21it/s]


In [18]:
# Inspect the accumulated conversation history (raw message objects, including provider metadata).
state

{'messages': [HumanMessage(content='ciao', additional_kwargs={}, response_metadata={}, id='c058da93-a64f-41cd-afbb-fdfdc6f9239e'),
  AIMessage(content=[{'type': 'text', 'text': 'Ciao, cara. Come ti senti oggi? Sono qui per farti compagnia.', 'extras': {'signature': 'EqEECp4EAXLI2nytatYpRlKTon7XeuxF+VwdqKijr9Aq3Rzph8+9u0JVRuykUG4f5gqN/JY+/9vbpq3jzrMv5XfmW+LidUNgHLe3Eu7xURKBbnR4v/NUS/Bi4sEuhHTgKvVxeqrt1BlivyHjzlKLB+BeNFk5isrIjZr6k+ndLw5jYnCHJ/2LgJSMkc3mDooP8+ilMed5b7w/EauitxvXeBmaGpdkYOtaaCudtXNqlxV4P280D3Umab11oO93zEVczPnsUrE664vASp1XTw5BGEtdU62TQqGBqVlXlvnfeyTpt7SHNA/S7PZlbSdggmNVY7hhdClxKxD9/GJPw5T2fExROVbKc8hvK0cOFgly57OypybDyZAG8jCAsw0yYD6UQG4m9t8R73fmESOZwZlXDqI1jAUhJhB4+1TFHFFFcxavorFuXFl6fD5jQCqNgWfaZYRb/+WXikBRcdzyAYODfsN22jEvdjXDx5OTmNtoaa9pa9A0ygNGo5SpZufmdUUBP+vQCB2EylPlY60KRwaJeohO9yuTZhOI2EyyrRWhNVhx2yHnqWuyzuePa6l4wD09WrchWPU9bxD1u5g54A7om0k2UNEFvBcZUEL75wPApZntw0CCcOhpb0tQqY27fcnXyiUHLLTjOJy7jbr3/z5sAPGXS36JH0qcnruJx3fhZ3u7evVL60v8FftTiV6aYtYv9dzkGGlQSiO/DxiQ+1beQVAGKgQ='