## 1. Install the llama stack client and other libraries

In [None]:
%pip install llama_stack==0.3.0
%pip install sounddevice
%pip install kokoro

## 2. List available models

In [None]:
from llama_stack_client import LlamaStackClient

# Connect to the Llama Stack server listening on localhost:8321.
client = LlamaStackClient(base_url="http://localhost:8321")
# Last expression of the cell, so the notebook displays the returned model list.
client.models.list()

## 3. Set the LLM and Embedding Model

In [None]:
from llama_stack_client import LlamaStackClient

# Connect to the Llama Stack server and fetch the models it has registered.
client = LlamaStackClient(base_url="http://localhost:8321")

models = client.models.list()

# The LLM is chosen explicitly; the embedding model is the first registered
# model whose model_type is "embedding".
model_id = "ollama/qwen2.5:3b"
em = next((m for m in models if m.model_type == "embedding"), None)
if em is None:
    # Without a default, next() would raise a bare StopIteration here, which
    # says nothing about what is actually missing.
    raise RuntimeError(
        "No embedding model is registered with the Llama Stack server at "
        "http://localhost:8321; register one before continuing."
    )
embedding_model_id = em.identifier
# The vector store created later needs the embedding size as well as the id.
embedding_dimension = em.metadata["embedding_dimension"]

print(model_id)
print(embedding_model_id)

## 4. Ask LLM a question about Red Bank Financial

In [None]:
from llama_stack_client import Agent, AgentEventLogger, LlamaStackClient
# Baseline agent: same model, but no tools are attached, so the answer comes
# from the model alone.
agent = Agent(
    client,
    model=model_id,
    instructions="""
    You are a helpful assistant.
    Answer questions briefly and to the best of your knowledge.
    """,
)

prompt = "Who founded Red Bank Financial?"
print("prompt>", prompt)

# Run a single user turn in a fresh session, requesting a streamed response.
response = agent.create_turn(
    messages=[{"role": "user", "content": prompt}],
    session_id=agent.create_session("rag_session_0"),
    stream=True,
)

# Print each logged event as it arrives, without adding newlines.
for log in AgentEventLogger().log(response):
    print(log, end="", flush=True)

## 5. Download the files, chunk the documents, and insert into vector DB

In [None]:
import io
from llama_stack_client import Agent, AgentEventLogger, LlamaStackClient
import requests

vector_db_name = "redbank_knowledge_base"
client = LlamaStackClient(base_url="http://localhost:8321")

# Red Bank Financial PDFs that make up the knowledge base.
sources = [
    "https://raw.githubusercontent.com/ChristianZaccaria/redbank-kb/main/redbankfinancial_about.pdf",
    "https://raw.githubusercontent.com/ChristianZaccaria/redbank-kb/main/redbankfinancial_faq.pdf",
]

# Upload all files first and collect file_ids
file_ids = []
for source in sources:
    print("Downloading and uploading document:", source)

    # Download the document. Fail loudly on an HTTP error so that an error
    # page is never uploaded as if it were the PDF, and bound the wait so a
    # stalled connection cannot hang the cell forever.
    response = requests.get(source, timeout=30)
    response.raise_for_status()
    file_content = io.BytesIO(response.content)
    filename = source.split("/")[-1]

    # Upload file to storage
    file = client.files.create(
        file=(filename, file_content, "application/pdf"),
        purpose="assistants",
    )
    file_ids.append(file.id)
    print(f"‚úì Uploaded {filename} (file_id: {file.id})")

# Create vector store from uploaded files, chunking them into 512-token
# pieces that overlap by 128 tokens.
vector_store = client.vector_stores.create(
    name=vector_db_name,
    file_ids=file_ids,
    chunking_strategy={
        "type": "static",
        "static": {
            "max_chunk_size_tokens": 512,
            "chunk_overlap_tokens": 128,
        },
    },
    # Fields passed through in the request body: which embedding model (and
    # its dimension) to use, and which vector store provider backs the store.
    extra_body={
        "embedding_model": embedding_model_id,
        "embedding_dimension": embedding_dimension,
        "provider_id": "milvus",
    },
)
print("Created vector store with ID:", vector_store.id)

## 6. Define AI Agent and Kokoro
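Define the text-to-speech helpers (ElevenLabs and Kokoro) and the `run_agent` function, which prompts the LLM with a question about the inserted documents so that it returns accurate answers grounded in them.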

In [None]:
def elevenlabs_speak(text: str):
    """Speak ``text`` aloud using the ElevenLabs text-to-speech API.

    The API key is read from the ``ELEVENLABS_API_KEY`` environment variable.
    ``load_dotenv()`` runs first, so the key can also be supplied through a
    local ``.env`` file. It must never be written into the notebook.

    Args:
        text: The text to synthesize and play.

    Raises:
        RuntimeError: If ``ELEVENLABS_API_KEY`` is not set.
    """
    import os

    from dotenv import load_dotenv
    from elevenlabs.client import ElevenLabs
    from elevenlabs.play import play

    load_dotenv()
    # SECURITY: a literal API key used to be hard-coded here. Treat that key
    # as leaked and revoke it; secrets belong in the environment / .env file.
    api_key = os.getenv("ELEVENLABS_API_KEY")
    if not api_key:
        raise RuntimeError(
            "ELEVENLABS_API_KEY is not set; export it or add it to a .env file."
        )

    elevenlabs = ElevenLabs(api_key=api_key)
    audio = elevenlabs.text_to_speech.convert(
        text=text,
        voice_id="JBFqnCBsd6RMkjVDRZzb",
        model_id="eleven_multilingual_v2",
        output_format="mp3_44100_128",
    )
    play(audio, use_ffmpeg=False)

def kokoro_speak(text: str):
    """Synthesize ``text`` with a local Kokoro pipeline and play it segment by segment.

    A new ``KPipeline`` is created on every call (``mps`` device, the
    ``hexgrad/Kokoro-82M`` repo). Each segment it yields is printed and then
    played through ``sounddevice`` before the next one is requested.

    Args:
        text: The text to speak.
    """
    import sounddevice as sd
    from kokoro import KPipeline

    tts = KPipeline(lang_code="a", device='mps', repo_id="hexgrad/Kokoro-82M")
    # The pipeline call is consumed lazily, one segment at a time.
    segments = tts(text, voice="af_heart")

    for i, segment in enumerate(segments):
        graphemes, _phonemes, samples = segment
        print(f"‚ñ∂Ô∏è Segment {i}: {graphemes}")
        # blocking=True: finish playing this segment before moving on.
        sd.play(samples, 24000, blocking=True)

    print("‚úÖ Done speaking with Kokoro locally.")

def run_agent(text: str | None = None, return_voice: bool = False):
    agent = Agent(
        client,
        model=model_id,
        instructions="""
        You are a helpful, concise Red Bank Financial assistant who answer questions briefly by using the file_search tool.
        - All questions are from customers of Red Bank Financial.
        - Do not show your reasoning steps.
        - You are answering questions to the customer directly.
        - Use the file_search tool to answer all questions in relation to banks, a bank, and red bank financial bank.
        - If the user asks to speak to a real agent, use the file_search tool to retrieve the relevant information on our best agents and provide it to the user briefly."
        - Do not add any filler, speculation, or statements such as "based on the information provided" or "unfortunately...".
        - DO NOT include say "This is a fact" or "For more FAQs", or any file references.
        - DO NOT mention source files or document references in your response.
        - End your response right after the relevant steps or answer.
        """,
        tools=[
            {
                "type": "file_search",
                "vector_store_ids": [vector_store.id]
            }
        ],
    )

    if text is not None:
        prompt = "Red Bank Financial Customer: " + text
        print(prompt)

    if return_voice:
        response = agent.create_turn(
            messages=[{"role": "user", "content": prompt}],
            session_id=agent.create_session("rag_session"),
            stream=False,
        )
        final_text = response.output_text

        print("final_text>", final_text)
        # kokoro_speak(final_text)
        elevenlabs_speak(final_text)

## 7. Speak to the model

In [None]:
import sounddevice as sd
from scipy.io.wavfile import write
import requests
from llama_stack_client import LlamaStackClient
import numpy as np
import queue
import time
from kokoro import KPipeline

# Local service endpoints used by this cell.
# WHISPER_URL is the endpoint transcribe_audio() posts the recorded file to.
WHISPER_URL = "http://localhost:8000/transcribe"
LLAMASTACK_URL = "http://localhost:8321"
# Alias of the model_id selected in an earlier cell; not referenced elsewhere
# in the visible code.
MODEL_ID = model_id

# Rebind the shared client against LLAMASTACK_URL.
client = LlamaStackClient(base_url=LLAMASTACK_URL)

def record_until_silence(filename="input.wav", samplerate=24000, threshold=500, silence_time=2.0):
    """Record mono 16-bit audio from the microphone until it goes quiet.

    Audio blocks delivered by the input stream are collected one by one.
    Recording stops at the first block whose RMS level is below ``threshold``
    once at least ``silence_time`` seconds have passed since the last block at
    or above it (or since the stream was opened). Everything captured,
    including the trailing quiet blocks, is written to ``filename`` as a WAV.

    Args:
        filename: Path of the WAV file to write.
        samplerate: Sample rate used for both capture and the WAV file.
        threshold: RMS level at or above which a block counts as sound.
        silence_time: Seconds of continuous quiet that end the recording.

    Returns:
        ``filename``, for convenient chaining.
    """
    print(f"üéôÔ∏è Recording... will stop after {silence_time:.1f}s of silence.")
    pending = queue.Queue()
    captured = []

    def enqueue_block(indata, *_):
        # Copy the block before queueing it for the main loop.
        pending.put(indata.copy())

    mic = sd.InputStream(
        samplerate=samplerate,
        channels=1,
        dtype="int16",
        callback=enqueue_block,
    )
    with mic:
        last_loud_at = time.time()
        listening = True
        while listening:
            block = pending.get()
            level = np.sqrt(np.mean(block.astype(np.float32) ** 2))
            captured.append(block)

            quiet = level < threshold
            if quiet and time.time() - last_loud_at >= silence_time:
                listening = False
            elif level >= threshold:
                last_loud_at = time.time()

    samples = np.concatenate(captured)
    write(filename, samplerate, samples)
    print(f"‚úÖ Audio recorded: {filename}")
    return filename

def transcribe_audio(filename):
    """Send an audio file to the transcription service and return its text.

    The file is posted as the multipart field ``file`` to ``WHISPER_URL``. The
    ``text`` field of the JSON reply (empty string if absent) is stripped of
    surrounding whitespace, printed, and returned.

    Args:
        filename: Path of the audio file to upload.

    Returns:
        The stripped transcription text.

    Raises:
        requests.HTTPError: If the service replies with an error status.
    """
    with open(filename, "rb") as audio_file:
        reply = requests.post(WHISPER_URL, files={"file": audio_file})
    reply.raise_for_status()

    payload = reply.json()
    text = payload.get("text", "").strip()
    print(f"üìù Transcription: {text}")
    return text

if __name__ == "__main__":
    # Voice round trip: record a question, transcribe it, then let the agent
    # answer it aloud.
    # No Kokoro pipeline is built here. The one this cell used to create was
    # never used (kokoro_speak builds its own), and it loaded a model on the
    # hard-coded "mps" device for nothing.
    wav = record_until_silence()
    prompt = transcribe_audio(wav)
    # An empty transcription means nothing intelligible was captured.
    if prompt:
        run_agent(prompt, return_voice=True)