## 1. Create Llama Stack client, list available models and vector databases

In [1]:
from llama_stack_client import LlamaStackClient
# Client bound to the Llama Stack service; every later cell talks to the
# server through this object.
client = LlamaStackClient(base_url="http://lsd-llama-milvus-service:8321")

# Dump the full model registry so the reader can see what is available.
models = client.models.list()
print(f"Models information: {models}\n")

# Pick the identifier of the first model whose type is 'llm'
# (None when the registry contains no such model).
inference_llm = next(
    (model.identifier for model in models if model.model_type == 'llm'),
    None,
)
print(f"Identifier for Inference model in usage: {inference_llm}\n")

# Check what vector databases exist
print("=== Available Vector Databases ===")
vector_dbs = client.vector_dbs.list()
if not vector_dbs:
    print("No vector databases found!")
else:
    for vdb in vector_dbs:
        # One entry per database, followed by a blank separator line.
        print(
            f"- ID: {vdb.identifier}",
            f"  Provider: {vdb.provider_id}",
            f"  Embedding Model: {vdb.embedding_model}",
            "",
            sep="\n",
        )

INFO:httpx:HTTP Request: GET http://lsd-llama-milvus-service:8321/v1/models "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET http://lsd-llama-milvus-service:8321/v1/vector-dbs "HTTP/1.1 200 OK"


Models information: [Model(identifier='vllm', metadata={}, api_model_type='llm', provider_id='vllm-inference', type='model', provider_resource_id='vllm', model_type='llm'), Model(identifier='granite-embedding-125m', metadata={'embedding_dimension': 768.0}, api_model_type='embedding', provider_id='sentence-transformers', type='model', provider_resource_id='ibm-granite/granite-embedding-125m-english', model_type='embedding')]

Identifier for Inference model in usage: vllm

=== Available Vector Databases ===
- ID: csv-vector-db
  Provider: milvus
  Embedding Model: granite-embedding-125m



## 2. Create RAG Agent and prompt the LLM
Prompt the LLM with questions about the inserted documents and check that it returns accurate answers.

In [2]:
from llama_stack_client import Agent, AgentEventLogger
import uuid

# The agent runs on the LLM discovered in the first cell rather than on a
# hard-coded identifier, so the notebook keeps working when the server
# registers its model under a different name. Stop early with a clear message
# if no LLM was found.
if inference_llm is None:
    raise RuntimeError(
        "No model with model_type 'llm' was found on the Llama Stack server; "
        "cannot create the RAG agent."
    )

# RAG agent: the knowledge_search tool is pointed at the 'csv-vector-db'
# vector database, and the instructions restrict answers to the search results.
rag_agent = Agent(
    client,
    model=inference_llm,
    instructions="You are a helpful assistant. Answer the user's question based only on the provided search results. Respond with 'I don’t know' if the information is outside of the scope of your knowledge and not present in the search results.",
    tools=[
        {
            "name": "builtin::rag/knowledge_search",
            "args": {"vector_db_ids": ["csv-vector-db"]},
        }
    ],
)

user_prompts = [
    "What is gender, home country and age of Dulce Abril and Philip Gent?",
    "What is customer id, company, city, country, phone number, email, subscription date, subscribed website of of Sheryl Baxter?",
    "What products were sold according to sample sales data?",
    "What is the economics condition at Ireland in 2025?", # Dummy question the model will answer with 'I don’t know' 
]

# A single session with a random name holds all turns, so they can be
# retrieved together after the loop.
session_id = rag_agent.create_session(session_name=f"s{uuid.uuid4().hex}")

for prompt in user_prompts:
    print("prompt>", prompt)
    response = rag_agent.create_turn(
        messages=[{"role": "user", "content": prompt}],
        session_id=session_id,
        stream=True,
    )
    # Print every event of the streamed turn as it is logged.
    for log in AgentEventLogger().log(response):
        log.print()

# Get session response for further evaluation of RAG metrics
session_response = client.agents.session.retrieve(
    session_id=session_id,
    agent_id=rag_agent.agent_id,
)

INFO:httpx:HTTP Request: POST http://lsd-llama-milvus-service:8321/v1/agents "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET http://lsd-llama-milvus-service:8321/v1/tools?toolgroup_id=builtin%3A%3Arag%2Fknowledge_search "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://lsd-llama-milvus-service:8321/v1/agents/994fe601-236b-4f3c-9e12-31c04b3446e8/session "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://lsd-llama-milvus-service:8321/v1/agents/994fe601-236b-4f3c-9e12-31c04b3446e8/session/8c30db52-8962-44b0-bf8f-80c39561b112/turn "HTTP/1.1 200 OK"


prompt> What is gender, home country and age of Dulce Abril and Philip Gent?
[33minference> [0m[33m[0m[97m[0m
[32mtool_execution> Tool:knowledge_search Args:{'query': 'Dulce Abril and Philip Gent gender, home country, age'}[0m
[32mtool_execution> Tool:knowledge_search Response:[TextContentItem(text='knowledge_search tool found 5 chunks:\nBEGIN of knowledge_search tool results.\n', type='text'), TextContentItem(text="Result 1\nContent: 1, First Name = Dulce. 1, Last Name = Abril. 1, Gender = Female. 1, Country = United States. 1, Age = 32. 1, Date = 15/10/2017. 1, Id = 1562. 2, First Name = Mara. 2, Last Name = Hashimoto. 2, Gender = Female. 2, Country = Great Britain. 2, Age = 25. 2, Date = 16/08/2016. 2, Id = 1582. 3, First Name = Philip. 3, Last Name = Gent. 3, Gender = Male. 3, Country = France. 3, Age = 36. 3, Date = 21/05/2015. 3, Id = 2587. 4, First Name = Kathleen. 4, Last Name = Hanner. 4, Gender = Female. 4, Country = United States. 4, Age = 25. 4, Date = 15/10/2017. 

INFO:httpx:HTTP Request: POST http://lsd-llama-milvus-service:8321/v1/agents/994fe601-236b-4f3c-9e12-31c04b3446e8/session/8c30db52-8962-44b0-bf8f-80c39561b112/turn "HTTP/1.1 200 OK"


prompt> What is customer id, company, city, country, phone number, email, subscription date, subscribed website of of Sheryl Baxter?
[33minference> [0m[33m[0m[97m[0m
[32mtool_execution> Tool:knowledge_search Args:{'query': 'Sheryl Baxter customer id, company, city, country, phone number, email, subscription date, subscribed website'}[0m
[32mtool_execution> Tool:knowledge_search Response:[TextContentItem(text='knowledge_search tool found 5 chunks:\nBEGIN of knowledge_search tool results.\n', type='text'), TextContentItem(text="Result 1\nContent: 1, Customer Id = DD37Cf93aecA6Dc. 1, First Name = Sheryl. 1, Last Name = Baxter. 1, Company = Rasmussen Group. 1, City = East Leonard. 1, Country = Chile. 1, Phone 1 = 229.077.5154. 1, Phone 2 = 397.884.0519x718. 1, Email = zunigavanessa@smith.info. 1, Subscription Date = 2020-08-24. 1, Website = http://www.stephenson.com/. 2, Customer Id = 1Ef7b82A4CAAD10. 2, First Name = Preston. 2, Last Name = Lozano, Dr. 2, Company = Vega-Gentry. 2,

INFO:httpx:HTTP Request: POST http://lsd-llama-milvus-service:8321/v1/agents/994fe601-236b-4f3c-9e12-31c04b3446e8/session/8c30db52-8962-44b0-bf8f-80c39561b112/turn "HTTP/1.1 200 OK"


prompt> What products were sold according to sample sales data?
[33minference> [0m[33m[0m[97m[0m
[32mtool_execution> Tool:knowledge_search Args:{'query': 'sample sales data products sold'}[0m
[32mtool_execution> Tool:knowledge_search Response:[TextContentItem(text='knowledge_search tool found 5 chunks:\nBEGIN of knowledge_search tool results.\n', type='text'), TextContentItem(text="Result 1\nContent: Widget A, Date = 2024-01-01. Widget A, Quantity = 5. Widget A, Revenue = 5000. Widget B, Date = 2024-01-02. Widget B, Quantity = 10. Widget B, Revenue = 12000. Widget C, Date = 2024-01-03. Widget C, Quantity = 3. Widget C, Revenue = 3000. Widget D, Date = 2024-01-04. Widget D, Quantity = 8. Widget D, Revenue = 8000. Widget A, Date = 2024-01-05. Widget A, Quantity = 7. Widget A, Revenue = 7000. Widget B, Date = 2024-01-06. Widget B, Quantity = 6. Widget B, Revenue = 6000. Widget C, Date = 2024-01-07. Widget C, Quantity = 12. Widget C, Revenue = 15000. Widget D, Date = 2024-01-08. W

INFO:httpx:HTTP Request: POST http://lsd-llama-milvus-service:8321/v1/agents/994fe601-236b-4f3c-9e12-31c04b3446e8/session/8c30db52-8962-44b0-bf8f-80c39561b112/turn "HTTP/1.1 200 OK"


prompt> What is the economics condition at Ireland in 2025?
[33minference> [0m[33m[0m[33mI[0m[33m don[0m[33m’t[0m[33m know[0m[97m[0m
[30m[0m

INFO:httpx:HTTP Request: GET http://lsd-llama-milvus-service:8321/v1/agents/994fe601-236b-4f3c-9e12-31c04b3446e8/session/8c30db52-8962-44b0-bf8f-80c39561b112 "HTTP/1.1 200 OK"


## 3. Preparation for evaluating RAG models using [RAGAS](https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/?h=metrics)

- We will use two key metrics to show the performance of the RAG server:
    1. [Faithfulness](https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/faithfulness/) - measures how factually consistent a response is with the retrieved context. It ranges from 0 to 1, with higher scores indicating better consistency.
    2. [Response Relevancy](https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/answer_relevance/) - measures how relevant a response is to the user input. Higher scores indicate better alignment with the user input, while lower scores are given if the response is incomplete or includes redundant information.

 - Create a `.env.dev` file and paste your API key from [Groq Cloud](https://console.groq.com/home) into it.

In [3]:
from dotenv import load_dotenv

# Write a template .env.dev only when the file does not exist yet.
# Mode "x" (exclusive creation) raises FileExistsError instead of truncating,
# so a key that was already pasted into the file is not replaced by the
# placeholder when the notebook is re-run.
try:
    with open(".env.dev", "x") as f:
        f.write('GROQ_API_KEY=YOUR_GROQ_API_KEY')
except FileExistsError:
    # The file is already there; keep its contents as they are.
    pass

# load env variable
load_dotenv(dotenv_path=".env.dev", override=True)

True

In [4]:
import re
from typing import List, Dict, Any, Union
from llama_stack_client.types.agents import Turn

# Compiled once and shared by every call. Captures the text that follows
# "Content:" up to (but excluding) a "\nMetadata:" marker, or up to the end of
# the text when no marker follows; DOTALL lets the capture span several lines.
CONTENT_PATTERN = re.compile(r"Content:\s*(.*?)(?=\nMetadata:|$)", re.DOTALL)

# This function extracts the search results for the trace of each query
def extract_retrieved_contexts(turn_object: Turn) -> List[str]:
    """
    Gather the chunk texts returned by tool executions within one turn.

    Only steps whose ``step_type`` is ``"tool_execution"`` are inspected. Inside
    their tool responses, an item contributes a context when it is a non-empty
    text item that starts with ``"Result "`` and contains ``"Content:"``; the
    part matched by ``CONTENT_PATTERN`` is stripped and collected.

    Args:
        turn_object: A Turn object from LlamaStack whose steps carry tool responses

    Returns:
        Retrieved context strings, in the order they were encountered
    """
    contexts: List[str] = []

    for step in turn_object.steps:
        if step.step_type != "tool_execution":
            continue

        for tool_response in step.tool_responses:
            content = tool_response.content
            # Ignore responses without content or whose content is not a list of items.
            if not content or not isinstance(content, list):
                continue

            for item in content:
                # Keep only well-formed, non-empty text items ...
                if not (hasattr(item, "text") and hasattr(item, "type")):
                    continue
                if item.type != "text" or not item.text:
                    continue
                # ... that look like an individual search result.
                if not item.text.startswith("Result ") or "Content:" not in item.text:
                    continue

                found = CONTENT_PATTERN.search(item.text)
                if found:
                    contexts.append(found.group(1).strip())

    return contexts

In [5]:
from ragas.dataset_schema import EvaluationDataset

# Ground-truth answers for the first two prompts, in prompt order.
references = [
'''
Dulce Abril is 32 years old female from USA and Philip Gent is 36 years old man from France.
''',

'''
Sheryl Baxter's customer ID is DD37Cf93aecA6Dc, her company is Rasmussen Group, her city is East Leonard, her country is Chile, her phone numbers are 229.077.5154 and 397.884.0519x718, her email is zunigavanessa@smith.info, her subscription date is 2020-08-24, and her subscribed website is http://www.stephenson.com/.
'''
]

# Constructing a Ragas EvaluationDataset
# Only the first two turns are evaluated; each is paired with its reference.
samples = []
for turn, reference in zip(session_response.turns[:2], references):
    sample = {
        "user_input": turn.input_messages[0].content,
        "response": turn.output_message.content,
        "reference": reference,
        "retrieved_contexts": extract_retrieved_contexts(turn),
    }
    samples.append(sample)

ragas_eval_dataset = EvaluationDataset.from_list(samples)
ragas_eval_dataset.to_pandas()

  from .autonotebook import tqdm as notebook_tqdm
INFO:datasets:PyTorch version 2.7.1 available.


Unnamed: 0,user_input,retrieved_contexts,response,reference
0,"What is gender, home country and age of Dulce ...","[1, First Name = Dulce. 1, Last Name = Abril. ...","Based on the search results, here is the infor...",\nDulce Abril is 32 years old female from USA ...
1,"What is customer id, company, city, country, p...","[1, Customer Id = DD37Cf93aecA6Dc. 1, First Na...","Based on the search results, here is the infor...",\nSheryl Baxter's customer ID is DD37Cf93aecA6...


## 4. Prerequisites for RAG evaluation

In [6]:
from ragas.metrics import (
    Faithfulness, 
    ResponseRelevancy,
) 
from ragas.dataset_schema import SingleTurnSample 
from langchain_groq import ChatGroq
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_huggingface import HuggingFaceEmbeddings

# Judge model used by the Ragas metrics, called with temperature 0.
# NOTE(review): presumably ChatGroq reads GROQ_API_KEY from the environment
# loaded from .env.dev earlier - confirm.
llm = ChatGroq(model="meta-llama/llama-4-maverick-17b-128e-instruct", temperature=0)

# Wrap the Groq LLM for use with Ragas
evaluator_llm = LangchainLLMWrapper(llm)

# Using HuggingFace embeddings as a free alternative
embeddings_model = HuggingFaceEmbeddings(model_name="ibm-granite/granite-embedding-125m-english")
evaluator_embeddings = LangchainEmbeddingsWrapper(embeddings_model)


# Unpack the two evaluation samples field by field (first prompt, second prompt).

# references for both prompts
reference_for_first_prompt, reference_for_second_prompt = (
    samples[0]["reference"],
    samples[1]["reference"],
)

# inputs for both prompts
user_input_for_first_prompt, user_input_for_second_prompt = (
    samples[0]["user_input"],
    samples[1]["user_input"],
)

# responses for both prompts
response_for_first_prompt, response_for_second_prompt = (
    samples[0]["response"],
    samples[1]["response"],
)

# reference lists for both prompts: one stripped entry per line of the reference
reference_list_for_first_prompt = list(map(str.strip, reference_for_first_prompt.strip().split('\n')))
reference_list_for_second_prompt = list(map(str.strip, reference_for_second_prompt.strip().split('\n')))

# Retrieved contexts for both prompts
retrieved_contexts_for_first_prompt, retrieved_contexts_for_second_prompt = (
    samples[0]["retrieved_contexts"],
    samples[1]["retrieved_contexts"],
)

print(f"Retrieved contexts for the first prompt: {retrieved_contexts_for_first_prompt}\n")
print(f"Retrieved contexts for the second prompt: {retrieved_contexts_for_second_prompt}\n")

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: ibm-granite/granite-embedding-125m-english


Retrieved contexts for the first prompt: ['1, First Name = Dulce. 1, Last Name = Abril. 1, Gender = Female. 1, Country = United States. 1, Age = 32. 1, Date = 15/10/2017. 1, Id = 1562. 2, First Name = Mara. 2, Last Name = Hashimoto. 2, Gender = Female. 2, Country = Great Britain. 2, Age = 25. 2, Date = 16/08/2016. 2, Id = 1582. 3, First Name = Philip. 3, Last Name = Gent. 3, Gender = Male. 3, Country = France. 3, Age = 36. 3, Date = 21/05/2015. 3, Id = 2587. 4, First Name = Kathleen. 4, Last Name = Hanner. 4, Gender = Female. 4, Country = United States. 4, Age = 25. 4, Date = 15/10/2017. 4, Id = 3549. 5, First Name = Nereida. 5, Last Name = Magwood. 5, Gender = Female. 5, Country = United States. 5, Age = 58. 5, Date = 16/08/2016. 5, Id = 2468. 6, First Name = Gaston. 6, Last Name = Brumm. 6, Gender = Male. 6, Country = United States. 6, Age = 24. 6, Date = 21/05/2015. 6, Id = 2554. 7, First Name = Etta. 7, Last Name = Hurn. 7, Gender = Female. 7, Country = Great Britain. 7, Age = 56. 

## 5. Evaluate Faithfulness Score for both prompts

In [7]:
first_prompt_turn = SingleTurnSample(
        user_input=user_input_for_first_prompt,
        response=response_for_first_prompt,
        retrieved_contexts=retrieved_contexts_for_first_prompt,
    )
faithfulness_scorer = Faithfulness(llm=evaluator_llm)
faithfulness_score_for_first_prompt = await faithfulness_scorer.single_turn_ascore(first_prompt_turn)
print(f"Faithfulness score for prompt '{user_prompts[0]}': {faithfulness_score_for_first_prompt}")

INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


Faithfulness score for prompt 'What is gender, home country and age of Dulce Abril and Philip Gent?': 1.0


In [8]:
second_prompt_turn = SingleTurnSample(
        user_input=user_input_for_second_prompt,
        response=response_for_second_prompt,
        retrieved_contexts=retrieved_contexts_for_second_prompt,
    )
faithfulness_score_for_second_prompt = await faithfulness_scorer.single_turn_ascore(second_prompt_turn)
print(f"Faithfulness score for prompt '{user_prompts[1]}': {faithfulness_score_for_second_prompt}")

INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
INFO:groq._base_client:Retrying request to /openai/v1/chat/completions in 7.000000 seconds
INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


Faithfulness score for prompt 'What is customer id, company, city, country, phone number, email, subscription date, subscribed website of of Sheryl Baxter?': 0.9166666666666666


## 6. Evaluate Response Relevancy for both prompts

In [9]:
first_prompt_turn = SingleTurnSample(
        user_input=user_input_for_first_prompt,
        response=response_for_first_prompt,
        retrieved_contexts=retrieved_contexts_for_first_prompt,
    )
response_relevancy_scorer = ResponseRelevancy(llm=evaluator_llm, embeddings=evaluator_embeddings)
response_relevancy_score_for_first_prompt = await response_relevancy_scorer.single_turn_ascore(first_prompt_turn)
print(f"Response Relevancy score for prompt '{user_prompts[0]}': {response_relevancy_score_for_first_prompt}")

INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


Response Relevancy score for prompt 'What is gender, home country and age of Dulce Abril and Philip Gent?': 0.9849353950326382


In [10]:
second_prompt_turn = SingleTurnSample(
        user_input=user_input_for_second_prompt,
        response=response_for_second_prompt,
        retrieved_contexts=retrieved_contexts_for_second_prompt,
    )
response_relevancy_score_for_second_prompt = await response_relevancy_scorer.single_turn_ascore(second_prompt_turn)
print(f"Response Relevancy score for prompt '{user_prompts[1]}': {response_relevancy_score_for_second_prompt}")

INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


Response Relevancy score for prompt 'What is customer id, company, city, country, phone number, email, subscription date, subscribed website of of Sheryl Baxter?': 0.9287016272341538
