## 1. Create Llama Stack client, list available models and vector databases

In [1]:
from llama_stack_client import LlamaStackClient
# Connect to the Llama Stack service
client = LlamaStackClient(base_url="http://lsd-llama-milvus-service:8321")

# Print everything the models endpoint returns
models = client.models.list()
print(f"Models information: {models}\n")

# First model whose type is 'llm'; None when the listing contains no such model
inference_llm = next(
    (model.identifier for model in models if model.model_type == 'llm'),
    None,
)
print(f"Identifier for Inference model in usage: {inference_llm}\n")

# Report the vector databases returned by the server, or say that there are none
print("=== Available Vector Databases ===")
vector_dbs = client.vector_dbs.list()
if not vector_dbs:
    print("No vector databases found!")
else:
    for vdb in vector_dbs:
        print(f"- ID: {vdb.identifier}")
        print(f"  Provider: {vdb.provider_id}")
        print(f"  Embedding Model: {vdb.embedding_model}")
        print()

INFO:httpx:HTTP Request: GET http://lsd-llama-milvus-service:8321/v1/models "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET http://lsd-llama-milvus-service:8321/v1/vector-dbs "HTTP/1.1 200 OK"


Models information: [Model(identifier='vllm', metadata={}, api_model_type='llm', provider_id='vllm-inference', type='model', provider_resource_id='vllm', model_type='llm'), Model(identifier='granite-embedding-125m', metadata={'embedding_dimension': 768.0}, api_model_type='embedding', provider_id='sentence-transformers', type='model', provider_resource_id='ibm-granite/granite-embedding-125m-english', model_type='embedding')]

Identifier for Inference model in usage: vllm

=== Available Vector Databases ===
- ID: asr-vector-db
  Provider: milvus
  Embedding Model: granite-embedding-125m



## 2. Create RAG Agent and prompt the LLM
Prompt the LLM with questions about the ingested documents and check that it returns accurate answers.

In [2]:
from llama_stack_client import Agent, AgentEventLogger
import uuid

# Use the LLM identifier discovered when listing the models instead of a
# hardcoded name, and stop early if the server exposes no LLM at all.
if inference_llm is None:
    raise RuntimeError("No model with model_type 'llm' was found on the Llama Stack server")

rag_agent = Agent(
    client,
    model=inference_llm,
    instructions="You are a helpful assistant. Answer the user's question based only on the provided search results. Respond with 'I don’t know' if the information is outside of the scope of your knowledge and not present in the search results.",
    tools=[
        {
            "name": "builtin::rag/knowledge_search",
            "args": {"vector_db_ids": ["asr-vector-db"]},
        }
    ],
)

user_prompts = [
    "List RAG key market use cases",
    "Name Red Hat RAG target audience and customers",
    "What beneficial goals RAG support?",
    "Regular LLM output disadvantages",
    # Dummy question the model will answer with 'I don’t know' or reason why it can't answer
    "What is the economics condition at Ireland in 2025?",
]

# One session for all prompts; the random suffix gives every run its own session name
session_id = rag_agent.create_session(session_name=f"s{uuid.uuid4().hex}")

for prompt in user_prompts:
    print("prompt>", prompt)
    response = rag_agent.create_turn(
        messages=[{"role": "user", "content": prompt}],
        session_id=session_id,
        stream=True,
    )
    # Print the streamed events of this turn as they are logged
    for log in AgentEventLogger().log(response):
        log.print()

# Get session response for further evaluation of RAG metrics
session_response = client.agents.session.retrieve(
    session_id=session_id,
    agent_id=rag_agent.agent_id,
)

INFO:httpx:HTTP Request: POST http://lsd-llama-milvus-service:8321/v1/agents "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET http://lsd-llama-milvus-service:8321/v1/tools?toolgroup_id=builtin%3A%3Arag%2Fknowledge_search "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://lsd-llama-milvus-service:8321/v1/agents/f79e548b-4078-4d37-bdb6-e82a8d190dee/session "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://lsd-llama-milvus-service:8321/v1/agents/f79e548b-4078-4d37-bdb6-e82a8d190dee/session/6b9db4ef-121c-46fc-bbe5-bf4772c9aac1/turn "HTTP/1.1 200 OK"


prompt> List RAG key market use cases
[33minference> [0m[33m[0m[97m[0m
[32mtool_execution> Tool:knowledge_search Args:{'query': 'RAG key market use cases'}[0m
[32mtool_execution> Tool:knowledge_search Response:[TextContentItem(text='knowledge_search tool found 5 chunks:\nBEGIN of knowledge_search tool results.\n', type='text'), TextContentItem(text="Result 1\nContent: Key market use cases. RAC is being adopted across various industries for diverse applications,\nincluding knowledge question answering, providing accurate answers in customer service using product\nmanuals or fax. Code generation, retrieving relevant code snippets and documentation to\nassist in code creation. Recommendation systems, enhancing recommendations by providing relevant\ncontext. Customer service, improving support accuracy with access to current product information.\nPersonal assistance, enabling more comprehensive and accurate information from AI assistants.\nMulti-hub question answering, handling co

INFO:httpx:HTTP Request: POST http://lsd-llama-milvus-service:8321/v1/agents/f79e548b-4078-4d37-bdb6-e82a8d190dee/session/6b9db4ef-121c-46fc-bbe5-bf4772c9aac1/turn "HTTP/1.1 200 OK"


prompt> Name Red Hat RAG target audience and customers
[33minference> [0m[33m[0m[97m[0m
[32mtool_execution> Tool:knowledge_search Args:{'query': 'Red Hat RAG target audience and customers'}[0m
[32mtool_execution> Tool:knowledge_search Response:[TextContentItem(text='knowledge_search tool found 5 chunks:\nBEGIN of knowledge_search tool results.\n', type='text'), TextContentItem(text="Result 1\nContent: Clarifying target audience and user roles. This document clarifies the target audience and user\nroles for Red Hat Rack project focusing on the distinction between end users and builders.\nEnd users vs. builders. End users consume the final product.\nInteract with a chat GPT-like application. Builders create and configure the AI systems\nused by end users. Configure a Rack backend tweaking parameters for a specific\nexperience such as chat GPT. We are targeting builders not end users. Builders optimize\ntheir systems for their specific end users. Builder archetypes. High-coder bu

INFO:httpx:HTTP Request: POST http://lsd-llama-milvus-service:8321/v1/agents/f79e548b-4078-4d37-bdb6-e82a8d190dee/session/6b9db4ef-121c-46fc-bbe5-bf4772c9aac1/turn "HTTP/1.1 200 OK"


prompt> What beneficial goals RAG support?
[33minference> [0m[33m[0m[97m[0m
[32mtool_execution> Tool:knowledge_search Args:{'query': 'RAG beneficial goals'}[0m
[32mtool_execution> Tool:knowledge_search Response:[TextContentItem(text='knowledge_search tool found 5 chunks:\nBEGIN of knowledge_search tool results.\n', type='text'), TextContentItem(text="Result 1\nContent: Key market use cases. RAC is being adopted across various industries for diverse applications,\nincluding knowledge question answering, providing accurate answers in customer service using product\nmanuals or fax. Code generation, retrieving relevant code snippets and documentation to\nassist in code creation. Recommendation systems, enhancing recommendations by providing relevant\ncontext. Customer service, improving support accuracy with access to current product information.\nPersonal assistance, enabling more comprehensive and accurate information from AI assistants.\nMulti-hub question answering, handling c

INFO:httpx:HTTP Request: POST http://lsd-llama-milvus-service:8321/v1/agents/f79e548b-4078-4d37-bdb6-e82a8d190dee/session/6b9db4ef-121c-46fc-bbe5-bf4772c9aac1/turn "HTTP/1.1 200 OK"


prompt> Regular LLM output disadvantages
[33minference> [0m[33m[0m[97m[0m
[32mtool_execution> Tool:knowledge_search Args:{'query': 'Regular LLM output disadvantages'}[0m
[32mtool_execution> Tool:knowledge_search Response:[TextContentItem(text='knowledge_search tool found 5 chunks:\nBEGIN of knowledge_search tool results.\n', type='text'), TextContentItem(text="Result 1\nContent: RAC vs. Regular LLM Outputs\nLLMs use machine learning and natural language processing NLP techniques to understand and generate human language for AI inference.\nAI inference is the operational phase of AI where the model is able to apply the learning from training and apply it to real-world solutions and situations.\nLLMs can be incredibly valuable for communication and data processing, but they have disadvantages too.\nLLMs are trained with generally available data but might not include the specific information you want them to reference, such as an internal data set from your organization.\nLLMs ha

INFO:httpx:HTTP Request: POST http://lsd-llama-milvus-service:8321/v1/agents/f79e548b-4078-4d37-bdb6-e82a8d190dee/session/6b9db4ef-121c-46fc-bbe5-bf4772c9aac1/turn "HTTP/1.1 200 OK"


prompt> What is the economics condition at Ireland in 2025?
[33minference> [0m[33m[0m[33mI[0m[33m don[0m[33m’t[0m[33m know[0m[33m what[0m[33m the[0m[33m economic[0m[33m conditions[0m[33m will[0m[33m be[0m[33m like[0m[33m in[0m[33m Ireland[0m[33m in[0m[33m [0m[33m202[0m[33m5[0m[33m.[0m[97m[0m
[30m[0m

INFO:httpx:HTTP Request: GET http://lsd-llama-milvus-service:8321/v1/agents/f79e548b-4078-4d37-bdb6-e82a8d190dee/session/6b9db4ef-121c-46fc-bbe5-bf4772c9aac1 "HTTP/1.1 200 OK"


## 3. Preparation for evaluating RAG models using [RAGAS](https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/?h=metrics)

- We will use two key metrics to show the performance of the RAG server:
    1. [Faithfulness](https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/faithfulness/) - measures how factually consistent a response is with the retrieved context. It ranges from 0 to 1, with higher scores indicating better consistency.
    2. [Response Relevancy](https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/answer_relevance/) - measures how relevant a response is to the user input. Higher scores indicate better alignment with the user input, while lower scores are given if the response is incomplete or includes redundant information.

 - Create a `.env.dev` file and paste your API key from [Groq Cloud](https://console.groq.com/home) into it.

In [3]:
from dotenv import load_dotenv

# Write a template .env.dev only when the file does not exist yet.
# Mode "x" (exclusive creation) raises FileExistsError instead of truncating,
# so a key already pasted into the file is not replaced by the placeholder
# when this cell is run again.
try:
    with open(".env.dev", "x") as f:
        f.write('GROQ_API_KEY=PASTE_YOUR_API_KEY_HERE')
except FileExistsError:
    pass

# load env variable
load_dotenv(dotenv_path=".env.dev", override=True)

True

In [4]:
import re
from typing import List, Dict, Any, Union
from llama_stack_client.types.agents import Turn

# Compiled once at cell level: captures the text after "Content:" up to a
# following "\nMetadata:" marker, or to the end of the text when there is none
CONTENT_PATTERN = re.compile(r"Content:\s*(.*?)(?=\nMetadata:|$)", re.DOTALL)

# This function extracts the search results for the trace of each query
def extract_retrieved_contexts(turn_object: Turn) -> List[str]:
    """
    Collect the retrieved context passages from a turn's tool execution steps.

    Only steps whose ``step_type`` is "tool_execution" are inspected. Within
    their tool responses, only list-valued content is considered, and only
    items of type "text" whose text starts with "Result " and contains
    "Content:" contribute a passage.

    Args:
        turn_object: A Turn object from LlamaStack containing steps with tool responses

    Returns:
        List of stripped context strings, in the order they were encountered
    """
    contexts: List[str] = []

    for step in turn_object.steps:
        if step.step_type != "tool_execution":
            continue

        for tool_response in step.tool_responses:
            payload = tool_response.content
            # Skip empty content and anything that is not a list of items
            if not payload or not isinstance(payload, list):
                continue

            for item in payload:
                if not (hasattr(item, "text") and hasattr(item, "type")):
                    continue
                if item.type != "text" or not item.text:
                    continue
                # Keep only the per-result entries, not the header/footer items
                if not item.text.startswith("Result ") or "Content:" not in item.text:
                    continue

                found = CONTENT_PATTERN.search(item.text)
                if found:
                    contexts.append(found.group(1).strip())

    return contexts

In [5]:
from ragas.dataset_schema import EvaluationDataset

# Accumulates one dict per evaluated turn; filled by the loop later in this cell
samples = []

# Reference (ground-truth) texts, indexed in the same order as the session turns
# they are paired with below: entry 0 for the first turn, entry 1 for the second.
references = [
'''
Key Market Use Cases
RAG is being adopted across various industries for diverse applications, including:

Knowledge Question Answering: Providing accurate answers in customer service using product manuals or FAQs.

Code Generation: Retrieving relevant code snippets and documentation to assist in code creation.

Recommendation Systems: Enhancing recommendations by providing relevant context.

Customer Service: Improving support accuracy with access to current product information.

Personal Assistants: Enabling more comprehensive and accurate information from AI assistants.

Multi-hop Question Answering: Handling complex, multi-step questions through iterative retrieval.

Legal Applications: Retrieving legal documents and case law for reliable legal opinions.

General Task Assistance: Aiding users in various tasks requiring information access and decision-making.

The rising demand for hyper-personalized content in areas like marketing and e-commerce is also a significant driver for RAG adoption, allowing for tailored ad copy and product recommendations.
''',
    
'''
Clarifying Target Audience and User Roles
This document clarifies the target audience and user roles for our project, focusing on the distinction between end-users and builders.
End Users vs. Builders:

End Users: Consume the final product (e.g., interact with a ChatGPT-like application).
Builders: Create and configure the AI systems used by end-users (e.g., configure a RAG backend, tweaking parameters for a specific experience such as ChatGPT).  We are targeting builders, not end-users. Builders optimize their systems for their specific end-users.

Builder Archetypes:

High-Coder Builders (aka pro-code): Prefer SDKs and code-based solutions. They need access to all configurable parameters via APIs and SDKs.  They may also want a quick way to "vibe check" their RAG system via a UI (e.g., llama-stack-cli my-rag-app.py --web).

Low-Coder Builders (no/low-code): Prefer UI-driven workflows and visual tools to configure their systems.  They could benefit from tools like the existing llama-stack playground.

Builders vs. Platformers vs. Opsers:

Builders (AI Engineers/AI Devs): Use the platform and its primitives to build AI systems.  Their skillset and the complexity of their tasks determine whether they are considered AI Engineers or AI Devs.

Platformers (AI Platform Engineers): Platformers focus on building, maintaining, and securing the AI platform and APIs. They serve both Builders (for development) and Opsers (for deployment/operations), ensuring infrastructure is reliable, scalable, and supports self-service.

Opsers (AI/MLOps Engineers): Opsers focus on operationalizing and automating the AI/ML  lifecycle. For example, they use platform APIs to deploy, monitor, and manage models, enabling Builders' models to reach and succeed in production. Opsers work closely with Platformers to ensure infrastructure meets operational needs.

In summary:

Platformers enable builders, and builders create systems for end-users.  Our focus is on empowering builders with the tools and flexibility they need to build the best experiences for their end-users.''',
]

# Build the Ragas EvaluationDataset: pair each of the first two session turns
# with the reference text at the same position
samples.extend(
    {
        "user_input": turn.input_messages[0].content,
        "response": turn.output_message.content,
        "reference": reference,
        "retrieved_contexts": extract_retrieved_contexts(turn),
    }
    for turn, reference in zip(session_response.turns[:2], references)
)

ragas_eval_dataset = EvaluationDataset.from_list(samples)
# Last expression of the cell: show the dataset as a DataFrame
ragas_eval_dataset.to_pandas()

  from .autonotebook import tqdm as notebook_tqdm
INFO:datasets:PyTorch version 2.7.1 available.


Unnamed: 0,user_input,retrieved_contexts,response,reference
0,List RAG key market use cases,[Key market use cases. RAC is being adopted ac...,RAG (Retrieval-Augmented Generation) key marke...,\nKey Market Use Cases\nRAG is being adopted a...
1,Name Red Hat RAG target audience and customers,[Clarifying target audience and user roles. Th...,The target audience and customers of Red Hat R...,\nClarifying Target Audience and User Roles\nT...


## 4. Prerequisites for RAG evaluation

In [6]:
from ragas.metrics import (
    Faithfulness, 
    ResponseRelevancy,
) 
from ragas.dataset_schema import SingleTurnSample 
from langchain_groq import ChatGroq
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_huggingface import HuggingFaceEmbeddings

# Groq-hosted chat model that acts as the judge for the Ragas metrics
llm = ChatGroq(model="meta-llama/llama-4-maverick-17b-128e-instruct", temperature=0)

# Wrap the Groq LLM for use with Ragas
evaluator_llm = LangchainLLMWrapper(llm)

# Using HuggingFace embeddings as a free alternative
embeddings_model = HuggingFaceEmbeddings(model_name="ibm-granite/granite-embedding-125m-english")
evaluator_embeddings = LangchainEmbeddingsWrapper(embeddings_model)

# The two evaluated samples, unpacked once and then split into per-field variables
first_sample, second_sample = samples[0], samples[1]

# references for both prompts
reference_for_first_prompt = first_sample["reference"]
reference_for_second_prompt = second_sample["reference"]

# inputs for both prompts
user_input_for_first_prompt = first_sample["user_input"]
user_input_for_second_prompt = second_sample["user_input"]

# responses for both prompts
response_for_first_prompt = first_sample["response"]
response_for_second_prompt = second_sample["response"]

# reference lists for both prompts: the reference text split into stripped lines
reference_list_for_first_prompt = list(map(str.strip, reference_for_first_prompt.strip().split('\n')))
reference_list_for_second_prompt = list(map(str.strip, reference_for_second_prompt.strip().split('\n')))

# Retrieved contexts for both prompts
retrieved_contexts_for_first_prompt = first_sample["retrieved_contexts"]
retrieved_contexts_for_second_prompt = second_sample["retrieved_contexts"]

print(f"Retrieved contexts for the first prompt: {retrieved_contexts_for_first_prompt}\n")
print(f"Retrieved contexts for the second prompt: {retrieved_contexts_for_second_prompt}\n")

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: ibm-granite/granite-embedding-125m-english


Retrieved contexts for the first prompt: ['Key market use cases. RAC is being adopted across various industries for diverse applications,\nincluding knowledge question answering, providing accurate answers in customer service using product\nmanuals or fax. Code generation, retrieving relevant code snippets and documentation to\nassist in code creation. Recommendation systems, enhancing recommendations by providing relevant\ncontext. Customer service, improving support accuracy with access to current product information.\nPersonal assistance, enabling more comprehensive and accurate information from AI assistants.\nMulti-hub question answering, handling complex multi-step questions through iterative retrieval.\nLegal applications, retrieving legal documents and case law for reliable legal opinions.\nGeneral task assistance, aiding users in various tasks requiring information access and decision\nmaking. The rising demand for hyper-personalized content in areas like marketing and e-comme

## 5. Evaluate Faithfulness Score for both prompts

In [7]:
first_prompt_turn = SingleTurnSample(
        user_input=user_input_for_first_prompt,
        response=response_for_first_prompt,
        retrieved_contexts=retrieved_contexts_for_first_prompt,
    )
faithfulness_scorer = Faithfulness(llm=evaluator_llm)
faithfulness_score_for_first_prompt = await faithfulness_scorer.single_turn_ascore(first_prompt_turn)
print(f"Faithfulness score for prompt '{user_prompts[0]}': {faithfulness_score_for_first_prompt}")

INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


Faithfulness score for prompt 'List RAG key market use cases': 1.0


In [8]:
second_prompt_turn = SingleTurnSample(
        user_input=user_input_for_second_prompt,
        response=response_for_second_prompt,
        retrieved_contexts=retrieved_contexts_for_second_prompt,
    )
faithfulness_score_for_second_prompt = await faithfulness_scorer.single_turn_ascore(second_prompt_turn)
print(f"Faithfulness score for prompt '{user_prompts[1]}': {faithfulness_score_for_second_prompt}")

INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
INFO:groq._base_client:Retrying request to /openai/v1/chat/completions in 9.000000 seconds
INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
INFO:groq._base_client:Retrying request to /openai/v1/chat/completions in 38.000000 seconds
INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


Faithfulness score for prompt 'Name Red Hat RAG target audience and customers': 0.8888888888888888


## 6. Evaluate Response Relevancy for both prompts

In [9]:
first_prompt_turn = SingleTurnSample(
        user_input=user_input_for_first_prompt,
        response=response_for_first_prompt,
        retrieved_contexts=retrieved_contexts_for_first_prompt,
    )
response_relevancy_scorer = ResponseRelevancy(llm=evaluator_llm, embeddings=evaluator_embeddings)
response_relevancy_score_for_first_prompt = await response_relevancy_scorer.single_turn_ascore(first_prompt_turn)
print(f"Response Relevancy score for prompt '{user_prompts[0]}': {response_relevancy_score_for_first_prompt}")

INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


Response Relevancy score for prompt 'List RAG key market use cases': 0.9039218462290007


In [10]:
second_prompt_turn = SingleTurnSample(
        user_input=user_input_for_second_prompt,
        response=response_for_second_prompt,
        retrieved_contexts=retrieved_contexts_for_second_prompt,
    )
response_relevancy_score_for_second_prompt = await response_relevancy_scorer.single_turn_ascore(second_prompt_turn)
print(f"Response Relevancy score for prompt '{user_prompts[1]}': {response_relevancy_score_for_second_prompt}")

INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
INFO:groq._base_client:Retrying request to /openai/v1/chat/completions in 14.000000 seconds
INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
INFO:groq._base_client:Retrying request to /openai/v1/chat/completions in 14.000000 seconds
INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
INFO:groq._base_client:Retrying request to /openai/v1/chat/completions in 14.000000 seconds
INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


Response Relevancy score for prompt 'Name Red Hat RAG target audience and customers': 0.9860429489968419
