## 1. Install the llama stack client

In [None]:
%pip install llama-stack

## 2. List available models

In [3]:
from llama_stack_client import LlamaStackClient
client = LlamaStackClient(base_url="http://lsd-llama-milvus-service:8321")
print(client.models.list())

# Report every vector database returned by the server, or say so when the
# listing comes back empty.
print("=== Available Vector Databases ===")
vector_dbs = client.vector_dbs.list()
if not vector_dbs:
    print("No vector databases found!")
else:
    for vdb in vector_dbs:
        # One print call per database; the trailing empty string yields the
        # blank separator line between entries.
        print(
            f"- ID: {vdb.identifier}",
            f"  Provider: {vdb.provider_id}",
            f"  Embedding Model: {vdb.embedding_model}",
            "",
            sep="\n",
        )

INFO:httpx:HTTP Request: GET http://lsd-llama-milvus-service:8321/v1/models "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET http://lsd-llama-milvus-service:8321/v1/vector-dbs "HTTP/1.1 200 OK"


[Model(identifier='vllm', metadata={}, api_model_type='llm', provider_id='vllm-inference', type='model', provider_resource_id='vllm', model_type='llm'), Model(identifier='ibm-granite/granite-embedding-125m-english', metadata={'embedding_dimension': 768.0}, api_model_type='embedding', provider_id='sentence-transformers', type='model', provider_resource_id='ibm-granite/granite-embedding-125m-english', model_type='embedding')]
=== Available Vector Databases ===
- ID: my_demo_image_ocr_vector_id
  Provider: milvus
  Embedding Model: ibm-granite/granite-embedding-125m-english



## 3. Import and run the KubeFlow Pipeline
Import the "[docling_convert_images_pipeline_ocr_only_compiled.yaml](./docling_convert_images_pipeline_ocr_only_compiled.yaml)" KubeFlow Pipeline into your pipeline server, then run the pipeline to convert your image files with OCR and insert the extracted text into the vector database.

When running the pipeline, you can customize the following parameters:

- `base_url`: Base URL to fetch the image files from
- `image_filenames`: Comma-separated list of PNG/JPG filenames to download and convert
- `num_workers`: Number of parallel workers
- `vector_db_id`: Milvus vector database ID
- `service_url`: Milvus service URL
- `embed_model_id`: Embedding model to use
- `max_tokens`: Maximum tokens per chunk
- `use_gpu`: Enable/disable GPU acceleration

Note: The compiled pipeline was generated by running `python docling_convert_images_pipeline_ocr_only.py`.

## 4. Prompt the LLM
Prompt the LLM with a question in relation to the documents inserted, and see it return accurate answers.

In [4]:
from llama_stack_client import Agent, AgentEventLogger
import uuid

# Tool spec for the built-in knowledge search, restricted to the
# `my_demo_image_ocr_vector_id` vector database.
knowledge_search_tool = {
    "name": "builtin::rag/knowledge_search",
    "args": {"vector_db_ids": ["my_demo_image_ocr_vector_id"]},
}

rag_agent = Agent(
    client,
    model="vllm",
    instructions="You are a helpful assistant",
    tools=[knowledge_search_tool],
)

prompt = "List RAG key market use cases"
print("prompt>", prompt)

# A random hex suffix gives each run its own session name.
session_name = f"s{uuid.uuid4().hex}"
session_id = rag_agent.create_session(session_name=session_name)

user_message = {"role": "user", "content": prompt}
response = rag_agent.create_turn(
    messages=[user_message],
    session_id=session_id,
    stream=True,
)

# Print each streamed event as it is produced by the logger.
event_logger = AgentEventLogger()
for log in event_logger.log(response):
    log.print()

INFO:httpx:HTTP Request: POST http://lsd-llama-milvus-service:8321/v1/agents "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET http://lsd-llama-milvus-service:8321/v1/tools?toolgroup_id=builtin%3A%3Arag%2Fknowledge_search "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://lsd-llama-milvus-service:8321/v1/agents/55314250-e41e-4fa3-896c-41262dc57472/session "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://lsd-llama-milvus-service:8321/v1/agents/55314250-e41e-4fa3-896c-41262dc57472/session/f415ad31-e62e-4698-8de4-1d807225841a/turn "HTTP/1.1 200 OK"


prompt> List RAG key market use cases
[33minference> [0m[97m[0m
[32mtool_execution> Tool:knowledge_search Args:{'query': 'RAG key market use cases'}[0m
[32mtool_execution> Tool:knowledge_search Response:[TextContentItem(text='knowledge_search tool found 2 chunks:\nBEGIN of knowledge_search tool results.\n', type='text'), TextContentItem(text="Result 1\nContent: Market Use Cases Key\nRAG is being adopted across various industries for diverse applications; including:\nKnowledge Question Answering: Providing accurate answers in customer service product manuals or FAQs. using\nCode Generation: Retrieving relevant code snippets and documentation to assist in code creation.\nRecommendation Systems: Enhancing recommendations by providing relevant context.\nCustomer Service: Improving support accuracy with access to current product information.\nPersonal Assistants: Enabling more comprehensive and accurate information from Al assistants .\nMulti-hop Question Answering: Handling complex;

In [5]:
prompt = "Describe the sequence of steps of the Ingestion Flow"
print("prompt>", prompt)

# This turn is sent on the existing `session_id` rather than on a newly
# created session.
follow_up_message = {"role": "user", "content": prompt}
response = rag_agent.create_turn(
    messages=[follow_up_message],
    session_id=session_id,
    stream=True,
)

# Print each streamed event as it is produced by the logger.
event_logger = AgentEventLogger()
for log in event_logger.log(response):
    log.print()

INFO:httpx:HTTP Request: POST http://lsd-llama-milvus-service:8321/v1/agents/55314250-e41e-4fa3-896c-41262dc57472/session/f415ad31-e62e-4698-8de4-1d807225841a/turn "HTTP/1.1 200 OK"


prompt> Describe the sequence of steps of the Ingestion Flow
[33minference> [0m[97m[0m
[32mtool_execution> Tool:knowledge_search Args:{'query': 'Ingestion Flow RAG sequence of steps'}[0m
[32mtool_execution> Tool:knowledge_search Response:[TextContentItem(text='knowledge_search tool found 2 chunks:\nBEGIN of knowledge_search tool results.\n', type='text'), TextContentItem(text="Result 1\nContent: Market Use Cases Key\nRAG is being adopted across various industries for diverse applications; including:\nKnowledge Question Answering: Providing accurate answers in customer service product manuals or FAQs. using\nCode Generation: Retrieving relevant code snippets and documentation to assist in code creation.\nRecommendation Systems: Enhancing recommendations by providing relevant context.\nCustomer Service: Improving support accuracy with access to current product information.\nPersonal Assistants: Enabling more comprehensive and accurate information from Al assistants .\nMulti-hop Qu

Alternatively, query chunks directly from the vector database.

In [None]:
# ID of the vector database to query. It must be defined in this cell because
# no other cell assigns `vector_db_id`; the value is the database ID that the
# RAG agent's knowledge-search tool is configured with.
vector_db_id = "my_demo_image_ocr_vector_id"

# Query the vector database directly, without going through the agent.
query_result = client.vector_io.query(
    vector_db_id=vector_db_id,
    query="what do you know about?",
)
print(query_result)

### Congratulations! You've successfully inserted your image documents via a KubeFlow Pipeline, and queried your RAG application using Llama Stack! 🎉🥳