# RAG with LlamaIndex

In [42]:
import llama_index
from accelerate.commands.config.update import description
from llama_index.core import SimpleDirectoryReader
from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter  # from text to chunks
from llama_index.embeddings.huggingface import HuggingFaceEmbedding  # from chunks to vectors
from llama_index.core.ingestion import IngestionPipeline
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import VectorStoreIndex
from llama_index.llms.huggingface_api import  HuggingFaceInferenceAPI
from llama_index.llms.ollama import Ollama
from llama_index.core.evaluation import  FaithfulnessEvaluator
from dotenv import load_dotenv
import os
load_dotenv()

True

In [43]:
# GLOBALS
HF_TOKEN = os.getenv('HF_TOKEN')
PHOENIX_API_KEY = os.getenv('PHOENIX_API_KEY')
model_name = 'BAAI/bge-small-en-v1.5'
big_model_name = 'Qwen/Qwen2.5-Coder-32B-Instruct'

In [40]:
os.environ["OTEL_EXPORTER_OTLP_HEADERS"] = f"api_key={PHOENIX_API_KEY}"
llama_index.core.set_global_handler(
    "arize_phoenix",
    endpoint="https://llamatrace.com/v1/traces"
)

NameError: name 'llama_index' is not defined

In [156]:
reader = SimpleDirectoryReader(input_dir='papers')
docs = reader.load_data()

In [157]:
type(docs)

list

In [158]:
pipeline = IngestionPipeline(
    transformations=[
        SentenceSplitter(chunk_overlap=0),
        HuggingFaceEmbedding(model_name=model_name)
    ]
)

In [159]:
nodes = await pipeline.arun(documents=docs)

In [160]:
type(nodes)

list

In [161]:
db = chromadb.PersistentClient(path='./alfred_chroma_db')
chroma_collection = db.get_or_create_collection('alfred')
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

pipeline = IngestionPipeline(
    transformations=[
        SentenceSplitter(chunk_overlap=0),
        HuggingFaceEmbedding(model_name=model_name)
    ],
    vector_store=vector_store
)

In [162]:
nodes = await pipeline.arun(documents=docs)

In [163]:
embed_model = HuggingFaceEmbedding(model_name=model_name)
# index = VectorStoreIndex.from_vector_store(vector_store, embed_model=embed_model)
index = VectorStoreIndex.from_documents(docs, embed_model=embed_model)

In [199]:
llm = HuggingFaceInferenceAPI(model_name=big_model_name, token=HF_TOKEN)
# llm = Ollama(model="gemma3:1b", request_timeout=120.0)
# llm = Ollama(model="llama3.2:latest", request_timeout=120.0)
# llm = Ollama(model="qwen3:8b", request_timeout=120.0)
# llm = Ollama(model="qwen3:1.7b", request_timeout=120.0)

In [200]:
query_engine = index.as_query_engine(
    llm=llm,
    response_mode="tree_summarize",
    # response_mode="compact",
    # response_mode="refine",
)
answer = query_engine.query("What is CGA?")
print(answer)

CGA is an algorithm designed to solve Single-Agent Corridor Goal (SACG) problems and Multi-Agent Path Finding (MAPF) problems. It operates by selecting corridors in a graph and ensuring that agents can evacuate these corridors optimally. The algorithm prioritizes agents and plans their movements accordingly, updating plans as necessary and ensuring that all agents can reach their goals in a finite amount of time under certain conditions. CGA is implemented in a way that it can handle both single-agent and multi-agent scenarios, with specific procedures to manage corridors and agent priorities.


In [188]:
from llama_index.core import VectorStoreIndex, get_response_synthesizer
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine

# build index
index = VectorStoreIndex.from_documents(docs, embed_model=embed_model)

# configure retriever
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=2,
    embed_model=embed_model
)

# configure response synthesizer
response_synthesizer = get_response_synthesizer(
    response_mode="tree_summarize", llm=llm
)

# assemble query engine
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
)

In [189]:
# query
response = query_engine.query("What is CGA stands for?")
print(response)

KeyboardInterrupt: 

In [168]:
evaluator = FaithfulnessEvaluator(llm=llm)

In [169]:
eval_result = await evaluator.aevaluate_response(response=response)
# print(eval_result)
print(eval_result.passing)

True


---

# Tools in LlamaIndex


In [213]:
from llama_index.core.tools import FunctionTool
from llama_index.core.tools import QueryEngineTool
from llama_index.tools.google import GmailToolSpec
from llama_index.tools.mcp import BasicMCPClient, McpToolSpec

In [177]:
def get_weather(location: str) -> str:
    """Useful for getting the weather for a given location."""
    print(f'Getting weather for {location}')
    return f"The weather in {location} is sunny"


In [259]:
weather_tool = FunctionTool.from_defaults(
    get_weather,
    name='my_weather_tool',
    description="Useful for getting the weather for a given location.",
    return_direct=False,
)

In [179]:
weather_tool.call('New York')

Getting weather for New York


ToolOutput(content='The weather in New York is sunny', tool_name='my_weather_tool', raw_input={'args': ('New York',), 'kwargs': {}}, raw_output='The weather in New York is sunny', is_error=False)

In [258]:
query_engine_tool = QueryEngineTool.from_defaults(
    query_engine,
    name='search in docs',
    description='Useful to search for information in my docs.'
)

In [205]:
query_engine_tool.call("What is CGA stands for? Answer short.")

ToolOutput(content='CGA stands for Corridor Guided Algorithm.', tool_name='search_in_docs', raw_input={'input': 'What is CGA stands for? Answer short.'}, raw_output=Response(response='CGA stands for Corridor Guided Algorithm.', source_nodes=[NodeWithScore(node=TextNode(id_='00763113-7e2e-4de8-8575-73f2f3c79177', embedding=None, metadata={'page_label': '6', 'file_name': '2024_CGA.pdf', 'file_path': '/Users/perchik/PycharmProjects/Learning_LLM_Agents/papers/2024_CGA.pdf', 'file_type': 'application/pdf', 'file_size': 2606327, 'creation_date': '2025-05-16', 'last_modified_date': '2025-01-09'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='e713b66f-133c-475e-9282-dfb4b8621161', node_type='4', metadata={'page_l

In [212]:
gmail_tool_spec = GmailToolSpec()
gmail_tool_spec_list = gmail_tool_spec.to_tool_list()
for t in gmail_tool_spec_list:
    print(t.metadata.name)
    print(t.metadata.description)
    print('---')

load_data
load_data() -> List[llama_index.core.schema.Document]
Load emails from the user's account.
---
search_messages
search_messages(query: str, max_results: Optional[int] = None)
Searches email messages given a query string and the maximum number
        of results requested by the user
           Returns: List of relevant message objects up to the maximum number of results.

        Args:
            query[str]: The user's query
            max_results (Optional[int]): The maximum number of search results
            to return.
        
---
create_draft
create_draft(to: Optional[List[str]] = None, subject: Optional[str] = None, message: Optional[str] = None) -> str
Create and insert a draft email.
           Print the returned draft's message and id.
           Returns: Draft object, including draft id and message meta data.

        Args:
            to (Optional[str]): The email addresses to send the message to
            subject (Optional[str]): The subject for the event
    

In [216]:
from llama_index.tools.mcp import BasicMCPClient, McpToolSpec

# We consider there is a mcp server running on 127.0.0.1:8000, or you can use the mcp client to connect to your own mcp server.
mcp_client = BasicMCPClient("http://127.0.0.1:8000/sse")
mcp_tool = McpToolSpec(client=mcp_client)

# Using Agents in LlamaIndex

In [271]:
from llama_index.core.agent.workflow import AgentWorkflow
from llama_index.core.agent.workflow import FunctionAgent, ReActAgent
from llama_index.core.tools import FunctionTool
from llama_index.core.workflow import Context

In [290]:
# llm = HuggingFaceInferenceAPI(model_name=big_model_name, token=HF_TOKEN)
llm = Ollama(model="ebdm/gemma3-enhanced:12b", request_timeout=120.0)
# llm = Ollama(model="gemma3:1b", request_timeout=120.0)
# llm = Ollama(model="PetrosStav/gemma3-tools:4b", request_timeout=120.0)
# llm = Ollama(model="llama3.2:latest", request_timeout=120.0)
# llm = Ollama(model="qwen3:8b", request_timeout=120.0)
# llm = Ollama(model="qwen3:1.7b", request_timeout=120.0)

In [248]:
def multiply(a: int, b: int) -> int:
    """Multiplies two integers and returns the resulting integer."""
    return a * b

In [249]:
agent = AgentWorkflow.from_tools_or_functions(
    [FunctionTool.from_defaults(multiply)],
    llm=llm
)

In [250]:
ctx = Context(agent)

In [251]:
response = await agent.run('My name is Bob.', ctx=ctx)
response.response.content

'Hello Bob! It’s nice to meet you. How can I help you today?'

In [252]:
response = await agent.run('What is my name again?', ctx=ctx)
response.response.content

'Your name is Bob. 😊'

In [267]:
index = VectorStoreIndex.from_documents(docs, embed_model=embed_model)
query_engine = index.as_query_engine(llm=llm, similarity_top_k=3) # as shown in the Components in LlamaIndex section

query_engine_tool = QueryEngineTool.from_defaults(
    query_engine=query_engine,
    name="look in the docs",
    description="The docs file that is important for the output.",
    return_direct=False,
)
query_engine_agent = AgentWorkflow.from_tools_or_functions(
    [query_engine_tool],
    llm=llm,
    system_prompt="You are a helpful assistant that has access to a database containing persona descriptions. "
)
ctx = Context(query_engine_agent)

In [270]:
response = await query_engine_agent.run('Use docs. What does CGA stand for?', ctx=ctx)
print(response.response.content)

CGA stands for Corridor Generation Algorithm. It’s an algorithm designed to move agents through a space, creating corridors to guide them to their destinations.


##### MAS

In [291]:
async def add(a: int, b: int) -> int:
    """Adds two numbers."""
    return a + b

In [292]:
async def subtract(a: int, b: int) -> int:
    """Subtract two numbers."""
    return a - b

In [293]:
calculator_agent = ReActAgent(
    name="calculator",
    description="Performs basic arithmetic operations",
    system_prompt="You are a calculator assistant. Use your tools for any math operations.",
    tools=[add, subtract],
    llm=llm,
)

In [294]:
query_agent = ReActAgent(
    name="docs_lookup",
    description="Looks up information inside docs.",
    system_prompt="Use your tool to query a RAG system to answer information from docs.",
    tools=[query_engine_tool],
    llm=llm,
)

In [299]:
agent = AgentWorkflow(
    agents=[calculator_agent, query_agent], root_agent='calculator'
)

In [297]:
response = await agent.run(user_msg='Can you add 5 and 3?')
# response = await agent.run(user_msg='What is DCOP? Look up in docs.')
print(response.response.content)

I cannot answer the question with the provided tools.


# Agentic Workflows

In [2]:
import llama_index
from llama_index.core.workflow import StartEvent, StopEvent, Workflow, step, Event, Context
import random
from llama_index.utils.workflow import draw_all_possible_flows
from llama_index.core.agent.workflow import AgentWorkflow, ReActAgent
from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI
from llama_index.llms.ollama import Ollama
from dotenv import load_dotenv
import os
load_dotenv()

True

In [3]:
HF_TOKEN = os.getenv('HF_TOKEN')
PHOENIX_API_KEY = os.getenv('PHOENIX_API_KEY')
model_name = 'BAAI/bge-small-en-v1.5'
big_model_name = 'Qwen/Qwen2.5-Coder-32B-Instruct'

In [4]:
os.environ["OTEL_EXPORTER_OTLP_HEADERS"] = f"api_key={PHOENIX_API_KEY}"
llama_index.core.set_global_handler(
    "arize_phoenix",
    endpoint="https://llamatrace.com/v1/traces"
)

In [5]:
class MyWorkflow(Workflow):
    @step
    async def my_step(self, ev: StartEvent) -> StopEvent:
        # do something here
        # print(type(ev))
        return StopEvent(result='Hello, World!')

In [6]:
w = MyWorkflow(timeout=10, verbose=False)

In [7]:
result = await w.run()
print(result)

Hello, World!


In [8]:
class ProcessingEvent(Event):
    intermediate_result: str

In [9]:
class MultiStepWorkflow(Workflow):
    @step
    async def step_one(self, ev: StartEvent) -> ProcessingEvent:
        # process initial data
        return ProcessingEvent(intermediate_result='Step 1 complete! YEEEEP')
    @step
    async def ste_two(self, ev: ProcessingEvent) -> StopEvent:
        # use the intermediate result
        final_result = f'Finished at: {ev.intermediate_result}'
        return StopEvent(result=final_result)

In [10]:
w = MultiStepWorkflow(timeout=10, verbose=True)

In [11]:
result = await w.run()
print(result)

Running step step_one
Step step_one produced event ProcessingEvent
Running step ste_two
Step ste_two produced event StopEvent
Finished at: Step 1 complete! YEEEEP


In [12]:
class LoopEvent(Event):
    loop_output: str

In [13]:
class MultiStepWorkflow(Workflow):
    @step
    async def step_one(self, ev: StartEvent | LoopEvent) -> ProcessingEvent | LoopEvent:
        if random.randint(0, 1) == 0:
            print('Bad thing happened')
            return LoopEvent(loop_output='Back to step one')
        else:
            print('Great thing happened')
            return ProcessingEvent(intermediate_result='First step is done')
    @step
    async def step_two(self, ev: ProcessingEvent) -> StopEvent:
        final_result = f'Finished at: {ev.intermediate_result}'
        return StopEvent(result=final_result)

In [14]:
w = MultiStepWorkflow(verbose=True)

In [15]:
result = await w.run()
print(result)

Running step step_one
Bad thing happened
Step step_one produced event LoopEvent
Running step step_one
Great thing happened
Step step_one produced event ProcessingEvent
Running step step_two
Step step_two produced event StopEvent
Finished at: First step is done


In [16]:
draw_all_possible_flows(w, 'flow.html')

flow.html


In [17]:
def add(a: int, b: int) -> int:
    """Add two numbers."""
    return a + b

def multiply(a: int, b: int) -> int:
    """Multiply two numbers."""
    return a * b

In [18]:
llm = HuggingFaceInferenceAPI(model_name=big_model_name)
# llm = Ollama(model="gemma3:1b", request_timeout=120.0)
# llm = Ollama(model="llama3.2:latest", request_timeout=120.0)
# llm = Ollama(model="qwen3:8b", request_timeout=120.0)
# llm = Ollama(model="qwen3:1.7b", request_timeout=120.0)
# llm = Ollama(model="mistral", request_timeout=120.0)  # 4.1 GB
# llm = Ollama(model="mistral:7b-instruct-q4_K_M", request_timeout=120.0)
# llm = Ollama(model="qwen2.5:0.5b", request_timeout=120.0)  # 398 MB

In [19]:
multiply_agent = ReActAgent(
    name='multiply_agent',
    description="Is able to multiply two integers",
    tools=[multiply],
    llm=llm,
)
addition_agent = ReActAgent(
    name='add_agent',
    description="Is able to add two integers",
    tools=[add],
    llm=llm,
)

In [20]:
# import ssl
# import certifi
#
# ssl_context = ssl.create_default_context(cafile=certifi.where())

workflow = AgentWorkflow(
    agents=[multiply_agent, addition_agent],
    root_agent='multiply_agent',
)

In [25]:

response = await workflow.run(user_msg='Can you add 5 and 3?')
print(response)

WorkflowRuntimeError: Error in step 'run_agent_step': Cannot connect to host router.huggingface.co:443 ssl:True [SSLCertVerificationError: (1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1000)')]

In [89]:
import certifi
import requests

response = requests.get("https://huggingface.co", verify=certifi.where())


In [91]:
import ssl
import certifi
ssl._create_default_https_context = lambda: ssl.create_default_context(cafile=certifi.where())

In [24]:
import ssl
import certifi

def get_secure_context():
    try:
        context = ssl.create_default_context(cafile=certifi.where())
        return context
    except Exception as e:
        # Fallback to default SSL context if certifi is not available
        return ssl.create_default_context()

# Usage
ssl_context = get_secure_context()