In [None]:
#!pip install llama-index-llms-huggingface
#!pip install llama-index-llms-huggingface-api llama-index-embeddings-huggingface
#!pip install llama-index
# !pip install hf_xet
#!pip install llama-index-vector-stores-chroma
#!pip install -U llama-index-callbacks-arize-phoenix
#!pip install llama-index-tools-google
# !pip install llama-index-tools-mcp --issues with this
!pip install llama-index-utils-workflow


In [None]:
# Minimal LLM component: text completion through the Hugging Face Inference API.
import os

from dotenv import load_dotenv
from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI

# Pull the variables of a local .env file into the process environment, then
# read the Hugging Face access token (HF_TOKEN) from it.
load_dotenv()
hf_token = os.environ.get("HF_TOKEN")

# All client settings live in one mapping that is unpacked into the constructor.
llm_settings = {
    "model_name": "Qwen/Qwen2.5-Coder-32B-Instruct",
    "temperature": 0.7,
    "max_tokens": 100,
    "token": hf_token,
    "provider": "auto",
}
llm = HuggingFaceInferenceAPI(**llm_settings)

# Send a single prompt and print the completion.
response = llm.complete("Hello, how are you?")
print(response)
# Hello! I'm just a computer program, so I don't have feelings, but I'm here and ready to help you. How can I assist you today?

In [None]:
# While LlamaIndex has many components, we’ll focus specifically on the QueryEngine component. Why? Because it 
# can be used as a Retrieval-Augmented Generation (RAG) tool for an agent.

# So, what is RAG? LLMs are trained on enormous bodies of data to learn general knowledge. However, they may not 
# be trained on relevant and up-to-date data. RAG solves this problem by finding and retrieving relevant information 
# from your data and giving that to the LLM.


# There are five key stages within RAG, which in turn will be a part of most larger applications you build. These are:

# 1. Loading: this refers to getting your data from where it lives — whether it’s text files, PDFs, another website, 
#    a database, or an API — into your workflow. LlamaHub provides hundreds of integrations to choose from.

# 2. Indexing: this means creating a data structure that allows for querying the data. For LLMs, this nearly always 
#    means creating vector embeddings, which are numerical representations of the meaning of the data. Indexing can 
#    also refer to numerous other metadata strategies to make it easy to accurately find contextually relevant data based on properties.

# 3. Storing: once your data is indexed you will want to store your index, as well as other metadata, to avoid having to re-index it.
# 4. Querying: for any given indexing strategy there are many ways you can utilize LLMs and LlamaIndex data structures 
#    to query, including sub-queries, multi-step queries and hybrid strategies.

# 5. Evaluation: a critical step in any flow is checking how effective it is relative to other strategies, or when you 
#    make changes. Evaluation provides objective measures of how accurate, faithful and fast your responses to queries are.

In [None]:
# Loading and embedding documents
# As mentioned before, LlamaIndex can work on top of your own data, however, before accessing data, we need to load it. 
# There are three main ways to load data into LlamaIndex:
# 1. SimpleDirectoryReader: A built-in loader for various file types from a local directory.
# 2. LlamaParse: LlamaParse, LlamaIndex’s official tool for PDF parsing, available as a managed API.
# 3. LlamaHub: A registry of hundreds of data-loading libraries to ingest data from any source.

# The SimpleDirectoryReader component can load various file types from a folder and convert them into Document objects that LlamaIndex can work with:
#   from llama_index.core import SimpleDirectoryReader
#   reader = SimpleDirectoryReader(input_dir="path/to/directory")
#   documents = reader.load_data()


# After loading our documents, we need to break them into smaller pieces called Node objects. A Node is just a chunk 
# of text from the original document that’s easier for the AI to work with, while it still has references to the original 
# Document object.

# The IngestionPipeline helps us create these nodes through two key transformations.

# SentenceSplitter breaks down documents into manageable chunks by splitting them at natural sentence boundaries.
# HuggingFaceEmbedding converts each chunk into numerical embeddings - vector representations that capture the semantic 
# meaning in a way AI can process efficiently.
# This process helps us organise our documents in a way that’s more useful for searching and analysis.

from llama_index.core import Document
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.ingestion import IngestionPipeline

# create the pipeline with transformations
pipeline = IngestionPipeline(
    transformations=[
        SentenceSplitter(chunk_overlap=0),
        HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5"),
    ]
)

nodes = await pipeline.arun(documents=[Document.example()])

In [None]:
# Storing and indexing documents
# After creating our Node objects we need to index them to make them searchable, but before we can do that, we need a place 
# to store our data.
# Since we are using an ingestion pipeline, we can directly attach a vector store to the pipeline to populate it. In this case,
# we will use Chroma to store our documents.

import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore

db = chromadb.PersistentClient(path="./alfred_chroma_db")
chroma_collection = db.get_or_create_collection("alfred")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

pipeline = IngestionPipeline(
    transformations=[
        SentenceSplitter(chunk_size=25, chunk_overlap=0),
        HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5"),
    ],
    vector_store=vector_store,
)


# This is where vector embeddings come in - by embedding both the query and nodes in the same vector space, we can find 
# relevant matches. The VectorStoreIndex handles this for us, using the same embedding model we used during ingestion to 
# ensure consistency. Let’s see how to create this index from our vector store and embeddings:
from llama_index.core import VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
index = VectorStoreIndex.from_vector_store(vector_store, embed_model=embed_model)

# All information is automatically persisted within the ChromaVectorStore object and the passed directory path.




In [None]:
# Querying a VectorStoreIndex with prompts and LLMs
# Before we can query our index, we need to convert it to a query interface. The most common conversion options are:

# -- as_retriever: For basic document retrieval, returning a list of NodeWithScore objects with similarity scores
# -- as_query_engine: For single question-answer interactions, returning a written response
# -- as_chat_engine: For conversational interactions that maintain memory across multiple messages, returning a written 
#                    response using chat history and indexed context
# We’ll focus on the query engine since it is more common for agent-like interactions. We also pass in an LLM to the query 
# engine to use for the response.

from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI

# LLM that writes the final answer from the retrieved chunks.
llm = HuggingFaceInferenceAPI(model_name="Qwen/Qwen2.5-Coder-32B-Instruct")

# Turn the index into a question-answering interface; the options are gathered
# in a mapping and unpacked into as_query_engine().
engine_options = {"llm": llm, "response_mode": "tree_summarize"}
query_engine = index.as_query_engine(**engine_options)

# Ask a single question; as the last expression of the cell its response is displayed.
question = "What is the meaning of life?"
query_engine.query(question)
# The meaning of life is 42

In [None]:
# Response Processing
# Under the hood, the query engine doesn’t only use the LLM to answer the question but also uses a ResponseSynthesizer as a 
# strategy to process the response. Once again, this is fully customisable but there are three main strategies that work well 
# out of the box:

# refine: create and refine an answer by sequentially going through each retrieved text chunk. This makes a separate LLM call 
#         per Node/retrieved chunk.
# compact (default): similar to refining but concatenating the chunks beforehand, resulting in fewer LLM calls.
# tree_summarize: create a detailed answer by going through each retrieved text chunk and creating a tree structure of the answer.

# The language model won’t always perform in predictable ways, so we can’t be sure that the answer we get is always correct. 
# We can deal with this by evaluating the quality of the answer.

In [None]:
# Evaluation and observability
# LlamaIndex provides built-in evaluation tools to assess response quality. These evaluators leverage LLMs to analyze responses 
# across different dimensions. Let’s look at the three main evaluators available:
# FaithfulnessEvaluator: Evaluates the faithfulness of the answer by checking if the answer is supported by the context.
# AnswerRelevancyEvaluator: Evaluates the relevance of the answer by checking if the answer is relevant to the question.
# CorrectnessEvaluator: Evaluates the correctness of the answer by checking if the answer is correct.
from llama_index.core.evaluation import FaithfulnessEvaluator

# `query_engine` and `llm` are reused from the previous section (the querying cell).
# They were previously "assigned" here as `name = # comment`, which is a syntax
# error and prevented the whole cell from running.

# query index
evaluator = FaithfulnessEvaluator(llm=llm)
response = query_engine.query(
    "What battles took place in New York City in the American Revolution?"
)
# Judge whether the generated answer is supported by the retrieved context.
eval_result = evaluator.evaluate_response(response=response)
eval_result.passing


# Even without direct evaluation, we can gain insights into how our system is performing through observability. This is especially 
# useful when we are building more complex workflows and want to understand how each component is performing.
# we can install the LlamaTrace callback from Arize Phoenix with the following command: pip install -U llama-index-callbacks-arize-phoenix
# Additionally, we need to set the PHOENIX_API_KEY environment variable to our LlamaTrace API key. We can get this by:
# Creating an account at LlamaTrace - https://llamatrace.com/login
# Generating an API key in your account settings
# Using the API key in the code below to enable tracing

# Importing the `core` subpackage explicitly guarantees that
# `llama_index.core.set_global_handler` is resolvable; a bare `import llama_index`
# only works if some earlier import happened to load `llama_index.core`.
import llama_index.core
import os

# Read the LlamaTrace API key from the PHOENIX_API_KEY environment variable so
# the real key never has to be written into the notebook. The literal
# placeholder is kept as the fallback when the variable is not set.
PHOENIX_API_KEY = os.environ.get("PHOENIX_API_KEY", "<PHOENIX_API_KEY>")
os.environ["OTEL_EXPORTER_OTLP_HEADERS"] = f"api_key={PHOENIX_API_KEY}"

# Register the Arize Phoenix handler globally, pointing it at the LlamaTrace endpoint.
llama_index.core.set_global_handler(
    "arize_phoenix",
    endpoint="https://llamatrace.com/v1/traces"
)




In [None]:
# ABOVE - how to use components to create a QueryEngine.
# BELOW - how we can use the QueryEngine as a tool for an agent

In [None]:
# There are four main types of tools in LlamaIndex:

# 1. FunctionTool: Convert any Python function into a tool that an agent can use. It automatically figures out how the function works.
# 2. QueryEngineTool: A tool that lets agents use query engines. Since agents are built on query engines, they can also use other agents as tools.
# 3. Toolspecs: Sets of tools created by the community, which often include tools for specific services like Gmail.
# 4. Utility Tools: Special tools that help handle large amounts of data from other tools.





# Creating a FunctionTool
# A FunctionTool provides a simple way to wrap any Python function and make it available to an agent. You can pass either a 
# synchronous or asynchronous function to the tool, along with optional name and description parameters. The name and description 
# are particularly important as they help the agent understand when and how to use the tool effectively. Let’s look at how to 
# create a FunctionTool below and then call it.
from llama_index.core.tools import FunctionTool

def get_weather(location: str) -> str:
    """Useful for getting the weather for a given location."""
    # Stubbed lookup: log the request, then report the same canned forecast
    # for whatever location was asked about.
    print(f"Getting weather for {location}")
    forecast = f"The weather in {location} is sunny"
    return forecast

# Wrap the function as a tool. The explicit name and description are the
# metadata attached to the tool.
weather_tool_metadata = {
    "name": "my_weather_tool",
    "description": "Useful for getting the weather for a given location.",
}
tool = FunctionTool.from_defaults(get_weather, **weather_tool_metadata)

# Invoke the tool directly, the same way an agent would.
tool.call("New York")
# When using an agent or LLM with function calling, the tool selected (and the arguments written for that tool) rely strongly 
# on the tool name and description of the purpose and arguments of the tool.




# Creating a QueryEngineTool
# The QueryEngine we defined in the previous unit can be easily transformed into a tool using the QueryEngineTool class. 
# Let’s see how to create a QueryEngineTool from a QueryEngine in the example below.
# chromadb is imported here as well so this section does not depend on an
# earlier cell having imported it.
import chromadb
from llama_index.core import VectorStoreIndex
from llama_index.core.tools import QueryEngineTool
from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore

embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

# Re-open the persistent Chroma collection and expose it as a vector index.
db = chromadb.PersistentClient(path="./alfred_chroma_db")
chroma_collection = db.get_or_create_collection("alfred")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

index = VectorStoreIndex.from_vector_store(vector_store, embed_model=embed_model)

llm = HuggingFaceInferenceAPI(model_name="Qwen/Qwen2.5-Coder-32B-Instruct")
query_engine = index.as_query_engine(llm=llm)

# The tool name is an identifier the agent has to reproduce verbatim when it
# calls the tool, so it must not contain spaces.
tool = QueryEngineTool.from_defaults(query_engine, name="some_useful_name", description="some useful description")



# Creating Toolspecs
# Think of ToolSpecs as collections of tools that work together harmoniously - like a well-organized professional toolkit. 
# Just as a mechanic’s toolkit contains complementary tools that work together for vehicle repairs, a ToolSpec combines 
# related tools for specific purposes. For example, an accounting agent’s ToolSpec might elegantly integrate spreadsheet 
# capabilities, email functionality, and calculation tools to handle financial tasks with precision and efficiency.
# And now we can load the toolspec and convert it to a list of tools.
from llama_index.tools.google import GmailToolSpec

# A ToolSpec bundles related tools; to_tool_list() expands it into the
# individual tool objects.
tool_spec = GmailToolSpec()
tool_spec_list = tool_spec.to_tool_list()

# Summarise every tool as a (name, description) pair taken from its metadata.
[
    (metadata.name, metadata.description)
    for metadata in (spec_tool.metadata for spec_tool in tool_spec_list)
]



# LlamaIndex also allows using MCP tools through a ToolSpec on the LlamaHub. You can simply run an MCP server and start 
# using it through the following implementation.
from llama_index.tools.mcp import BasicMCPClient, McpToolSpec

# We consider there is a mcp server running on 127.0.0.1:8000, or you can use the mcp client to connect to your own mcp server.
mcp_client = BasicMCPClient("http://127.0.0.1:8000/sse")
mcp_tool = McpToolSpec(client=mcp_client)

# get the agent
agent = await get_agent(mcp_tool)

# create the agent context
agent_context = Context(agent)




# Utility Tools
# Oftentimes, directly querying an API can return an excessive amount of data, some of which may be irrelevant, overflow 
# the context window of the LLM, or unnecessarily increase the number of tokens that you are using. Let’s walk through our 
# two main utility tools below.

# OnDemandToolLoader: This tool turns any existing LlamaIndex data loader (BaseReader class) into a tool that an agent can use. 
# The tool can be called with all the parameters needed to trigger load_data from the data loader, along with a natural language 
# query string. During execution, we first load data from the data loader, index it (for instance with a vector store), and then 
# query it ‘on-demand’. All three of these steps happen in a single tool call.

# LoadAndSearchToolSpec: The LoadAndSearchToolSpec takes in any existing Tool as input. As a tool spec, it implements to_tool_list, 
# and when that function is called, two tools are returned: a loading tool and then a search tool. The load Tool execution would call 
# the underlying Tool, and then index the output (by default with a vector index). The search Tool execution would take in a query 
# string as input and call the underlying index.



In [None]:
# Using Agents in LlamaIndex

# LlamaIndex supports three main types of reasoning agents:
# 1. Function Calling Agents - These work with AI models that can call specific functions.
# 2. ReAct Agents - These can work with any LLM that exposes a chat or text-completion endpoint, and they can handle complex reasoning tasks.
# 3. Advanced Custom Agents - These use more complex methods to deal with more complex tasks and workflows.





# Initialising Agents
# To create an agent, we start by providing it with a set of functions/tools that define its capabilities. 
# Let’s look at how to create an agent with some basic tools. As of this writing, the agent will automatically 
# use the function calling API (if available), or a standard ReAct agent loop.

# LLMs that support a tools/functions API are relatively new, but they provide a powerful way to call tools by 
# avoiding specific prompting and allowing the LLM to create tool calls based on provided schemas.

# ReAct agents are also good at complex reasoning tasks and can work with any LLM that has chat or text completion 
# capabilities. They are more verbose, and show the reasoning behind certain actions that they take.
from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI
from llama_index.core.agent.workflow import AgentWorkflow
from llama_index.core.tools import FunctionTool

# define sample Tool -- type annotations, function names, and docstrings, are all included in parsed schemas!
def multiply(a: int, b: int) -> int:
    """Multiplies two integers and returns the resulting integer"""
    # The signature and docstring above are parsed into the tool schema, so
    # they are left untouched; only the body is spelled out.
    product = a * b
    return product

# initialize llm
llm = HuggingFaceInferenceAPI(model_name="Qwen/Qwen2.5-Coder-32B-Instruct")

# initialize agent
agent = AgentWorkflow.from_tools_or_functions(
    [FunctionTool.from_defaults(multiply)],
    llm=llm
)

# Agents are stateless by default, however, they can remember past interactions using a Context object. This might 
# be useful if you want to use an agent that needs to remember previous interactions, like a chatbot that maintains 
# context across multiple messages or a task manager that needs to track progress over time.
# stateless
response = await agent.run("What is 2 times 2?")

# remembering state
from llama_index.core.workflow import Context

ctx = Context(agent)

response = await agent.run("My name is Bob.", ctx=ctx)
response = await agent.run("What was my name again?", ctx=ctx)
# You’ll notice that agents in LlamaIndex are async because they use Python’s await operator.





# Creating RAG Agents with QueryEngineTools
# Agentic RAG is a powerful way to use agents to answer questions about your data. We can pass various tools to Alfred 
# to help him answer questions. However, instead of answering the question on top of documents automatically, Alfred can 
# decide to use any other tool or flow to answer the question.
# It is easy to wrap QueryEngine as a tool for an agent. When doing so, we need to define a name and description. 
# The LLM will use this information to correctly use the tool. Let’s see how to load in a QueryEngineTool using the 
# QueryEngine we created in the component section.
from llama_index.core.tools import QueryEngineTool

# Query engine over the index built in the components section, retrieving the
# three most similar chunks per query.
engine_options = {"llm": llm, "similarity_top_k": 3}
query_engine = index.as_query_engine(**engine_options)

# Expose the query engine as a tool; return_direct=False is passed through unchanged.
query_engine_tool = QueryEngineTool.from_defaults(
    return_direct=False,
    description="a specific description",
    name="name",
    query_engine=query_engine,
)

# Agent whose only tool is the query engine.
persona_system_prompt = "You are a helpful assistant that has access to a database containing persona descriptions. "
query_engine_agent = AgentWorkflow.from_tools_or_functions(
    [query_engine_tool],
    llm=llm,
    system_prompt=persona_system_prompt,
)



# Creating Multi-agent systems
# The AgentWorkflow class also directly supports multi-agent systems. By giving each agent a name and description, 
# the system maintains a single active speaker, with each agent having the ability to hand off to another agent.
# By narrowing the scope of each agent, we can help increase their general accuracy when responding to user messages.
# Agents in LlamaIndex can also directly be used as tools for other agents, for more complex and custom scenarios.
from llama_index.core.agent.workflow import (
    AgentWorkflow,
    FunctionAgent,
    ReActAgent,
)

# Define some tools
def add(a: int, b: int) -> int:
    """Add two numbers."""
    # Docstring and signature feed the tool schema and are kept verbatim.
    total = a + b
    return total


def subtract(a: int, b: int) -> int:
    """Subtract two numbers."""
    # Docstring and signature feed the tool schema and are kept verbatim.
    difference = a - b
    return difference


# Create agent configs
# NOTE: we can use FunctionAgent or ReActAgent here.
# FunctionAgent works for LLMs with a function calling API.
# ReActAgent works for any LLM.
calculator_agent = ReActAgent(
    name="calculator",
    description="Performs basic arithmetic operations",
    system_prompt="You are a calculator assistant. Use your tools for any math operation.",
    tools=[add, subtract],
    llm=llm,
)

query_agent = ReActAgent(
    name="info_lookup",
    description="Looks up information about XYZ",
    system_prompt="Use your tool to query a RAG system to answer information about XYZ",
    tools=[query_engine_tool],
    llm=llm
)

# Create and run the workflow
agent = AgentWorkflow(
    agents=[calculator_agent, query_agent], root_agent="calculator"
)

# Run the system
response = await agent.run(user_msg="Can you add 5 and 3?")

In [None]:
# Creating agentic workflows in LlamaIndex

# Basic Workflow Creation
from llama_index.core.workflow import StartEvent, StopEvent, Workflow, step

class MyWorkflow(Workflow):
    @step
    async def my_step(self, ev: StartEvent) -> StopEvent:
        # do something here
        return StopEvent(result="Hello, world!")

w = MyWorkflow(timeout=10, verbose=False)
result = await w.run()





# Connecting Multiple Steps
from llama_index.core.workflow import Event

class ProcessingEvent(Event):
    intermediate_result: str

class MultiStepWorkflow(Workflow):
    @step
    async def step_one(self, ev: StartEvent) -> ProcessingEvent:
        # Process initial data
        return ProcessingEvent(intermediate_result="Step 1 complete")

    @step
    async def step_two(self, ev: ProcessingEvent) -> StopEvent:
        # Use the intermediate result
        final_result = f"Finished processing: {ev.intermediate_result}"
        return StopEvent(result=final_result)

w = MultiStepWorkflow(timeout=10, verbose=False)
result = await w.run()
result
# The type hinting is important here, as it ensures that the workflow is executed correctly.




# Loops and Branches
# example of creating a loop by using the union operator |. In the example below, we see that the LoopEvent 
# is taken as input for the step and can also be returned as output.
from llama_index.core.workflow import Event
import random

class ProcessingEvent(Event):
    intermediate_result: str

class LoopEvent(Event):
    loop_output: str

class MultiStepWorkflow(Workflow):
    @step
    async def step_one(self, ev: StartEvent | LoopEvent) -> ProcessingEvent | LoopEvent:
        if random.randint(0, 1) == 0:
            print("Bad thing happened")
            return LoopEvent(loop_output="Back to step one.")
        else:
            print("Good thing happened")
            return ProcessingEvent(intermediate_result="First step complete.")

    @step
    async def step_two(self, ev: ProcessingEvent) -> StopEvent:
        # Use the intermediate result
        final_result = f"Finished processing: {ev.intermediate_result}"
        return StopEvent(result=final_result)

w = MultiStepWorkflow(verbose=False)
result = await w.run()
result





# Drawing Workflows
from llama_index.utils.workflow import draw_all_possible_flows

# `w` is the workflow instance created in the previous section. It must not be
# rebound to the `...` placeholder here: that would pass Ellipsis instead of a
# workflow to the drawing helper.
# Render every possible path through the workflow into an HTML file.
draw_all_possible_flows(w, "flow.html")





# State Management
# State management is useful when you want to keep track of the state of the workflow, so that every step 
# has access to the same state. We can do this by using the Context type hint on top of a parameter in 
# the step function.
from llama_index.core.workflow import Context, StartEvent, StopEvent, Workflow, step


class StatefulWorkflow(Workflow):
    """Workflow showing how a step reads and writes shared state through Context.

    The step is defined as a method of a Workflow subclass: `@step` on a
    module-level function that takes `self` (and passes no `workflow=`
    argument) cannot be registered with any workflow.
    """

    @step
    async def query(self, ctx: Context, ev: StartEvent) -> StopEvent:
        """Store a query in the workflow context, read it back, and stop.

        Args:
            ctx: Workflow context, injected because of the `Context` type hint.
            ev: The event that starts the workflow.

        Returns:
            A StopEvent carrying `val` as the workflow result.
        """
        # store query in the context
        await ctx.store.set("query", "What is the capital of France?")

        # do something with context and event
        # (`...` is a placeholder for the real computation of the result)
        val = ...

        # retrieve query from the context
        query = await ctx.store.get("query")

        return StopEvent(result=val)

In [None]:
# Automating workflows with Multi-Agent Workflows

# Instead of manual workflow creation, we can use the AgentWorkflow class to create a multi-agent workflow. 
# The AgentWorkflow uses Workflow Agents to allow you to create a system of one or more agents that can collaborate 
# and hand off tasks to each other based on their specialized capabilities. This enables building complex agent 
# systems where different agents handle different aspects of a task. Instead of importing classes from 
# llama_index.core.agent, we will import the agent classes from llama_index.core.agent.workflow. One agent must be 
# designated as the root agent in the AgentWorkflow constructor. 
# When a user message comes in, it is first routed to the root agent. Each agent can then:
# - Handle the request directly using their tools
# - Handoff to another agent better suited for the task
# - Return a response to the user

from llama_index.core.agent.workflow import AgentWorkflow, ReActAgent
from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI

# Define some tools
def add(a: int, b: int) -> int:
    """Add two numbers."""
    # Docstring and signature are parsed for the tool name/description, so
    # they are kept verbatim.
    total = a + b
    return total

def multiply(a: int, b: int) -> int:
    """Multiply two numbers."""
    # Docstring and signature are parsed for the tool name/description, so
    # they are kept verbatim.
    product = a * b
    return product

llm = HuggingFaceInferenceAPI(model_name="Qwen/Qwen2.5-Coder-32B-Instruct")

# we can pass functions directly without FunctionTool -- the fn/docstring are parsed for the name/description
multiply_agent = ReActAgent(
    name="multiply_agent",
    description="Is able to multiply two integers",
    system_prompt="A helpful assistant that can use a tool to multiply numbers.",
    tools=[multiply],
    llm=llm,
)

addition_agent = ReActAgent(
    name="add_agent",
    description="Is able to add two integers",
    system_prompt="A helpful assistant that can use a tool to add numbers.",
    tools=[add],
    llm=llm,
)

# Create the workflow
workflow = AgentWorkflow(
    agents=[multiply_agent, addition_agent],
    root_agent="multiply_agent",
)

# Run the system
response = await workflow.run(user_msg="Can you add 5 and 3?")



# Agent tools can also modify the workflow state we mentioned earlier. Before starting the workflow, we can provide 
# an initial state dict that will be available to all agents. The state is stored in the state key of the workflow 
# context. It will be injected into the state_prompt which augments each new user message.
# Let’s inject a counter to count function calls by modifying the previous example:
from llama_index.core.workflow import Context

# Define some tools
async def add(ctx: Context, a: int, b: int) -> int:
    """Add two numbers."""
    # Bump the call counter kept under the "state" key of the context store,
    # write the updated state back, then compute the sum.
    state = await ctx.store.get("state")
    state["num_fn_calls"] += 1
    await ctx.store.set("state", state)

    total = a + b
    return total

async def multiply(ctx: Context, a: int, b: int) -> int:
    """Multiply two numbers."""
    # Bump the call counter kept under the "state" key of the context store,
    # write the updated state back, then compute the product.
    state = await ctx.store.get("state")
    state["num_fn_calls"] += 1
    await ctx.store.set("state", state)

    product = a * b
    return product

...

workflow = AgentWorkflow(
    agents=[multiply_agent, addition_agent],
    root_agent="multiply_agent",
    initial_state={"num_fn_calls": 0},
    state_prompt="Current state: {state}. User message: {msg}",
)

# run the workflow with context
ctx = Context(workflow)
response = await workflow.run(user_msg="Can you add 5 and 3?", ctx=ctx)

# pull out and inspect the state
state = await ctx.store.get("state")
print(state["num_fn_calls"])

