# Llama Index Framework
This notebook is dedicated to learning about the Llama Index Framework for creating Agents

In [21]:
# Make sure to set up your environment correctly
# For this project I used python 3.12.3 

# Since we are using huggingface models, we need to install the huggingface_hub library
# %pip install -q llama-index-llms-huggingface-api llama-index-embeddings-huggingface
# %pip install -q lmstudio
# %pip install -q load-dotenv

In [None]:
# Try out the LM Studio SDK by listing the models that are downloaded locally.
import lmstudio as lms

# Every downloaded model, regardless of type
lmstudio_downloaded_models = lms.list_downloaded_models()
for downloaded in lmstudio_downloaded_models:
    print(f"Downloaded model: {downloaded}")

# Only the entries returned for the "llm" filter
lmstudio_llms_only = lms.list_downloaded_models("llm")
for llm_entry in lmstudio_llms_only:
    print(f"LLM: {llm_entry}")

# Only the entries returned for the "embedding" filter
lmstudio_embeddings_only = lms.list_downloaded_models("embedding")
for embedding_entry in lmstudio_embeddings_only:
    print(f"Embedding: {embedding_entry}")



In [23]:
import lmstudio as lms

# NOTE(review): this binds `lmstudio_llm` to the integer 78; a later cell rebinds
# the same name to `lms.llm()`. Looks like a leftover placeholder - confirm
# whether this cell is still needed.
lmstudio_llm = 78 

In [None]:
# Structured output
import lmstudio as lms
from pydantic import BaseModel

# Pydantic schema handed to `respond(..., response_format=Book)` further down in
# this cell to request a structured answer. Documented with comments rather than
# a class docstring so the schema Pydantic derives from the class stays as-is.
class Book(BaseModel):
    title: str
    author: str
    year: int
    rating: float

# Handle to the model that is currently loaded in LM Studio.
lmstudio_llm = lms.llm()

# The book title is now wrapped in a balanced pair of single quotes; the
# previous prompt opened the quote but never closed it.
prompt = "Tell me about the book 'The Great Gatsby'"

# Ask for an answer constrained to the `Book` schema.
response = lmstudio_llm.respond(
    prompt,
    response_format=Book,
)

# NOTE(review): the SDK presumably also exposes the parsed fields on the result
# (e.g. `response.parsed`) - confirm against the LM Studio SDK docs if the
# structured values are needed rather than the printed text.
print(response)


In [None]:
from dotenv import load_dotenv
import os

# Pull the variables defined in the .env file into the process environment.
load_dotenv()

# The Hugging Face token is required; stop right away when it is missing.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    raise ValueError("HF_TOKEN not found in environment variables. Please add it to your .env file")
print("HF_TOKEN found in environment variables")

# Explicit login to Hugging Face (currently disabled).
import huggingface_hub
# huggingface_hub.login(token=hf_token)


In [None]:
from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI
from dotenv import load_dotenv
import os

load_dotenv()

# Model served through the Hugging Face Inference API.
# Alternative model id: "Qwen/Qwen2.5-Coder-32B-Instruct"
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"

# The token is read from the environment instead of being written in the notebook.
llm = HuggingFaceInferenceAPI(
    token=os.environ.get("HF_TOKEN"),
    model_name=model_name,
    max_tokens=1000,
    temperature=0.7,
)

# Bare last expression so the notebook displays the completion.
llm.complete("Hello, how are you?")



# Key stages in RAG pipeline
1. Loading
2. Indexing
3. Storing
4. Querying
5. Evaluation

In [None]:
# Load data
from llama_index.core import SimpleDirectoryReader
from load_dotenv import load_dotenv
import os

# Re-read the .env file, letting its values override existing variables.
load_dotenv(override=True)

# The folder to ingest comes from DOCUMENTS_DIR; abort when it is not configured.
directory_path = os.environ.get("DOCUMENTS_DIR")
if not directory_path:
    raise ValueError("Directory path not found in environment variables. Please add it to your .env file")
print(f"Directory path: {directory_path}")

# Read the contents of that folder into Document objects.
reader = SimpleDirectoryReader(input_dir=directory_path)
documents = reader.load_data()
print(f"Found {len(documents)} documents")

## Document Processing and Node Creation

After loading our documents, we need to break them into smaller pieces called Node objects. A Node is just a chunk of text from the original document that's easier for the AI to work with, while it still has references to the original Document object.

The IngestionPipeline helps us create these nodes through two key transformations:

1. **SentenceSplitter**: Breaks down documents into manageable chunks by splitting them at natural sentence boundaries.
2. **HuggingFaceEmbedding**: Converts each chunk into numerical embeddings - vector representations that capture the semantic meaning in a way AI can process efficiently.

This process helps us organise our documents in a way that's more useful for searching and analysis.

In [33]:
from llama_index.core import Document
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.ingestion import IngestionPipeline

chunk_size = 1000
chunk_overlap = 0
# embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
embedding_model = "BAAI/bge-small-en-v1.5"

# Create a pipeline with transformations
pipeline = IngestionPipeline(
    transformations=[
        SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap),
        HuggingFaceEmbedding(model_name=embedding_model)
    ]
)

# Apply the pipeline to our documents
nodes = await pipeline.arun(documents=[Document.example()])

## Storing and indexing documents
After creating our Node objects we need to index them to make them searchable, but before we can do that, we need a place to store our data.

Since we are using an ingestion pipeline, we can directly attach a vector store to the pipeline to populate it. In this case, we will use Chroma to store our documents.

In [34]:
# %pip install llama-index-vector-stores-chroma

In [35]:
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore
from load_dotenv import load_dotenv
import os

load_dotenv(override=True)

# Validate the storage location the same way the other cells validate their
# environment variables, so a missing CHROMA_DB_PATH fails here with a clear
# message instead of `None` being handed to the Chroma client.
chroma_db_path = os.environ.get("CHROMA_DB_PATH")
if not chroma_db_path:
    raise ValueError("CHROMA_DB_PATH not found in environment variables. Please add it to your .env file")

# Initialize ChromaDB: persistent client -> collection -> LlamaIndex vector store
chroma_collection_name = "rag_collection"
db = chromadb.PersistentClient(path=chroma_db_path)
chroma_collection = db.get_or_create_collection(name=chroma_collection_name)
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

# Define the pipeline (same chunking and embedding settings as the previous
# pipeline cell, now with the Chroma vector store attached).
chunk_size = 1000
chunk_overlap = 0
embedding_model = "BAAI/bge-small-en-v1.5"

# NOTE(review): this cell only constructs the pipeline; no visible cell runs it
# on the loaded documents, so the collection presumably only contains whatever
# was persisted earlier - confirm.
pipeline = IngestionPipeline(
    transformations=[
        SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap),
        HuggingFaceEmbedding(model_name=embedding_model)
    ],
    vector_store=vector_store,
)

This is where vector embeddings come in - by embedding both the query and nodes in the same vector space, we can find relevant matches. The VectorStoreIndex handles this for us, using the same embedding model we used during ingestion to ensure consistency.

Let’s see how to create this index from our vector store and embeddings:

In [None]:
from llama_index.core import VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from load_dotenv import load_dotenv
import os

load_dotenv(override=True)

# The embedding model id comes from EMBEDDING_MODEL; abort when it is not set.
embedding_model_name = os.environ.get("EMBEDDING_MODEL")
if not embedding_model_name:
    raise ValueError("Embedding model name not found in environment variables. Please add it to your .env file")
print(f"Embedding model name: {embedding_model_name}")

# Build the index on top of the existing vector store, using this embedding model.
embedding_model = HuggingFaceEmbedding(model_name=embedding_model_name)
index = VectorStoreIndex.from_vector_store(
    embed_model=embedding_model,
    vector_store=vector_store,
)

# # Querying the index
# query_engine = index.as_query_engine()
# response = query_engine.query("What is the main idea of the document?")
# print(response)


In [None]:
# Lets make a query to the index
from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI
from load_dotenv import load_dotenv
import os

load_dotenv(override=True)

# The hosted model id comes from HUGGINGFACE_MODEL; abort when it is not set.
huggingface_model_name = os.environ.get("HUGGINGFACE_MODEL")
if not huggingface_model_name:
    raise ValueError("Huggingface model name not found in environment variables. Please add it to your .env file")
print(f"Huggingface model name: {huggingface_model_name}")

huggingface_llm = HuggingFaceInferenceAPI(model_name=huggingface_model_name)

# using LM Studio
import lmstudio as lms

# Created here, but only referenced by the commented-out option below.
lmstudio_llm = lms.llm()

# Query engine over the index; the Hugging Face model is the active LLM.
query_engine = index.as_query_engine(
    response_mode="tree_summarize",
    llm=huggingface_llm,
    # llm=lmstudio_llm,
)

response = query_engine.query("What is the meaning of life?")
print(response)

In [None]:
from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI

# Alternative model id: "Qwen/Qwen2.5-Coder-32B-Instruct"
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
llm = HuggingFaceInferenceAPI(model_name=model_name)

# Query engine over the existing index, using the "tree_summarize" response mode.
query_engine = index.as_query_engine(response_mode="tree_summarize", llm=llm)

# Bare last expression so the notebook displays the response.
query_engine.query("What is the meaning of life?")
# The meaning of life is 42

In [None]:
from llama_index.core.evaluation import FaithfulnessEvaluator
from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI
import lmstudio as lms

# llm = HuggingFaceInferenceAPI(model_name="Qwen/Qwen2.5-Coder-32B-Instruct")
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
llm = HuggingFaceInferenceAPI(model_name=model_name)
# llm = lms.llm()

query_engine = index.as_query_engine(
    llm=llm,
    response_mode="tree_summarize",
)

# query index
evaluator = FaithfulnessEvaluator(llm=llm)

response = query_engine.query("When was the american civil war?")

eval_result = evaluator.evaluate_response(response=response)
eval_result.passing

# evaluate response
evaluator.evaluate(response, query_engine.query_node)


In [None]:
import nest_asyncio

# Apply the nest_asyncio patch before the synchronous evaluation call below.
nest_asyncio.apply()

# Now we can run the evaluation
evaluator = FaithfulnessEvaluator(llm=llm)
response = query_engine.query("When was the american civil war?")
eval_result = evaluator.evaluate_response(response=response)

# One print call, one line per field of the evaluation result.
print(
    f"Evaluation passed: {eval_result.passing}",
    f"Evaluation score: {eval_result.score}",
    f"Evaluation feedback: {eval_result.feedback}",
    sep="\n",
)

In [None]:
import os

# Import the subpackage explicitly: `import llama_index` on its own does not
# guarantee that `llama_index.core` is loaded, so the attribute access below
# only worked when an earlier cell had already imported it.
import llama_index.core

# Read the Phoenix API key from the environment, like the other secrets in this
# notebook, instead of keeping a placeholder string in the code.
PHOENIX_API_KEY = os.environ.get("PHOENIX_API_KEY")
if not PHOENIX_API_KEY:
    raise ValueError("PHOENIX_API_KEY not found in environment variables. Please add it to your .env file")

# The key is passed to the exporter through the OTLP headers variable.
os.environ["OTEL_EXPORTER_OTLP_HEADERS"] = f"api_key={PHOENIX_API_KEY}"

# Register the "arize_phoenix" handler globally, sending traces to this endpoint.
llama_index.core.set_global_handler(
    "arize_phoenix",
    endpoint="https://llamatrace.com/v1/traces",
)

In [42]:
# Quietly install/upgrade (-U -q) the LlamaIndex packages listed here; the cell
# output notes that the kernel may need a restart to pick up updated packages.
%pip install llama-index datasets llama-index-callbacks-arize-phoenix llama-index-vector-stores-chroma llama-index-llms-huggingface-api -U -q

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [43]:
# Login to Hugging Face
import huggingface_hub
from load_dotenv import load_dotenv
import os

# Refresh the environment from .env, then log in with the stored token.
load_dotenv(override=True)
huggingface_hub.login(token=os.getenv("HF_TOKEN"))

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [45]:
from llama_index.core.tools import FunctionTool


def get_weather(location: str) -> str:
    """Useful for getting the weather for a given location."""
    # Docstring kept verbatim. NOTE(review): tool wrappers may read it as the
    # tool description - confirm before rewording.
    # Logs the lookup, then returns a fixed "sunny" report for the location.
    print(f"Getting weather for {location}")
    report = f"The weather in {location} is sunny"
    return report

def calculate_sum(a: int, b: int) -> int:
    """Useful for calculating the sum of two numbers."""
    # Docstring kept verbatim. NOTE(review): tool wrappers may read it as the
    # tool description - confirm before rewording.
    # Logs both operands, then returns their sum.
    print(f"Calculating sum of {a} and {b}")
    total = a + b
    return total

# Wrap the weather function as a tool with an explicit name and description.
tool = FunctionTool.from_defaults(
    fn=get_weather,
    description="Useful for getting the weather for a given location.",
    name="my_weather_tool",
)

# Wrap the sum function the same way.
tool2 = FunctionTool.from_defaults(
    fn=calculate_sum,
    description="Useful for calculating the sum of two numbers.",
    name="my_sum_tool",
)

# Call the weather tool directly; as the last expression, its output is displayed.
tool.call("New York")
# tool2.call(3,5)

Getting weather for New York


ToolOutput(content='The weather in New York is sunny', tool_name='my_weather_tool', raw_input={'args': ('New York',), 'kwargs': {}}, raw_output='The weather in New York is sunny', is_error=False)

In [46]:
import chromadb

from llama_index.core import VectorStoreIndex
from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI
from llama_index.embeddings.huggingface_api import HuggingFaceInferenceAPIEmbedding
from llama_index.core.tools import QueryEngineTool
from llama_index.vector_stores.chroma import ChromaVectorStore

db = chromadb.PersistentClient(path="./alfred_chroma_db")
chroma_collection = db.get_or_create_collection("alfred")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
embed_model = HuggingFaceInferenceAPIEmbedding(model_name="BAAI/bge-small-en-v1.5")
llm = HuggingFaceInferenceAPI(model_name="meta-llama/Llama-3.2-3B-Instruct")
index = VectorStoreIndex.from_vector_store(
    vector_store=vector_store, embed_model=embed_model
)
query_engine = index.as_query_engine(llm=llm)
tool = QueryEngineTool.from_defaults(
    query_engine=query_engine,
    name="some useful name",
    description="some useful description",
)
await tool.acall(
    "Responds about research on the impact of AI on the future of work and society?"
)

ModuleNotFoundError: No module named 'llama_index.embeddings.huggingface_api'