In [1]:
from dotenv import load_dotenv

load_dotenv()


True

In [2]:
from llama_index.core import SimpleDirectoryReader
import os

# load documents
documents = SimpleDirectoryReader('./paul_graham/').load_data()
len(documents)


1

In [3]:
from llama_index.core.node_parser import SentenceSplitter


# Initialize the sentence splitter with desired parameters
node_parser = SentenceSplitter(chunk_size=512, chunk_overlap=64)

# Assuming 'documents' is a list of Document objects
nodes = node_parser.get_nodes_from_documents(documents)

In [3]:
from llama_index.vector_stores.deeplake import DeepLakeVectorStore

myid = "charanvardhan"
dataset_path = f"hub://{myid}/LlamaIndex_paulgraham_essay"



In [5]:
from llama_index.core import StorageContext

storage_context = StorageContext.from_defaults(vector_store=vector_store)
storage_context.docstore.add_documents(nodes)


In [6]:
all_node_ids = list(storage_context.docstore.docs.keys())
print(f"Total nodes: {len(all_node_ids)}")
print("First 5 node IDs:", all_node_ids[:5])

Total nodes: 42
First 5 node IDs: ['48482c90-2850-4991-ad8a-a2fdf672be33', '8d518718-1bb3-477e-abb7-f3889b560609', '2166b23f-22dc-4c61-b279-5219b071c897', '0d823376-f78b-42c7-bb68-98effeb78a05', '885146f6-3496-4b75-9e88-d4b2c48d59e5']


## FaithfulnessEvaluator

In [4]:
from llama_index.llms.openai import OpenAI
from llama_index.core import Settings


llm = OpenAI(
    temperature=0,
    model="gpt-3.5-turbo",
    max_tokens=512,
    streaming=True,
)
# Use Settings instead of ServiceContext
Settings.llm = llm


In [6]:
from llama_index.vector_stores.deeplake import DeepLakeVectorStore
from llama_index.core import StorageContext
from llama_index.core import VectorStoreIndex

vector_store = DeepLakeVectorStore(dataset_path=dataset_path, overwrite=False, read_only=True)


Deep Lake Dataset in hub://charanvardhan/LlamaIndex_paulgraham_essay already exists, loading from the storage


In [9]:
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [10]:
index = VectorStoreIndex.from_vector_store(vector_store, storage_context=storage_context)

In [11]:
from llama_index.core.evaluation import FaithfulnessEvaluator

evaluator = FaithfulnessEvaluator()

In [12]:
query_engine = index.as_query_engine()
print(type(query_engine))
response = query_engine.query("What does Paul Graham do?")

eval_result = evaluator.evaluate_response(response=response)

<class 'llama_index.core.query_engine.retriever_query_engine.RetrieverQueryEngine'>


In [13]:
print( "> response:", response )

print( "> evaluator result:", eval_result.passing )

> response: Paul Graham organizes a Summer Founders Program for undergraduates, where he funds selected groups of students to work on startup projects instead of taking traditional summer jobs. He also gives talks and writes essays to share his insights and advice on starting a startup. Additionally, he is involved in investing in startups and providing guidance to young founders.
> evaluator result: True


# RAGAS

In [None]:
from llama_index.core import download_loader

# Dynamically load the SimpleWebPageReader class
SimpleWebPageReader = download_loader("SimpleWebPageReader")

# Instantiate it (html_to_text=True strips out tags)
reader = SimpleWebPageReader(html_to_text=True)

# Load pages
docs = reader.load_data(
    ["https://en.wikipedia.org/wiki/New_York_City"]
)

print(f"Loaded {len(docs)} documents; first 200 chars:\n")
print(docs[0].get_content()[:200], "…")


In [19]:
from llama_index.core import VectorStoreIndex
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter

# docs = 
vector_index = VectorStoreIndex.from_documents(docs, service_context=SentenceSplitter(chunk_size=512))
                                               

In [20]:
query_engine = vector_index.as_query_engine()

response_vector = query_engine.query("How did New York City get its name?")

print(response_vector)

New York City was named after King Charles II of England granted the lands to his brother, the Duke of York, in 1664. It was temporarily renamed New York before being permanently renamed in November 1674.


In [21]:
eval_questions = [
    "What is the population of New York City as of 2020?",
    "Which borough of New York City has the highest population?",
    "What is the economic significance of New York City?",
    "How did New York City get its name?",
    "What is the significance of the Statue of Liberty in New York City?",
]

eval_answers = [
    "8,804,000",  # incorrect answer
    "Queens",  # incorrect answer
    "New York City's economic significance is vast, as it serves as the global financial capital, housing Wall Street and major financial institutions. Its diverse economy spans technology, media, healthcare, education, and more, making it resilient to economic fluctuations. NYC is a hub for international business, attracting global companies, and boasts a large, skilled labor force. Its real estate market, tourism, cultural industries, and educational institutions further fuel its economic prowess. The city's transportation network and global influence amplify its impact on the world stage, solidifying its status as a vital economic player and cultural epicenter.",
    "New York City got its name when it came under British control in 1664. King Charles II of England granted the lands to his brother, the Duke of York, who named the city New York in his own honor.",
    "The Statue of Liberty in New York City holds great significance as a symbol of the United States and its ideals of liberty and peace. It greeted millions of immigrants who arrived in the U.S. by ship in the late 19th and early 20th centuries, representing hope and freedom for those seeking a better life. It has since become an iconic landmark and a global symbol of cultural diversity and freedom.",
]

eval_answers = [[a] for a in eval_answers]

In [28]:
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
    # harmfulness,
)
# from ragas.metrics.critique import harmfulness

metrics = [
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
    # harmfulness,
]

In [26]:
import ragas.metrics as m
print(dir(m))

['AgentGoalAccuracyWithReference', 'AgentGoalAccuracyWithoutReference', 'AnswerAccuracy', 'AnswerCorrectness', 'AnswerRelevancy', 'AnswerSimilarity', 'AspectCritic', 'BleuScore', 'ContextEntityRecall', 'ContextPrecision', 'ContextRecall', 'ContextRelevance', 'ContextUtilization', 'DataCompyScore', 'DistanceMeasure', 'ExactMatch', 'FactualCorrectness', 'Faithfulness', 'FaithfulnesswithHHEM', 'InstanceRubrics', 'LLMContextPrecisionWithReference', 'LLMContextPrecisionWithoutReference', 'LLMContextRecall', 'LLMSQLEquivalence', 'Metric', 'MetricOutputType', 'MetricType', 'MetricWithEmbeddings', 'MetricWithLLM', 'MultiModalFaithfulness', 'MultiModalRelevance', 'MultiTurnMetric', 'NoiseSensitivity', 'NonLLMContextPrecisionWithReference', 'NonLLMContextRecall', 'NonLLMStringSimilarity', 'ResponseGroundedness', 'ResponseRelevancy', 'RougeScore', 'RubricsScore', 'SemanticSimilarity', 'SimpleCriteriaScore', 'SingleTurnMetric', 'StringPresence', 'SummarizationScore', 'ToolCallAccuracy', 'TopicAdhe

In [None]:
# from ragas.llama_index.evaluation import evaluate

# result = evaluate(query_engine, metrics, eval_questions, eval_answers)

# Custom RAG Pipeline

In [34]:
!wget 'https://raw.githubusercontent.com/idontcalculate/data-repo/main/venus_transmission.txt'

--2025-04-30 19:20:55--  https://raw.githubusercontent.com/idontcalculate/data-repo/main/venus_transmission.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 19241 (19K) [text/plain]
Saving to: ‘venus_transmission.txt’


2025-04-30 19:20:55 (7.28 MB/s) - ‘venus_transmission.txt’ saved [19241/19241]



In [36]:
from llama_index.core import SimpleDirectoryReader

reader = SimpleDirectoryReader(input_files=["venus_transmission.txt"])

docs = reader.load_data()
print(f"Loaded {len(docs)} docs")

Loaded 1 docs


In [38]:
from llama_index.core.node_parser import SentenceSplitter


# Initialize the sentence splitter with desired parameters
node_parser = SentenceSplitter(chunk_size=512)

# Assuming 'documents' is a list of Document objects
nodes = node_parser.get_nodes_from_documents(documents)

In [39]:
vector_index = VectorStoreIndex(nodes)

In [40]:
query_engine = vector_index.as_query_engine()

response_vector = query_engine.query("What was The first beings to inhabit the planet?")

In [41]:
response_vector.response

'Aliens.'

In [42]:
response_vector.source_nodes[0].get_text()

"I remember vividly how impressed and envious I felt watching him sitting in front of it, typing programs right into the computer.\n\nComputers were expensive in those days and it took me years of nagging before I convinced my father to buy one, a TRS-80, in about 1980. The gold standard then was the Apple II, but a TRS-80 was good enough. This was when I really started programming. I wrote simple games, a program to predict how high my model rockets would fly, and a word processor that my father used to write at least one book. There was only room in memory for about 2 pages of text, so he'd write 2 pages at a time and then print them out, but it was a lot better than a typewriter.\n\nThough I liked programming, I didn't plan to study it in college. In college I was going to study philosophy, which sounded much more powerful. It seemed, to my naive high school self, to be the study of the ultimate truths, compared to which the things studied in other fields would be mere domain knowle

In [43]:
# Second retrieved node
response_vector.source_nodes[1].get_text()

"Individually these two phenomena are tedious but bearable, but the combination is disastrous. You actually have to respond to the misinterpretations, because the assumption that you're present in the conversation means that not responding to any sufficiently upvoted misinterpretation reads as a tacit admission that it's correct. But that in turn encourages more; anyone who wants to pick a fight with you senses that now is their chance.\n\n[18] The worst thing about leaving YC was not working with Jessica anymore. We'd been working on YC almost the whole time we'd known each other, and we'd neither tried nor wanted to separate it from our personal lives, so leaving was like pulling up a deeply rooted tree.\n\n[19] One way to get more precise about the concept of invented vs discovered is to talk about space aliens. Any sufficiently advanced alien civilization would certainly know about the Pythagorean theorem, for example. I believe, though with less certainty, that they would also kno

In [46]:
from llama_index.llms.openai import OpenAI
from llama_index.core import Settings
from llama_index.core.evaluation import generate_question_context_pairs



# llm = OpenAI(
#     temperature=0,
#     model="gpt-3.5-turbo",
# )
# # Use Settings instead of ServiceContext
# Settings.llm = llm

qa_dataset = generate_question_context_pairs(
    nodes,
    llm=llm,
    num_questions_per_chunk=2
)

queries = list(qa_dataset.queries.values())
print( queries )

100%|██████████| 61/61 [00:51<00:00,  1.20it/s]

["Describe the author's experience working with the IBM 1401 in 9th grade, including the challenges they faced and the limitations of the technology at the time.", "How did the author's experience with programming on the IBM 1401 differ from their later experience with microcomputers? Discuss the impact of this transition on their work and understanding of programming.", "How did the introduction of microcomputers change the author's experience with programming compared to using the 1401 computer?", "Describe the author's journey with programming, from their initial struggles with the 1401 computer to their eventual success with writing programs on a TRS-80.", "How did the author's experience with programming on a TRS-80 computer in the 1980s influence their decision to switch from studying philosophy to AI in college?", 'Discuss the influence of Heinlein\'s novel "The Moon is a Harsh Mistress" and the PBS documentary featuring SHRDLU on the author\'s interest in working on AI in the m




In [47]:
retriever = vector_index.as_retriever(similarity_top_k=2)

In [49]:
from llama_index.core.evaluation import RetrieverEvaluator

retriever_evaluator = RetrieverEvaluator.from_metric_names(
    ["mrr", "hit_rate"], retriever=retriever
)

In [50]:
eval_results = await retriever_evaluator.aevaluate_dataset(qa_dataset)

In [51]:
def display_results(name, eval_results):
    """Display results from evaluate."""

    metric_dicts = []
    for eval_result in eval_results:
        metric_dict = eval_result.metric_vals_dict
        metric_dicts.append(metric_dict)

    full_df = pd.DataFrame(metric_dicts)

    hit_rate = full_df["hit_rate"].mean()
    mrr = full_df["mrr"].mean()

    metric_df = pd.DataFrame(
        {"Retriever Name": [name], "Hit Rate": [hit_rate], "MRR": [mrr]}
    )

    return metric_df

In [52]:
import pandas as pd

display_results("OpenAI Embedding Retriever", eval_results)

Unnamed: 0,Retriever Name,Hit Rate,MRR
0,OpenAI Embedding Retriever,0.770492,0.643443


In [55]:
from llama_index.core.settings import Settings

# gpt-3.5-turbo
gpt35 = OpenAI(temperature=0, model="gpt-3.5-turbo")
Settings.llm = gpt35

# gpt-4
gpt4 = OpenAI(temperature=0, model="gpt-4-turbo")
Settings.llm = gpt4

In [56]:
vector_index = VectorStoreIndex(nodes, service_context = gpt35)
query_engine = vector_index.as_query_engine()

In [57]:
eval_query = queries[10]

response_vector = query_engine.query(eval_query)

In [58]:
print( "> eval_query: ", eval_query )
print( "> response_vector:", response_vector )

> eval_query:  Explain the author's realization about AI during their first year of grad school. How did this realization shape their focus on Lisp and their decision to write a book about Lisp hacking?
> response_vector: During the author's first year of graduate school, they realized that the prevailing approach to artificial intelligence (AI) at the time was fundamentally flawed. The AI systems they encountered were limited to transforming natural language into formal representations and adding these to a database of knowledge. This approach demonstrated that only a small subset of natural language could be treated as a formal language, revealing a significant gap between the capabilities of these AI systems and true understanding of natural language.

This realization led the author to reassess their academic and career focus. Disillusioned with the traditional path of AI research, which they saw as unlikely to bridge the gap in understanding natural language, the author turned the

In [60]:
from llama_index.core.evaluation import RelevancyEvaluator
from llama_index.core.evaluation import FaithfulnessEvaluator

relevancy_gpt4 = RelevancyEvaluator(gpt4)
faithfulness_gpt4 = FaithfulnessEvaluator(gpt4)

In [61]:
# Compute faithfulness evaluation

eval_result = faithfulness_gpt4.evaluate_response(response=response_vector)
# check passing parameter in eval_result if it passed the evaluation.
print( eval_result.passing )

# Relevancy evaluation
eval_result = relevancy_gpt4.evaluate_response(
    query=eval_query, response=response_vector
)
# You can check passing parameter in eval_result if it passed the evaluation.
print( eval_result.passing )

True
True


In [63]:
#Batch Evaluator:
#BatchEvalRunner to compute multiple evaluations in batch wise manner.

from llama_index.core.evaluation import BatchEvalRunner

# Let's pick top 10 queries to do evaluation
batch_eval_queries = queries[:10]

# Initiate BatchEvalRunner to compute FaithFulness and Relevancy Evaluation.
runner = BatchEvalRunner(
    {"faithfulness": faithfulness_gpt4, "relevancy": relevancy_gpt4},
    workers=8,
)

# Compute evaluation
eval_results = await runner.aevaluate_queries(
    query_engine, queries=batch_eval_queries
)
# Let's get faithfulness score
faithfulness_score = sum(result.passing for result in eval_results['faithfulness']) / len(eval_results['faithfulness'])
# get relevancy score
relevancy_score = sum(result.passing for result in eval_results['faithfulness']) / len(eval_results['relevancy'])

print( "> faithfulness_score", faithfulness_score )
print( "> relevancy_score", relevancy_score )

> faithfulness_score 1.0
> relevancy_score 1.0
