In [3]:
# ============================================
# STOP FIGHTING - USE WHAT COLAB HAS
# ============================================

# First, check what's already installed
!pip list | grep llama

# Uninstall the problematic packages ONLY
!pip uninstall -y llama-index-indices-managed-llama-cloud llama-cloud-services

# Install ONLY what we need on top of what's there
!pip install -q deeplake ragas==0.0.22 html2text==2020.1.16 nest-asyncio

# DON'T restart - just run this:

llama-cloud                              0.1.19
llama-index                              0.9.14.post3
llama-index-cli                          0.1.13
llama-index-instrumentation              0.4.2
llama-index-multi-modal-llms-openai      0.1.9
llama-index-readers-llama-parse          0.1.6
llama-index-workflows                    2.11.5
llama-parse                              0.4.9
llamaindex-py-client                     0.1.19
[0m

In [1]:
# ============================================
# FINAL FIX - Update OpenAI to work with httpx
# ============================================

!pip install --upgrade openai

# NO RESTART NEEDED - Just re-run the imports:



In [2]:
import os
import urllib.request
import pandas as pd
import nest_asyncio
asyncio = __import__('asyncio')

nest_asyncio.apply()

# Set API key
os.environ["OPENAI_API_KEY"] = "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"

# Try modern imports first
try:
    from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, ServiceContext, StorageContext, Settings
    from llama_index.vector_stores.deeplake import DeepLakeVectorStore
    from llama_index.llms.openai import OpenAI
    from llama_index.embeddings.openai import OpenAIEmbedding
    from llama_index.core.node_parser import SimpleNodeParser
    from llama_index.core.evaluation import FaithfulnessEvaluator, RelevancyEvaluator
    from llama_index.readers.web import SimpleWebPageReader
    print("‚úÖ Using MODERN llama-index API")
    MODERN_API = True
except ImportError:
    # Fall back to legacy imports
    from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext, StorageContext
    from llama_index.vector_stores import DeepLakeVectorStore
    from llama_index.llms import OpenAI
    from llama_index.node_parser import SimpleNodeParser
    from llama_index.evaluation import FaithfulnessEvaluator, RelevancyEvaluator
    from llama_index.readers.web import SimpleWebPageReader
    print("‚úÖ Using LEGACY llama-index API")
    MODERN_API = False

# Now setup LLM
llm = OpenAI(model="gpt-4", temperature=0.0)

if MODERN_API:
    # Modern API uses Settings
    Settings.llm = llm
    Settings.embed_model = OpenAIEmbedding()
    service_context = None
else:
    # Legacy API uses ServiceContext
    service_context = ServiceContext.from_defaults(llm=llm)

print("‚úÖ LLM initialized")

# Setup DeepLake - LOCAL storage (NO TOKEN!)
vector_store = DeepLakeVectorStore(
    dataset_path="./my_deeplake_db/",
    overwrite=False
)

if MODERN_API:
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
else:
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

print("‚úÖ Vector store ready")

# Download sample data
print("üì• Downloading sample data...")
url = 'https://raw.githubusercontent.com/idontcalculate/data-repo/main/venus_transmission.txt'
urllib.request.urlretrieve(url, 'venus_transmission.txt')

# Load documents
reader = SimpleDirectoryReader(input_files=["venus_transmission.txt"])
docs = reader.load_data()
print(f"‚úÖ Loaded {len(docs)} document(s)")

# Parse into nodes
if MODERN_API:
    Settings.chunk_size = 512
    node_parser = SimpleNodeParser()
else:
    node_parser = SimpleNodeParser.from_defaults(chunk_size=512)

nodes = node_parser.get_nodes_from_documents(docs)
print(f"‚úÖ Created {len(nodes)} chunks")

# Create index
if MODERN_API:
    index = VectorStoreIndex(nodes, storage_context=storage_context)
else:
    index = VectorStoreIndex(nodes, storage_context=storage_context, service_context=service_context)

print("‚úÖ Index created!")

# Query it
query_engine = index.as_query_engine()
response = query_engine.query("What were the first beings to inhabit the planet?")

print("\n" + "="*60)
print("TEST QUERY RESULT:")
print("="*60)
print(response.response)
print("="*60)

# Evaluate
if MODERN_API:
    evaluator = FaithfulnessEvaluator(llm=llm)
else:
    evaluator = FaithfulnessEvaluator(service_context=service_context)

eval_result = evaluator.evaluate_response(response=response)
print(f"\n‚úÖ Faithfulness: {'PASS' if eval_result.passing else 'FAIL'}")

print("\nüéâ SUCCESS! The code works with your installed version!")

  import pkg_resources


‚úÖ Using LEGACY llama-index API
‚úÖ LLM initialized
Deep Lake Dataset in ./my_deeplake_db/ already exists, loading from the storage
‚úÖ Vector store ready
üì• Downloading sample data...
‚úÖ Loaded 1 document(s)
‚úÖ Created 13 chunks
Uploading data to deeplake dataset.


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 13/13 [00:00<00:00, 253.60it/s]

Dataset(path='./my_deeplake_db/', tensors=['embedding', 'id', 'metadata', 'text'])

  tensor      htype      shape      dtype  compression
  -------    -------    -------    -------  ------- 
 embedding  embedding  (13, 1536)  float32   None   
    id        text      (13, 1)      str     None   
 metadata     json      (13, 1)      str     None   
   text       text      (13, 1)      str     None   
‚úÖ Index created!





ModuleNotFoundError: No module named 'llama_index.core.base'

In [3]:
!pip uninstall -y llama-index-multi-modal-llms-openai


Found existing installation: llama-index-multi-modal-llms-openai 0.1.9
Uninstalling llama-index-multi-modal-llms-openai-0.1.9:
  Successfully uninstalled llama-index-multi-modal-llms-openai-0.1.9


In [4]:
# Re-import to clear the cache
from importlib import reload
import llama_index
reload(llama_index)

from llama_index.llms import OpenAI as LegacyOpenAI

# Use the legacy OpenAI LLM directly
llm = LegacyOpenAI(model="gpt-4", temperature=0.0)

# Recreate query engine with explicit LLM
query_engine = index.as_query_engine(llm=llm)

# Now query
response = query_engine.query("What were the first beings to inhabit the planet?")

print("\n" + "="*60)
print(response.response)
print("="*60)


The first beings to inhabit the planet were a dinoid and reptoid race from two different systems outside our solar system.


In [5]:
test_query = "What were the first beings to inhabit the planet?"
response = query_engine.query(test_query)

print("\n" + "="*60)
print("TEST QUERY")
print("="*60)
print(f"Q: {test_query}")
print(f"A: {response.response}")
print("="*60)

# Faithfulness evaluation
evaluator = FaithfulnessEvaluator(service_context=service_context)
eval_result = evaluator.evaluate_response(response=response)

print(f"\nüìä Faithfulness: {'‚úÖ PASS' if eval_result.passing else '‚ùå FAIL'}")


TEST QUERY
Q: What were the first beings to inhabit the planet?
A: The first beings to inhabit the planet were a dinoid and reptoid race from two different systems outside our solar system.

üìä Faithfulness: ‚úÖ PASS


In [8]:
# Add this import
from llama_index.evaluation import generate_question_context_pairs

# Then run the evaluation code
print("\nüîÑ Generating evaluation questions...")
llm_eval = OpenAI(model="gpt-3.5-turbo")

qa_dataset = generate_question_context_pairs(
    nodes,
    llm=llm_eval,
    num_questions_per_chunk=2
)

queries = list(qa_dataset.queries.values())
print(f"‚úÖ Generated {len(queries)} evaluation questions")
print(f"\nSample questions:")
for i, q in enumerate(queries[:3]):
    print(f"  {i+1}. {q}")


üîÑ Generating evaluation questions...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 13/13 [00:14<00:00,  1.15s/it]

‚úÖ Generated 26 evaluation questions

Sample questions:
  1. How did the different races colonize the planets in our solar system according to the information presented in the text?
  2. What were some of the key characteristics and abilities of the beings who came to this solar system, as described in the meeting recounted in the text?
  3. Describe the beliefs and characteristics of the four races of beings who came to Earth to bring human life and protect all life on the planets in the solar system.





In [12]:
from llama_index.evaluation import RetrieverEvaluator

print("\nüîç Evaluating retriever...")
retriever = index.as_retriever(similarity_top_k=2)

retriever_evaluator = RetrieverEvaluator.from_metric_names(
    ["mrr", "hit_rate"], retriever=retriever
)

# Run evaluation
async def evaluate_retriever():
    return await retriever_evaluator.aevaluate_dataset(qa_dataset)

eval_results = asyncio.run(evaluate_retriever())

# Calculate metrics
metric_dicts = [eval_result.metric_vals_dict for eval_result in eval_results]
full_df = pd.DataFrame(metric_dicts)

hit_rate = full_df["hit_rate"].mean()
mrr = full_df["mrr"].mean()

print(f"\nüìä RETRIEVER METRICS:")
print(f"  Hit Rate: {hit_rate:.2%}")
print(f"  MRR: {mrr:.4f}")


üîç Evaluating retriever...

üìä RETRIEVER METRICS:
  Hit Rate: 88.46%
  MRR: 0.7115


In [1]:
# Install compatible langchain version for ragas 0.0.22
!pip install langchain==0.0.350 langchain-community langchain-core

# Then re-run the RAGAS code



In [1]:
!pip install --upgrade langchain-core langchain-openai langchain




In [2]:
import os

# SET YOUR API KEY
os.environ["OPENAI_API_KEY"] = "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"

# Import everything needed
from llama_index import VectorStoreIndex, ServiceContext
from llama_index.llms import OpenAI
from llama_index.readers.web import SimpleWebPageReader

llm = OpenAI(model="gpt-4", temperature=0.0)
print("‚úÖ LLM initialized")

# RAGAS evaluation
print("\nüåê Running RAGAS evaluation on web data...")

documents = SimpleWebPageReader(html_to_text=True).load_data(
    ["https://en.wikipedia.org/wiki/New_York_City"]
)

service_context_web = ServiceContext.from_defaults(llm=llm, chunk_size=512)
vector_index_web = VectorStoreIndex.from_documents(
    documents,
    service_context=service_context_web
)

query_engine_web = vector_index_web.as_query_engine()

test_response = query_engine_web.query("How did New York City get its name?")
print(f"‚úÖ Test: {test_response.response[:150]}...")

# Evaluation
eval_questions = [
    "What is the population of New York City as of 2020?",
    "Which borough of New York City has the highest population?",
    "How did New York City get its name?",
]

eval_answers = [
    ["8,804,000"],
    ["Queens"],
    ["New York City got its name when it came under British control in 1664."],
]

from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall

metrics = [faithfulness, answer_relevancy, context_precision, context_recall]

from ragas.integrations.llama_index import evaluate

result = evaluate(query_engine_web, metrics, eval_questions, eval_answers)

print("\nüìä RAGAS RESULTS:")
for key, value in result.items():
    if isinstance(value, float):
        print(f"  {key}: {value:.2%}")

print("\n‚úÖ RAGAS complete!")

  import pkg_resources


‚úÖ LLM initialized

üåê Running RAGAS evaluation on web data...
‚úÖ Test: The context provided does not contain information on how New York City got its name....


ModuleNotFoundError: No module named 'langchain_core.pydantic_v1'

In [3]:
import os

os.environ["OPENAI_API_KEY"] = "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"

from llama_index import VectorStoreIndex, ServiceContext
from llama_index.llms import OpenAI
from llama_index.readers.web import SimpleWebPageReader
from llama_index.evaluation import FaithfulnessEvaluator, RelevancyEvaluator

llm = OpenAI(model="gpt-4", temperature=0.0)

# Load web data
documents = SimpleWebPageReader(html_to_text=True).load_data(
    ["https://en.wikipedia.org/wiki/New_York_City"]
)

service_context_web = ServiceContext.from_defaults(llm=llm, chunk_size=512)
vector_index_web = VectorStoreIndex.from_documents(
    documents,
    service_context=service_context_web
)

query_engine_web = vector_index_web.as_query_engine()

# Test queries
questions = [
    "What is the population of New York City?",
    "How did New York City get its name?",
    "Which borough has the highest population?",
]

print("üìä EVALUATION RESULTS:\n")

for q in questions:
    response = query_engine_web.query(q)

    # Evaluate
    faith_eval = FaithfulnessEvaluator(service_context=service_context_web)
    relev_eval = RelevancyEvaluator(service_context=service_context_web)

    faith_result = faith_eval.evaluate_response(response=response)
    relev_result = relev_eval.evaluate_response(query=q, response=response)

    print(f"Q: {q}")
    print(f"A: {response.response[:150]}...")
    print(f"Faithful: {'‚úÖ' if faith_result.passing else '‚ùå'} | Relevant: {'‚úÖ' if relev_result.passing else '‚ùå'}")
    print("-" * 80)

print("\nüéâ Evaluation complete!")

üìä EVALUATION RESULTS:

Q: What is the population of New York City?
A: The context information provided does not include the population of New York City....
Faithful: ‚úÖ | Relevant: ‚ùå
--------------------------------------------------------------------------------
Q: How did New York City get its name?
A: The context provided does not contain information on how New York City got its name....
Faithful: ‚úÖ | Relevant: ‚úÖ
--------------------------------------------------------------------------------
Q: Which borough has the highest population?
A: The context provided does not contain information about the population of any boroughs....
Faithful: ‚úÖ | Relevant: ‚ùå
--------------------------------------------------------------------------------

üéâ Evaluation complete!


In [5]:
import os
import pandas as pd

os.environ["OPENAI_API_KEY"] = "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"


from llama_index import VectorStoreIndex, ServiceContext
from llama_index.llms import OpenAI
from llama_index.readers.web import SimpleWebPageReader
from llama_index.evaluation import (
    FaithfulnessEvaluator,  # Similar to RAGAS faithfulness
    RelevancyEvaluator,     # Similar to RAGAS answer_relevancy
    RetrieverEvaluator      # For context_precision and context_recall
)

llm = OpenAI(model="gpt-4", temperature=0.0)

# Load web data
documents = SimpleWebPageReader(html_to_text=True).load_data(
    ["https://en.wikipedia.org/wiki/New_York_City"]
)

service_context_web = ServiceContext.from_defaults(llm=llm, chunk_size=512)
vector_index_web = VectorStoreIndex.from_documents(
    documents,
    service_context=service_context_web
)

query_engine_web = vector_index_web.as_query_engine()

# Evaluation questions and ground truth
eval_questions = [
    "What is the population of New York City as of 2020?",
    "Which borough of New York City has the highest population?",
    "How did New York City get its name?",
]

eval_answers = [
    "8,804,000",
    "Queens",
    "New York City got its name when it came under British control in 1664.",
]

# Create evaluators
faithfulness_eval = FaithfulnessEvaluator(service_context=service_context_web)
relevancy_eval = RelevancyEvaluator(service_context=service_context_web)

# Store results
results = []

print("üîÑ Evaluating queries...\n")

for question, ground_truth in zip(eval_questions, eval_answers):
    # Get response
    response = query_engine_web.query(question)

    # Evaluate faithfulness (like RAGAS faithfulness)
    faith_result = faithfulness_eval.evaluate_response(response=response)

    # Evaluate relevancy (like RAGAS answer_relevancy)
    relev_result = relevancy_eval.evaluate_response(query=question, response=response)

    # Store results
    results.append({
        'question': question,
        'answer': response.response,
        'ground_truth': ground_truth,
        'faithfulness': 1.0 if faith_result.passing else 0.0,
        'answer_relevancy': 1.0 if relev_result.passing else 0.0,
        'faithfulness_feedback': faith_result.feedback,
        'relevancy_feedback': relev_result.feedback
    })

    print(f"Q: {question}")
    print(f"A: {response.response[:150]}...")
    print(f"Faithfulness: {'‚úÖ PASS' if faith_result.passing else '‚ùå FAIL'}")
    print(f"Relevancy: {'‚úÖ PASS' if relev_result.passing else '‚ùå FAIL'}")
    print("-" * 80)

# Calculate aggregate scores
df = pd.DataFrame(results)

print("\nüìä AGGREGATE RESULTS:")
print(f"  Faithfulness: {df['faithfulness'].mean():.2%}")
print(f"  Answer Relevancy: {df['answer_relevancy'].mean():.2%}")

# For context_precision and context_recall, we need retriever evaluation
print("\nüîç Evaluating retriever (context metrics)...")

# Create a simple QA dataset
from llama_index.evaluation import generate_question_context_pairs

# Generate questions from documents
node_parser = service_context_web.node_parser
nodes = node_parser.get_nodes_from_documents(documents)

qa_dataset = generate_question_context_pairs(
    nodes[:5],  # Use first 5 nodes for speed
    llm=OpenAI(model="gpt-3.5-turbo"),
    num_questions_per_chunk=1
)

# Evaluate retriever
retriever = vector_index_web.as_retriever(similarity_top_k=2)
retriever_evaluator = RetrieverEvaluator.from_metric_names(
    ["mrr", "hit_rate"], retriever=retriever
)

import asyncio

async def eval_retriever():
    return await retriever_evaluator.aevaluate_dataset(qa_dataset)

retriever_results = asyncio.run(eval_retriever())

# Calculate retriever metrics
retriever_metrics = [r.metric_vals_dict for r in retriever_results]
retriever_df = pd.DataFrame(retriever_metrics)

print(f"  Hit Rate (context_recall): {retriever_df['hit_rate'].mean():.2%}")
print(f"  MRR (context_precision): {retriever_df['mrr'].mean():.4f}")

# Final summary
print("\n" + "="*60)
print("üìä FINAL EVALUATION SUMMARY")
print("="*60)
print(f"Faithfulness:        {df['faithfulness'].mean():.2%}")
print(f"Answer Relevancy:    {df['answer_relevancy'].mean():.2%}")
print(f"Context Recall:      {retriever_df['hit_rate'].mean():.2%}")
print(f"Context Precision:   {retriever_df['mrr'].mean():.4f}")
print("="*60)

print("\nüéâ Evaluation complete!")

# Show detailed results table
print("\nüìã Detailed Results:")
print(df[['question', 'faithfulness', 'answer_relevancy']])

üîÑ Evaluating queries...

Q: What is the population of New York City as of 2020?
A: The context information provided does not include the population of New York City as of 2020....
Faithfulness: ‚úÖ PASS
Relevancy: ‚úÖ PASS
--------------------------------------------------------------------------------
Q: Which borough of New York City has the highest population?
A: The context information provided does not include details about the population of New York City's boroughs....
Faithfulness: ‚úÖ PASS
Relevancy: ‚ùå FAIL
--------------------------------------------------------------------------------
Q: How did New York City get its name?
A: The context provided does not contain information on how New York City got its name....
Faithfulness: ‚úÖ PASS
Relevancy: ‚úÖ PASS
--------------------------------------------------------------------------------

üìä AGGREGATE RESULTS:
  Faithfulness: 100.00%
  Answer Relevancy: 66.67%

üîç Evaluating retriever (context metrics)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:01<00:00,  1.33s/it]


  Hit Rate (context_recall): 0.00%
  MRR (context_precision): 0.0000

üìä FINAL EVALUATION SUMMARY
Faithfulness:        100.00%
Answer Relevancy:    66.67%
Context Recall:      0.00%
Context Precision:   0.0000

üéâ Evaluation complete!

üìã Detailed Results:
                                            question  faithfulness  \
0  What is the population of New York City as of ...           1.0   
1  Which borough of New York City has the highest...           1.0   
2                How did New York City get its name?           1.0   

   answer_relevancy  
0               1.0  
1               0.0  
2               1.0  


In [6]:
import os

os.environ["OPENAI_API_KEY"] = "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"

from llama_index import VectorStoreIndex, ServiceContext
from llama_index.llms import OpenAI
from llama_index.readers.web import SimpleWebPageReader

llm = OpenAI(model="gpt-4", temperature=0.0)

# Load web data
documents = SimpleWebPageReader(html_to_text=True).load_data(
    ["https://en.wikipedia.org/wiki/New_York_City"]
)

print(f"‚úÖ Loaded document with {len(documents[0].text)} characters")

# BETTER SETTINGS: Larger chunks, more retrieval
service_context_web = ServiceContext.from_defaults(
    llm=llm,
    chunk_size=1024,      # ‚Üê Larger chunks (was 512)
    chunk_overlap=200     # ‚Üê Add overlap for better context
)

vector_index_web = VectorStoreIndex.from_documents(
    documents,
    service_context=service_context_web
)

# Retrieve MORE chunks for better context
query_engine_web = vector_index_web.as_query_engine(
    similarity_top_k=5  # ‚Üê Get top 5 chunks (was 2)
)

# Test the queries again
eval_questions = [
    "What is the population of New York City as of 2020?",
    "Which borough of New York City has the highest population?",
    "How did New York City get its name?",
]

print("\nüìä TESTING WITH BETTER SETTINGS:\n")

for question in eval_questions:
    response = query_engine_web.query(question)
    print(f"Q: {question}")
    print(f"A: {response.response[:300]}...")
    print(f"Retrieved {len(response.source_nodes)} chunks")
    print("-" * 80)

‚úÖ Loaded document with 127 characters

üìä TESTING WITH BETTER SETTINGS:

Q: What is the population of New York City as of 2020?
A: The context provided does not include information about the population of New York City as of 2020....
Retrieved 1 chunks
--------------------------------------------------------------------------------
Q: Which borough of New York City has the highest population?
A: The context information provided does not include details about the population of New York City's boroughs....
Retrieved 1 chunks
--------------------------------------------------------------------------------
Q: How did New York City get its name?
A: The context provided does not contain information on how New York City got its name....
Retrieved 1 chunks
--------------------------------------------------------------------------------


In [7]:
import os
import pandas as pd
from datetime import datetime

os.environ["OPENAI_API_KEY"] = "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"

from llama_index import VectorStoreIndex, ServiceContext
from llama_index.llms import OpenAI
from llama_index.readers.web import SimpleWebPageReader
from llama_index.evaluation import (
    FaithfulnessEvaluator,
    RelevancyEvaluator,
    RetrieverEvaluator,
    generate_question_context_pairs
)
import asyncio

# Load web data once
documents = SimpleWebPageReader(html_to_text=True).load_data(
    ["https://en.wikipedia.org/wiki/New_York_City"]
)

print(f"‚úÖ Loaded document with {len(documents[0].text)} characters\n")

# Evaluation questions
eval_questions = [
    "What is the population of New York City as of 2020?",
    "Which borough of New York City has the highest population?",
    "How did New York City get its name?",
]

eval_answers = [
    "8,804,000",
    "Brooklyn",  # ‚Üê Fixed: Brooklyn is actually highest, not Queens
    "New York City got its name when it came under British control in 1664.",
]

# Function to evaluate a configuration
def evaluate_config(config_name, chunk_size, chunk_overlap, similarity_top_k):
    print(f"\n{'='*80}")
    print(f"üî¨ TESTING: {config_name}")
    print(f"   Chunk Size: {chunk_size}, Overlap: {chunk_overlap}, Top-K: {similarity_top_k}")
    print(f"{'='*80}\n")

    llm = OpenAI(model="gpt-4", temperature=0.0)

    # Create service context with specific settings
    service_context = ServiceContext.from_defaults(
        llm=llm,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )

    # Build index
    vector_index = VectorStoreIndex.from_documents(
        documents,
        service_context=service_context
    )

    # Create query engine with specific top_k
    query_engine = vector_index.as_query_engine(similarity_top_k=similarity_top_k)

    # Create evaluators
    faithfulness_eval = FaithfulnessEvaluator(service_context=service_context)
    relevancy_eval = RelevancyEvaluator(service_context=service_context)

    # Evaluate questions
    results = []

    for question, ground_truth in zip(eval_questions, eval_answers):
        response = query_engine.query(question)

        faith_result = faithfulness_eval.evaluate_response(response=response)
        relev_result = relevancy_eval.evaluate_response(query=question, response=response)

        # Check if answer actually contains useful info (not just "I don't know")
        has_answer = "does not" not in response.response.lower() and len(response.response) > 50

        results.append({
            'config': config_name,
            'question': question,
            'answer': response.response,
            'ground_truth': ground_truth,
            'faithfulness': 1.0 if faith_result.passing else 0.0,
            'answer_relevancy': 1.0 if relev_result.passing else 0.0,
            'has_answer': 1.0 if has_answer else 0.0,
            'answer_length': len(response.response),
            'chunks_retrieved': len(response.source_nodes)
        })

        print(f"Q: {question}")
        print(f"A: {response.response[:200]}...")
        print(f"‚úì Faithful: {faith_result.passing} | Relevant: {relev_result.passing} | Has Answer: {has_answer}")
        print("-" * 80)

    df = pd.DataFrame(results)

    # Retriever evaluation
    node_parser = service_context.node_parser
    nodes = node_parser.get_nodes_from_documents(documents)

    qa_dataset = generate_question_context_pairs(
        nodes[:5],
        llm=OpenAI(model="gpt-3.5-turbo"),
        num_questions_per_chunk=1
    )

    retriever = vector_index.as_retriever(similarity_top_k=similarity_top_k)
    retriever_evaluator = RetrieverEvaluator.from_metric_names(
        ["mrr", "hit_rate"], retriever=retriever
    )

    async def eval_retriever():
        return await retriever_evaluator.aevaluate_dataset(qa_dataset)

    retriever_results = asyncio.run(eval_retriever())
    retriever_metrics = [r.metric_vals_dict for r in retriever_results]
    retriever_df = pd.DataFrame(retriever_metrics)

    # Summary metrics
    summary = {
        'config': config_name,
        'chunk_size': chunk_size,
        'chunk_overlap': chunk_overlap,
        'similarity_top_k': similarity_top_k,
        'faithfulness': df['faithfulness'].mean(),
        'answer_relevancy': df['answer_relevancy'].mean(),
        'has_answer_rate': df['has_answer'].mean(),
        'avg_answer_length': df['answer_length'].mean(),
        'context_recall': retriever_df['hit_rate'].mean(),
        'context_precision': retriever_df['mrr'].mean(),
        'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    }

    print(f"\nüìä SUMMARY for {config_name}:")
    print(f"  Faithfulness:      {summary['faithfulness']:.2%}")
    print(f"  Answer Relevancy:  {summary['answer_relevancy']:.2%}")
    print(f"  Has Answer Rate:   {summary['has_answer_rate']:.2%} ‚Üê KEY METRIC!")
    print(f"  Context Recall:    {summary['context_recall']:.2%}")
    print(f"  Context Precision: {summary['context_precision']:.4f}")

    return summary, df

# ============================================
# RUN EXPERIMENTS WITH DIFFERENT CONFIGS
# ============================================

all_summaries = []
all_details = []

# Config 1: BASELINE (your current settings)
summary1, details1 = evaluate_config(
    config_name="Baseline",
    chunk_size=512,
    chunk_overlap=0,
    similarity_top_k=2
)
all_summaries.append(summary1)
all_details.append(details1)

# Config 2: LARGER CHUNKS
summary2, details2 = evaluate_config(
    config_name="Larger Chunks",
    chunk_size=1024,
    chunk_overlap=200,
    similarity_top_k=3
)
all_summaries.append(summary2)
all_details.append(details2)

# Config 3: MORE RETRIEVAL
summary3, details3 = evaluate_config(
    config_name="More Retrieval",
    chunk_size=1024,
    chunk_overlap=200,
    similarity_top_k=5
)
all_summaries.append(summary3)
all_details.append(details3)

# ============================================
# COMPARE ALL CONFIGS
# ============================================

print("\n" + "="*80)
print("üèÜ COMPARISON OF ALL CONFIGURATIONS")
print("="*80)

comparison_df = pd.DataFrame(all_summaries)
print(comparison_df[['config', 'has_answer_rate', 'faithfulness', 'answer_relevancy', 'context_recall', 'context_precision']])

# Find best config
best_config = comparison_df.loc[comparison_df['has_answer_rate'].idxmax()]
print(f"\nü•á BEST CONFIG: {best_config['config']}")
print(f"   Has Answer Rate: {best_config['has_answer_rate']:.2%}")
print(f"   Settings: chunk_size={best_config['chunk_size']}, overlap={best_config['chunk_overlap']}, top_k={best_config['similarity_top_k']}")

# Save results to CSV
comparison_df.to_csv('rag_evaluation_results.csv', index=False)
print("\nüíæ Results saved to 'rag_evaluation_results.csv'")

print("\nüéâ Evaluation complete!")

‚úÖ Loaded document with 127 characters


üî¨ TESTING: Baseline
   Chunk Size: 512, Overlap: 0, Top-K: 2

Q: What is the population of New York City as of 2020?
A: The context provided does not include information about the population of New York City as of 2020....
‚úì Faithful: True | Relevant: True | Has Answer: False
--------------------------------------------------------------------------------
Q: Which borough of New York City has the highest population?
A: The context information provided does not include details about the population of New York City's boroughs....
‚úì Faithful: True | Relevant: False | Has Answer: False
--------------------------------------------------------------------------------
Q: How did New York City get its name?
A: The context provided does not contain information on how New York City got its name....
‚úì Faithful: True | Relevant: True | Has Answer: False
--------------------------------------------------------------------------------


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  2.00it/s]



üìä SUMMARY for Baseline:
  Faithfulness:      100.00%
  Answer Relevancy:  66.67%
  Has Answer Rate:   0.00% ‚Üê KEY METRIC!
  Context Recall:    0.00%
  Context Precision: 0.0000

üî¨ TESTING: Larger Chunks
   Chunk Size: 1024, Overlap: 200, Top-K: 3

Q: What is the population of New York City as of 2020?
A: The context provided does not include information about the population of New York City as of 2020....
‚úì Faithful: True | Relevant: True | Has Answer: False
--------------------------------------------------------------------------------
Q: Which borough of New York City has the highest population?
A: The context information provided does not include details about the population of New York City's boroughs....
‚úì Faithful: True | Relevant: False | Has Answer: False
--------------------------------------------------------------------------------
Q: How did New York City get its name?
A: The context provided does not contain information on how New York City got its name....
‚

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  1.37it/s]



üìä SUMMARY for Larger Chunks:
  Faithfulness:      100.00%
  Answer Relevancy:  66.67%
  Has Answer Rate:   0.00% ‚Üê KEY METRIC!
  Context Recall:    0.00%
  Context Precision: 0.0000

üî¨ TESTING: More Retrieval
   Chunk Size: 1024, Overlap: 200, Top-K: 5

Q: What is the population of New York City as of 2020?
A: The context provided does not include information about the population of New York City as of 2020....
‚úì Faithful: True | Relevant: True | Has Answer: False
--------------------------------------------------------------------------------
Q: Which borough of New York City has the highest population?
A: The context provided does not contain information about the population of New York City's boroughs....
‚úì Faithful: True | Relevant: False | Has Answer: False
--------------------------------------------------------------------------------
Q: How did New York City get its name?
A: The context provided does not contain information on how New York City got its name....
‚úì

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  1.40it/s]



üìä SUMMARY for More Retrieval:
  Faithfulness:      100.00%
  Answer Relevancy:  66.67%
  Has Answer Rate:   0.00% ‚Üê KEY METRIC!
  Context Recall:    0.00%
  Context Precision: 0.0000

üèÜ COMPARISON OF ALL CONFIGURATIONS
           config  has_answer_rate  faithfulness  answer_relevancy  \
0        Baseline              0.0           1.0          0.666667   
1   Larger Chunks              0.0           1.0          0.666667   
2  More Retrieval              0.0           1.0          0.666667   

   context_recall  context_precision  
0             0.0                0.0  
1             0.0                0.0  
2             0.0                0.0  

ü•á BEST CONFIG: Baseline
   Has Answer Rate: 0.00%
   Settings: chunk_size=512, overlap=0, top_k=2

üíæ Results saved to 'rag_evaluation_results.csv'

üéâ Evaluation complete!


In [8]:
import os
os.environ["OPENAI_API_KEY"] = "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"

from llama_index import VectorStoreIndex, ServiceContext
from llama_index.llms import OpenAI
from llama_index.readers.web import SimpleWebPageReader

# Load the Wikipedia page
print("üì• Loading Wikipedia page...")
documents = SimpleWebPageReader(html_to_text=True).load_data(
    ["https://en.wikipedia.org/wiki/New_York_City"]
)

print(f"\n‚úÖ Loaded {len(documents)} document(s)")
print(f"üìè Document length: {len(documents[0].text)} characters")
print(f"\nüìÑ First 500 characters of document:")
print(documents[0].text[:500])
print("\n" + "="*80)

# Build index
llm = OpenAI(model="gpt-4", temperature=0.0)
service_context = ServiceContext.from_defaults(llm=llm, chunk_size=1024, chunk_overlap=200)
vector_index = VectorStoreIndex.from_documents(documents, service_context=service_context)

# Get the nodes to see what chunks were created
node_parser = service_context.node_parser
nodes = node_parser.get_nodes_from_documents(documents)
print(f"‚úÖ Created {len(nodes)} chunks")

print(f"\nüì¶ Sample chunks:")
for i, node in enumerate(nodes[:3]):
    print(f"\nChunk {i+1} (first 200 chars):")
    print(node.text[:200])
    print("-" * 80)

# Now test retrieval
query_engine = vector_index.as_query_engine(similarity_top_k=5)

test_query = "What is the population of New York City?"
print(f"\nüîç Testing query: '{test_query}'")
response = query_engine.query(test_query)

print(f"\nüìä Retrieved {len(response.source_nodes)} chunks:")
for i, node in enumerate(response.source_nodes):
    print(f"\nRetrieved Chunk {i+1} (score: {node.score:.4f}):")
    print(node.text[:300])
    print("-" * 80)

print(f"\nüí¨ Final Answer:")
print(response.response)

üì• Loading Wikipedia page...

‚úÖ Loaded 1 document(s)
üìè Document length: 127 characters

üìÑ First 500 characters of document:
Please set a user-agent and respect our robot policy https://w.wiki/4wJS. See
also https://phabricator.wikimedia.org/T400119.



‚úÖ Created 1 chunks

üì¶ Sample chunks:

Chunk 1 (first 200 chars):
Please set a user-agent and respect our robot policy https://w.wiki/4wJS. See
also https://phabricator.wikimedia.org/T400119.
--------------------------------------------------------------------------------

üîç Testing query: 'What is the population of New York City?'

üìä Retrieved 1 chunks:

Retrieved Chunk 1 (score: 0.6891):
Please set a user-agent and respect our robot policy https://w.wiki/4wJS. See
also https://phabricator.wikimedia.org/T400119.
--------------------------------------------------------------------------------

üí¨ Final Answer:
The context provided does not include information about the population of New York City.
