In [2]:
%pip install llama-index-postprocessor-cohere-rerank
%pip install llama-index

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [3]:
import os.path
import shutil
import logging
import sys
import chromadb
import openai
import time
import nltk
import nest_asyncio
from llama_index.llms.openai import OpenAI
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.node_parser import SentenceWindowNodeParser
from llama_index.core.extractors import (
    TitleExtractor,
    QuestionsAnsweredExtractor,
    KeywordExtractor,
    BaseExtractor,
    SummaryExtractor)
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    StorageContext,
    load_index_from_storage,
)
from llama_index.core.response.notebook_utils import (
    display_source_node,
    display_response,
)
from llama_index.core.postprocessor import MetadataReplacementPostProcessor
from llama_index.postprocessor.cohere_rerank import CohereRerank
from sherpa_reader import LLMSherapaReader
from llama_index.core import SimpleDirectoryReader
from document_sorter import DocumentSorter

# Patch asyncio so event loops can be nested: a later cell calls
# asyncio.run(...) from inside the notebook, which (presumably) already has a
# running event loop — without this patch that call would be rejected.
nest_asyncio.apply()


In [4]:
# Directory for ChromaDB storage: the on-disk path passed to
# chromadb.PersistentClient and removed by the "Delete Previous DB" cell.
PERSIST_DIR = "./chromadb"

## Set LLM

In [5]:
# Chat model shared by the query engine and the question-generation step.
llm = OpenAI(
    model="gpt-3.5-turbo",
    temperature=0.1,
    max_tokens=1024,
)

### Delete Previous DB

In [13]:
# Start from a clean slate: drop whatever store a previous run left on disk.
stale_db_present = os.path.exists(PERSIST_DIR)
if stale_db_present:
    shutil.rmtree(PERSIST_DIR)

## Instantiate ChromaDB

In [12]:
# Open the persistent Chroma database and fetch (or create) the collection.
chroma_client = chromadb.PersistentClient(path=PERSIST_DIR)
chroma_collection = chroma_client.get_or_create_collection(
    "class_materials2"
)

# Wrap the collection as a LlamaIndex vector store and build a storage context on it.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(
    vector_store=vector_store
)

## Load Data

In [7]:
# Load data
print("Loading Data")

# PDFs found under "data" are parsed with LLMSherapaReader.
pdf_extractors = {".pdf": LLMSherapaReader()}
directory_reader = SimpleDirectoryReader("data", file_extractor=pdf_extractors)
documents = directory_reader.load_data()

# Sort the loaded documents into four buckets; `info` is what gets ingested.
info, questions, garbage, broken = DocumentSorter().sort(documents)

print("Data Loaded")

Loading Data


100%|██████████| 243/243 [01:22<00:00,  2.95it/s]

Data Loaded





## Data Loading & Ingestion Pipeline

In [13]:
# Ingest data through the pipeline
sentence_window_parser = SentenceWindowNodeParser.from_defaults(
    window_size=3,  # how many sentences on either side to capture
    window_metadata_key="window",  # metadata key that holds the window of surrounding sentences
    original_text_metadata_key="original_sentence",  # metadata key that holds the original sentence
)
embedding_model = OpenAIEmbedding(model_name="text-embedding-3-large")

# LLM-based extractors that are currently switched off:
#   SummaryExtractor(summaries=["prev", "self", "next"], llm=llm)
#   KeywordExtractor(keywords=3, llm=llm)
pipeline = IngestionPipeline(
    transformations=[sentence_window_parser, embedding_model],
    vector_store=vector_store,
)

nodes_post_pipe = pipeline.run(documents=info)


In [9]:
# Dump the text of every node returned by the pipeline to a file for manual
# inspection, one node per paragraph.
# The encoding is pinned to UTF-8: the node text contains non-ASCII characters
# (curly quotes and en dashes show up in the query output), and the platform
# default encoding is not guaranteed to be able to write them.
with open("nodes_post_pipe.txt", "w", encoding="utf-8") as out_file:
    for node in nodes_post_pipe:
        out_file.write(node.text + "\n\n")

## Indexing

In [14]:
# Build the index directly on the vector store, using the same embedding
# model name that the ingestion pipeline used.
index = VectorStoreIndex.from_vector_store(
    vector_store,
    embed_model=OpenAIEmbedding(model_name="text-embedding-3-large"),
    storage_context=storage_context,
)

## Querying

In [21]:
# Post-processors run on the retrieved nodes in list order: first the
# "window" metadata replacement, then the Cohere reranker (top_n=3).
window_post_processor = MetadataReplacementPostProcessor(target_metadata_key="window")
cohere_api_key = os.environ.get("COHERE_API_KEY")
cohere_rerank = CohereRerank(api_key=cohere_api_key, top_n=3)

query_engine = index.as_query_engine(
    similarity_top_k=10,
    llm=llm,
    node_postprocessors=[window_post_processor, cohere_rerank],
)

# Smoke-test the engine with one question and show the answer with its sources.
response = query_engine.query("what information can you tell me about the textbook?")
display_response(
    response=response,
    source_length=1000,
    show_source=True,
    show_source_metadata=True,
)

**`Final Response:`** The document discusses topics related to human-environment interaction, settlement patterns, and agriculture in the regions of North America known as the United States and Canada. It also provides insights into the early inhabitants of the American Southwest and their resource utilization strategies. Additionally, the document contains points of interest marked by small red squares or pink ribbons.

---

**`Source Node 1/3`**

**Node ID:** c8414a72-76bb-4d38-9ad4-ee395349d62f<br>**Similarity:** 0.7472472<br>**Text:** SETTLEMENT The first inhabitants of the area of North America now known as the United States and Canada were nomads, people who move from place to place.
 Most archaeologists believe that they probably migrated from Asia over Beringia, a land bridge that once connected Siberia and Alaska.
 These migrants moved about the land.
 They hunted game, fished, and gathered edible wild plants.<br>**Metadata:** {'window': 'SETTLEMENT The first inhabitants of the area of North America now known as the United States and Canada were nomads, people who move from place to place.\n Most archaeologists believe that they probably migrated from Asia over Beringia, a land bridge that once connected Siberia and Alaska.\n These migrants moved about the land.\n They hunted game, fished, and gathered edible wild plants.\n', 'original_sentence': 'SETTLEMENT The first inhabitants of the area of North America now known as the United States and Canada were nomads, people who move from place to place.\n', 'page_label': 10, 'file_name': 'geography_chapter5_uscan.pdf', 'section_header': 'Human–Environment Interaction > Settlement and Agriculture Alter the Land', 'file_path': '/Users/elidumper/tai/learning_llama/data/geography_chapter5_uscan.pdf', 'file_type': 'application/pdf', 'file_size': 2495511, 'creation_date': '2024-03-22', 'last_modified_date': '2024-03-12'}<br>

---

**`Source Node 2/3`**

**Node ID:** f86be288-10db-475e-a603-aad6a1ce2e90<br>**Similarity:** 0.65632796<br>**Text:** A HUMAN PERSPECTIVE The sun-baked American Southwest was a harsh environment for its early inhabitants, the ancestors of today’s Pueblo peoples.
 But these early settlers made good use of available resources.
 From the land, they took clay and stone building materials.<br>**Metadata:** {'window': 'A HUMAN PERSPECTIVE The sun-baked American Southwest was a harsh environment for its early inhabitants, the ancestors of today’s Pueblo peoples.\n But these early settlers made good use of available resources.\n From the land, they took clay and stone building materials.', 'original_sentence': 'A HUMAN PERSPECTIVE The sun-baked American Southwest was a harsh environment for its early inhabitants, the ancestors of today’s Pueblo peoples.\n', 'page_label': 10, 'file_name': 'geography_chapter5_uscan.pdf', 'section_header': 'Human–Environment Interaction > Places & Terms', 'file_path': '/Users/elidumper/tai/learning_llama/data/geography_chapter5_uscan.pdf', 'file_type': 'application/pdf', 'file_size': 2495511, 'creation_date': '2024-03-22', 'last_modified_date': '2024-03-12'}<br>

---

**`Source Node 3/3`**

**Node ID:** f7817a9f-df49-4ea9-9786-18896c38cc12<br>**Similarity:** 0.36511704<br>**Text:** Points of interest, such as the Alamo (B–2) or Sea World (A–2), are marked by small red squares or by pink ribbons, depending on their size.<br>**Metadata:** {'window': 'Points of interest, such as the Alamo (B–2) or Sea World (A–2), are marked by small red squares or by pink ribbons, depending on their size.', 'original_sentence': 'Points of interest, such as the Alamo (B–2) or Sea World (A–2), are marked by small red squares or by pink ribbons, depending on their size.', 'page_label': 14, 'file_name': 'geography_chapter5_uscan.pdf', 'section_header': 'A', 'file_path': '/Users/elidumper/tai/learning_llama/data/geography_chapter5_uscan.pdf', 'file_type': 'application/pdf', 'file_size': 2495511, 'creation_date': '2024-03-22', 'last_modified_date': '2024-03-12'}<br>

## Evaluate RAG Embeddings

In [22]:
from llama_index.core.evaluation import generate_question_context_pairs
from llama_index.core.evaluation import RetrieverEvaluator

In [23]:
# Retrieval-only evaluation setup: a top-3 retriever scored on MRR and hit rate.
retriever = index.as_retriever(similarity_top_k=3)

retrieval_metrics = ["mrr", "hit_rate"]
retriever_evaluator = RetrieverEvaluator.from_metric_names(
    retrieval_metrics, retriever=retriever
)

# Have the LLM generate one question per ingested node to form the evaluation set.
qa_dataset = generate_question_context_pairs(
    nodes_post_pipe,
    llm=llm,
    num_questions_per_chunk=1,
)

100%|██████████| 368/368 [05:59<00:00,  1.02it/s]


In [24]:
# Evaluate the retriever on the generated question/context dataset.
# Top-level `await`: this line only runs in an async-aware context such as a
# notebook cell. Each result exposes `metric_dict`, which the next cell reads.
eval_results = await retriever_evaluator.aevaluate_dataset(qa_dataset)

In [25]:
# Average each retrieval metric over all evaluated questions.
n_eval_results = len(eval_results)

mrr_total = sum(res.metric_dict["mrr"].score for res in eval_results)
mrr_score = mrr_total / n_eval_results
print(f"mrr_score: {mrr_score}")

hit_rate_total = sum(res.metric_dict["hit_rate"].score for res in eval_results)
hit_rate_score = hit_rate_total / n_eval_results
print(f"hit_rate_score: {hit_rate_score}")

mrr_score: 0.8223070398642917
hit_rate_score: 0.8651399491094147


## Evaluate Model Responses

In [44]:
%pip install spacy

7654.06s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
Collecting spacy
  Downloading spacy-3.7.4-cp311-cp311-macosx_11_0_arm64.whl.metadata (27 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.10-cp311-cp311-macosx_11_0_arm64.whl.metadata (2.0 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.8-cp311-cp311-macosx_11_0_arm64.whl.metadata (8.4 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.9-cp311-cp311-macosx_11_0_arm64.whl.metadata (2.2 kB)
Collecting thinc<8.3.0,>=8.2.2 (from spacy)
  Downloading thinc-8.2.3-cp311-cp311-macosx_11_0_arm64.whl.metadata (15 kB)
Collecting wasabi<1.2.0,>=0.9.1 (from spacy)
  Downloading wasa

In [39]:

from llama_index.llms.openai import OpenAI
from llama_index.core.evaluation import FaithfulnessEvaluator

In [40]:
# gpt-4 serves as the judge model behind the faithfulness evaluator
gpt4 = OpenAI(model="gpt-4", temperature=0)
evaluator_gpt4 = FaithfulnessEvaluator(llm=gpt4)

In [48]:
from llama_index.core.evaluation import DatasetGenerator


# Build a question generator over the `info` documents and generate the
# evaluation questions (30 is presumably the number requested — a later cell
# slices the result to the first 30 as well).
# NOTE(review): the recorded output of this cell looks like truncated warnings;
# DatasetGenerator may be deprecated in current llama-index — confirm.
question_generator = DatasetGenerator.from_documents(info)
eval_questions = question_generator.generate_questions_from_nodes(30)

  return cls(
  return QueryResponseDataset(queries=queries, responses=responses_dict)


In [49]:
import asyncio


def evaluate_query_engine(query_engine, questions):
    """Answer every question with ``query_engine`` and grade the answers.

    All questions are dispatched concurrently through ``query_engine.aquery``.
    Each response is then checked with the notebook-level ``evaluator_gpt4``
    and counted as correct when its evaluation result is ``passing``.

    Args:
        query_engine: Object exposing an async ``aquery(question)`` method.
        questions: Iterable of questions to send to the engine.

    Returns:
        A ``(total_correct, total)`` tuple: the number of passing responses
        and the number of responses evaluated.
    """

    async def _run_all_queries():
        # asyncio.run() expects a coroutine, while asyncio.gather() returns a
        # Future; awaiting gather() inside this wrapper satisfies that
        # contract and ensures gather() is called with a running event loop.
        return await asyncio.gather(*(query_engine.aquery(q) for q in questions))

    # Inside the notebook this still relies on nest_asyncio.apply() (import
    # cell) so that asyncio.run() may be invoked from the running kernel loop.
    results = asyncio.run(_run_all_queries())
    print("finished query")

    # evaluate with gpt 4
    total_correct = sum(
        1 for r in results if evaluator_gpt4.evaluate_response(response=r).passing
    )

    return total_correct, len(results)

In [52]:
# Grade the query engine on at most the first 30 generated questions.
question_sample = eval_questions[:30]
correct, total = evaluate_query_engine(query_engine, question_sample)

print(f"score: {correct}/{total}")

finished query
score: 24/30


In [38]:
from llama_index.core.evaluation import RelevancyEvaluator, FaithfulnessEvaluator, BatchEvalRunner

queries = list(qa_dataset.queries.values())[:10]


faithfulness_evaluator = FaithfulnessEvaluator()
relevancy_evaluator = RelevancyEvaluator()

runner = BatchEvalRunner(
{"faithfulness": faithfulness_evaluator, "relevancy": relevancy_evaluator},
workers=8,
)
eval_results = await runner.aevaluate_queries(
    query_engine, queries=queries
)
faithfulness_score = sum(result.passing for result in eval_results['faithfulness']) / len(eval_results['faithfulness'])
print(f"faithfulness_score: {faithfulness_score}")

relevancy_score = sum(result.passing for result in eval_results['faithfulness']) / len(eval_results['relevancy'])
print(f"relevancy_score: {relevancy_score}")


faithfulness_score: 1.0
relevancy_score: 1.0
