In [2]:
import os
import time
import matplotlib.pyplot as plt
from dotenv import load_dotenv

load_dotenv()

from embedding import GeminiEmbeddings
from document_processor import DocumentProcessor, ContextualHeaderProcessor
from vector_store import VectorstoreManager
from retrieval import StandardRetriever, ContextualHeaderRetriever
from llm_interface import get_openrouter_llm, StandardRAGChain, ContextualHeaderRAGChain
from evaluation import RAGEvaluator

# Verify that every API key this notebook relies on is available in the environment
required_env_vars = ["GEMINI_API_KEY", "PINECONE_API_KEY", "OPENROUTER_API_KEY"]

# A variable counts as missing when it is unset or set to an empty string.
missing_vars = list(filter(lambda name: not os.getenv(name), required_env_vars))

if not missing_vars:
    print("All required environment variables are set.")
else:
    print("Missing environment variables: " + ", ".join(missing_vars))
    print("Please set these variables in your .env file.")

All required environment variables are set.


In [3]:
# Configuration: tunable settings read by the cells below.

# Path to the documents directory
DOCS_DIR = "books"

# Size of text chunks
CHUNK_SIZE = 2000

# Overlap between chunks
CHUNK_OVERLAP = 200

# Whether to rebuild the vector store index; the rebuild cell is a no-op while this is False
REBUILD_INDEX = False

# Initialize components

In [5]:
# Build the three shared components used by the rest of the notebook:
# the embedding model, the LLM, and the vector store manager.
# Each step prints a progress line before constructing its component.

# Initialize the embedding model
print("Initializing the embedding model ...")
embeddings = GeminiEmbeddings()

# Initialize the LLM
# NOTE(review): presumably reads OPENROUTER_API_KEY from the environment — confirm in llm_interface.
print("Initializing the LLM ...")
llm = get_openrouter_llm()

# Initialize the vector store manager, handing it the embedding model created above
print("Initializing the vector store")
vectorstore_manager = VectorstoreManager(embeddings)

Initializing the embedding model ...
Initializing the LLM ...
Initializing the vector store


# Process documents and create the vector store

In [None]:
# Rebuild the vector store from the source documents.
# The entire cell is skipped when REBUILD_INDEX is False, in which case
# `standard_docs` and `contextual_docs` are not defined by this cell.
if REBUILD_INDEX:
    print("Rebuilding index from documents ...")

    # Standard document processing: plain chunking of the documents directory
    doc_processor = DocumentProcessor(DOCS_DIR,
                                      chunk_size=CHUNK_SIZE,
                                      chunk_overlap=CHUNK_OVERLAP)
    standard_docs = doc_processor.process_documents()
    print(f"Processed {len(standard_docs)} document chunks for standard RAG")

    # Display a sample document chunk
    if standard_docs:
        sample_standard = standard_docs[0]
        print("\nSample standard document chunk:")
        # Single quotes inside the f-string: reusing the outer double quotes is a
        # SyntaxError on Python versions before 3.12.
        print(f"Source: {sample_standard.metadata.get('source', 'unknown')}")
        print(f"Content (first 300 chars): {sample_standard.page_content[:300]}...")

    # Contextual Header document processing
    # Uses the ContextualHeaderProcessor imported at the top of the notebook and the
    # `llm` instance created earlier (the previous code referenced an undefined `LLM`
    # and instantiated the plain DocumentProcessor).
    # NOTE(review): keyword names are kept as originally written — confirm they match
    # ContextualHeaderProcessor's signature in document_processor.py.
    contextual_doc_processor = ContextualHeaderProcessor(
        llm=llm,
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        docs_dir=DOCS_DIR
    )
    contextual_docs = contextual_doc_processor.process_documents()
    print(f"Processed {len(contextual_docs)} document chunks for contextual RAG")

    # Display a sample contextual document chunk
    if contextual_docs:
        sample_contextual = contextual_docs[0]
        print("\nSample contextual document chunk:")
        print(f"Source: {sample_contextual.metadata.get('source', 'unknown')}")
        print(f"Content (first 300 chars): {sample_contextual.page_content[:300]}...")

    # Clear any existing vector store only after both document sets were processed
    # successfully, so a processing failure does not leave the store empty.
    print("Clearing any existing vector store")
    vectorstore_manager.clear_all()

    # Add documents to vector store
    # NOTE(review): both document sets go through the same add_documents call —
    # confirm VectorstoreManager keeps standard and contextual chunks separate
    # (e.g. via namespaces or metadata), otherwise the two retrievers share one index.
    print("Adding documents to standard RAG vector store...")
    vectorstore_manager.add_documents(standard_docs)
    # Add contextual documents to vector store
    print("Adding documents to contextual RAG vector store...")
    vectorstore_manager.add_documents(contextual_docs)

    

1368000