### Imports and Path setup

In [None]:
import getpass
import os
import pickle
from pathlib import Path

import chromadb

# --- Paths and names used throughout the notebook ---
# Where the multi-question evaluation report is written.
multiquery_rag_output_path = "../RAG Results/multiquery_rag_results.txt"
# On-disk location of the persistent Chroma database (resolved to an absolute path).
Relative_Database_path = "./chroma_Data"
Absolute_Database_path = Path(Relative_Database_path).resolve()
# Pickle produced by the chunking step; holds the pre-split document chunks.
file_path = "../Chunking/harry_potter_chunks.pkl"
# Name of the Chroma collection that stores the chunks.
collection_name = "harry_potter_collection"

# --- API key ---
# Never commit credentials: take the key from the environment and only prompt
# (without echoing) when it is missing. The key that used to be hard-coded on
# this line was committed in plain text and should be revoked.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your GOOGLE_API_KEY: ")


### Chroma Setup and Chunk Loading
Sets up a persistent client and loads the previously computed chunks.

In [None]:
# Open (or create) the on-disk Chroma database.
# The path is passed as a string because that is the documented argument type;
# Absolute_Database_path itself is a pathlib.Path.
client = chromadb.PersistentClient(path=str(Absolute_Database_path))
print(f"[INFO] ChromaDB client initialized at: {Absolute_Database_path}")

# List existing collections.
# Some chromadb releases return Collection objects and others return plain
# names, so fall back to the item itself when it has no `.name` attribute.
existing_collections = client.list_collections()
print(f"Existing collections: {[getattr(c, 'name', c) for c in existing_collections]}")

[INFO] ChromaDB client initialized at: C:\Users\micro\Desktop\Abhinav college\Resources\Sem 7\Advanced NLP\RAG_for_research_papers\VectorDB\chroma_Data
Existing collections: ['harry_potter_collection', 'my_collection']


In [None]:

# Load the pre-computed chunks from disk; no PDF parsing or text splitting
# happens in this notebook.
# SECURITY: pickle.load can execute arbitrary code embedded in the file, so
# only load pickles that you produced yourself.
loaded_docs = []

try:
    with open(file_path, "rb") as f:  # pickles are binary
        loaded_docs = pickle.load(f)
    print(f"Successfully loaded {len(loaded_docs)} chunks from '{file_path}'.")
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found.")
except Exception as e:
    print(f"Error loading file: {e}")

# Show a sample only when something was loaded, so a failed load does not
# print a header with nothing underneath it.
if loaded_docs:
    print("\nHere is the metadata of a loaded chunk:")
    print(loaded_docs[0].metadata)
else:
    print("\nNo chunks were loaded; the following cells will have nothing to index.")

Successfully loaded 657 chunks from '../Chunking/harry_potter_chunks.pkl'.

Here is the metadata of a loaded chunk:
{'source': '../harrypotter.pdf', 'page_number': 1}


### Set up Embedding Function
Will use default SentenceTransformer for generating embeddings

In [3]:
# Install if needed
# %pip install sentence_transformers

from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

# Single source of truth for the model name, so the embedding function and the
# log message below cannot drift apart.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"

# Embedding function that Chroma calls for both stored documents and queries.
embedding_function = SentenceTransformerEmbeddingFunction(model_name=EMBEDDING_MODEL_NAME)
print(f"Embedding function initialized with model: {EMBEDDING_MODEL_NAME}")

Embedding function initialized with model: all-MiniLM-L6-v2


### Creating new Collection

In [None]:
from datetime import datetime

# Rebuild the collection from scratch so re-running the notebook never mixes
# old and new chunks. delete_collection raises when the collection does not
# exist, which made this cell fail on a fresh database, so check first.
# list_collections may yield Collection objects or plain names depending on
# the chromadb release; getattr handles both.
existing_names = {getattr(c, "name", c) for c in client.list_collections()}
if collection_name in existing_names:
    client.delete_collection(name=collection_name)

collection = client.get_or_create_collection(
    name=collection_name,
    embedding_function=embedding_function,
    metadata={
        "description": "Harry Potter book chunks",
        "created": str(datetime.now()),
    },
)

print(f"Collection '{collection_name}' created or accessed successfully")

Collection 'harry_potter_collection' created or accessed successfully


### Add data to collection
The chunks have to be given an id and added to the collection now

In [7]:
import uuid

# Build the three parallel lists ChromaDB expects.
# IDs are derived from each chunk's position in `loaded_docs`, so they are
# deterministic: the same pickle always yields the same IDs.
ids = [f"hp_chunk_{i}" for i in range(len(loaded_docs))]
documents = [doc.page_content for doc in loaded_docs]
metadatas = [doc.metadata for doc in loaded_docs]

# Write in batches to keep each request (and its embedding pass) bounded.
batch_size = 500
total_added = 0

for start in range(0, len(ids), batch_size):
    end_idx = min(start + batch_size, len(ids))

    # upsert instead of add: re-running this cell overwrites the same IDs
    # rather than colliding with entries written by a previous run.
    collection.upsert(
        ids=ids[start:end_idx],
        documents=documents[start:end_idx],
        metadatas=metadatas[start:end_idx],
    )

    total_added += end_idx - start
    print(f"Added batch: {start} to {end_idx-1} ({end_idx-start} items)")

print(f"Successfully added {total_added} documents to collection '{collection_name}'")

Added batch: 0 to 499 (500 items)
Added batch: 500 to 656 (157 items)
Successfully added 657 documents to collection 'harry_potter_collection'


In [8]:
# Sanity check: how many entries does the collection hold now?
count = collection.count()
print(f"Total documents in collection: {count}")

# Fetch the first three stored entries and print a short preview of each.
peek = collection.peek(limit=3)
print("\nSample entries:")
sample_rows = zip(peek['ids'], peek['documents'], peek['metadatas'])
for position, (entry_id, entry_text, entry_meta) in enumerate(sample_rows, start=1):
    print(f"\n--- Document {position} ---")
    print(f"ID: {entry_id}")
    print(f"Text: {entry_text[:100]}...")
    print(f"Metadata: {entry_meta}")

Total documents in collection: 657

Sample entries:

--- Document 1 ---
ID: hp_chunk_0
Text: M
 
CHAPTER  ONE
THE BOY WHO LIVED
r....
Metadata: {'page_number': 1, 'source': '../harrypotter.pdf'}

--- Document 2 ---
ID: hp_chunk_1
Text: and Mrs. Dursley, of number four, Privet Drive, were proud to say
that they were perfectly normal, t...
Metadata: {'page_number': 1, 'source': '../harrypotter.pdf'}

--- Document 3 ---
ID: hp_chunk_2
Text: When Mr....
Metadata: {'page_number': 2, 'source': '../harrypotter.pdf'}


### Querying the Database

In [9]:
# Optional dependency: render results as a `rich` table when the package is
# installed, otherwise fall back to plain print().
try:
    from rich.console import Console
    from rich.table import Table

    console = Console()
    use_rich = True
except ImportError:
    use_rich = False
    print("Rich package not found. Using standard print.")


def _preview(text, limit=100):
    """Return at most `limit` characters of `text`, adding "..." only when cut."""
    return text[:limit] + "..." if len(text) > limit else text


def _first_query_column(results, key, size):
    """Return the first query's list stored under `key`, or `size` Nones if absent."""
    column = results.get(key)
    return column[0] if column else [None] * size


def _format_distance(dist):
    """Format a distance with four decimals, or "N/A" when it is missing."""
    return "N/A" if dist is None else f"{dist:.4f}"


def print_results(results, use_rich=use_rich):
    """Display the hits of the first query in a Chroma-style result dict.

    Args:
        results: Mapping with a "documents" entry and, optionally, "ids",
            "metadatas" and "distances". Each entry is a list holding one
            list per query; only the first query's list is shown. Missing or
            None entries are rendered as "N/A"/None instead of raising.
        use_rich: Render a rich table when True, plain text otherwise.
            Defaults to whether `rich` could be imported.

    Both renderers show the same fields and truncate the text identically:
    "..." is appended only when the text is longer than 100 characters.
    """
    docs = results['documents'][0]
    ids = _first_query_column(results, 'ids', len(docs))
    metas = _first_query_column(results, 'metadatas', len(docs))
    distances = _first_query_column(results, 'distances', len(docs))
    rows = list(zip(docs, ids, metas, distances))

    if use_rich:
        table = Table(show_header=True, header_style="bold magenta")
        table.add_column("Rank", width=6)
        table.add_column("Document ID")
        table.add_column("Document Text", width=60)
        table.add_column("Page")
        table.add_column("Distance")

        for rank, (doc, doc_id, meta, dist) in enumerate(rows, start=1):
            table.add_row(
                str(rank),
                "N/A" if doc_id is None else str(doc_id),
                _preview(doc),
                # A hit may carry no metadata at all; treat that as an empty dict.
                str((meta or {}).get('page_number', 'N/A')),
                _format_distance(dist),
            )

        console.print(table)
    else:
        # Standard print version
        for rank, (doc, doc_id, meta, dist) in enumerate(rows, start=1):
            print(f"\n--- Result {rank} ---")
            print(f"ID: {'N/A' if doc_id is None else doc_id}")
            print(f"Text: {_preview(doc)}")
            print(f"Metadata: {meta}")
            print(f"Distance: {_format_distance(dist)}")



In [11]:
# Ask a single question and display the three closest chunks.
query = "Who was Dumbledore? When was he first introduced?"
results = collection.query(query_texts=[query], n_results=3, include=["documents", "metadatas", "distances"])

print(f"\nResults for query: '{query}'")
print_results(results)


Results for query: 'Who was Dumbledore? When was he first introduced?'


### Natural Language Generation

In [12]:
!pip install google-generativeai langchain-google-genai



In [None]:
import os
import google.generativeai as genai
from langchain_google_genai import ChatGoogleGenerativeAI


# Chat model used for the answer-generation step of the RAG pipeline.
# NOTE(review): presumably authenticates through the GOOGLE_API_KEY
# environment variable configured in the first cell — confirm.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0.7,
)

In [15]:
from langchain.prompts import PromptTemplate

# Prompt that restricts the model to the retrieved context. It expects two
# variables: `context` (the retrieved chunks) and `query` (the user question).
rag_prompt_template = """
You are an expert on Harry Potter books. Answer questions using ONLY the context below.
If you can't find a complete answer in the context but see partial information, try to provide what you can find and acknowledge the limitations of the available information.
If there is NO relevant information at all in the context, respond with "I don't have enough information to answer this question."

Context:
{context}

Question: {query}

Answer (based only on the context provided):
"""

prompt = PromptTemplate(input_variables=["context", "query"], template=rag_prompt_template)

In [16]:
!pip install rank_bm25



In [23]:
from rank_bm25 import BM25Okapi
import numpy as np

def _tokenize_for_bm25(text):
    """Lower-case `text` and split it into word tokens, dropping punctuation."""
    import re

    return re.findall(r"\w+", text.lower())


def _reciprocal_rank_fusion(rankings, k=60):
    """Merge ranked id lists into one list ordered by summed 1 / (k + rank)."""
    fused = {}
    for ranking in rankings:
        for rank, doc_id in enumerate(ranking, start=1):
            fused[doc_id] = fused.get(doc_id, 0.0) + 1.0 / (k + rank)
    # sorted() is stable, so ties keep the order in which ids were first seen.
    return sorted(fused, key=fused.get, reverse=True)


def answer_with_hybrid_rag(query, n_results=5, rrf_k=60):
    """Answer `query` using hybrid (semantic + BM25 keyword) retrieval and the LLM.

    Steps:
      1. Semantic search: the top `n_results` chunks from the Chroma collection.
      2. Keyword search: BM25 over every chunk in the collection, keeping the
         top `n_results` chunks that match at least one query term.
      3. Fusion: both rankings are merged with reciprocal rank fusion and the
         best `n_results` chunks are kept, so keyword hits can actually reach
         the context instead of being cut off behind the semantic hits.
      4. Generation: the chunks are formatted into the prompt and sent to the LLM.

    Args:
        query: The user question.
        n_results: Number of chunks taken from each retriever and kept after fusion.
        rrf_k: Smoothing constant of reciprocal rank fusion; larger values
            flatten the difference between ranks.

    Returns:
        dict with keys:
          "query": the original question,
          "answer": the generated answer text,
          "source_documents": a query-result-shaped dict ("documents",
            "metadatas", "distances", "ids", each a single-element list of
            lists) usable with print_results. Distances are the semantic
            distances where known and NaN for chunks found only by keywords.

    Note: the BM25 index is rebuilt on every call, which is acceptable for a
    few hundred chunks but should be cached for larger collections.
    """
    # 1. Semantic search with ChromaDB
    semantic_results = collection.query(
        query_texts=[query],
        n_results=n_results,
        include=["documents", "metadatas", "distances"],
    )
    semantic_ids = list(semantic_results["ids"][0])

    # Payload of every candidate chunk, keyed by id: (text, metadata, distance).
    chunk_by_id = {}
    for doc_id, doc, meta, dist in zip(
        semantic_ids,
        semantic_results["documents"][0],
        semantic_results["metadatas"][0],
        semantic_results["distances"][0],
    ):
        chunk_by_id[doc_id] = (doc, meta, dist)

    # 2. Keyword search with BM25 across the WHOLE collection (no limit, so no
    #    chunk is invisible to the keyword retriever).
    all_docs = collection.get(include=["documents", "metadatas"])
    keyword_ids = []
    if all_docs["documents"]:  # BM25 cannot be built on an empty corpus
        tokenized_docs = [_tokenize_for_bm25(doc) for doc in all_docs["documents"]]
        bm25 = BM25Okapi(tokenized_docs)
        bm25_scores = bm25.get_scores(_tokenize_for_bm25(query))

        ranked_indices = sorted(
            range(len(bm25_scores)),
            key=lambda idx: bm25_scores[idx],
            reverse=True,
        )
        for idx in ranked_indices[:n_results]:
            if bm25_scores[idx] <= 0:
                # Remaining chunks share no term with the query.
                break
            doc_id = all_docs["ids"][idx]
            keyword_ids.append(doc_id)
            chunk_by_id.setdefault(
                doc_id,
                (all_docs["documents"][idx], all_docs["metadatas"][idx], float("nan")),
            )

    # 3. Fuse both rankings and keep the best n_results chunks overall.
    combined_ids = _reciprocal_rank_fusion([semantic_ids, keyword_ids], k=rrf_k)[:n_results]
    combined_docs = [chunk_by_id[doc_id][0] for doc_id in combined_ids]
    combined_meta = [chunk_by_id[doc_id][1] for doc_id in combined_ids]
    combined_distances = [chunk_by_id[doc_id][2] for doc_id in combined_ids]

    # 4. Format the context and run generation.
    formatted_docs = []
    for doc, meta in zip(combined_docs, combined_meta):
        page_num = (meta or {}).get("page_number", "unknown")
        formatted_docs.append(f"[Page {page_num}]: {doc}")

    context = "\n\n---\n\n".join(formatted_docs)
    filled_prompt = prompt.format(context=context, query=query)
    response = llm.invoke(filled_prompt)

    # Shaped like a collection.query() result so print_results can display it.
    source_documents = {
        "documents": [combined_docs],
        "metadatas": [combined_meta],
        "distances": [combined_distances],
        "ids": [combined_ids],
    }

    return {
        "query": query,
        "answer": response.content if hasattr(response, 'content') else str(response),
        "source_documents": source_documents,
    }

In [24]:
# Smoke-test the hybrid pipeline on one question and show what it retrieved.
test_query = "What happened when Harry first met Hagrid?"
response = answer_with_hybrid_rag(test_query)

for line in (
    f"Question: {test_query}",
    f"\nAnswer: {response['answer']}",
    "\nSources:",
):
    print(line)
print_results(response["source_documents"])

Question: What happened when Harry first met Hagrid?

Answer: I don't have enough information to answer this question. The provided context describes several interactions between Harry and Hagrid, but none of them explicitly detail their first meeting.

Sources:


In [None]:
# Run several questions through the pipeline, print each answer with its top
# source chunk, and collect everything for a plain-text report.
results_for_export = []

test_questions = [
    "Who is Voldemort and why is he feared?",
    "What are the four houses at Hogwarts?",
    "How did Harry survive the killing curse as a baby?"
]

for question in test_questions:
    print("\n" + "="*50)
    print(f"Question: {question}")
    response = answer_with_hybrid_rag(question)
    print(f"\nAnswer: {response['answer']}")
    print("\nTop source:")

    source_docs = response["source_documents"]["documents"][0]
    if source_docs:
        top_doc = source_docs[0]
        top_meta = response["source_documents"]["metadatas"][0][0]
        page = top_meta.get("page_number", "N/A")
        print(f"[Page {page}]:\n{top_doc}")  # Print full chunk
    else:
        top_doc = None
        page = None
        print("No sources found.")

    # One record per question, whether or not a source was found.
    results_for_export.append({
        "question": question,
        "answer": response['answer'],
        "page": page,
        "chunk": top_doc
    })

# Export results to a well-formatted text file.
# Create the target directory first so the export cannot fail at the very end
# (after all LLM calls have been made) just because the folder is missing.
output_path = Path(multiquery_rag_output_path)
output_path.parent.mkdir(parents=True, exist_ok=True)
with output_path.open("w", encoding="utf-8") as f:
    f.write("RAG Multi-Query Evaluation Results\n")
    f.write("="*60 + "\n\n")
    for idx, res in enumerate(results_for_export, 1):
        f.write(f"Question {idx}: {res['question']}\n")
        f.write(f"Answer:\n{res['answer']}\n\n")
        if res["chunk"] is not None:
            f.write(f"Top Source Chunk (Page {res['page']}):\n{res['chunk']}\n")
        else:
            f.write("Top Source Chunk: No sources found.\n")
        f.write("-"*60 + "\n\n")
print(f"\nResults exported to {multiquery_rag_output_path}")


Question: Who is Voldemort and why is he feared?

Answer: Voldemort is a powerful wizard who started gaining followers about twenty years ago, causing "Dark days" where people didn't know who to trust or dare to get friendly with strange wizards or witches. He is referred to as "Lord Voldemort" by his follower Quirrell, who considers him a "great wizard" and his "master." Voldemort believes "there is no good and evil, there is only power, and those too weak to seek it." He had powers Dumbledore says he will never have, and Dumbledore is described as the "only one Voldemort was frightened of." He is still "out there somewhere, perhaps looking for another body to share."

Voldemort is feared because of the "dark days" he caused. People are so afraid of him that they often avoid saying his name, referring to him as "You-Know-Who," as Dumbledore explains that "Fear of a name increases fear of the thing itself." Hagrid gulps and shudders at the mention of his name, and Professor McGonagall