**Narrowing the search space with a metadata filter leads to faster retrieval at scale**

In [None]:
# Install with %pip so the packages land in the environment of the running kernel.
%pip install PyMuPDF
%pip install langchain-community
%pip uninstall camelot -y
%pip install "camelot-py[cv]"
%pip install langchain-qdrant
# Provides the langchain_google_genai module imported by the embedding cells below.
%pip install langchain-google-genai

In [2]:
import os
import fitz  # PyMuPDF
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

# ---------- CONFIG ----------
# Directory that is scanned for input PDF files.
PDF_FOLDER = "./pdfs"
# Passed to the splitter below as its chunk_size / chunk_overlap settings.
CHUNK_SIZE = 75
CHUNK_OVERLAP = 25
# ----------------------------

# Shared text splitter used for every PDF. The separators are listed from
# coarsest (blank line) to finest (empty string).
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_overlap=CHUNK_OVERLAP,
    chunk_size=CHUNK_SIZE,
)

def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF, in page order.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (fitz).

    Returns:
        The text of all pages, each page followed by a newline, with leading
        and trailing whitespace stripped from the combined string.

    Raises:
        Whatever fitz.open or page.get_text raises; the document is closed
        before the exception propagates.
    """
    doc = fitz.open(pdf_path)
    try:
        # Collect page texts and join once instead of growing a string with +=.
        page_texts = [page.get_text("text") + "\n" for page in doc]
    finally:
        # Release the file handle even when a page fails to extract.
        doc.close()
    return "".join(page_texts).strip()

def create_chunks_from_pdfs(pdf_folder=PDF_FOLDER):
    """Split every PDF in a folder into text chunks tagged with their source file.

    Args:
        pdf_folder: Directory to scan (non-recursively) for entries whose name
            ends in ".pdf", compared case-insensitively. Defaults to PDF_FOLDER.

    Returns:
        A list of Document objects, one per chunk, each carrying the metadata
        {"source": <pdf file name>}. Files are processed in sorted name order,
        so the chunk order is reproducible for the same folder contents.

    Raises:
        FileNotFoundError: If the folder contains no PDF files.
    """
    # os.listdir returns entries in arbitrary order; sort so that chunk order
    # (and everything derived from it) is the same on every run.
    pdf_files = sorted(
        name for name in os.listdir(pdf_folder) if name.lower().endswith(".pdf")
    )
    if not pdf_files:
        raise FileNotFoundError(f"No PDFs found in {pdf_folder}")

    all_chunks = []
    for pdf_file in pdf_files:
        pdf_text = extract_text_from_pdf(os.path.join(pdf_folder, pdf_file))

        # Split into chunks with the module-level splitter
        chunks = splitter.split_text(pdf_text)

        # Wrap each chunk in a Document whose only metadata is its source file
        all_chunks.extend(
            Document(page_content=chunk, metadata={"source": pdf_file})
            for chunk in chunks
        )

        print(f"{pdf_file}: {len(chunks)} chunks created")

    print(f"\n‚úÖ Total chunks created: {len(all_chunks)}")
    return all_chunks


# Example run: build the chunk list from the default folder and preview it.
# The guard is true when this code executes as the main program.
if __name__ == "__main__":
    documents = create_chunks_from_pdfs()
    # Show sample: metadata of the first three chunks plus at most 200
    # characters of their content, with newlines flattened to spaces.
    for i, doc in enumerate(documents[:3], start=1):
        print(f"\nChunk {i} Metadata:", doc.metadata)
        print("Content:", doc.page_content[:200].replace("\n", " "), "...")


finance_policy.pdf: 4 chunks created
hr_rules.pdf: 3 chunks created
it_manual.pdf: 3 chunks created

‚úÖ Total chunks created: 10

Chunk 1 Metadata: {'source': 'finance_policy.pdf'}
Content: Finance Department Policy: ...

Chunk 2 Metadata: {'source': 'finance_policy.pdf'}
Content: All employees must submit their expense reports by the 5th of each month. ...

Chunk 3 Metadata: {'source': 'finance_policy.pdf'}
Content: the 5th of each month. ...


In [None]:
# Assuming you already have this list from previous step
# documents = create_chunks_from_pdfs()

def display_all_chunks(documents):
    """Print a numbered listing of every chunk.

    A header line with the total count is printed first. Each chunk then gets
    a 1-based "Chunk N" banner, its 'source' metadata value (None when the key
    is missing) and its full content with surrounding whitespace stripped.

    Args:
        documents: Sequence of objects exposing `metadata` (a dict) and
            `page_content` (a string).
    """
    total = len(documents)
    print(f"\nüìÑ Total chunks: {total}\n")

    for position, chunk in enumerate(documents, 1):
        print(f"---- Chunk {position} ----")
        source_name = chunk.metadata.get('source')
        print(f"Source : {source_name}")
        body = chunk.page_content.strip()
        print(f"Content:\n{body}\n")

# Example usage: rebuild the chunk list from the default PDF folder, then print every chunk.

documents = create_chunks_from_pdfs()
display_all_chunks(documents)


In [5]:
from qdrant_client import QdrantClient
from langchain_community.vectorstores import Qdrant
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Initialize Gemini embeddings
embeddings = GoogleGenerativeAIEmbeddings(model="gemini-embedding-001")

# Local on-disk Qdrant folder and the collection that holds the PDF chunks
qdrant_path = "./local_qdrant_store"
collection_name = "pdf_chunks_store"

# Embed the chunks and persist them under qdrant_path (local storage, no remote server).
# force_recreate=True rebuilds the collection on each run; without it, every
# Restart & Run All would append another copy of the same chunks to the
# collection already saved on disk, and searches would return duplicates.
vectorstore = Qdrant.from_documents(
    documents=documents,
    embedding=embeddings,
    path=qdrant_path,  # 'path' selects local on-disk storage instead of 'location'
    collection_name=collection_name,
    force_recreate=True,
)

print(f"\n‚úÖ Stored {len(documents)} chunks with metadata in local Qdrant collection: '{collection_name}'")



‚úÖ Stored 10 chunks with metadata in local Qdrant collection: 'pdf_chunks_store'


In [1]:
from qdrant_client import QdrantClient
from langchain_qdrant import Qdrant
# from langchain_community.vectorstores import Qdrant
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Initialize Gemini embeddings
embeddings = GoogleGenerativeAIEmbeddings(model="gemini-embedding-001")

collection_name = "pdf_chunks_store"

# A local Qdrant folder is locked by the client that opened it, so opening a
# second client on the same path from the same kernel fails. When an earlier
# cell already left a `vectorstore` in this kernel, reuse its client; only
# open the folder ourselves on a fresh kernel.
# NOTE(review): prefer_grpc only selects the transport for a remote server and
# is not expected to affect local folder locking - kept as in the original.
try:
    client = vectorstore.client
except NameError:
    client = QdrantClient(path="./local_qdrant_store", prefer_grpc=False)

# Reconnect to the existing Qdrant collection.
# NOTE(review): the recorded output suggests this constructor emits a warning;
# the dict-style `filter=` used by the search cells relies on this class, so
# confirm filter handling before migrating to a newer vector-store class.
vectorstore = Qdrant(
    client=client,
    collection_name=collection_name,
    embeddings=embeddings
)

  vectorstore = Qdrant(


In [6]:
import time

query = "What is the hr leave policy?"
# NOTE(review): the query asks about HR leave while the filter restricts the
# search to the finance PDF - confirm this mismatch is intended for the demo.
metadata_filter = {"source": "finance_policy.pdf"}

# Time the search with perf_counter: it is monotonic and high-resolution, so
# it suits short intervals (time.time() can jump when the wall clock changes).
# NOTE(review): the measured span presumably includes embedding the query, so
# it is not a pure vector-search latency - confirm before comparing timings.
start_time = time.perf_counter()

results = vectorstore.similarity_search(
    query=query,
    filter=metadata_filter,
    k=10
)

elapsed_time = time.perf_counter() - start_time

print(f"\nüîç Filtered Search (source = {metadata_filter['source']})")
print(f"‚è±Ô∏è Time taken: {elapsed_time:.4f} seconds\n")

# Show at most the first 120 characters of each hit together with its metadata.
for i, doc in enumerate(results, start=1):
    print(f"Result {i}: {doc.page_content[:120]}...")
    print(f"Metadata: {doc.metadata}\n")



üîç Filtered Search (source = finance_policy.pdf)
‚è±Ô∏è Time taken: 0.7412 seconds

Result 1: Reimbursements are processed within 10 working days after approval....
Metadata: {'source': 'finance_policy.pdf', '_id': 'b695d502daff438c8911b365442eae80', '_collection_name': 'pdf_chunks_store'}

Result 2: Finance Department Policy:...
Metadata: {'source': 'finance_policy.pdf', '_id': '7eb4deb0948942259151416397312767', '_collection_name': 'pdf_chunks_store'}

Result 3: the 5th of each month....
Metadata: {'source': 'finance_policy.pdf', '_id': '8d3df158d3db4d61a52680985b8d6ebf', '_collection_name': 'pdf_chunks_store'}

Result 4: All employees must submit their expense reports by the 5th of each month....
Metadata: {'source': 'finance_policy.pdf', '_id': '4256b750fb994cb19b4508bbc7bf549f', '_collection_name': 'pdf_chunks_store'}



In [7]:
import time

query = "What is the hr leave policy?"

# Time the search with perf_counter: it is monotonic and high-resolution, so
# it suits short intervals (time.time() can jump when the wall clock changes).
# NOTE(review): the measured span presumably includes embedding the query, so
# it is not a pure vector-search latency - confirm before comparing timings.
start_time = time.perf_counter()

results_unfiltered = vectorstore.similarity_search(
    query=query,
    k=10  # same number as filtered version for fair comparison
)

elapsed_time = time.perf_counter() - start_time

print(f"\nüîç Unfiltered Search (across all documents)")
print(f"‚è±Ô∏è Time taken: {elapsed_time:.4f} seconds\n")

# Show at most the first 120 characters of each hit together with its metadata.
for i, doc in enumerate(results_unfiltered, start=1):
    print(f"Result {i}: {doc.page_content[:120]}...")
    print(f"Metadata: {doc.metadata}\n")



üîç Unfiltered Search (across all documents)
‚è±Ô∏è Time taken: 0.6614 seconds

Result 1: Sick leaves must be reported to HR within 24 hours with a medical note....
Metadata: {'source': 'hr_rules.pdf', '_id': 'd037bf5985744cc2a6b1c08334148c7e', '_collection_name': 'pdf_chunks_store'}

Result 2: HR Department Guidelines:...
Metadata: {'source': 'hr_rules.pdf', '_id': '18fc78e0dd1b454ebd454880ce78c97a', '_collection_name': 'pdf_chunks_store'}

Result 3: Every employee is entitled to 14 annual leaves per year....
Metadata: {'source': 'hr_rules.pdf', '_id': '61b715e83a474aaea4ad18ab5b065a43', '_collection_name': 'pdf_chunks_store'}

Result 4: Reimbursements are processed within 10 working days after approval....
Metadata: {'source': 'finance_policy.pdf', '_id': 'b695d502daff438c8911b365442eae80', '_collection_name': 'pdf_chunks_store'}

Result 5: Finance Department Policy:...
Metadata: {'source': 'finance_policy.pdf', '_id': '7eb4deb0948942259151416397312767', '_collection_name': 'pdf_ch