<a href="https://colab.research.google.com/github/Fahad-Blog/Data-Science-Portfolio/blob/main/Pdf_Semantic_Search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import pandas as pd
import os
from sentence_transformers import SentenceTransformer
from pymongo import MongoClient
from pypdf import PdfReader  # NEW: Library to read PDFs

# --- CONFIGURATION ---
MONGO_URI = "Instruction : Get your MongoDB URI by creating a new cluster"
DB_NAME = "feedback_db"
COLLECTION_NAME = "resume_pdf_file"
PDF_PATH = "/Bio.pdf" # NEW: Path to your PDF

# 1. Connect to MongoDB
try:
    client = MongoClient(MONGO_URI)
    db = client[DB_NAME]
    collection = db[COLLECTION_NAME]
    print("‚úÖ Connected to MongoDB Atlas")
except Exception as e:
    print(f"‚ùå Connection failed: {e}")
    exit()

# 2. Load the Embedding Model
print("‚è≥ Loading AI Model (this happens once)...")
model = SentenceTransformer('all-MiniLM-L6-v2')

# --- NEW SECTION: PDF PROCESSING & CHUNKING ---

def extract_and_chunk_pdf(file_path, chunk_size=500, overlap=50):
    """
    Reads a PDF and splits it into smaller text chunks.
    chunk_size: Number of characters per chunk.
    overlap: Number of characters to repeat between chunks (prevents context loss).
    """
    try:
        reader = PdfReader(file_path)
        full_text = ""

        # Extract text from all pages
        for page in reader.pages:
            text = page.extract_text()
            if text:
                full_text += text + "\n"

        # Sliding Window Chunking Logic
        chunks = []
        start = 0
        text_length = len(full_text)

        while start < text_length:
            # Define the end of the chunk
            end = start + chunk_size

            # Create the chunk
            chunk = full_text[start:end]

            # Clean up newlines for better embedding quality
            clean_chunk = chunk.replace('\n', ' ').strip()

            if len(clean_chunk) > 10: # Filter out tiny empty chunks
                chunks.append(clean_chunk)

            # Move the window forward, minus the overlap
            start += (chunk_size - overlap)

        print(f"üìÑ Processed PDF. Extracted {len(chunks)} chunks.")
        return chunks

    except Exception as e:
        print(f"‚ùå Error reading PDF: {e}")
        return []

# 3. Prepare Data (PDF instead of CSV)
# We convert the chunks into the dictionary format the rest of the script expects
print("üìÇ Reading and chunking PDF...")
pdf_chunks = extract_and_chunk_pdf(PDF_PATH)

# Convert list of strings to list of dicts (to match previous dataframe structure)
documents_to_insert = []
for chunk in pdf_chunks:
    documents_to_insert.append({
        "text": chunk,
        "source": "Bio.pdf",  # Metadata to know where it came from
        "type": "pdf_fragment"
    })

# 4. Generate Embeddings & Insert Data
print("üöÄ Generating Embeddings and Indexing Data...")

# Process the prepared documents
final_docs = []
for doc in documents_to_insert:
    # Text -> Vector
    vector_embedding = model.encode(doc['text']).tolist()

    # Add embedding to the document
    doc['embedding'] = vector_embedding
    final_docs.append(doc)

# Insert into MongoDB
if len(final_docs) > 0:
    # Optional: Clear old data if you want a fresh start
    # collection.delete_many({})

    collection.insert_many(final_docs)
    print(f"‚úÖ {len(final_docs)} chunked documents inserted into MongoDB!")
else:
    print("‚ö†Ô∏è No data found to insert.")

# --- THE SEARCH PHASE ---

def semantic_search(query, limit=2):
    print(f"\nüîé Searching for: '{query}'")

    query_vector = model.encode(query).tolist()

    pipeline = [
        {
            "$vectorSearch": {
                "index": "vector_index_pdf",
                "path": "embedding",
                "queryVector": query_vector,
                "numCandidates": 100,
                "limit": limit
            }
        },
        {
            "$project": {
                "_id": 0,
                "text": 1,
                "score": {"$meta": "vectorSearchScore"}
            }
        }
    ]

    results = collection.aggregate(pipeline)

    for result in results:
        # Print only the first 200 chars of the result to keep output clean
        preview = result['text'][:200] + "..."
        print(f" [Score: {result['score']:.4f}] {preview}")

# Test Cases (Adjust these based on the content of Bio.pdf)
semantic_search("Glass")


‚úÖ Connected to MongoDB Atlas
‚è≥ Loading AI Model (this happens once)...
üìÇ Reading and chunking PDF...
üìÑ Processed PDF. Extracted 22 chunks.
üöÄ Generating Embeddings and Indexing Data...
‚úÖ 22 chunked documents inserted into MongoDB!

üîé Searching for: 'Glass'
 [Score: 0.6477] nd corrected critical quality parameters using advanced Statistical Process Control (SPC) tools  and design  experiments, ensuring high product reliability and process compliance.      Quality Assuran...
 [Score: 0.6477] nd corrected critical quality parameters using advanced Statistical Process Control (SPC) tools  and design  experiments, ensuring high product reliability and process compliance.      Quality Assuran...
