<a href="https://colab.research.google.com/github/Fahad-Blog/Data-Science-Portfolio/blob/main/Pdf_search_using_LLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# --- 1. SETUP EVERYTHING (Run once per session) ---
# Installs Ollama if it is missing, starts the Ollama server in the background,
# and pulls the llama3:8b model. The `!` lines are IPython shell magics, so this
# cell only works inside a notebook (e.g. Colab), not as a plain Python script.
import os
import subprocess
import time

# Install dependencies only if they aren't there.
# `command -v ollama` exits non-zero when no `ollama` executable is on PATH.
# NOTE(review): the pip install below is gated on the same check, so the Python
# packages are skipped whenever the Ollama binary is already present.
if subprocess.run("command -v ollama", shell=True).returncode != 0:
    print("Installing Ollama and tools...")
    !sudo apt update && sudo apt install -y pciutils
    !curl -fsSL https://ollama.com/install.sh | sh
    !pip install sentence-transformers datasets pypdf pymongo ollama --quiet
else:
    print("Ollama already installed.")

# Set GPU path
# NOTE(review): this overwrites any existing LD_LIBRARY_PATH instead of prepending to it.
os.environ['LD_LIBRARY_PATH'] = '/usr/lib64-nvidia'

# Start the server if it's not running
# (`pgrep ollama` exits non-zero when no matching process is found).
status = subprocess.run("pgrep ollama", shell=True)
if status.returncode != 0:
    print("Starting Ollama server...")
    # Launched in the background via nohup; stdout/stderr are discarded.
    subprocess.Popen(['nohup', 'ollama', 'serve'], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    time.sleep(5) # Fixed 5 s pause for startup; server readiness is not actually checked

# Pull your preferred model (it will skip if already pulled)
!ollama pull llama3:8b
print("‚úÖ Setup Complete. You can now run your script cells below.")

Ollama already installed.
[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G

In [None]:
# Install/upgrade the google-genai SDK (imported as `google.genai` by the main cell).
!pip install -U google-genai --quiet

In [None]:
# Install the MongoDB driver, the embedding library, and the PDF reader imported by the main cell.
pip install pymongo sentence_transformers pypdf



In [12]:
import pandas as pd
import os
from sentence_transformers import SentenceTransformer
from pymongo import MongoClient
from pypdf import PdfReader  # NEW: Library to read PDFs
from google import genai
from google.genai import types

# import ollama

from google.colab import userdata
# Secrets are read from Colab's `userdata` store rather than hard-coded here.
password = userdata.get('password')
gemini_api_key = userdata.get('gemini_api_key')

# --- CONFIGURATION ---
# NOTE(review): PyMongo expects reserved characters in the password to be
# percent-encoded; this assumes the stored secret is already URI-safe.
MONGO_URI = f"mongodb+srv://fahadanwaran10_db_user:{password}@ai-infra-project.hzlb8kg.mongodb.net/?appName=AI-infra-project"
DB_NAME = "feedback_db"
COLLECTION_NAME = "resume_pdf_file"
PDF_PATH = "/content/Mohammad Fahad Anwar - Resume.pdf" # NEW: Path to your PDF

# 1. Connect to MongoDB
# MongoClient connects lazily, so constructing it says nothing about whether the
# cluster is reachable or the credentials are valid. The explicit `ping` forces a
# real round trip so those failures surface here instead of at the first query.
# The handle is named `mongo_client` because this cell later binds the name
# `client` to the Gemini client.
try:
    mongo_client = MongoClient(MONGO_URI)
    mongo_client.admin.command("ping")
except Exception as e:
    print(f"‚ùå Connection failed: {e}")
    # Re-raise rather than calling exit(): the traceback is kept and the cell
    # stops here without relying on the interactive `exit` helper.
    raise
else:
    db = mongo_client[DB_NAME]
    collection = db[COLLECTION_NAME]
    print("‚úÖ Connected to MongoDB Atlas")

# 2. Load the Embedding Model
# The same model must be used for indexing and for querying so the vectors are comparable.
print("‚è≥ Loading AI Model (this happens once)...")
model = SentenceTransformer('all-MiniLM-L6-v2')

# --- NEW SECTION: PDF PROCESSING & CHUNKING ---

def _chunk_text(full_text, chunk_size, overlap):
    """Split *full_text* into cleaned sliding-window chunks of at most *chunk_size* characters."""
    step = chunk_size - overlap
    text_length = len(full_text)
    chunks = []
    start = 0

    while start < text_length:
        end = start + chunk_size

        # Flatten newlines so line breaks from the PDF layout do not end up in the embedding input
        clean_chunk = full_text[start:end].replace('\n', ' ').strip()

        if len(clean_chunk) > 10:  # Filter out tiny, effectively empty chunks
            chunks.append(clean_chunk)

        # Once the window has reached the end of the text, any further window
        # would start inside the overlap and merely repeat the tail of this chunk.
        if end >= text_length:
            break

        start += step

    return chunks


def extract_and_chunk_pdf(file_path, chunk_size=1000, overlap=50):
    """
    Read a PDF and split its text into overlapping chunks.

    Text is extracted from every page (pages yielding no text are skipped),
    joined with newlines, and cut with a sliding window. Newlines inside a
    chunk are replaced by spaces, the chunk is stripped, and chunks of 10
    characters or fewer are dropped.

    Args:
        file_path: Path to the PDF file.
        chunk_size: Maximum number of characters per chunk. Must be positive.
        overlap: Number of characters repeated between consecutive chunks
            (prevents context loss). Must satisfy 0 <= overlap < chunk_size,
            otherwise the window could never advance.

    Returns:
        A list of chunk strings. An empty list is returned (after printing the
        error) if the PDF cannot be read.

    Raises:
        ValueError: If chunk_size or overlap is out of range.
    """
    # Validate before touching the file: with overlap >= chunk_size the window
    # would stand still or move backwards and the loop would never terminate.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got overlap={overlap}, chunk_size={chunk_size}"
        )

    # Reading is deliberately best-effort: any failure is reported and yields no chunks.
    try:
        reader = PdfReader(file_path)
        page_texts = [page.extract_text() for page in reader.pages]
    except Exception as e:
        print(f"‚ùå Error reading PDF: {e}")
        return []

    full_text = "".join(text + "\n" for text in page_texts if text)
    chunks = _chunk_text(full_text, chunk_size, overlap)

    print(f"üìÑ Processed PDF. Extracted {len(chunks)} chunks.")
    return chunks

# 3. Prepare Data (PDF instead of CSV)
# Chunk the PDF and wrap each chunk in the dictionary format that is stored in MongoDB.
print("üìÇ Reading and chunking PDF...")
pdf_chunks = extract_and_chunk_pdf(PDF_PATH)

# Record the name of the file that was actually read, so the "source" metadata
# stays correct whenever PDF_PATH changes.
source_name = os.path.basename(PDF_PATH)
documents_to_insert = [
    {
        "text": chunk,
        "source": source_name,  # Metadata to know where it came from
        "type": "pdf_fragment",
    }
    for chunk in pdf_chunks
]

# 4. Generate Embeddings & Insert Data
print("üöÄ Generating Embeddings and Indexing Data...")

# Encode all chunks in one batched call instead of one model call per chunk,
# then attach each vector (as a plain list, which MongoDB can store) to its document.
final_docs = []
if documents_to_insert:
    embeddings = model.encode([doc["text"] for doc in documents_to_insert])
    for doc, vector in zip(documents_to_insert, embeddings):
        doc["embedding"] = vector.tolist()
        final_docs.append(doc)

# Insert into MongoDB
if final_docs:
    # NOTE: nothing is de-duplicated, so re-running this cell inserts the same
    # chunks again. Optional: clear old data first if you want a fresh start.
    # collection.delete_many({})

    collection.insert_many(final_docs)
    print(f"‚úÖ {len(final_docs)} chunked documents inserted into MongoDB!")
else:
    print("‚ö†Ô∏è No data found to insert.")

# --- THE SEARCH PHASE ---
# Gemini client used for answer generation; reuses the API key already loaded
# at the top of this cell instead of querying the secret store a second time.
client = genai.Client(api_key=gemini_api_key)


def get_answer_from_llm(query, top_k=10, num_candidates=100):
    """
    Answer *query* from the indexed PDF chunks (retrieve, then generate).

    The query is embedded with the module-level `model`, the closest chunks are
    fetched from `collection` through the `vector_index_pdf` vector search index
    on the `embedding` field, and the chunk texts are passed as context to
    Gemini via the module-level `client`.

    Args:
        query: The user's question.
        top_k: Number of chunks to retrieve and place in the prompt.
        num_candidates: Number of nearest-neighbour candidates the vector
            search considers before returning the best `top_k`.

    Returns:
        The `text` attribute of the Gemini response.
    """
    # 1. Retrieve the most relevant chunks (Semantic Search)
    query_vector = model.encode(query).tolist()

    pipeline = [
        {
            "$vectorSearch": {
                "index": "vector_index_pdf",
                "path": "embedding",
                "queryVector": query_vector,
                "numCandidates": num_candidates,
                "limit": top_k,
            }
        },
        {
            "$project": {
                "_id": 0,
                "text": 1,
                "score": {"$meta": "vectorSearchScore"}
            }
        }
    ]

    results = list(collection.aggregate(pipeline))

    # 2. Combine results into one context block
    context_text = "\n".join(r['text'] for r in results)

    # 3. Create the prompt for the LLM
    prompt = f"""
    You are a highly skilled Document Analyst. Your task is to provide a detailed,
    professional, and structured response based on the PDF context provided.

    GUIDELINES:
    1. Use a professional tone.
    2. If the context allows, use bullet points or numbered lists for clarity.
    3. Provide a deep dive into the specifics; do not be brief.
    4. If the answer is not in the context, explain what is missing.

    Context:
    {context_text}

    Question:
    {query}
    """

    # 4. Call Gemini through the google-genai client
    # NOTE(review): include_thoughts=True requests thought summaries, but only
    # `response.text` is returned below — confirm whether they are needed.
    print("üß† Gemini is thinking...")
    response = client.models.generate_content(
        model='gemini-2.5-flash',
        contents=prompt,
        config=types.GenerateContentConfig(
            thinking_config=types.ThinkingConfig(include_thoughts=True)
        )
    )

    return response.text

# --- INTERACTIVE CHAT LOOP ---
# Reads questions until the user types 'exit' (case-insensitive, surrounding
# whitespace ignored), interrupts the cell, or the input stream is closed.
print("\n‚ú® Ready! You can now chat with your PDF. (Type 'exit' to stop)")
while True:
    try:
        user_input = input("\nüë§ You: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Interrupting the prompt or closing stdin ends the chat without a traceback
        break

    # Skip blank lines rather than embedding and sending an empty question
    if not user_input:
        continue
    if user_input.lower() == 'exit':
        break

    # One failed request (network, quota, database) should not end the whole
    # session: report it and let the user try again.
    try:
        answer = get_answer_from_llm(user_input)
    except Exception as e:
        print(f"\nRequest failed: {e}")
        continue

    print(f"\nü§ñ AI: {answer}")


‚úÖ Connected to MongoDB Atlas
‚è≥ Loading AI Model (this happens once)...
üìÇ Reading and chunking PDF...
üìÑ Processed PDF. Extracted 11 chunks.
üöÄ Generating Embeddings and Indexing Data...
‚úÖ 11 chunked documents inserted into MongoDB!

‚ú® Ready! You can now chat with your PDF. (Type 'exit' to stop)

üë§ You: What are the companies that the candidate has worked?
üß† Gemini is thinking...

ü§ñ AI: Based on the provided context, the candidate has worked for the following companies:

*   **MongoDB (Gurugram):** The candidate held the position of Solutions Architect (Analytics and BI).
*   **Deloitte USI (Gurugram):** The candidate worked as a Senior Data Analyst.
*   **Sterlite Technologies Limited (Aurangabad, Maharashtra):** The candidate served as an Assistant Manager, Customer Analytics.

üë§ You: revisit again. You are missing one more company name
üß† Gemini is thinking...

ü§ñ AI: Based on a thorough analysis of the provided document, the following companies are exp

In [None]:
# Install the MongoDB driver, embedding library, PDF reader, and the Ollama Python client.
pip install pymongo sentence_transformers pypdf ollama

