In [1]:
import fitz  # PyMuPDF
import numpy as np
from sentence_transformers import SentenceTransformer, CrossEncoder
import faiss
import pickle

  from .autonotebook import tqdm as notebook_tqdm





In [2]:
# PDF to Text Extraction
def extract_text_from_pdf(pdf_path):
    """Return the concatenated plain text of every page in a PDF.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        str: The text of all pages joined in page order, with leading and
        trailing whitespace stripped. Empty if no page yields any text.
    """
    # The context manager closes the document (and its underlying file
    # handle) even when extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        # Join once instead of growing a string page by page.
        return "".join(page.get_text() for page in doc).strip()

In [3]:
# Text Chunking with Overlapping
def chunk_text(text, chunk_size=512, overlap=100):
    """Split text into fixed-size character windows that overlap.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so
    each chunk repeats the last ``overlap`` characters of the previous one.
    Chunks near the end of the text may be shorter than ``chunk_size``.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by neighbouring chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        list[str]: The chunks in order of appearance; empty for empty text.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    # overlap >= chunk_size would make the stride zero or negative, so the
    # window would never advance; a negative overlap would skip text.
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    step = chunk_size - overlap
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]

In [4]:
# Embedding and FAISS Indexing
def create_faiss_index(chunks, model):
    """Embed text chunks and load the vectors into a flat L2 FAISS index.

    Args:
        chunks: Non-empty sequence of strings to embed.
        model: Encoder whose ``encode(chunks, convert_to_tensor=True)``
            returns a 2-D tensor with one row per chunk (the notebook passes
            a ``SentenceTransformer``).

    Returns:
        tuple: ``(index, embeddings)`` where ``index`` is a
        ``faiss.IndexFlatL2`` holding every embedding and ``embeddings`` is
        the float32 NumPy matrix that was added to it.

    Raises:
        ValueError: If ``chunks`` is empty.
    """
    # Fail early with a clear message; otherwise the failure surfaces deep
    # inside the encoder or at the shape lookup below.
    if len(chunks) == 0:
        raise ValueError("Cannot build a FAISS index from an empty list of chunks.")

    embeddings = model.encode(chunks, convert_to_tensor=True).cpu().numpy()
    # FAISS expects a C-contiguous float32 matrix. This is a no-op when the
    # encoder output already has that layout, so no extra copy is made.
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
    dimension = embeddings.shape[1]

    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)

    return index, embeddings

In [5]:

# Main Execution Flow
if __name__ == "__main__":
    # Indexing pipeline: PDF -> text -> overlapping chunks -> embeddings ->
    # FAISS index, persisted together in a single pickle file.

    # Tolerate stray whitespace and the quotes that shells / file managers
    # add around pasted paths.
    pdf_path = input("Enter the path of the PDF file: ").strip().strip("\"'")

    # Extract text
    print("\nExtracting text from PDF...")
    text = extract_text_from_pdf(pdf_path)
    if not text:
        # Nothing to embed (e.g. a PDF with no text layer); stop here instead
        # of failing later with an obscure error while building the index.
        raise ValueError(f"No extractable text found in PDF: {pdf_path!r}")

    # Chunk and embed
    print("\nChunking and Embedding...")
    model = SentenceTransformer('all-MiniLM-L6-v2')
    chunks = chunk_text(text)

    index, embeddings = create_faiss_index(chunks, model)

    # Save to .pkl file
    # NOTE(review): this pickles the FAISS index object directly, which
    # presumably relies on the installed faiss build supporting pickling;
    # confirm. Only unpickle this file from a trusted source, since
    # pickle.load can execute arbitrary code.
    print("\nSaving index, embeddings, and chunks to 'pdf_index.pkl'...")
    with open("pdf_index.pkl", "wb") as f:
        pickle.dump((index, embeddings, chunks), f)

    print("\n✅ Indexing completed and saved successfully!")



Extracting text from PDF...

Chunking and Embedding...

Saving index, embeddings, and chunks to 'pdf_index.pkl'...

✅ Indexing completed and saved successfully!
