In [None]:

import os, glob, textwrap
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer
import numpy as np

# Directory scanned (non-recursively) for "*.pdf" files.
PDF_DIR = "./pdfs"
# Maximum chunk length, in characters of whitespace-normalized page text.
CHUNK_SIZE = 800
# Number of characters by which the next chunk's start is backed up from the
# previous chunk's end, so neighbouring chunks share context.
CHUNK_OVERLAP = 200
# Number of best-scoring chunks printed for each question.
TOP_K = 3

# ---- Load and chunk PDFs ----
def _split_text(text, size, overlap):
    """Split *text* into windows of at most *size* characters.

    Each window starts ``size - overlap`` characters after the previous one,
    so consecutive windows share *overlap* characters. The last window is the
    first one that reaches the end of *text*; empty text yields no windows.

    Raises:
        ValueError: if *size* is not positive or *overlap* is outside
            ``[0, size)`` -- the window start could then never advance.
    """
    if size <= 0 or not 0 <= overlap < size:
        raise ValueError(
            f"need size > 0 and 0 <= overlap < size "
            f"(got size={size}, overlap={overlap})"
        )
    step = size - overlap
    pieces = []
    for start in range(0, len(text), step):
        pieces.append(text[start:start + size])
        # Stop once a window has covered the tail. Stepping back by the
        # overlap from here would re-emit the same tail window forever.
        if start + size >= len(text):
            break
    return pieces


def extract_chunks(path):
    """Extract overlapping text chunks from every page of a PDF.

    Each page's text is whitespace-normalized (all runs of whitespace become
    a single space) and cut into windows of CHUNK_SIZE characters that
    overlap by CHUNK_OVERLAP characters. Chunks never span two pages.

    Args:
        path: Path of the PDF file to read.

    Returns:
        A list of ``(file_name, page_number, chunk_text)`` tuples, where
        ``file_name`` is the base name of *path* and ``page_number`` is
        1-based. Pages that yield no text contribute no chunks.

    Raises:
        ValueError: if CHUNK_SIZE / CHUNK_OVERLAP cannot produce advancing
            windows (see ``_split_text``).
    """
    reader = PdfReader(path)
    fname = os.path.basename(path)
    chunks = []
    for page_no, page in enumerate(reader.pages, start=1):
        # `or ""` guards against a falsy result from extract_text().
        text = " ".join((page.extract_text() or "").split())
        chunks.extend(
            (fname, page_no, piece)
            for piece in _split_text(text, CHUNK_SIZE, CHUNK_OVERLAP)
            if piece.strip()
        )
    return chunks

# Scan the directory once so the loop and the summary line are guaranteed to
# describe the same set of files. glob returns matches in arbitrary order, so
# sort them to make the chunk order reproducible between runs.
pdf_paths = sorted(glob.glob(os.path.join(PDF_DIR, "*.pdf")))

# Flat list of (file_name, page_number, chunk_text) tuples across all PDFs.
docs = []
for pdf in pdf_paths:
    docs.extend(extract_chunks(pdf))

print(f"Loaded {len(docs)} text chunks from {len(pdf_paths)} PDFs.")

# ---- Build embeddings ----
# Load the sentence-embedding model, then embed the text field (third tuple
# element) of every chunk. The result is indexed in the same order as `docs`,
# which is how the query loop maps a score back to its chunk.
model = SentenceTransformer("all-MiniLM-L6-v2")
chunk_texts = [text for _fname, _page, text in docs]
embeddings = model.encode(chunk_texts, normalize_embeddings=True)

# ---- Q&A loop ----
# Interactive retrieval loop: embed each question, score it against every
# chunk embedding, and print the TOP_K best-scoring chunks with their source.
print("\nAsk questions about your PDFs (type 'exit' to quit):\n")
while True:
    try:
        q = input("Q> ").strip()
    except (EOFError, KeyboardInterrupt):
        # Ctrl-D / Ctrl-C, or piped stdin running out: leave the loop
        # cleanly instead of ending the session with a traceback.
        print()
        break
    if q.lower() in {"exit", "quit"}:
        break
    if not q:
        # A blank line carries no query; embedding it would only return
        # arbitrary chunks.
        continue
    if not docs:
        # Nothing was loaded, so there is nothing to rank or index into.
        print("\nNo text chunks are loaded, so there is nothing to search.\n")
        continue

    # Both the chunks and the query are encoded with
    # normalize_embeddings=True, so the dot product is used directly as the
    # similarity score (higher is better).
    q_emb = model.encode([q], normalize_embeddings=True)[0]
    sims = np.dot(embeddings, q_emb)
    # Negate so argsort's ascending order puts the highest scores first.
    top_idx = np.argsort(-sims)[:TOP_K]

    print("\nAnswer (based on retrieved text):\n")
    for i in top_idx:
        fname, page, chunk = docs[i]
        # Show only a ~200-character preview of each retrieved chunk.
        print(f"[{fname} p.{page}] {textwrap.shorten(chunk, width=200)}\n")
