Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


In [None]:
from google.colab import files

# Open Colab's upload widget (pick the 6 policy PDFs here). The returned
# mapping is keyed by the name each file was saved under, so its keys are
# the paths the later cells read from.
uploaded = files.upload()
pdf_paths = [saved_name for saved_name in uploaded]
print("Uploaded PDFs:", pdf_paths)


Saving BAJHLIP23020V012223.pdf to BAJHLIP23020V012223 (2).pdf
Saving CHOTGDP23004V012223.pdf to CHOTGDP23004V012223 (2).pdf
Saving EDLHLGA23009V012223.pdf to EDLHLGA23009V012223 (1).pdf
Saving HDFC-Life-Group-Poorna-Suraksha-101N137V02-Policy-Document.pdf to HDFC-Life-Group-Poorna-Suraksha-101N137V02-Policy-Document (1).pdf
Saving HDFHLIP23024V072223.pdf to HDFHLIP23024V072223 (1).pdf
Saving ICIHLIP22012V012223.pdf to ICIHLIP22012V012223 (2).pdf
Uploaded PDFs: ['BAJHLIP23020V012223 (2).pdf', 'CHOTGDP23004V012223 (2).pdf', 'EDLHLGA23009V012223 (1).pdf', 'HDFC-Life-Group-Poorna-Suraksha-101N137V02-Policy-Document (1).pdf', 'HDFHLIP23024V072223 (1).pdf', 'ICIHLIP22012V012223 (2).pdf']


In [None]:
!pip install pdfplumber sentence-transformers faiss-cpu tqdm nltk
import os, re, pdfplumber, faiss, numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import nltk
nltk.download('punkt', quiet=True)

# ---------------- CONFIG ----------------
# Sentence-transformers checkpoint used to embed both chunks and queries.
MODEL_NAME = "all-MiniLM-L6-v2"
# Chunk window length and overlap, measured in characters of page text.
CHUNK_SIZE = 800
CHUNK_OVERLAP = 150
# Default number of nearest chunks returned per query.
TOP_K = 5
# ----------------------------------------

def clean_text(t):
    """Collapse every whitespace run in *t* to one space and trim the ends."""
    single_spaced = re.sub(r"\s+", " ", t)
    return single_spaced.strip()

def _split_with_overlap(text, size, overlap):
    """Yield windows of at most *size* characters, each sharing *overlap* characters with the previous one."""
    step = size - overlap
    for start in range(0, len(text), step):
        yield text[start:start + size]
        # Stop as soon as a window reaches the end of the text; otherwise the
        # next window would consist solely of characters already emitted.
        if start + size >= len(text):
            break


def extract_pdf_chunks(pdf_path, chunk_size=None, chunk_overlap=None):
    """Extract text chunks with page numbers from a single PDF.

    Each page's text is whitespace-normalised with ``clean_text`` and cut
    into overlapping character windows. Pages with no extractable text are
    skipped.

    Args:
        pdf_path: Path of the PDF file to read.
        chunk_size: Window length in characters; defaults to ``CHUNK_SIZE``.
        chunk_overlap: Characters shared by consecutive windows; defaults to
            ``CHUNK_OVERLAP``.

    Returns:
        A list of dicts with keys ``"pdf_name"`` (base name of *pdf_path*),
        ``"page_no"`` (1-based page number) and ``"chunk_text"``.

    Raises:
        ValueError: If the chunk size is not positive or the overlap is not
            in ``[0, chunk_size)``; such settings could never advance the
            window.
    """
    size = CHUNK_SIZE if chunk_size is None else chunk_size
    overlap = CHUNK_OVERLAP if chunk_overlap is None else chunk_overlap
    if size <= 0:
        raise ValueError(f"chunk size must be positive, got {size}")
    if not 0 <= overlap < size:
        raise ValueError(
            f"chunk overlap must be in [0, {size}), got {overlap}"
        )

    pdf_name = os.path.basename(pdf_path)
    chunks = []
    with pdfplumber.open(pdf_path) as pdf:
        for pageno, page in enumerate(pdf.pages, 1):
            # extract_text() may return None for pages without a text layer.
            text = clean_text(page.extract_text() or "")
            if not text:
                continue
            chunks.extend(
                {"pdf_name": pdf_name, "page_no": pageno, "chunk_text": piece}
                for piece in _split_with_overlap(text, size, overlap)
            )
    return chunks

def build_index_from_pdfs(pdf_paths):
    """Read & embed all PDFs the user uploaded.

    Args:
        pdf_paths: Iterable of PDF file paths to chunk and index.

    Returns:
        A ``(model, index, corpus)`` tuple: the loaded SentenceTransformer,
        a FAISS inner-product index holding one vector per chunk, and the
        list of chunk dicts in the same order as the index rows.
    """
    corpus = []
    for path in tqdm(pdf_paths, desc="Extracting PDFs"):
        corpus.extend(extract_pdf_chunks(path))
    print(f"Total chunks: {len(corpus)}")

    model = SentenceTransformer(MODEL_NAME)
    texts = [c["chunk_text"] for c in corpus]
    if texts:
        print("Encoding embeddings ...")
        # Embeddings are L2-normalised, so the inner product used by
        # IndexFlatIP equals cosine similarity.
        embs = model.encode(texts, convert_to_numpy=True,
                            normalize_embeddings=True,
                            show_progress_bar=True)
        index = faiss.IndexFlatIP(embs.shape[1])
        index.add(embs)
    else:
        # No extractable text (e.g. no paths, or image-only PDFs): encoding
        # an empty list gives no usable shape, so take the dimension from
        # the model and return an empty index that later uploads can fill.
        print("No text chunks extracted; starting with an empty index.")
        index = faiss.IndexFlatIP(model.get_sentence_embedding_dimension())
    print("✅ Index ready.")
    return model, index, corpus

def query_index(q, model, index, corpus, k=TOP_K):
    """Return up to *k* corpus chunks nearest to query *q*, each with a score.

    Every hit is a shallow copy of the matching corpus entry with an extra
    ``"score"`` key holding the similarity as a plain float. Positions that
    fall outside the corpus are dropped.
    """
    query_vec = model.encode([q], convert_to_numpy=True, normalize_embeddings=True)
    sims, positions = index.search(query_vec, k)
    hits = []
    for sim, pos in zip(sims[0], positions[0]):
        if pos < 0 or pos >= len(corpus):
            continue
        hit = corpus[pos].copy()
        hit["score"] = float(sim)
        hits.append(hit)
    return hits

def synthesize_answer(query, retrieved):
    """Package retrieved chunks into an answer dict with citation entries.

    Returns a dict with an ``"answer"`` message and a ``"refs"`` list; each
    ref carries the policy file name, page number, score rounded to three
    decimals, and a cleaned excerpt of at most the first 600 characters.
    *query* is accepted but not used in building the result.
    """
    if retrieved:
        refs = []
        for hit in retrieved:
            refs.append({
                "policy": hit["pdf_name"],
                "page": hit["page_no"],
                "score": round(hit["score"], 3),
                "excerpt": clean_text(hit["chunk_text"][:600]),
            })
        return {"answer": "Here are the most relevant clauses:", "refs": refs}
    return {"answer": "No relevant clauses found.", "refs": []}

def recommend_policy(user_need, model, index, corpus, top_n=3):
    """Rank policies by how well their chunks match *user_need*.

    Retrieves 30 chunks, groups their scores by PDF, and scores each PDF by
    the mean of its five best chunk scores. Returns up to *top_n* dicts with
    keys ``"rank"``, ``"policy"``, ``"score"`` (rounded to three decimals)
    and ``"snips"`` (cleaned 250-character previews of that PDF's first two
    retrieved chunks).
    """
    hits = query_index(user_need, model, index, corpus, k=30)

    # Group similarity scores per policy, keeping first-seen order.
    per_policy = {}
    for hit in hits:
        policy = hit["pdf_name"]
        if policy not in per_policy:
            per_policy[policy] = []
        per_policy[policy].append(hit["score"])

    # Policy score = mean of its (up to) five highest chunk scores.
    mean_top = {}
    for policy, policy_scores in per_policy.items():
        best_first = sorted(policy_scores, reverse=True)
        mean_top[policy] = np.mean(best_first[:5])

    ordered = sorted(mean_top.items(), key=lambda item: item[1], reverse=True)

    recommendations = []
    for position, (policy, policy_score) in enumerate(ordered[:top_n], 1):
        own_hits = [hit for hit in hits if hit["pdf_name"] == policy]
        previews = [clean_text(hit["chunk_text"][:250]) for hit in own_hits[:2]]
        recommendations.append({
            "rank": position,
            "policy": policy,
            "score": round(policy_score, 3),
            "snips": previews,
        })
    return recommendations

def add_uploaded_policy(path, corpus, model, index):
    """Add a new user-uploaded PDF dynamically.

    Chunks the PDF at *path*, embeds the chunks with *model*, and appends
    them to *index* and *corpus* in place so index rows and corpus entries
    stay aligned.

    Args:
        path: Path of the PDF to add.
        corpus: Existing list of chunk dicts; extended in place.
        model: Embedding model exposing ``encode``.
        index: FAISS index to add the new vectors to.

    Returns:
        The same ``(corpus, index)`` objects that were passed in.
    """
    new_chunks = extract_pdf_chunks(path)
    if not new_chunks:
        # Encoding an empty list yields an array the index cannot accept;
        # leave the index and corpus untouched instead of failing.
        print(f"No extractable text in {os.path.basename(path)}; nothing added.")
        return corpus, index

    new_embs = model.encode([c["chunk_text"] for c in new_chunks],
                            convert_to_numpy=True, normalize_embeddings=True)
    # Add the vectors first so a failure here never leaves corpus entries
    # without a matching index row.
    index.add(new_embs)
    corpus.extend(new_chunks)
    print(f"Added {len(new_chunks)} chunks from {os.path.basename(path)}")
    return corpus, index

# ---------- MAIN INTERFACE ----------
# Build the model, index and corpus once from the PDFs uploaded earlier,
# then serve an interactive prompt until the user types "exit".
model, index, corpus = build_index_from_pdfs(pdf_paths)

while True:
    mode = input("\nMode (qa / recommend / upload / exit): ").strip().lower()
    if mode == "exit":
        break
    elif mode == "qa":
        # Show the top matching clauses with their source file and page.
        q = input("Enter your question: ").strip()
        results = query_index(q, model, index, corpus, k=TOP_K)
        ans = synthesize_answer(q, results)
        print("\nAnswer:", ans["answer"])
        for ref in ans["refs"]:
            print(f"- {ref['policy']} (Page {ref['page']}, Score {ref['score']})")
            print("  ", ref["excerpt"], "\n")
    elif mode == "recommend":
        # Rank whole policies against a free-text description of needs.
        need = input("Describe desired coverages: ").strip()
        recs = recommend_policy(need, model, index, corpus)
        print("\n🏆 Recommended Policies:")
        for r in recs:
            print(f"{r['rank']}. {r['policy']} (Score {r['score']})")
            for s in r["snips"]:
                print("   🩺", s)
            print()
    elif mode == "upload":
        from google.colab import files
        up = files.upload()
        if not up:
            # The dialog was dismissed without choosing a file.
            print("No file uploaded.")
            continue
        # Index every uploaded file, not only the first one.
        for path in up:
            corpus, index = add_uploaded_policy(path, corpus, model, index)
    else:
        print("Invalid mode.")


Extracting PDFs: 100%|██████████| 6/6 [01:04<00:00, 10.83s/it]


Total chunks: 1481
Encoding embeddings ...


Batches:   0%|          | 0/47 [00:00<?, ?it/s]

✅ Index ready.

Mode (qa / recommend / upload / exit): qa
Enter your question: is knee surgery included?

Answer: Here are the most relevant clauses:
- BAJHLIP23020V012223 (2).pdf (Page 45, Score 0.484)
   copic Meniscectomy - Knee 85 intraluminal Brachytherapy 284 Treatment of clavicle dislocation 86 Electron Therapy 285 Arthroscopic meniscus repair 87 TSET-Total Electron Skin Therapy 286 Haemarthrosis knee- lavage 88 Extracorporeal Irradiation of Blood Products 287 Abscess knee joint drainage 89 Telecobalt Therapy 288 Carpal tunnel release 90 Telecesium Therapy 289 Closed reduction of minor dislocation 91 External mould Brachytherapy 290 Repair of knee cap tendon 92 Interstitial Brachytherapy 291 ORIF with K wire fixation- small bones 93 Intracavity Brachytherapy 292 Release of midfoot joint 9 

- BAJHLIP23020V012223 (2).pdf (Page 46, Score 0.478)
   raft duct fistula 318 Biopsy elbow joint lining 119 Removal cartilage graft 319 Removal of wrist prosthesis 120 Myocutaneous flap 320 B

Saving EDLHLGA23009V012223.pdf to EDLHLGA23009V012223 (2).pdf
Added 11 chunks from EDLHLGA23009V012223 (2).pdf

Mode (qa / recommend / upload / exit): exit
