# Detyra 4 — Vector Databases

Qëllimi: të krijoni një index vektorial (FAISS) dhe të kryeni retrieval.


In [None]:
# === Setup (run this cell first) ===
import os, json, re, math, random
from pathlib import Path

# Root folder holding the project files. In Colab, upload the project folder
# here or mount Drive and point PROJECT_ROOT at it instead.
PROJECT_ROOT = Path("/content")

# Source PDFs and the QA benchmark file both live under the project root.
DOCS_DIR = PROJECT_ROOT.joinpath("documents")
QA_PATH = PROJECT_ROOT.joinpath("qa", "qa_benchmark_40.json")

# Echo the resolved locations so a wrong root is noticed immediately.
for label, location in (("Docs dir:", DOCS_DIR), ("QA path:", QA_PATH)):
    print(label, location)

# Tip: If you're using Google Drive:
# from google.colab import drive
# drive.mount('/content/drive')
# PROJECT_ROOT = Path('/content/drive/MyDrive/<YOUR_PROJECT_FOLDER>')


In [None]:
!pip -q install sentence-transformers faiss-cpu pymupdf
import fitz, re, numpy as np
from sentence_transformers import SentenceTransformer
import faiss


## 1) Ndërtoni koleksion njësish nga disa dokumente
Ekstraktoni tekst nga MIL-ENG-001..003 dhe segmentoni në njësi.
**TODO:** ruani metadatat për çdo njësi (emri i dokumentit `doc`, `unit_id`).


In [None]:
def extract_text(pdf_path):
    """Return the text of a PDF as a single whitespace-normalized string.

    Args:
        pdf_path: Path (str or path-like) of the PDF file to read.

    Returns:
        The text of all pages in page order, with every run of whitespace
        (including newlines) collapsed to one space and the ends stripped.
    """
    # The context manager closes the document handle even if a page fails
    # to extract, so no file descriptor is left open.
    with fitz.open(str(pdf_path)) as doc:
        pages = [page.get_text() for page in doc]
    # Pages are separated by a newline so words at page borders do not fuse;
    # the separator is then folded into a single space like all whitespace.
    return re.sub(r"\s+", " ", "\n".join(pages)).strip()

def segment_units_simple(raw: str, min_len: int = 120):
    """Split raw document text into units that each start at a label.

    A new unit begins wherever one of the whole-word labels ``FRAZA``,
    ``TERM``, ``Dialog`` or ``Gabim`` occurs. Text before the first label
    forms its own unit.

    Args:
        raw: Text to segment.
        min_len: Units whose stripped length is not greater than this are
            dropped. Defaults to 120.

    Returns:
        A list of stripped unit strings in document order, each longer
        than ``min_len`` characters.
    """
    # The group must be non-capturing: a capturing group makes re.split
    # also return the label as a separate element, which would then be
    # glued in front of a unit that already starts with that label.
    boundary = r"(?=\b(?:FRAZA|TERM|Dialog|Gabim)\b)"
    units = (part.strip() for part in re.split(boundary, raw))
    # Empty pieces (e.g. when the text starts with a label) are never units.
    return [u for u in units if u and len(u) > min_len]

# Pick one PDF per expected document prefix from DOCS_DIR.
pdfs = []
for name in ["MIL-ENG-001", "MIL-ENG-002", "MIL-ENG-003"]:
    # glob() yields files in arbitrary order; sorting makes the chosen file
    # the same on every run when several files share the prefix.
    match = sorted(DOCS_DIR.glob(f"{name}*.pdf"))
    if match:
        pdfs.append(match[0])
    else:
        # Surface missing documents instead of silently indexing fewer files.
        print(f"WARNING: no PDF matching '{name}*.pdf' found in {DOCS_DIR}")

# Build one record per unit; unit_id is the unit's position within its own
# document, so (doc, unit_id) identifies a unit.
records = []
for pdf in pdfs:
    raw = extract_text(pdf)
    units = segment_units_simple(raw)
    for i, u in enumerate(units):
        records.append({"doc": pdf.name, "unit_id": i, "text": u})

print("Total units:", len(records))
# Preview the first unit only when there is one: indexing an empty list
# would raise IndexError and hide the real problem (no PDFs / no units).
if records:
    print(records[0]["doc"], records[0]["text"][:200])
else:
    print("No units extracted: check that the PDFs were found and contain the expected labels.")


## 2) Embeddings + FAISS index


In [None]:
# Sentence-embedding model shared by indexing and querying.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# One text per record, in record order, so that row i of the index
# corresponds to records[i].
texts = [record["text"] for record in records]

# Embeddings are normalized, so the inner product used by the index below
# equals cosine similarity.
emb = model.encode(texts, normalize_embeddings=True)
dim = emb.shape[1]

# Flat (exhaustive) inner-product index over the float32 embedding matrix.
index = faiss.IndexFlatIP(dim)
vectors = emb.astype("float32")
index.add(vectors)
print("FAISS size:", index.ntotal)


## 3) Query & top-k results
**TODO:** printoni edhe metadatat për çdo rezultat.


In [None]:
import numpy as np
def search(query, k=5):
    """Return the records most similar to ``query``, best match first.

    The query is embedded with the module-level ``model`` and looked up in
    the module-level FAISS ``index``; row numbers returned by the index are
    mapped back to entries of ``records``.

    Args:
        query: Query text.
        k: Maximum number of hits to return.

    Returns:
        A list of ``(score, record)`` tuples with at most ``k`` entries.
        ``score`` is the inner product of the normalized query and unit
        embeddings. The list is empty when the index is empty or ``k`` is
        not positive.
    """
    # The index cannot return more neighbours than it stores.
    k = min(k, index.ntotal)
    if k <= 0:
        return []
    q = model.encode([query], normalize_embeddings=True).astype("float32")
    scores, idx = index.search(q, k)
    hits = []
    for s, i in zip(scores[0], idx[0]):
        # FAISS marks "no result" slots with label -1; indexing records
        # with -1 would silently return the last record.
        if i < 0:
            continue
        hits.append((float(s), records[int(i)]))
    return hits

# Demo query: show score, source document, unit id and a short text preview
# for each of the top 5 hits.
hits = search("Çfarë do të thotë Wilco?", k=5)
for hit in hits:
    s, r = hit
    preview = r["text"][:250]
    print("\nScore:", s, "| Doc:", r["doc"], "| unit:", r["unit_id"])
    print(preview)


## Dorëzimi
- Krijoni index FAISS
- Demonstroni 3 query
- Shpjegoni: pse FAISS + embeddings janë të dobishme për tutorin.
