In [1]:
from pathlib import Path
import json
import numpy as np
import faiss
import sqlite3
import re
from collections import Counter, defaultdict
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

# Paths: everything is resolved relative to the kernel's working directory.
BASE_DIR = Path.cwd()
DATA_DIR = BASE_DIR.joinpath("data_cscl")
CHUNKS_PATH = DATA_DIR.joinpath("chunks.json")

# Echo the resolved locations so a wrong working directory is visible at once.
for _label, _location in (("Base dir:", BASE_DIR), ("Chunks path:", CHUNKS_PATH)):
    print(_label, _location)


Base dir: C:\Users\lenovo\Desktop\cs_module5_hybrid
Chunks path: C:\Users\lenovo\Desktop\cs_module5_hybrid\data_cscl\chunks.json


In [2]:
# Read the whole chunk file as UTF-8 text and parse it as JSON in one step.
all_chunks = json.loads(CHUNKS_PATH.read_text(encoding="utf-8"))

# Quick sanity output: corpus size and the fields carried by the first record.
print("Total chunks:", len(all_chunks))
first_chunk = all_chunks[0]
print("Chunk keys:", first_chunk.keys())


Total chunks: 1029
Chunk keys: dict_keys(['paper_id', 'title', 'chunk_id', 'text'])


In [3]:
# Destination of the embedding matrix.  NOTE: despite the "text3_small" in the
# file name, the vectors are produced by the local all-MiniLM-L6-v2 model below.
EMB_PATH = DATA_DIR / "embeddings_text3_small.npy"

texts = [chunk["text"] for chunk in all_chunks]
print("Total chunks:", len(texts))

model = SentenceTransformer("all-MiniLM-L6-v2")

BATCH_SIZE = 32

# Encode the corpus one slice of BATCH_SIZE texts at a time; tqdm wraps the
# slice offsets so progress is reported per batch (the encoder's own bar is off).
emb_list = [
    model.encode(
        texts[offset:offset + BATCH_SIZE],
        show_progress_bar=False,
        convert_to_numpy=True,
    )
    for offset in tqdm(range(0, len(texts), BATCH_SIZE), desc="Local Embedding")
]

# Stack the per-batch arrays into one float32 matrix and persist it to disk.
embeddings = np.vstack(emb_list).astype("float32")
np.save(EMB_PATH, embeddings)

print(EMB_PATH, embeddings.shape)


Total chunks: 1029


Local Embedding: 100%|█████████████████████████████████████████████████████████████████| 33/33 [00:24<00:00,  1.34it/s]

C:\Users\lenovo\Desktop\cs_module5_hybrid\data_cscl\embeddings_text3_small.npy (1029, 384)





In [4]:
# Where the serialized index is written once it has been built.
FAISS_PATH = DATA_DIR.joinpath("faiss_index_text3_small.bin")

# Reload the matrix from disk instead of relying on the in-memory copy.
embeddings = np.load(EMB_PATH).astype("float32")
print("Embeddings shape:", embeddings.shape)

# Normalise the rows before indexing; with unit-length vectors the inner
# product computed by IndexFlatIP is the cosine similarity.
faiss.normalize_L2(embeddings)

dim = embeddings.shape[-1]
index = faiss.IndexFlatIP(dim)  # inner product index
index.add(embeddings)
print("Vectors in FAISS index:", index.ntotal)

faiss.write_index(index, str(FAISS_PATH))
print("FAISS index saved to:", FAISS_PATH)


Embeddings shape: (1029, 384)
Vectors in FAISS index: 1029
FAISS index saved to: C:\Users\lenovo\Desktop\cs_module5_hybrid\data_cscl\faiss_index_text3_small.bin


In [5]:
DB_PATH = DATA_DIR / "rag_hybrid.db"

# Re-running this cell in a live kernel would otherwise leave the previous
# connection open; an open handle can make the unlink below fail
# (e.g. PermissionError on Windows), so close any earlier connection first.
if isinstance(globals().get("conn"), sqlite3.Connection):
    conn.close()

# Remove existing DB so every run starts from an empty file
if DB_PATH.exists():
    DB_PATH.unlink()

conn = sqlite3.connect(DB_PATH)
cur = conn.cursor()

# Create papers table
cur.execute("""
CREATE TABLE papers (
    paper_id TEXT PRIMARY KEY,
    title    TEXT
)
""")

# Create chunks table
cur.execute("""
CREATE TABLE chunks (
    chunk_id TEXT PRIMARY KEY,
    paper_id TEXT,
    text     TEXT
)
""")

# Collect one (paper_id, title) row per paper: the first chunk seen for a
# paper supplies its title, later chunks of the same paper are skipped.
seen_papers = set()
paper_rows = []
for c in all_chunks:
    pid = c["paper_id"]
    if pid not in seen_papers:
        seen_papers.add(pid)
        paper_rows.append((pid, c["title"]))

# Bulk-insert with executemany instead of one execute() call per row.
cur.executemany(
    "INSERT INTO papers (paper_id, title) VALUES (?, ?)",
    paper_rows,
)
cur.executemany(
    "INSERT INTO chunks (chunk_id, paper_id, text) VALUES (?, ?, ?)",
    ((c["chunk_id"], c["paper_id"], c["text"]) for c in all_chunks),
)

conn.commit()
print("Database created successfully!")
print("Location:", DB_PATH)


Database created successfully!
Location: C:\Users\lenovo\Desktop\cs_module5_hybrid\data_cscl\rag_hybrid.db


In [6]:
# Simple tokenizer
def simple_tokenize(text: str):
    """Lowercase ``text`` and return its runs of word characters, in order.

    Punctuation and whitespace act only as separators and never appear in
    the returned list; an input without word characters yields ``[]``.
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r"\b\w+\b", lowered)]

# Collect term statistics from all chunks
bm25_data = []            # postings as (term, chunk_id, term frequency)
doc_freq = Counter()      # term -> number of chunks that contain it
doc_lengths = Counter()   # chunk_id -> number of tokens in the chunk
N = len(all_chunks)

for c in tqdm(all_chunks, desc="Collecting term stats"):
    cid = c["chunk_id"]
    tokens = simple_tokenize(c["text"])
    doc_lengths[cid] = len(tokens)
    term_counts = Counter(tokens)
    # Each distinct term of this chunk adds exactly one to its document frequency.
    doc_freq.update(term_counts.keys())
    bm25_data.extend((term, cid, tf) for term, tf in term_counts.items())

# Average chunk length in tokens, used for BM25 length normalisation.
avg_dl = sum(doc_lengths.values()) / N

# BM25 parameters: k1 is the term-frequency saturation constant and b the
# length-normalisation weight in the tf_norm formula used for the table rows.
k1, b = 1.5, 0.75

def bm25_idf(term):
    """Return a non-negative BM25 inverse document frequency for ``term``.

    Reads the module-level ``doc_freq`` (term -> number of chunks containing
    it) and ``N`` (total number of chunks).

    The plain ratio ``log((N - df + 0.5) / (df + 0.5))`` turns negative as
    soon as a term occurs in more than half of the chunks, which would make
    a matching chunk score *lower* than a non-matching one.  Adding 1 inside
    the logarithm keeps the weight strictly positive while preserving the
    ordering (rarer terms still weigh more).
    """
    df = doc_freq[term]
    return np.log(1.0 + (N - df + 0.5) / (df + 0.5))

# Add BM25 inverted index table (dropped first so the cell can be re-run)
cur.execute("DROP TABLE IF EXISTS bm25_inverted;")
cur.execute("""
CREATE TABLE bm25_inverted (
    term    TEXT,
    chunk_id TEXT,
    tf      INTEGER,
    idf     REAL,
    score   REAL
)
""")


def _bm25_row(term, cid, tf):
    """Return one posting row: (term, chunk_id, tf, idf, BM25 score)."""
    dl = doc_lengths[cid]
    idf = bm25_idf(term)
    # Saturating, length-normalised term frequency (k1 and b as set above).
    tf_norm = (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * dl / avg_dl))
    return (term, cid, tf, idf, idf * tf_norm)


# Insert rows in bulk; executemany consumes the generator, so tqdm still
# advances once per posting.
cur.executemany(
    "INSERT INTO bm25_inverted (term, chunk_id, tf, idf, score) VALUES (?, ?, ?, ?, ?)",
    (
        _bm25_row(term, cid, tf)
        for term, cid, tf in tqdm(bm25_data, desc="Building BM25 table")
    ),
)

# Queries filter on `term`; without an index every search scans the whole
# postings table.  The index is created after the bulk insert so the insert
# itself does not pay for index maintenance.
cur.execute(
    "CREATE INDEX IF NOT EXISTS idx_bm25_inverted_term ON bm25_inverted (term)"
)

conn.commit()
print("BM25 inverted index built.")


Collecting term stats: 100%|█████████████████████████████████████████████████████| 1029/1029 [00:00<00:00, 5416.98it/s]
Building BM25 table: 100%|█████████████████████████████████████████████████| 177245/177245 [00:00<00:00, 260462.43it/s]

BM25 inverted index built.





In [7]:
# Vector search using FAISS
def vector_search(query: str, top_k: int = 10):
    """Return up to ``top_k`` ``(chunk_id, similarity)`` pairs for ``query``.

    The query is embedded with the module-level ``model``, L2-normalised and
    searched against the module-level FAISS ``index``; row ``i`` of the index
    is mapped back to ``all_chunks[i]``.

    Args:
        query: Free-text query.
        top_k: Maximum number of hits; values <= 0 yield an empty list.

    Returns:
        A list of ``(chunk_id, score)`` tuples in the order FAISS returned
        them, possibly shorter than ``top_k``.
    """
    if top_k <= 0:
        return []

    q_emb = model.encode([query], convert_to_numpy=True).astype("float32")
    faiss.normalize_L2(q_emb)
    scores, idxs = index.search(q_emb, top_k)

    results = []
    for i, s in zip(idxs[0], scores[0]):
        i = int(i)
        # FAISS pads the result with label -1 when fewer than top_k vectors
        # are available; indexing all_chunks[-1] would silently return the
        # last chunk as a bogus hit, so such slots are skipped.
        if i < 0:
            continue
        results.append((all_chunks[i]["chunk_id"], float(s)))
    return results

# BM25 search using SQLite
def bm25_search(query: str, top_k: int = 10):
    """Rank chunks by the summed BM25 score of the query terms they contain.

    The query is tokenised with ``simple_tokenize`` and looked up in the
    ``bm25_inverted`` table through the module-level cursor ``cur``.  The
    per-posting ``score`` column (idf times the length-normalised, saturated
    term frequency) is what gets summed; summing raw ``tf * idf`` instead
    would ignore both the saturation and the length normalisation that the
    table was built with.

    Args:
        query: Free-text query.
        top_k: Maximum number of rows to return (passed to SQL ``LIMIT``).

    Returns:
        A list of ``(chunk_id, score)`` tuples, best first; ties are broken
        by ``chunk_id`` so the order is deterministic.  A query without any
        word characters returns ``[]``.
    """
    # De-duplicate while keeping order: a repeated term adds nothing to an
    # IN (...) filter and only consumes bound-parameter slots.
    terms = list(dict.fromkeys(simple_tokenize(query)))
    if not terms:
        return []

    placeholders = ",".join("?" * len(terms))
    sql = f"""
    SELECT chunk_id, SUM(score) AS score
    FROM bm25_inverted
    WHERE term IN ({placeholders})
    GROUP BY chunk_id
    ORDER BY score DESC, chunk_id ASC
    LIMIT ?;
    """
    rows = cur.execute(sql, (*terms, top_k)).fetchall()
    return [(cid, float(score)) for cid, score in rows]

# Hybrid search
def hybrid_search(query: str, top_k: int = 10, alpha: float = 0.5):
    """Hybrid search: weighted mix of BM25 and vector similarity.

    Both retrievers are asked for ``top_k * 5`` candidates.  Their scores
    live on different scales (BM25 sums are unbounded, cosine similarity is
    at most 1), so each candidate list is min-max scaled to [0, 1] before
    mixing; otherwise the BM25 term would dominate and ``alpha`` would not
    act as a real weight.

    Args:
        query: Free-text query.
        top_k: Number of results to return; values <= 0 yield ``[]``.
        alpha: Weight of the BM25 component; ``1 - alpha`` weights the
            vector component.

    Returns:
        Up to ``top_k`` ``(chunk_id, combined_score)`` tuples, best first.
        A chunk missing from one retriever's candidates contributes 0 for
        that retriever.  Ties are broken by ``chunk_id`` so the result does
        not depend on set iteration order.
    """
    if top_k <= 0:
        return []

    def _min_max(raw):
        """Scale ``(chunk_id, score)`` pairs to [0, 1]; equal scores map to 1."""
        scores = dict(raw)
        if not scores:
            return scores
        lo = min(scores.values())
        span = max(scores.values()) - lo
        if span == 0:
            return {cid: 1.0 for cid in scores}
        return {cid: (s - lo) / span for cid, s in scores.items()}

    pool = top_k * 5
    bm25_scores = _min_max(bm25_search(query, top_k=pool))
    vec_scores = _min_max(vector_search(query, top_k=pool))

    combined = [
        (
            cid,
            alpha * bm25_scores.get(cid, 0.0)
            + (1 - alpha) * vec_scores.get(cid, 0.0),
        )
        for cid in bm25_scores.keys() | vec_scores.keys()
    ]
    combined.sort(key=lambda item: (-item[1], item[0]))
    return combined[:top_k]


In [8]:
# Simple manual queries for evaluation
queries = [
    "What is speaker diarization?",
    "How do neural models handle machine translation?",
    "What are common approaches for sentiment analysis?",
    "How is question answering evaluated?",
    "What methods improve entity recognition?",
]

def evaluate_search(search_fn, name: str, top_k: int = 5):
    """Print the share of ``queries`` for which ``search_fn`` returns a known chunk.

    A query counts as a hit when at least one returned ``chunk_id`` can be
    mapped to a paper via ``all_chunks``.  No relevance judgements are
    involved, so this is a smoke test ("does the retriever return anything
    resolvable?") rather than a quality metric.

    Args:
        search_fn: Callable ``(query, top_k=...)`` returning
            ``(chunk_id, score)`` pairs.
        name: Label printed in front of the result.
        top_k: Number of results requested per query.
    """
    # Build the chunk -> paper lookup once instead of scanning all_chunks for
    # every single result.  Iterating in reverse keeps the first occurrence
    # of a chunk_id, matching a front-to-back linear search.
    paper_of = {c["chunk_id"]: c["paper_id"] for c in reversed(all_chunks)}

    total = len(queries)
    hit = 0

    for q in queries:
        results = search_fn(q, top_k=top_k)
        paper_ids = {paper_of[cid] for cid, _ in results if cid in paper_of}
        if paper_ids:
            hit += 1

    # Guard the ratio so an empty query list reports 0.00 instead of raising.
    rate = hit / total if total else 0.0
    print(f"{name} hit-rate@{top_k}: {rate:.2f} ({hit}/{total})")


# Baseline: dense retrieval on its own.
evaluate_search(vector_search, "Vector only", top_k=5)

# Hybrid retrieval at several BM25 weights.
for a in (0.3, 0.5, 0.7):
    # Bind the current weight as a default argument so each wrapper keeps its
    # own alpha instead of seeing the loop variable's final value.
    def eval_fn(q, top_k=5, a=a):
        return hybrid_search(q, top_k=top_k, alpha=a)

    evaluate_search(eval_fn, f"Hybrid alpha={a}", top_k=5)


Vector only hit-rate@5: 1.00 (5/5)
Hybrid alpha=0.3 hit-rate@5: 1.00 (5/5)
Hybrid alpha=0.5 hit-rate@5: 1.00 (5/5)
Hybrid alpha=0.7 hit-rate@5: 1.00 (5/5)
