In [None]:
# ============================================
# Module 10: Retrieval-Augmented Generation & Vector Search
# Lab 3 – Comparing Similarity Search Methods (BM25 vs. FAISS)
# ============================================
# Author: Dr. Dasha Trofimova
# Course: M.Sc. Applied Data Science & AI
# --------------------------------------------
# Learning Goals:
# - Understand similarity search and document retrieval fundamentals
# - Compare lexical vs. embedding-based retrieval (BM25 vs. FAISS)
# - Analyze trade-offs in recall, speed, and semantic matching
# --------------------------------------------
# Lab Objectives:
# 1. Create a small text corpus of sample sentences
# 2. Implement BM25 retrieval using rank-bm25 or Elasticsearch
# 3. Implement vector-based retrieval using FAISS or Chroma
# 4. Compare top-k results qualitatively and quantitatively
# 5. Discuss which method suits RAG pipelines best
# ============================================
!pip install rank_bm25 sentence-transformers faiss-cpu


In [None]:
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

# Six short example sentences; a document's id is its position in this list.
corpus = [
    "The cat sat on the mat.",
    "A small kitten was sleeping on a rug.",
    "This paper introduces a novel transformer architecture for language modeling.",
    "We propose a new retrieval method using dense vector search.",
    "The dog chased the ball across the park.",
    "Neural embeddings capture semantic similarity beyond exact keywords."
]

# The single query that every retrieval method below is evaluated on.
query = "new transformer model for language"

# Print the header followed by one "<id>: <text>" line per document.
numbered_docs = [f"{doc_id}: {text}" for doc_id, text in enumerate(corpus)]
print("Corpus:", *numbered_docs, sep="\n")

print("\nQuery:", query)


In [None]:
import re

def tokenize(text):
    """Lowercase ``text`` and split it into ASCII alphanumeric tokens.

    Any run of characters other than ASCII letters and digits acts as a
    separator, so punctuation, whitespace and non-ASCII characters never
    appear in the output. Empty fragments produced at the string edges are
    discarded.

    Returns:
        A list of lowercase token strings (possibly empty).
    """
    fragments = re.split(r"[^a-zA-Z0-9]+", text.lower())
    # filter(None, ...) drops the empty strings re.split yields at the edges
    return list(filter(None, fragments))

# Turn every document, and the query, into a list of tokens
tokenized_corpus = list(map(tokenize, corpus))
tokenized_query = tokenize(query)

# Build the BM25 model over the tokenized documents and score each one
# against the query (one score per document, in corpus order)
bm25 = BM25Okapi(tokenized_corpus)
bm25_scores = bm25.get_scores(tokenized_query)

# Pair each score with its document id and list them from highest to lowest
bm25_ranking = sorted(enumerate(bm25_scores), key=lambda pair: pair[1], reverse=True)

print("BM25 scores:")
for doc_id, doc_score in bm25_ranking:
    print(f"{doc_id}: {doc_score:.4f} -> {corpus[doc_id]}")


In [None]:
# Sentence-embedding model used for the semantic comparison
embed_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embed_model = SentenceTransformer(embed_model_name)

# Documents and query are encoded with the same options: NumPy output and
# normalized vectors.
encode_options = dict(convert_to_numpy=True, normalize_embeddings=True)
corpus_embeddings = embed_model.encode(corpus, **encode_options)
# The query is encoded as a one-element batch; [0] takes its single vector.
query_embedding = embed_model.encode([query], **encode_options)[0]

# Because the vectors are normalized, the dot product of each document row
# with the query vector is their cosine similarity.
cosine_scores = corpus_embeddings @ query_embedding

print(f"Embedding model: {embed_model_name}")
print("\nCosine similarities (semantic):")
# Document ids ordered from most to least similar to the query
ids_by_similarity = sorted(
    range(len(cosine_scores)),
    key=lambda doc_id: cosine_scores[doc_id],
    reverse=True,
)
for doc_id in ids_by_similarity:
    print(f"{doc_id}: {cosine_scores[doc_id]:.4f} -> {corpus[doc_id]}")


In [None]:
# Dimensionality of the embeddings (number of columns in the document matrix)
dim = corpus_embeddings.shape[1]

# Flat inner-product (IP) index. The embeddings were encoded with
# normalize_embeddings=True, so the inner product equals cosine similarity.
index = faiss.IndexFlatIP(dim)

# FAISS operates on C-contiguous float32 matrices; make that explicit rather
# than relying on the dtype the encoder happens to return.
index.add(np.ascontiguousarray(corpus_embeddings, dtype=np.float32))  # each row = one document

# Search top-k. Clamp k to the number of indexed documents: when fewer than k
# results exist FAISS pads the id array with -1, and corpus[-1] would silently
# print the LAST document as if it were a hit.
k = min(3, index.ntotal)
query_matrix = np.ascontiguousarray(query_embedding, dtype=np.float32).reshape(1, -1)
D, I = index.search(query_matrix, k)  # D: scores, I: document ids, both shaped (1, k)

print("FAISS top-k results (by cosine similarity):")
for rank, (doc_idx, score) in enumerate(zip(I[0], D[0]), start=1):
    if doc_idx < 0:
        # -1 marks a padded "no result" slot; nothing valid follows it
        break
    print(f"Rank {rank} | score={score:.4f} | id={doc_idx} -> {corpus[doc_idx]}")


In [None]:
# Side-by-side comparison of BM25 and embedding-similarity scores per document
import pandas as pd

# One record per document, carrying its id, text and both scores
data = [
    {
        "doc_id": doc_id,
        "text": text,
        "bm25_score": bm25_value,
        "embedding_score": cosine_value,
    }
    for doc_id, (text, bm25_value, cosine_value) in enumerate(
        zip(corpus, bm25_scores, cosine_scores)
    )
]

df = pd.DataFrame(data)

# Same table ordered by each score in turn (highest first), with a fresh
# 0..n-1 index so the row label doubles as the rank position
bm25_ranked, embed_ranked = (
    df.sort_values(score_column, ascending=False).reset_index(drop=True)
    for score_column in ("bm25_score", "embedding_score")
)

print("=== Top by BM25 (keyword match) ===")
display(bm25_ranked.head(5))

print("=== Top by Embedding Similarity (semantic match) ===")
display(embed_ranked.head(5))
