In [5]:
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT
from whoosh.qparser import QueryParser
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import os

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Sample documents
documents = [
    "Artificial Intelligence is transforming the world.",
    "Machine Learning and Deep Learning are subsets of AI.",
    "Natural Language Processing helps computers understand human language.",
    "Hybrid search combines keyword and vector search for better accuracy."
]

In [7]:
# ========== Sparse Retrieval (BM25 with Whoosh) ==========
def create_bm25_index(documents):
    schema = Schema(content=TEXT(stored=True))
    if not os.path.exists("indexdir"):
        os.mkdir("indexdir")
    ix = create_in("indexdir", schema)
    
    writer = ix.writer()
    for doc in documents:
        writer.add_document(content=doc)
    writer.commit()
    return ix

def search_bm25(ix, query_text, top_k=3):
    with ix.searcher() as searcher:
        query = QueryParser("content", ix.schema).parse(query_text)
        results = searcher.search(query, limit=top_k)
        return [(res["content"], res.score) for res in results]

In [8]:
# ========== Dense Retrieval (FAISS with Embeddings) ==========
model = SentenceTransformer("all-MiniLM-L6-v2")  # Embedding model

# Encode documents into embeddings
doc_embeddings = model.encode(documents, convert_to_numpy=True)

# Create FAISS index
dimension = doc_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(doc_embeddings)

def search_dense(query_text, top_k=3):
    query_embedding = model.encode([query_text], convert_to_numpy=True)
    distances, indices = index.search(query_embedding, top_k)
    return [(documents[i], 1 / (1 + distances[0][j])) for j, i in enumerate(indices[0])]  # Convert L2 to similarity score

# ========== Hybrid Retrieval (Combining Sparse + Dense) ==========
def hybrid_search(query_text, top_k=3, alpha=0.5):
    bm25_results = search_bm25(ix, query_text, top_k)
    dense_results = search_dense(query_text, top_k)

    # Normalize scores
    bm25_dict = {doc: score for doc, score in bm25_results}
    dense_dict = {doc: score for doc, score in dense_results}
    
    # Combine scores using weighted sum
    hybrid_scores = {}
    for doc in set(bm25_dict.keys()).union(dense_dict.keys()):
        hybrid_scores[doc] = alpha * bm25_dict.get(doc, 0) + (1 - alpha) * dense_dict.get(doc, 0)
    
    # Sort by combined score
    return sorted(hybrid_scores.items(), key=lambda x: x[1], reverse=True)[:top_k]


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [9]:
# ========== Running the Hybrid Search ==========
query = "AI and Deep Learning"
ix = create_bm25_index(documents)

bm25_results = search_bm25(ix, query)
dense_results = search_dense(query)
hybrid_results = hybrid_search(query)




🔍 BM25 Results: [('Machine Learning and Deep Learning are subsets of AI.', 5.875876565338681)]

🤖 Dense Results: [('Machine Learning and Deep Learning are subsets of AI.', np.float32(0.649146)), ('Artificial Intelligence is transforming the world.', np.float32(0.49773413)), ('Natural Language Processing helps computers understand human language.', np.float32(0.4177809))]

🔥 Hybrid Results: [('Machine Learning and Deep Learning are subsets of AI.', np.float32(3.2625113)), ('Artificial Intelligence is transforming the world.', np.float32(0.24886706)), ('Natural Language Processing helps computers understand human language.', np.float32(0.20889045))]


In [11]:
bm25_results

[('Machine Learning and Deep Learning are subsets of AI.', 5.875876565338681)]

In [12]:
dense_results

[('Machine Learning and Deep Learning are subsets of AI.',
  np.float32(0.649146)),
 ('Artificial Intelligence is transforming the world.',
  np.float32(0.49773413)),
 ('Natural Language Processing helps computers understand human language.',
  np.float32(0.4177809))]

In [13]:
hybrid_results

[('Machine Learning and Deep Learning are subsets of AI.',
  np.float32(3.2625113)),
 ('Artificial Intelligence is transforming the world.',
  np.float32(0.24886706)),
 ('Natural Language Processing helps computers understand human language.',
  np.float32(0.20889045))]

In [10]:
print("\n🔍 BM25 Results:", bm25_results)
print("\n🤖 Dense Results:", dense_results)
print("\n🔥 Hybrid Results:", hybrid_results)


🔍 BM25 Results: [('Machine Learning and Deep Learning are subsets of AI.', 5.875876565338681)]

🤖 Dense Results: [('Machine Learning and Deep Learning are subsets of AI.', np.float32(0.649146)), ('Artificial Intelligence is transforming the world.', np.float32(0.49773413)), ('Natural Language Processing helps computers understand human language.', np.float32(0.4177809))]

🔥 Hybrid Results: [('Machine Learning and Deep Learning are subsets of AI.', np.float32(3.2625113)), ('Artificial Intelligence is transforming the world.', np.float32(0.24886706)), ('Natural Language Processing helps computers understand human language.', np.float32(0.20889045))]
