## Hybrid Search Evaluation with Qdrant

This notebook demonstrates hybrid search combining dense embeddings and sparse BM25 vectors using Qdrant. We'll evaluate different search approaches and compare their performance using RRF (Reciprocal Rank Fusion).

Overview
Dense vectors: Capture semantic meaning, good for natural language queries
Sparse vectors (BM25): Excel at exact keyword matches, fast and lightweight
Hybrid Search: Combines both approaches for better overall performance

In [1]:
# Install required packages
!python -m pip install -q "qdrant-client[fastembed]>=1.14.2" pandas numpy

### Start Qdrant server in Docker (run this once)
!docker run -d -p 6333:6333 -p 6334:6334 \
   -v "./qdrant_storage:/qdrant/storage:z" \
   qdrant/qdrant

In [17]:
# Import required libraries
import json
import pandas as pd
import uuid
import os
import pickle
from typing import List, Dict, Any
import numpy as np
from qdrant_client import QdrantClient, models
import time
import hashlib
from datetime import datetime

### Load Data

Loading documents and ground truth data for evaluation

In [4]:
# Load documents and ground truth data
with open('../data/processed/documents-with-ids.json', 'r') as f:
    documents = json.load(f)

df_ground_truth = pd.read_csv('../data/processed/ground-truth-retrieval.csv')
ground_truth = df_ground_truth.to_dict(orient='records')

print(f"Loaded {len(documents)} documents and {len(ground_truth)} ground truth questions")
print(f"Document keys: {list(documents[0].keys()) if documents else 'No documents'}")
print(f"Ground truth columns: {df_ground_truth.columns.tolist()}")

Loaded 149 documents and 735 ground truth questions
Document keys: ['location', 'doc_id', 'content', 'id']
Ground truth columns: ['question', 'id']


In [5]:
# Connect to Qdrant server
client = QdrantClient("http://localhost:6333")

# Test connection by listing collections
try:
    collections = client.get_collections()
    print(f"Connected to Qdrant. Existing collections: {[c.name for c in collections.collections]}")
except Exception as e:
    print(f"Failed to connect to Qdrant: {e}")
    print("Make sure Qdrant is running on localhost:6333")

Connected to Qdrant. Existing collections: ['vector-search-jinaai-jina-embeddings-v2-small-en']


In [6]:
# Examine data structure
print("Sample document:")
print(json.dumps(documents[0], indent=2))
print("\nSample ground truth entry:")
print(ground_truth[0])

Sample document:
{
  "location": "Andhra_Pradesh",
  "doc_id": "d4402d82c0",
  "content": "Asia > South Asia > India > Southern India > Andhra Pradesh  \n![0_image_0.png](0_image_0.png)",
  "id": "4f80b327"
}

Sample ground truth entry:
{'question': 'What are the must-see religious sites in Andhra Pradesh for pilgrims?', 'id': '4f80b327'}


### Create Sparse Vector Collection (BM25)

In [23]:
# Create collection for sparse vectors only
collection_name_sparse = "sparse-vector-bm25"

In [24]:
# Delete collection if it exists
try:
    client.delete_collection(collection_name_sparse)
    print(f"Deleted existing collection: {collection_name_sparse}")
except:
    pass

print(f"Created sparse collection: {collection_name_sparse}")

Deleted existing collection: sparse-vector-bm25
Created sparse collection: sparse-vector-bm25


In [25]:
# Create new collection with sparse vector configuration
client.create_collection(
    collection_name=collection_name_sparse,
    sparse_vectors_config={
        "bm25": models.SparseVectorParams(
            modifier=models.Modifier.IDF,  # Enable IDF calculation for BM25
        )
    }
)

True

In [32]:
def upload_sparse_vectors(documents: List[Dict], collection_name: str) -> None:
    """Upload documents as sparse vectors using BM25 model"""
    
    points = []
    for doc in documents:
        text_content = doc["content"]
        original_id = doc["id"]
        
        # Convert string ID to valid UUID by hashing
        hash_object = hashlib.md5(original_id.encode())
        uuid_string = str(uuid.UUID(hash_object.hexdigest()))
        
        point = models.PointStruct(
            id=uuid_string,  # Use the generated UUID for Qdrant
            vector={
                "bm25": models.Document(
                    text=text_content, 
                    model="Qdrant/bm25",
                ),
            },
            payload={
                "content": text_content,
                "location": doc.get("location", ""),
                "doc_id": doc.get("doc_id", ""),
                "id": original_id  # Store original ID as 'id' in payload
            }
        )
        points.append(point)
    
    # Move this print OUTSIDE the loop
    print(f"Created {len(points)} points from {len(documents)} documents")
    
    # Upload in batches for better performance
    batch_size = 100
    for i in range(0, len(points), batch_size):
        batch = points[i:i+batch_size]
        client.upsert(collection_name=collection_name, points=batch)
        print(f"Uploaded batch {i//batch_size + 1}/{(len(points)-1)//batch_size + 1}")
    
    print(f"Successfully uploaded {len(points)} documents to {collection_name}")

# Then call the function
start_time = time.time()
upload_sparse_vectors(documents, collection_name_sparse)
upload_time_sparse = time.time() - start_time
print(f"Sparse vector upload took {upload_time_sparse:.2f} seconds")

Created 149 points from 149 documents
Uploaded batch 1/2
Uploaded batch 2/2
Successfully uploaded 149 documents to sparse-vector-bm25
Sparse vector upload took 0.35 seconds


### Sparse Vector Search Functions


In [52]:
def sparse_search(query: str, collection_name: str, limit: int = 5) -> List[models.ScoredPoint]:
    """Perform sparse vector search using BM25"""
    
    results = client.query_points(
        collection_name=collection_name,
        query=models.Document(
            text=query,
            model="Qdrant/bm25",
        ),
        using="bm25",
        limit=limit,
        with_payload=True,
    )
    
    return results.points


In [53]:
# Test sparse search
test_query = "Is there any Unesco world heritage site in Karnataka or Andra Pradesh?"
sparse_results = sparse_search(test_query, collection_name_sparse, limit=5)

print(f"Sparse search results for: '{test_query}'")
for i, result in enumerate(sparse_results, 1):
    print(f"\n{i}. Score: {result.score:.4f}")
    print(f"   Content: {result.payload['content'][:150]}...")  # Changed from 'text' to 'content'
    print(f"   Location: {result.payload.get('location', 'N/A')}")  # Changed from 'section'
    print(f"   Doc ID: {result.payload.get('doc_id', 'N/A')}")  # Changed from 'section'
    print(f"   Original ID: {result.payload.get('original_id', 'N/A')}")

Sparse search results for: 'Is there any Unesco world heritage site in Karnataka or Andra Pradesh?'

1. Score: 22.8500
   Content: Thanks to its long and varied history Karnataka has an interesting mix of religious sites, remnants of historical empires, UNESCO World Heritage Sites...
   Location: Karnataka
   Doc ID: 9a1dcdf649
   Original ID: N/A

2. Score: 20.0857
   Content: 1  
![1_image_0.png](1_image_0.png)  
Hampi - the ruins of the fabulous Vijayanagar Empire. Considered to be one of the finest cities in its time circ...
   Location: Karnataka
   Doc ID: 0bbac20900
   Original ID: N/A

3. Score: 10.2497
   Content: religious and archeological site of Hampi. Full of other-worldly geology and nature. Northwestern Karnataka Good transport connections, but interior i...
   Location: Karnataka
   Doc ID: 0bbac20900
   Original ID: N/A

4. Score: 9.8457
   Content: Bangalore The state capital as well as the information technology capital of India. Cauvery Basin Southern Karnataka, in

### Create Hybrid Collection (Dense + Sparse Vectors)

In [54]:
# Create collection for hybrid search (both dense and sparse vectors)
collection_name_hybrid = "documents-hybrid-search"

In [55]:
# Delete collection if it exists
try:
    client.delete_collection(collection_name_hybrid)
    print(f"Deleted existing collection: {collection_name_hybrid}")
except:
    pass

Deleted existing collection: documents-hybrid-search


In [56]:
# Create hybrid collection with both vector types
client.create_collection(
    collection_name=collection_name_hybrid,
    vectors_config={
        # Dense vector configuration
        "jina-small": models.VectorParams(
            size=512,  # Dimension size for jina-embeddings-v2-small-en
            distance=models.Distance.COSINE,
        ),
    },
    sparse_vectors_config={
        # Sparse vector configuration  
        "bm25": models.SparseVectorParams(
            modifier=models.Modifier.IDF,
        )
    }
)
print(f"Created hybrid collection: {collection_name_hybrid}")

Created hybrid collection: documents-hybrid-search


In [57]:
def upload_hybrid_vectors(documents: List[Dict], collection_name: str) -> None:
    """Upload documents with both dense and sparse vectors"""
    
    points = []
    for doc in documents:
        text_content = doc["content"]
        original_id = doc["id"]
        
        # Convert string ID to valid UUID by hashing
        hash_object = hashlib.md5(original_id.encode())
        uuid_string = str(uuid.UUID(hash_object.hexdigest()))
        
        point = models.PointStruct(
            id=uuid_string,  # Use the generated UUID for Qdrant
            vector={
                # Dense vector using Jina model
                "jina-small": models.Document(
                    text=text_content,
                    model="jinaai/jina-embeddings-v2-small-en",
                ),
                # Sparse vector using BM25
                "bm25": models.Document(
                    text=text_content, 
                    model="Qdrant/bm25",
                ),
            },
            payload={
                "content": text_content,
                "location": doc.get("location", ""),
                "doc_id": doc.get("doc_id", ""),
                "id": original_id  # Store original ID as 'id' in payload
            }
        )
        points.append(point)
    
    # Upload in batches
    batch_size = 50  # Smaller batches for hybrid upload
    for i in range(0, len(points), batch_size):
        batch = points[i:i+batch_size]
        client.upsert(collection_name=collection_name, points=batch)
        print(f"Uploaded hybrid batch {i//batch_size + 1}/{(len(points)-1)//batch_size + 1}")
    
    print(f"Successfully uploaded {len(points)} documents with hybrid vectors")

# Upload documents (this will take longer due to dense embeddings)
start_time = time.time()
upload_hybrid_vectors(documents, collection_name_hybrid)
upload_time_hybrid = time.time() - start_time
print(f"Hybrid vector upload took {upload_time_hybrid:.2f} seconds")

Uploaded hybrid batch 1/3
Uploaded hybrid batch 2/3
Uploaded hybrid batch 3/3
Successfully uploaded 149 documents with hybrid vectors
Hybrid vector upload took 9.21 seconds


### Hybrid Search Functions


In [58]:
def dense_search(query: str, collection_name: str, limit: int = 5) -> List[models.ScoredPoint]:
    """Perform dense vector search using semantic embeddings"""
    
    results = client.query_points(
        collection_name=collection_name,
        query=models.Document(
            text=query,
            model="jinaai/jina-embeddings-v2-small-en",
        ),
        using="jina-small",
        limit=limit,
        with_payload=True,
    )
    
    return results.points

In [59]:
def multi_stage_search(query: str, collection_name: str, limit: int = 5) -> List[models.ScoredPoint]:
    """Multi-stage search: dense retrieval followed by sparse reranking"""
    
    results = client.query_points(
        collection_name=collection_name,
        prefetch=[
            models.Prefetch(
                query=models.Document(
                    text=query,
                    model="jinaai/jina-embeddings-v2-small-en",
                ),
                using="jina-small",
                limit=(10 * limit),  # Prefetch more candidates for reranking
            ),
        ],
        query=models.Document(
            text=query,
            model="Qdrant/bm25", 
        ),
        using="bm25",
        limit=limit,
        with_payload=True,
    )
    
    return results.points

In [60]:
def rrf_search(query: str, collection_name: str, limit: int = 5) -> List[models.ScoredPoint]:
    """Reciprocal Rank Fusion combining dense and sparse search"""
    
    results = client.query_points(
        collection_name=collection_name,
        prefetch=[
            # Dense vector prefetch
            models.Prefetch(
                query=models.Document(
                    text=query,
                    model="jinaai/jina-embeddings-v2-small-en",
                ),
                using="jina-small",
                limit=(5 * limit),  # Get more candidates for fusion
            ),
            # Sparse vector prefetch
            models.Prefetch(
                query=models.Document(
                    text=query,
                    model="Qdrant/bm25",
                ),
                using="bm25",
                limit=(5 * limit),
            ),
        ],
        # Apply RRF fusion to combine results
        query=models.FusionQuery(fusion=models.Fusion.RRF),
        limit=limit,
        with_payload=True,
    )
    
    return results.points

In [61]:
# Test all search methods with the same query
test_query = "Is there a Unesco world heritage site in Karnataka or Andra Pradesh?"
print(f"Testing search methods with query: '{test_query}'\n")

Testing search methods with query: 'Is there a Unesco world heritage site in Karnataka or Andra Pradesh?'



In [62]:
# Dense search
dense_results = dense_search(test_query, collection_name_hybrid, limit=3)
print("=== DENSE SEARCH RESULTS ===")
for i, result in enumerate(dense_results, 1):
    print(f"{i}. Score: {result.score:.4f} | {result.payload['content'][:100]}...")

=== DENSE SEARCH RESULTS ===
1. Score: 0.8651 | Thanks to its long and varied history Karnataka has an interesting mix of religious sites, remnants ...
2. Score: 0.8552 | Owing to the multi-religious influence in Karnataka through its history, there are a vast number of ...
3. Score: 0.8372 | places are well known for beautiful mountain ranges, national parks, forests, wild animals and water...


In [63]:
# Sparse search on hybrid collection
sparse_results_hybrid = sparse_search(test_query, collection_name_hybrid, limit=3)
print("\n=== SPARSE SEARCH RESULTS ===")
for i, result in enumerate(sparse_results_hybrid, 1):
    print(f"{i}. Score: {result.score:.4f} | {result.payload['content'][:100]}...")


=== SPARSE SEARCH RESULTS ===
1. Score: 22.8500 | Thanks to its long and varied history Karnataka has an interesting mix of religious sites, remnants ...
2. Score: 20.0857 | 1  
![1_image_0.png](1_image_0.png)  
Hampi - the ruins of the fabulous Vijayanagar Empire. Consider...
3. Score: 10.2497 | religious and archeological site of Hampi. Full of other-worldly geology and nature. Northwestern Ka...


In [64]:
# Multi-stage search
multi_stage_results = multi_stage_search(test_query, collection_name_hybrid, limit=3)
print("\n=== MULTI-STAGE SEARCH RESULTS ===")
for i, result in enumerate(multi_stage_results, 1):
    print(f"{i}. Score: {result.score:.4f} | {result.payload['content'][:100]}...")


=== MULTI-STAGE SEARCH RESULTS ===
1. Score: 22.8500 | Thanks to its long and varied history Karnataka has an interesting mix of religious sites, remnants ...
2. Score: 20.0857 | 1  
![1_image_0.png](1_image_0.png)  
Hampi - the ruins of the fabulous Vijayanagar Empire. Consider...
3. Score: 10.2497 | religious and archeological site of Hampi. Full of other-worldly geology and nature. Northwestern Ka...


In [65]:
# RRF search
rrf_results = rrf_search(test_query, collection_name_hybrid, limit=3)
print("\n=== RRF HYBRID SEARCH RESULTS ===")
for i, result in enumerate(rrf_results, 1):
    print(f"{i}. Score: {result.score:.4f} | {result.payload['content'][:100]}...")


=== RRF HYBRID SEARCH RESULTS ===
1. Score: 1.0000 | Thanks to its long and varied history Karnataka has an interesting mix of religious sites, remnants ...
2. Score: 0.4333 | Owing to the multi-religious influence in Karnataka through its history, there are a vast number of ...
3. Score: 0.3500 | religious and archeological site of Hampi. Full of other-worldly geology and nature. Northwestern Ka...


### Evaluation Framework

In [66]:
def evaluate_search_method(search_function, collection_name: str, ground_truth: List[Dict], 
                         method_name: str, top_k: int = 5) -> Dict[str, Any]:
    """Evaluate a search method against ground truth data"""
    
    results = {
        'method': method_name,
        'total_queries': len(ground_truth),
        'hits_at_1': 0,
        'hits_at_3': 0, 
        'hits_at_5': 0,
        'mrr_scores': [],  # Mean Reciprocal Rank
        'search_times': [],
        'failed_queries': 0
    }
    
    print(f"\nEvaluating {method_name}...")
    
    for i, gt_item in enumerate(ground_truth):
        if i % 20 == 0:
            print(f"Processed {i}/{len(ground_truth)} queries")
            
        query = gt_item['question']
        expected_doc_id = gt_item['id']  # From your CSV
        
        try:
            # Measure search time
            start_time = time.time()
            search_results = search_function(query, collection_name, limit=top_k)
            search_time = time.time() - start_time
            results['search_times'].append(search_time)
            
            # Extract document IDs from results (now using 'id' instead of 'original_id')
            retrieved_doc_ids = [result.payload.get('id') for result in search_results]
            
            # Calculate hits@k and MRR
            if expected_doc_id in retrieved_doc_ids:
                rank = retrieved_doc_ids.index(expected_doc_id) + 1
                
                # Hits@k calculation
                if rank <= 1:
                    results['hits_at_1'] += 1
                if rank <= 3:
                    results['hits_at_3'] += 1
                if rank <= 5:
                    results['hits_at_5'] += 1
                    
                # MRR calculation
                results['mrr_scores'].append(1.0 / rank)
            else:
                results['mrr_scores'].append(0.0)
                
        except Exception as e:
            print(f"Failed query {i}: {e}")
            results['failed_queries'] += 1
            results['mrr_scores'].append(0.0)
            results['search_times'].append(0.0)
    
    # Calculate final metrics
    total_queries = results['total_queries']
    results['hit_rate_at_1'] = results['hits_at_1'] / total_queries
    results['hit_rate_at_3'] = results['hits_at_3'] / total_queries  
    results['hit_rate_at_5'] = results['hits_at_5'] / total_queries
    results['mean_reciprocal_rank'] = np.mean(results['mrr_scores'])
    results['avg_search_time'] = np.mean(results['search_times'])

    print(f"Completed evaluation of {method_name}")
    return results

In [68]:
# First, let's check the actual signatures of your search functions
# Run this to understand what parameters your functions accept:

print("Checking function signatures...")

# Check sparse_search function signature
import inspect
try:
    sparse_sig = inspect.signature(sparse_search)
    print(f"sparse_search signature: {sparse_sig}")
except Exception as e:
    print(f"Could not get sparse_search signature: {e}")

try:
    dense_sig = inspect.signature(dense_search)
    print(f"dense_search signature: {dense_sig}")
except Exception as e:
    print(f"Could not get dense_search signature: {e}")

try:
    multi_sig = inspect.signature(multi_stage_search)
    print(f"multi_stage_search signature: {multi_sig}")
except Exception as e:
    print(f"Could not get multi_stage_search signature: {e}")

try:
    rrf_sig = inspect.signature(rrf_search)
    print(f"rrf_search signature: {rrf_sig}")
except Exception as e:
    print(f"Could not get rrf_search signature: {e}")

# Common fixes based on typical function signatures:

# Option 1: If sparse_search takes (query, collection, top_k) or similar
def create_sparse_wrapper_v1():
    return lambda q, c, limit: sparse_search(q, c, top_k=limit)

# Option 2: If sparse_search takes (query, collection, k) 
def create_sparse_wrapper_v2():
    return lambda q, c, limit: sparse_search(q, c, k=limit)

# Option 3: If sparse_search takes (query, collection, size)
def create_sparse_wrapper_v3():
    return lambda q, c, limit: sparse_search(q, c, size=limit)

# Option 4: If sparse_search takes (query, collection, n_results)
def create_sparse_wrapper_v4():
    return lambda q, c, limit: sparse_search(q, c, n_results=limit)

# Option 5: If sparse_search only takes (query, collection) and has default limit
def create_sparse_wrapper_v5():
    # This ignores the limit parameter since the function doesn't support it
    return lambda q, c, limit: sparse_search(q, c)

# RECOMMENDED SOLUTION - Test each wrapper:
print("\nTesting wrapper options...")

# Test with a simple query to see which wrapper works
test_query = "test"
test_collection = collection_name_sparse

wrapper_options = [
    ("top_k", create_sparse_wrapper_v1),
    ("k", create_sparse_wrapper_v2), 
    ("size", create_sparse_wrapper_v3),
    ("n_results", create_sparse_wrapper_v4),
    ("no_limit_param", create_sparse_wrapper_v5)
]

working_wrapper = None
for name, wrapper_func in wrapper_options:
    try:
        wrapper = wrapper_func()
        result = wrapper(test_query, test_collection, 5)
        print(f"✓ Wrapper '{name}' works!")
        working_wrapper = wrapper
        break
    except Exception as e:
        print(f"✗ Wrapper '{name}' failed: {e}")

if working_wrapper:
    print(f"\nUsing working wrapper for evaluation...")
    
    # Run comprehensive evaluation with the working wrapper
    print("Starting comprehensive evaluation...")
    print(f"Evaluating on {len(ground_truth)} ground truth questions")

    evaluation_results = []

    # Evaluate sparse search with working wrapper
    sparse_eval = evaluate_search_method(
        working_wrapper,
        collection_name_sparse,
        ground_truth[:100],  # Use subset for faster evaluation
        "BM25 Sparse"
    )
    evaluation_results.append(sparse_eval)

    # For the other functions, create similar wrappers if needed
    # Check if they also have parameter issues:
    
    # Test dense_search
    try:
        dense_test = dense_search(test_query, collection_name_hybrid, limit=5)
        dense_wrapper = dense_search
        print("✓ dense_search accepts 'limit' parameter")
    except Exception as e:
        print(f"✗ dense_search parameter issue: {e}")
        # Try common alternatives
        try:
            dense_test = dense_search(test_query, collection_name_hybrid, top_k=5)
            dense_wrapper = lambda q, c, limit: dense_search(q, c, top_k=limit)
            print("✓ dense_search uses 'top_k' parameter")
        except:
            try:
                dense_test = dense_search(test_query, collection_name_hybrid, k=5)
                dense_wrapper = lambda q, c, limit: dense_search(q, c, k=limit)
                print("✓ dense_search uses 'k' parameter")
            except:
                dense_wrapper = lambda q, c, limit: dense_search(q, c)
                print("✓ dense_search ignores limit parameter")

    # Evaluate dense search
    dense_eval = evaluate_search_method(
        dense_wrapper,
        collection_name_hybrid, 
        ground_truth[:100],
        "Dense Semantic"
    )
    evaluation_results.append(dense_eval)

    # Test multi_stage_search
    try:
        multi_test = multi_stage_search(test_query, collection_name_hybrid, limit=5)
        multi_wrapper = multi_stage_search
        print("✓ multi_stage_search accepts 'limit' parameter")
    except Exception as e:
        print(f"✗ multi_stage_search parameter issue: {e}")
        try:
            multi_test = multi_stage_search(test_query, collection_name_hybrid, top_k=5)
            multi_wrapper = lambda q, c, limit: multi_stage_search(q, c, top_k=limit)
            print("✓ multi_stage_search uses 'top_k' parameter")
        except:
            try:
                multi_test = multi_stage_search(test_query, collection_name_hybrid, k=5)
                multi_wrapper = lambda q, c, limit: multi_stage_search(q, c, k=limit)
                print("✓ multi_stage_search uses 'k' parameter")
            except:
                multi_wrapper = lambda q, c, limit: multi_stage_search(q, c)
                print("✓ multi_stage_search ignores limit parameter")

    # Evaluate multi-stage search
    multi_stage_eval = evaluate_search_method(
        multi_wrapper,
        collection_name_hybrid,
        ground_truth[:100], 
        "Multi-stage (Dense→Sparse)"
    )
    evaluation_results.append(multi_stage_eval)

    # Test rrf_search
    try:
        rrf_test = rrf_search(test_query, collection_name_hybrid, limit=5)
        rrf_wrapper = rrf_search
        print("✓ rrf_search accepts 'limit' parameter")
    except Exception as e:
        print(f"✗ rrf_search parameter issue: {e}")
        try:
            rrf_test = rrf_search(test_query, collection_name_hybrid, top_k=5)
            rrf_wrapper = lambda q, c, limit: rrf_search(q, c, top_k=limit)
            print("✓ rrf_search uses 'top_k' parameter")
        except:
            try:
                rrf_test = rrf_search(test_query, collection_name_hybrid, k=5)
                rrf_wrapper = lambda q, c, limit: rrf_search(q, c, k=limit)
                print("✓ rrf_search uses 'k' parameter")
            except:
                rrf_wrapper = lambda q, c, limit: rrf_search(q, c)
                print("✓ rrf_search ignores limit parameter")

    # Evaluate RRF hybrid search
    rrf_eval = evaluate_search_method(
        rrf_wrapper,
        collection_name_hybrid,
        ground_truth[:100],
        "RRF Hybrid"
    )
    evaluation_results.append(rrf_eval)

    print("\nEvaluation completed!")
    
else:
    print("\n❌ Could not find working wrapper for sparse_search!")
    print("Please check your sparse_search function definition and parameters.")

Checking function signatures...
sparse_search signature: (query: str, collection_name: str, limit: int = 5) -> List[qdrant_client.http.models.models.ScoredPoint]
dense_search signature: (query: str, collection_name: str, limit: int = 5) -> List[qdrant_client.http.models.models.ScoredPoint]
multi_stage_search signature: (query: str, collection_name: str, limit: int = 5) -> List[qdrant_client.http.models.models.ScoredPoint]
rrf_search signature: (query: str, collection_name: str, limit: int = 5) -> List[qdrant_client.http.models.models.ScoredPoint]

Testing wrapper options...
✗ Wrapper 'top_k' failed: sparse_search() got an unexpected keyword argument 'top_k'
✗ Wrapper 'k' failed: sparse_search() got an unexpected keyword argument 'k'
✗ Wrapper 'size' failed: sparse_search() got an unexpected keyword argument 'size'
✗ Wrapper 'n_results' failed: sparse_search() got an unexpected keyword argument 'n_results'
✓ Wrapper 'no_limit_param' works!

Using working wrapper for evaluation...
Starti

### Results Analysis and Comparison

In [69]:
# Create comprehensive results summary
def create_results_summary(evaluation_results: List[Dict]) -> pd.DataFrame:
    """Create a summary DataFrame of all evaluation results"""
    
    summary_data = []
    
    for result in evaluation_results:
        summary_data.append({
            'Method': result['method'],
            'Hit Rate @1': f"{result['hit_rate_at_1']:.3f}",
            'Hit Rate @3': f"{result['hit_rate_at_3']:.3f}", 
            'Hit Rate @5': f"{result['hit_rate_at_5']:.3f}",
            'Mean Reciprocal Rank': f"{result['mean_reciprocal_rank']:.3f}",
            'Avg Search Time (ms)': f"{result['avg_search_time']*1000:.1f}",
            'Failed Queries': result['failed_queries'],
            'Total Queries': result['total_queries']
        })
    
    return pd.DataFrame(summary_data)

In [70]:
# Generate results summary
results_df = create_results_summary(evaluation_results)
print("=== HYBRID SEARCH EVALUATION RESULTS ===")
print(results_df.to_string(index=False))

=== HYBRID SEARCH EVALUATION RESULTS ===
                    Method Hit Rate @1 Hit Rate @3 Hit Rate @5 Mean Reciprocal Rank Avg Search Time (ms)  Failed Queries  Total Queries
               BM25 Sparse       0.180       0.300       0.360                0.246                 23.4               0            100
            Dense Semantic       0.190       0.380       0.460                0.291                 26.4               0            100
Multi-stage (Dense→Sparse)       0.180       0.300       0.360                0.246                 34.8               0            100
                RRF Hybrid       0.220       0.350       0.460                0.298                 38.0               0            100


In [71]:
# Display detailed analysis
print("\n=== DETAILED PERFORMANCE ANALYSIS ===")
for result in evaluation_results:
    print(f"\n{result['method']}:")
    print(f"  • Hit Rate @1: {result['hit_rate_at_1']:.3f} ({result['hits_at_1']}/{result['total_queries']})")
    print(f"  • Hit Rate @3: {result['hit_rate_at_3']:.3f} ({result['hits_at_3']}/{result['total_queries']})")
    print(f"  • Hit Rate @5: {result['hit_rate_at_5']:.3f} ({result['hits_at_5']}/{result['total_queries']})")
    print(f"  • Mean Reciprocal Rank: {result['mean_reciprocal_rank']:.3f}")
    print(f"  • Average Search Time: {result['avg_search_time']*1000:.1f} ms")
    print(f"  • Failed Queries: {result['failed_queries']}")


=== DETAILED PERFORMANCE ANALYSIS ===

BM25 Sparse:
  • Hit Rate @1: 0.180 (18/100)
  • Hit Rate @3: 0.300 (30/100)
  • Hit Rate @5: 0.360 (36/100)
  • Mean Reciprocal Rank: 0.246
  • Average Search Time: 23.4 ms
  • Failed Queries: 0

Dense Semantic:
  • Hit Rate @1: 0.190 (19/100)
  • Hit Rate @3: 0.380 (38/100)
  • Hit Rate @5: 0.460 (46/100)
  • Mean Reciprocal Rank: 0.291
  • Average Search Time: 26.4 ms
  • Failed Queries: 0

Multi-stage (Dense→Sparse):
  • Hit Rate @1: 0.180 (18/100)
  • Hit Rate @3: 0.300 (30/100)
  • Hit Rate @5: 0.360 (36/100)
  • Mean Reciprocal Rank: 0.246
  • Average Search Time: 34.8 ms
  • Failed Queries: 0

RRF Hybrid:
  • Hit Rate @1: 0.220 (22/100)
  • Hit Rate @3: 0.350 (35/100)
  • Hit Rate @5: 0.460 (46/100)
  • Mean Reciprocal Rank: 0.298
  • Average Search Time: 38.0 ms
  • Failed Queries: 0


In [72]:
# Find best performing method
best_mrr_method = max(evaluation_results, key=lambda x: x['mean_reciprocal_rank'])
best_speed_method = min(evaluation_results, key=lambda x: x['avg_search_time'])
best_hit5_method = max(evaluation_results, key=lambda x: x['hit_rate_at_5'])

print(f"\n=== KEY FINDINGS ===")
print(f"Best Overall Performance (MRR): {best_mrr_method['method']} ({best_mrr_method['mean_reciprocal_rank']:.3f})")
print(f"Fastest Search Method: {best_speed_method['method']} ({best_speed_method['avg_search_time']*1000:.1f} ms)")
print(f"Best Hit Rate @5: {best_hit5_method['method']} ({best_hit5_method['hit_rate_at_5']:.3f})")


=== KEY FINDINGS ===
Best Overall Performance (MRR): RRF Hybrid (0.298)
Fastest Search Method: BM25 Sparse (23.4 ms)
Best Hit Rate @5: Dense Semantic (0.460)


### Results Analysis

In [73]:
# Setup results directory and file paths
import os
from pathlib import Path

# Define base results directory 
results_base_dir = Path.home() / "Brahman.ai" / "results"
results= results_base_dir / "hybrid_search_evaluation"

# Generate timestamp for unique file naming
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Define output file paths
results_summary_path = results / f"evaluation_summary_{timestamp}.csv"
detailed_results_path = results / f"detailed_results_{timestamp}.json"
analysis_report_path = results / f"analysis_report_{timestamp}.txt"
metrics_comparison_path = results / f"metrics_comparison_{timestamp}.csv"

print(f"Results will be saved to: {results}")
print(f"Timestamp: {timestamp}")

Results will be saved to: C:\Users\Adi\Brahman.ai\results\hybrid_search_evaluation
Timestamp: 20250816_171653


In [75]:
# Save evaluation results summary as CSV
results_df.to_csv(results_summary_path, index=False)
print(f"✅ Saved summary results to: {results_summary_path.name}")

# Save detailed results as JSON for future analysis
detailed_results = {
    "evaluation_metadata": {
        "timestamp": timestamp,
        "total_ground_truth_questions": len(ground_truth),
        "evaluation_subset_size": 100,
        "qdrant_collections": {
            "sparse_only": collection_name_sparse,
            "hybrid": collection_name_hybrid
        },
        "embedding_models": {
            "dense": "jinaai/jina-embeddings-v2-small-en",
            "sparse": "Qdrant/bm25"
        }
    },
    "evaluation_results": evaluation_results,
    "performance_summary": {
        "best_mrr_method": {
            "method": best_mrr_method['method'],
            "score": best_mrr_method['mean_reciprocal_rank']
        },
        "fastest_method": {
            "method": best_speed_method['method'], 
            "time_ms": best_speed_method['avg_search_time'] * 1000
        },
        "best_hit5_method": {
            "method": best_hit5_method['method'],
            "hit_rate": best_hit5_method['hit_rate_at_5']
        }
    }
}

with open(detailed_results_path, 'w') as f:
    json.dump(detailed_results, f, indent=2)
print(f"✅ Saved detailed results to: {detailed_results_path.name}")

✅ Saved summary results to: evaluation_summary_20250816_171653.csv
✅ Saved detailed results to: detailed_results_20250816_171653.json
