# Hybrid Search Evaluation
This notebook evaluates the retrieval quality (hit rate and MRR) of a hybrid search that combines keyword search and vector search.

In [None]:
# Import required libraries
import json
import pandas as pd
import numpy as np
import minsearch
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import os

In [None]:
# Guarantee the output folder for evaluation artifacts exists (no-op when it is already there)
os.makedirs(name='results', exist_ok=True)

## Initialize Hybrid Search Components
Load the embedding model and declare placeholders for the vector and keyword search components.

In [None]:
# Embedding model that backs the vector half of the hybrid search
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)
print(f"Loaded sentence transformer model: {model_name}")

# Placeholders for the search state; the real values are assigned further down
documents = document_vectors = keyword_index = None

## Load Data
Loading documents and ground truth data for evaluation

In [None]:
print("Loading data...")

# Load documents from the processed JSON file.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's locale default (which open() falls back to when none is given).
with open('data/processed/documents-with-ids.json', 'r', encoding='utf-8') as f:
    documents = json.load(f)

# Load the ground truth dataset and turn every CSV row into a dict
# (the evaluation loop reads the 'id' and 'question' keys of each record)
df_ground_truth = pd.read_csv('data/ground-truth-retrieval.csv')
ground_truth = df_ground_truth.to_dict(orient='records')

print(f"Loaded {len(documents)} documents and {len(ground_truth)} ground truth questions")

## Set Up Keyword Search Index
Configure and build the keyword search component

In [None]:
print("Setting up keyword search index...")

# Build the MinSearch index: "content" is registered as the text field,
# while "location", "doc_id" and "id" are registered as keyword fields
keyword_index = minsearch.Index(keyword_fields=["location", "doc_id", "id"], text_fields=["content"])

# Feed the loaded documents into the index
keyword_index.fit(documents)

print("Keyword search index ready!")

## Set Up Vector Search
Generate embeddings for all documents

In [None]:
print("Generating document embeddings...")

# Collect the "content" text of every document, keeping document order
texts = [entry['content'] for entry in documents]

# Encode all texts in one call; row i of the result belongs to documents[i]
document_vectors = model.encode(texts, show_progress_bar=True)

print(f"Generated embeddings for {len(document_vectors)} documents")

## Define Hybrid Search Function
Create the hybrid search logic that combines both methods

In [None]:
def hybrid_search(query, num_results=5, alpha=0.5, *, min_vector_score=0.1,
                  keyword_fanout=2):
    """
    Rank documents by a weighted blend of keyword and vector relevance.

    Relies on the module-level ``keyword_index``, ``model``, ``documents`` and
    ``document_vectors`` objects.

    Args:
        query: Query text, passed to the keyword index and to the embedding
            model.
        num_results: Maximum number of documents to return. Non-positive
            values yield an empty list.
        alpha: Weight of the vector score; the keyword score is weighted
            with ``1 - alpha``.
        min_vector_score: A document that keyword search did not return is
            only considered when its cosine similarity exceeds this value.
        keyword_fanout: Keyword search is asked for
            ``num_results * keyword_fanout`` hits so that fusion has more
            candidates than the final result size.

    Returns:
        A list of at most ``num_results`` shallow copies of document dicts,
        best first, each extended with the keys ``hybrid_score``,
        ``keyword_score`` and ``vector_score``.
    """
    # A negative limit would otherwise make the final slice drop items from
    # the end instead of limiting the result size.
    if num_results <= 0:
        return []

    # Get keyword search results (more results for better fusion)
    keyword_results = keyword_index.search(
        query=query,
        num_results=num_results * keyword_fanout
    )

    # Cosine similarity between the query and every document embedding
    query_vector = model.encode([query])
    similarities = cosine_similarity(query_vector, document_vectors)[0]

    # Candidate table keyed by document id
    doc_scores = {}

    # Keyword score is the reciprocal rank of the hit (1.0, 0.5, 0.33, ...)
    for rank, doc in enumerate(keyword_results, start=1):
        doc_scores[doc['id']] = {
            'doc': doc,
            'keyword_score': 1.0 / rank,
            'vector_score': 0.0
        }

    # Attach vector scores; similarities[i] is matched to documents[i]
    for i, doc in enumerate(documents):
        doc_id = doc['id']
        vector_score = float(similarities[i])

        if doc_id in doc_scores:
            doc_scores[doc_id]['vector_score'] = vector_score
        elif vector_score > min_vector_score:
            # Vector-only candidate: admitted only above the noise threshold
            doc_scores[doc_id] = {
                'doc': doc,
                'keyword_score': 0.0,
                'vector_score': vector_score
            }

    # NOTE(review): the keyword score is a reciprocal rank while the vector
    # score is a raw cosine similarity; they are blended without any further
    # normalization.
    for scores in doc_scores.values():
        scores['hybrid_score'] = (
            (1 - alpha) * scores['keyword_score'] + alpha * scores['vector_score']
        )

    # Best hybrid score first; the sort is stable, so ties keep insertion order
    ranked = sorted(
        doc_scores.values(),
        key=lambda scores: scores['hybrid_score'],
        reverse=True
    )

    # Return copies so the score keys are not written into the source documents
    results = []
    for scores in ranked[:num_results]:
        doc = scores['doc'].copy()
        doc['hybrid_score'] = scores['hybrid_score']
        doc['keyword_score'] = scores['keyword_score']
        doc['vector_score'] = scores['vector_score']
        results.append(doc)

    return results

print("Hybrid search function defined!")

## Test Different Alpha Values
Evaluate hybrid search with different weight combinations

In [None]:
# Vector-search weights to compare (keyword search is weighted with 1 - alpha)
alpha_values = [0.3, 0.5, 0.7]
results = {}

for alpha in alpha_values:
    print(f"\nEvaluating hybrid search with alpha={alpha}...")

    # One list of booleans per question: position r is True when the result
    # at rank r+1 is the ground truth document
    relevance_total = []

    # Evaluate each ground truth question
    for q in tqdm(ground_truth, desc=f"Evaluating alpha={alpha}"):
        doc_id = q['id']  # Ground truth document ID

        # Perform hybrid search with current alpha
        search_results = hybrid_search(q['question'], alpha=alpha)

        # bool() keeps the flags plain Python booleans so the results stay
        # JSON-serializable even if the id comparison yields another bool type
        relevance = [bool(d['id'] == doc_id) for d in search_results]
        relevance_total.append(relevance)

    total_questions = len(relevance_total)

    # Hit rate numerator: questions whose ground truth document was returned
    hit_count = sum(1 for line in relevance_total if any(line))

    # MRR numerator: reciprocal rank of the first relevant result per
    # question (questions without a hit contribute nothing)
    total_score = 0.0
    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                break

    # An empty ground truth set would otherwise raise ZeroDivisionError
    hit_rate = hit_count / total_questions if total_questions else 0.0
    mrr = total_score / total_questions if total_questions else 0.0

    # Store results for this alpha
    metrics = {
        'hit_rate': hit_rate,
        'mrr': mrr,
        'total_questions': total_questions
    }

    results[f'alpha_{alpha}'] = {
        'metrics': metrics,
        'relevance_results': relevance_total
    }

    print(f"Alpha {alpha} - Hit Rate: {metrics['hit_rate']:.4f}, MRR: {metrics['mrr']:.4f}")

## Find Best Alpha and Save Results

In [None]:
# Pick the results key (e.g. 'alpha_0.5') whose run reached the highest MRR
best_alpha = max(results, key=lambda name: results[name]['metrics']['mrr'])
best_metrics = results[best_alpha]['metrics']

# Bundle the winning key together with the per-alpha results for persistence
final_results = dict(
    method='hybrid_search',
    model_name=model_name,
    best_alpha=best_alpha,
    alpha_results=results,
)

# Write the bundle as indented JSON into the results directory
with open('results/hybrid_search_results.json', 'w') as f:
    json.dump(final_results, f, indent=2)

# Report the metrics of the winning configuration
print(f"\nBest Hybrid Search Results (Alpha: {best_alpha}):")
print(f"Hit Rate: {best_metrics['hit_rate']:.4f}")
print(f"MRR: {best_metrics['mrr']:.4f}")
print(f"Total Questions: {best_metrics['total_questions']}")
print("\nResults saved to: results/hybrid_search_results.json")