# Vector Search Evaluation
This notebook evaluates the performance of vector-based semantic search using sentence transformers.

In [None]:
# Import required libraries
import json
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import os

In [None]:
# Ensure the output directory exists before anything is written into it
# (the final cell saves 'results/vector_search_results.json' there).
# exist_ok=True makes re-running this cell harmless when the directory
# is already present.
os.makedirs('results', exist_ok=True)

### Initialize Vector Search Model
Set up the sentence transformer model used to generate embeddings

In [None]:
# Checkpoint name handed to SentenceTransformer. It is kept in its own
# variable because it is also echoed below and stored alongside the metrics
# in the saved results, so a results file records which model produced it.
model_name = 'all-MiniLM-L6-v2'
# NOTE(review): presumably this fetches/caches the weights on first use —
# confirm whether the environment needs network access here.
model = SentenceTransformer(model_name)
print(f"Loaded sentence transformer model: {model_name}")

### Load Data
Loading documents and ground truth data for evaluation

In [None]:
print("Loading data...")

# Load the document collection. Later cells read the 'content' and 'id'
# keys of every entry, so each document is expected to be a dict carrying
# both. The encoding is given explicitly because JSON is UTF-8 by
# specification, whereas open() would otherwise fall back to the platform's
# locale encoding and could fail or mis-decode non-ASCII text.
with open('data/processed/documents-with-ids.json', 'r', encoding='utf-8') as f:
    documents = json.load(f)

# Load the ground truth dataset: one record per evaluation question.
# The evaluation loop reads the 'question' and 'id' columns, where 'id' is
# the ID of the document that should be retrieved for that question.
df_ground_truth = pd.read_csv('data/ground-truth-retrieval.csv')
ground_truth = df_ground_truth.to_dict(orient='records')

print(f"Loaded {len(documents)} documents and {len(ground_truth)} ground truth questions")

### Generate Document Embeddings
Create vector representations for all documents

In [None]:
print("Generating document embeddings...")

# Pull the text body out of every document, keeping the order of
# `documents` so that embedding i belongs to documents[i] (the evaluation
# loop relies on this when it maps similarity indices back to documents).
texts = [record['content'] for record in documents]

# Embed the whole corpus in one call, showing a progress bar while it runs
document_vectors = model.encode(texts, show_progress_bar=True)

print(f"Generated embeddings for {len(document_vectors)} documents")

### Evaluation Process
Evaluate vector search performance using ground truth data

In [None]:
print("Evaluating vector search...")

# Number of nearest documents retrieved per question. The Hit Rate and MRR
# computed from `relevance_total` are therefore measured at this cutoff.
top_k = 5

# One entry per ground truth question: a list of booleans ordered by rank
# (best match first), True where the retrieved document is the expected one.
relevance_total = []

for q in tqdm(ground_truth, desc="Evaluating retrieval"):
    doc_id = q['id']  # Ground truth document ID
    query = q['question']

    # Encode the question as a one-element batch so the result can be
    # passed straight to cosine_similarity together with the document matrix.
    query_vector = model.encode([query])

    # Row 0 holds the similarity of this query to every document, in the
    # same order as `documents`.
    similarities = cosine_similarity(query_vector, document_vectors)[0]

    # argsort is ascending, so reverse it and keep the top_k best matches
    top_indices = np.argsort(similarities)[::-1][:top_k]

    # Shallow copies of the retrieved documents annotated with their score;
    # copying keeps the entries in `documents` free of per-query fields.
    results = [
        {**documents[idx], 'similarity_score': float(similarities[idx])}
        for idx in top_indices
    ]

    # Mark, rank by rank, whether the expected document was retrieved
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

print("Evaluation completed!")

### Calculate Metrics
Compute Hit Rate and Mean Reciprocal Rank (MRR) over the top 5 retrieved documents for each question

In [None]:
print("Calculating metrics...")

# `relevance_total` holds one list of booleans per question, ordered by
# rank, with True marking the position of the expected document.
total_questions = len(relevance_total)

# Hit Rate: fraction of questions whose expected document appears anywhere
# in the retrieved list.
hit_count = sum(1 for line in relevance_total if any(line))

# Mean Reciprocal Rank (MRR): average of 1 / rank of the expected document,
# where a question whose expected document was not retrieved contributes 0.
total_score = 0.0
for line in relevance_total:
    for rank, is_relevant in enumerate(line, start=1):
        if is_relevant:
            total_score += 1 / rank
            break  # only the first (best-ranked) match counts

# With no evaluated questions both averages are undefined; report 0.0
# rather than failing with ZeroDivisionError.
hit_rate = hit_count / total_questions if total_questions else 0.0
mrr = total_score / total_questions if total_questions else 0.0

# Create metrics dictionary
metrics = {
    'hit_rate': hit_rate,
    'mrr': mrr,
    'total_questions': total_questions
}

print("Metrics calculated successfully!")

### Save Results and Display Summary

In [None]:
# Bundle everything worth persisting: which retrieval method and model were
# evaluated, the aggregate metrics, and the per-question relevance lists.
results = dict(
    method='vector_search',
    model_name=model_name,
    metrics=metrics,
    relevance_results=relevance_total,
)

# Write the bundle as indented JSON into the results directory
with open('results/vector_search_results.json', 'w') as out_file:
    json.dump(results, out_file, indent=2)

# Print a short human-readable summary, one line per print call
summary_lines = [
    f"\nVector Search Results:",
    f"Hit Rate: {metrics['hit_rate']:.4f}",
    f"MRR: {metrics['mrr']:.4f}",
    f"Total Questions: {metrics['total_questions']}",
    "\nResults saved to: results/vector_search_results.json",
]
for summary_line in summary_lines:
    print(summary_line)