# Keyword Search Evaluation
This notebook evaluates the performance of keyword-based search using MinSearch for document retrieval.

In [6]:
# Import required libraries
import json
import pandas as pd
import minsearch
from tqdm.auto import tqdm
import os

## Load Data
Loading documents and ground truth data for evaluation

In [7]:
print(f"Current working directory: {os.getcwd()}")

# Load documents from the processed JSON file.
# Paths are relative to notebooks/: ../ climbs to the repository root, then
# data/processed/ holds the prepared inputs.
# The encoding is passed explicitly because open() otherwise falls back to the
# platform locale codec (for example cp1252 on Windows), which can fail on or
# mis-decode non-ASCII text stored in a UTF-8 JSON file.
with open('../data/processed/documents-with-ids.json', 'r', encoding='utf-8') as f:
    documents = json.load(f)

# Load the ground truth dataset for evaluation from the CSV file and convert
# it to a list with one dict per CSV row.
df_ground_truth = pd.read_csv('../data/processed/ground-truth-retrieval.csv')
ground_truth = df_ground_truth.to_dict(orient='records')

print(f"Loaded {len(documents)} documents and {len(ground_truth)} ground truth questions")

Current working directory: c:\Users\Adi\Brahman.ai\notebooks
Loaded 149 documents and 735 ground truth questions


## Setup MinSearch Index
Configure and build the keyword search index

In [None]:
print("Setting up MinSearch index...")

# Build the index object: "content" is the free-text field that queries are
# matched against, while "location", "doc_id" and "id" are registered as
# keyword fields.
index = minsearch.Index(
    keyword_fields=["location", "doc_id", "id"],
    text_fields=["content"],
)

# Feed the loaded documents into the index so it can be searched.
index.fit(documents)

print("MinSearch index ready!")

## Evaluation Process
Evaluate retrieval performance using ground truth data

In [None]:
print("Evaluating keyword search...")

# One entry per ground truth question; each entry is a list of booleans, one
# per returned result, in the order the index returned them.
relevance_total = []

for record in tqdm(ground_truth, desc="Evaluating retrieval"):
    expected_id = record['id']  # ID of the document this question belongs to

    # Ask the index for the top 5 matches for this question.
    hits = index.search(query=record['question'], num_results=5)

    # Mark each returned document as relevant only if its ID matches.
    relevance_total.append([hit['id'] == expected_id for hit in hits])

print("Evaluation completed!")

## Calculate Metrics
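Compute Hit Rate (the fraction of questions whose source document appears among the top 5 results) and Mean Reciprocal Rank (MRR: the average of 1/rank of the source document, counting 0 when it is not retrieved).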

In [None]:
print("Calculating metrics...")

# Number of evaluated questions; every metric below is averaged over this.
n_questions = len(relevance_total)

# Hit Rate: count the questions with at least one relevant result.
hit_count = sum(1 for line in relevance_total if any(line))

# Mean Reciprocal Rank (MRR): for each question add 1 / (1-based position of
# the first relevant result); questions with no relevant result add nothing.
total_score = 0.0
for line in relevance_total:
    for rank, is_relevant in enumerate(line, start=1):
        if is_relevant:
            total_score += 1 / rank
            break

# Guard against an empty evaluation set, which would otherwise raise
# ZeroDivisionError; both metrics are reported as 0.0 in that case.
hit_rate = hit_count / n_questions if n_questions else 0.0
mrr = total_score / n_questions if n_questions else 0.0

# Create metrics dictionary
metrics = {
    'hit_rate': hit_rate,
    'mrr': mrr,
    'total_questions': n_questions
}

print("Metrics calculated successfully!")

## Save Results and Display Summary

In [None]:
# Bundle the method name, the aggregate metrics and the per-question relevance
# lists into a single JSON-serialisable structure.
results = {
    'method': 'keyword_search',
    'metrics': metrics,
    'relevance_results': relevance_total
}

# Make sure the output directory exists first; open(..., 'w') does not create
# missing parent directories and would raise FileNotFoundError otherwise.
os.makedirs('results', exist_ok=True)

# Save results to JSON file (path is relative to the current working directory).
with open('results/keyword_search_results.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=2)

# Display final results
print("\nKeyword Search Results:")
print(f"Hit Rate: {metrics['hit_rate']:.4f}")
print(f"MRR: {metrics['mrr']:.4f}")
print(f"Total Questions: {metrics['total_questions']}")
print("\nResults saved to: results/keyword_search_results.json")