# Keyword Search Evaluation
This notebook evaluates the performance of keyword-based search using MinSearch for document retrieval.

In [12]:
# Import required libraries
import json
import pandas as pd
import minsearch
from tqdm.auto import tqdm
import os

### Load Data
Loading documents and ground truth data for evaluation

In [13]:
print(f"Current working directory: {os.getcwd()}")

# The relative paths below assume the kernel's working directory is notebooks/
# (printed above): ../ climbs to the project root, then data/processed/.
# The JSON file is decoded as UTF-8 explicitly so loading does not depend on
# the platform's default text encoding (which is not UTF-8 on Windows).
with open('../data/processed/documents-with-ids.json', 'r', encoding='utf-8') as f:
    documents = json.load(f)

# Ground truth for evaluation: read the CSV and convert it to a list of
# per-row dicts so it can be iterated record by record.
df_ground_truth = pd.read_csv('../data/processed/ground-truth-retrieval.csv')
ground_truth = df_ground_truth.to_dict(orient='records')

print(f"Loaded {len(documents)} documents and {len(ground_truth)} ground truth questions")

Current working directory: c:\Users\Adi\Brahman.ai\notebooks
Loaded 149 documents and 735 ground truth questions


In [None]:
# Preview the first three loaded documents to inspect their fields
documents[:3]


[{'location': 'Andhra_Pradesh',
  'doc_id': 'd4402d82c0',
  'content': 'Asia > South Asia > India > Southern India > Andhra Pradesh  \n![0_image_0.png](0_image_0.png)',
  'id': '4f80b327'},
 {'location': 'Andhra_Pradesh',
  'doc_id': 'd4402d82c0',
  'content': 'Andhra Pradesh (AP) is a state in Southern India, with Bay of Bengal on the east and shares boundaries with Telangana on the north, Chhattisgarh and Odisha on the north-east, Tamil Nadu on the south and Karnataka on the west. Vijayawada is the capital of this state.',
  'id': 'a411c9aa'},
 {'location': 'Andhra_Pradesh',
  'doc_id': 'd4402d82c0',
  'content': 'Northern Coast (Alluri Sitharama Raju, Anakapalli, East Godavari, Kakinada, Konaseema, Parvathipuram Manyam, Srikakulam, Visakhapatnam, Vizianagaram, Yanam) Central Coast (Eluru, Krishna, NTR, West Godavari) Southern Coast (Bapatla, Guntur, Nellore, Palnadu, Prakasam, Tirupati) Rayalaseema (Annamayya, Anantapur, Chittoor, Kadapa, Kurnool, Nandyal, Sri Sathya Sai)  \n![0_ima

In [21]:
# Preview the first three ground-truth records (a question plus the 'id' it is matched against)
ground_truth[:3]

[{'question': 'What are the must-see religious sites in Andhra Pradesh for pilgrims?',
  'id': '4f80b327'},
 {'question': 'Which natural attractions and caves can tourists explore in Andhra Pradesh?',
  'id': '4f80b327'},
 {'question': 'Where in Andhra Pradesh can I visit museums related to military and tribal culture?',
  'id': '4f80b327'}]

### Setup MinSearch Index
Configure and build the keyword search index

In [None]:
print("Setting up MinSearch index...")

# "content" is the field searched as text; "location", "doc_id" and "id"
# are registered as keyword fields, i.e. the fields available for filtering.
index = minsearch.Index(
    text_fields=["content"],
    keyword_fields=["location", "doc_id", "id"],
)

# Build the index over every loaded document. Left as the cell's last
# expression, so the notebook displays whatever fit() returns.
index.fit(documents)

Setting up MinSearch index...


<minsearch.minsearch.Index at 0x228a7cf0a90>

### Evaluation Process
Evaluate retrieval performance using ground truth data

In [24]:
print("Evaluating keyword search...")

# Number of hits requested per question. The metrics derived from
# relevance_total are therefore computed over the top NUM_RESULTS results.
NUM_RESULTS = 5

# One list of booleans per question, in the order index.search returned the
# hits: True where a hit's 'id' equals the question's ground-truth 'id'.
relevance_total = []

for q in tqdm(ground_truth, desc="Evaluating retrieval"):
    # Ground-truth records reference the document's 'id' field (documents
    # also carry a separate 'doc_id'), so hits are compared on 'id'.
    expected_id = q['id']

    # Named search_results rather than results: a later cell binds `results`
    # to the dict that gets saved to disk.
    search_results = index.search(
        query=q['question'],
        num_results=NUM_RESULTS
    )

    relevance = [d['id'] == expected_id for d in search_results]
    relevance_total.append(relevance)

print("Evaluation completed!")

Evaluating keyword search...


Evaluating retrieval:   0%|          | 0/735 [00:00<?, ?it/s]

Evaluation completed!


### Calculate Metrics
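Compute Hit Rate (the fraction of questions whose expected document appears in the top 5 results) and Mean Reciprocal Rank (MRR: the average of 1/rank of the expected document, counting 0 when it is not retrieved).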

In [25]:
print("Calculating metrics...")

# Number of evaluated questions. If it is zero, both metrics are reported as
# 0.0 instead of raising ZeroDivisionError; 'total_questions' in the metrics
# dict makes that case visible.
num_questions = len(relevance_total)

# Hit Rate: share of questions with at least one relevant result.
hit_count = sum(1 for line in relevance_total if any(line))
hit_rate = hit_count / num_questions if num_questions else 0.0

# Mean Reciprocal Rank: each question contributes 1/rank of its first
# relevant result (rank counted from 1), or nothing when no result is relevant.
total_score = 0.0
for line in relevance_total:
    for rank, is_relevant in enumerate(line, start=1):
        if is_relevant:
            total_score += 1 / rank
            break  # only the first relevant result counts

mrr = total_score / num_questions if num_questions else 0.0

# Create metrics dictionary
metrics = {
    'hit_rate': hit_rate,
    'mrr': mrr,
    'total_questions': num_questions
}

print("Metrics calculated successfully!")

Calculating metrics...
Metrics calculated successfully!


### Display Summary and Save Results 

In [28]:
# Report the headline numbers; the two rates are shown with four decimals.
summary_lines = [
    "\nKeyword Search Results:",
    f"Hit Rate: {metrics['hit_rate']:.4f}",
    f"MRR: {metrics['mrr']:.4f}",
    f"Total Questions: {metrics['total_questions']}",
]
print("\n".join(summary_lines))


Keyword Search Results:
Hit Rate: 0.4939
MRR: 0.3682
Total Questions: 735


In [34]:
# Bundle everything to be saved: a label for the retrieval method, the
# aggregate metrics, and the per-question relevance lists.
results = dict(
    method='keyword_search',
    metrics=metrics,
    relevance_results=relevance_total,
)

In [35]:
# Show the relevance lists of the first five questions, numbered from 1.
first_five = results['relevance_results'][:5]
for position, flags in enumerate(first_five, start=1):
    print(f"{position}: {flags}")

1: [False, False, False, False, False]
2: [False, False, False, False, False]
3: [False, False, False, False, False]
4: [False, False, False, False, False]
5: [False, False, False, False, False]


In [37]:
# Save results to JSON file.
# The output directory is created first: open(..., 'w') does not create
# missing parent directories and would raise FileNotFoundError without it.
os.makedirs('../results', exist_ok=True)
with open('../results/keyword_search_results.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=2)
print("\nResults saved to: results/keyword_search_results.json")


Results saved to: results/keyword_search_results.json
