# Retrieval Experiments

I will be comparing the following retrieval methods:

1. BM25
2. Dense Retrieval
3. Hybrid Retrieval
4. Reranked Retrieval

In [25]:
import numpy as np
import pandas as pd
import time
import re
import string
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer, CrossEncoder, util

In [None]:
def load_data(filepath="data/sample_food_data_base.csv"):
    """
    Loads the food nutrition CSV and creates a single text 'document'
    for each food item to be used as our search corpus.
    """

    df = pd.read_csv(filepath)
    df = df.fillna('')  # Handle any empty cells
    df['document_text'] = df.apply(
        lambda row: f"Food: {row['food_name']}. "
                    f"Ingredients: {row['ingredients']}. "
                    f"Vitamins: {row['vitamins']}. "
                    f"Nutrition: {row['nutrients_with_grams']}.",
        axis=1
    )
    corpus = df['document_text'].tolist()
    return df, corpus

def clean_text(text):
    """A simple text cleaning"""
    if not isinstance(text, str):
        return []
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text.split()

I have made a annotated the query and relevant documents for evaluation. This will be used to compute the evaluation metrics.

In [27]:
EVAL_QUERIES = {
    "q1": {
        "query": "What ingredients are in Spaghetti Bolognese?",
        "relevant_docs": [5]  # Index 5 is Spaghetti Bolognese
    },
    "q2": {
        "query": "high protein breakfast",
        "relevant_docs": [9, 13, 26]  # 9: Greek Yogurt, 13: Scrambled Eggs, 26: Omelette
    },
    "q3": {
        "query": "what is the food with avocado",
        "relevant_docs": [1, 20, 24]  # 1: Avocado Toast, 20: Veggie Burger, 24: Fish Tacos
    },
    "q4": {
        "query": "low fat soup",
        "relevant_docs": [7, 29]  # 7: Lentil Soup (5g), 29: Minestrone Soup (4g)
    },
    "q5": {
        "query": "vitamins in salmon and quinoa",
        "relevant_docs": [2]  # 2: Salmon with Quinoa
    }
}

# RETRIEVAL CLASS


In [None]:
class RetrievalExperiment:
    def __init__(self, corpus, df):
        self.corpus = corpus
        self.df = df
        self.doc_ids = list(range(len(corpus)))

        #BM25
        print("Setting up BM25...")
        tokenized_corpus = [clean_text(doc) for doc in self.corpus]
        self.bm25 = BM25Okapi(tokenized_corpus)
        
        #Dense Retriever (SentenceTransformer)
        print("Loading SentenceTransformer model... (This may take a moment)")
        # Using CPU to avoid CUDA errors
        self.dense_model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
        
        print("Creating dense embeddings for the corpus...")
        self.corpus_embeddings = self.dense_model.encode(self.corpus, convert_to_tensor=True, device='cpu')

        #Reranker (Cross-Encoder)
        print("Loading Cross-Encoder model for reranking...")
        self.reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L6-v2', device='cpu')

        print("Initialization complete.")

    def search_bm25(self, query, k=5):
        """Performs a BM25 (keyword) search."""
        tokenized_query = clean_text(query)
        doc_scores = self.bm25.get_scores(tokenized_query)
        #top scores
        top_k_indices = np.argsort(doc_scores)[::-1][:k]
        return [(idx, doc_scores[idx]) for idx in top_k_indices]

    def search_dense(self, query, k=5):
        """Performs a Dense (semantic) search."""
        query_embedding = self.dense_model.encode(query, convert_to_tensor=True, device='cpu')
        cos_scores = util.cos_sim(query_embedding, self.corpus_embeddings)[0]
        top_results = np.argpartition(-cos_scores, range(k))[:k]
        return [(idx.item(), cos_scores[idx].item()) for idx in top_results]

    def search_hybrid(self, query, k=5, bm25_weight=0.5, dense_weight=0.5):
        """Performs a hybrid search by combining BM25 and Dense scores."""
        bm25_results = self.search_bm25(query, k=len(self.corpus))
        dense_results = self.search_dense(query, k=len(self.corpus))

        bm25_scores = {doc_id: score for doc_id, score in bm25_results}
        dense_scores = {doc_id: score for doc_id, score in dense_results}

        max_bm25 = max(bm25_scores.values()) if bm25_scores else 1
        min_bm25 = min(bm25_scores.values()) if bm25_scores else 0
        max_dense = max(dense_scores.values()) if dense_scores else 1
        min_dense = min(dense_scores.values()) if dense_scores else 0

        bm25_range = max_bm25 - min_bm25 if (max_bm25 - min_bm25) != 0 else 1
        dense_range = max_dense - min_dense if (max_dense - min_dense) != 0 else 1

        hybrid_scores = {}
        all_doc_ids = set(bm25_scores.keys()) | set(dense_scores.keys())

        for doc_id in all_doc_ids:
            norm_bm25 = (bm25_scores.get(doc_id, min_bm25) - min_bm25) / bm25_range
            norm_dense = (dense_scores.get(doc_id, min_dense) - min_dense) / dense_range
            hybrid_scores[doc_id] = (norm_bm25 * bm25_weight) + (norm_dense * dense_weight)

        sorted_hybrid = sorted(hybrid_scores.items(), key=lambda item: item[1], reverse=True)
        return sorted_hybrid[:k]

    def search_and_rerank(self, query, retrieve_k=20, rerank_k=5):
        """Two-stage search: Retrieve (Hybrid) and Rerank (Cross-Encoder)."""
        candidate_results = self.search_hybrid(query, k=retrieve_k)
        candidate_ids = [doc_id for doc_id, score in candidate_results]

        if not candidate_ids:
            return []

        rerank_pairs = [[query, self.corpus[doc_id]] for doc_id in candidate_ids]
        rerank_scores = self.reranker.predict(rerank_pairs)

        reranked_results = []
        for i in range(len(candidate_ids)):
            doc_id = candidate_ids[i]
            score = rerank_scores[i]
            reranked_results.append((doc_id, score))

        reranked_results.sort(key=lambda x: x[1], reverse=True)
        return reranked_results[:rerank_k]

# EVALUATION METRICS

In [93]:

# finds how quickly the first relevant document appears in the retrieved list
def calculate_reciprocal_rank(retrieved_docs, relevant_docs):
    retrieved_ids = [doc_id for doc_id, score in retrieved_docs]
    for rank, doc_id in enumerate(retrieved_ids, 1):
        if doc_id in relevant_docs:
            return 1.0 / rank
    return 0.0

#contet based precision at k
def calculate_precision_at_k(retrieved_docs, relevant_docs, k):
    print("precision at k")
    print(retrieved_docs)
    retrieved_ids_at_k = [doc_id for doc_id, score in retrieved_docs[:k]]
    relevant_found = sum(1 for doc_id in retrieved_ids_at_k if doc_id in relevant_docs)
    print(relevant_found)
    return relevant_found / k

#calculate average metric
def calculate_average_metric(evaluation_results, metric_name):
    total_metric = sum(result[metric_name] for result in evaluation_results.values())
    return total_metric / len(evaluation_results)

Retrieval (k=K_VALUE): When you search, the system will return exactly 5 documents (the 5 highest scoring ones).
Evaluation (Precision@K): When calculating precision, it only checks those top 5 results.
If a relevant document is at position #6, it is considered "missed" because it's outside the top 5.

In [None]:
# load our sample data
df, corpus = load_data("data/sample_food_data_base.csv")

print(df)
print(corpus)


experiment = RetrievalExperiment(corpus, df)
K_VALUE = 5

          date      time                        food_name  \
0   2025-10-17  09:15:00            Grilled Chicken Salad   
1   2025-10-17  12:30:00                    Avocado Toast   
2   2025-10-17  18:45:00               Salmon with Quinoa   
3   2025-10-18  13:00:00                    Beef Stir-Fry   
4   2025-10-18  19:00:00                  Vegetable Curry   
5   2025-10-19  12:45:00              Spaghetti Bolognese   
6   2025-10-19  20:00:00                 Mushroom Risotto   
7   2025-10-20  13:15:00                      Lentil Soup   
8   2025-10-20  10:00:00                 Blueberry Muffin   
9   2025-10-21  08:30:00        Greek Yogurt with Berries   
10  2025-10-21  07:45:00             Oatmeal with Almonds   
11  2025-10-22  12:10:00              Chicken Caesar Wrap   
12  2025-10-22  19:30:00                 Margherita Pizza   
13  2025-10-23  08:00:00      Scrambled Eggs with Spinach   
14  2025-10-23  12:30:00                    Tuna Sandwich   
15  2025-10-24  15:00:00

# BM25 EVALUATION

In [None]:
print("\n--- BM25 RESULTS ---")
results_bm25 = {}
for q_id, eval_data in EVAL_QUERIES.items():
    query = eval_data["query"]
    relevant_docs = eval_data["relevant_docs"]

    print(f"Query: {query}")
    print(f"Relevant Docs: {relevant_docs}")
    
    start_time = time.time()

    bm25_retrieved = experiment.search_bm25(query, k=K_VALUE)
    bm25_latency = time.time() - start_time
    
    rr = calculate_reciprocal_rank(bm25_retrieved, relevant_docs)
    p_at_k = calculate_precision_at_k(bm25_retrieved, relevant_docs, k=K_VALUE)
    
    results_bm25[q_id] = {
        "rr": rr,
        "p_at_k": p_at_k,
        "latency": bm25_latency,
        "retrieved": bm25_retrieved
    }
    print(f"Query: {query}")
    print(f"  RR: {rr:.4f}, P@{K_VALUE}: {p_at_k:.4f}, Latency: {bm25_latency:.4f}s")


print("\nAverage BM25 Metrics:")
print(f"MRR: {calculate_average_metric(results_bm25, 'rr'):.4f}")
print(f"P@{K_VALUE}: {calculate_average_metric(results_bm25, 'p_at_k'):.4f}")
print(f"Avg Latency: {calculate_average_metric(results_bm25, 'latency'):.4f}s")

avg_bm25 = {'rr':   calculate_average_metric(results_bm25, 'rr'), 'p_at_k': calculate_average_metric(results_bm25, 'p_at_k'), 'latency': calculate_average_metric(results_bm25, 'latency')}


--- BM25 RESULTS ---
Query: What ingredients are in Spaghetti Bolognese?
Relevant Docs: [5]
precision at k
precision at k
[(np.int64(5), np.float64(7.936389133473743)), (np.int64(8), np.float64(0.5966039800950955)), (np.int64(24), np.float64(0.5789268251293149)), (np.int64(7), np.float64(0.5789268251293149)), (np.int64(17), np.float64(0.5789268251293149))]
1
Query: What ingredients are in Spaghetti Bolognese?
  RR: 1.0000, P@5: 0.2000, Latency: 0.0005s
Query: high protein breakfast
Relevant Docs: [9, 13, 26]
precision at k
precision at k
[(np.int64(14), np.float64(0.8098976310099224)), (np.int64(15), np.float64(0.8098976310099224)), (np.int64(21), np.float64(0.8015909886405899)), (np.int64(19), np.float64(0.8015909886405899)), (np.int64(25), np.float64(0.8015909886405899))]
0
Query: high protein breakfast
  RR: 0.0000, P@5: 0.0000, Latency: 0.0002s
Query: what is the food with avocado
Relevant Docs: [1, 20, 24]
precision at k
precision at k
[(np.int64(1), np.float64(3.3750096113307424

# DENSE EVALUATION

In [81]:
results_dense = {}

for q_id, eval_data in EVAL_QUERIES.items():
    query = eval_data["query"]
    relevant_docs = eval_data["relevant_docs"]
    
    start_time = time.time()
    dense_retrieved = experiment.search_dense(query, k=K_VALUE)
    dense_latency = time.time() - start_time
    
    rr = calculate_reciprocal_rank(dense_retrieved, relevant_docs)
    p_at_k = calculate_precision_at_k(dense_retrieved, relevant_docs, k=K_VALUE)
    
    results_dense[q_id] = {
        "rr": rr,
        "p_at_k": p_at_k,
        "latency": dense_latency,
        "retrieved": dense_retrieved
    }
    print(f"Query: {query}")
    print(f"  RR: {rr:.4f}, P@{K_VALUE}: {p_at_k:.4f}, Latency: {dense_latency:.4f}s")

print("\nAverage Dense Metrics:")
print(f"MRR: {calculate_average_metric(results_dense, 'rr'):.4f}")
print(f"P@{K_VALUE}: {calculate_average_metric(results_dense, 'p_at_k'):.4f}")
print(f"Avg Latency: {calculate_average_metric(results_dense, 'latency'):.4f}s")

avg_dense = {'rr':   calculate_average_metric(results_dense, 'rr'), 'p_at_k': calculate_average_metric(results_dense, 'p_at_k'), 'latency': calculate_average_metric(results_dense, 'latency')}


Query: What ingredients are in Spaghetti Bolognese?
  RR: 1.0000, P@5: 0.2000, Latency: 0.3011s
Query: high protein breakfast
  RR: 1.0000, P@5: 0.2000, Latency: 0.0349s
Query: what is the food with avocado
  RR: 1.0000, P@5: 0.4000, Latency: 0.0366s
Query: low fat soup
  RR: 0.5000, P@5: 0.2000, Latency: 0.0223s
Query: vitamins in salmon and quinoa
  RR: 1.0000, P@5: 0.2000, Latency: 0.0248s

Average Dense Metrics:
MRR: 0.9000
P@5: 0.2400
Avg Latency: 0.0840s


# HYBRID EVALUATION

In [82]:
results_hybrid = {}

for q_id, eval_data in EVAL_QUERIES.items():
    query = eval_data["query"]
    relevant_docs = eval_data["relevant_docs"]
    
    start_time = time.time()
    hybrid_retrieved = experiment.search_hybrid(query, k=K_VALUE)
    hybrid_latency = time.time() - start_time
    
    rr = calculate_reciprocal_rank(hybrid_retrieved, relevant_docs)
    p_at_k = calculate_precision_at_k(hybrid_retrieved, relevant_docs, k=K_VALUE)
    
    results_hybrid[q_id] = {
        "rr": rr,
        "p_at_k": p_at_k,
        "latency": hybrid_latency,
        "retrieved": hybrid_retrieved
    }
    print(f"Query: {query}")
    print(f"  RR: {rr:.4f}, P@{K_VALUE}: {p_at_k:.4f}, Latency: {hybrid_latency:.4f}s")

print("\nAverage Hybrid Metrics:")
print(f"MRR: {calculate_average_metric(results_hybrid, 'rr'):.4f}")
print(f"P@{K_VALUE}: {calculate_average_metric(results_hybrid, 'p_at_k'):.4f}")
print(f"Avg Latency: {calculate_average_metric(results_hybrid, 'latency'):.4f}s")

avg_hybrid = {'rr':   calculate_average_metric(results_hybrid, 'rr'), 'p_at_k': calculate_average_metric(results_hybrid, 'p_at_k'), 'latency': calculate_average_metric(results_hybrid, 'latency')}

Query: What ingredients are in Spaghetti Bolognese?
  RR: 1.0000, P@5: 0.2000, Latency: 0.0431s
Query: high protein breakfast
  RR: 1.0000, P@5: 0.2000, Latency: 0.0148s
Query: what is the food with avocado
  RR: 1.0000, P@5: 0.2000, Latency: 0.0152s
Query: low fat soup
  RR: 0.5000, P@5: 0.2000, Latency: 0.0133s
Query: vitamins in salmon and quinoa
  RR: 1.0000, P@5: 0.2000, Latency: 0.0144s

Average Hybrid Metrics:
MRR: 0.9000
P@5: 0.2000
Avg Latency: 0.0202s


# RERANKED EVALUATION 

In [86]:
results_reranked = {}

for q_id, eval_data in EVAL_QUERIES.items():
    query = eval_data["query"]
    relevant_docs = eval_data["relevant_docs"]
    
    start_time = time.time()
    reranked_retrieved = experiment.search_and_rerank(query, retrieve_k=20, rerank_k=K_VALUE)
    reranked_latency = time.time() - start_time
    
    rr = calculate_reciprocal_rank(reranked_retrieved, relevant_docs)
    p_at_k = calculate_precision_at_k(reranked_retrieved, relevant_docs, k=K_VALUE)
    
    results_reranked[q_id] = {
        "rr": rr,
        "p_at_k": p_at_k,
        "latency": reranked_latency,
        "retrieved": reranked_retrieved
    }
    print(f"Query: {query}")
    print(f"  RR: {rr:.4f}, P@{K_VALUE}: {p_at_k:.4f}, Latency: {reranked_latency:.4f}s")

print("\nAverage Reranked Metrics:")
print(f"MRR: {calculate_average_metric(results_reranked, 'rr'):.4f}")
print(f"P@{K_VALUE}: {calculate_average_metric(results_reranked, 'p_at_k'):.4f}")
print(f"Avg Latency: {calculate_average_metric(results_reranked, 'latency'):.4f}s")

avg_reranked = {'rr': calculate_average_metric(results_reranked, 'rr'), 'p_at_k': calculate_average_metric(results_reranked, 'p_at_k'), 'latency': calculate_average_metric(results_reranked, 'latency')}


Query: What ingredients are in Spaghetti Bolognese?
  RR: 1.0000, P@5: 0.2000, Latency: 0.4552s
Query: high protein breakfast
  RR: 0.5000, P@5: 0.2000, Latency: 0.2958s
Query: what is the food with avocado
  RR: 1.0000, P@5: 0.2000, Latency: 0.3060s
Query: low fat soup
  RR: 0.5000, P@5: 0.2000, Latency: 0.2641s
Query: vitamins in salmon and quinoa
  RR: 1.0000, P@5: 0.2000, Latency: 0.2905s

Average Reranked Metrics:
MRR: 0.8000
P@5: 0.2000
Avg Latency: 0.3223s


In [87]:
print("BM25")
print(avg_bm25)
print("Dense")
print(avg_dense)
print("Hybrid")
print(avg_hybrid)
print("Reranked")
print(avg_reranked)



BM25
{'rr': 0.8, 'p_at_k': 0.2, 'latency': 0.00030384063720703127}
Dense
{'rr': 0.9, 'p_at_k': 0.24000000000000005, 'latency': 0.08396105766296387}
Hybrid
{'rr': 0.9, 'p_at_k': 0.2, 'latency': 0.020176458358764648}
Reranked
{'rr': 0.8, 'p_at_k': 0.2, 'latency': 0.32231893539428713}
