In [23]:
# Install Hugging Face datasets library and rank_bm25
!pip install datasets rank_bm25 --quiet

# **Import the necessary libraries**

In [24]:
# Import libraries
from datasets import load_dataset
from rank_bm25 import BM25Okapi
import pandas as pd

# **Load the Quora dataset for IR**

In [25]:
# Pull only the first 1000 examples of the "quora" train split from Hugging Face
dataset = load_dataset("quora", split="train[:1000]")

# Inspect a single record to see how the examples are structured
first_example = dataset[0]
print(first_example)

{'questions': {'id': [1, 2], 'text': ['What is the step by step guide to invest in share market in india?', 'What is the step by step guide to invest in share market?']}, 'is_duplicate': False}


# **Prepare the corpus and queries.**

In [26]:
# Every example carries a pair of question texts.
question_pairs = [example['questions']['text'] for example in dataset]

# The first question of each pair is a corpus document; the second is used as
# the query that is supposed to retrieve it (same position in both lists).
corpus = [pair[0] for pair in question_pairs]
queries = [pair[1] for pair in question_pairs]

# **Build our BM25 model over the corpus**

In [27]:
# Split every document on whitespace. Case and punctuation are left untouched,
# so queries must be tokenized the same way to match these tokens.
tokenized_corpus = list(map(str.split, corpus))

# Fit the BM25 index on the tokenized documents
bm25 = BM25Okapi(tokenized_corpus)

# **Retrieve top 3 documents for the first query**

In [33]:
# Score every corpus document against the first query
query = queries[0]
tokenized_query = query.split()
doc_scores = bm25.get_scores(tokenized_query)

# Order document positions by score, best first, and keep the leading three
top_n = 3
ranked_indices = sorted(range(len(doc_scores)), key=doc_scores.__getitem__, reverse=True)
top_n_indices = ranked_indices[:top_n]

# Report the query followed by its best matches
print(f"Query:\n{query}\n")
print("Top documents:")
for idx in top_n_indices:
    print(f"Score: {doc_scores[idx]:.2f} | Document: {corpus[idx]}")


Query:
What is the step by step guide to invest in share market?

Top documents:
Score: 40.76 | Document: What is the step by step guide to invest in share market in india?
Score: 9.68 | Document: What is the best way to invest in oil and natural gas?
Score: 9.32 | Document: What stocks are the best to invest in right now?


In [34]:
# Make a simple search function

def search(query, bm25_model, corpus, top_n=3):
    """Print the ``top_n`` highest-scoring corpus documents for ``query``.

    Args:
        query: Raw query string. It is split on whitespace, so it only matches
            an index whose documents were tokenized the same way.
        bm25_model: Object exposing ``get_scores(tokenized_query)`` that returns
            one score per corpus document (e.g. a ``BM25Okapi`` index).
        corpus: Sequence of the original document strings, aligned by position
            with the scores returned by ``bm25_model``.
        top_n: Maximum number of documents to print; must be >= 0.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        # A negative bound would make the slice below drop results from the
        # *end* of the ranking instead of limiting its length.
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    tokenized_query = query.split()
    doc_scores = bm25_model.get_scores(tokenized_query)
    # sorted() is stable, so documents with equal scores keep corpus order.
    top_n_indices = sorted(range(len(doc_scores)), key=lambda i: doc_scores[i], reverse=True)[:top_n]

    print(f"\nQuery:\n{query}\n")
    print(f"Top {top_n} matching documents:")
    for idx in top_n_indices:
        print(f"Score: {doc_scores[idx]:.2f} | Document: {corpus[idx]}")

# Example: run an ad-hoc query against the index
search(query="how to invest money in india", bm25_model=bm25, corpus=corpus, top_n=5)


Query:
how to invest money in india

Top 5 matching documents:
Score: 8.20 | Document: What stocks are the best to invest in right now?
Score: 7.54 | Document: What is the best way to invest in oil and natural gas?
Score: 7.53 | Document: What is the step by step guide to invest in share market in india?
Score: 6.45 | Document: What is best way to make money online?
Score: 6.15 | Document: What are the easy ways to earn money online?


In [35]:
# Replace the placeholder text with your own query
search(query="your custom query here", bm25_model=bm25, corpus=corpus, top_n=5)


Query:
your custom query here

Top 5 matching documents:
Score: 7.00 | Document: Who are the best custom kiosk manufacturers in India?
Score: 5.25 | Document: What is your job and your salary? Are you satisfied with your current job?
Score: 4.72 | Document: What's your bucket list?
Score: 4.46 | Document: Do you regret your divorce?
Score: 4.16 | Document: Could we use cherenkov atmosphere radiation (with gamma rays or similar) to image the surface of a planet from here with ground based telescopes?


# **Evaluation**

In [36]:
# Function to calculate Precision@k
def precision_at_k(relevant_indices, retrieved_indices, k):
    """Return Precision@k: the share of the top-``k`` retrieved items that are relevant.

    Args:
        relevant_indices: Collection of indices considered relevant for the query.
        retrieved_indices: Ranked sequence of retrieved indices, best first.
        k: Cut-off rank; must be a positive integer.

    Returns:
        The number of relevant items among the first ``k`` retrieved indices,
        divided by ``k``. The denominator is always ``k``, even when fewer than
        ``k`` items were retrieved.

    Raises:
        ValueError: If ``k`` is not positive.
    """
    if k <= 0:
        # k == 0 would divide by zero; k < 0 would slice from the wrong end and
        # yield a negative "precision".
        raise ValueError(f"k must be a positive integer, got {k}")

    relevant = set(relevant_indices)  # constant-time membership checks
    hits = sum(1 for idx in retrieved_indices[:k] if idx in relevant)
    return hits / k

# Full evaluation over multiple queries
def evaluate_bm25(queries, corpus, bm25_model, dataset, k=3):
    """Compute and print the mean Precision@k of ``bm25_model`` over ``queries``.

    Query ``i`` is assumed to have exactly one relevant document: the corpus
    entry at position ``i``. Because there is a single relevant document per
    query, each per-query precision is either ``0`` or ``1 / k``, so the
    reported average can never exceed ``1 / k``.

    Args:
        queries: Sequence of raw query strings; each is split on whitespace.
        corpus: Unused; kept so existing callers keep working.
        bm25_model: Object exposing ``get_scores(tokenized_query)`` that returns
            one score per corpus document.
        dataset: Unused; kept so existing callers keep working.
        k: Cut-off rank used for Precision@k.

    Returns:
        The mean Precision@k across all queries, or ``0.0`` when ``queries``
        is empty.
    """
    precisions = []

    for i, query in enumerate(queries):
        doc_scores = bm25_model.get_scores(query.split())
        # Stable sort: documents with equal scores keep corpus order.
        top_k_indices = sorted(range(len(doc_scores)), key=lambda idx: doc_scores[idx], reverse=True)[:k]

        # Ground truth: query i should retrieve corpus document i.
        relevant_indices = [i]
        precisions.append(precision_at_k(relevant_indices, top_k_indices, k))

    # Guard the average so an empty query list reports 0.0 instead of raising
    # ZeroDivisionError.
    average_precision = sum(precisions) / len(precisions) if precisions else 0.0
    print(f"\nAverage Precision@{k}: {average_precision:.4f}")
    return average_precision

# Score the index on every (query, expected document) pair at cut-off 3
evaluate_bm25(queries=queries, corpus=corpus, bm25_model=bm25, dataset=dataset, k=3)


Average Precision@3: 0.2373


0.2373333333333357

# **Function to retrieve and show results in a nice table**

In [37]:
def search_pretty(query, bm25_model, corpus, top_n=5):
    """Show the ``top_n`` highest-scoring documents for ``query`` as a table.

    Args:
        query: Raw query string. It is split on whitespace, so it only matches
            an index whose documents were tokenized the same way.
        bm25_model: Object exposing ``get_scores(tokenized_query)`` that returns
            one score per corpus document.
        corpus: Sequence of the original document strings, aligned by position
            with the scores returned by ``bm25_model``.
        top_n: Maximum number of rows to show; must be >= 0.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        # A negative bound would make the slice below drop results from the
        # *end* of the ranking instead of limiting its length.
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    doc_scores = bm25_model.get_scores(query.split())
    # sorted() is stable, so documents with equal scores keep corpus order.
    top_n_indices = sorted(range(len(doc_scores)), key=lambda i: doc_scores[i], reverse=True)[:top_n]

    results = [
        {"Score": round(doc_scores[idx], 2), "Document": corpus[idx]}
        for idx in top_n_indices
    ]

    # Explicit columns keep the table header even when there are no rows.
    df = pd.DataFrame(results, columns=["Score", "Document"])
    print(f"\n Query:\n{query}\n")
    display(df)  # rich table rendering; relies on the notebook-provided `display`

# Example: render the five best matches for a sample query as a table
search_pretty(query="how to invest in stock market", bm25_model=bm25, corpus=corpus, top_n=5)


 Query:
how to invest in stock market



Unnamed: 0,Score,Document
0,13.29,What is the best source to learn stock market ...
1,12.79,What is the step by step guide to invest in sh...
2,8.2,What stocks are the best to invest in right now?
3,7.54,What is the best way to invest in oil and natu...
4,5.82,Does Fab currently offer new employees stock o...
