In [1]:
# Install Hugging Face datasets library and rank_bm25
!pip install datasets rank_bm25

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━

# **Import the necessary libraries**

In [21]:
# Import libraries
from datasets import load_dataset
from rank_bm25 import BM25Okapi
import pandas as pd

# **Load the Quora dataset for IR**

In [3]:
# Load a small slice of the Quora dataset from the Hugging Face Hub.
# The "quora" repository contains custom loading code, so `load_dataset` would
# otherwise stop at an interactive "[y/N]" prompt and block an unattended
# "Run All". `trust_remote_code=True` answers that prompt up front.
# NOTE(review): this executes code from the dataset repository; inspect it at
# https://hf.co/datasets/quora before trusting it.
dataset = load_dataset(
    "quora",
    split="train[:1000]",  # only first 1000 examples
    trust_remote_code=True,
)

# Let's look at one sample
print(dataset[0])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/5.69k [00:00<?, ?B/s]

quora.py:   0%|          | 0.00/2.38k [00:00<?, ?B/s]

The repository for quora contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/quora.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/58.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/404290 [00:00<?, ? examples/s]

{'questions': {'id': [1, 2], 'text': ['What is the step by step guide to invest in share market in india?', 'What is the step by step guide to invest in share market?']}, 'is_duplicate': False}


# **Prepare the corpus and queries.**

In [12]:
# Split every question pair into a document (first question) and a query
# (second question). Both lists are filled in dataset order, so queries[i]
# is the counterpart of corpus[i].
corpus, queries = [], []

for record in dataset:
    pair_texts = record['questions']['text']
    document_text, query_text = pair_texts[0], pair_texts[1]

    corpus.append(document_text)
    # We pretend the task is to retrieve the first question given the second one.
    queries.append(query_text)

# **Build our BM25 model over the corpus**

In [13]:
# Whitespace-tokenize every document; BM25Okapi expects a list of token lists.
# Note that str.split keeps case and punctuation, so "market?" and "market"
# are different tokens.
tokenized_corpus = list(map(str.split, corpus))

# Fit the BM25 index on the tokenized documents.
bm25 = BM25Okapi(tokenized_corpus)

# **Watch and Learn ^-^ !**

In [14]:
# Worked example: rank the corpus against the first query and show the best 3.
query = queries[0]
tokenized_query = query.split()

# One BM25 score per corpus document, in corpus order.
doc_scores = bm25.get_scores(tokenized_query)

# Pair each score with its document position, order by score (highest first),
# then keep only the positions of the leading entries.
top_n = 3
ranked_pairs = sorted(enumerate(doc_scores), key=lambda pair: pair[1], reverse=True)
top_n_indices = [position for position, _score in ranked_pairs[:top_n]]

# Report the query followed by its best-matching documents.
print(f"Query:\n{query}\n")
print("Top documents:")
for idx in top_n_indices:
    print(f"Score: {doc_scores[idx]:.2f} | Document: {corpus[idx]}")


Query:
What is the step by step guide to invest in share market?

Top documents:
Score: 40.76 | Document: What is the step by step guide to invest in share market in india?
Score: 9.68 | Document: What is the best way to invest in oil and natural gas?
Score: 9.32 | Document: What stocks are the best to invest in right now?


In [16]:
# Reusable console search helper

def search(query, bm25_model, corpus, top_n=3):
    """Print the `top_n` best-scoring documents of `corpus` for `query`.

    The query is split on whitespace and scored with `bm25_model.get_scores`;
    documents are listed from highest to lowest score. Results are printed
    only; the function returns None.
    """
    scores = bm25_model.get_scores(query.split())
    # Keep (position, score) pairs together so no second lookup is needed.
    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)

    print(f"\nQuery:\n{query}\n")
    print(f"Top {top_n} matching documents:")
    for position, score in ranked[:top_n]:
        print(f"Score: {score:.2f} | Document: {corpus[position]}")

# Example: run a hand-written query against the index
search(
    query="how to invest money in india",
    bm25_model=bm25,
    corpus=corpus,
    top_n=5,
)


Query:
how to invest money in india

Top 5 matching documents:
Score: 8.20 | Document: What stocks are the best to invest in right now?
Score: 7.54 | Document: What is the best way to invest in oil and natural gas?
Score: 7.53 | Document: What is the step by step guide to invest in share market in india?
Score: 6.45 | Document: What is best way to make money online?
Score: 6.15 | Document: What are the easy ways to earn money online?


In [17]:
# Replace the placeholder text with any query you want to try.
search(
    query="your custom query here",
    bm25_model=bm25,
    corpus=corpus,
    top_n=5,
)


Query:
your custom query here

Top 5 matching documents:
Score: 7.00 | Document: Who are the best custom kiosk manufacturers in India?
Score: 5.25 | Document: What is your job and your salary? Are you satisfied with your current job?
Score: 4.72 | Document: What's your bucket list?
Score: 4.46 | Document: Do you regret your divorce?
Score: 4.16 | Document: Could we use cherenkov atmosphere radiation (with gamma rays or similar) to image the surface of a planet from here with ground based telescopes?


# **Evaluation**

In [18]:
# Function to calculate Precision@k
def precision_at_k(relevant_indices, retrieved_indices, k):
    """Return the fraction of the top-`k` retrieved items that are relevant.

    Args:
        relevant_indices: Collection of indices considered relevant.
        retrieved_indices: Ranked sequence of retrieved indices, best first.
        k: Cutoff rank; must be positive.

    Returns:
        The number of relevant items among `retrieved_indices[:k]` divided by
        `k`. The divisor is always `k`, even if fewer than `k` items were
        retrieved.

    Raises:
        ValueError: If `k` is not positive. Without this check, `k == 0`
            fails with a bare ZeroDivisionError and a negative `k` yields a
            meaningless negative value.
    """
    if k <= 0:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    # Set membership keeps the check cheap when many indices are relevant.
    relevant = set(relevant_indices)
    hits = sum(1 for idx in retrieved_indices[:k] if idx in relevant)
    return hits / k

# Full evaluation over multiple queries
def evaluate_bm25(queries, corpus, bm25_model, dataset=None, k=3):
    """Compute, print and return the mean Precision@k of a BM25 model.

    Query number `i` is assumed to have exactly one relevant document: the one
    at index `i` of the corpus the model was built on. `queries` must therefore
    be aligned with that corpus. Because only one document per query counts as
    relevant, the result can never exceed 1/k.

    Args:
        queries: Iterable of query strings; each is tokenized with `str.split`.
        corpus: Not used by the computation; kept so existing calls keep working.
        bm25_model: Object whose `get_scores(tokens)` returns one score per
            indexed document.
        dataset: Not used by the computation; kept (now optional) for
            backward compatibility.
        k: Cutoff rank; must be positive.

    Returns:
        The mean Precision@k over all queries.

    Raises:
        ValueError: If `k` is not positive or `queries` is empty (the mean
            would otherwise fail with a bare ZeroDivisionError).
    """
    if k <= 0:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    precisions = []

    for i, query in enumerate(queries):
        # Score every indexed document against the whitespace-tokenized query.
        doc_scores = bm25_model.get_scores(query.split())
        top_k_indices = sorted(
            range(len(doc_scores)),
            key=lambda idx: doc_scores[idx],
            reverse=True,
        )[:k]

        # Ground truth: the paired question sits at the same position as the query.
        relevant_indices = [i]

        precisions.append(precision_at_k(relevant_indices, top_k_indices, k))

    # Checked after the loop so that any iterable of queries is supported.
    if not precisions:
        raise ValueError("queries must contain at least one query")

    average_precision = sum(precisions) / len(precisions)
    print(f"\nAverage Precision@{k}: {average_precision:.4f}")
    return average_precision

# Score the BM25 index on every query/document pair built above
evaluate_bm25(
    queries=queries,
    corpus=corpus,
    bm25_model=bm25,
    dataset=dataset,
    k=3,
)


Average Precision@3: 0.2373


0.2373333333333357

# **Function to retrieve and show results in a nice table**

In [22]:
def search_pretty(query, bm25_model, corpus, top_n=5):
    """Show the `top_n` best BM25 matches for `query` as a pandas table.

    The query is split on whitespace and scored with `bm25_model.get_scores`.
    The best-scoring documents are collected into a DataFrame with a "Score"
    column (rounded to two decimals) and a "Document" column, which is
    rendered with `display` after the query is printed. Returns None.
    """
    scores = bm25_model.get_scores(query.split())
    best_first = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)

    # One record per hit; the DataFrame is built once from the full list.
    rows = [
        {"Score": round(scores[doc_id], 2), "Document": corpus[doc_id]}
        for doc_id in best_first[:top_n]
    ]
    table = pd.DataFrame(rows)

    print(f"\n Query:\n{query}\n")
    display(table)  # nice display in colab

# Example: render the five best matches for a sample query as a table
search_pretty(
    query="how to invest in stock market",
    bm25_model=bm25,
    corpus=corpus,
    top_n=5,
)


 Query:
how to invest in stock market



Unnamed: 0,Score,Document
0,13.29,What is the best source to learn stock market ...
1,12.79,What is the step by step guide to invest in sh...
2,8.2,What stocks are the best to invest in right now?
3,7.54,What is the best way to invest in oil and natu...
4,5.82,Does Fab currently offer new employees stock o...
