In [1]:
# Imports
from rank_bm25 import BM25Okapi
from tqdm import tqdm
import numpy as np
from datasets import load_dataset
import json
import nltk

# Fetch the NLTK tokenizer resources needed by nltk.word_tokenize, which the
# `tokenize` helper in this notebook calls.
nltk.download('punkt')
nltk.download('punkt_tab')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Timov\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Timov\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
# Tokenizer
def tokenize(text):
    """Lowercase ``text`` and split it into word tokens with NLTK."""
    lowered = text.lower()
    return nltk.word_tokenize(lowered)

# JSON loader
def open_json(filename):
    """Parse the UTF-8 encoded JSON file at ``filename`` and return its content."""
    with open(filename, mode='r', encoding='utf-8') as handle:
        return json.load(handle)

# Loading Dataset NevIR

In [7]:
# Function for the NevIR Dataset (Exclusion yet to be done)

def nevir(data, data2=None):
    """Measure BM25 pairwise accuracy on NevIR.

    For every sample a two-document BM25 index is built from ``doc1`` and
    ``doc2``. Query ``q1`` counts as correct when doc1 gets the highest score,
    query ``q2`` when doc2 gets the highest score.

    Args:
        data: Iterable of samples providing 'doc1' and 'doc2', plus 'q1'/'q2'
            (used when ``data2`` is None) or 'id' (used when ``data2`` is given).
        data2: Optional iterable of dicts with the keys 'id',
            'rewritten_query_q1' and 'rewritten_query_q2'. When given, these
            queries replace the original ones; samples whose id is missing
            from ``data2`` are reported and skipped.

    Returns:
        The fraction of correctly ranked queries, or 0.0 when no query was
        evaluated (empty ``data`` or every sample skipped).
    """
    correct = 0
    total = 0
    if data2 is None:
        # No rewritten dataset provided
        print("Evaluating NevIR original Dataset.")
    else:
        # Rewritten dataset provided: build a lookup table keyed by sample id
        print("Evaluating NevIR queries rewritten.")
        data2_lookup = {item['id']: item for item in data2}

    for sample in tqdm(data, desc="Evaluating BM25 performance on NevIR"):
        # The documents always come from the original dataset (never rewritten)
        docs = [sample['doc1'], sample['doc2']]
        if data2 is None:
            q1 = sample['q1']
            q2 = sample['q2']
        else:
            # Pair the rewritten queries with the documents through the shared id
            sid = sample['id']
            if sid not in data2_lookup:
                print(f"ID {sid} not found in data2, skipping.")
                continue
            rewritten = data2_lookup[sid]
            q1 = rewritten['rewritten_query_q1']
            q2 = rewritten['rewritten_query_q2']

        # Build a BM25 index over just this sample's two documents
        tokenized_docs = [tokenize(d) for d in docs]
        bm25 = BM25Okapi(tokenized_docs)

        # Score each query against both documents
        scores_q1 = bm25.get_scores(tokenize(q1))
        scores_q2 = bm25.get_scores(tokenize(q2))

        # q1 should match doc1 (index 0), q2 should match doc2 (index 1).
        # np.argmax returns the first maximal index, so a tie between the two
        # documents is counted as a win for doc1.
        correct += int(np.argmax(scores_q1) == 0)
        correct += int(np.argmax(scores_q2) == 1)
        total += 2

    # Guard against dividing by zero when nothing was evaluated
    if total == 0:
        print("No queries were evaluated; reporting an accuracy of 0.")
        return 0.0

    accuracy = correct / total
    print(f"BM25 Pairwise Accuracy on NevIR: {accuracy * 100:.2f}%")
    return accuracy

In [8]:
# NEVIR test set
# NEVIR test set
ds_nevir = load_dataset("orionweller/NevIR")
data_nevir = ds_nevir["test"]

# Run NevIR standard and with the rewritten dataset
# First call: original q1/q2 queries from the test split.
nevir(data_nevir)
# Second call: rewritten queries from the JSON file; nevir matches them to the
# test samples by 'id'.
data2_nevir = open_json("NevIR_test_final.json")
nevir(data_nevir, data2_nevir)

Evaluating NevIR original Dataset.


Evaluating BM25 performance on NevIR: 100%|██████████| 1383/1383 [00:01<00:00, 1272.08it/s]


BM25 Pairwise Accuracy on NevIR: 48.16%
Evaluating NevIR queries rewritten.


Evaluating BM25 performance on NevIR: 100%|██████████| 1383/1383 [00:01<00:00, 1314.15it/s]

BM25 Pairwise Accuracy on NevIR: 48.81%





0.4880694143167028

In [11]:
# Get the corpus and load it into a dictionary based on title (yet to decide if this is the best)
def get_corpus(filename):
    """Load a JSON-lines corpus into a dict keyed by document title.

    Each non-blank line of ``filename`` must be a JSON object with the keys
    'title' and 'text'. The text is tokenized once here so it does not have to
    be tokenized again later.

    Args:
        filename: Path of the JSON-lines file.

    Returns:
        Dict mapping title -> {"text": raw text, "tokens": tokenized text}.
        When a title occurs more than once, the last occurrence wins.
    """
    corpus = {}
    # Read as UTF-8 explicitly (as open_json does) so the result does not
    # depend on the platform's default encoding.
    # First pass only counts the documents so tqdm can show a total.
    with open(filename, "r", encoding="utf-8") as f:
        total_docs = sum(1 for line in f if line.strip())
    with open(filename, "r", encoding="utf-8") as f:
        records = (line for line in f if line.strip())  # skip blank lines
        for line in tqdm(records, total=total_docs):
            d = json.loads(line)
            corpus[d['title']] = {"text": d['text'], "tokens": tokenize(d['text'])}
    return corpus

In [10]:
# Load the dataset that includes rewritten queries, and get the corpus
# Load the dataset that includes rewritten queries, and get the corpus
data_quest = open_json("test_negations_final.json")
# get_corpus tokenizes every document while loading; the recorded run below
# took several minutes for the full file.
corpus = get_corpus("documents.jsonl")

100%|██████████| 325505/325505 [05:52<00:00, 924.11it/s] 


In [14]:
# We do this beforehand because it takes quite some time
# We do this beforehand because it takes quite some time
# The list follows the iteration order of `corpus`, so position i in the BM25
# index corresponds to list(corpus.keys())[i], which is how `quest` maps a
# ranked index back to a title.
tokenized_corpus = [doc["tokens"] for doc in corpus.values()]
bm25 = BM25Okapi(tokenized_corpus)

In [29]:
# Function to determine which docs are relevant
def get_relevance(title, relevance_dict):
    """Tell whether ``title`` is rated as relevant.

    Args:
        title: Document title to look up.
        relevance_dict: Mapping from document title to its rating(s). The
            value may be an iterable of rating strings or a single rating
            string; a missing title or a ``None`` value means "not relevant".

    Returns:
        True when at least one rating is "Definitely relevant" or
        "Likely relevant", otherwise False.
    """
    ratings = relevance_dict.get(title)
    if ratings is None:
        return False
    # A bare string would otherwise be iterated character by character and
    # could never match, so treat it as a single rating.
    if isinstance(ratings, str):
        ratings = (ratings,)
    return any(r in ("Definitely relevant", "Likely relevant") for r in ratings)

In [21]:
# Function for the Quest Dataset (Exclusion yet to be done)

def quest(data, bm25, corpus, baseline=True, exclusion=False, top_k=100):
    """Evaluate BM25 retrieval on the Quest data and report MRR and Accuracy@1.

    Args:
        data: Sequence of samples. Each sample provides the query under
            'original_query_cleaned' (baseline) or 'rewritten_query', and
            optionally sample['metadata']['relevance_ratings'], a mapping from
            document title to its relevance rating(s).
        bm25: BM25 index whose document order matches the iteration order of
            ``corpus``.
        corpus: Dict keyed by document title; only the key order is used here,
            to map a ranked index back to a title.
        baseline: Use the original query when True, the rewritten one otherwise.
        exclusion: Currently unused (the exclusion check is not implemented yet).
        top_k: Number of top-scoring documents inspected per query.

    Returns:
        Tuple ``(mrr, acc_at_1)``; both are 0.0 when ``data`` is empty.

    Raises:
        ValueError: If ``top_k`` is smaller than 1.
    """
    if top_k < 1:
        # scores[-0:] would silently select every document instead of none
        raise ValueError("top_k must be at least 1")

    nr_of_queries = len(data)
    if nr_of_queries == 0:
        # Nothing to average over; avoid dividing by zero
        print("No queries to evaluate.")
        return 0.0, 0.0

    titles = list(corpus.keys())
    query_key = 'original_query_cleaned' if baseline else 'rewritten_query'
    total_rr = 0.0
    correct_at_1 = 0
    for sample in tqdm(data, desc="Evaluating BM25 on Quest Dataset"):
        query = sample[query_key]
        # Relevance ratings give the relevant titles for the query; tolerate a
        # missing or null 'metadata' / 'relevance_ratings' entry
        relevance_ratings = (sample.get('metadata') or {}).get('relevance_ratings') or {}

        # Score the whole corpus and keep the top_k indices, best first
        scores = bm25.get_scores(tokenize(query))
        ranked_indices = np.argsort(scores)[-top_k:][::-1]

        # Reciprocal rank of the first relevant document (0 if none in top_k)
        rr = 0.0
        for rank, idx in enumerate(ranked_indices, start=1):
            if get_relevance(titles[idx], relevance_ratings):
                rr = 1.0 / rank
                break
        total_rr += rr
        # rr is exactly 1.0 only when the best-ranked document is relevant
        if rr == 1.0:
            correct_at_1 += 1

    # Final metrics
    mrr = total_rr / nr_of_queries
    acc_at_1 = correct_at_1 / nr_of_queries

    print(f"Mean Reciprocal Rank (MRR): {mrr:.4f}")
    print(f"Accuracy@1: {acc_at_1:.4f}")
    return mrr, acc_at_1

In [22]:
# Baseline run: original (cleaned) queries
quest(data_quest, bm25, corpus)
# Second run: baseline=False selects the rewritten queries
quest(data_quest, bm25, corpus, False)

Evaluating BM25 on Quest Dataset: 100%|██████████| 207/207 [05:01<00:00,  1.46s/it]


Mean Reciprocal Rank (MRR): 0.0875
Accuracy@1: 0.0338


Evaluating BM25 on Quest Dataset: 100%|██████████| 207/207 [02:08<00:00,  1.61it/s]

Mean Reciprocal Rank (MRR): 0.1389
Accuracy@1: 0.0725





In [23]:
# Load ExcluIR rewritten Dataset
# ExcluIR samples including the rewritten queries (read as UTF-8 JSON)
data_exclu = open_json('ExcluIR_test_manual_final.json')

# ExcluIR corpus (stored as Json with lists)
corpus_exclu = open_json('corpus.json')

In [27]:
# Function for the ExcluIR Dataset (Exclusion yet to be done)
# Corpus will be used later for the exclusion check

def excl(data, corpus, bm25, baseline=True, exclusion=False, top_k=100):
    """Evaluate BM25 retrieval on the ExcluIR data and report MRR and Acc@1.

    Args:
        data: Sequence of samples. Each sample provides the query under
            'question0' (baseline) or 'rewritten_query', and the indices of
            its relevant documents under 'corpus_sub_index'.
        corpus: Currently unused (kept for the planned exclusion check).
        bm25: BM25 index; the positions of its scores are compared against
            the indices in 'corpus_sub_index'.
        baseline: Use the original query when True, the rewritten one otherwise.
        exclusion: Currently unused (the exclusion check is not implemented yet).
        top_k: Number of top-scoring documents inspected per query.

    Returns:
        Tuple ``(mrr, acc_at_1)``; both are 0.0 when ``data`` is empty.

    Raises:
        ValueError: If ``top_k`` is smaller than 1.
    """
    if top_k < 1:
        # scores[-0:] would silently select every document instead of none
        raise ValueError("top_k must be at least 1")

    nr_of_queries = len(data)
    if nr_of_queries == 0:
        # Nothing to average over; avoid dividing by zero
        print("No queries to evaluate.")
        return 0.0, 0.0

    query_key = 'question0' if baseline else 'rewritten_query'
    # Label the printed metrics after the queries that were actually used
    label = "Baseline" if baseline else "Rewritten"
    total_rr = 0.0
    correct_at_1 = 0
    for sample in tqdm(data, desc="Evaluating BM25 on ExcluIR Dataset"):
        query = sample[query_key]
        # Indices of the documents that are relevant for this query
        relevant_indices = set(sample['corpus_sub_index'])

        # Score the whole corpus and keep the top_k indices, best first
        scores = bm25.get_scores(tokenize(query))
        ranked_indices = np.argsort(scores)[-top_k:][::-1]

        # Reciprocal rank of the first relevant document (0 if none in top_k)
        rr = 0.0
        for rank, idx in enumerate(ranked_indices, start=1):
            if idx in relevant_indices:
                rr = 1.0 / rank
                break
        total_rr += rr
        # rr is exactly 1.0 only when the best-ranked document is relevant
        if rr == 1.0:
            correct_at_1 += 1

    # Final metrics
    mrr = total_rr / nr_of_queries
    acc_at_1 = correct_at_1 / nr_of_queries

    print(f"{label} BM25 MRR: {mrr:.4f}")
    print(f"{label} BM25 Acc@1: {acc_at_1:.4f}")
    return mrr, acc_at_1


In [24]:
# We do this beforehand because it takes quite some time
# We do this beforehand because it takes quite some time
# The index keeps the order of corpus_exclu, so a score position equals the
# document's position in that list; `excl` compares those positions with each
# sample's 'corpus_sub_index' (presumably positions in this same corpus; confirm).
tokenized_corpus_exclu = [tokenize(doc) for doc in corpus_exclu]
bm25_exclu = BM25Okapi(tokenized_corpus_exclu)

In [28]:
# Baseline run: original queries ('question0')
excl(data_exclu, corpus_exclu, bm25_exclu)
# Second run: baseline=False selects the rewritten queries
excl(data_exclu, corpus_exclu, bm25_exclu, False)

Evaluating BM25 on ExcluIR Dataset: 100%|██████████| 3452/3452 [26:21<00:00,  2.18it/s]


Baseline BM25 MRR: 0.8996
Baseline BM25 Acc@1: 0.8627


Evaluating BM25 on ExcluIR Dataset: 100%|██████████| 3452/3452 [18:44<00:00,  3.07it/s]

Baseline BM25 MRR: 0.7914
Baseline BM25 Acc@1: 0.7335



