# Imports
The following imports are required to run the code in the notebook, look at the comments for any necessary explanation on the package.

In [2]:
# Imports
import json # JSON Package ( Used for opening datasets, corpus )
import bm25s # The BM25 Sparse Retrieval Implementation ( Used for Retrieval of Doc Scores )
import nltk  #  Natural Language ToolKit ( Used for Tokenization )
from tqdm import tqdm # Progress Bar ( Used for Showing Progress of Functions )
import numpy as np # Scientific Computing ( Used for Mathematical Functions )

# Download Natural Language Toolkit tokenizer model(s) for word_tokenizer()
# NOTE : Change to 'quiet=False'  for any troubleshooting
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
print("Startup completed.")

Startup completed.


# Helper Functions
The following functions are helper functions - Look below to see the functionality of each Function and/or look at the comments within the function.
- open_json() : For opening a JSON file, such as the dataset or the corpus.
- tokenize() : For splitting text into tokens, required for making a BM25 retriever. Uses the punk(_tab) tokenizer model by NLTK.
- get_quest_corpus() : Corpus Loader for the QUEST Dataset Corpus - It is made in a special format, that cannot be easily stored with the simple open_json() function as we require the title separate from the text.
- apply_exclusion() : Applies a boost to, penalty to or zeros out scores for documents containing one of the terms in the exclusion_criteria list.


In [3]:
# JSON loader - For opening a JSON file, such as the dataset or the corpus
def open_json(filename):
    """
    Json File Opener

    Args:
        filename (string): the name of the json file to be opened
    Returns:
        dict: a dictionary of the json file
    """
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)

# Tokenizer by NLTK - For splitting text into Tokens - Uses punkt(_tab) tokenizer model
def tokenize(text):
    """
    Tokenizer by NLTK

    Args:
        text (str): document or query text to be tokenized
    Returns:
        list[tokens]: tokenized document or query text
    """
    return nltk.word_tokenize(text.lower())

# Corpus Loader for the Quest Dataset Corpus - It is made in a special format, that cannot be easily stored with the simple open_json() function
def get_quest_corpus(filename):
    """
    Corpus Loader for the Quest Dataset Corpus

    Args:
        filename (string): the name of the json file containing the QUEST corpus
    Returns:
        list[titles]: list of titles from QUEST corpus
        list[documents]: list of documents from QUEST corpus
    """
    titles = []
    documents = []
    print("Loading Quest Corpus...")
    with open(filename, "r") as f:
            for line in tqdm(f):
                doc  = json.loads(line)
                titles.append(doc.get('title', ''))
                documents.append(doc.get('text', ''))
    return titles, documents

# Applies a boost to, penalty to or zeros out scores for documents containing one of the terms in the exclusion_criteria list.
def apply_exclusion(scores, documents, exclusion_criteria, mode, weight=0.5):
    """
    Rescores scores based on whether documents include exclusion_criteria terms. Rescoring is based on weight (0.5 for this research) and a mode (check args).

    Args:
        scores (list): a list of scores
        documents (list): a list of document text
        exclusion_criteria (list): a list of exclusion_criteria terms
        mode (string):
            'boost' - Boost the score of documents including exclusion criteria
            'penalize' - Penalize the score of documents including exclusion criteria
            'filter' - 'Filter out' documents including exclusion criteria (zero their scores out)
        weight (float): weight of exclusion mode impact boost/penalize
            1.0 -- Doubles score for Boost, zero's score for Penalize - Extreme
            0.5 -- Average weight impact - What we use in this Research!
            0.0 -- No weight impact - boost/penalize don't do anything
    Returns:
        list[float]: a list of scores after applying exclusion
    """
    # If exclusion criteria not provided, don't alter the scores
    if not exclusion_criteria:
        return scores
    # Check all documents for exclusion_criteria terms and prepare list for applying weighted mode.
    matches = np.array([
        any(criteria.lower() in doc.lower() for criteria in exclusion_criteria) for doc in documents
    ], dtype=bool)
    if mode == 'boost':
        scores[matches] *= (1 + weight)
    elif mode == 'penalize':
        scores[matches] *= (1 - weight)
    elif mode == 'filter':
        scores[matches] = 0.0
    return scores

# Evaluation Function NEVIR

In our Research this function is used to test the performance of the BM25 retriever on the NEVIR dataset with or without our applied strategy.
1. Baseline
2. Rewritten with LLM
3. Rewritten with LLM, boosting documents containing exclusion criteria terms provided by LLM
4. Rewritten with LLM, penalizing documents containing exclusion criteria terms provided by LLM
5. Rewritten with LLM, filtering out (making score 0) documents containing exclusion criteria terms provided by LLM

In [103]:
def eval_nevir(data, baseline=True, exclusion=False, mode='boost', weight=0.5):
    """
    Evaluation function for NevIr that can run on different modes:
        - Baseline
        - Rewritten
        - Rewritten with Boost Exclusion
        - Rewritten with Penalize Exclusion
        - Rewritten with Filter Exclusion
    Args:
        data (dict): a dictionary of the json file containing the dataset
        baseline (bool): if True, we are doing baseline testing, otherwise we are using rewritten queries
        exclusion (bool): if True, we take the exclusion criteria and use the apply_exclusion function on the scores
        mode (string): decides the exclusion mode
        weight (float): decides the impact of the exclusion mode (except for filter - 0.5 for our Research)
    Returns:
        pairwise_accuracy (float): Pairwise accuracy of the entire dataset with the BM25 sparse retriever
    """
    # Variables to calculate final pairwise_accuracy metric
    correct = 0
    total = 0
    print(f"Evaluating NevIR | Rewritten: {not baseline} | Exclusion: {exclusion} | Mode: {mode} | Weight: {weight}")
    # Loop through all dataset samples
    for sample in tqdm(data):
        # Documents are always standard
        docs = [sample['doc1'], sample['doc2']]
        # If baseline, take original queries
        if baseline:
            q1 = sample['q1']
            q2 = sample['q2']
        # Otherwise take the rewritten querie
        else:
            q1 = sample['rewritten_query_q1']
            q2 = sample['rewritten_query_q2']

        # Tokenize the documents and initialize a bm25 instance
        tokenized_docs = [tokenize(d) for d in docs]
        bm25 = bm25s.BM25()
        bm25.index(tokenized_docs, show_progress=False)

        # Tokenize and then score each query
        scores_q1 = bm25.get_scores(tokenize(q1))
        scores_q2 = bm25.get_scores(tokenize(q2))

        # On exclusion, we take the exclusion criteria's for both queries and apply the exclusion helper function to both scores
        if exclusion and not baseline:
            exclusion_q1 = sample['exclusion_criteria_q1']
            exclusion_q2 = sample['exclusion_criteria_q2']
            scores_q1 = apply_exclusion(scores_q1, docs, exclusion_q1, mode, weight)
            scores_q2 = apply_exclusion(scores_q2, docs, exclusion_q2, mode, weight)

        # Pairwise accuracy, so if both scores are correct we count it as a correct result for this entry
        if (np.argmax(scores_q1) == 0) and (np.argmax(scores_q2) == 1):
            correct += 1

        # Total is always +1 for every entry
        total += 1

    pairwise_accuracy = correct / total
    print(f"Pairwise Accuracy for this run: {pairwise_accuracy * 100:.2f}%")
    return pairwise_accuracy

# NEVIR Testing

- First, we load the NEVIR dataset.
- Afterwards, we call the evaluate function for the 5 scenario's described above in this document.

*Please note that weight=0.5 is used throughout all parts of our research*

In [101]:
nevir_dataset = open_json("NevIR_test_final.json")

In [102]:
eval_nevir(nevir_dataset)
eval_nevir(nevir_dataset, baseline=False)
eval_nevir(nevir_dataset, baseline=False, exclusion=True, mode='boost', weight=0.5)
eval_nevir(nevir_dataset, baseline=False, exclusion=True, mode='penalize', weight=0.5)
eval_nevir(nevir_dataset, baseline=False, exclusion=True, mode='filter', weight=0.5)

Evaluating NevIR | Rewritten: False | Exclusion: False | Mode: boost | Weight: 0.5


100%|██████████| 1383/1383 [00:01<00:00, 1171.55it/s]


Pairwise Accuracy for this run: 4.70%
Evaluating NevIR | Rewritten: True | Exclusion: False | Mode: boost | Weight: 0.5


100%|██████████| 1383/1383 [00:01<00:00, 1210.47it/s]


Pairwise Accuracy for this run: 8.60%
Evaluating NevIR | Rewritten: True | Exclusion: True | Mode: boost | Weight: 0.5


100%|██████████| 1383/1383 [00:01<00:00, 1205.03it/s]


Pairwise Accuracy for this run: 8.60%
Evaluating NevIR | Rewritten: True | Exclusion: True | Mode: penalize | Weight: 0.5


100%|██████████| 1383/1383 [00:01<00:00, 1202.20it/s]


Pairwise Accuracy for this run: 8.75%
Evaluating NevIR | Rewritten: True | Exclusion: True | Mode: filter | Weight: 0.5


100%|██████████| 1383/1383 [00:01<00:00, 1206.39it/s]

Pairwise Accuracy for this run: 9.62%





0.09616775126536514

# Evaluation Function QUEST

In our Research this function is used to test the performance of the BM25 retriever on the QUEST dataset with or without our applied strategy.
1. Baseline
2. Rewritten with LLM
3. Rewritten with LLM, boosting documents containing exclusion criteria terms provided by LLM
4. Rewritten with LLM, penalizing documents containing exclusion criteria terms provided by LLM
5. Rewritten with LLM, filtering out (making score 0) documents containing exclusion criteria terms provided by LLM

In [104]:
def eval_quest(data, retriever, documents, titles, baseline=True, exclusion=False, mode='penalize', weight=0.5, top_k=100):
    """
    Evaluation function for QUEST that can run on different modes:
        - Baseline
        - Rewritten
        - Rewritten with Boost Exclusion
        - Rewritten with Penalize Exclusion
        - Rewritten with Filter Exclusion
    Args:
        data (dict): a dictionary of the json file containing the dataset
        baseline (bool): if True, we are doing baseline testing, otherwise we are using rewritten queries
        exclusion (bool): if True, we take the exclusion criteria and use the apply_exclusion function on the scores
        mode (string): decides the exclusion mode
        weight (float): decides the impact of the exclusion mode (except for filter - 0.5 for our Research)
        top_k (int): decides the number of highest-scoring documents to take into account when calculating the final metrics
    Returns:
        avg_f1 (float): Average F1 score over all queries in the dataset
        avg_prec (float): Average precision score over all queries in the dataset
        avg_rec (float): Average recall score over all queries in the dataset
    """
    # Variables to calculate final metrics (taken from QUEST paper)
    f1_list = []
    prec_list = []
    rec_list = []
    print(f"Evaluating QUEST | Rewritten: {not baseline} | Exclusion: {exclusion} | Mode: {mode} | Weight: {weight}")

    # If baseline, take original queries
    if baseline:
        queries = [sample.get('original_query_cleaned') for sample in data]
    # Otherwise take the rewritten queries
    else:
        queries = [sample.get('rewritten_query') for sample in data]

    query_tokens = [tokenize(query) for query in tqdm(queries, desc='Tokenizing Queries')]
    # Take a larger initial top_k selection to make sure we can apply scoring boost/penalties and after that take the actual top_k
    initial_k = top_k * 10 if (exclusion and not baseline) else top_k
    # Get indices and scores for all queries
    results_indices, results_scores = retriever.retrieve(query_tokens, k=initial_k)

    # Loop through all dataset samples
    for i, sample in tqdm(enumerate(data), total=len(data), desc="Calculating Metrics"):
        # Get the indices and scores for this sample
        top_indices = results_indices[i]
        top_scores = results_scores[i]
        # On exclusion, we take the exclusion criteria's
        if exclusion and not baseline:
            excl_criteria = sample.get('exclusion_criteria', [])
            if excl_criteria and documents:
                # Take the documents contained in the top_k*100 indices, and use the helper function to get new scores
                local_docs = [documents[idx] for idx in top_indices]
                local_scores = top_scores.astype(float)
                local_scores = apply_exclusion(local_scores, local_docs, excl_criteria, mode, weight)
                # Sort new scores and take the top_k (Standard 100)
                sorted_local_scores = np.argsort(local_scores)[::-1]
                top_indices = top_indices[sorted_local_scores][:top_k]
        # Everything below this is 'indirectly' copied from the QUEST dataset code
        # https://github.com/google-research/language/blob/master/language/quest/eval/run_eval.py
        # The gold_titles in our case are the titles of documents that are relevant for our query
        gold_titles = set(sample['docs'])
        # The predicted titles in our case are the titles (top_k) predicted to be relevant for our query by the bm25 retriever
        retrieved_titles = [titles[idx] for idx in top_indices]
        predicted_titles = set(retrieved_titles)
        # Calculates the metrics
        tp = len(gold_titles.intersection(predicted_titles))
        fp = len(predicted_titles.difference(gold_titles))
        fn = len(gold_titles.difference(predicted_titles))
        if tp:
            prec = tp / (tp + fp)
            rec = tp / (tp + fn)
            f1 = 2 * prec * rec / (prec + rec)
        else:
            prec = 0.0
            rec = 0.0
            f1 = 0.0

        f1_list.append(f1)
        prec_list.append(prec)
        rec_list.append(rec)
    # Take the mean over all queries
    avg_f1 = np.mean(f1_list) if f1_list else 0.0
    avg_prec = np.mean(prec_list) if prec_list else 0.0
    avg_rec = np.mean(rec_list) if rec_list else 0.0

    print(f"Quest Results --> F1: {avg_f1:.4f} | PREC: {avg_prec:.4f} | REC: {avg_rec:.4f}")
    print("----------------------------------------------------------------------------------------------------------------------------")
    return avg_f1, avg_prec, avg_rec

# QUEST Testing

- First, we load the QUEST dataset.
- Then we use the helper to load the corpus titles and documents.
- Then we initialize a BM25 instance on the corpus documents tokenized using NLTK tokenizer.
- Afterwards, we call the evaluate function for the 5 scenario's described above in this document.

*Please note that weight=0.5 is used throughout all parts of our research*<br>
*Also note that k=100 is chosen from the original QUEST research paper*

In [105]:
# Load the dataset that includes rewritten queries
quest_dataset = open_json("test_negations_final.json")
# Load the corpus titles and documents into lists
quest_titles, quest_documents = get_quest_corpus("documents.jsonl")
# Initialize a bm25 instance on the tokenized corpus documents
quest_bm25 = bm25s.BM25()
quest_tokenized_docs = [tokenize(doc) for doc in tqdm(quest_documents)]
quest_bm25.index(quest_tokenized_docs)

Loading Quest Corpus...


325505it [00:02, 117244.23it/s]
100%|██████████| 325505/325505 [05:37<00:00, 965.26it/s] 
                                                                                           

In [106]:
eval_quest(quest_dataset, quest_bm25, quest_documents, quest_titles)
eval_quest(quest_dataset, quest_bm25, quest_documents, quest_titles, False)
eval_quest(quest_dataset, quest_bm25, quest_documents, quest_titles, False, True, mode='boost', weight=0.5, top_k=100)
eval_quest(quest_dataset, quest_bm25, quest_documents, quest_titles, False, True, mode='penalize', weight=0.5, top_k=100)
eval_quest(quest_dataset, quest_bm25, quest_documents, quest_titles, False, True, mode='filter', weight=0.5, top_k=100)

Evaluating QUEST | Rewritten: False | Exclusion: False | Mode: penalize | Weight: 0.5


Tokenizing Queries: 100%|██████████| 207/207 [00:00<00:00, 38987.87it/s]
Calculating Metrics: 100%|██████████| 207/207 [00:00<00:00, 46819.51it/s]


Quest Results --> F1: 0.0254 | PREC: 0.0143 | REC: 0.1180
----------------------------------------------------------------------------------------------------------------------------
Evaluating QUEST | Rewritten: True | Exclusion: False | Mode: penalize | Weight: 0.5


Tokenizing Queries: 100%|██████████| 207/207 [00:00<00:00, 61922.90it/s]
Calculating Metrics: 100%|██████████| 207/207 [00:00<00:00, 44058.71it/s]


Quest Results --> F1: 0.0366 | PREC: 0.0207 | REC: 0.1677
----------------------------------------------------------------------------------------------------------------------------
Evaluating QUEST | Rewritten: True | Exclusion: True | Mode: boost | Weight: 0.5


Tokenizing Queries: 100%|██████████| 207/207 [00:00<00:00, 61280.42it/s]
Calculating Metrics: 100%|██████████| 207/207 [00:02<00:00, 70.48it/s]


Quest Results --> F1: 0.0313 | PREC: 0.0177 | REC: 0.1717
----------------------------------------------------------------------------------------------------------------------------
Evaluating QUEST | Rewritten: True | Exclusion: True | Mode: penalize | Weight: 0.5


Tokenizing Queries: 100%|██████████| 207/207 [00:00<00:00, 53729.87it/s]
Calculating Metrics: 100%|██████████| 207/207 [00:02<00:00, 71.88it/s]


Quest Results --> F1: 0.0348 | PREC: 0.0197 | REC: 0.1854
----------------------------------------------------------------------------------------------------------------------------
Evaluating QUEST | Rewritten: True | Exclusion: True | Mode: filter | Weight: 0.5


Tokenizing Queries: 100%|██████████| 207/207 [00:00<00:00, 60659.61it/s]
Calculating Metrics: 100%|██████████| 207/207 [00:02<00:00, 72.14it/s]

Quest Results --> F1: 0.0343 | PREC: 0.0194 | REC: 0.1834
----------------------------------------------------------------------------------------------------------------------------





(np.float64(0.03432507627850995),
 np.float64(0.019414575313126035),
 np.float64(0.1834254767814691))

# Evaluation Function EXCLUIR

In our Research this function is used to test the performance of the BM25 retriever on the EXCLUIR dataset with or without our applied strategy.
1. Baseline
2. Rewritten with LLM
3. Rewritten with LLM, boosting documents containing exclusion criteria terms provided by LLM
4. Rewritten with LLM, penalizing documents containing exclusion criteria terms provided by LLM
5. Rewritten with LLM, filtering out (making score 0) documents containing exclusion criteria terms provided by LLM

In [4]:
def eval_excl(data, retriever, corpus_list, baseline=True, exclusion=False, mode='penalize', weight=0.5):
    """
    Evaluation function for EXCLUIR that can run on different modes:
        - Baseline
        - Rewritten
        - Rewritten with Boost Exclusion
        - Rewritten with Penalize Exclusion
        - Rewritten with Filter Exclusion
    Args:
        data (dict): a dictionary of the json file containing the dataset
        baseline (bool): if True, we are doing baseline testing, otherwise we are using rewritten queries
        exclusion (bool): if True, we take the exclusion criteria and use the apply_exclusion function on the scores
        mode (string): decides the exclusion mode
        weight (float): decides the impact of the exclusion mode (except for filter - 0.5 for our Research)
    Returns:
        avg_r1 (float): average R@1 score  (Recall @ 1)
        avg_mrr (float): average MRR@10 score (MRR @ 10)
        avg_rr (float): average RR score (RightRank over top half of highest scores)
    """
    # Variables to calculate final metrics (taken from EXCLUIR paper)
    rec1_list = []
    mrr10_list = []
    rr_list = []
    print(f"Evaluating EXCLUIR | Rewritten: {not baseline} | Exclusion: {exclusion} | Mode: {mode} | Weight: {weight}")
    # If baseline, take original queries
    if baseline:
        queries = [sample.get('RQ_rewrite') for sample in data]
    # Otherwise take the rewritten queries
    else:
        queries = [sample.get('rewritten_query') for sample in data]

    query_tokens = [tokenize(query) for query in tqdm(queries, desc='Tokenizing Queries')]
    # Take a larg initial top_k selection to make sure we can apply scoring boost/penalties and still accurately represent @10 or @100 etc. metrics
    initial_k = round(len(corpus_list))
    # Get indices and scores for all queries
    results_indices, results_scores = retriever.retrieve(query_tokens, k=initial_k)

    # Loop through all dataset samples
    for i, sample in tqdm(enumerate(data), total=len(data), desc="Calculating metrics"):
        # Get the indices and scores for this sample
        top_indices = results_indices[i]
        top_scores = results_scores[i]
        # On exclusion, we take the exclusion criteria's
        if exclusion and not baseline:
            excl_criteria = sample.get('exclusion_criteria', [])
            if excl_criteria and corpus_list:
            	# Take the documents contained in the candidate indices (half of corpus), and use the helper function to get new scores
                local_docs = [corpus_list[idx] for idx in top_indices]
                local_scores = top_scores.astype(float)
                local_scores = apply_exclusion(local_scores, local_docs, excl_criteria, mode, weight)
                # Sort new scores and take their indices
                sorted_local_scores = np.argsort(local_scores)[::-1]
                top_indices = top_indices[sorted_local_scores]

        # Take the negative document index and positive document index from the EXCLUIR data sample (bad vs good document)
        neg_idx = int(sample['corpus_sub_index'][0])
        pos_idx = int(sample['corpus_sub_index'][1])
        # Find if and where they are in the top_indices (if not they are on index 'infinite)
        try:
            neg_rank = np.where(top_indices == neg_idx)[0][0]
        except IndexError:
            neg_rank = float('inf')
        try:
            pos_rank = np.where(top_indices == pos_idx)[0][0]
        except IndexError:
            pos_rank = float('inf')

        # If good document in top1 spot, recall for this query is 1, otherwise 0
        rec1 = 1.0 if pos_rank == 0 else 0.0
        rec1_list.append(rec1)

        # If good document in top10 spot, mrr is 1/(rank+1), if outside spot 0
        mrr = 0.0
        if pos_rank < 10:
            mrr = 1.0 / (pos_rank + 1)
        mrr10_list.append(mrr)

        # If good document ranked better than bad document RR is 0, otherwise 0
        rr_val = 1.0 if pos_rank < neg_rank else 0.0
        rr_list.append(rr_val)
    # Take the mean over all queries
    avg_r1 = np.mean(rec1_list)
    avg_mrr = np.mean(mrr10_list)
    avg_rr = np.mean(rr_list)

    print(f"EXCLUIR Results --> R@1: {avg_r1:.4f} | MRR@10: {avg_mrr:.4f} | RightRank: {avg_rr:.4f}")
    print("----------------------------------------------------------------------------------------------------------------------------")
    return avg_r1, avg_mrr, avg_rr


# EXCLUIR Testing

- First, we load the EXCLUIR dataset.
- Then we use the helper function to load the documents from the corpus.
- Then we initialize a BM25 instance on the corpus documents tokenized using NLTK tokenizer.
- Afterwards, we call the evaluate function for the 5 scenario's described above in this document.

*Please note that weight=0.5 is used throughout all parts of our research*

In [5]:
# Load the dataset that includes rewritten queries
excluir_data = open_json('ExcluIR_test_manual_final.json')
# Load the corpus that includes documents
excluir_documents = open_json('corpus.json')
# Initialize a bm25 instance on the tokenized corpus documents
excluir_bm25 = bm25s.BM25()
excluir_tokenized = [tokenize(doc) for doc in tqdm(excluir_documents)]
excluir_bm25.index(excluir_tokenized)

100%|██████████| 90406/90406 [00:15<00:00, 5869.96it/s]
                                                                                          

In [6]:
eval_excl(excluir_data, excluir_bm25, excluir_documents)
eval_excl(excluir_data, excluir_bm25, excluir_documents, False)
eval_excl(excluir_data, excluir_bm25, excluir_documents, False, True, mode='boost', weight=0.5)
eval_excl(excluir_data, excluir_bm25, excluir_documents, False, True, mode='penalize', weight=0.5)
eval_excl(excluir_data, excluir_bm25, excluir_documents, False, True, mode='filter', weight=0.5)

Evaluating EXCLUIR | Rewritten: False | Exclusion: False | Mode: penalize | Weight: 0.5


Tokenizing Queries: 100%|██████████| 3452/3452 [00:00<00:00, 21036.33it/s]
Calculating metrics: 100%|██████████| 3452/3452 [00:00<00:00, 14341.92it/s]


EXCLUIR Results --> R@1: 0.4983 | MRR@10: 0.6529 | RightRank: 0.5397
----------------------------------------------------------------------------------------------------------------------------
Evaluating EXCLUIR | Rewritten: True | Exclusion: False | Mode: penalize | Weight: 0.5


Tokenizing Queries: 100%|██████████| 3452/3452 [00:00<00:00, 27145.41it/s]
Calculating metrics: 100%|██████████| 3452/3452 [00:00<00:00, 16227.98it/s]


EXCLUIR Results --> R@1: 0.6347 | MRR@10: 0.7167 | RightRank: 0.8441
----------------------------------------------------------------------------------------------------------------------------
Evaluating EXCLUIR | Rewritten: True | Exclusion: True | Mode: boost | Weight: 0.5


Tokenizing Queries: 100%|██████████| 3452/3452 [00:00<00:00, 33257.47it/s]
Calculating metrics: 100%|██████████| 3452/3452 [08:11<00:00,  7.03it/s]


EXCLUIR Results --> R@1: 0.6188 | MRR@10: 0.7182 | RightRank: 0.7590
----------------------------------------------------------------------------------------------------------------------------
Evaluating EXCLUIR | Rewritten: True | Exclusion: True | Mode: penalize | Weight: 0.5


Tokenizing Queries: 100%|██████████| 3452/3452 [00:00<00:00, 31150.47it/s]
Calculating metrics: 100%|██████████| 3452/3452 [08:19<00:00,  6.91it/s]


EXCLUIR Results --> R@1: 0.4893 | MRR@10: 0.5634 | RightRank: 0.8943
----------------------------------------------------------------------------------------------------------------------------
Evaluating EXCLUIR | Rewritten: True | Exclusion: True | Mode: filter | Weight: 0.5


Tokenizing Queries: 100%|██████████| 3452/3452 [00:00<00:00, 24443.78it/s]
Calculating metrics: 100%|██████████| 3452/3452 [07:41<00:00,  7.47it/s]

EXCLUIR Results --> R@1: 0.4389 | MRR@10: 0.4946 | RightRank: 0.7784
----------------------------------------------------------------------------------------------------------------------------





(np.float64(0.4388760139049826),
 np.float64(0.4946250390847725),
 np.float64(0.7783893395133256))