In [18]:
!pip install ir_datasets
!pip install ir_measures
!pip install deep_translator
!pip install razdel
!pip install stopwordsiso
!pip3 install "ir-measures[cwl_eval]"

Collecting deep_translator
  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: deep_translator
Successfully installed deep_translator-1.11.4


In [19]:
import ir_datasets
from collections import defaultdict
from tqdm.notebook import tqdm
from collections import Counter
from deep_translator import GoogleTranslator
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import string
import ir_measures
from ir_measures import nDCG, MAP, RBP, Recall, Qrel, ScoredDoc
from itertools import chain
from razdel import tokenize
from nltk.stem.snowball import SnowballStemmer
import string
import pickle
import matplotlib.pyplot as plt



### Load data

In [8]:
# loading dataset and qrels for further evaluation
dataset = ir_datasets.load("neuclir/1/ru/trec-2023")
qrels = [(qrel.query_id, qrel.doc_id, qrel.relevance) for qrel in dataset.qrels_iter()]

In [9]:
# extracting documents, queries and qrels
russian_documents = [(doc.doc_id, doc.title, doc.text) for doc in tqdm(dataset.docs_iter())]
english_queries = [(query.query_id, query.title) for query in dataset.queries_iter()]
qrels = [(qrel.query_id, qrel.doc_id, qrel.relevance) for qrel in dataset.qrels_iter()]

0it [00:00, ?it/s]

[INFO] If you have a local copy of https://huggingface.co/datasets/neuclir/neuclir1/resolve/main/data/rus-00000-of-00001.jsonl.gz?download=true, you can symlink it here to avoid downloading it again: /root/.ir_datasets/downloads/3aabc798a3b5dd92d7c47db9521870b1
[INFO] [starting] https://huggingface.co/datasets/neuclir/neuclir1/resolve/main/data/rus-00000-of-00001.jsonl.gz?download=true

https://huggingface.co/datasets/neuclir/neuclir1/resolve/main/data/rus-00000-of-00001.jsonl.gz?download=true: 0.0%| 0.00/4.50G [00:00<?, ?B/s][A
https://huggingface.co/datasets/neuclir/neuclir1/resolve/main/data/rus-00000-of-00001.jsonl.gz?download=true: 0.3%| 11.5M/4.50G [00:00<00:39, 114MB/s][A
https://huggingface.co/datasets/neuclir/neuclir1/resolve/main/data/rus-00000-of-00001.jsonl.gz?download=true: 0.6%| 26.7M/4.50G [00:00<00:33, 133MB/s][A
https://huggingface.co/datasets/neuclir/neuclir1/resolve/main/data/rus-00000-of-00001.jsonl.gz?download=true: 1.0%| 43.7M/4.50G [00:00<00:30, 145MB/s][A
ht

In [10]:
# filtering out the documents that are present in qrels (for which relevance feedback is available)
qrels_ids = {entry[1] for entry in tqdm(qrels)}
russian_documents_subset = [doc for doc in tqdm(russian_documents) if doc[0] in qrels_ids]
len(russian_documents_subset)

  0%|          | 0/25634 [00:00<?, ?it/s]

  0%|          | 0/4627543 [00:00<?, ?it/s]

24871

### Helper functions

In [38]:
def evaluate(qrels, result):

    runs = [
        ScoredDoc(query_id=query_id, doc_id=doc_id, score=score)
        for query_id, doc_id, score in result
    ]

    metrics = [
        ir_measures.nDCG @ 20,   # nDCG@20
        ir_measures.AP,          # Average Precision
        #ir_measures.RBP(rel=1),  # Relevance Based Precision
        ir_measures.R @ 100,     # Recall@100
        ir_measures.R @ 1000     # Recall@1000
    ]

#     scores = ir_measures.calc_aggregate([nDCG@20, MAP, RBP(rel=1), Recall@100, Recall@1000], qrels, runs)
    scores = ir_measures.calc_aggregate([nDCG@20, MAP, Recall@100, Recall@1000], qrels, runs)

    return scores

def print_document(document_id, collection):
    print(next((doc for doc in collection if doc[0] == document_id), None))

    from collections import defaultdict

def rbp(retrieved_results, qrels, p=0.8):
    # Create a dictionary of relevant documents for each query from qrels
    qrels_dict = defaultdict(set)

    # Unpack qrels and store relevant documents per query_id
    for qrel in qrels:
        query_id = qrel.query_id
        doc_id = qrel.doc_id
        relevance = qrel.relevance
        if relevance > 0:
            qrels_dict[query_id].add(doc_id)

    # Group the retrieved results by query_id
    grouped_retrieved_results = defaultdict(list)
    for query_id, doc_id, score in retrieved_results:
        grouped_retrieved_results[query_id].append((doc_id, score))

    # Initialize the rbp_scores dictionary
    rbp_scores = {}

    # Iterate over the grouped retrieved results
    for query_id, retrieved_docs in grouped_retrieved_results.items():
        score = 0
        # Rank each document for the query
        for rank, (doc_id, _) in enumerate(retrieved_docs, start=1):
            if doc_id in qrels_dict.get(query_id, set()):
                score += (1 - p) * (p ** (rank - 1))
        rbp_scores[query_id] = score

    return rbp_scores

def translate_query(query):
    translated_text = GoogleTranslator(source='auto', target='ru').translate(query[1])
    translated_tuple = (query[0], translated_text)

    return translated_tuple


def combine_documents(documents):
    combine_documents =  list(map(lambda doc: (doc[0], doc[1] + ' ' + doc[2]), documents))
    return combine_documents


import stopwordsiso




def preprocess(text, remove_stop: bool=True) -> list:
    """
    :text: str, text of the corresponding document
    :param remove_stop: bool indicating if stopwords should be removed (default True)

    :return: list(str) of tokens, stemmed, with removed punctuation
    """

    tokens = [_.text for _ in list(tokenize(text))]

    preprocessed =[]

    stemmer = SnowballStemmer("russian")
    stopwords_ru = stopwordsiso.stopwords("ru")
    punct = string.punctuation + "«»" + "—" + '–'

    for t in tokens:
        if t in punct:
            continue
        if remove_stop and t.lower() in stopwords_ru:
            continue
        preprocessed.append(stemmer.stem(t))

    return preprocessed





In [15]:
# To search on both document title and text, we will concatenate them, obtaining pairs (document_id, full text (title+text))
combined_documents = combine_documents(russian_documents_subset)
print(combined_documents[-1])

('20bc2d7b-91d1-428b-aa54-ef112e8a1e69', 'Экс-замминобороны США: ядерную войну запустит искусственный интеллект Вашингтон, , 06:51 — REGNUM О существенной опасности искусственного интеллекта в случае использования его военными заявил бывший заместитель министра обороны США Роберт Уорк, передает Breaking Defense.\n\nИскусственный интеллект Цитата из к/ф «Космическая одиссея 2001 года». Реж. Стэнли Кубрик. 1968. США — Великобритания\n\nПо мнению бывшего сотрудника Пентагона, если допустить искусственный интеллект (ИИ) к принятию решений, то существует вероятность, что имеющиеся риски будут оценены им неверно, и ядерное оружие будет применено в ситуации, когда реальной угрозы не будет существовать.\n\n«Представьте, что в системе управления ядерным оружием есть прогностическая система ИИ, которая запускается по определенным параметрам, — сказал он. — Это гораздо, гораздо, гораздо более тревожная перспектива, чем все опасности, что вы можете представить в связи с применением индивидуального

In [21]:
translated_queries = []
for query in english_queries:
  translated_queries.append(translate_query(query))

**TF-IDF**

For the implementation of TF-IDF approach we will employ TfidfVectorizer from scikit-learn. Since it offers incorporation of tokenization and stopword removal, we can pass our translated queries and documents directly.

We want to first create TF-IDF matrix based on our collection and then be able to pass queries and for each query get a list of documents with their respective cosine similarities ranked in descending order. For that, we will create a class of TF-IDF model with a ranking method.

The initial version of TF-IDF model includes default parameters of scikit-learn implementation:
- use_idf = True for enabling idf reweighting
- sublinear_tf meaning that tf scores are presented as the raw term frequencies

In [28]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download('punkt_tab')
nltk.download('stopwords')


class TfIdfModel:
    def __init__(self, collection):
        # extracting  document IDs and texts
        self.doc_ids, doc_texts = zip(*collection)

        russian_stopwords = list(stopwords.words('russian'))

        # initializing vectorizer that can be applied to tokenized documents
        self.vectorizer = TfidfVectorizer(
            tokenizer=word_tokenize,  # Use NLTK's word_tokenize
            stop_words=russian_stopwords,  # Filter out Russian stopwords
            token_pattern=None  # Disable the internal token pattern to ensure external tokenizer is used
        )

        # fitting the vectorizer on the collection texts and transform
        self.tfidf_matrix = self.vectorizer.fit_transform(doc_texts)

    def rank_documents(self, query, rank_length):
        # extracting query ID and text
        query_id, query_text = query

        # transforming the query text into the vector space
        query_vector = self.vectorizer.transform([query_text])

        # Compute cosine similarities between query vector and precomputed matrix
        cosine_similarities = cosine_similarity(query_vector, self.tfidf_matrix).flatten()

        # Pair each document ID with its corresponding cosine similarity score
        scored_documents = [(doc_id, similarity) for doc_id, similarity in zip(self.doc_ids, cosine_similarities)]

        # Sort documents by similarity score in descending order
        sorted_documents = sorted(scored_documents, key=lambda x: x[1], reverse=True)

        # Return ranked data
        ranked_data = [(query_id, uuid, value) for uuid, value in sorted_documents[:rank_length]]
        return ranked_data

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [29]:
# we will fit the model on out document collection
model = TfIdfModel(combined_documents)

In [30]:
ranked_tfidf = []

for query in tqdm(translated_queries):
  rank_per_query = model.rank_documents(query, 1000)
  ranked_tfidf.append(rank_per_query)

flat_list_tfidf = list(chain.from_iterable(ranked_tfidf))

  0%|          | 0/76 [00:00<?, ?it/s]

In [31]:
print(flat_list_tfidf[:5])

[('200', '6726a11f-c2fa-4900-98ce-3357ca6a1aee', 0.18275552291010866), ('200', 'd84ebc49-2b1f-433b-b8c5-3f2572d6d273', 0.17262768542184287), ('200', '07135ef0-9faf-40fd-8cd0-f93ddca8a071', 0.16772486188206254), ('200', 'd21fe9e4-c71d-46b5-9ee3-8e8541c2a5ee', 0.1483759392238966), ('200', '2dc72e68-74be-43f3-9585-ccd42173258a', 0.140650921328917)]


In [32]:
qrels = [
    ir_measures.Qrel(query_id=query_id, doc_id=doc_id, relevance=score)
    for query_id, doc_id, score in qrels
]

In [33]:
performance_tfidf = evaluate(qrels, flat_list_tfidf)

In [34]:
RBP_tfidf = rbp(flat_list_tfidf, qrels)
RBP_1_tfidf = sum(RBP_tfidf.values()) / len(RBP_tfidf)
performance_tfidf["RBP(rel=1)"] = RBP_1_tfidf

In [35]:
print("Evaluation Metrics (TF-IDF):")
for metric, value in performance_tfidf.items():
    print(f"{metric}: {value}")

Evaluation Metrics (TF-IDF):
R@100: 0.3672248482825866
R@1000: 0.709934318552958
AP: 0.17401886311677012
nDCG@20: 0.25476337535496335
RBP(rel=1): 0.28032323527912495


We want to see if we can improve the results with our customized language-specific preprocessing function. In order to bypass the inbuilt tokenization and pass as input lists of tokens as documents, we create an additional "dummy" function (solution suggested by David Batista https://www.davidsbatista.net/blog/2018/02/28/TfidfVectorizer/)

In [36]:
def dummy_function(x):
    return x

In [39]:
#we preprocess all documents (tokenization, dropping punctuation and stopwords, stemming)
preprocessed_documents = [
    (doc_id, preprocess(doc_text))
    for doc_id, doc_text in combined_documents
]

In [40]:
#we preprocess every query (tokenization, dropping punctuation and stopwords, stemming)
preprocessed_queries = [
    (query_id, preprocess(query_text))
    for query_id, query_text in translated_queries
]

In [None]:
#we redifine TfidfModel class

class TfIdfModel:
    def __init__(self, collection):
        # extracting  document IDs and texts
        self.doc_ids, doc_texts = zip(*collection)

        # initializing vectorizer that can be applied to tokenized documents
        self.vectorizer = TfidfVectorizer(
            analyzer='word',
            tokenizer=dummy_function,
            preprocessor=dummy_function,
            token_pattern=None
        )

        # fitting the vectorizer on the collection texts and transform
        self.tfidf_matrix = self.vectorizer.fit_transform(doc_texts)

    def rank_documents(self, query, rank_length):
        # extracting query ID and text
        query_id, query_text = query

        # transforming the query text into the vector space
        query_vector = self.vectorizer.transform([query_text])

        # Compute cosine similarities between query vector and precomputed matrix
        cosine_similarities = cosine_similarity(query_vector, self.tfidf_matrix).flatten()

        # Pair each document ID with its corresponding cosine similarity score
        scored_documents = [(doc_id, similarity) for doc_id, similarity in zip(self.doc_ids, cosine_similarities)]

        # Sort documents by similarity score in descending order
        sorted_documents = sorted(scored_documents, key=lambda x: x[1], reverse=True)

        # Return ranked data
        ranked_data = [(query_id, uuid, value) for uuid, value in sorted_documents[:rank_length]]
        return ranked_data

In [None]:
# we will fit the model on out document collection
model = TfIdfModel(preprocessed_documents)

In [None]:
ranked_tfidf = []

for query in tqdm(preprocessed_queries):
  rank_per_query = model.rank_documents(query, 1000)
  ranked_tfidf.append(rank_per_query)

flat_list_tfidf = list(chain.from_iterable(ranked_tfidf))


  0%|          | 0/76 [00:00<?, ?it/s]

In [None]:
print(flat_list_tfidf[:5])

[('200', 'dbc0d493-44a5-4f5f-84ce-066d341a44a6', 0.5098904767089345), ('200', '21a96c19-6e65-4ad7-a5f1-bc51db6cd15c', 0.37061867956941574), ('200', '07135ef0-9faf-40fd-8cd0-f93ddca8a071', 0.3103747068071534), ('200', '0c1e52d0-4f1d-4109-9fb3-b84a4b0c99d9', 0.30283719434355616), ('200', 'c8cf62b1-d540-4524-995b-f01445ed351d', 0.29710340340092156)]


In [None]:
performance_tfidf = evaluate(qrels, flat_list_tfidf)

In [None]:
RBP_tfidf = rbp(flat_list_tfidf, qrels)
RBP_1_tfidf = sum(RBP_tfidf.values()) / len(RBP_tfidf)
performance_tfidf["RBP(rel=1)"] = RBP_1_tfidf

In [None]:
print("Evaluation Metrics (TF-IDF):")
for metric, value in performance_tfidf.items():
    print(f"{metric}: {value}")

Evaluation Metrics (TF-IDF):
R@100: 0.44144864004908346
AP: 0.2575902476455747
nDCG@20: 0.30057360879973816
R@1000: 0.8839690860680056
RBP(rel=1): 0.32165782651910335


We want to improve the obtained results by modifying some parameters of the parameters. To begin, instead of taking raw term frequencies, we will use logarithm tf weighing: 1 + log(tf), by modifying sublinear_tf parameter of the vectorizer

In [None]:
class TfIdfModel_v2:
    def __init__(self, collection):
        # extracting  document IDs and texts
        self.doc_ids, doc_texts = zip(*collection)

        # initializing vectorizer that can be applied to tokenized documents
        self.vectorizer = TfidfVectorizer(
            analyzer='word',
            tokenizer=dummy_function,
            preprocessor=dummy_function,
            token_pattern=None,
            sublinear_tf=True
        )

        # fitting the vectorizer on the collection texts and transform
        self.tfidf_matrix = self.vectorizer.fit_transform(doc_texts)

    def rank_documents(self, query, rank_length):
        # extracting query ID and text
        query_id, query_text = query

        # transforming the query text into the vector space
        query_vector = self.vectorizer.transform([query_text])

        # Compute cosine similarities between query vector and precomputed matrix
        cosine_similarities = cosine_similarity(query_vector, self.tfidf_matrix).flatten()

        # Pair each document ID with its corresponding cosine similarity score
        scored_documents = [(doc_id, similarity) for doc_id, similarity in zip(self.doc_ids, cosine_similarities)]

        # Sort documents by similarity score in descending order
        sorted_documents = sorted(scored_documents, key=lambda x: x[1], reverse=True)

        # Return ranked data
        ranked_data = [(query_id, uuid, value) for uuid, value in sorted_documents[:rank_length]]
        return ranked_data

In [None]:
model_v2 = TfIdfModel_v2(preprocessed_documents)


In [None]:
ranked_tfidf_v2 = []

for query in tqdm(preprocessed_queries):
  rank_per_query = model_v2.rank_documents(query, 1000)
  ranked_tfidf_v2.append(rank_per_query)

flat_list_tfidf_v2 = list(chain.from_iterable(ranked_tfidf_v2))

  0%|          | 0/76 [00:00<?, ?it/s]

In [None]:
performance_tfidf_v2 = evaluate(qrels, flat_list_tfidf_v2)

In [None]:
RBP_tfidf_v2 = rbp(flat_list_tfidf_v2, qrels)
RBP_1_tfidf_v2 = sum(RBP_tfidf_v2.values()) / len(RBP_tfidf_v2)
performance_tfidf_v2["RBP(rel=1)"] = RBP_1_tfidf_v2

In [None]:
print("Evaluation Metrics (TF-IDF):")
for metric, value in performance_tfidf_v2.items():
    print(f"{metric}: {value}")

Evaluation Metrics (TF-IDF):
R@100: 0.48378915616448215
AP: 0.28001360925051477
nDCG@20: 0.3196050100708838
R@1000: 0.8855943757303283
RBP(rel=1): 0.3646633901989546


With different tf weighing we achieved a slight improvement in ranking performance, since we accounted for non-linear nature of raltion between term frequency and document relevance.

We will try to improve it further by normalizing the term frequencies by the frequency of the most frequent term in the document. This option is not available within scikit-learn TfidfVectorizer, but we can implement it as a postprocessing step of the tfidf matrix

In [None]:
from scipy.sparse import csr_matrix

class TfIdfModel_v3:
    def __init__(self, collection):
        # extracting document IDs and texts
        self.doc_ids, doc_texts = zip(*collection)

        # initialize TfidfVectorizer with sublinear_tf=True
        self.vectorizer = TfidfVectorizer(
            analyzer='word',
            tokenizer=dummy_function,
            preprocessor=dummy_function,
            token_pattern=None,
            sublinear_tf=True
        )

        # fitting and transforming the documents to get the TF-IDF matrix
        X = self.vectorizer.fit_transform(doc_texts)

        max_per_row = np.array(X.max(axis=1).toarray()).flatten()

        # compute the normalization factors: 1 + log(max_value_in_row)
        normalization_factors = 1 + np.log(max_per_row)

        # to avoid division by zero (in case there are empty rows):
        normalization_factors[normalization_factors == -np.inf] = 1  # we replace -inf with 1 for empty rows

        # scaling the sparse matrix rows
        X = X.multiply(1 / normalization_factors[:, np.newaxis])




        # max_per_row = np.array(tfidf_matrix.max(axis=1)).flatten()

        # normalization_factors = 1 + np.log(max_per_row)

        # tfidf_matrix = tfidf_matrix.multiply(1 / normalization_factors[:, np.newaxis])

        #print(type(tfidf_matrix))

        #tfidf_matrix = tfidf_matrix/(1 + np.log(max(tfidf_matrix, axis=1, keepdims=True)))

        self.tfidf_matrix = X



    def rank_documents(self, query, rank_length):
        # extracting query ID and text
        query_id, query_text = query

        # transforming the query text into the vector space
        query_vector = self.vectorizer.transform([query_text])

        # Compute cosine similarities between query vector and precomputed matrix
        cosine_similarities = cosine_similarity(query_vector, self.tfidf_matrix).flatten()

        # Pair each document ID with its corresponding cosine similarity score
        scored_documents = [(doc_id, similarity) for doc_id, similarity in zip(self.doc_ids, cosine_similarities)]

        # Sort documents by similarity score in descending order
        sorted_documents = sorted(scored_documents, key=lambda x: x[1], reverse=True)

        # Return ranked data
        ranked_data = [(query_id, uuid, value) for uuid, value in sorted_documents[:rank_length]]
        return ranked_data



In [None]:
model_v3 = TfIdfModel_v3(preprocessed_documents)

In [None]:
ranked_tfidf_v3 = []

for query in tqdm(preprocessed_queries):
  rank_per_query = model_v3.rank_documents(query, 1000)
  ranked_tfidf_v3.append(rank_per_query)

flat_list_tfidf_v3 = list(chain.from_iterable(ranked_tfidf_v3))

  0%|          | 0/76 [00:00<?, ?it/s]

In [None]:
performance_tfidf_v3 = evaluate(qrels, flat_list_tfidf_v3)

In [None]:
RBP_tfidf_v3 = rbp(flat_list_tfidf_v3, qrels)
RBP_1_tfidf_v3 = sum(RBP_tfidf_v3.values()) / len(RBP_tfidf_v3)
performance_tfidf_v3["RBP(rel=1)"] = RBP_1_tfidf_v3

In [None]:
print("Evaluation Metrics (TF-IDF):")
for metric, value in performance_tfidf_v3.items():
    print(f"{metric}: {value}")

Evaluation Metrics (TF-IDF):
R@100: 0.040253658252453195
AP: 0.017423518704029393
nDCG@20: 0.09514186832897815
R@1000: 0.042338365724836874
RBP(rel=1): 0.14500773263987515


In [None]:
class TfIdfModel_v4:
    def __init__(self, collection):
        # extracting  document IDs and texts
        self.doc_ids, doc_texts = zip(*collection)

        # initializing vectorizer that can be applied to tokenized documents
        self.vectorizer = TfidfVectorizer(
            analyzer='word',
            tokenizer=dummy_function,
            preprocessor=dummy_function,
            token_pattern=None,
            #max_df=0.1,
            min_df=0.001,
            sublinear_tf=True
        )

        # fitting the vectorizer on the collection texts and transform
        self.tfidf_matrix = self.vectorizer.fit_transform(doc_texts)

    def rank_documents(self, query, rank_length):
        # extracting query ID and text
        query_id, query_text = query

        # transforming the query text into the vector space
        query_vector = self.vectorizer.transform([query_text])

        # Compute cosine similarities between query vector and precomputed matrix
        cosine_similarities = cosine_similarity(query_vector, self.tfidf_matrix).flatten()

        # Pair each document ID with its corresponding cosine similarity score
        scored_documents = [(doc_id, similarity) for doc_id, similarity in zip(self.doc_ids, cosine_similarities)]

        # Sort documents by similarity score in descending order
        sorted_documents = sorted(scored_documents, key=lambda x: x[1], reverse=True)

        # Return ranked data
        ranked_data = [(query_id, uuid, value) for uuid, value in sorted_documents[:rank_length]]
        return ranked_data

In [None]:
model_v4 = TfIdfModel_v4(preprocessed_documents)

In [None]:
ranked_tfidf_v4 = []

for query in tqdm(preprocessed_queries):
  rank_per_query = model_v4.rank_documents(query, 1000)
  ranked_tfidf_v4.append(rank_per_query)

flat_list_tfidf_v4 = list(chain.from_iterable(ranked_tfidf_v4))

  0%|          | 0/76 [00:00<?, ?it/s]

In [None]:
performance_tfidf_v4 = evaluate(qrels, flat_list_tfidf_v4)

In [None]:
RBP_tfidf_v4 = rbp(flat_list_tfidf_v4, qrels)
RBP_1_tfidf_v4 = sum(RBP_tfidf_v4.values()) / len(RBP_tfidf_v4)
performance_tfidf_v4["RBP(rel=1)"] = RBP_1_tfidf_v4

In [None]:
print("Evaluation Metrics (TF-IDF):")
for metric, value in performance_tfidf_v4.items():
    print(f"{metric}: {value}")

Evaluation Metrics (TF-IDF):
R@100: 0.4663502418118479
AP: 0.26342769052296783
nDCG@20: 0.291976199867674
R@1000: 0.8758607845640082
RBP(rel=1): 0.32950576091683986


In [None]:
# since we already have our custom preprocessing, we will use a function
def dummy_function(x):
    return x

def tf_idf(query, documents):
    # extracting the query and documents texts
    query_id, query_text = query
    doc_ids, doc_texts = zip(*documents)  # unzip the document tuples into ids and texts

    # Combine the query text with the document texts for vectorization
    texts = [query_text] + list(doc_texts)

    # Create the TF-IDF Vectorizer

    vectorizer = TfidfVectorizer(
    analyzer='word',
    tokenizer=dummy_function,
    preprocessor=dummy_function,
    token_pattern=None)

    # Fit and transform the text data
    tfidf_matrix = vectorizer.fit_transform(texts)

    # Extract the query vector (the first row in the matrix)
    query_vector = tfidf_matrix[0:1]

    # Compute the cosine similarity between the query vector and the document vectors
    cosine_similarities = cosine_similarity(query_vector, tfidf_matrix[1:]).flatten()

    # Pair each document id with its corresponding cosine similarity score
    scored_documents = [(doc_id, similarity) for doc_id, similarity in zip(doc_ids, cosine_similarities)]

    # Sort documents by similarity score in descending order
    sorted_documents = sorted(scored_documents, key=lambda x: x[1], reverse=True)

    ranked_data = [(query_id, uuid, value) for uuid, value in sorted_documents[:1000]]

    return ranked_data

In [None]:
ranked_tfidf = []
for query in tqdm(preprocessed_queries):
    scores_tfidf = tf_idf(query, preprocessed_documents)
    ranked_tfidf.append(scores_tfidf)

flat_list_tfidf = list(chain.from_iterable(ranked_tfidf))

  0%|          | 0/76 [00:00<?, ?it/s]

In [None]:
print(flat_list_tfidf[:5])

[('200', 'dbc0d493-44a5-4f5f-84ce-066d341a44a6', 0.5094508855965356), ('200', '21a96c19-6e65-4ad7-a5f1-bc51db6cd15c', 0.3703436596888059), ('200', '07135ef0-9faf-40fd-8cd0-f93ddca8a071', 0.3100856813869644), ('200', '0c1e52d0-4f1d-4109-9fb3-b84a4b0c99d9', 0.3027228759686717), ('200', 'c8cf62b1-d540-4524-995b-f01445ed351d', 0.29675834193817796)]


**Evaluation of TF-IDF ranking**

In [None]:
qrels = [
    ir_measures.Qrel(query_id=row['query_id'], doc_id=row['doc_id'], relevance=row['relevance'])
    for _, row in qrels_pd.iterrows()
]

In [None]:
performance_tfidf = evaluate(qrels, flat_list_tfidf)

In [None]:
RBP_tfidf = rbp(flat_list_tfidf, qrels)
RBP_1_tfidf = sum(RBP_tfidf.values()) / len(RBP_tfidf)
performance_tfidf["RBP(rel=1)"] = RBP_1_tfidf

RBP(rel=1): 0.32054686075827377


In [None]:
print("Evaluation Metrics (TF-IDF):")
for metric, value in performance_tfidf.items():
    print(f"{metric}: {value}")