In [None]:
!pip install ir_datasets
!pip install deep_translator
!pip install rank_bm25
!pip install ir_measures
!pip install razdel
!pip install stopwordsiso

Collecting ir_datasets
  Downloading ir_datasets-0.5.9-py3-none-any.whl.metadata (12 kB)
Collecting inscriptis>=2.2.0 (from ir_datasets)
  Downloading inscriptis-2.5.0-py3-none-any.whl.metadata (25 kB)
Collecting trec-car-tools>=2.5.4 (from ir_datasets)
  Downloading trec_car_tools-2.6-py3-none-any.whl.metadata (640 bytes)
Collecting lz4>=3.1.10 (from ir_datasets)
  Downloading lz4-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.7 kB)
Collecting warc3-wet>=0.2.3 (from ir_datasets)
  Downloading warc3_wet-0.2.5-py3-none-any.whl.metadata (2.2 kB)
Collecting warc3-wet-clueweb09>=0.2.5 (from ir_datasets)
  Downloading warc3-wet-clueweb09-0.2.5.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting zlib-state>=0.1.3 (from ir_datasets)
  Downloading zlib_state-0.1.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.3 kB)
Collecting ijson>=3.1.3 (from ir_datasets)
  Downloading ijson-3.3.0-cp310-cp310-manylinux_2_17_x86

In [None]:
import ir_datasets
import nltk
from deep_translator import GoogleTranslator
from tqdm.notebook import tqdm
from collections import Counter, defaultdict
import numpy as np
from rank_bm25 import BM25Okapi
import ir_measures
from ir_measures import nDCG, MAP, RBP, Recall, Qrel, ScoredDoc
from itertools import chain
from razdel import tokenize
from nltk.stem.snowball import SnowballStemmer
import string
from stopwordsiso import stopwords
import pickle
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np


### Load documents

In [None]:
dataset = ir_datasets.load("neuclir/1/ru/trec-2023")
dataset

Dataset(id='neuclir/1/ru/trec-2023', provides=['docs', 'queries', 'qrels'])

In [None]:
# Extract documents, queries and qrels from the dataset into plain tuples:
#   russian_documents -> (doc_id, title, text)
#   english_queries   -> (query_id, title)
#   qrels             -> (query_id, doc_id, relevance)
# NOTE(review): the first line materialises every document of the collection in memory,
# although only the judged documents are kept by the filtering step that follows —
# presumably a contributor to the session crashes; confirm before restructuring.
russian_documents = [(doc.doc_id, doc.title, doc.text) for doc in tqdm(dataset.docs_iter())]
english_queries = [(query.query_id, query.title) for query in dataset.queries_iter()]
qrels = [(qrel.query_id, qrel.doc_id, qrel.relevance) for qrel in dataset.qrels_iter()]

0it [00:00, ?it/s]

[INFO] If you have a local copy of https://huggingface.co/datasets/neuclir/neuclir1/resolve/main/data/rus-00000-of-00001.jsonl.gz?download=true, you can symlink it here to avoid downloading it again: /root/.ir_datasets/downloads/3aabc798a3b5dd92d7c47db9521870b1
[INFO] [starting] https://huggingface.co/datasets/neuclir/neuclir1/resolve/main/data/rus-00000-of-00001.jsonl.gz?download=true

https://huggingface.co/datasets/neuclir/neuclir1/resolve/main/data/rus-00000-of-00001.jsonl.gz?download=true: 0.0%| 0.00/4.50G [00:00<?, ?B/s][A
https://huggingface.co/datasets/neuclir/neuclir1/resolve/main/data/rus-00000-of-00001.jsonl.gz?download=true: 0.3%| 12.2M/4.50G [00:00<00:37, 121MB/s][A
https://huggingface.co/datasets/neuclir/neuclir1/resolve/main/data/rus-00000-of-00001.jsonl.gz?download=true: 0.7%| 29.6M/4.50G [00:00<00:30, 147MB/s][A
https://huggingface.co/datasets/neuclir/neuclir1/resolve/main/data/rus-00000-of-00001.jsonl.gz?download=true: 1.0%| 47.0M/4.50G [00:00<00:28, 156MB/s][A
ht

In [None]:
# Keep only the documents that appear in the qrels, i.e. those with a relevance judgement.
# The doc ids are collected into a set first so each membership test is a hash lookup.
qrels_ids = {entry[1] for entry in tqdm(qrels)}
russian_documents_subset = [doc for doc in tqdm(russian_documents) if doc[0] in qrels_ids]
len(russian_documents_subset)

  0%|          | 0/25634 [00:00<?, ?it/s]

  0%|          | 0/4627543 [00:00<?, ?it/s]

24871

### Helper functions

In [None]:
def evaluate(qrels, result):
    """
    Score a run against relevance judgements with ir_measures.

    :param qrels: relevance judgements, passed unchanged to ``ir_measures.calc_aggregate``
    :param result: iterable of (query_id, doc_id, score) triples forming the run
    :return: the mapping returned by ``ir_measures.calc_aggregate`` for the measures below
    """
    runs = [
        ScoredDoc(query_id=query_id, doc_id=doc_id, score=score)
        for query_id, doc_id, score in result
    ]

    # Single list of reported measures; it is the list actually handed to
    # calc_aggregate (previously it was defined but never used).
    metrics = [
        ir_measures.nDCG @ 20,   # nDCG@20
        ir_measures.AP,          # Average Precision
        ir_measures.R @ 100,     # Recall@100
        ir_measures.R @ 1000,    # Recall@1000
    ]

    return ir_measures.calc_aggregate(metrics, qrels, runs)

def print_document(document_id):
    """Print the first entry of ``russian_documents`` whose id equals ``document_id``, or ``None`` if there is none."""
    found = None
    for candidate in russian_documents:
        if candidate[0] == document_id:
            found = candidate
            break
    print(found)


def translate_query(query):
    """
    Translate the text of one query into Russian.

    :param query: (query_id, query_text) pair
    :return: (query_id, translated_text) pair
    """
    translator = GoogleTranslator(source='auto', target='ru')
    russian_text = translator.translate(query[1])
    return (query[0], russian_text)


def combine_documents(documents):
    """
    Join the title and text of every document into one string.

    :param documents: iterable of (doc_id, title, text) triples
    :return: list of (doc_id, title + ' ' + text) pairs, in input order
    """
    return [(entry[0], entry[1] + ' ' + entry[2]) for entry in documents]


import stopwordsiso




def preprocess(text, remove_stop: bool=True) -> list:
    """
    Tokenize a text, drop punctuation (and optionally stopwords), and stem what remains.

    :param text: str, text of the corresponding document
    :param remove_stop: bool indicating if stopwords should be removed (default True)

    :return: list(str) of lower-cased, stemmed tokens with punctuation removed
    """
    stemmer = SnowballStemmer("russian")
    stopwords_ru = stopwords("ru")
    # A set of single characters: membership is then a per-character test rather than
    # a substring search in one long string.
    punct = set(string.punctuation + "«»" + "—" + '–')

    preprocessed = []

    for token in tokenize(text):
        # Lower-case before stemming so that tokens the stemmer leaves untouched
        # (e.g. Latin-script words such as 'Ever') are still case-normalised and
        # match regardless of capitalisation.
        word = token.text.lower()

        # Drop tokens consisting only of punctuation characters, including
        # multi-character runs such as "..." that a substring test would miss.
        if all(ch in punct for ch in word):
            continue
        if remove_stop and word in stopwords_ru:
            continue
        preprocessed.append(stemmer.stem(word))

    return preprocessed


Preprocessing

In [None]:
# To search on both document title and text, we will concatenate them, obtaining pairs (document_id, full text (title+text))
combined_documents = combine_documents(russian_documents_subset)
print(combined_documents[-1])

('20bc2d7b-91d1-428b-aa54-ef112e8a1e69', 'Экс-замминобороны США: ядерную войну запустит искусственный интеллект Вашингтон, , 06:51 — REGNUM О существенной опасности искусственного интеллекта в случае использования его военными заявил бывший заместитель министра обороны США Роберт Уорк, передает Breaking Defense.\n\nИскусственный интеллект Цитата из к/ф «Космическая одиссея 2001 года». Реж. Стэнли Кубрик. 1968. США — Великобритания\n\nПо мнению бывшего сотрудника Пентагона, если допустить искусственный интеллект (ИИ) к принятию решений, то существует вероятность, что имеющиеся риски будут оценены им неверно, и ядерное оружие будет применено в ситуации, когда реальной угрозы не будет существовать.\n\n«Представьте, что в системе управления ядерным оружием есть прогностическая система ИИ, которая запускается по определенным параметрам, — сказал он. — Это гораздо, гораздо, гораздо более тревожная перспектива, чем все опасности, что вы можете представить в связи с применением индивидуального

In [None]:
#Then, we preprocess all documents (tokenization, dropping punctuation and stopwords, stemming)
preprocessed_documents = [
    (doc_id, preprocess(doc_text))
    for doc_id, doc_text in combined_documents
]

In [None]:
#Next, we translate all queries to Russian
translated_queries = []
for query in english_queries:
  translated_queries.append(translate_query(query))

In [None]:
print(translated_queries[0:5])

[('200', 'Коррупция Взяточничество Спортивная федерация Олимпиада'), ('201', 'Китайские инвестиции в Иран'), ('202', 'Новые технологии, точное земледелие, интеллектуальное земледелие, сельское хозяйство'), ('203', 'Эвер Гивен застрял'), ('204', 'Штраф допинг спорт стоп')]


In [None]:
#Then, we preprocess every query (tokenization, dropping punctuation and stopwords, stemming)
preprocessed_queries = [
    (query_id, preprocess(query_text))
    for query_id, query_text in translated_queries
]

In [None]:
print(preprocessed_queries[0:5])

[('200', ['коррупц', 'взяточничеств', 'спортивн', 'федерац', 'олимпиад']), ('201', ['китайск', 'инвестиц', 'ира']), ('202', ['нов', 'технолог', 'точн', 'земледел', 'интеллектуальн', 'земледел', 'сельск', 'хозяйств']), ('203', ['эвер', 'гив', 'застря']), ('204', ['штраф', 'допинг', 'спорт', 'стоп'])]


**BM25**

BM25

In [None]:
def bm25(query, documents, index=None):
    """
    Rank documents for one query with BM25 (Okapi) and keep the 1000 best.

    :param query: (query_id, [query tokens]) pair
    :param documents: list of (doc_id, [document tokens]) pairs
    :param index: optional pre-built ranker over the same documents, in the same order;
                  any object exposing ``get_scores(tokens)``. When omitted, a
                  ``BM25Okapi`` index is built from ``documents`` on this call, so pass a
                  shared index to avoid rebuilding it for every query.
    :return: list of (query_id, doc_id, score) triples, highest score first, at most 1000
    """
    query_number, query_tokens = query[0], query[1]
    document_ids = [item[0] for item in documents]

    if index is None:
        # Building the index walks the whole corpus; it is the expensive step.
        index = BM25Okapi([doc[1] for doc in documents])

    # One score per document, aligned with document_ids
    scores = index.get_scores(query_tokens)

    # Stable sort: documents with equal scores keep their corpus order
    sorted_documents = sorted(zip(document_ids, scores), key=lambda x: x[1], reverse=True)

    return [(query_number, doc_id, score) for doc_id, score in sorted_documents[:1000]]

In [None]:
ranked_bm25 = []
for query in tqdm(preprocessed_queries):
    scores_bm25 = bm25(query, preprocessed_documents)
    ranked_bm25.append(scores_bm25)

flat_list_bm25 = list(chain.from_iterable(ranked_bm25))

  0%|          | 0/76 [00:00<?, ?it/s]

In [None]:
print(flat_list_bm25[:5])

[('200', 'fa560d7f-2f67-4fb1-a93c-3a51a0c539a6', 23.28870824007913), ('200', 'd7fbeb98-1726-4365-af82-d4e156cbc198', 21.798941281005405), ('200', 'c6fa30d0-dbfc-4bc3-892c-5e95a65f46b1', 21.14968436080987), ('200', '1a49ac53-27fa-45bc-abda-f33927daaf28', 20.798231676157467), ('200', 'e9b85084-2200-4812-989b-dd7ee8bacfbd', 20.791606684961053)]


**Evaluation of baseline methods**

In [None]:
# Convert the judgements into ir_measures.Qrel records.
# Fields are read by position rather than by unpacking exactly three values, so the cell
# can be re-run after `qrels` has already been rebound to Qrel records (which may carry
# more than three fields) without failing.
qrels = [
    ir_measures.Qrel(query_id=entry[0], doc_id=entry[1], relevance=entry[2])
    for entry in qrels
]

In [None]:
from collections import defaultdict

def rbp(retrieved_results, qrels, p=0.8):
    """
    Compute binary rank-biased precision (RBP) for every query in a run.

    :param retrieved_results: iterable of (query_id, doc_id, score) triples
    :param qrels: iterable of records with ``query_id``, ``doc_id`` and ``relevance``
                  attributes; a document counts as relevant when ``relevance > 0``
    :param p: persistence parameter; a relevant document at rank r contributes
              ``(1 - p) * p ** (r - 1)``
    :return: dict mapping query_id to its RBP score (only queries present in
             ``retrieved_results`` appear)
    """
    # Relevant doc ids per query
    qrels_dict = defaultdict(set)
    for qrel in qrels:
        if qrel.relevance > 0:
            qrels_dict[qrel.query_id].add(qrel.doc_id)

    # Retrieved (doc_id, score) pairs per query
    grouped_retrieved_results = defaultdict(list)
    for query_id, doc_id, score in retrieved_results:
        grouped_retrieved_results[query_id].append((doc_id, score))

    rbp_scores = {}
    for query_id, retrieved_docs in grouped_retrieved_results.items():
        # Ranks come from the scores, not from the order in which results were passed in.
        # The sort is stable, so an already-sorted run is left exactly as it was.
        ranking = sorted(retrieved_docs, key=lambda pair: pair[1], reverse=True)
        relevant = qrels_dict.get(query_id, set())

        score = 0
        for rank, (doc_id, _) in enumerate(ranking, start=1):
            if doc_id in relevant:
                score += (1 - p) * (p ** (rank - 1))
        rbp_scores[query_id] = score

    return rbp_scores

BM25 evaluation

In [None]:
performance_bm25 = evaluate(qrels, flat_list_bm25)

In [None]:
RBP_bm25 = rbp(flat_list_bm25, qrels)
RBP_1_bm25 = sum(RBP_bm25.values()) / len(RBP_bm25)
performance_bm25["RBP(rel=1)"] = RBP_1_bm25

In [None]:
print("Evaluation Metrics (BM25):")
for metric, value in performance_bm25.items():
    print(f"{metric}: {value}")

Evaluation Metrics (BM25):
R@1000: 0.8890930790435744
nDCG@20: 0.41167936939433875
R@100: 0.5054130943731772
AP: 0.33916640326369557
RBP(rel=1): 0.45496377037131436


In [None]:
from rank_bm25 import BM25Plus

def bm25_plus(query, documents, index=None):
    """
    Rank documents for one query with BM25+ and keep the 1000 best.

    :param query: (query_id, [query tokens]) pair
    :param documents: list of (doc_id, [document tokens]) pairs
    :param index: optional pre-built ranker over the same documents, in the same order;
                  any object exposing ``get_scores(tokens)``. When omitted, a
                  ``BM25Plus`` index is built from ``documents`` on this call, so pass a
                  shared index to avoid rebuilding it for every query.
    :return: list of (query_id, doc_id, score) triples, highest score first, at most 1000
    """
    query_number, query_tokens = query[0], query[1]
    document_ids = [item[0] for item in documents]

    if index is None:
        # Building the index walks the whole corpus; it is the expensive step.
        index = BM25Plus([doc[1] for doc in documents])

    # One score per document, aligned with document_ids
    scores = index.get_scores(query_tokens)

    # Stable sort: documents with equal scores keep their corpus order
    sorted_documents = sorted(zip(document_ids, scores), key=lambda x: x[1], reverse=True)

    return [(query_number, doc_id, score) for doc_id, score in sorted_documents[:1000]]

In [None]:
ranked_bm25_plus = []
for query in tqdm(preprocessed_queries):
    scores_bm25_plus = bm25_plus(query, preprocessed_documents)
    ranked_bm25_plus.append(scores_bm25_plus)

flat_list_bm25_plus = list(chain.from_iterable(ranked_bm25_plus))

  0%|          | 0/76 [00:00<?, ?it/s]

In [None]:
performance_bm25_plus = evaluate(qrels, flat_list_bm25_plus)

In [None]:
RBP_bm25_plus = rbp(flat_list_bm25_plus, qrels)
RBP_1_bm25_plus = sum(RBP_bm25_plus.values()) / len(RBP_bm25_plus)
performance_bm25_plus["RBP(rel=1)"] = RBP_1_bm25_plus

In [None]:
print("Evaluation Metrics (BM25+):")
for metric, value in performance_bm25_plus.items():
    print(f"{metric}: {value}")

Evaluation Metrics (BM25+):
R@100: 0.5038969725120288
AP: 0.33939018308897945
nDCG@20: 0.41381761254913013
R@1000: 0.8877383508634303
RBP(rel=1): 0.4567926242803976


In [None]:
from rank_bm25 import BM25L

def bm25_l(query, documents, index=None):
    """
    Rank documents for one query with BM25L and keep the 1000 best.

    :param query: (query_id, [query tokens]) pair
    :param documents: list of (doc_id, [document tokens]) pairs
    :param index: optional pre-built ranker over the same documents, in the same order;
                  any object exposing ``get_scores(tokens)``. When omitted, a
                  ``BM25L`` index is built from ``documents`` on this call, so pass a
                  shared index to avoid rebuilding it for every query.
    :return: list of (query_id, doc_id, score) triples, highest score first, at most 1000
    """
    query_number, query_tokens = query[0], query[1]
    document_ids = [item[0] for item in documents]

    if index is None:
        # Building the index walks the whole corpus; it is the expensive step.
        index = BM25L([doc[1] for doc in documents])

    # One score per document, aligned with document_ids
    scores = index.get_scores(query_tokens)

    # Stable sort: documents with equal scores keep their corpus order
    sorted_documents = sorted(zip(document_ids, scores), key=lambda x: x[1], reverse=True)

    return [(query_number, doc_id, score) for doc_id, score in sorted_documents[:1000]]

In [None]:
ranked_bm25_l = []
for query in tqdm(preprocessed_queries):
    scores_bm25_l = bm25_l(query, preprocessed_documents)
    ranked_bm25_l.append(scores_bm25_l)

flat_list_bm25_l = list(chain.from_iterable(ranked_bm25_l))

  0%|          | 0/76 [00:00<?, ?it/s]

In [None]:
performance_bm25_l = evaluate(qrels, flat_list_bm25_l)

In [None]:
RBP_bm25_l = rbp(flat_list_bm25_l, qrels)
RBP_1_bm25_l = sum(RBP_bm25_l.values()) / len(RBP_bm25_l)
performance_bm25_l["RBP(rel=1)"] = RBP_1_bm25_l

In [None]:
print("Evaluation Metrics (BM25L):")
for metric, value in performance_bm25_l.items():
    print(f"{metric}: {value}")

Evaluation Metrics (BM25L):
R@100: 0.33462978711169333
AP: 0.16782446819142158
nDCG@20: 0.19235443119842124
R@1000: 0.8481764873064259
RBP(rel=1): 0.2051878450677502


**Improving BM25 results with query expansion**

To decide which terms to add to a query, we use a relevance model that, for each query, selects the top N terms appearing in the top K documents of the previous ranking. Terms are ranked by the sum, over those K documents, of the (smoothed) probability of the term given the document, weighted by the score that document received in the previous ranking.

Lavrenko Relevance Model

In [None]:
# for our relevance model, we will define a separate class

class LavrenkoRelevanceModel:
    """
    Query expansion from pseudo-relevance feedback.

    For a query, the top-k documents of a previous ranking serve as feedback documents.
    Every term occurring in them is scored by the sum, over those documents, of the
    Dirichlet-smoothed P(term | document) multiplied by the document's previous ranking
    score; the ``top_n_terms`` best terms are appended to the original query tokens.
    """

    def __init__(self, collection, preranking, mu=2000, top_k=10, top_n_terms=10):
        """
        :param collection: List of tokenized documents [(doc_id, [tokens])]
        :param preranking: scores [(query_id, doc_id, score)] obtained by previous ranking
        :param mu: Dirichlet smoothing parameter
        :param top_k: Number of top-ranked documents to use for relevance model
        :param top_n_terms: Number of terms to expand the query with
        """
        self.documents = {doc_id: tokens for doc_id, tokens in collection}
        self.preranking = preranking
        self.mu = mu
        self.top_k = top_k
        self.top_n_terms = top_n_terms

        # Per-document term counts, filled lazily for the feedback documents only
        self._term_counts = {}

        # building a background collection model
        self.background_model = self._compute_background_model()

    def _compute_background_model(self):
        """Compute collection-wide relative term frequencies for Dirichlet smoothing."""
        # Count document by document instead of first copying every token of the
        # collection into one large list.
        term_frequencies = Counter()
        total_terms = 0
        for tokens in self.documents.values():
            term_frequencies.update(tokens)
            total_terms += len(tokens)

        return {term: freq / total_terms for term, freq in term_frequencies.items()}

    def _get_top_k_documents(self, query_id):
        """Retrieve the top-k (doc_id, score) pairs of the preranking for a given query."""
        top_docs = [
            (doc_id, score)
            for q_id, doc_id, score in self.preranking
            if q_id == query_id
        ]
        return sorted(top_docs, key=lambda x: x[1], reverse=True)[:self.top_k]

    def _document_term_counts(self, document_id):
        """Return the term-frequency Counter of one document, computing it on first use."""
        counts = self._term_counts.get(document_id)
        if counts is None:
            counts = Counter(self.documents[document_id])
            self._term_counts[document_id] = counts
        return counts

    def _term_probability_given_document(self, term, document_id):
        """Compute the Dirichlet-smoothed P(w|d) for a term given a document."""
        denominator = len(self.documents[document_id]) + self.mu
        if denominator == 0:
            # Empty document with mu == 0: there is no evidence at all, so the
            # probability is taken as 0 instead of dividing by zero.
            return 0.0
        term_count = self._document_term_counts(document_id)[term]
        P_w_C = self.background_model.get(term, 1e-12)
        return (term_count + self.mu * P_w_C) / denominator

    def _term_probability_given_query(self, term, query_id, top_docs=None):
        """
        Compute the expansion score of a term for a query.

        :param term: candidate expansion term
        :param query_id: id of the query in the preranking
        :param top_docs: optional pre-fetched result of ``_get_top_k_documents(query_id)``,
                         so callers scoring many terms do not repeat the lookup
        :return: sum over the feedback documents of P(w|d) * previous ranking score
        """
        if top_docs is None:
            top_docs = self._get_top_k_documents(query_id)

        probability = 0
        for doc_id, bm25_score in top_docs:
            P_w_d = self._term_probability_given_document(term, doc_id)
            probability += P_w_d * bm25_score

        return probability

    def expand_query(self, query_id, query_tokens):
        """
        Expand a query with the best-scoring terms of its feedback documents.

        :param query_id: id of the query in the preranking
        :param query_tokens: list of the original query tokens
        :return: (query_id, original tokens followed by the selected expansion terms)
        """
        # The feedback documents are the same for every candidate term: fetch them once.
        top_docs = self._get_top_k_documents(query_id)

        all_terms = set()
        for doc_id, _ in top_docs:
            all_terms.update(self.documents[doc_id])

        term_probabilities = {
            term: self._term_probability_given_query(term, query_id, top_docs)
            for term in all_terms
        }

        # ranking terms by score and selecting the top n terms
        ranked_terms = sorted(term_probabilities.items(), key=lambda x: x[1], reverse=True)
        top_terms = [term for term, _ in ranked_terms[:self.top_n_terms]]

        # combine original query with expanded terms
        expanded_query = query_tokens + top_terms
        return query_id, expanded_query



The model has several parameters: mu for the Dirichlet smoothing, k for the number of top documents taken from the previous ranking, and n for the number of terms added to the query.
To establish suitable values for the parameters, we try a range of values for each and evaluate the retrieval. Unfortunately, our computational resources did not permit us to run this search as a loop because of frequent session crashes. The following part of this notebook shows the experiments for each parameter configuration: different values for mu (with fixed k and n), different values for n (with the best choice of mu and fixed k) and different values for k (with the best choice of mu and n). Each experiment is run separately and the values are set manually, to be able to resume from any point after a session crash without losing the results already obtained.

Keeping k=10 and n=10, varying mu:

mu=2000

In [None]:
# Instantiate and expand queries
lavrenko_model = LavrenkoRelevanceModel(preprocessed_documents, flat_list_bm25, mu=2000, top_k=10, top_n_terms=10)
expanded_queries = [
    lavrenko_model.expand_query(query_id, query_tokens)
    for query_id, query_tokens in preprocessed_queries
]

In [None]:
# Print expanded queries
print("Expanded Queries:")
for query_id, expanded_query in expanded_queries:
    print(f"{query_id}: {expanded_query}")

Expanded Queries:
200: ['коррупц', 'взяточничеств', 'спортивн', 'федерац', 'олимпиад', 'стран', 'росс', 'нов', 'компан', 'сша', 'дан', 'российск', 'коррупц', 'международн', 'перв']
201: ['китайск', 'инвестиц', 'ира', 'стран', 'компан', 'сша', 'росс', 'ира', 'кита', 'нов', 'китайск', 'проект', 'дан']
202: ['нов', 'технолог', 'точн', 'земледел', 'интеллектуальн', 'земледел', 'сельск', 'хозяйств', 'технолог', 'стран', 'росс', 'нов', 'компан', 'дан', 'хозяйств', 'развит', 'систем', 'российск']
203: ['эвер', 'гив', 'застря', 'компан', 'нов', 'дан', 'суд', 'кана', 'сша', 'суэцк', 'эвер', 'гив', 'судн']
204: ['штраф', 'допинг', 'спорт', 'стоп', 'росс', 'стран', 'нов', 'сша', 'дан', 'российск', 'допинг', 'возможн', 'проект', 'перв']
205: ['чист', 'энерг', 'применен', 'практик', 'тенденц', 'росс', 'стран', 'нов', 'энерг', 'компан', 'дан', 'сша', 'технолог', 'источник', 'развит']
206: ['конвенц', 'монтр', 'стамбульск', 'кана', 'турц', 'стран', 'росс', 'нов', 'конвенц', 'турецк', 'кана', 'монтр',

In [None]:
# Run BM25 on expanded queries
ranked_bm25_LRM = []
for query in tqdm(expanded_queries):
    scores_bm25_LRM = bm25(query, preprocessed_documents)
    ranked_bm25_LRM.append(scores_bm25_LRM)

flat_list_bm25_LRM = list(chain.from_iterable(ranked_bm25_LRM))

  0%|          | 0/76 [00:00<?, ?it/s]

In [None]:
print(flat_list_bm25_LRM[:5])

[('200', 'c6fa30d0-dbfc-4bc3-892c-5e95a65f46b1', 38.103568680316535), ('200', 'a649d466-1259-4d8a-86a7-242e946ef7c6', 36.5159031687619), ('200', 'fa560d7f-2f67-4fb1-a93c-3a51a0c539a6', 34.892695685355314), ('200', 'dd6c790e-1dfa-4800-9026-78d26537caf1', 34.724661680660695), ('200', 'c8b66c15-d538-4f8a-9534-f5adfca6188c', 33.37746405834651)]


In [None]:
# Evaluate the performance on expanded queries
performance_bm25_LRM = evaluate(qrels, flat_list_bm25_LRM)

In [None]:
RBP_bm25_LRM = rbp(flat_list_bm25_LRM, qrels)
RBP_1_bm25_LRM = sum(RBP_bm25_LRM.values()) / len(RBP_bm25_LRM)
performance_bm25_LRM["RBP(rel=1)"]=RBP_1_bm25_LRM

In [None]:
print("Evaluation Metrics (BM25_LRM mu=2000, k=10, n=10):")
for metric, value in performance_bm25_LRM.items():
    print(f"{metric}: {value}")

Evaluation Metrics (BM25_LRM mu=2000, k=10, n=10):
R@1000: 0.8791297688626457
AP: 0.34734439430870234
nDCG@20: 0.4082245103932432
R@100: 0.5030639358928691
RBP(rel=1): 0.44841539748180415


mu = 1500

In [None]:
lavrenko_model = LavrenkoRelevanceModel(preprocessed_documents, flat_list_bm25, mu=1500, top_k=10, top_n_terms=10)
expanded_queries = [
    lavrenko_model.expand_query(query_id, query_tokens)
    for query_id, query_tokens in preprocessed_queries
]

In [None]:
# Print expanded queries
print("Expanded Queries:")
for query_id, expanded_query in expanded_queries:
    print(f"{query_id}: {expanded_query}")

Expanded Queries:
200: ['коррупц', 'взяточничеств', 'спортивн', 'федерац', 'олимпиад', 'стран', 'росс', 'нов', 'сша', 'компан', 'коррупц', 'дан', 'российск', 'международн', 'президент']
201: ['китайск', 'инвестиц', 'ира', 'стран', 'компан', 'ира', 'кита', 'сша', 'китайск', 'росс', 'проект', 'нов', 'дан']
202: ['нов', 'технолог', 'точн', 'земледел', 'интеллектуальн', 'земледел', 'сельск', 'хозяйств', 'технолог', 'стран', 'росс', 'нов', 'компан', 'хозяйств', 'дан', 'развит', 'систем', 'област']
203: ['эвер', 'гив', 'застря', 'компан', 'кана', 'суд', 'суэцк', 'нов', 'эвер', 'гив', 'дан', 'судн', 'сша']
204: ['штраф', 'допинг', 'спорт', 'стоп', 'росс', 'стран', 'сша', 'нов', 'допинг', 'дан', 'российск', 'спортсмен', 'спорт', 'возможн']
205: ['чист', 'энерг', 'применен', 'практик', 'тенденц', 'энерг', 'росс', 'нов', 'стран', 'дан', 'источник', 'компан', 'технолог', 'сша', 'развит']
206: ['конвенц', 'монтр', 'стамбульск', 'кана', 'турц', 'конвенц', 'турецк', 'стран', 'нов', 'кана', 'монтр', 

In [None]:
ranked_bm25_LRM = []
for query in tqdm(expanded_queries):
    scores_bm25_LRM = bm25(query, preprocessed_documents)
    ranked_bm25_LRM.append(scores_bm25_LRM)

flat_list_bm25_LRM = list(chain.from_iterable(ranked_bm25_LRM))

  0%|          | 0/76 [00:00<?, ?it/s]

In [None]:
performance_bm25_LRM = evaluate(qrels, flat_list_bm25_LRM)

In [None]:
RBP_bm25_LRM = rbp(flat_list_bm25_LRM, qrels)
RBP_1_bm25_LRM = sum(RBP_bm25_LRM.values()) / len(RBP_bm25_LRM)
performance_bm25_LRM["RBP(rel=1)"]=RBP_1_bm25_LRM

In [None]:
print("Evaluation Metrics (BM25_LRM mu=1500, k=10, n=10):")
for metric, value in performance_bm25_LRM.items():
    print(f"{metric}: {value}")

Evaluation Metrics (BM25_LRM mu=1500, k=10, n=10):
R@1000: 0.8860845105442884
AP: 0.3566284246629659
nDCG@20: 0.4146418057966574
R@100: 0.5129040497892994
RBP(rel=1): 0.4543107189418579


mu = 1000

In [None]:
lavrenko_model = LavrenkoRelevanceModel(preprocessed_documents, flat_list_bm25, mu=1000, top_k=10, top_n_terms=10)
expanded_queries = [
    lavrenko_model.expand_query(query_id, query_tokens)
    for query_id, query_tokens in preprocessed_queries
]

In [None]:
# Print expanded queries
print("Expanded Queries:")
for query_id, expanded_query in expanded_queries:
    print(f"{query_id}: {expanded_query}")

Expanded Queries:
200: ['коррупц', 'взяточничеств', 'спортивн', 'федерац', 'олимпиад', 'коррупц', 'стран', 'росс', 'сша', 'нов', 'компан', 'дан', 'международн', 'российск', 'взяточничеств']
201: ['китайск', 'инвестиц', 'ира', 'ира', 'кита', 'компан', 'стран', 'китайск', 'сша', 'проект', 'росс', 'нов', 'доллар']
202: ['нов', 'технолог', 'точн', 'земледел', 'интеллектуальн', 'земледел', 'сельск', 'хозяйств', 'технолог', 'хозяйств', 'стран', 'росс', 'нов', 'компан', 'сельск', 'систем', 'земледел', 'развит']
203: ['эвер', 'гив', 'застря', 'суэцк', 'кана', 'эвер', 'суд', 'гив', 'компан', 'судн', 'нов', 'контейнеровоз', 'дан']
204: ['штраф', 'допинг', 'спорт', 'стоп', 'росс', 'допинг', 'стран', 'сша', 'нов', 'российск', 'дан', 'спортсмен', 'штраф', 'спорт']
205: ['чист', 'энерг', 'применен', 'практик', 'тенденц', 'энерг', 'росс', 'нов', 'источник', 'стран', 'технолог', 'дан', 'компан', 'развит', 'сша']
206: ['конвенц', 'монтр', 'стамбульск', 'кана', 'турц', 'конвенц', 'турецк', 'монтр', 'кан

In [None]:
ranked_bm25_LRM = []
for query in tqdm(expanded_queries):
    scores_bm25_LRM = bm25(query, preprocessed_documents)
    ranked_bm25_LRM.append(scores_bm25_LRM)

flat_list_bm25_LRM = list(chain.from_iterable(ranked_bm25_LRM))

  0%|          | 0/76 [00:00<?, ?it/s]

In [None]:
performance_bm25_LRM = evaluate(qrels, flat_list_bm25_LRM)

In [None]:
RBP_bm25_LRM = rbp(flat_list_bm25_LRM, qrels)
RBP_1_bm25_LRM = sum(RBP_bm25_LRM.values()) / len(RBP_bm25_LRM)
performance_bm25_LRM["RBP(rel=1)"]=RBP_1_bm25_LRM

In [None]:
print("Evaluation Metrics (BM25_LRM mu=1000, k=10, n=10):")
for metric, value in performance_bm25_LRM.items():
    print(f"{metric}: {value}")

Evaluation Metrics (BM25_LRM mu=1000, k=10, n=10):
R@1000: 0.8907199595962648
AP: 0.3623874473612533
nDCG@20: 0.41367872068576134
R@100: 0.5164720489799699
RBP(rel=1): 0.4597301425586291


mu = 500

In [None]:
lavrenko_model = LavrenkoRelevanceModel(preprocessed_documents, flat_list_bm25, mu=500, top_k=10, top_n_terms=10)
expanded_queries = [
    lavrenko_model.expand_query(query_id, query_tokens)
    for query_id, query_tokens in preprocessed_queries
]

In [None]:
# Print expanded queries
print("Expanded Queries:")
for query_id, expanded_query in expanded_queries:
    print(f"{query_id}: {expanded_query}")

Expanded Queries:
200: ['коррупц', 'взяточничеств', 'спортивн', 'федерац', 'олимпиад', 'коррупц', 'взяточничеств', 'стран', 'международн', 'сша', 'росс', 'нов', 'компан', 'президент', 'дан']
201: ['китайск', 'инвестиц', 'ира', 'ира', 'кита', 'китайск', 'компан', 'стран', 'иранск', 'проект', 'инвестиц', 'сша', 'доллар']
202: ['нов', 'технолог', 'точн', 'земледел', 'интеллектуальн', 'земледел', 'сельск', 'хозяйств', 'технолог', 'хозяйств', 'сельск', 'земледел', 'систем', 'цифров', 'точн', 'стран', 'нов', 'област']
203: ['эвер', 'гив', 'застря', 'суэцк', 'кана', 'эвер', 'гив', 'суд', 'судн', 'контейнеровоз', 'компан', 'канал', 'застря']
204: ['штраф', 'допинг', 'спорт', 'стоп', 'допинг', 'росс', 'штраф', 'спортсмен', 'спорт', 'закон', 'сша', 'стран', 'российск', 'использован']
205: ['чист', 'энерг', 'применен', 'практик', 'тенденц', 'энерг', 'источник', 'нов', 'росс', 'стран', 'технолог', 'электроэнерг', 'дан', 'развит', 'чист']
206: ['конвенц', 'монтр', 'стамбульск', 'кана', 'турц', 'кон

In [None]:
ranked_bm25_LRM = []
for query in tqdm(expanded_queries):
    scores_bm25_LRM = bm25(query, preprocessed_documents)
    ranked_bm25_LRM.append(scores_bm25_LRM)

flat_list_bm25_LRM = list(chain.from_iterable(ranked_bm25_LRM))

  0%|          | 0/76 [00:00<?, ?it/s]

In [None]:
performance_bm25_LRM = evaluate(qrels, flat_list_bm25_LRM)

In [None]:
RBP_bm25_LRM = rbp(flat_list_bm25_LRM, qrels)
RBP_1_bm25_LRM = sum(RBP_bm25_LRM.values()) / len(RBP_bm25_LRM)
performance_bm25_LRM["RBP(rel=1)"]=RBP_1_bm25_LRM

In [None]:
print("Evaluation Metrics (BM25_LRM mu=500, k=10, n=10):")
for metric, value in performance_bm25_LRM.items():
    print(f"{metric}: {value}")

Evaluation Metrics (BM25_LRM mu=500, k=10, n=10):
R@1000: 0.909896905948726
AP: 0.3731322910492542
nDCG@20: 0.42929119903239465
R@100: 0.5406603652995244
RBP(rel=1): 0.46799662581730084


mu = 250

In [None]:
lavrenko_model = LavrenkoRelevanceModel(preprocessed_documents, flat_list_bm25, mu=250, top_k=10, top_n_terms=10)
expanded_queries = [
    lavrenko_model.expand_query(query_id, query_tokens)
    for query_id, query_tokens in preprocessed_queries
]

In [None]:
# Print expanded queries
print("Expanded Queries:")
for query_id, expanded_query in expanded_queries:
    print(f"{query_id}: {expanded_query}")

Expanded Queries:
200: ['коррупц', 'взяточничеств', 'спортивн', 'федерац', 'олимпиад', 'коррупц', 'взяточничеств', 'международн', 'фиф', 'мок', 'стран', 'сша', 'президент', 'власт', 'федерац']
201: ['китайск', 'инвестиц', 'ира', 'ира', 'кита', 'китайск', 'иранск', 'инвестиц', 'компан', 'доллар', 'стран', 'иран', 'проект']
202: ['нов', 'технолог', 'точн', 'земледел', 'интеллектуальн', 'земледел', 'сельск', 'хозяйств', 'технолог', 'хозяйств', 'земледел', 'сельск', 'цифров', 'систем', 'точн', 'област', 'развит', 'цифровизац']
203: ['эвер', 'гив', 'застря', 'суэцк', 'кана', 'эвер', 'гив', 'суд', 'судн', 'контейнеровоз', 'застря', 'канал', 'компан']
204: ['штраф', 'допинг', 'спорт', 'стоп', 'допинг', 'штраф', 'спортсмен', 'спорт', 'росс', 'закон', 'использован', 'сша', 'российск', 'рубл']
205: ['чист', 'энерг', 'применен', 'практик', 'тенденц', 'энерг', 'источник', 'электроэнерг', 'нов', 'чист', 'росс', 'технолог', 'стран', 'развит', 'дан']
206: ['конвенц', 'монтр', 'стамбульск', 'кана', 'т

In [None]:
ranked_bm25_LRM = []
for query in tqdm(expanded_queries):
    scores_bm25_LRM = bm25(query, preprocessed_documents)
    ranked_bm25_LRM.append(scores_bm25_LRM)

flat_list_bm25_LRM = list(chain.from_iterable(ranked_bm25_LRM))

  0%|          | 0/76 [00:00<?, ?it/s]

In [None]:
performance_bm25_LRM = evaluate(qrels, flat_list_bm25_LRM)

In [None]:
RBP_bm25_LRM = rbp(flat_list_bm25_LRM, qrels)
RBP_1_bm25_LRM = sum(RBP_bm25_LRM.values()) / len(RBP_bm25_LRM)
performance_bm25_LRM["RBP(rel=1)"]=RBP_1_bm25_LRM

In [None]:
print("Evaluation Metrics (BM25_LRM mu=250, k=10, n=10):")
for metric, value in performance_bm25_LRM.items():
    print(f"{metric}: {value}")

Evaluation Metrics (BM25_LRM mu=250, k=10, n=10):
R@1000: 0.9124330661983685
AP: 0.3673681437490892
nDCG@20: 0.41377559199883046
R@100: 0.5420569649630835
RBP(rel=1): 0.46401161671386787


mu = 1

In [None]:
lavrenko_model = LavrenkoRelevanceModel(preprocessed_documents, flat_list_bm25, mu=1, top_k=10, top_n_terms=10)
expanded_queries = [
    lavrenko_model.expand_query(query_id, query_tokens)
    for query_id, query_tokens in preprocessed_queries
]

In [None]:
# Print expanded queries
print("Expanded Queries:")
for query_id, expanded_query in expanded_queries:
    print(f"{query_id}: {expanded_query}")

Expanded Queries:
200: ['коррупц', 'взяточничеств', 'спортивн', 'федерац', 'олимпиад', 'коррупц', 'взяточничеств', 'фиф', 'международн', 'мок', 'федерац', 'спортивн', 'президент', 'власт', 'организац']
201: ['китайск', 'инвестиц', 'ира', 'ира', 'кита', 'инвестиц', 'китайск', 'иранск', 'доллар', 'млрд', 'иран', 'сектор', 'нефт']
202: ['нов', 'технолог', 'точн', 'земледел', 'интеллектуальн', 'земледел', 'сельск', 'хозяйств', 'хозяйств', 'технолог', 'земледел', 'сельск', 'точн', 'цифров', 'систем', 'сельскохозяйствен', 'цифровизац', 'област']
203: ['эвер', 'гив', 'застря', 'суэцк', 'кана', 'контейнеровоз', 'эвер', 'застря', 'гив', 'судн', 'суд', 'канал', 'Ever']
204: ['штраф', 'допинг', 'спорт', 'стоп', 'допинг', 'штраф', 'спортсмен', 'спорт', 'закон', 'использован', 'запрещен', 'рубл', 'росс', 'законопроект']
205: ['чист', 'энерг', 'применен', 'практик', 'тенденц', 'энерг', 'источник', 'электроэнерг', 'чист', 'возобновля', 'применен', 'развит', 'технолог', 'нов', 'стран']
206: ['конвенц'

In [None]:
# Run bm25() over the preprocessed documents for every expanded query (each
# `query` is a (query_id, tokens) pair), then flatten the per-query results
# into one list.
ranked_bm25_LRM = []
for query in tqdm(expanded_queries):
    scores_bm25_LRM = bm25(query, preprocessed_documents)
    ranked_bm25_LRM.append(scores_bm25_LRM)

flat_list_bm25_LRM = list(chain.from_iterable(ranked_bm25_LRM))

  0%|          | 0/76 [00:00<?, ?it/s]

In [None]:
# Score the flattened run against the relevance judgements; the result is used
# below as a metric-name -> value mapping.
performance_bm25_LRM = evaluate(qrels, flat_list_bm25_LRM)

In [None]:
# rbp() yields a mapping (presumably query -> RBP score — TODO confirm); store
# the mean of its values alongside the other metrics under "RBP(rel=1)".
RBP_bm25_LRM = rbp(flat_list_bm25_LRM, qrels)
RBP_1_bm25_LRM = sum(RBP_bm25_LRM.values()) / len(RBP_bm25_LRM)
performance_bm25_LRM["RBP(rel=1)"]=RBP_1_bm25_LRM

In [None]:
# Report every metric collected for the mu=1, k=10, n=10 run.
print("Evaluation Metrics (BM25_LRM mu=1, k=10, n=10):")
for metric, value in performance_bm25_LRM.items():
    print(f"{metric}: {value}")

Evaluation Metrics (BM25_LRM mu=1, k=10, n=10):
R@1000: 0.9204044831468705
AP: 0.3828646127084423
nDCG@20: 0.4341763439703586
R@100: 0.5505193775570902
RBP(rel=1): 0.47271449980124286


mu = 0

In [None]:
# Build the relevance model with mu=0 (no smoothing term in P(w|d)), top_k=10,
# top_n_terms=10, and expand every preprocessed query.
lavrenko_model = LavrenkoRelevanceModel(preprocessed_documents, flat_list_bm25, mu=0, top_k=10, top_n_terms=10)
expanded_queries = [
    lavrenko_model.expand_query(query_id, query_tokens)
    for query_id, query_tokens in preprocessed_queries
]

In [None]:
# Print each query id with its expanded token list (original tokens first,
# expansion terms appended) for manual inspection.
print("Expanded Queries:")
for query_id, expanded_query in expanded_queries:
    print(f"{query_id}: {expanded_query}")

Expanded Queries:
200: ['коррупц', 'взяточничеств', 'спортивн', 'федерац', 'олимпиад', 'коррупц', 'взяточничеств', 'фиф', 'международн', 'мок', 'федерац', 'спортивн', 'президент', 'власт', 'организац']
201: ['китайск', 'инвестиц', 'ира', 'ира', 'кита', 'инвестиц', 'китайск', 'иранск', 'доллар', 'млрд', 'иран', 'сектор', 'нефт']
202: ['нов', 'технолог', 'точн', 'земледел', 'интеллектуальн', 'земледел', 'сельск', 'хозяйств', 'хозяйств', 'технолог', 'земледел', 'сельск', 'точн', 'цифров', 'систем', 'сельскохозяйствен', 'цифровизац', 'област']
203: ['эвер', 'гив', 'застря', 'суэцк', 'кана', 'контейнеровоз', 'эвер', 'застря', 'гив', 'судн', 'суд', 'канал', 'Ever']
204: ['штраф', 'допинг', 'спорт', 'стоп', 'допинг', 'штраф', 'спортсмен', 'спорт', 'закон', 'использован', 'запрещен', 'рубл', 'росс', 'законопроект']
205: ['чист', 'энерг', 'применен', 'практик', 'тенденц', 'энерг', 'источник', 'электроэнерг', 'чист', 'возобновля', 'применен', 'развит', 'технолог', 'нов', 'стран']
206: ['конвенц'

In [None]:
# Run bm25() over the preprocessed documents for every expanded query (each
# `query` is a (query_id, tokens) pair), then flatten the per-query results
# into one list.
ranked_bm25_LRM = []
for query in tqdm(expanded_queries):
    scores_bm25_LRM = bm25(query, preprocessed_documents)
    ranked_bm25_LRM.append(scores_bm25_LRM)

flat_list_bm25_LRM = list(chain.from_iterable(ranked_bm25_LRM))

  0%|          | 0/76 [00:00<?, ?it/s]

In [None]:
# Score the flattened run against the relevance judgements; the result is used
# below as a metric-name -> value mapping.
performance_bm25_LRM = evaluate(qrels, flat_list_bm25_LRM)

In [None]:
# rbp() yields a mapping (presumably query -> RBP score — TODO confirm); store
# the mean of its values alongside the other metrics under "RBP(rel=1)".
RBP_bm25_LRM = rbp(flat_list_bm25_LRM, qrels)
RBP_1_bm25_LRM = sum(RBP_bm25_LRM.values()) / len(RBP_bm25_LRM)
performance_bm25_LRM["RBP(rel=1)"]=RBP_1_bm25_LRM

In [None]:
# Report every metric collected for the mu=0, k=10, n=10 run; the header must
# name the configuration the relevance model was actually built with.
print("Evaluation Metrics (BM25_LRM mu=0, k=10, n=10):")
for metric, value in performance_bm25_LRM.items():
    print(f"{metric}: {value}")

Evaluation Metrics (BM25_LRM mu=0, k=10, n=10):
R@1000: 0.9204044831468705
AP: 0.382862987709192
nDCG@20: 0.4341763439703586
R@100: 0.5502885372985492
RBP(rel=1): 0.47271449970141716


The best-performing values of mu are 1 and 0. Since mu = 0 means no smoothing of the term probabilities, we opt for mu = 1, which also avoids a division by zero (an empty document would give doc_length + mu = 0) when more documents are taken into account.

mu = 1, k = 10, varying n

n = 5

In [None]:
# Build the relevance model with mu=1, top_k=10 and only 5 expansion terms,
# then expand every preprocessed query.
lavrenko_model = LavrenkoRelevanceModel(preprocessed_documents, flat_list_bm25, mu=1, top_k=10, top_n_terms=5)
expanded_queries = [
    lavrenko_model.expand_query(query_id, query_tokens)
    for query_id, query_tokens in preprocessed_queries
]

In [None]:
# Print each query id with its expanded token list (original tokens first,
# expansion terms appended) for manual inspection.
print("Expanded Queries:")
for query_id, expanded_query in expanded_queries:
    print(f"{query_id}: {expanded_query}")

Expanded Queries:
200: ['коррупц', 'взяточничеств', 'спортивн', 'федерац', 'олимпиад', 'коррупц', 'взяточничеств', 'фиф', 'международн', 'мок']
201: ['китайск', 'инвестиц', 'ира', 'ира', 'кита', 'инвестиц', 'китайск', 'иранск']
202: ['нов', 'технолог', 'точн', 'земледел', 'интеллектуальн', 'земледел', 'сельск', 'хозяйств', 'хозяйств', 'технолог', 'земледел', 'сельск', 'точн']
203: ['эвер', 'гив', 'застря', 'суэцк', 'кана', 'контейнеровоз', 'эвер', 'застря']
204: ['штраф', 'допинг', 'спорт', 'стоп', 'допинг', 'штраф', 'спортсмен', 'спорт', 'закон']
205: ['чист', 'энерг', 'применен', 'практик', 'тенденц', 'энерг', 'источник', 'электроэнерг', 'чист', 'возобновля']
206: ['конвенц', 'монтр', 'стамбульск', 'кана', 'турц', 'конвенц', 'монтр', 'кана', 'турецк']
207: ['кит', 'выброс', 'берег', 'тасман', '2020', 'берег', 'выброс', 'кит', 'дельфин', 'животн']
208: ['сокращен', 'экспорт', 'нефт', 'росс', 'нефт', 'экспорт', 'сокращен', 'млн', 'добыч']
209: ['ручн', 'влия', 'здоров', 'ручн', 'влия',

In [None]:
# Run bm25() over the preprocessed documents for every expanded query (each
# `query` is a (query_id, tokens) pair), then flatten the per-query results
# into one list.
ranked_bm25_LRM = []
for query in tqdm(expanded_queries):
    scores_bm25_LRM = bm25(query, preprocessed_documents)
    ranked_bm25_LRM.append(scores_bm25_LRM)

flat_list_bm25_LRM = list(chain.from_iterable(ranked_bm25_LRM))

  0%|          | 0/76 [00:00<?, ?it/s]

In [None]:
# Score the flattened run against the relevance judgements; the result is used
# below as a metric-name -> value mapping.
performance_bm25_LRM = evaluate(qrels, flat_list_bm25_LRM)

In [None]:
# rbp() yields a mapping (presumably query -> RBP score — TODO confirm); store
# the mean of its values alongside the other metrics under "RBP(rel=1)".
RBP_bm25_LRM = rbp(flat_list_bm25_LRM, qrels)
RBP_1_bm25_LRM = sum(RBP_bm25_LRM.values()) / len(RBP_bm25_LRM)
performance_bm25_LRM["RBP(rel=1)"]=RBP_1_bm25_LRM

In [None]:
# Report every metric collected for the mu=1, k=10, n=5 run; the header must
# name the configuration the relevance model was actually built with.
print("Evaluation Metrics (BM25_LRM mu=1, k=10, n=5):")
for metric, value in performance_bm25_LRM.items():
    print(f"{metric}: {value}")

Evaluation Metrics (BM25_LRM mu=1, k=10, n=5):
R@1000: 0.9075570017097686
AP: 0.3694768064250054
nDCG@20: 0.41904010374060635
R@100: 0.538800670235451
RBP(rel=1): 0.46410764720023784


n = 12

In [None]:
# Build the relevance model with mu=1, top_k=10 and 12 expansion terms, then
# expand every preprocessed query.
lavrenko_model = LavrenkoRelevanceModel(preprocessed_documents, flat_list_bm25, mu=1, top_k=10, top_n_terms=12)
expanded_queries = [
    lavrenko_model.expand_query(query_id, query_tokens)
    for query_id, query_tokens in preprocessed_queries
]

In [None]:
# Print each query id with its expanded token list (original tokens first,
# expansion terms appended) for manual inspection.
print("Expanded Queries:")
for query_id, expanded_query in expanded_queries:
    print(f"{query_id}: {expanded_query}")

Expanded Queries:
200: ['коррупц', 'взяточничеств', 'спортивн', 'федерац', 'олимпиад', 'коррупц', 'взяточничеств', 'фиф', 'международн', 'мок', 'федерац', 'спортивн', 'президент', 'власт', 'организац', 'олимпийск', 'подозрева']
201: ['китайск', 'инвестиц', 'ира', 'ира', 'кита', 'инвестиц', 'китайск', 'иранск', 'доллар', 'млрд', 'иран', 'сектор', 'нефт', 'компан', 'стран']
202: ['нов', 'технолог', 'точн', 'земледел', 'интеллектуальн', 'земледел', 'сельск', 'хозяйств', 'хозяйств', 'технолог', 'земледел', 'сельск', 'точн', 'цифров', 'систем', 'сельскохозяйствен', 'цифровизац', 'област', 'выставк', 'апк']
203: ['эвер', 'гив', 'застря', 'суэцк', 'кана', 'контейнеровоз', 'эвер', 'застря', 'гив', 'судн', 'суд', 'канал', 'Ever', 'Given', 'мел']
204: ['штраф', 'допинг', 'спорт', 'стоп', 'допинг', 'штраф', 'спортсмен', 'спорт', 'закон', 'использован', 'запрещен', 'рубл', 'росс', 'законопроект', 'ответствен', 'уголовн']
205: ['чист', 'энерг', 'применен', 'практик', 'тенденц', 'энерг', 'источник',

In [None]:
# Run bm25() over the preprocessed documents for every expanded query (each
# `query` is a (query_id, tokens) pair), then flatten the per-query results
# into one list.
ranked_bm25_LRM = []
for query in tqdm(expanded_queries):
    scores_bm25_LRM = bm25(query, preprocessed_documents)
    ranked_bm25_LRM.append(scores_bm25_LRM)

flat_list_bm25_LRM = list(chain.from_iterable(ranked_bm25_LRM))

  0%|          | 0/76 [00:00<?, ?it/s]

In [None]:
# Score the flattened run against the relevance judgements; the result is used
# below as a metric-name -> value mapping.
performance_bm25_LRM = evaluate(qrels, flat_list_bm25_LRM)

In [None]:
# rbp() yields a mapping (presumably query -> RBP score — TODO confirm); store
# the mean of its values alongside the other metrics under "RBP(rel=1)".
RBP_bm25_LRM = rbp(flat_list_bm25_LRM, qrels)
RBP_1_bm25_LRM = sum(RBP_bm25_LRM.values()) / len(RBP_bm25_LRM)
performance_bm25_LRM["RBP(rel=1)"]=RBP_1_bm25_LRM

In [None]:
# Report every metric collected for the mu=1, k=10, n=12 run.
print("Evaluation Metrics (BM25_LRM mu=1, k=10, n=12):")
for metric, value in performance_bm25_LRM.items():
    print(f"{metric}: {value}")

Evaluation Metrics (BM25_LRM mu=1, k=10, n=12):
R@1000: 0.9181351038160972
AP: 0.38063058155510815
nDCG@20: 0.43087958427389333
R@100: 0.5477209362340211
RBP(rel=1): 0.4701601142182648


n = 8

In [None]:
# Build the relevance model with mu=1, top_k=10 and 8 expansion terms, then
# expand every preprocessed query.
lavrenko_model = LavrenkoRelevanceModel(preprocessed_documents, flat_list_bm25, mu=1, top_k=10, top_n_terms=8)
expanded_queries = [
    lavrenko_model.expand_query(query_id, query_tokens)
    for query_id, query_tokens in preprocessed_queries
]

In [None]:
# Print each query id with its expanded token list (original tokens first,
# expansion terms appended) for manual inspection.
print("Expanded Queries:")
for query_id, expanded_query in expanded_queries:
    print(f"{query_id}: {expanded_query}")

Expanded Queries:
200: ['коррупц', 'взяточничеств', 'спортивн', 'федерац', 'олимпиад', 'коррупц', 'взяточничеств', 'фиф', 'международн', 'мок', 'федерац', 'спортивн', 'президент']
201: ['китайск', 'инвестиц', 'ира', 'ира', 'кита', 'инвестиц', 'китайск', 'иранск', 'доллар', 'млрд', 'иран']
202: ['нов', 'технолог', 'точн', 'земледел', 'интеллектуальн', 'земледел', 'сельск', 'хозяйств', 'хозяйств', 'технолог', 'земледел', 'сельск', 'точн', 'цифров', 'систем', 'сельскохозяйствен']
203: ['эвер', 'гив', 'застря', 'суэцк', 'кана', 'контейнеровоз', 'эвер', 'застря', 'гив', 'судн', 'суд']
204: ['штраф', 'допинг', 'спорт', 'стоп', 'допинг', 'штраф', 'спортсмен', 'спорт', 'закон', 'использован', 'запрещен', 'рубл']
205: ['чист', 'энерг', 'применен', 'практик', 'тенденц', 'энерг', 'источник', 'электроэнерг', 'чист', 'возобновля', 'применен', 'развит', 'технолог']
206: ['конвенц', 'монтр', 'стамбульск', 'кана', 'турц', 'конвенц', 'монтр', 'кана', 'турецк', 'стамбул', 'эрдога', 'пролив']
207: ['кит'

In [None]:
# Run bm25() over the preprocessed documents for every expanded query (each
# `query` is a (query_id, tokens) pair), then flatten the per-query results
# into one list.
ranked_bm25_LRM = []
for query in tqdm(expanded_queries):
    scores_bm25_LRM = bm25(query, preprocessed_documents)
    ranked_bm25_LRM.append(scores_bm25_LRM)

flat_list_bm25_LRM = list(chain.from_iterable(ranked_bm25_LRM))

  0%|          | 0/76 [00:00<?, ?it/s]

In [None]:
# Score the flattened run against the relevance judgements; the result is used
# below as a metric-name -> value mapping.
performance_bm25_LRM = evaluate(qrels, flat_list_bm25_LRM)

In [None]:
# rbp() yields a mapping (presumably query -> RBP score — TODO confirm); store
# the mean of its values alongside the other metrics under "RBP(rel=1)".
RBP_bm25_LRM = rbp(flat_list_bm25_LRM, qrels)
RBP_1_bm25_LRM = sum(RBP_bm25_LRM.values()) / len(RBP_bm25_LRM)
performance_bm25_LRM["RBP(rel=1)"]=RBP_1_bm25_LRM

In [None]:
# Report every metric collected for the mu=1, k=10, n=8 run.
print("Evaluation Metrics (BM25_LRM mu=1, k=10, n=8):")
for metric, value in performance_bm25_LRM.items():
    print(f"{metric}: {value}")

Evaluation Metrics (BM25_LRM mu=1, k=10, n=8):
R@1000: 0.9192017537742866
nDCG@20: 0.42978177769097003
R@100: 0.5514849425286348
AP: 0.380815910450942
RBP(rel=1): 0.4766286012946052


n = 8 and n = 10 give very similar results. We opt for n = 10 because of its higher R@1000, AP and nDCG@20 (n = 8 is marginally better on R@100 and RBP).

mu = 1, n = 10, varying value of k

k = 20

In [None]:
# Build the relevance model with mu=1, 10 expansion terms and the top 20
# pre-ranked documents as feedback, then expand every preprocessed query.
lavrenko_model = LavrenkoRelevanceModel(preprocessed_documents, flat_list_bm25, mu=1, top_k=20, top_n_terms=10)
expanded_queries = [
    lavrenko_model.expand_query(query_id, query_tokens)
    for query_id, query_tokens in preprocessed_queries
]

In [None]:
# Print each query id with its expanded token list (original tokens first,
# expansion terms appended) for manual inspection.
print("Expanded Queries:")
for query_id, expanded_query in expanded_queries:
    print(f"{query_id}: {expanded_query}")

Expanded Queries:
200: ['коррупц', 'взяточничеств', 'спортивн', 'федерац', 'олимпиад', 'коррупц', 'взяточничеств', 'федерац', 'президент', 'международн', 'борьб', 'фиф', 'власт', 'глав', 'стран']
201: ['китайск', 'инвестиц', 'ира', 'ира', 'инвестиц', 'кита', 'иранск', 'китайск', 'иран', 'стран', 'млрд', 'доллар', 'иностра']
202: ['нов', 'технолог', 'точн', 'земледел', 'интеллектуальн', 'земледел', 'сельск', 'хозяйств', 'хозяйств', 'технолог', 'земледел', 'сельск', 'точн', 'систем', 'цифров', 'област', 'внедрен', 'сельскохозяйствен']
203: ['эвер', 'гив', 'застря', 'застря', 'суэцк', 'контейнеровоз', 'кана', 'суд', 'судн', 'эвер', 'гив', 'канал', 'Ever']
204: ['штраф', 'допинг', 'спорт', 'стоп', 'допинг', 'спорт', 'штраф', 'спортсмен', 'закон', 'антидопингов', 'запрещен', 'сша', 'использован', 'уголовн']
205: ['чист', 'энерг', 'применен', 'практик', 'тенденц', 'энерг', 'источник', 'чист', 'электроэнерг', 'нов', 'применен', 'возобновля', 'солнечн', 'технолог', 'развит']
206: ['конвенц', '

In [None]:
# Run bm25() over the preprocessed documents for every expanded query (each
# `query` is a (query_id, tokens) pair), then flatten the per-query results
# into one list.
ranked_bm25_LRM = []
for query in tqdm(expanded_queries):
    scores_bm25_LRM = bm25(query, preprocessed_documents)
    ranked_bm25_LRM.append(scores_bm25_LRM)

flat_list_bm25_LRM = list(chain.from_iterable(ranked_bm25_LRM))

  0%|          | 0/76 [00:00<?, ?it/s]

In [None]:
# Score the flattened run against the relevance judgements; the result is used
# below as a metric-name -> value mapping.
performance_bm25_LRM = evaluate(qrels, flat_list_bm25_LRM)

In [None]:
# rbp() yields a mapping (presumably query -> RBP score — TODO confirm); store
# the mean of its values alongside the other metrics under "RBP(rel=1)".
RBP_bm25_LRM = rbp(flat_list_bm25_LRM, qrels)
RBP_1_bm25_LRM = sum(RBP_bm25_LRM.values()) / len(RBP_bm25_LRM)
performance_bm25_LRM["RBP(rel=1)"]=RBP_1_bm25_LRM

In [None]:
# Report every metric collected for the mu=1, k=20, n=10 run.
print("Evaluation Metrics (BM25_LRM mu=1, k=20, n=10):")
for metric, value in performance_bm25_LRM.items():
    print(f"{metric}: {value}")

Evaluation Metrics (BM25_LRM mu=1, k=20, n=10):
R@1000: 0.9181652548529502
nDCG@20: 0.42285857177059133
R@100: 0.5445505549059305
AP: 0.37590231533332163
RBP(rel=1): 0.46175259447792694


k = 5

In [None]:
# Build the relevance model with mu=1, 10 expansion terms and only the top 5
# pre-ranked documents as feedback, then expand every preprocessed query.
lavrenko_model = LavrenkoRelevanceModel(preprocessed_documents, flat_list_bm25, mu=1, top_k=5, top_n_terms=10)
expanded_queries = [
    lavrenko_model.expand_query(query_id, query_tokens)
    for query_id, query_tokens in preprocessed_queries
]

In [None]:
# Print each query id with its expanded token list (original tokens first,
# expansion terms appended) for manual inspection.
print("Expanded Queries:")
for query_id, expanded_query in expanded_queries:
    print(f"{query_id}: {expanded_query}")

Expanded Queries:
200: ['коррупц', 'взяточничеств', 'спортивн', 'федерац', 'олимпиад', 'коррупц', 'международн', 'мок', 'федерац', 'тыс', 'спортивн', 'фиф', 'взяточничеств', 'получен', 'бывш']
201: ['китайск', 'инвестиц', 'ира', 'ира', 'кита', 'китайск', 'доллар', 'инвестиц', 'млрд', 'экономик', 'нефт', 'иран', 'санкц']
202: ['нов', 'технолог', 'точн', 'земледел', 'интеллектуальн', 'земледел', 'сельск', 'хозяйств', 'хозяйств', 'технолог', 'сельск', 'земледел', 'точн', 'выставк', 'сельскохозяйствен', 'кра', 'систем', 'цифровизац']
203: ['эвер', 'гив', 'застря', 'суэцк', 'кана', 'контейнеровоз', 'эвер', 'гив', 'суд', 'судн', 'канал', 'Ever', 'Given']
204: ['штраф', 'допинг', 'спорт', 'стоп', 'штраф', 'допинг', 'спортсмен', 'рубл', 'спорт', 'запрещен', 'специалист', 'законопроект', 'закон', 'тренер']
205: ['чист', 'энерг', 'применен', 'практик', 'тенденц', 'энерг', 'источник', 'росс', 'чист', 'электроэнерг', 'угол', 'дан', 'применен', 'лазерн', 'путин']
206: ['конвенц', 'монтр', 'стамбуль

In [None]:
# Run bm25() over the preprocessed documents for every expanded query (each
# `query` is a (query_id, tokens) pair), then flatten the per-query results
# into one list.
ranked_bm25_LRM = []
for query in tqdm(expanded_queries):
    scores_bm25_LRM = bm25(query, preprocessed_documents)
    ranked_bm25_LRM.append(scores_bm25_LRM)

flat_list_bm25_LRM = list(chain.from_iterable(ranked_bm25_LRM))

  0%|          | 0/76 [00:00<?, ?it/s]

In [None]:
# Score the flattened run against the relevance judgements; the result is used
# below as a metric-name -> value mapping.
performance_bm25_LRM = evaluate(qrels, flat_list_bm25_LRM)

In [None]:
# rbp() yields a mapping (presumably query -> RBP score — TODO confirm); store
# the mean of its values alongside the other metrics under "RBP(rel=1)".
RBP_bm25_LRM = rbp(flat_list_bm25_LRM, qrels)
RBP_1_bm25_LRM = sum(RBP_bm25_LRM.values()) / len(RBP_bm25_LRM)
performance_bm25_LRM["RBP(rel=1)"]=RBP_1_bm25_LRM

In [None]:
# Report every metric collected for the mu=1, k=5, n=10 run.
print("Evaluation Metrics (BM25_LRM mu=1, k=5, n=10):")
for metric, value in performance_bm25_LRM.items():
    print(f"{metric}: {value}")

Evaluation Metrics (BM25_LRM mu=1, k=5, n=10):
R@1000: 0.9169238718196574
nDCG@20: 0.4265111955060016
R@100: 0.5436222385129712
AP: 0.3753169032096758
RBP(rel=1): 0.4618200259783279


k=50

In [None]:
# Build the relevance model with mu=1, 10 expansion terms and the top 50
# pre-ranked documents as feedback, then expand every preprocessed query.
lavrenko_model = LavrenkoRelevanceModel(preprocessed_documents, flat_list_bm25, mu=1, top_k=50, top_n_terms=10)
expanded_queries = [
    lavrenko_model.expand_query(query_id, query_tokens)
    for query_id, query_tokens in preprocessed_queries
]

In [None]:
# Print each query id with its expanded token list (original tokens first,
# expansion terms appended) for manual inspection.
print("Expanded Queries:")
for query_id, expanded_query in expanded_queries:
    print(f"{query_id}: {expanded_query}")

Expanded Queries:
200: ['коррупц', 'взяточничеств', 'спортивн', 'федерац', 'олимпиад', 'коррупц', 'взяточничеств', 'президент', 'федерац', 'международн', 'фиф', 'организац', 'борьб', 'стран', 'суд']
201: ['китайск', 'инвестиц', 'ира', 'ира', 'инвестиц', 'кита', 'стран', 'китайск', 'иранск', 'компан', 'санкц', 'сша', 'иран']
202: ['нов', 'технолог', 'точн', 'земледел', 'интеллектуальн', 'земледел', 'сельск', 'хозяйств', 'хозяйств', 'технолог', 'сельск', 'земледел', 'област', 'цифров', 'систем', 'точн', 'развит', 'внедрен']
203: ['эвер', 'гив', 'застря', 'застря', 'суэцк', 'кана', 'контейнеровоз', 'Ever', 'Given', 'суд', 'судн', 'канал', 'мел']
204: ['штраф', 'допинг', 'спорт', 'стоп', 'допинг', 'спорт', 'спортсмен', 'антидопингов', 'штраф', 'росс', 'российск', 'закон', 'спортсм', 'запрещен']
205: ['чист', 'энерг', 'применен', 'практик', 'тенденц', 'энерг', 'чист', 'источник', 'энергетик', 'солнечн', 'возобновля', 'нов', 'энергетическ', 'развит', 'электроэнерг']
206: ['конвенц', 'монтр',

In [None]:
# Run bm25() over the preprocessed documents for every expanded query (each
# `query` is a (query_id, tokens) pair), then flatten the per-query results
# into one list.
ranked_bm25_LRM = []
for query in tqdm(expanded_queries):
    scores_bm25_LRM = bm25(query, preprocessed_documents)
    ranked_bm25_LRM.append(scores_bm25_LRM)

flat_list_bm25_LRM = list(chain.from_iterable(ranked_bm25_LRM))

  0%|          | 0/76 [00:00<?, ?it/s]

In [None]:
# Score the flattened run against the relevance judgements; the result is used
# below as a metric-name -> value mapping.
performance_bm25_LRM = evaluate(qrels, flat_list_bm25_LRM)

In [None]:
# rbp() yields a mapping (presumably query -> RBP score — TODO confirm); store
# the mean of its values alongside the other metrics under "RBP(rel=1)".
RBP_bm25_LRM = rbp(flat_list_bm25_LRM, qrels)
RBP_1_bm25_LRM = sum(RBP_bm25_LRM.values()) / len(RBP_bm25_LRM)
performance_bm25_LRM["RBP(rel=1)"]=RBP_1_bm25_LRM

In [None]:
# Report every metric collected for the mu=1, k=50, n=10 run.
print("Evaluation Metrics (BM25_LRM mu=1, k=50, n=10):")
for metric, value in performance_bm25_LRM.items():
    print(f"{metric}: {value}")

Evaluation Metrics (BM25_LRM mu=1, k=50, n=10):
R@1000: 0.9232079914892544
nDCG@20: 0.4082565637266586
R@100: 0.5290187006488946
AP: 0.359010003953024
RBP(rel=1): 0.4463805608317561


k = 15

In [None]:
# Build the relevance model with mu=1, 10 expansion terms and the top 15
# pre-ranked documents as feedback, then expand every preprocessed query.
lavrenko_model = LavrenkoRelevanceModel(preprocessed_documents, flat_list_bm25, mu=1, top_k=15, top_n_terms=10)
expanded_queries = [
    lavrenko_model.expand_query(query_id, query_tokens)
    for query_id, query_tokens in preprocessed_queries
]

In [None]:
# Print each query id with its expanded token list (original tokens first,
# expansion terms appended) for manual inspection.
print("Expanded Queries:")
for query_id, expanded_query in expanded_queries:
    print(f"{query_id}: {expanded_query}")

Expanded Queries:
200: ['коррупц', 'взяточничеств', 'спортивн', 'федерац', 'олимпиад', 'коррупц', 'взяточничеств', 'федерац', 'международн', 'президент', 'фиф', 'мок', 'организац', 'олимпийск', 'власт']
201: ['китайск', 'инвестиц', 'ира', 'ира', 'кита', 'китайск', 'инвестиц', 'иранск', 'доллар', 'иран', 'млрд', 'компан', 'стран']
202: ['нов', 'технолог', 'точн', 'земледел', 'интеллектуальн', 'земледел', 'сельск', 'хозяйств', 'хозяйств', 'технолог', 'земледел', 'сельск', 'точн', 'област', 'систем', 'цифров', 'развит', 'сельскохозяйствен']
203: ['эвер', 'гив', 'застря', 'застря', 'суэцк', 'кана', 'контейнеровоз', 'суд', 'эвер', 'гив', 'судн', 'канал', 'Ever']
204: ['штраф', 'допинг', 'спорт', 'стоп', 'допинг', 'штраф', 'спортсмен', 'спорт', 'закон', 'нарушен', 'запрещен', 'антидопингов', 'использован', 'росс']
205: ['чист', 'энерг', 'применен', 'практик', 'тенденц', 'энерг', 'источник', 'чист', 'электроэнерг', 'нов', 'возобновля', 'применен', 'технолог', 'солнечн', 'стран']
206: ['конвен

In [None]:
# Run bm25() over the preprocessed documents for every expanded query (each
# `query` is a (query_id, tokens) pair), then flatten the per-query results
# into one list.
ranked_bm25_LRM = []
for query in tqdm(expanded_queries):
    scores_bm25_LRM = bm25(query, preprocessed_documents)
    ranked_bm25_LRM.append(scores_bm25_LRM)

flat_list_bm25_LRM = list(chain.from_iterable(ranked_bm25_LRM))

  0%|          | 0/76 [00:00<?, ?it/s]

In [None]:
# Score the flattened run against the relevance judgements; the result is used
# below as a metric-name -> value mapping.
performance_bm25_LRM = evaluate(qrels, flat_list_bm25_LRM)

In [None]:
# rbp() yields a mapping (presumably query -> RBP score — TODO confirm); store
# the mean of its values alongside the other metrics under "RBP(rel=1)".
RBP_bm25_LRM = rbp(flat_list_bm25_LRM, qrels)
RBP_1_bm25_LRM = sum(RBP_bm25_LRM.values()) / len(RBP_bm25_LRM)
performance_bm25_LRM["RBP(rel=1)"]=RBP_1_bm25_LRM

In [None]:
# Report every metric collected for the mu=1, k=15, n=10 run.
print("Evaluation Metrics (BM25_LRM mu=1, k=15, n=10):")
for metric, value in performance_bm25_LRM.items():
    print(f"{metric}: {value}")

Evaluation Metrics (BM25_LRM mu=1, k=15, n=10):
R@1000: 0.9126074392379951
nDCG@20: 0.42626887793725937
R@100: 0.5395982078699959
AP: 0.37397235423822206
RBP(rel=1): 0.47399603892972036


The best configuration among those observed is mu = 1, k = 10, n = 10.

As an observation, for some queries the terms with the highest weight are those that are already in the query, so the expanded queries contain repeated terms. We attempt to handle this by excluding terms that are already in the query from the top-N list, and we compare the results with the previous implementation for the best configuration.

In [None]:
class LavrenkoRelevanceModel:
    """Query expansion with a relevance model estimated from top-ranked documents.

    Candidate terms are all terms of the top-k pre-ranked documents of a query.
    Each candidate is weighted by the sum, over those documents, of the smoothed
    P(w|d) multiplied by the document's pre-ranking score. The ``top_n_terms``
    highest-weighted candidates that are not already in the query are appended
    to the original query tokens.
    """

    def __init__(self, collection, preranking, mu=1, top_k=10, top_n_terms=10):
        """
        :param collection: List of tokenized documents [(doc_id, [tokens])]
        :param preranking: scores [(query_id, doc_id, score)] obtained by previous ranking
        :param mu: Dirichlet smoothing parameter
        :param top_k: Number of top-ranked documents to use for relevance model
        :param top_n_terms: Number of terms to expand the query with
        """
        self.documents = {doc_id: tokens for doc_id, tokens in collection}
        self.preranking = preranking
        self.mu = mu
        self.top_k = top_k
        self.top_n_terms = top_n_terms

        # Index the pre-ranking by query once, so that fetching the top-k
        # documents of a query does not rescan every (query, doc, score) triple.
        # The index is a snapshot taken at construction time.
        self._ranking_by_query = defaultdict(list)
        for q_id, doc_id, score in preranking:
            self._ranking_by_query[q_id].append((doc_id, score))

        # Per-document term counts, filled lazily for feedback documents only.
        self._doc_term_counts = {}

        # building a background collection model
        self.background_model = self._compute_background_model()

    def _compute_background_model(self):
        """Compute corpus-wide relative term frequencies P(w|C).

        :return: dict mapping each term to count(term) / total number of tokens
        """
        term_frequencies = Counter()
        total_terms = 0
        # Accumulate counts document by document instead of first copying the
        # whole corpus into one flat list.
        for tokens in self.documents.values():
            term_frequencies.update(tokens)
            total_terms += len(tokens)
        return {term: freq / total_terms for term, freq in term_frequencies.items()}

    def _get_top_k_documents(self, query_id):
        """Retrieve the top-k documents for a given query.

        :return: list of (doc_id, score), highest score first; ties keep their
                 pre-ranking order. Empty if the query has no pre-ranked documents.
        """
        candidates = self._ranking_by_query.get(query_id, [])
        return sorted(candidates, key=lambda x: x[1], reverse=True)[:self.top_k]

    def _term_probability_given_document(self, term, document_id):
        """Compute the smoothed P(w|d) for a term given a document.

        P(w|d) = (tf(w, d) + mu * P(w|C)) / (|d| + mu); terms unseen in the
        collection fall back to P(w|C) = 1e-12.

        :raises KeyError: if ``document_id`` is not part of the collection
        """
        tokens = self.documents[document_id]
        counts = self._doc_term_counts.get(document_id)
        if counts is None:
            # Count once per document; list.count() would rescan the document
            # for every candidate term.
            counts = self._doc_term_counts[document_id] = Counter(tokens)

        denominator = len(tokens) + self.mu
        if denominator == 0:
            # Empty document with mu == 0: there is no evidence for any term.
            return 0.0

        P_w_C = self.background_model.get(term, 1e-12)
        return (counts[term] + self.mu * P_w_C) / denominator

    def _weighted_term_probability(self, term, top_docs):
        """Sum P(w|d) * score over already-selected (doc_id, score) pairs."""
        probability = 0
        for doc_id, bm25_score in top_docs:
            P_w_d = self._term_probability_given_document(term, doc_id)
            probability += P_w_d * bm25_score
        return probability

    def _term_probability_given_query(self, term, query_id):
        """Compute the (unnormalised) P(w|q) for a term given a query."""
        return self._weighted_term_probability(term, self._get_top_k_documents(query_id))

    def expand_query(self, query_id, query_tokens):
        """Expand a query using the Lavrenko relevance model.

        :param query_id: id used to look up the query's pre-ranked documents
        :param query_tokens: list of the original query tokens
        :return: (query_id, query_tokens + up to ``top_n_terms`` new terms)
        """
        # The feedback documents are selected once and reused for every term.
        top_docs = self._get_top_k_documents(query_id)
        all_terms = set()
        for doc_id, _ in top_docs:
            all_terms.update(self.documents[doc_id])

        # calculating P(w|q) for all terms in top-k documents
        term_probabilities = {
            term: self._weighted_term_probability(term, top_docs)
            for term in all_terms
        }

        # ranking terms by P(w|q) and select top n terms which do not appear in the query
        ranked_terms = sorted(term_probabilities.items(), key=lambda x: x[1], reverse=True)
        known_terms = set(query_tokens)
        top_new_terms = [
            term for term, _ in ranked_terms if term not in known_terms
        ][:self.top_n_terms]

        # combine original query with expanded terms
        expanded_query = query_tokens + top_new_terms
        return query_id, expanded_query


In [None]:
# Rebuild the best configuration (mu=1, top_k=10, top_n_terms=10) with the
# redefined class, which skips expansion terms already present in the query.
lavrenko_model = LavrenkoRelevanceModel(preprocessed_documents, flat_list_bm25, mu=1, top_k=10, top_n_terms=10)
expanded_queries = [
    lavrenko_model.expand_query(query_id, query_tokens)
    for query_id, query_tokens in preprocessed_queries
]

In [None]:
# Print each query id with its expanded token list (original tokens first,
# expansion terms appended) for manual inspection.
print("Expanded Queries:")
for query_id, expanded_query in expanded_queries:
    print(f"{query_id}: {expanded_query}")

Expanded Queries:
200: ['коррупц', 'взяточничеств', 'спортивн', 'федерац', 'олимпиад', 'фиф', 'международн', 'мок', 'президент', 'власт', 'организац', 'олимпийск', 'подозрева', 'чиновник', 'программ']
201: ['китайск', 'инвестиц', 'ира', 'кита', 'иранск', 'доллар', 'млрд', 'иран', 'сектор', 'нефт', 'компан', 'стран', 'соглашен']
202: ['нов', 'технолог', 'точн', 'земледел', 'интеллектуальн', 'земледел', 'сельск', 'хозяйств', 'цифров', 'систем', 'сельскохозяйствен', 'цифровизац', 'област', 'выставк', 'апк', 'внедрен', 'развит', 'кра']
203: ['эвер', 'гив', 'застря', 'суэцк', 'кана', 'контейнеровоз', 'судн', 'суд', 'канал', 'Ever', 'Given', 'мел', 'контейнер']
204: ['штраф', 'допинг', 'спорт', 'стоп', 'спортсмен', 'закон', 'использован', 'запрещен', 'рубл', 'росс', 'законопроект', 'ответствен', 'уголовн', 'тренер']
205: ['чист', 'энерг', 'применен', 'практик', 'тенденц', 'источник', 'электроэнерг', 'возобновля', 'развит', 'технолог', 'нов', 'стран', 'росс', 'экологическ', 'использован']
206

In [None]:
# Run bm25() over the preprocessed documents for every expanded query (each
# `query` is a (query_id, tokens) pair), then flatten the per-query results
# into one list.
ranked_bm25_LRM = []
for query in tqdm(expanded_queries):
    scores_bm25_LRM = bm25(query, preprocessed_documents)
    ranked_bm25_LRM.append(scores_bm25_LRM)

flat_list_bm25_LRM = list(chain.from_iterable(ranked_bm25_LRM))

  0%|          | 0/76 [00:00<?, ?it/s]

In [None]:
# Score the flattened run against the relevance judgements; the result is used
# below as a metric-name -> value mapping.
performance_bm25_LRM = evaluate(qrels, flat_list_bm25_LRM)

In [None]:
# rbp() yields a mapping (presumably query -> RBP score — TODO confirm); store
# the mean of its values alongside the other metrics under "RBP(rel=1)".
RBP_bm25_LRM = rbp(flat_list_bm25_LRM, qrels)
RBP_1_bm25_LRM = sum(RBP_bm25_LRM.values()) / len(RBP_bm25_LRM)
performance_bm25_LRM["RBP(rel=1)"]=RBP_1_bm25_LRM

In [None]:
# Report every metric for the mu=1, k=10, n=10 run in which expansion terms
# already present in the query are skipped; the header must name that setup.
print("Evaluation Metrics (BM25_LRM mu=1, k=10, n=10, no repeated terms):")
for metric, value in performance_bm25_LRM.items():
    print(f"{metric}: {value}")

Evaluation Metrics (BM25_LRM mu=1, k=10, n=10, no repeated terms):
R@1000: 0.9142945515917054
nDCG@20: 0.41888568534717485
R@100: 0.537279110100354
AP: 0.36974535246175905
RBP(rel=1): 0.46577744771991325


The results on all metrics are lower on this configuration than on the one allowing repeated terms in the query.