In [None]:
import ir_datasets
import torch
import nltk
from collections import defaultdict
from deep_translator import GoogleTranslator
from tqdm.notebook import tqdm
from collections import Counter
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from rank_bm25 import BM25Okapi
import string
from rank_bm25 import BM25Okapi
import ir_measures
from ir_measures import nDCG, MAP, RBP, Recall, Qrel, ScoredDoc
from itertools import chain



### Load documents 

In [2]:
dataset = ir_datasets.load("neuclir/1/ru/trec-2023")
dataset

Dataset(id='neuclir/1/ru/trec-2023', provides=['docs', 'queries', 'qrels'])

In [None]:
russian_documents = [(doc.doc_id, doc.title, doc.text) for doc in tqdm(dataset.docs_iter())]
english_queries = [(query.query_id, query.title) for query in dataset.queries_iter()]
qrels = [(qrel.query_id, qrel.doc_id, qrel.relevance) for qrel in dataset.qrels_iter()]

In [None]:
qrels_ids = {entry[1] for entry in tqdm(qrels)}
russian_documents_subset = [doc for doc in tqdm(russian_documents) if doc[0] in qrels_ids]
len(russian_documents_subset)

  0%|          | 0/25634 [00:00<?, ?it/s]

  0%|          | 0/4627543 [00:00<?, ?it/s]

24871

### Helper functions 

In [132]:
def evaluate(qrels, result):
    qrels = [
        Qrel(query_id=query_id, doc_id=doc_id, relevance=relevance)
        for query_id, doc_id, relevance, iterations in qrels   
    ]

    runs = [
        ScoredDoc(query_id=query_id, doc_id=doc_id, score=score)
        for query_id, doc_id, score in result
    ]
#     scores = ir_measures.calc_aggregate([nDCG@20, MAP, RBP(rel=1), Recall@100, Recall@1000], qrels, runs)
    scores = ir_measures.calc_aggregate([nDCG@20, MAP, Recall@100, Recall@1000], qrels, runs)


    return scores
    
    

def print_document(document_id):
    print(next((doc for doc in russian_documents if doc[0] == document_id), None))


def translate_query(query):
    translated_text = GoogleTranslator(source='auto', target='ru').translate(query[1]) 
    translated_tuple = (query[0], translated_text)

    return translated_tuple

def tokenize(text):

    query_tokens = text[1].split()

    # Define Russian stopwords
    russian_stopwords = set(stopwords.words('russian'))

    translator = str.maketrans('', '', string.punctuation)
    tokens = [token.translate(translator).lower() for token in query_tokens if token.lower() not in russian_stopwords]
    return tokens


def convert_to_score(result, qrels):
    combined_dict = { (qid, docid): relevance for qid, docid, relevance in result}

    correct_matches = 0
    total = 0  

    # Compare qrels with combined_documents
    for qid, docid, true_relevance in qrels:
        if (qid, docid) in combined_dict:
            predicted_relevance = combined_dict[(qid, docid)]
            if predicted_relevance == true_relevance:
                correct_matches += 1
            total += 1  
        
    accuracy = correct_matches / total if total > 0 else 0
    return accuracy

def combine_documents(documents):
    combine_documents =  list(map(lambda doc: (doc[0], doc[1] + doc[2]), documents))
    return combine_documents

def assign_rank_tf_idf(value):
    if value >= 0.1:
        return 3
    elif value >= 0.5:
        return 2
    elif value >= 0.01:
        return 1
    else:
        return 0

def assign_rank(value):
    if value >= 0.2:
        return 3
    elif value >= 0.11:
        return 2
    elif value >= 0.05:
        return 1
    else:
        return 0
    
def assign_rank_bm25(value):
    if value >= 10:
        return 3
    elif value >= 6:
        return 2
    elif value >= 2:
        return 1
    else:
        return 0


In [119]:
def inverted_index(query, documents):
    # Translate the query, assuming this is a function you have defined elsewhere
    translated_query = translate_query(query)

    tokenized_query = tokenize(translated_query)

    # Initialize a dictionary to store documents for each word
    word_to_documents = {}

    # Iterate over each word in the query
    for word in tokenized_query:
        # Check each document for the word
        for doc_id, combined_text in documents:
            # If the word is found in the combined text (case-sensitive check)
            if word in combined_text:
                if word not in word_to_documents:
                    word_to_documents[word] = []
                word_to_documents[word].append(doc_id)

    # Create a dictionary to store the document frequencies (relevance score)
    document_frequency = {doc_id: 0 for doc_id, _ in documents}

    # For each word found in the documents, increment the score of the relevant documents
    for word, docs in word_to_documents.items():
        for doc_id in docs:
            document_frequency[doc_id] += 1  # Increment relevance score for documents that contain the word

    # Sort documents based on relevance (document frequency)
    sorted_documents = sorted(document_frequency.items(), key=lambda x: x[1], reverse=True)

    # Format the final output, giving a max score of 3 if the relevance is above 3, else leave as is
    formatted_docs = [(query[0], doc_id, min(relevance, 3)) for doc_id, relevance in sorted_documents]

    return formatted_docs


In [7]:
def tf_idf(query, documents):
    # Extract the query and documents texts
    query_id, query_text = query
    doc_ids, doc_texts = zip(*documents)  # unzip the document tuples into ids and texts

    # Combine the query text with the document texts for vectorization
    texts = [query_text] + list(doc_texts)

    # Create the TF-IDF Vectorizer
    vectorizer = TfidfVectorizer(stop_words='english')

    # Fit and transform the text data
    tfidf_matrix = vectorizer.fit_transform(texts)

    # Extract the query vector (the first row in the matrix)
    query_vector = tfidf_matrix[0:1]

    # Compute the cosine similarity between the query vector and the document vectors
    cosine_similarities = cosine_similarity(query_vector, tfidf_matrix[1:]).flatten()

    # Pair each document id with its corresponding cosine similarity score
    scored_documents = [(doc_id, similarity) for doc_id, similarity in zip(doc_ids, cosine_similarities)]

    # Sort documents by similarity score in descending order
    sorted_documents = sorted(scored_documents, key=lambda x: x[1], reverse=True)
    

    ranked_data = [(query_id, uuid, assign_rank(value)) for uuid, value in sorted_documents]


    return ranked_data

In [None]:
def bm25(query, documents):
    # query = english_queries[2]

    # Translate the query (assumed to return a tuple with the query number and tokens)
    translated_query = translate_query(query)

    query_number = translated_query[0]  
    query_tokens = tokenize(translated_query) 
    document_ids = [item[0] for item in documents] 

    # Tokenize documents and initialize BM25
    corpus = [tokenize(doc) for doc in documents]
    bm25 = BM25Okapi(corpus)

    # Get BM25 scores for the query tokens
    scores = bm25.get_scores(query_tokens)

    # Combine query_number, document_ids, and scores into the required format
    scored_documents = [(doc_id, score) for doc_id, score in zip(document_ids, scores)]

    sorted_documents = sorted(scored_documents, key=lambda x: x[1], reverse=True)

    ranked_data = [(query_number, uuid, assign_rank_bm25(value)) for uuid, value in sorted_documents]


    return ranked_data

### preprocess queries

In [69]:
combined_documents = combine_documents(russian_documents_subset)


### Calculate inverted index

In [76]:
english_queries[0]

('200', 'Corruption Bribery Sports Federation Olympics')

In [120]:
all_ranked_documents_inverted_index = []
for query in tqdm(english_queries):
    scores_inverted_index = inverted_index(query, combined_documents)
    all_ranked_documents_inverted_index.append(scores_inverted_index)

flat_list_inverted_index = list(chain.from_iterable(all_ranked_documents_inverted_index))



  0%|          | 0/76 [00:00<?, ?it/s]

In [133]:
evaluate(qrels, flat_list_inverted_index)

{R@1000: 0.5265270604239012,
 AP: 0.07992448263972612,
 R@100: 0.19601926707180986,
 nDCG@20: 0.12663033443370092}

In [113]:
print(ir_measures.providers.__dict__)


All Rights Reserved.

Copyright (c) 2000 BeOpen.com.
All Rights Reserved.

Copyright (c) 1995-2001 Corporation for National Research Initiatives.
All Rights Reserved.

Copyright (c) 1991-1995 Stichting Mathematisch Centrum, Amsterdam.
All Rights Reserved., 'credits':     Thanks to CWI, CNRI, BeOpen.com, Zope Corporation and a cast of thousands
    for supporting Python development.  See www.python.org for more information., 'license': Type license() to see the full license text, 'help': Type help() for interactive help, or help(object) for help about object., 'execfile': <function execfile at 0x000001B3138EEE80>, 'runfile': <function runfile at 0x000001B313A842C0>, '__IPYTHON__': True, 'display': <function display at 0x000001B31223B6A0>, '__pybind11_internals_v4_msvc__': <capsule object NULL at 0x000001B323C70FC0>, '__pybind11_internals_v4_mingw_libstdcpp_cxxabi1014__': <capsule object NULL at 0x000001B3392868B0>, 'get_ipython': <bound method InteractiveShell.get_ipython of <ipykernel.

### Calculate TF-IDF

In [136]:
all_ranked_documents_tfidf = []
for query in tqdm(english_queries):
    scores_tfidf = tf_idf(query, combined_documents)
    all_ranked_documents_tfidf.append(scores_tfidf)

flat_list_tfidf = list(chain.from_iterable(all_ranked_documents_tfidf))


  0%|          | 0/76 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
evaluate(qrels, flat_list_tfidf)

NameError: name 'flat_list_tfidf' is not defined

### Calculate BM25

In [137]:
all_ranked_documents_bm25 = []
for query in tqdm(english_queries):
    scores_bm25 = bm25(query, combined_documents)
    all_ranked_documents_bm25.append(scores_bm25)

flat_list_bm25 = list(chain.from_iterable(all_ranked_documents_bm25))


  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/24871 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [140]:
evaluate(qrels, flat_list)

KeyboardInterrupt: 