# XLM-RoBERTa (EN-RU)
Done by: Baga

## Preprocess data

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd


def _normalize_text(column):
    """Blank out non-word/non-whitespace characters, collapse whitespace runs, lowercase."""
    without_symbols = column.replace(r'[^\w\s]', ' ', regex=True)
    single_spaced = without_symbols.replace(r'\s+', ' ', regex=True)
    return single_spaced.str.lower()


# Read the tab-separated tables and normalise the text column of each one.
documents = pd.read_csv('data/documents_subset.csv', sep='\t')
documents['doc_title'] = _normalize_text(documents['doc_title'])
queries = pd.read_csv('data/queries.csv', sep='\t')
queries['query_text_rus'] = _normalize_text(queries['query_text_rus'])


## Generate embeddings

In [4]:
from transformers import AutoTokenizer, AutoModel
import torch

# Checkpoint identifier shared by the tokenizer and the encoder.
model_name = "DeepPavlov/xlm-roberta-large-en-ru"
# Tokenizer is instantiated first, then the model, both from the same checkpoint.
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_name),
    AutoModel.from_pretrained(model_name),
)


In [5]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


# Helper function to compute sentence embeddings
def get_sentence_embedding(text, tokenizer, model):
    """Return a mean-pooled embedding of ``text`` from the model's last hidden state.

    Args:
        text: Input handed directly to ``tokenizer`` (truncated to 512 tokens,
            padded, returned as PyTorch tensors).
        tokenizer: Callable returning a mapping that contains an
            ``"attention_mask"`` entry; the mapping is unpacked into ``model``
            as keyword arguments.
        model: Callable whose output exposes ``last_hidden_state``.

    Returns:
        The attention-mask-weighted average of ``last_hidden_state`` over dim 1.
        A row whose mask is entirely zero yields zeros instead of NaN.
    """
    tokens = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    # Inference only: no autograd graph is needed for the forward pass.
    with torch.no_grad():
        outputs = model(**tokens)
    embeddings = outputs.last_hidden_state
    # Broadcast the mask to the hidden-state shape so masked positions contribute 0.
    mask = tokens["attention_mask"].unsqueeze(-1).expand(embeddings.size()).float()
    summed = (embeddings * mask).sum(dim=1)
    # Clamp the per-row token count so an all-zero mask cannot produce 0/0 = NaN.
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return summed / token_counts


# Map every query_id to the embedding of its Russian query text.
query_embeddings = {
    row["query_id"]: get_sentence_embedding(row["query_text_rus"], tokenizer, model)
    for _, row in queries.iterrows()
}



In [10]:
# Map every doc_id to the embedding of its title (the title is coerced to str first).
document_embeddings = {
    row["doc_id"]: get_sentence_embedding(str(row["doc_title"]), tokenizer, model)
    for _, row in documents.iterrows()
}


In [20]:
import pickle


# Persist the doc_id -> embedding mapping so it can be reloaded without recomputing.
pickle_file_path = "data/doc-embeddings-xlmr.pkl"
with open(pickle_file_path, mode="wb") as out_file:
    out_file.write(pickle.dumps(document_embeddings))

print(f"Dictionary saved to {pickle_file_path}")


Dictionary saved to data/doc-embeddings-xlmr.pkl


In [6]:
import pickle

# Location of the pickled document embeddings.
file_path = "data/doc-embeddings-xlmr.pkl"

# NOTE: unpickling executes arbitrary code; only load files you produced yourself.
with open(file_path, mode="rb") as in_file:
    document_embeddings = pickle.loads(in_file.read())

# `document_embeddings` now holds the dictionary read back from disk.


In [11]:
# Rank all documents for each query by cosine similarity (highest first).
rankings = {}
for query_id, query_embedding in query_embeddings.items():
    query_vec = query_embedding.numpy()
    scored_docs = [
        (doc_id, cosine_similarity(query_vec, doc_embedding.numpy())[0][0])
        for doc_id, doc_embedding in document_embeddings.items()
    ]
    # Stable sort on the similarity value, best match first.
    rankings[query_id] = sorted(scored_docs, key=lambda pair: pair[1], reverse=True)


# # Print rankings
# for query_id, ranked_docs in rankings.items():
#     print(f"Query ID {query_id}:")
#     for doc_id, score in ranked_docs:
#         print(f"  Document ID: {doc_id}, Similarity Score: {score:.4f}")

In [8]:
# euclidean rankings:
import numpy as np

rankings = {}
for query_id, query_embedding in query_embeddings.items():
    query_vec = query_embedding.numpy()
    # The score is the negated Euclidean distance, so a descending sort puts
    # the closest documents first.
    scored_docs = [
        (doc_id, -np.linalg.norm(query_vec - doc_embedding.numpy()))
        for doc_id, doc_embedding in document_embeddings.items()
    ]
    rankings[query_id] = sorted(scored_docs, key=lambda pair: pair[1], reverse=True)

In [14]:
# Minkowski-distance rankings.
# `p` is the order of the distance; 3 is just an example value, change as needed.
p = 3

rankings = {}
for query_id, query_embedding in query_embeddings.items():
    query_vec = query_embedding.numpy()
    scored_docs = []
    for doc_id, doc_embedding in document_embeddings.items():
        abs_diff = np.abs(query_vec - doc_embedding.numpy())
        distance = np.sum(abs_diff ** p) ** (1 / p)
        # Store the negated distance so larger scores mean closer documents.
        scored_docs.append((doc_id, -distance))
    rankings[query_id] = sorted(scored_docs, key=lambda pair: pair[1], reverse=True)

In [15]:
# One record per (query, document) pair; query ids are stored as strings.
flattened_rankings = [
    {'query_id': str(query_id), 'doc_id': doc_id, 'score': score}
    for query_id, docs in rankings.items()
    for doc_id, score in docs
]

# Build the frame and order it by query, then by descending score within each query.
flattened_rankings_df = pd.DataFrame(flattened_rankings).sort_values(
    by=['query_id', 'score'], ascending=[True, False]
)

# Preview the first ten rows.
flattened_rankings_df.head(10)


Unnamed: 0,query_id,doc_id,score
0,200,b76bf77f-78d4-460b-8d4e-1837bea7f6ef,-0.563927
1,200,fce6a923-8a9d-441e-b565-f79c53dc1baf,-0.57633
2,200,6259ed1e-02d3-438d-b3eb-950d51d95132,-0.600582
3,200,1a49ac53-27fa-45bc-abda-f33927daaf28,-0.606003
4,200,79228ae6-e047-4e1b-bd67-ed6873553834,-0.610377
5,200,a9aa71e8-3049-4017-8523-65490482afa0,-0.612731
6,200,2fb2cba8-5d76-4dc7-8c2e-80dbd2870eb8,-0.614481
7,200,e3fb40d6-e26c-4af9-a5c2-e019d64aed81,-0.614579
8,200,18afa815-690c-4c2c-be68-f0384c5ef8d3,-0.618613
9,200,29fdda1d-adea-44d7-a9dc-16124192c9c0,-0.620795


In [16]:
import ir_measures
from ir_measures import nDCG, MAP, RBP, Recall, Qrel, ScoredDoc


# Read the tab-separated relevance judgements and convert each row to a Qrel.
qrels_pd = pd.read_csv('data/qrels.csv', sep='\t')

qrels = []
for _, judgement in qrels_pd.iterrows():
    qrels.append(
        Qrel(
            query_id=str(judgement['query_id']),
            doc_id=judgement['doc_id'],
            relevance=judgement['relevance_class'],
        )
    )

In [17]:
def evaluate(qrels, result, metrics=None):
    """Aggregate retrieval metrics for a run against relevance judgements.

    Args:
        qrels: Relevance judgements passed through to ``ir_measures.calc_aggregate``.
        result: Table with ``query_id``, ``doc_id`` and ``score`` columns; each row
            becomes one ``ScoredDoc``.
        metrics: Optional list of measures to compute. When omitted, the default
            set is nDCG@20, MAP, RBP(rel=1), Recall@100 and Recall@1000.

    Returns:
        Whatever ``ir_measures.calc_aggregate`` returns for the given measures.
    """
    if metrics is None:
        metrics = [
            nDCG @ 20,      # nDCG@20
            MAP,            # Mean Average Precision
            RBP(rel=1),     # Rank-Biased Precision
            Recall @ 100,   # Recall@100
            Recall @ 1000,  # Recall@1000
        ]

    # Iterate the three columns in lockstep instead of materialising a Series per row.
    runs = [
        ScoredDoc(query_id=query_id, doc_id=doc_id, score=score)
        for query_id, doc_id, score in zip(result['query_id'], result['doc_id'], result['score'])
    ]

    return ir_measures.calc_aggregate(metrics, qrels, runs)

In [18]:
# Score the current rankings and print one "metric: value" line per measure.
performance_tfidf = evaluate(qrels, flattened_rankings_df)
print("Evaluation Metrics ROBERTA:")
for metric in performance_tfidf:
    print(f"{metric}: {performance_tfidf[metric]}")

Evaluation Metrics ROBERTA:
R@100: 0.11560332034582407
RBP(rel=1): 0.173437160259603
nDCG@20: 0.13043452326804456
R@1000: 0.22229372723690713
AP: 0.04244673716818182
