In [98]:
import pandas as pd
import ir_datasets
import pickle
from sentence_transformers import SentenceTransformer, CrossEncoder
from ir_measures import nDCG, MAP, RBP, Recall, Qrel, ScoredDoc, calc_aggregate
from tqdm.notebook import tqdm
import torch
from collections import defaultdict
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import ast
from scipy.spatial.distance import euclidean
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch.nn.functional as F


### Load datasets

In [100]:
# Load dataset
dataset = ir_datasets.load("neuclir/1/multi/trec-2023")
english_queries = [
    (
        query.query_id, 
        query.title, 
        query.description, 
        query.fa_mt_title, 
        query.fa_mt_description, 
        query.ru_mt_title, 
        query.ru_mt_description, 
        query.zh_mt_title, 
        query.zh_mt_description, 
    ) 
    for query in dataset.queries_iter()
]
qrels = [(qrel.query_id, qrel.doc_id, qrel.relevance, qrel.iteration) for qrel in dataset.qrels_iter()]

# Load locally saved embeddings from sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 
with open('data/document_embeddings.pkl', 'rb') as file:
    document_embeddings = pickle.load(file)

with open('data/title_embeddings.pkl', 'rb') as file:
    title_embeddings = pickle.load(file)

with open('data/multi-subset.pkl', 'rb') as file:
    multi_subset = pickle.load(file)

In [101]:
# initialize sentence transformer model
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
# Create query embeddinsg using sentence transformer (same procedure was applied to obtain the priorly loaded embeddings)
query_embeddings = model.encode([query[1] for query in english_queries], convert_to_tensor=True)

# Construct qrel dataframe
qrels_ids = []
document_ids = []
scores = []
iterations = []


for qrel in qrels:
    qrels_ids.append(qrel[0])
    document_ids.append(qrel[1])
    scores.append(qrel[2])
    iterations.append(qrel[3])

df_qrels = pd.DataFrame({
    'id':  qrels_ids, 
    'document_id': document_ids, 
    'score': scores,
    'iteration': iterations
})

# construct documents dataframe
multi_subset_ids = []
multi_subset_title = []
multi_subset_content = []

for x in tqdm(multi_subset):
    multi_subset_ids.append(x[0])
    multi_subset_title.append(x[1])
    multi_subset_content.append(x[2])    

df_documents = pd.DataFrame({
    'id': multi_subset_ids,
    'title': multi_subset_title,
    'content': multi_subset_content,
    'title_embedding': title_embeddings,
    'content_embedding': document_embeddings
})

# Construct query dataset
queries_ids = []
queries_query = []
queries_description = []
queries_fa_mt_title = []
queries_fa_mt_description = []
queries_ru_mt_title = []
queries_ru_mt_description = []
queries_zh_mt_title = []
queries_zh_mt_description = []

for x in english_queries:
    queries_ids.append(x[0])
    queries_query.append(x[1])
    queries_description.append(x[2])
    queries_fa_mt_title.append(x[3])
    queries_fa_mt_description.append(x[4])
    queries_ru_mt_title.append(x[5])
    queries_ru_mt_description.append(x[6])
    queries_zh_mt_title.append(x[7])
    queries_zh_mt_description.append(x[8])
    


df_queries = pd.DataFrame(
    english_queries, 
    columns=[
        'query_id', 
        'title', 
        'description', 
        'fa_mt_title', 
        'fa_mt_description', 
        'ru_mt_title', 
        'ru_mt_description', 
        'zh_mt_title', 
        'zh_mt_description', 
    ]
)

df_queries = pd.DataFrame({
    'id':  queries_ids, 
    'query': queries_query,
    'description': queries_description,
    'fa_mt_title': queries_fa_mt_title,
    'fa_mt_description': queries_fa_mt_description,
    'ru_mt_title': queries_ru_mt_title,
    'ru_mt_description': queries_ru_mt_description,
    'zh_mt_title': queries_zh_mt_title,
    'zh_mt_description': queries_zh_mt_description,
    'query_embedding': list(query_embeddings)
    
})

  0%|          | 0/76913 [00:00<?, ?it/s]

In [110]:
# Join df_qrels and df_documents together via document id
df_merged = pd.merge(df_qrels, df_documents, how='left', left_on='document_id', right_on='id')

df_merged.rename(columns={'title': 'document_title'}, inplace=True)

# Merge joined dataframe together via query id
df_merged = pd.merge(df_merged, df_queries, how='left', left_on='id_x', right_on='id')

# Assign query translation based on document language
def get_mt_title(row):
    if row['iteration'] == 'fas':
        return row['fa_mt_title']
    elif row['iteration'] == 'rus':
        return row['ru_mt_title']
    elif row['iteration'] == 'zho':
        return row['zh_mt_title']
    else:
        return None 
 
df_merged['query_title_translated'] = df_merged.apply(get_mt_title, axis=1)

# Drop columns that are not usefull anymore
df_merged.drop(columns=['fa_mt_title', 'ru_mt_title', 'zh_mt_title', 'fa_mt_description', 'ru_mt_description', 'zh_mt_description', 'id_y', 'id_x'], inplace=True)

# Merge df_qrels with df_documents on document id
final_merged_df = pd.merge(df_merged, df_qrels, on=['id', 'document_id'])
final_merged_df.rename(columns={'score_x': 'score', 'iteration_x': 'iteration'})

final_merged_df.drop(columns=['score_y', 'iteration_y'], inplace=True)

# delete iteration column because it is not used anymore
qrels = [t[:-1] for t in qrels]


In [111]:
# Define evaluation function
def evaluate(qrels, result):
    qrels = [
        Qrel(query_id=query_id, doc_id=doc_id, relevance=relevance)
        for query_id, doc_id, relevance in qrels   
    ]

    runs = [
        ScoredDoc(query_id=query_id, doc_id=doc_id, score=score)
        for query_id, doc_id, score in result
    ]
    scores = calc_aggregate([nDCG@20, MAP, RBP(rel=1), Recall@100, Recall@1000], qrels, runs)

    return scores

In [112]:
# Convert tensors to numpy arrays for computational sparsity
final_merged_df['title_embedding'] = final_merged_df['title_embedding'].apply(lambda x: x.numpy() if hasattr(x, 'numpy') else np.array(x))
final_merged_df['content_embedding'] = final_merged_df['content_embedding'].apply(lambda x: x.numpy() if hasattr(x, 'numpy') else np.array(x))
final_merged_df['query_embedding'] = final_merged_df['query_embedding'].apply(lambda x: x.numpy() if hasattr(x, 'numpy') else np.array(x))

# aggregate title embedding and content embedding  
final_merged_df['mean_title_content_embedding'] = final_merged_df.apply(
    lambda row: np.mean([row['title_embedding'], row['content_embedding']], axis=0),
    axis=1
)

In [113]:
# Define cosine similarity function as similarity function
def calculate_cosine_similarity(row, embedding_type):
    query_vec = np.array(row['query_embedding']).reshape(1, -1)  # Convert list to numpy array and reshape for cosine_similarity
    title_vec = np.array(row[embedding_type]).reshape(1, -1)  
    return cosine_similarity(query_vec, title_vec)[0][0]

# Define euclidean distance as similarity function
def calculate_euclidean_distance(row, embedding_type):
    query_vec = np.array(row['query_embedding'])  
    title_vec = np.array(row[embedding_type])  
    return euclidean(query_vec, title_vec)

# Define dot product as similarity function
def calculate_dot_product(row, embedding_type):
    query_vec = np.array(row['query_embedding']) 
    title_vec = np.array(row[embedding_type])  
    return np.dot(query_vec, title_vec)


# Apply individual similarity measures and save in dataframe for easy access
final_merged_df['title_cosine_sim'] = final_merged_df.apply(calculate_cosine_similarity,  axis=1, embedding_type='title_embedding')
final_merged_df['content_cosine_sim'] = final_merged_df.apply(calculate_cosine_similarity,  axis=1, embedding_type='content_embedding')
final_merged_df['mean_title_content_embedding_cosine_sim'] = final_merged_df.apply(calculate_cosine_similarity,  axis=1, embedding_type='mean_title_content_embedding')

final_merged_df['title_euclidean_sim'] = final_merged_df.apply(calculate_euclidean_distance,  axis=1, embedding_type='title_embedding')
final_merged_df['content_euclidean_sim'] = final_merged_df.apply(calculate_euclidean_distance,  axis=1, embedding_type='content_embedding')
final_merged_df['mean_title_content_embedding_euclidean_sim'] = final_merged_df.apply(calculate_euclidean_distance,  axis=1, embedding_type='mean_title_content_embedding')

final_merged_df['title_dot_product'] = final_merged_df.apply(calculate_dot_product,  axis=1, embedding_type='title_embedding')
final_merged_df['content_dot_product'] = final_merged_df.apply(calculate_dot_product,  axis=1, embedding_type='content_embedding')
final_merged_df['mean_title_content_embedding_dot_product'] = final_merged_df.apply(calculate_dot_product,  axis=1, embedding_type='mean_title_content_embedding')

In [114]:
# Create tuple of each measure and sort descendingly by score
sorted_df_cosine_sim_title = final_merged_df.sort_values(by=['id', 'title_cosine_sim'], ascending=[True, False])
sbert_title_cosine_runs = list(sorted_df_cosine_sim_title[['id', 'document_id', 'title_cosine_sim']].itertuples(index=False, name=None))

sorted_df_cosine_sim_content = final_merged_df.sort_values(by=['id', 'content_cosine_sim'], ascending=[True, False])
sbert_content_cosine_runs = list(sorted_df_cosine_sim_content[['id', 'document_id', 'content_cosine_sim']].itertuples(index=False, name=None))

sorted_df_cosine_sim_title_and_content = final_merged_df.sort_values(by=['id', 'mean_title_content_embedding_cosine_sim'], ascending=[True, False])
sbert_title_and_content_cosine_runs = list(sorted_df_cosine_sim_title_and_content[['id', 'document_id', 'mean_title_content_embedding_cosine_sim']].itertuples(index=False, name=None))

sorted_df_euclidean_sim_title = final_merged_df.sort_values(by=['id', 'title_euclidean_sim'], ascending=[True, False])
sbert_title_euclidean_runs = list(sorted_df_euclidean_sim_title[['id', 'document_id', 'title_euclidean_sim']].itertuples(index=False, name=None))

sorted_df_euclidean_sim_content = final_merged_df.sort_values(by=['id', 'content_euclidean_sim'], ascending=[True, False])
sbert_content_euclidean_runs = list(sorted_df_euclidean_sim_content[['id', 'document_id', 'content_euclidean_sim']].itertuples(index=False, name=None))

sorted_df_euclidean_sim_title_and_content = final_merged_df.sort_values(by=['id', 'mean_title_content_embedding_euclidean_sim'], ascending=[True, False])
sbert_title_and_content_euclidean_runs = list(sorted_df_euclidean_sim_title_and_content[['id', 'document_id', 'mean_title_content_embedding_euclidean_sim']].itertuples(index=False, name=None))

sorted_df_dot_product_title = final_merged_df.sort_values(by=['id', 'title_dot_product'], ascending=[True, True])
sbert_title_dot_product_runs = list(sorted_df_dot_product_title[['id', 'document_id', 'title_dot_product']].itertuples(index=False, name=None))

sorted_df_dot_product_content = final_merged_df.sort_values(by=['id', 'content_dot_product'], ascending=[True, True])
sbert_content_dot_product_runs = list(sorted_df_dot_product_content[['id', 'document_id', 'content_dot_product']].itertuples(index=False, name=None))

sorted_df_dot_product_title_and_content = final_merged_df.sort_values(by=['id', 'mean_title_content_embedding_dot_product'], ascending=[True, True])
sbert_title_and_content_dot_product_runs = list(sorted_df_euclidean_sim_title_and_content[['id', 'document_id', 'mean_title_content_embedding_dot_product']].itertuples(index=False, name=None))


### Evaluation 

In [115]:
evaluate(qrels, sbert_title_cosine_runs)

{R@1000: 0.9289491760379356,
 AP: 0.26605608571641354,
 nDCG@20: 0.28265923222559025,
 RBP(rel=1): 0.41031624199604577,
 R@100: 0.19239881908238096}

In [116]:
evaluate(qrels, sbert_content_cosine_runs)

{R@1000: 0.9370592256561111,
 AP: 0.3071181697275949,
 nDCG@20: 0.3265541608066136,
 RBP(rel=1): 0.4741240482870999,
 R@100: 0.22473586747183774}

In [117]:
evaluate(qrels, sbert_title_and_content_cosine_runs)

{R@1000: 0.9399395199920656,
 AP: 0.29486777446657086,
 nDCG@20: 0.31654892237035054,
 RBP(rel=1): 0.46047940557638006,
 R@100: 0.21728120370757278}

In [118]:
evaluate(qrels, sbert_title_euclidean_runs)

{R@1000: 0.8282944435480063,
 AP: 0.15946156275371742,
 nDCG@20: 0.06722090656628024,
 RBP(rel=1): 0.11884350097582393,
 R@100: 0.07406646992555975}

In [119]:
evaluate(qrels, sbert_content_euclidean_runs)

{R@1000: 0.8132227545869642,
 AP: 0.14759415779477406,
 nDCG@20: 0.048247301288433825,
 RBP(rel=1): 0.08714808947395569,
 R@100: 0.05622804206503338}

In [120]:
evaluate(qrels, sbert_title_and_content_euclidean_runs)

{R@1000: 0.8138366184703791,
 AP: 0.1501234338457982,
 nDCG@20: 0.05126840667138022,
 RBP(rel=1): 0.09045887275342691,
 R@100: 0.054912916481281895}

In [121]:
evaluate(qrels, sbert_title_dot_product_runs)

{R@1000: 0.928911568640813,
 AP: 0.25618383141609175,
 nDCG@20: 0.26355405266151966,
 RBP(rel=1): 0.37382781401889587,
 R@100: 0.18322231526506277}

In [122]:
evaluate(qrels, sbert_content_dot_product_runs)

{R@1000: 0.939474688869609,
 AP: 0.30063341567550045,
 nDCG@20: 0.3067217994326334,
 RBP(rel=1): 0.43431829309903663,
 R@100: 0.220252211939205}

In [123]:
evaluate(qrels, sbert_title_and_content_dot_product_runs)

{R@1000: 0.940527634142903,
 AP: 0.28308758924802957,
 nDCG@20: 0.29465933186119225,
 RBP(rel=1): 0.4146459939783991,
 R@100: 0.20627482589211824}

### Refine Top 100 of each query by using Cross-encoder

In [124]:
# Take ranking of cosine simalrity on title because it performed best
final_merged_df = final_merged_df.sort_values(by=['id', 'title_cosine_sim'], ascending=[True, False])

In [None]:
# Load the pre-trained tokenizer and model
model_name = "amberoad/bert-multilingual-passage-reranking-msmarco"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

query = "What is the capital of France?"
document = "I love me some baguette and cities"

# Tokenize the input (combining query and document)
inputs = tokenizer(query, document, return_tensors="pt", truncation=True, padding=True, max_length=512)

# Forward pass to get the logits (score)
with torch.no_grad():
    outputs = model(**inputs)

logits = outputs.logits    

probs = F.softmax(logits, dim=-1)
probs

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


tensor([[9.9998e-01, 2.3085e-05]])

In [133]:
logits

tensor([[-3.4886,  3.3367]])

In [140]:
# Load Multilingual Cross encoder from https://huggingface.co/amberoad/bert-multilingual-passage-reranking-msmarco
model_name = "amberoad/bert-multilingual-passage-reranking-msmarco"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

updated_rows = []

# Iterate over each unique query_id
for query_id in tqdm(final_merged_df['id'].unique()):
    query_df = final_merged_df[final_merged_df['id'] == query_id]
    
    # Get top 100 rows based on title_cosine_sim
    top_100_df = query_df.nlargest(100, 'title_cosine_sim')
    remaining_df = query_df.drop(top_100_df.index)
    
    # Calculate cross_encoder scores for the top 100 rows
    query_titles = top_100_df['query_title_translated'].tolist()
    document_titles = top_100_df['document_title'].tolist()

    # Tokenize all query-document pairs via batch processing
    inputs = tokenizer(query_titles, document_titles, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Perform inference via batch processing
    with torch.no_grad():
        outputs = model(**inputs)

    # Extract logits and apply softmax to get probabilities
    logits = outputs.logits
    probs = F.softmax(logits, dim=-1)

    # Get relevance score - corresponds to the second value in the tensor
    cross_encoder_scores = probs[:, 1].cpu().numpy()

    # Update title_cosine_sim with cross_encoder scores for top 100 rows
    updated_top_100_df = top_100_df.copy()
    updated_top_100_df['new_title_cosine_sim'] = updated_top_100_df['title_cosine_sim'] + 0.1 * np.array(cross_encoder_scores)
    
    # Add updated rows to list
    updated_rows.append(updated_top_100_df)
    
    # Keep rows outside top 100 unchanged
    remaining_df['new_title_cosine_sim'] = remaining_df['title_cosine_sim']  
    updated_rows.append(remaining_df)

# Concatenate updated rows back into a DataFrame
final_updated_df = pd.concat(updated_rows)

# Sort DataFrame first by 'query_id' and then by 'new_title_cosine_sim' 
final_updated_df = final_updated_df.sort_values(by=['id', 'new_title_cosine_sim'], ascending=[True, False])

# Update original 'title_cosine_sim' column with the new value
final_updated_df['title_cosine_sim'] = final_updated_df['new_title_cosine_sim']

# Drop unnecessary column
final_updated_df = final_updated_df.drop(columns=['new_title_cosine_sim'])

  0%|          | 0/76 [00:00<?, ?it/s]

In [141]:
# Extract qrel data from dataframe and transform into list of tuples
cross_encoder_runs = [(row.id, row.document_id, row.title_cosine_sim) for row in final_updated_df.itertuples(index=False)]
# Evaluate cross-encoder scores 
evaluate(qrels, cross_encoder_runs) 

{R@1000: 0.9289491760379356,
 AP: 0.26924577283993967,
 nDCG@20: 0.3057825092714263,
 RBP(rel=1): 0.42070581734197193,
 R@100: 0.19239881908238096}