In [6]:
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

In [7]:
# Load data
# Remote locations of the CLEF 2025 CheckThat! subtask 4b files. The collection
# pickle is pinned to a specific commit; the query TSVs are read from `main`.
_REPO_RAW_URL = 'https://gitlab.com/checkthat_lab/clef2025-checkthat-lab/-/raw'
_SUBTASK_DIR = 'task4/subtask_4b'
PATH_COLLECTION_DATA = (
    f'{_REPO_RAW_URL}/701a0a217286555445870e1005d637ff587c5cee/{_SUBTASK_DIR}/subtask4b_collection_data.pkl'
)
PATH_QUERY_TRAIN_DATA = f'{_REPO_RAW_URL}/main/{_SUBTASK_DIR}/subtask4b_query_tweets_train.tsv?inline=false'
PATH_QUERY_DEV_DATA = f'{_REPO_RAW_URL}/main/{_SUBTASK_DIR}/subtask4b_query_tweets_dev.tsv?inline=false'

# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# NOTE(security): unpickling can execute arbitrary code embedded in the file,
# so this URL must only ever point at a trusted source.
df_collection = pd.read_pickle(PATH_COLLECTION_DATA)

# The query files are tab-separated.
df_query_train = pd.read_csv(PATH_QUERY_TRAIN_DATA, sep='\t')
df_query_dev = pd.read_csv(PATH_QUERY_DEV_DATA, sep='\t')
device

'cuda'

In [8]:
# Embedding wrapper
class EmbeddingWrapper:
    """Encode a sequence of texts into one vector per text with a Hugging Face encoder.

    The tokenizer and model named by ``model_name`` are loaded inside
    :meth:`calculate_embeddings`; constructing the wrapper only stores its arguments.

    Attributes:
        text_list: Texts to encode; output rows follow this order.
        model_name: Identifier passed to ``AutoTokenizer.from_pretrained`` and
            ``AutoModel.from_pretrained``.
        device: Device the model and the tokenized inputs are moved to
            (e.g. ``'cuda'`` or ``'cpu'``).
        embeddings: CPU tensor with one row per text. Only available after
            :meth:`calculate_embeddings` has been called.
    """

    # Supported ways of reducing per-token hidden states to a single vector.
    _POOLING_MODES = ("cls", "mean")

    def __init__(self, text_list, model_name, device):
        self.text_list = text_list
        self.model_name = model_name
        self.device = device

    def calculate_embeddings(self, batch_size=32, pooling="cls"):
        """Encode ``text_list`` in batches and store the result in ``self.embeddings``.

        Args:
            batch_size: Number of texts tokenized and encoded per forward pass.
                Must be at least 1.
            pooling: How token states are reduced to one vector per text.
                ``"cls"`` (default) takes the hidden state of the first token.
                ``"mean"`` averages the hidden states of all non-padding tokens,
                weighted by the tokenizer's attention mask.

        Returns:
            ``self``, so the call can be chained after the constructor.

        Raises:
            ValueError: If ``pooling`` is not a supported mode or
                ``batch_size`` is smaller than 1.
        """
        if pooling not in self._POOLING_MODES:
            raise ValueError(
                f"Unknown pooling {pooling!r}; expected one of {self._POOLING_MODES}"
            )
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        # Materialise once so that any sequence (not only a list) can be sliced
        # into plain lists for the tokenizer.
        texts = list(self.text_list)

        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        model = AutoModel.from_pretrained(self.model_name).to(self.device)
        model.eval()

        embeddings = []
        with torch.no_grad():  # inference only: no autograd bookkeeping needed
            for i in tqdm(range(0, len(texts), batch_size), desc=f"Encoding with {self.model_name}"):
                batch_texts = texts[i:i + batch_size]
                inputs = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt",
                                   return_token_type_ids=False, max_length=512)
                inputs = {k: v.to(self.device) for k, v in inputs.items()}
                hidden = model(**inputs).last_hidden_state
                if pooling == "mean":
                    # Zero out padding positions so they do not dilute the average.
                    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
                    batch_embeddings = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
                else:
                    batch_embeddings = hidden[:, 0, :]  # first token ([CLS])
                # Move each batch off the device right away to keep device memory flat.
                embeddings.append(batch_embeddings.cpu())

        if embeddings:
            self.embeddings = torch.cat(embeddings, dim=0)
        else:
            # torch.cat rejects an empty list; return a well-formed empty matrix instead.
            self.embeddings = torch.empty((0, model.config.hidden_size))
        return self

In [9]:
# Top-k retrieval
def get_top_k_cords(emb_query, emb_collection, df_collection, k=30):
    """Return the ``cord_uid`` values of the k most similar documents per query.

    Args:
        emb_query: 2-D array-like of query embeddings, one row per query.
        emb_collection: 2-D array-like of document embeddings, one row per
            document, in the same row order as ``df_collection``.
        df_collection: DataFrame with a ``cord_uid`` column.
        k: Number of documents to keep per query.

    Returns:
        A list with one inner list per query, holding up to ``k`` ``cord_uid``
        values ordered from most to least cosine-similar.
    """
    similarities = cosine_similarity(emb_query, emb_collection)

    # Sorting the negated scores ascending ranks each row by descending similarity.
    ranked_positions = np.argsort(-similarities, axis=1)
    best_positions = ranked_positions[:, :k]

    # Positions index rows of df_collection, so look the ids up positionally.
    uid_by_position = df_collection['cord_uid'].values
    return uid_by_position[best_positions].tolist()

# MRR
def get_performance_mrr(data, col_gold, col_pred, list_k=(1, 5, 10)):
    """Compute mean reciprocal rank (MRR@k) for each cutoff in ``list_k``.

    For every row, the reciprocal rank is ``1 / rank`` of the gold value within
    the first ``k`` predictions, or 0 when it does not appear among them.

    Args:
        data: DataFrame with one row per query.
        col_gold: Name of the column holding the gold identifier.
        col_pred: Name of the column holding the ranked predictions (a sliceable
            sequence per row, best first).
        list_k: Iterable of cutoffs to evaluate.

    Returns:
        Dict mapping each ``k`` to the mean reciprocal rank over all rows.
        The mean is NaN when ``data`` has no rows.

    Side effects:
        Writes a scratch column ``"in_topx"`` to ``data`` holding the per-row
        reciprocal ranks for the last ``k`` evaluated (kept for compatibility
        with existing callers).
    """
    def _reciprocal_rank(gold, preds, k):
        # Slice once, then locate the gold item among the top-k candidates.
        top = list(preds[:k])
        return 1 / (top.index(gold) + 1) if gold in top else 0.0

    d_performance = {}
    for k in list_k:
        scores = [
            _reciprocal_rank(gold, preds, k)
            for gold, preds in zip(data[col_gold], data[col_pred])
        ]
        # Explicit float dtype keeps the column numeric even for an empty frame.
        data["in_topx"] = pd.Series(scores, index=data.index, dtype=float)
        d_performance[k] = data["in_topx"].mean()
    return d_performance

# Models to evaluate: Hugging Face identifiers, each loaded through
# AutoTokenizer / AutoModel.
models_to_test = [
    'allenai/specter2_base',
    'sentence-transformers/all-MiniLM-L6-v2',
    'bert-base-uncased',
    'allenai/scibert_scivocab_uncased',
    'sentence-transformers/msmarco-distilbert-base-v4',
    'intfloat/e5-base-v2',
    'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract',
    'sentence-transformers/all-mpnet-base-v2',
    'jinaai/jina-embeddings-v2-base-en',
    'nlpaueb/legal-bert-base-uncased',
    'sentence-transformers/paraphrase-multilingual-mpnet-base-v2',
    'sentence-transformers/paraphrase-TinyBERT-L6-v2',
    'mixedbread-ai/mxbai-embed-large-v1',
]


def _embed(texts, model_name):
    """Encode ``texts`` with ``model_name`` on the notebook-wide ``device``."""
    return EmbeddingWrapper(texts, model_name=model_name, device=device).calculate_embeddings()


# Evaluation: MRR scores per model, keyed by model name and then by split.
all_results = {}

for model_name in models_to_test:
    print(f"\n========== Evaluating model: {model_name} ==========")

    # Prepare text input: each document is its title and abstract joined by
    # the model's own separator token; queries are the raw tweet texts.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    text_collection = [
        title + tokenizer.sep_token + abstract
        for title, abstract in zip(df_collection['title'], df_collection['abstract'])
    ]
    text_query_train = df_query_train['tweet_text'].tolist()
    text_query_dev = df_query_dev['tweet_text'].tolist()

    # Embeddings for the collection and both query splits
    emb_collection = _embed(text_collection, model_name)
    emb_query_train = _embed(text_query_train, model_name)
    emb_query_dev = _embed(text_query_dev, model_name)

    # Retrieval: store each model's ranking in its own column of the query frames
    topk_column = f'topk_{model_name}'
    df_query_train[topk_column] = get_top_k_cords(
        emb_query_train.embeddings, emb_collection.embeddings, df_collection
    )
    df_query_dev[topk_column] = get_top_k_cords(
        emb_query_dev.embeddings, emb_collection.embeddings, df_collection
    )

    # Evaluation against the gold 'cord_uid' of every tweet
    results_train = get_performance_mrr(df_query_train, 'cord_uid', topk_column)
    results_dev = get_performance_mrr(df_query_dev, 'cord_uid', topk_column)
    all_results[model_name] = dict(train=results_train, dev=results_dev)

    # Print scores
    print(f"Train MRR: {results_train}")
    print(f"Dev MRR:   {results_dev}")




AttributeError: module 'torch' has no attribute 'get_default_device'

In [None]:
# Summary: print the stored train/dev MRR scores of every evaluated model.
print("\n======= Summary of All Models =======")
for model in all_results:
    results = all_results[model]
    print(f"\nModel: {model}")
    # The labels are padded so that the two score dicts line up in the output.
    for split_label, split_key in (("Train MRR: ", "train"), ("Dev MRR:   ", "dev")):
        print(f"{split_label}{results[split_key]}")

Results from all runs (MRR@k for k = 1, 5, 10):

Model: allenai/specter2_base
Train MRR: {1: 0.403096553333852, 5: 0.47030135636297626, 10: 0.47891934932120095}
Dev MRR:   {1: 0.4357142857142857, 5: 0.49826190476190474, 10: 0.5065691609977324}

Model: sentence-transformers/all-MiniLM-L6-v2

Train MRR: {1: 0.3298840737570995, 5: 0.3967322804014627, 10: 0.40613950915047936}
Dev MRR:   {1: 0.3435714285714286, 5: 0.4059642857142857, 10: 0.4150019841269841}

Model: bert-base-uncased
Train MRR: {1: 0.0071578619777483855, 5: 0.01069140797219845, 10: 0.011623529310061637}
Dev MRR:   {1: 0.013571428571428571, 5: 0.01776190476190476, 10: 0.018869614512471655}

Model: allenai/scibert_scivocab_uncased
Train MRR: {1: 0.004823776550221738, 5: 0.008070748722736585, 10: 0.00893553354846438}
Dev MRR:   {1: 0.007857142857142858, 5: 0.009488095238095236, 10: 0.009815476190476189}

Model: sentence-transformers/msmarco-distilbert-base-v4
Train MRR: {1: 0.3257605228351358, 5: 0.38241785316009236, 10: 0.39129639303528674}
Dev MRR:   {1: 0.32357142857142857, 5: 0.37922619047619044, 10: 0.38698724489795916}

Model: intfloat/e5-base-v2
Train MRR: {1: 0.07826966466972692, 5: 0.10394590108664643, 10: 0.1088433124747604}
Dev MRR:   {1: 0.1, 5: 0.12107142857142857, 10: 0.1259985827664399}

Model: microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract
Train MRR: {1: 0.008402707539095932, 5: 0.010779584532793901, 10: 0.011319882085462106}
Dev MRR:   {1: 0.01, 5: 0.012773809523809524, 10: 0.012924603174603172}

Model: sentence-transformers/all-mpnet-base-v2
Train MRR: {1: 0.3861355325604917, 5: 0.4509790191654348, 10: 0.4602069617494032}
Dev MRR:   {1: 0.3964285714285714, 5: 0.4585000000000001, 10: 0.46760289115646253}

Model: jinaai/jina-embeddings-v2-base-en
Train MRR: {1: 0.00023340854275266475, 5: 0.00048108094089577015, 10: 0.0006447139139895695}
Dev MRR:   {1: 0.0, 5: 0.00014285714285714287, 10: 0.00014285714285714287}

Model: nlpaueb/legal-bert-base-uncased
Train MRR: {1: 0.0036567338364584144, 5: 0.005326901631266371, 10: 0.0057552495310217245}
Dev MRR:   {1: 0.0064285714285714285, 5: 0.006904761904761904, 10: 0.00709608843537415}

Model: sentence-transformers/paraphrase-multilingual-mpnet-base-v2
Train MRR: {1: 0.1528047926554112, 5: 0.19350086879846468, 10: 0.2001792861650482}
Dev MRR:   {1: 0.16, 5: 0.20447619047619048, 10: 0.2115909863945578}

Model: sentence-transformers/paraphrase-TinyBERT-L6-v2
Train MRR: {1: 0.18135843771882051, 5: 0.22694571954666876, 10: 0.23494391402661846}
Dev MRR:   {1: 0.19857142857142857, 5: 0.24644047619047618, 10: 0.25355243764172336}

Model: mixedbread-ai/mxbai-embed-large-v1
Train MRR: {1: 0.529837392048549, 5: 0.5973222853289764, 10: 0.6045375732180369}
Dev MRR:   {1: 0.5471428571428572, 5: 0.6070238095238095, 10: 0.6137389455782313}


The best-performing model was:

mixedbread-ai/mxbai-embed-large-v1 (dev MRR@5 = 0.607)