In [2]:
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm import tqdm

In [3]:
# Remote locations of the CLEF 2025 CheckThat! lab subtask 4b files:
# the document collection (pickle) and the train/dev query tweets (TSV).
PATH_COLLECTION_DATA = 'https://gitlab.com/checkthat_lab/clef2025-checkthat-lab/-/raw/701a0a217286555445870e1005d637ff587c5cee/task4/subtask_4b/subtask4b_collection_data.pkl'
PATH_QUERY_TRAIN_DATA = 'https://gitlab.com/checkthat_lab/clef2025-checkthat-lab/-/raw/main/task4/subtask_4b/subtask4b_query_tweets_train.tsv?inline=false'
PATH_QUERY_DEV_DATA = 'https://gitlab.com/checkthat_lab/clef2025-checkthat-lab/-/raw/main/task4/subtask_4b/subtask4b_query_tweets_dev.tsv?inline=false'

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# NOTE(review): unpickling can execute arbitrary code; only load this file
# from a source you trust.
df_collection = pd.read_pickle(PATH_COLLECTION_DATA)

# Both query files are tab-separated; train is read first, then dev.
df_query_train, df_query_dev = (
    pd.read_csv(query_path, sep='\t')
    for query_path in (PATH_QUERY_TRAIN_DATA, PATH_QUERY_DEV_DATA)
)

# Last expression of the cell: show which device was selected.
device

'cuda'

In [4]:
class EmbeddingWrapper:
    """Encode a collection of texts into one embedding vector per text.

    The tokenizer and model are loaded with ``AutoTokenizer`` / ``AutoModel``
    from ``model_name``. The embedding of a text is the last-layer hidden
    state at token position 0 (typically the [CLS] token for BERT-style
    models).
    """

    def __init__(self, text_list, model_name, device):
        """Store the inputs; nothing is loaded until ``calculate_embeddings``.

        Args:
            text_list: Sequence of strings to encode.
            model_name: Name or path handed to ``from_pretrained``.
            device: Torch device the model and inputs are moved to
                (e.g. ``'cuda'`` or ``'cpu'``).
        """
        self.text_list = text_list
        self.model_name = model_name
        self.device = device
        # Set by calculate_embeddings(); defined here so the attribute always
        # exists and "not computed yet" is distinguishable from a typo.
        self.embeddings = None

    def calculate_embeddings(self, batch_size = 32):
        """Compute the embeddings of ``self.text_list`` batch by batch.

        Texts longer than 512 tokens are truncated by the tokenizer. The
        result is stored on the CPU in ``self.embeddings`` with one row per
        input text.

        Args:
            batch_size: Number of texts encoded per forward pass; must be > 0.

        Returns:
            ``self``, so the call can be chained onto the constructor.

        Raises:
            ValueError: If ``batch_size`` is not positive.
        """
        # Fail fast, before the (slow) tokenizer/model load.
        if batch_size <= 0:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

        # Slicing a plain list always yields a list the tokenizer accepts;
        # other sequences (e.g. a pandas Series) slice into types it rejects.
        texts = self.text_list if isinstance(self.text_list, list) else list(self.text_list)

        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        model = AutoModel.from_pretrained(self.model_name).to(self.device)
        model.eval()

        embeddings = []

        # Inference only: no gradients needed.
        with torch.no_grad():
            for i in tqdm(range(0, len(texts), batch_size), desc="Encoding"):
                batch_texts = texts[i:i+batch_size]
                inputs = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt",
                                   return_token_type_ids=False, max_length=512)
                inputs = {k: v.to(self.device) for k, v in inputs.items()}
                outputs = model(**inputs)
                # Keep only the hidden state of the first token of each text.
                batch_embeddings = outputs.last_hidden_state[:, 0, :]
                # Move to the CPU right away so results do not pile up on the device.
                embeddings.append(batch_embeddings.cpu())

        if embeddings:
            self.embeddings = torch.cat(embeddings, dim=0)
        else:
            # torch.cat() raises on an empty list; return an empty matrix instead.
            # The width falls back to 0 if the config exposes no hidden_size.
            self.embeddings = torch.empty((0, getattr(model.config, "hidden_size", 0)))
        return self
# The tokenizer is loaded here only to obtain the model's separator token.
tokenizer = AutoTokenizer.from_pretrained('allenai/specter2_base')

# One input string per collection document: title, separator token, abstract.
text_batch = [
    tokenizer.sep_token.join((title, abstract))
    for title, abstract in zip(df_collection['title'], df_collection['abstract'])
]

# calculate_embeddings() returns the wrapper itself, so emb_collection ends up
# as the EmbeddingWrapper holding the document embeddings.
emb_collection = EmbeddingWrapper(text_batch, model_name='allenai/specter2_base', device=device)
emb_collection = emb_collection.calculate_embeddings(batch_size=32)

  return self.fget.__get__(instance, owner)()
Encoding: 100%|██████████| 242/242 [03:50<00:00,  1.05it/s]


In [5]:
# Embed the training tweets with the same model name used for the collection.
text_query_train = df_query_train['tweet_text'].tolist()
emb_query_train = EmbeddingWrapper(text_query_train, model_name='allenai/specter2_base', device=device)
emb_query_train = emb_query_train.calculate_embeddings(batch_size=32)

Encoding: 100%|██████████| 402/402 [00:53<00:00,  7.50it/s]


In [6]:
from sklearn.metrics.pairwise import cosine_similarity

def get_performance_mrr(data, col_gold, col_pred, list_k=(1, 5, 10)):
    """Compute the mean reciprocal rank at several cut-offs (MRR@k).

    For each row, the score at cut-off ``k`` is ``1 / rank`` of the gold value
    within the first ``k`` predictions (rank starting at 1), or 0 when the
    gold value is not among them. MRR@k is the mean of these scores over all
    rows.

    The input frame is not modified.

    Args:
        data: DataFrame with one row per query.
        col_gold: Name of the column holding the gold value of each row.
        col_pred: Name of the column holding each row's ranked predictions
            (a sliceable sequence, best first).
        list_k: Iterable of cut-offs to evaluate.

    Returns:
        dict mapping each ``k`` in ``list_k`` to its MRR@k. The value is NaN
        when ``data`` has no rows.
    """
    def _reciprocal_rank(gold, predictions, k):
        # list() so that .index() also works for arrays/tuples of predictions.
        top_k = list(predictions[:k])
        return 1 / (top_k.index(gold) + 1) if gold in top_k else 0

    d_performance = {}
    for k in list_k:
        # Scores are kept in a local Series rather than written back into
        # `data`, so the caller's frame does not gain a scratch column.
        scores = pd.Series(
            [_reciprocal_rank(gold, preds, k)
             for gold, preds in zip(data[col_gold], data[col_pred])],
            dtype=float,
        )
        d_performance[k] = scores.mean()
    return d_performance
    
def get_top_k_cords(emb_query, emb_collection, df_collection, k=30):
    """Return, for every query, the ``cord_uid`` of its k most similar documents.

    Similarity is the cosine similarity between query and document embeddings.

    Args:
        emb_query: 2-D array-like of query embeddings, one row per query.
        emb_collection: 2-D array-like of document embeddings; row ``j`` is
            matched by position to row ``j`` of ``df_collection``.
        df_collection: DataFrame with a ``cord_uid`` column.
        k: Number of documents to return per query. If it exceeds the number
            of documents, all documents are returned.

    Returns:
        List with one entry per query; each entry is the list of ``cord_uid``
        values ordered from most to least similar.
    """
    # Cosine similarity matrix: one row per query, one column per document.
    cos_sim_matrix = cosine_similarity(emb_query, emb_collection)

    # Indices of the k most similar documents per query (descending similarity).
    # A stable sort makes the order of tied documents reproducible.
    top_k_indices = np.argsort(-cos_sim_matrix, axis=1, kind="stable")[:, :k]  # shape: (num_queries, k)

    # Index the id column directly instead of materialising num_queries * k
    # full DataFrame rows just to read one column from them.
    cord_uids = df_collection['cord_uid'].to_numpy()
    return cord_uids[top_k_indices].tolist()

In [7]:
# Store, per training query, the ranked cord_uids of its closest collection documents.
df_query_train['topk_specter_v1'] = get_top_k_cords(
    emb_query=emb_query_train.embeddings,
    emb_collection=emb_collection.embeddings,
    df_collection=df_collection,
)

In [9]:
# MRR of the retrieved rankings against the gold cord_uid (displayed as the cell output).
get_performance_mrr(data=df_query_train, col_gold='cord_uid', col_pred='topk_specter_v1')

{1: 0.403096553333852, 5: 0.4702948727923443, 10: 0.47891286575056896}