# Imports

In [2]:
!pip install rank_bm25
!pip install sentence-transformers transformers
!pip install datasets
!pip install accelerate
!pip install transformers


ERROR: ld.so: object '/opt/conda/lib/libmkl_def.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/conda/lib/libmkl_avx2.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/conda/lib/libmkl_core.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/conda/lib/libmkl_intel_lp64.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/conda/lib/libmkl_intel_thread.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/conda/lib/libmkl_def.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/conda/lib/libmkl_avx2.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/conda/lib/libmkl_core.so' from LD_PRE

In [3]:
import os
os.environ["USE_TF"] = "0" # disable TensorFlow to avoid problems later

import numpy as np
import pandas as pd
import torch

from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
from rank_bm25 import BM25Okapi
from sklearn.metrics.pairwise import cosine_similarity

from datasets import Dataset
from sentence_transformers import InputExample, losses, SentenceTransformer, models
from torch.utils.data import DataLoader

# Loading Data
We load the data from the gitlab repository.

In [4]:
PATH_COLLECTION_DATA = 'https://gitlab.com/checkthat_lab/clef2025-checkthat-lab/-/raw/701a0a217286555445870e1005d637ff587c5cee/task4/subtask_4b/subtask4b_collection_data.pkl'
PATH_QUERY_TRAIN_DATA = 'https://gitlab.com/checkthat_lab/clef2025-checkthat-lab/-/raw/main/task4/subtask_4b/subtask4b_query_tweets_train.tsv?inline=false'
PATH_QUERY_DEV_DATA = 'https://gitlab.com/checkthat_lab/clef2025-checkthat-lab/-/raw/main/task4/subtask_4b/subtask4b_query_tweets_dev.tsv?inline=false'

device = 'cuda' if torch.cuda.is_available() else 'cpu'
df_collection = pd.read_pickle(PATH_COLLECTION_DATA)
df_query_train = pd.read_csv(PATH_QUERY_TRAIN_DATA, sep = '\t')
df_query_dev = pd.read_csv(PATH_QUERY_DEV_DATA, sep = '\t')
device

'cuda'

## Baseline "neural" NLP representation learning approach  
### (MXBAI Embed Large v1 without any finetuning or preprocessing)  
As a baseline for this approach we used **MXBAI Embed Large v1**. It is a transformer-based model trained on diverse open-domain text pairs to produce high-quality sentence embeddings. Without any task-specific finetuning, it serves as a strong general-purpose baseline for linking tweets to academic papers.


In [5]:
class EmbeddingWrapper:
    
    def __init__(self, text_list, model_name, device):
        self.text_list = text_list
        self.model_name = model_name
        self.device = device
        
    def calculate_embeddings(self, batch_size = 32):
        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        model = AutoModel.from_pretrained(self.model_name).to(self.device)
        model.eval()
    
        embeddings = []
    
        with torch.no_grad():
            for i in tqdm(range(0, len(self.text_list), batch_size), desc="Encoding"):
                batch_texts = self.text_list[i:i+batch_size]
                inputs = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt",
                                   return_token_type_ids=False, max_length=512)
                inputs = {k: v.to(self.device) for k, v in inputs.items()}
                outputs = model(**inputs)
                batch_embeddings = outputs.last_hidden_state[:, 0, :] 
                embeddings.append(batch_embeddings.cpu())
    
        self.embeddings = torch.cat(embeddings, dim=0)
        return self

### Document Embeddings:

In [5]:
tokenizer =  AutoTokenizer.from_pretrained('mixedbread-ai/mxbai-embed-large-v1')       
text_batch = [title + tokenizer.sep_token + abstract for title, abstract in zip(df_collection['title'], df_collection['abstract'])]
emb_collection = EmbeddingWrapper(text_batch,model_name='mixedbread-ai/mxbai-embed-large-v1', device=device).calculate_embeddings(32)

Encoding:   2%|▏         | 5/242 [00:16<12:51,  3.25s/it]


KeyboardInterrupt: 

### Tweet Embeddings:
We only look at tweet text and keep the order so the ground truth can be matched later.

In [None]:
text_query_train = df_query_train['tweet_text'].to_list()
emb_query_train = EmbeddingWrapper(text_query_train,model_name='mixedbread-ai/mxbai-embed-large-v1', device=device).calculate_embeddings(32)

text_query_dev = df_query_dev['tweet_text'].to_list()
emb_query_dev = EmbeddingWrapper(text_query_dev,model_name='mixedbread-ai/mxbai-embed-large-v1', device=device).calculate_embeddings(32)

In [None]:
print(text_query_dev[0:2])
print(emb_query_dev.embeddings[:2])

In [6]:
def get_performance_mrr(data, col_gold, col_pred, list_k = [1, 5, 10]):
    d_performance = {}
    for k in list_k:
        data["in_topx"] = data.apply(lambda x: (1/([i for i in x[col_pred][:k]].index(x[col_gold]) + 1) if x[col_gold] in [i for i in x[col_pred][:k]] else 0), axis=1)
        #performances.append(data["in_topx"].mean())
        d_performance[k] = data["in_topx"].mean()
    return d_performance
    
def get_top_k_cords(emb_query, emb_collection, df_collection, k=30):
    # compute cosine similarity matrix (for each query consine similarity for each document)
    cos_sim_matrix = cosine_similarity(emb_query, emb_collection)
    # For each query, get the indices of the top-k documents
    top_k_indices = np.argsort(-cos_sim_matrix, axis=1)[:, :k]  # shape: (num_queries, k)

    top_k_cord_uids = df_collection.iloc[top_k_indices.flatten()]['cord_uid'].values.reshape(top_k_indices.shape) # shape: (num_queries_topcords)
    return top_k_cord_uids.tolist()

In [None]:
df_query_train['topk'] = get_top_k_cords(emb_query_train.embeddings, emb_collection.embeddings, df_collection)
df_query_dev['topk'] = get_top_k_cords(emb_query_dev.embeddings, emb_collection.embeddings, df_collection)

In [None]:
results_train = get_performance_mrr(df_query_train, 'cord_uid', 'topk')
results_dev = get_performance_mrr(df_query_dev, 'cord_uid', 'topk')

print(f"Results on the train set: {results_train}")
print(f"Results on the dev set: {results_dev}")

As we can see the model already performs better than the baseline but we would still like to improve the model by doing some finetuning.

## fine-tuning MXBAI Embed Large v1

To make our model perform better we are fine-tuning the model. We use hard positives (like a tweet and the matching paper) and hard negatives (a tweet and a similar paper (bm25) which is not the original paper) to train our model.

### Generating training pairs

In [None]:
# query + title + label
query_texts = df_query_train['tweet_text'].tolist()
true_uids = df_query_train['cord_uid'].tolist()
collection_texts = df_collection['title'].fillna('').tolist()
collection_uids = df_collection['cord_uid'].tolist()

# Tokenize and build index
tokenized_corpus = [doc.split() for doc in collection_texts]
bm25 = BM25Okapi(tokenized_corpus)

positive_pairs = []
negative_pairs = []

for qtext, true_uid in zip(query_texts, true_uids):
    # Positive
    try:
        pos_idx = collection_uids.index(true_uid)
        pos_doc = collection_texts[pos_idx]
        positive_pairs.append((qtext, pos_doc, 1))
    except ValueError:
        continue

    # Hard Negatives from BM25
    scores = bm25.get_scores(qtext.split())
    top_indices = np.argsort(scores)[::-1]
    negs = 0
    for idx in top_indices:
        if collection_uids[idx] != true_uid:
            negative_pairs.append((qtext, collection_texts[idx], 0))
            negs += 1
        if negs == 1:
            break

df_query_title_triples = pd.DataFrame(positive_pairs + negative_pairs, columns=["query", "document", "label"])
df_query_title_triples

In [None]:
df_query_title_triples.to_csv("df_query_title_triples.csv", index=False)

In [7]:
# query + title and abstract + label
collection_texts = (df_collection['title'].fillna('') + ' ' + df_collection['abstract'].fillna('')).tolist()

# The rest stays the same
query_texts = df_query_train['tweet_text'].tolist()
true_uids = df_query_train['cord_uid'].tolist()
collection_uids = df_collection['cord_uid'].tolist()

# Tokenize and build index
tokenized_corpus = [doc.split() for doc in collection_texts]
bm25 = BM25Okapi(tokenized_corpus)

positive_pairs = []
negative_pairs = []

for qtext, true_uid in zip(query_texts, true_uids):
    # Positive
    try:
        pos_idx = collection_uids.index(true_uid)
        pos_doc = collection_texts[pos_idx]
        positive_pairs.append((qtext, pos_doc, 1))
    except ValueError:
        continue

    # Hard Negatives from BM25
    scores = bm25.get_scores(qtext.split())
    top_indices = np.argsort(scores)[::-1]
    negs = 0
    for idx in top_indices:
        if collection_uids[idx] != true_uid:
            negative_pairs.append((qtext, collection_texts[idx], 0))
            negs += 1
        if negs == 1:
            break

df_query_title_abstract_triples = pd.DataFrame(positive_pairs + negative_pairs, columns=["query", "document", "label"])
df_query_title_abstract_triples


Unnamed: 0,query,document,label
0,Oral care in rehabilitation medicine: oral vul...,Oral Management in Rehabilitation Medicine: Or...,1
1,this study isn't receiving sufficient attentio...,Variation in racial/ethnic disparities in COVI...,1
2,"thanks, xi jinping. a reminder that this study...",Effect of non-pharmaceutical interventions for...,1
3,Taiwan - a population of 23 million has had ju...,Potential lessons from the Taiwan and New Zeal...,1
4,Obtaining a diagnosis of autism in lower incom...,Autism screening and conditional cash transfer...,1
...,...,...,...
25701,"""evidence on covid-19 reveals a growing body o...",Airborne transmission of SARS-CoV-2 over dista...,0
25702,Outdoor lighting has detrimental impacts on lo...,Artificial nighttime lighting impacts visual e...,0
25703,"26/ and influenza virus (and other pathogens, ...",Emerging Pandemic Diseases: How We Got To COVI...,0
25704,does it?'sars-cov-2-naïve vaccinees had a 13.0...,SARS-CoV-2 Naturally Acquired Immunity vs. Vac...,0


In [8]:
df_query_title_abstract_triples.to_csv("df_query_title_abstract_triples.csv", index=False)

### Prepare the training data
Convert your df_train_pairs.csv into InputExamples.

In [9]:
!os.kill(os.getpid(), 9)

ERROR: ld.so: object '/opt/conda/lib/libmkl_def.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/conda/lib/libmkl_avx2.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/conda/lib/libmkl_core.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/conda/lib/libmkl_intel_lp64.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/conda/lib/libmkl_intel_thread.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
/bin/bash: -c: line 1: syntax error near unexpected token `os.getpid'
/bin/bash: -c: line 1: `os.kill(os.getpid(), 9)'


In [11]:
df = pd.read_csv("df_query_title_abstract_triples.csv")
#df = df[df['label'] != 0]
#df = df.sample(frac=0.5)

train_examples = [
    InputExample(texts=[row['query'], row['document']], label=float(row['label']))
    for _, row in df.iterrows()
]

In [12]:
model = SentenceTransformer('mixedbread-ai/mxbai-embed-large-v1')

In [13]:
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=8)
train_loss = losses.CosineSimilarityLoss(model)

In [14]:
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=3,
    warmup_steps=100,
    output_path='mxbai-finetuned-tweet2paper',
    use_amp=True  # ← THIS helps reduce memory usage
)

ERROR: ld.so: object '/opt/conda/lib/libmkl_def.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/conda/lib/libmkl_avx2.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/conda/lib/libmkl_core.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/conda/lib/libmkl_intel_lp64.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/conda/lib/libmkl_intel_thread.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/conda/lib/libmkl_def.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/conda/lib/libmkl_avx2.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/conda/lib/libmkl_core.so' from LD_PRE

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.1918
1000,0.1717
1500,0.1651
2000,0.1622
2500,0.1591
3000,0.1541
3500,0.1248
4000,0.1139
4500,0.1133
5000,0.1153


In [15]:
model = SentenceTransformer('mxbai-finetuned-tweet2paper', device=device)

In [16]:
# For document embeddings
text_batch = [title + model.tokenizer.sep_token + abstract for title, abstract in zip(df_collection['title'], df_collection['abstract'])]
emb_collection = model.encode(text_batch, batch_size=32, show_progress_bar=True)

# For query training embeddings
text_finetuned_query_train = df_query_train['tweet_text'].tolist()
emb_finetuned_query_train = model.encode(text_finetuned_query_train, batch_size=32, show_progress_bar=True)

# For query dev embeddings
text_finetuned_query_dev = df_query_dev['tweet_text'].tolist()
emb_finetuned_query_dev = model.encode(text_finetuned_query_dev, batch_size=32, show_progress_bar=True)

Batches:   0%|          | 0/242 [00:00<?, ?it/s]

Batches:   0%|          | 0/402 [00:00<?, ?it/s]

Batches:   0%|          | 0/44 [00:00<?, ?it/s]

In [17]:
df_query_train['topk'] = get_top_k_cords(emb_finetuned_query_train, emb_collection, df_collection)
df_query_dev['topk'] = get_top_k_cords(emb_finetuned_query_dev, emb_collection, df_collection)

In [18]:
results_finetuned_train = get_performance_mrr(df_query_train, 'cord_uid', 'topk')
results_finetuned_dev = get_performance_mrr(df_query_dev, 'cord_uid', 'topk')

print(f"Results on the train set: {results_finetuned_train}")
print(f"Results on the dev set: {results_finetuned_dev}")

Results on the train set: {1: 0.6068622111569283, 5: 0.6703895329235716, 10: 0.6771817523918201}
Results on the dev set: {1: 0.3557142857142857, 5: 0.4024523809523809, 10: 0.4095833333333334}
