In [1]:
from torch.utils.data import Dataset
from transformers import AutoTokenizer

class ClaimSourceDataset(Dataset):
    def __init__(self, df, collection_df, tokenizer, max_len=256):
        self.samples = []
        self.tokenizer = tokenizer
        self.max_len = max_len

        paper_lookup = collection_df.set_index('cord_uid')

        for _, row in df.iterrows():
            tweet = row['tweet_text']
            paper_id = row['cord_uid']

            if paper_id not in paper_lookup.index:
                continue

            paper_row = paper_lookup.loc[paper_id]
            paper = f"{paper_row['title']} {paper_row['abstract']}"

            paper_tokenized = tokenizer(
                paper,
                truncation=True,
                padding='max_length',
                max_length=max_len,
                return_tensors="pt",
                return_overflowing_tokens=True
            )

            tweet_tokenized = tokenizer(
                tweet,
                truncation=True,
                padding='max_length',
                max_length=max_len,
                return_tensors="pt",
                return_overflowing_tokens=True
            )

            # Handle multiple overflow chunks (if any) — flat storage
            for i in range(len(paper_tokenized['input_ids'])):
                for j in range(len(tweet_tokenized['input_ids'])):
                    self.samples.append({
                        'tweet_input_ids': tweet_tokenized['input_ids'][j].squeeze(0),
                        'tweet_attention_mask': tweet_tokenized['attention_mask'][j].squeeze(0),
                        'paper_input_ids': paper_tokenized['input_ids'][i].squeeze(0),
                        'paper_attention_mask': paper_tokenized['attention_mask'][i].squeeze(0)
                    })

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]


In [None]:
import torch
import torch.nn as nn
from transformers import AutoModel

class Encoder(nn.Module):
    def __init__(self, model_name="scibert_scivocab_uncased"):
        super().__init__()
        self.encoder = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')


    def forward(self, tweet_ids, tweet_mask, paper_ids, paper_mask):
        tweet_vec = self.encoder(tweet_ids, attention_mask=tweet_mask).last_hidden_state[:, 0]
        paper_vec = self.encoder(paper_ids, attention_mask=paper_mask).last_hidden_state[:, 0]
        return tweet_vec, paper_vec


In [3]:
import torch.nn.functional as F

def contrastive_loss(tweet_vecs, paper_vecs, temperature=0.05):
    tweet_vecs = F.normalize(tweet_vecs, dim=1)
    paper_vecs = F.normalize(paper_vecs, dim=1)

    logits = torch.matmul(tweet_vecs, paper_vecs.T) / temperature
    labels = torch.arange(len(tweet_vecs)).to(tweet_vecs.device)
    return F.cross_entropy(logits, labels)


In [None]:
import numpy as np
import pandas as pd

# prepare data for task

# 1) Download the collection set from the Gitlab repository: https://gitlab.com/checkthat_lab/clef2025-checkthat-lab/-/tree/main/task4/subtask_4b
# 2) Drag and drop the downloaded file to the "Files" section (left vertical menu on Colab)
# 3) Modify the path to your local file path
PATH_COLLECTION_DATA = 'subtask4b_collection_data.pkl' #MODIFY PATH
df_collection = pd.read_pickle(PATH_COLLECTION_DATA)

PATH_QUERY_TRAIN_DATA = 'subtask4b_query_tweets_train.tsv' #MODIFY PATH
PATH_QUERY_DEV_DATA = 'subtask4b_query_tweets_dev.tsv' #MODIFY PATH
PATH_QUERY_DEV_TEST = 'subtask4b_query_tweets_test.tsv' #MODIFY PATH

df_query_train = pd.read_csv(PATH_QUERY_TRAIN_DATA, sep = '\t')
df_query_dev = pd.read_csv(PATH_QUERY_DEV_DATA, sep = '\t')
df_query_test = pd.read_csv(PATH_QUERY_DEV_TEST, sep = '\t')

In [None]:
from torch.utils.data import DataLoader
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased', return_overflowing_tokens=True)
model = Encoder().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

train_dataset = ClaimSourceDataset(df_query_train, df_collection, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=16 , shuffle=False)

In [11]:
for epoch in range(5):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader):
        tweet_ids = batch['tweet_input_ids'].to(device)
        tweet_mask = batch['tweet_attention_mask'].to(device)
        paper_ids = batch['paper_input_ids'].to(device)
        paper_mask = batch['paper_attention_mask'].to(device)

        tweet_vecs, paper_vecs = model(tweet_ids, tweet_mask, paper_ids, paper_mask)

        loss = contrastive_loss(tweet_vecs, paper_vecs)
   
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()
    print(f"Epoch {epoch+1} Loss: {total_loss/len(train_loader):.4f}")


100%|██████████| 1587/1587 [36:48<00:00,  1.39s/it]


Epoch 1 Loss: 1.1701


100%|██████████| 1587/1587 [36:11<00:00,  1.37s/it]


Epoch 2 Loss: 0.9617


100%|██████████| 1587/1587 [35:55<00:00,  1.36s/it]


Epoch 3 Loss: 0.8946


100%|██████████| 1587/1587 [36:01<00:00,  1.36s/it]


Epoch 4 Loss: 0.8783


100%|██████████| 1587/1587 [36:42<00:00,  1.39s/it]

Epoch 5 Loss: 0.8625





In [14]:
import faiss
import numpy as np

def encode_papers(model, df_collection, tokenizer, batch_size=8, max_len=256):
    model.eval()
    paper_texts = df_collection.apply(lambda row: f"{row['title']} {row['abstract']}", axis=1).tolist()
    paper_ids = df_collection['cord_uid'].tolist()

    all_embeddings = []
    all_paper_ids = []

    with torch.no_grad():
        for i in range(0, len(paper_texts), batch_size):
            batch_texts = paper_texts[i:i+batch_size]
            batch_ids = paper_ids[i:i+batch_size]

            encodings = tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                return_overflowing_tokens=True,
                return_tensors="pt",
                max_length=max_len
            )

            input_ids = encodings['input_ids'].to(device)
            attention_mask = encodings['attention_mask'].to(device)
            overflow_map = encodings['overflow_to_sample_mapping']

            vecs = model.encoder(input_ids, attention_mask=attention_mask).last_hidden_state[:, 0].cpu().numpy()

            for j, vec in enumerate(vecs):
                original_idx = overflow_map[j].item()
                all_embeddings.append(vec)
                all_paper_ids.append(batch_ids[original_idx])

    return all_paper_ids, np.vstack(all_embeddings)



In [15]:
paper_ids, paper_embeddings = encode_papers(model, df_collection, tokenizer)

faiss_index = faiss.IndexFlatIP(paper_embeddings.shape[1])
faiss_index.add(paper_embeddings)

paper_id_map = {i: pid for i, pid in enumerate(paper_ids)}


In [16]:
def retrieve(model, df_query_dev, tokenizer, faiss_index, paper_id_map, topk=10):
    model.eval()
    predictions = []

    with torch.no_grad():
        for text in df_query_dev['tweet_text']:
            enc = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
            tweet_vec = model.encoder(enc['input_ids'], attention_mask=enc['attention_mask']).last_hidden_state[:, 0]
            tweet_vec = F.normalize(tweet_vec, dim=1).cpu().numpy()

            D, I = faiss_index.search(tweet_vec, topk)
            
            preds = [paper_id_map[idx] for idx in I[0]]
            predictions.append(preds)

    df_query_dev['dense_topk'] = predictions


In [18]:
# Evaluate retrieved candidates using MRR@k
def get_performance_mrr(data, col_gold, col_pred, list_k = [1, 5, 10]):
    d_performance = {}
    for k in list_k:
        data["in_topx"] = data.apply(lambda x: (1/([i for i in x[col_pred][:k]].index(x[col_gold]) + 1) if x[col_gold] in [i for i in x[col_pred][:k]] else 0), axis=1)
        #performances.append(data["in_topx"].mean())
        d_performance[k] = data["in_topx"].mean()
    return d_performance

# Evaluate
retrieve(model, df_query_dev, tokenizer, faiss_index, paper_id_map)
results_test = get_performance_mrr(df_query_dev, 'cord_uid', 'dense_topk')
print("MRR Results:", results_test)


MRR Results: {1: 0.4492857142857143, 5: 0.5170714285714286, 10: 0.5267610544217688}


In [None]:

df_query_test['preds'] = df_query_test['dense_topk'].apply(lambda x: x[:5])
df_query_test[['post_id', 'preds']].to_csv('predictions_new_scibert_final_context.tsv', index=None, sep='\t')
