In [None]:
# Getting started

### CLEF 2025 - CheckThat! Lab  - Task 4 Scientific Web Discourse - Subtask 4b (Scientific Claim Source Retrieval)

This notebook enables participants of subtask 4b to quickly get started. It includes the following:
- Code to upload data, including:
    - code to upload the collection set (CORD-19 academic papers' metadata)
    - code to upload the query set (tweets with implicit references to CORD-19 papers)
- Code to fine-tune and run a sentence-embedding retrieval model (Sentence-BERT, used here in place of the BM25 baseline)
- Code to evaluate the retrieval model

Participants are free to use this notebook and add their own models for the competition.

# 1) Importing data

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import torch

In [None]:
# Folder holding the downloaded task files, relative to the notebook's working directory.
DATA_DIR = Path("..")

## 1.a) Import the collection set
The collection set contains metadata of CORD-19 academic papers.

The preprocessed and filtered CORD-19 dataset is available on the Gitlab repository here: https://gitlab.com/checkthat_lab/clef2025-checkthat-lab/-/tree/main/task4/subtask_4b

Participants should first download the file and then upload it to the Google Colab session (or place it in `DATA_DIR` when running locally) by following the steps below.


In [None]:
# 1) Download the collection set from the Gitlab repository: https://gitlab.com/checkthat_lab/clef2025-checkthat-lab/-/tree/main/task4/subtask_4b
# 2) Drag and drop the downloaded file to the "Files" section (left vertical menu on Colab)
# 3) Point the path below at the location of the downloaded file
PATH_COLLECTION_DATA = DATA_DIR.joinpath('subtask4b_collection_data.pkl')  # MODIFY PATH

In [None]:
# Load the collection set (paper metadata) from the pickle file.
# NOTE(review): pd.read_pickle unpickles the file, which can execute arbitrary code;
# only load a collection file obtained from a source you trust.
df_collection = pd.read_pickle(PATH_COLLECTION_DATA)

In [None]:
# Print a summary of the collection frame (columns, dtypes, non-null counts).
df_collection.info()

In [None]:
# Preview the first rows of the collection.
df_collection.head()

## 1.b) Import the query set

The query set contains tweets with implicit references to academic papers from the collection set.

The preprocessed query set is available on the Gitlab repository here: https://gitlab.com/checkthat_lab/clef2025-checkthat-lab/-/tree/main/task4/subtask_4b

Participants should first download the files and then upload them to the Google Colab session (or place them in `DATA_DIR` when running locally) by following the steps below.

In [None]:
# 1) Download the query tweets from the Gitlab repository: https://gitlab.com/checkthat_lab/clef2025-checkthat-lab/-/tree/main/task4/subtask_4b?ref_type=heads
# 2) Drag and drop the downloaded files to the "Files" section (left vertical menu on Colab)
# 3) Point the paths below at the location of the downloaded files
PATH_QUERY_TRAIN_DATA = DATA_DIR.joinpath('subtask4b_query_tweets_train.tsv')  # MODIFY PATH
PATH_QUERY_DEV_DATA = DATA_DIR.joinpath('subtask4b_query_tweets_dev.tsv')  # MODIFY PATH

In [None]:
# Read the tab-separated train and dev query files, in that order.
df_query_train, df_query_dev = (
    pd.read_csv(query_path, sep='\t')
    for query_path in (PATH_QUERY_TRAIN_DATA, PATH_QUERY_DEV_DATA)
)

In [None]:
# Preview the first rows of the training queries.
df_query_train.head()

In [None]:
# Print a summary of the training queries (columns, dtypes, non-null counts).
df_query_train.info()

In [None]:
# Preview the first rows of the dev queries.
df_query_dev.head()

In [None]:
# Print a summary of the dev queries (columns, dtypes, non-null counts).
df_query_dev.info()

In [None]:
# Display the training queries (Jupyter shows a truncated view of large frames).
# NOTE(review): this largely repeats the `df_query_train.head()` preview shown earlier.
df_query_train

# 2) Trying BERT


In [None]:
from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation
from torch.utils.data import DataLoader

In [None]:
# Pretrained sentence-embedding checkpoint to start from; the same name is reused
# further down to build the output and checkpoint paths of the fine-tuning run.
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)

In [None]:
def build_paper_text(df):
    """Concatenate each paper's title and abstract into a single text.

    Missing titles or abstracts are treated as empty strings, and the two
    parts are always separated by one space.

    Args:
        df: Frame with a "title" and an "abstract" column.

    Returns:
        A Series aligned with `df` holding "<title> <abstract>" per row.
    """
    title_part = df["title"].fillna('')
    abstract_part = df["abstract"].fillna('')
    return title_part + " " + abstract_part

In [None]:
# Add one "title abstract" text column per paper; it is merged into the query sets below.
df_collection["paper_text"] = build_paper_text(df_collection)

In [None]:
# Attach the text of the referenced paper to every training tweet.
# Dropping a previously merged `paper_text` first keeps the cell idempotent: without it,
# a second run would produce `paper_text_x` / `paper_text_y` and break `row["paper_text"]`.
# The inner join (pandas default) drops tweets whose `cord_uid` is not in the collection.
df_query_train = (
    df_query_train
    .drop(columns=["paper_text"], errors="ignore")
    .merge(df_collection[["cord_uid", "paper_text"]], on="cord_uid")
)

In [None]:
# One positive (tweet, referenced paper) pair per training query.
train_examples = [
    InputExample(texts=[tweet, paper], label=1)
    for tweet, paper in zip(df_query_train["tweet_text"], df_query_train["paper_text"])
]

In [None]:
# Shuffled mini-batches of 4 (tweet, paper) pairs for fine-tuning.
# NOTE(review): the loss defined in the next cell presumably treats the other pairs in a
# batch as negatives, so the batch size would also set the number of negatives — confirm
# against the sentence-transformers documentation.
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=4)

In [None]:
# Ranking loss applied to the (tweet, paper) pairs during fine-tuning.
train_loss = losses.MultipleNegativesRankingLoss(model=model)

In [None]:
# Attach the text of the referenced paper to every dev tweet.
# The merge must start from the dev queries themselves: starting from `df_query_train`
# would replace the dev set with the training tweets and, because that frame already
# carries `paper_text`, yield `paper_text_x` / `paper_text_y` instead of `paper_text`.
# Dropping any earlier `paper_text` keeps the cell safe to re-run.
df_query_dev = (
    df_query_dev
    .drop(columns=["paper_text"], errors="ignore")
    .merge(df_collection[["cord_uid", "paper_text"]], on="cord_uid")
)

In [None]:
# One positive (tweet, referenced paper) pair per dev query.
dev_examples = [
    InputExample(texts=[tweet, paper], label=1)
    for tweet, paper in zip(df_query_dev["tweet_text"], df_query_dev["paper_text"])
]

In [None]:
# Build an evaluator from the dev pairs.
# NOTE(review): every dev example is created with label=1, so the gold similarity scores are
# constant; a similarity-correlation evaluator is unlikely to be informative here — TODO confirm.
# NOTE(review): `dev_evaluator` is not passed to `model.fit` in this notebook.
dev_evaluator = evaluation.EmbeddingSimilarityEvaluator.from_input_examples(dev_examples, name='dev-eval')

In [None]:
# Fine-tune the model for one epoch on the (tweet, paper) pairs.
# NOTE(review): no `evaluator` is supplied, so `save_best_model=True` presumably has nothing
# to compare against — confirm against the `SentenceTransformer.fit` documentation.
# `model_name` contains a '/', so both paths below resolve to nested directories.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,
    warmup_steps=100,
    #steps_per_epoch=500,
    output_path=f'fine-{model_name}',
    save_best_model=True,
    checkpoint_path=f'checkpoints/fine-tuned-{model_name}',
    show_progress_bar=True,
    optimizer_params={'lr': 2e-5}
)

In [None]:
# Encode every paper ("title abstract") with the fine-tuned model.
# `build_paper_text` fills missing titles/abstracts with '' first; plain
# `title + ' ' + abstract` turns the whole text into NaN whenever either field is missing,
# and a NaN entry cannot be encoded as a sentence.
# NOTE(review): the retrieval cells below use `corpus_embeddings`, which encodes the same
# corpus again — consider keeping only one of the two.
paper_texts = build_paper_text(df_collection)
paper_embeddings = model.encode(paper_texts.tolist(), convert_to_tensor=True)

In [None]:
# --- 1. Create the BERT corpus embeddings ---
# One "title abstract" string per paper, built exactly like the fine-tuning text.
# `build_paper_text` replaces missing fields with '' (formatting them through an f-string
# would embed the literal text "nan" instead).
corpus = build_paper_text(df_collection).tolist()
cord_uids = df_collection['cord_uid'].tolist()

# Encode the corpus (can take time if large — do this once and cache).
# `show_progress_bar` is the keyword documented by `SentenceTransformer.encode`.
corpus_embeddings = model.encode(corpus, convert_to_tensor=True, show_progress_bar=True)

# Optional: store cord_uid → index mapping
uid_to_idx = {uid: i for i, uid in enumerate(cord_uids)}

# --- 2. Encode a tweet and rank papers ---
def retrieve_with_bert(tweet_text, top_k=10):
    """Return the `cord_uid`s of the papers most similar to a tweet, best match first.

    Relies on the module-level `model`, `corpus_embeddings` and `cord_uids`.

    Args:
        tweet_text: Text of the query tweet.
        top_k: Maximum number of papers to return.

    Returns:
        A list of at most `top_k` `cord_uid`s, ordered by decreasing cosine similarity.
    """
    query_embedding = model.encode(tweet_text, convert_to_tensor=True)

    # One cosine-similarity score per corpus entry.
    scores = util.cos_sim(query_embedding, corpus_embeddings)[0]

    # torch.topk raises when k exceeds the number of scores, so cap it at the corpus size.
    k = min(top_k, scores.shape[0])
    top_indices = torch.topk(scores, k=k).indices.cpu().tolist()

    # Translate corpus positions back into paper identifiers.
    return [cord_uids[i] for i in top_indices]


In [None]:
# Cache of already-ranked queries: (tweet text, top_k) -> ranked list of cord_uids.
text2berttop = {}

def get_top_cord_uids_bert(query, top_k=10):
    """Return the `cord_uid`s of the papers most similar to `query`, with caching.

    Relies on the module-level `model`, `corpus_embeddings` and `cord_uids`.
    Results are memoised in `text2berttop`; the cache key includes `top_k` so that
    asking for a different number of results never returns a stale list.

    Args:
        query: Text of the query tweet.
        top_k: Maximum number of papers to return.

    Returns:
        A list of at most `top_k` `cord_uid`s, ordered by decreasing cosine similarity.
    """
    cache_key = (query, top_k)
    if cache_key in text2berttop:
        return text2berttop[cache_key]

    # Single-query encoding: `show_progress_bar` is the documented keyword, and a
    # per-tweet progress bar would only flood the output, so it is switched off.
    query_embedding = model.encode(query, convert_to_tensor=True, show_progress_bar=False)

    # One cosine-similarity score per corpus entry.
    scores = util.cos_sim(query_embedding, corpus_embeddings)[0]

    # torch.topk raises when k exceeds the number of scores, so cap it at the corpus size.
    k = min(top_k, scores.shape[0])
    top_indices = torch.topk(scores, k=k).indices.cpu().tolist()

    # Translate corpus positions back into paper identifiers.
    topk_uids = [cord_uids[i] for i in top_indices]

    text2berttop[cache_key] = topk_uids
    return topk_uids

In [None]:
# --- 3. Apply to your datasets just like BM25 ---
# Rank the corpus for every tweet and store the retrieved cord_uids next to the query.
df_query_train['bert_topk'] = df_query_train['tweet_text'].apply(get_top_cord_uids_bert)
df_query_dev['bert_topk'] = df_query_dev['tweet_text'].apply(get_top_cord_uids_bert)

# 3) Evaluating the retrieval model
The following code evaluates the BERT-based retrieval on the query sets using the Mean Reciprocal Rank score at several cut-offs (MRR@1, MRR@5 and MRR@10).

In [None]:
# Evaluate retrieved candidates using MRR@k
def get_performance_mrr(data, col_gold, col_pred, list_k = [1, 5, 10]):
    """Compute MRR@k of ranked predictions for each cut-off in `list_k`.

    For every row, the score is 1 / rank of the gold id within the first k
    predictions, or 0 when the gold id is not among them.

    Args:
        data: Frame with one query per row. A helper column "in_topx" holding the
            per-row scores of the last evaluated cut-off is written to it.
        col_gold: Name of the column holding the gold id.
        col_pred: Name of the column holding the ranked list of predicted ids.
        list_k: Cut-offs to evaluate.

    Returns:
        Dict mapping each k to the mean reciprocal rank at that cut-off.
    """
    def _reciprocal_rank(row, cutoff):
        candidates = [uid for uid in row[col_pred][:cutoff]]
        if row[col_gold] not in candidates:
            return 0
        return 1 / (candidates.index(row[col_gold]) + 1)

    mrr_by_k = {}
    for cutoff in list_k:
        data["in_topx"] = data.apply(lambda row: _reciprocal_rank(row, cutoff), axis=1)
        mrr_by_k[cutoff] = data["in_topx"].mean()
    return mrr_by_k

In [None]:
# Score the BERT rankings on both query sets at the function's default cut-offs.
results_train = get_performance_mrr(df_query_train, 'cord_uid', 'bert_topk')
results_dev = get_performance_mrr(df_query_dev, 'cord_uid', 'bert_topk')
# Printed MRR@k results in the following format: {k: MRR@k}
print(f"Results on the train set: {results_train}")
print(f"Results on the dev set: {results_dev}")

# 4) Exporting results to prepare the submission on Codalab

In [None]:
# Keep the five best-ranked papers per dev tweet as the submitted predictions.
# The rankings produced in this notebook are stored in `bert_topk`; no `bm25_topk`
# column is created here, so reading it would raise a KeyError.
df_query_dev['preds'] = df_query_dev['bert_topk'].apply(lambda x: x[:5])

In [None]:
# Export the post ids and their predictions as a tab-separated file for submission.
df_query_dev[['post_id', 'preds']].to_csv('predictions_bert.tsv', index=None, sep='\t')