# Getting started

### CLEF 2025 - CheckThat! Lab  - Task 4 Scientific Web Discourse - Subtask 4b (Scientific Claim Source Retrieval)

This notebook enables participants of subtask 4b to quickly get started. It includes the following:
- Code to upload data, including:
    - code to upload the collection set (CORD-19 academic papers' metadata)
    - code to upload the query set (tweets with implicit references to CORD-19 papers)
- Code to run a baseline retrieval model (BM25)
- Code to evaluate the baseline model

Participants are free to use this notebook and add their own models for the competition.

# 1) Importing data

In [None]:
import numpy as np
import pandas as pd
import tqdm
from tqdm.auto import tqdm

## 1.a) Import the collection set
The collection set contains metadata of CORD-19 academic papers.

The preprocessed and filtered CORD-19 dataset is available on the Gitlab repository here: https://gitlab.com/checkthat_lab/clef2025-checkthat-lab/-/tree/main/task4/subtask_4b

Participants should first download the file and then upload it to their Google Colab session by following the steps below.


In [None]:
# 1) Download the collection set from the Gitlab repository: https://gitlab.com/checkthat_lab/clef2025-checkthat-lab/-/tree/main/task4/subtask_4b
# 2) Drag and drop the downloaded file to the "Files" section (left vertical menu on Colab)
# 3) Modify the path to your local file path
# Location of the pickled collection set; it is read with pd.read_pickle in the next cell.
PATH_COLLECTION_DATA = '../subtask4b_collection_data.pkl' #MODIFY PATH

In [None]:
# Load the collection set into a DataFrame.
# NOTE(review): unpickling can execute arbitrary code -- only load the file obtained
# from the official repository, never a pickle from an untrusted source.
df_collection = pd.read_pickle(PATH_COLLECTION_DATA)

In [None]:
# Summarise the collection frame (columns, dtypes, non-null counts)
df_collection.info()

In [None]:
# Preview the first rows of the collection frame
df_collection.head()

## 1.b) Import the query set

The query set contains tweets with implicit references to academic papers from the collection set.

The preprocessed query set is available on the Gitlab repository here: https://gitlab.com/checkthat_lab/clef2025-checkthat-lab/-/tree/main/task4/subtask_4b

Participants should first download the files and then upload them to their Google Colab session by following the steps below.

In [None]:
# 1) Download the query tweets from the Gitlab repository: https://gitlab.com/checkthat_lab/clef2025-checkthat-lab/-/tree/main/task4/subtask_4b?ref_type=heads
# 2) Drag and drop the downloaded file to the "Files" section (left vertical menu on Colab)
# 3) Modify the path to your local file path
# Locations of the tab-separated train and dev query files; both are read with pd.read_csv in the next cell.
PATH_QUERY_TRAIN_DATA = '../subtask4b_query_tweets_train.tsv' #MODIFY PATH
PATH_QUERY_DEV_DATA = '../subtask4b_query_tweets_dev.tsv' #MODIFY PATH

In [None]:
# Read the train and dev query sets; both files are tab-separated
df_query_train, df_query_dev = (
    pd.read_csv(query_path, sep="\t")
    for query_path in (PATH_QUERY_TRAIN_DATA, PATH_QUERY_DEV_DATA)
)

In [None]:
# Preview the first rows of the training queries
df_query_train.head()

In [None]:
# Summarise the training queries (columns, dtypes, non-null counts)
df_query_train.info()

In [None]:
# Preview the first rows of the dev queries
df_query_dev.head()

In [None]:
# Summarise the dev queries (columns, dtypes, non-null counts)
df_query_dev.info()

# 2) Running the baseline
The following code runs a BM25 baseline.


In [None]:
from rank_bm25 import BM25Okapi

In [None]:
# Build the BM25 index: one "<title> <abstract>" document per paper, tokenised on single spaces.
# cord_uids is kept in the same row order so a document position maps back to its paper id.
corpus = [
    f"{title} {abstract}"
    for title, abstract in zip(df_collection['title'], df_collection['abstract'])
]
cord_uids = df_collection['cord_uid'].tolist()
tokenized_corpus = [document.split(' ') for document in corpus]
bm25 = BM25Okapi(tokenized_corpus)

In [None]:
# Cache of already-scored queries, keyed by (query text, k). It must live outside the
# function: a dict created inside the function body is empty on every call and never hits.
# Re-running this cell clears the cache -- do so whenever `bm25` / `cord_uids` are rebuilt.
text2bm25top = {}


def get_top_cord_uids(query, k=20):
    """Return the ids of the `k` best BM25 matches for `query`, best first.

    Args:
        query: Query text; it is tokenised on single spaces, like the corpus.
        k: Number of candidate ids to return (default 20).

    Returns:
        A new list of up to `k` entries of `cord_uids`, ordered by decreasing BM25 score.

    Relies on the module-level `bm25` index and the `cord_uids` list built alongside it.
    """
    cache_key = (query, k)
    if cache_key not in text2bm25top:
        tokenized_query = query.split(' ')
        doc_scores = bm25.get_scores(tokenized_query)
        # argsort is ascending, so negate the scores to get the highest-scoring documents first
        indices = np.argsort(-doc_scores)[:k]
        text2bm25top[cache_key] = [cord_uids[x] for x in indices]
    # Hand out a copy so a caller mutating its list cannot corrupt the cached ranking
    return list(text2bm25top[cache_key])


In [None]:
# Score every dev tweet with BM25 and store its ranked candidate ids in 'bm25_topk'.
# tqdm.pandas() registers progress_apply so the pass shows a progress bar.
tqdm.pandas()
# The training tweets can be scored the same way:
#df_query_train['bm25_topk'] = df_query_train['tweet_text'].progress_apply(get_top_cord_uids)
df_query_dev['bm25_topk'] = df_query_dev['tweet_text'].progress_apply(get_top_cord_uids)

In [None]:
# Preview the dev queries, now including the 'bm25_topk' column
df_query_dev.head()

# 3) Evaluate retrieved candidates using MRR@k

In [None]:
def _reciprocal_rank(gold, preds, k):
    """Return 1/rank of `gold` within the first `k` entries of `preds`, or 0.0 if absent."""
    # A missing prediction (None, or the float NaN pandas uses for empty cells) is a miss
    if preds is None or isinstance(preds, float):
        return 0.0
    top_k = list(preds[:k])
    if gold not in top_k:
        return 0.0
    return 1.0 / (top_k.index(gold) + 1)


def get_performance_mrr(data, col_gold, col_pred, list_k = [1, 5, 10]):
    """Compute MRR@k of a ranked-prediction column against a gold column.

    Args:
        data: DataFrame with one query per row.
        col_gold: Name of the column holding the single correct id.
        col_pred: Name of the column holding the ranked list of predicted ids.
            Rows without a prediction (None / NaN) score 0.
        list_k: Cut-offs to evaluate.

    Returns:
        Dict mapping each k to the mean reciprocal rank over all rows.

    Side effect: writes the per-row reciprocal rank to `data["in_topx"]`; after the
    call the column holds the values for the last k in `list_k`.
    """
    d_performance = {}
    for k in list_k:
        # Plain comprehension instead of DataFrame.apply(axis=1): also works on an empty frame
        data["in_topx"] = [
            _reciprocal_rank(gold, preds, k)
            for gold, preds in zip(data[col_gold], data[col_pred])
        ]
        d_performance[k] = data["in_topx"].mean()
    return d_performance

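# 4) Re-ranking approach

The BM25 candidates retrieved for each tweet are sent, in batches, to an OpenAI chat model, which is asked to return the `TOP_K_FINAL` most relevant paper IDs per tweet. The re-ranked lists are then evaluated with the same MRR@k function.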

In [None]:
import openai
import json

In [None]:
import os
from getpass import getpass

# Never hard-code the API key in the notebook: read it from the OPENAI_API_KEY
# environment variable, and prompt for it (without echo) only if the variable is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
GPT_MODEL    = "o4-mini-2025-04-16"   # model name passed to the chat-completions call
TOP_K_FINAL       = 10                # number of paper ids requested per claim
BATCH_CLAIMS      = 50                # number of dev tweets sent in one request
MAX_SNIPPET       = 200               # max characters of each candidate text put in the prompt

In [None]:
def _strip_code_fence(text):
    """Remove a surrounding Markdown code fence (``` or ```json) from `text`, if present."""
    text = text.strip()
    if text.startswith("```"):
        # Drop the opening fence line, then the closing fence
        text = text.split("\n", 1)[1] if "\n" in text else ""
        text = text.rstrip()
        if text.endswith("```"):
            text = text[:-3]
    return text.strip()


def _complete_ranking(ranked, candidates):
    """Keep the known, de-duplicated ids of `ranked`, pad with unused `candidates`, cut to TOP_K_FINAL."""
    proposed = ranked if isinstance(ranked, list) else []
    ordered = []
    for uid in list(proposed) + list(candidates):
        # Ids that were never offered as candidates (hallucinations) are dropped
        if uid in candidates and uid not in ordered:
            ordered.append(uid)
    return ordered[:TOP_K_FINAL]


def rerank_batch(batch_rows):
    """Ask the chat model to re-rank the candidate papers of a batch of claims.

    Args:
        batch_rows: Iterable of `(row_index, claim_text, chunk_to_uid)` tuples, where
            `chunk_to_uid` maps each candidate's text to its paper id, in the
            original retrieval order.

    Returns:
        Dict with one entry per input row: `row_index -> list of paper ids`, at most
        TOP_K_FINAL long. The model's order is used for ids that really are
        candidates of that row; the list is then padded with the remaining
        candidates in their original order. Rows the model omitted, or every row when
        the reply is not valid JSON, fall back to the original candidate order.

    Uses the module-level `openai` client and the GPT_MODEL, TOP_K_FINAL and
    MAX_SNIPPET constants. Errors raised by the API call itself are not caught.
    """
    system = {
        "role": "system",
        "content": (
            "You are a scientific IR specialist. "
            f"For each claim, you will get a set of candidate paper excerpts with unique IDs. "
            f"Your job is to select **exactly {TOP_K_FINAL}** excerpts per claim, ranked from most to least relevant. "
            "Do NOT provide any explanations or extra text—only follow the user’s output format."
        )
    }

    parts = []
    for idx, claim, chunk_to_uid in batch_rows:
        # One numbered line per candidate: newlines flattened, text cut to MAX_SNIPPET chars
        listing = "\n".join(
            f"{j+1}. [{uid}] {chunk.replace(chr(10), ' ')[:MAX_SNIPPET]}…"
            for j, (chunk, uid) in enumerate(chunk_to_uid.items())
        )
        parts.append(
            f"---\n"
            f"ROW_INDEX: {idx}\n"
            f"Claim:\n\"{claim}\"\n\n"
            "Candidates:\n" + listing
        )

    user = {
        "role": "user",
        "content": (
            "\n\n".join(parts)
            + "\n\n"
            "Now output **one** JSON object (no commentary). "
            f"Each key must be the ROW_INDEX as a string, and each value an **array of exactly {TOP_K_FINAL}** paper ID strings, "
            "ordered from highest to lowest relevance. "
            # The requested count follows TOP_K_FINAL so the prompt never contradicts itself
            f"If fewer than {TOP_K_FINAL} are clearly relevant, still list {TOP_K_FINAL} IDs by your best judgment. "
            "Example:\n"
            "{\n"
            f"  \"0\": [\"uid1\",\"uid2\", … ,\"uid{TOP_K_FINAL}\"],\n"
            "  \"1\": [\"uidA\",\"uidB\", … ,\"uidJ\"]\n"
            "}\n"
            "Do NOT include any other keys, text, or formatting."
        )
    }

    resp = openai.chat.completions.create(
        model=GPT_MODEL,
        messages=[system, user]
    )
    # The message content can be None (e.g. a refusal); treat that as an empty reply
    out = (resp.choices[0].message.content or "").strip()

    candidates_by_row = {
        idx: list(chunk_to_uid.values())
        for idx, _, chunk_to_uid in batch_rows
    }

    try:
        raw = json.loads(_strip_code_fence(out))
        proposed = {int(k): v for k, v in raw.items()}
    except (ValueError, TypeError, AttributeError) as e:
        # Invalid JSON, a non-object top level, or non-integer keys
        print("JSON parse failed:", e)
        print("GPT output was:", out)
        proposed = {}

    # Only rows that were actually sent are returned; unknown row indices are ignored
    return {
        idx: _complete_ranking(proposed.get(idx), candidates)
        for idx, candidates in candidates_by_row.items()
    }


In [None]:
# Create the output column; rows keep None until the re-ranking loop writes a list for them
df_query_dev["gpt_topk"] = None

In [None]:
# Build the uid -> "title\n\nabstract" lookup once, instead of scanning the whole
# collection frame for every candidate. setdefault keeps the first row seen per uid.
uid_to_text = {}
for uid, title, abstract in zip(
    df_collection["cord_uid"],
    df_collection["title"].fillna(""),
    df_collection["abstract"].fillna(""),
):
    uid_to_text.setdefault(uid, f"{title.strip()}\n\n{abstract.strip()}")

for batch_start in tqdm(range(0, len(df_query_dev), BATCH_CLAIMS)):
    batch_end = min(batch_start + BATCH_CLAIMS, len(df_query_dev))
    batch = []
    for qi in range(batch_start, batch_end):
        row    = df_query_dev.iloc[qi]
        claim  = row["tweet_text"]
        uids   = row["bm25_topk"][:20]

        # build full-text->uid map (candidates missing from the collection are skipped)
        chunk_to_uid = {uid_to_text[uid]: uid for uid in uids if uid in uid_to_text}

        batch.append((qi, claim, chunk_to_uid))

    results = rerank_batch(batch)

    for idx, best_uids in results.items():
        # Ignore row positions that were not part of this batch (e.g. invented by the model)
        if not (batch_start <= idx < batch_end):
            continue
        # idx is a position (it came from iloc), so translate it to the row label before .at
        df_query_dev.at[df_query_dev.index[idx], "gpt_topk"] = best_uids

In [None]:
# Rows for which the re-ranking step produced no list; `mask` is reused in the next cell
mask = df_query_dev['gpt_topk'].isnull()
print(df_query_dev.loc[mask, ['cord_uid','gpt_topk']])

In [None]:
# Fall back to the BM25 ranking for the rows selected by `mask` (no re-ranked list)
df_query_dev.loc[mask, "gpt_topk"] = df_query_dev.loc[mask, "bm25_topk"]

In [None]:
# MRR@k of the re-ranked lists ('gpt_topk') against the gold paper id ('cord_uid')
results_reranked = get_performance_mrr(df_query_dev, 'cord_uid', 'gpt_topk')
print(f"Results on the reranked set: {results_reranked}")

In [None]:
# No visible cell of this notebook creates a 'hybrid_topk' column, so evaluating it
# unconditionally raises KeyError on a fresh run. Score it only when the column exists.
# NOTE(review): this overwrites `results_reranked` from the previous cell.
if 'hybrid_topk' in df_query_dev.columns:
    results_reranked = get_performance_mrr(df_query_dev, 'cord_uid', 'hybrid_topk')
    print(f"Results on the reranked set: {results_reranked}")
else:
    print("Skipping hybrid evaluation: df_query_dev has no 'hybrid_topk' column.")