## Building the hard negatives
The following scripts retrieve the top 100 BM25-ranked articles for each query and check whether any relevant (gold) articles are missing from that list. If relevant articles are missing for a query,
the lowest-ranked non-relevant articles are dropped and the ids of the missing relevant articles are injected into the list in their place.
--> Every hard-negative list therefore contains exactly 100 articles: the highest BM25 ranks plus the relevant articles from the gold data.

In [None]:
'''
>>> Script for extracting the top 100 BM25-ranked articles for each query. Results are saved in the folder ranks/.

'''

import os
import json
import pandas as pd
from tqdm import tqdm
from rank_bm25 import BM25Okapi

# ---- Configuration ----
LANG = "nl"  # or "fr"
TOP_K = 100
QUERY_PATH = f"../baselines/preprocessed_data/queries_{LANG}_clean.csv"
CORPUS_PATH = f"../baselines/preprocessed_data/corpus_{LANG}_clean.csv"
OUTPUT_SIMPLE = f"ranks/bm25_top{TOP_K}_ranked_results_{LANG}.json"
OUTPUT_DETAILED = f"ranks/bm25_top{TOP_K}_ranked_results_{LANG}_with_scores.jsonl"
# -----------------------

os.makedirs("ranks", exist_ok=True)

# Corpus: ids and texts are parallel lists in CSV row order, so a row index
# returned by the ranker maps directly to a document id.
df_corpus = pd.read_csv(CORPUS_PATH)
corpus_texts = df_corpus["article"].astype(str).tolist()
corpus_ids = df_corpus["id"].astype(str).tolist()
# Whitespace tokenization only; the "_clean" inputs are expected to be
# preprocessed already.
tokenized_corpus = [text.split() for text in corpus_texts]

bm25 = BM25Okapi(tokenized_corpus, k1=1.0, b=0.6)

print(f"Loaded {len(corpus_ids)} documents.")

# Queries: every column is cast to str, so ids and the comma-separated
# relevant-article field are handled as plain strings below.
df_queries = pd.read_csv(QUERY_PATH)
queries = df_queries[["id", "question", "article_ids"]].astype(str).values.tolist()

print(f"Loaded {len(queries)} queries.")

ranked_results_simple = {}    # query id -> list of the TOP_K best document ids
ranked_results_detailed = []  # one record per query, including scores and ranks

for qid, question, relevant_str in tqdm(queries, desc="Processing queries"):
    scores = bm25.get_scores(question.split())
    # Ascending argsort reversed -> best score first; keep the first TOP_K.
    ranked_indices = scores.argsort()[::-1][:TOP_K]

    ranked_list = []
    for rank, doc_idx in enumerate(ranked_indices, start=1):
        ranked_list.append(
            {
                "doc_id": corpus_ids[doc_idx],
                "score": float(scores[doc_idx]),
                "rank": rank,
            }
        )

    ranked_results_simple[qid] = [entry["doc_id"] for entry in ranked_list]

    relevant_ids = [part.strip() for part in relevant_str.split(",")]
    ranked_results_detailed.append(
        {
            "query_id": qid,
            "relevant_ids": relevant_ids,
            "bm25_ranked_list": ranked_list,
        }
    )

# Compact view: a single JSON object mapping query id -> ranked document ids.
with open(OUTPUT_SIMPLE, "w", encoding="utf-8") as f:
    json.dump(ranked_results_simple, f, ensure_ascii=False, indent=2)

# Detailed view: JSON Lines, one query record per line.
with open(OUTPUT_DETAILED, "w", encoding="utf-8") as f:
    f.writelines(
        json.dumps(entry, ensure_ascii=False) + "\n"
        for entry in ranked_results_detailed
    )

In [None]:
'''
>>> Script for injecting relevant articles into the hard negatives. For each query, the script checks which relevant ids are missing from the top 100 BM25 ranks;
for every missing article, it drops the lowest-ranked non-relevant article and injects the missing id. Finally, it shuffles each list.
Results are saved in the folder hard_negatives/.

'''


import os
import json
import random
from tqdm import tqdm

LANG = "fr"  # or "nl"
TOP_K = 100
SEED = 42  # fixed seed so the shuffled candidate order is reproducible across runs
BM25_PATH = f"ranks/bm25_top{TOP_K}_ranked_results_{LANG}.json"
GOLD_PATH = f"gold_standard_{LANG}.json"
OUTPUT_PATH = f"hard_negatives/hard_negatives_{LANG}.jsonl"


def inject_gold_ids(ranked_ids, gold_ids, top_k):
    """Return a ``top_k``-long candidate list that contains every gold id.

    Gold ids that are absent from ``ranked_ids`` are appended, and the same
    number of lowest-ranked non-gold entries is removed to make room. Gold ids
    that are already ranked are never evicted. The input list is not modified.

    Args:
        ranked_ids: Document ids ordered from best to worst rank.
        gold_ids: Relevant document ids for the query; duplicates are ignored.
        top_k: Required length of the returned list.

    Returns:
        A new list of exactly ``top_k`` ids (order: surviving ranked ids,
        followed by the injected gold ids).

    Raises:
        ValueError: If the resulting list does not have exactly ``top_k``
            entries (e.g. fewer than ``top_k`` ranked ids were supplied, or
            there are not enough non-gold entries to evict).
    """
    # NOTE(review): ids are compared as-is; this assumes the gold file and the
    # BM25 ranks file use the same id type (e.g. both strings) - confirm.
    candidates = list(ranked_ids)
    gold_set = set(gold_ids)
    present = set(candidates)

    # dict.fromkeys de-duplicates while keeping the gold order, so a gold id
    # listed twice is injected (and pays for an eviction) only once.
    missing = [doc_id for doc_id in dict.fromkeys(gold_ids) if doc_id not in present]

    if missing:
        # Walk up from the lowest rank and mark one non-gold entry for removal
        # per missing gold id.
        drop = set()
        for idx in range(len(candidates) - 1, -1, -1):
            if len(drop) == len(missing):
                break
            if candidates[idx] not in gold_set:
                drop.add(idx)
        candidates = [doc_id for idx, doc_id in enumerate(candidates) if idx not in drop]
        candidates += missing

    # Explicit check instead of `assert`, which is skipped under `python -O`.
    if len(candidates) != top_k:
        raise ValueError(f"{len(candidates)} docs (expected {top_k})")
    return candidates


# Guarded so the helper above can be imported without touching the filesystem;
# when run as a notebook cell or script, __name__ is "__main__" and this executes.
if __name__ == "__main__":
    os.makedirs("hard_negatives", exist_ok=True)

    with open(BM25_PATH, encoding="utf-8") as f:
        bm25_ranks = json.load(f)

    with open(GOLD_PATH, encoding="utf-8") as f:
        gold_data = json.load(f)

    # Dedicated generator: reproducible output without altering the global
    # `random` state.
    rng = random.Random(SEED)

    with open(OUTPUT_PATH, "w", encoding="utf-8") as out_f:
        # Query ids are sorted numerically so the output order is stable.
        for qid in tqdm(sorted(bm25_ranks, key=int), desc=f"Injecting gold for {LANG.upper()}"):
            gold_ids = gold_data.get(qid, [])  # queries without gold entries keep pure BM25 ranks

            try:
                top100 = inject_gold_ids(bm25_ranks[qid], gold_ids, TOP_K)
            except ValueError as err:
                raise ValueError(f"Query {qid}: {err}") from err

            # Shuffle so the position in the list does not reveal which ids
            # were injected.
            rng.shuffle(top100)

            out_f.write(json.dumps({
                "query_id": qid,
                "candidate_docs": top100,
                "relevant_ids": gold_ids
            }, ensure_ascii=False) + "\n")