In [22]:
import os
import json
import pandas as pd
from tqdm import tqdm
from rank_bm25 import BM25Okapi

# -------- CONFIG --------
LANG = "nl"  # or "fr"
TOP_K = 100  # number of highest-scoring documents kept per query
QUERY_PATH = f"../baselines/preprocessed_data/queries_{LANG}_clean.csv"
CORPUS_PATH = f"../baselines/preprocessed_data/corpus_{LANG}_clean.csv"
OUTPUT_SIMPLE = f"ranks/bm25_top{TOP_K}_ranked_results_{LANG}.json"
OUTPUT_DETAILED = f"ranks/bm25_top{TOP_K}_ranked_results_{LANG}_with_scores.jsonl"
# ------------------------

# Both output files live under ranks/, so make sure the directory exists.
os.makedirs("ranks", exist_ok=True)

# Load corpus
df_corpus = pd.read_csv(CORPUS_PATH)
# read_csv turns an empty text field into NaN, and astype(str) alone would
# convert that into the literal token "nan", which would then be indexed and
# could match queries. Replace missing texts with "" so they tokenize to [].
corpus_texts = df_corpus["article"].fillna("").astype(str).tolist()
corpus_ids = df_corpus["id"].astype(str).tolist()
# Whitespace tokenization only.
# NOTE(review): presumably the *_clean.csv texts are already normalized upstream — confirm.
tokenized_corpus = [doc.split() for doc in corpus_texts]

# Index the whole corpus once; every query is scored against this index.
bm25 = BM25Okapi(tokenized_corpus, k1=1.0, b=0.6)

print(f"Loaded {len(corpus_ids)} documents.")

# Load queries
df_queries = pd.read_csv(QUERY_PATH)
# Each query becomes a [id, question, article_ids] triple of strings.
queries = df_queries[["id", "question", "article_ids"]].astype(str).values.tolist()

print(f"Loaded {len(queries)} queries.")

# query id -> ordered list of the TOP_K document ids
ranked_results_simple = {}
# one record per query, including scores, ranks and the relevant ids
ranked_results_detailed = []

for query_id, query_text, gold_csv in tqdm(queries, desc="Processing queries"):
    # Queries are split on whitespace, the same way the corpus was tokenized.
    doc_scores = bm25.get_scores(query_text.split())
    # Ascending argsort reversed -> best score first, then cut to TOP_K.
    top_positions = doc_scores.argsort()[::-1][:TOP_K]

    top_doc_ids = []
    scored_hits = []
    for rank, pos in enumerate(top_positions, start=1):
        doc_id = corpus_ids[pos]
        top_doc_ids.append(doc_id)
        scored_hits.append({
            "doc_id": doc_id,
            "score": float(doc_scores[pos]),
            "rank": rank
        })

    ranked_results_simple[query_id] = top_doc_ids

    # The relevant ids arrive as one comma-separated string per query.
    ranked_results_detailed.append({
        "query_id": query_id,
        "relevant_ids": list(map(str.strip, gold_csv.split(","))),
        "bm25_ranked_list": scored_hits
    })

# Persist the id-only ranking as a single JSON object ...
with open(OUTPUT_SIMPLE, "w", encoding="utf-8") as simple_out:
    json.dump(ranked_results_simple, simple_out, ensure_ascii=False, indent=2)

# ... and the detailed ranking as JSON Lines (one query per line).
with open(OUTPUT_DETAILED, "w", encoding="utf-8") as detailed_out:
    detailed_out.writelines(
        json.dumps(record, ensure_ascii=False) + "\n"
        for record in ranked_results_detailed
    )

Loaded 22417 documents.
Loaded 203 queries.


Processing queries: 100%|██████████| 203/203 [00:03<00:00, 66.30it/s]


In [20]:
import os
import json
from tqdm import tqdm

# -------- CONFIG --------
TOP_K = 100
LANG = "nl"  # or "fr"
GOLD_PATH = f"gold_standard_{LANG}.json"
BM25_PATH = f"ranks/bm25_top{TOP_K}_ranked_results_{LANG}.json"
# ------------------------

# BM25 ranking: the id-only JSON written by the ranking step
# (query id -> ordered list of document ids).
with open(BM25_PATH, encoding="utf-8") as bm25_file:
    bm25_data = json.load(bm25_file)

# Gold standard.
# NOTE(review): presumably query id -> list of relevant document ids — confirm against the file.
with open(GOLD_PATH, encoding="utf-8") as gold_file:
    gold_data = json.load(gold_file)

# Running totals reported in the summary below
total_missing = 0       # relevant articles absent from the ranking, summed over queries
total_queries = 0       # gold queries inspected
queries_with_gaps = 0   # queries that miss at least one relevant article

print(f"Checking for missing relevant docs in top-{TOP_K} ranks...\n")

# Walk the gold queries in numeric id order so the report is easy to scan.
for qid in tqdm(sorted(gold_data.keys(), key=int), desc="Processing"):
    total_queries += 1

    # A query absent from the ranking file counts as having retrieved nothing.
    not_retrieved = set(gold_data[qid]).difference(bm25_data.get(qid, []))
    if not not_retrieved:
        continue

    print(f"Query {qid}: {len(not_retrieved)} missing relevant article(s): {sorted(not_retrieved)}")
    total_missing += len(not_retrieved)
    queries_with_gaps += 1

print("\n--- SUMMARY ---")
print(f"Total queries processed: {total_queries}")
print(f"Queries with missing relevant docs: {queries_with_gaps}")
print(f"Total missing relevant articles: {total_missing}")

Checking for missing relevant docs in top-100 ranks...



Processing: 100%|██████████| 203/203 [00:00<00:00, 101701.35it/s]

Query 4: 1 missing relevant article(s): ['2222']
Query 16: 6 missing relevant article(s): ['2220', '2221', '2222', '2503', '2504', '2505']
Query 17: 5 missing relevant article(s): ['2220', '2221', '2222', '2503', '2504']
Query 25: 2 missing relevant article(s): ['1057', '1077']
Query 26: 1 missing relevant article(s): ['2845']
Query 39: 2 missing relevant article(s): ['16360', '16464']
Query 40: 1 missing relevant article(s): ['16470']
Query 47: 1 missing relevant article(s): ['2844']
Query 54: 3 missing relevant article(s): ['14201', '14202', '14365']
Query 65: 2 missing relevant article(s): ['2134', '6043']
Query 69: 1 missing relevant article(s): ['2127']
Query 72: 3 missing relevant article(s): ['2041', '2119', '2138']
Query 79: 5 missing relevant article(s): ['5146', '5149', '5712', '5713', '5714']
Query 93: 2 missing relevant article(s): ['5851', '5852']
Query 96: 1 missing relevant article(s): ['10041']
Query 101: 6 missing relevant article(s): ['5149', '5150', '5151', '5226', '




In [19]:
import os
import json
from tqdm import tqdm

# --- CONFIG ---
TOP_K = 100
LANG = "nl"  # or "fr"
GOLD_PATH = f"gold_standard_{LANG}.json"
INPUT_PATH = f"ranks/bm25_top{TOP_K}_ranked_results_{LANG}.json"
OUTPUT_PATH = f"ranks/bm25_top{TOP_K}_with_drop_ranks_{LANG}.json"
# --------------

# BM25 ranking: query id -> ordered list of retrieved document ids
with open(INPUT_PATH, encoding="utf-8") as ranks_file:
    bm25_ranks = json.load(ranks_file)

# Gold standard, looked up by query id below.
# NOTE(review): presumably query id -> list of relevant document ids — confirm against the file.
with open(GOLD_PATH, encoding="utf-8") as gold_file:
    gold_relevant = json.load(gold_file)

def _swap_in_missing_gold(ranked_docs, gold_ids):
    """Return the ranking with every missing gold document appended at the end.

    For each gold id that is absent from ``ranked_docs``, one non-gold document
    is dropped, starting from the bottom of the ranking, so the list keeps its
    length whenever enough non-gold documents are available.

    Gold documents that are already ranked are never dropped. Testing only
    against the *missing* gold ids would let a retrieved gold document sitting
    at the tail be removed to make room for another one.

    Args:
        ranked_docs: ordered list of document ids (best first).
        gold_ids: relevant document ids for the same query.

    Returns:
        ``ranked_docs`` itself when nothing is missing, otherwise a new list
        made of the surviving ranked documents followed by the missing gold ids.
    """
    already_ranked = set(ranked_docs)
    missing_gold = [doc_id for doc_id in gold_ids if doc_id not in already_ranked]
    if not missing_gold:
        return ranked_docs

    gold_lookup = set(gold_ids)
    still_to_drop = len(missing_gold)
    kept_reversed = []
    # Walk from the worst rank upwards so the lowest-ranked non-gold docs go first.
    for doc_id in reversed(ranked_docs):
        if still_to_drop > 0 and doc_id not in gold_lookup:
            still_to_drop -= 1
            continue
        kept_reversed.append(doc_id)

    return kept_reversed[::-1] + missing_gold


adjusted_ranks = {}

for query_id in tqdm(bm25_ranks, desc="Adjusting ranks"):
    # Queries without a gold entry are passed through unchanged.
    adjusted_ranks[query_id] = _swap_in_missing_gold(
        bm25_ranks[query_id],
        gold_relevant.get(query_id, []),
    )

# Save output
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    json.dump(adjusted_ranks, f, ensure_ascii=False, indent=2)

print(f"✅ Done. Adjusted ranks saved to: {OUTPUT_PATH}")

Adjusting ranks: 100%|██████████| 203/203 [00:00<00:00, 60886.99it/s]

✅ Done. Adjusted ranks saved to: ranks/bm25_top100_with_drop_ranks_nl.json





In [None]:
import os
import json
import random
from tqdm import tqdm

# -------- CONFIG --------
TOP_K = 100
LANG = "fr"  # or "nl"
GOLD_PATH = f"gold_standard_{LANG}.json"
BM25_PATH = f"ranks/bm25_top{TOP_K}_ranked_results_{LANG}.json"
OUTPUT_PATH = f"hard_negatives/hard_negatives_{LANG}.jsonl"
# ------------------------

# OUTPUT_PATH lives under hard_negatives/, so create the directory up front.
os.makedirs("hard_negatives", exist_ok=True)

# BM25 ranking: query id -> ordered list of retrieved document ids
with open(BM25_PATH, encoding="utf-8") as ranks_file:
    bm25_ranks = json.load(ranks_file)

# Gold standard, looked up by query id below.
# NOTE(review): presumably query id -> list of relevant document ids — confirm against the file.
with open(GOLD_PATH, encoding="utf-8") as gold_file:
    gold_data = json.load(gold_file)

# Fixed seed so the shuffled candidate order (and therefore the output file) is
# identical on every run; a dedicated generator avoids touching the global
# random state.
SEED = 42
rng = random.Random(SEED)

with open(OUTPUT_PATH, "w", encoding="utf-8") as out_f:
    # Numeric id order keeps the output stable and makes the shuffle sequence deterministic.
    for qid in tqdm(sorted(bm25_ranks.keys(), key=int), desc=f"Injecting gold for {LANG.upper()}"):
        candidates = bm25_ranks[qid][:]  # copy: the loaded ranking is left untouched
        gold_ids = gold_data.get(qid, [])
        gold_lookup = set(gold_ids)

        retrieved = set(candidates)
        missing = [doc_id for doc_id in gold_ids if doc_id not in retrieved]

        if missing:
            # Make room for the missing gold docs by dropping the same number of
            # non-gold docs, starting from the bottom of the ranking. Gold docs
            # that BM25 did retrieve are always kept.
            still_to_drop = len(missing)
            kept_reversed = []
            for doc_id in reversed(candidates):
                if still_to_drop > 0 and doc_id not in gold_lookup:
                    still_to_drop -= 1
                    continue
                kept_reversed.append(doc_id)
            candidates = kept_reversed[::-1] + missing

        # Sanity checks: fixed-size candidate list that contains every gold doc.
        assert len(candidates) == TOP_K, f"Query {qid}: {len(candidates)} docs (expected {TOP_K})"
        assert gold_lookup.issubset(candidates), f"Query {qid}: missing relevant doc(s)"

        # Shuffle so the position in the list no longer reveals the BM25 rank
        # or which documents were injected.
        rng.shuffle(candidates)

        out_f.write(json.dumps({
            "query_id": qid,
            "candidate_docs": candidates,
            "relevant_ids": gold_ids
        }, ensure_ascii=False) + "\n")

print(f"Done. Output written to: {OUTPUT_PATH}")

Injecting gold for NL: 100%|██████████| 203/203 [00:00<00:00, 5883.02it/s]

Done. Output written to: hard_negatives/hard_negatives_nl.jsonl



