In [1]:
import json
import pandas as pd

# -------- CONFIG --------
# Language code interpolated into both input file names below.
# NOTE(review): the previous inline comment offered "nl" as the alternative to
# "nl"; the other supported language code is not visible here — confirm.
LANG = "nl"
# CSV of queries; the checks below read its "id" column.
QUERY_PATH = f"preprocessed_data/queries_{LANG}_clean.csv"
# JSON file whose top-level keys are compared against the query IDs.
BM25_RANK_PATH = f"ranks/bm25_ranked_results_{LANG}.json"
# Number of entries every ranking must have; change this if corpus size is different.
EXPECTED_NUM_DOCS = 22417
# ------------------------

# Read the query table and gather its IDs as strings, so they are comparable
# with the JSON object keys (which are always strings).
df_queries = pd.read_csv(QUERY_PATH)
query_ids = {*df_queries["id"].astype(str)}

# Parse the BM25 rankings file and gather the query IDs it covers.
with open(BM25_RANK_PATH, encoding="utf-8") as rank_file:
    bm25_ranks = json.loads(rank_file.read())

bm25_query_ids = {*bm25_ranks.keys()}

# --- CHECK 1: All queries present ---
# Compare the IDs from the query CSV with the keys of the BM25 output in both
# directions: IDs the ranking is missing, and IDs the CSV does not know about.
missing_qids = query_ids - bm25_query_ids
extra_qids = bm25_query_ids - query_ids

# Both sets hold strings, whose set iteration order can differ between
# interpreter runs; print them sorted so the report is reproducible.
if missing_qids:
    print(f"Missing query IDs in BM25 output: {sorted(missing_qids)}")
else:
    print("All query IDs from CSV are present in BM25 ranks.")

if extra_qids:
    print(f"Extra query IDs in BM25 output not in CSV: {sorted(extra_qids)}")

# --- CHECK 2: Every ranking has the expected size and no repeated IDs ---
# Only the number of entries and their uniqueness are verified here; the
# entries are not compared against an actual list of corpus document IDs.

# (query ID, actual length) for every ranking whose length differs from
# EXPECTED_NUM_DOCS — this covers rankings that are too short AND too long.
incomplete = [
    (qid, len(doc_ids))
    for qid, doc_ids in bm25_ranks.items()
    if len(doc_ids) != EXPECTED_NUM_DOCS
]

# Query IDs whose ranking lists at least one document more than once.
duplicates = [
    qid
    for qid, doc_ids in bm25_ranks.items()
    if len(set(doc_ids)) != len(doc_ids)
]

if not incomplete:
    print(f"All queries have {EXPECTED_NUM_DOCS} ranked documents.")
else:
    print(f"Some queries do not have exactly {EXPECTED_NUM_DOCS} ranked documents:")
    for qid, count in incomplete:
        print(f"   - Query {qid} has {count} documents")

if not duplicates:
    print("No duplicate article IDs in any query ranking.")
else:
    print(f"Duplicate article IDs found in queries: {duplicates}")

All query IDs from CSV are present in BM25 ranks.
All queries have 22417 ranked documents.
No duplicate article IDs in any query ranking.
