In [None]:
from datasets import load_from_disk
from rank_bm25 import BM25Okapi
from tqdm import tqdm
import json
import os
from nltk.tokenize.toktok import ToktokTokenizer

# Initialize tokenizer
tokenizer = ToktokTokenizer()

# Languages to process
languages = ["fr", "nl"]

# Sampling parameters and locations, named once instead of repeated inline.
TOP_K_RANKED = 500        # length of the BM25 ranked list stored per query
NUM_HARD_NEGATIVES = 100  # hard negatives kept per query
CORPUS_PATH = "../data_processing/data/cleaned_corpus_ds/cleaned_corpus"
QUERIES_PATH = "../data_processing/data/cleaned_queries_ds/cleaned_test_queries"
OUTPUT_DIR = "data/bm25_sampling"

# Both datasets are indexed by language code below, so load each from disk once
# instead of once per language.
corpus_by_lang = load_from_disk(CORPUS_PATH)
test_queries_by_lang = load_from_disk(QUERIES_PATH)

os.makedirs(OUTPUT_DIR, exist_ok=True)

for lang in languages:
    print(f"\n🔍 Processing language: {lang.upper()}")

    # Select the cleaned datasets for this language
    corpus = corpus_by_lang[lang]
    test_queries = test_queries_by_lang[lang]

    # Prepare corpus documents and IDs. IDs are stringified so they compare
    # equal to the string IDs parsed from each query's 'article_ids' field.
    corpus_docs = [doc['article'] for doc in corpus]
    corpus_ids = [str(doc['id']) for doc in corpus]

    # Tokenize the lower-cased corpus and build the BM25 index
    tokenized_corpus = [tokenizer.tokenize(doc.lower()) for doc in corpus_docs]
    bm25 = BM25Okapi(tokenized_corpus)

    # Prepare output
    output = []

    for query in tqdm(test_queries):
        query_id = query['id']
        query_text = query['question']

        # 'article_ids' is a comma-separated string. Keep the first occurrence
        # of each non-empty ID in its original order so the serialized
        # "relevant_ids" list is identical from run to run (dumping a set of
        # strings directly yields an arbitrary, run-dependent order).
        relevant_ids_ordered = list(dict.fromkeys(
            part.strip() for part in query['article_ids'].split(",") if part.strip()
        ))
        relevant_ids = set(relevant_ids_ordered)  # for fast membership tests

        tokenized_query = tokenizer.tokenize(query_text.lower())
        bm25_scores = bm25.get_scores(tokenized_query)

        # Rank corpus positions by score, best first. sorted() is stable, so
        # tied scores keep corpus order. Only the retained top-k entries are
        # materialized as dicts.
        top_indices = sorted(
            range(len(bm25_scores)),
            key=lambda idx: bm25_scores[idx],
            reverse=True,
        )[:TOP_K_RANKED]
        scored_docs = [
            {"doc_id": corpus_ids[idx], "score": float(bm25_scores[idx]), "rank": rank}
            for rank, idx in enumerate(top_indices, start=1)
        ]

        # Hard negatives: the highest-ranked documents that are not labelled
        # relevant for this query. The entries are the same dict objects as in
        # scored_docs, so they carry the same score and rank.
        hard_negatives = [
            doc for doc in scored_docs if doc["doc_id"] not in relevant_ids
        ][:NUM_HARD_NEGATIVES]

        output.append({
            "query_id": query_id,
            "query_text": query_text,
            "relevant_ids": relevant_ids_ordered,
            "bm25_ranked_list": scored_docs,
            "hard_negatives": hard_negatives
        })

    # Save JSONL output (one JSON object per query, one per line)
    output_path = f"{OUTPUT_DIR}/bm25_with_scores_and_ranks_{lang}.jsonl"
    with open(output_path, "w", encoding="utf-8") as f:
        for entry in output:
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")

    print(f"BM25 sampling completed for {lang.upper()} and saved to {output_path}")



🔍 Processing language: FR


100%|██████████| 190/190 [00:13<00:00, 14.30it/s]


✅ BM25 sampling completed for FR and saved to data/bm25_sampling/bm25_with_scores_and_ranks_fr.jsonl

🔍 Processing language: NL


100%|██████████| 190/190 [00:12<00:00, 15.08it/s]

✅ BM25 sampling completed for NL and saved to data/bm25_sampling/bm25_with_scores_and_ranks_nl.jsonl





In [11]:
import json

file_path = "data/bm25_sampling/bm25_with_scores_and_ranks_fr.jsonl"

# Each line of the file is one JSON record; collect the size of every
# record's "hard_negatives" list.
with open(file_path, "r", encoding="utf-8") as f:
    lengths = [len(json.loads(record)["hard_negatives"]) for record in f]

# Summary
print(f"Total queries: {len(lengths)}")
print(f"Min hard negatives: {min(lengths)}")
print(f"Max hard negatives: {max(lengths)}")
print(f"Average hard negatives: {sum(lengths)/len(lengths):.2f}")

# Optional: count the queries whose list is shorter than 100
not_full = len([count for count in lengths if count < 100])
print(f"Queries with < 100 hard negatives: {not_full}")

Total queries: 190
Min hard negatives: 100
Max hard negatives: 100
Average hard negatives: 100.00
Queries with < 100 hard negatives: 0


In [12]:
import json

file_path = "data/bm25_sampling/bm25_with_scores_and_ranks_nl.jsonl"

# Each line of the file is one JSON record; collect the size of every
# record's "hard_negatives" list.
with open(file_path, "r", encoding="utf-8") as f:
    lengths = [len(json.loads(record)["hard_negatives"]) for record in f]

# Summary
print(f"Total queries: {len(lengths)}")
print(f"Min hard negatives: {min(lengths)}")
print(f"Max hard negatives: {max(lengths)}")
print(f"Average hard negatives: {sum(lengths)/len(lengths):.2f}")

# Optional: count the queries whose list is shorter than 100
not_full = len([count for count in lengths if count < 100])
print(f"Queries with < 100 hard negatives: {not_full}")

Total queries: 190
Min hard negatives: 100
Max hard negatives: 100
Average hard negatives: 100.00
Queries with < 100 hard negatives: 0


In [13]:
import json

file_path = "data/bm25_sampling/bm25_with_scores_and_ranks_fr.jsonl"

violations = []

with open(file_path, "r", encoding="utf-8") as f:
    for record in f:
        entry = json.loads(record)
        query_id = entry["query_id"]
        gold_ids = {str(rid) for rid in entry["relevant_ids"]}
        negative_ids = {neg["doc_id"] for neg in entry["hard_negatives"]}

        # An ID present in both sets means a relevant article was also
        # stored as a hard negative for this query.
        shared_ids = gold_ids & negative_ids
        if not shared_ids:
            continue
        violations.append({
            "query_id": query_id,
            "overlap_ids": list(shared_ids)
        })

# Report
if not violations:
    print("✅ All hard negative lists are clean. No overlap with relevant IDs.")
else:
    print(f"❌ Violations found in {len(violations)} queries.")
    for v in violations:
        print(f"Query {v['query_id']} has overlap: {v['overlap_ids']}")

✅ All hard negative lists are clean. No overlap with relevant IDs.


In [14]:
import json

file_path = "data/bm25_sampling/bm25_with_scores_and_ranks_nl.jsonl"

violations = []

with open(file_path, "r", encoding="utf-8") as f:
    for record in f:
        entry = json.loads(record)
        query_id = entry["query_id"]
        gold_ids = {str(rid) for rid in entry["relevant_ids"]}
        negative_ids = {neg["doc_id"] for neg in entry["hard_negatives"]}

        # An ID present in both sets means a relevant article was also
        # stored as a hard negative for this query.
        shared_ids = gold_ids & negative_ids
        if not shared_ids:
            continue
        violations.append({
            "query_id": query_id,
            "overlap_ids": list(shared_ids)
        })

# Report
if not violations:
    print("✅ All hard negative lists are clean. No overlap with relevant IDs.")
else:
    print(f"❌ Violations found in {len(violations)} queries.")
    for v in violations:
        print(f"Query {v['query_id']} has overlap: {v['overlap_ids']}")

✅ All hard negative lists are clean. No overlap with relevant IDs.


In [15]:
# Write a new JSONL file holding, per query, the top non-relevant BM25 candidate article IDs

import json
import os

# Source (full BM25 records) and destination (trimmed candidate lists)
input_path = "data/bm25_sampling/bm25_with_scores_and_ranks_fr.jsonl"
output_path = "data/bm25_sampling/candidate_articles_fr.jsonl"

with open(input_path, "r", encoding="utf-8") as infile:
    with open(output_path, "w", encoding="utf-8") as outfile:
        for record in infile:
            entry = json.loads(record)
            query_id = entry["query_id"]
            query_text = entry["query_text"]
            relevant_ids = {str(rid) for rid in entry["relevant_ids"]}

            # Walk the ranked list in order, drop every relevant ID,
            # then keep at most the first 100 of what remains.
            ranked_ids = (doc["doc_id"] for doc in entry["bm25_ranked_list"])
            candidate_ids = [
                doc_id for doc_id in ranked_ids if doc_id not in relevant_ids
            ][:100]

            # One trimmed JSON record per input record
            new_entry = {
                "query_id": query_id,
                "query_text": query_text,
                "candidate_ids": candidate_ids
            }
            outfile.write(json.dumps(new_entry, ensure_ascii=False) + "\n")

print(f"✅ Saved cleaned candidate list to {output_path}")



✅ Saved cleaned candidate list to data/bm25_sampling/candidate_articles_fr.jsonl
