In [None]:
from datasets import load_from_disk
from rank_bm25 import BM25Okapi
from tqdm import tqdm
import json
import os
from nltk.tokenize.toktok import ToktokTokenizer

# Initialize tokenizer
tokenizer = ToktokTokenizer()

# Languages to process
languages = ["fr", "nl"]

# Sampling configuration (previously inlined as literals in the loop body).
CORPUS_PATH = "../data_processing/data/cleaned_corpus_ds/cleaned_corpus"
TEST_QUERIES_PATH = "../data_processing/data/cleaned_queries_ds/cleaned_test_queries"
OUTPUT_DIR = "data/bm25_sampling"
CANDIDATES_PER_QUERY = 100  # relevant docs + hard negatives targeted per query
RANKED_LIST_SIZE = 500      # number of top BM25 hits kept per query


def _parse_relevant_ids(raw_ids):
    """Split a comma-separated id string into a de-duplicated list of ids.

    Blank fragments are dropped and surrounding whitespace is stripped.
    The first-seen order is preserved so the serialized output is identical
    from run to run (a plain ``set`` would be written in hash order, which
    changes between interpreter sessions for strings).
    """
    return list(dict.fromkeys(part.strip() for part in raw_ids.split(",") if part.strip()))


def _rank_top_k(scores, doc_ids, k):
    """Return the ``k`` best-scoring documents as ``{doc_id, score, rank}`` dicts.

    ``scores[i]`` is the score of ``doc_ids[i]``. Ranks are 1-based. Only the
    document indices are sorted, so dicts are built for the ``k`` survivors
    rather than for the entire corpus. ``sorted`` is stable, so documents with
    equal scores keep their corpus order, exactly as when sorting the dicts.
    """
    best = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:k]
    return [
        {"doc_id": doc_ids[i], "score": float(scores[i]), "rank": position}
        for position, i in enumerate(best, start=1)
    ]


def _select_hard_negatives(ranked_docs, relevant_ids, limit):
    """Return up to ``limit`` top-ranked docs whose id is not in ``relevant_ids``.

    A non-positive ``limit`` yields an empty list; the previous
    append-then-check loop could still emit one negative in that case.
    """
    if limit <= 0:
        return []
    relevant = set(relevant_ids)  # O(1) membership tests
    negatives = []
    for doc in ranked_docs:
        if doc["doc_id"] in relevant:
            continue
        negatives.append(doc)
        if len(negatives) == limit:
            break
    return negatives


# Load each dataset from disk once; the per-language split is selected by key
# inside the loop instead of re-reading the same directory for every language.
corpus_by_lang = load_from_disk(CORPUS_PATH)
test_queries_by_lang = load_from_disk(TEST_QUERIES_PATH)

for lang in languages:
    print(f"\n🔍 Processing language: {lang.upper()}")

    # Select the cleaned datasets for this language
    corpus = corpus_by_lang[lang]
    test_queries = test_queries_by_lang[lang]

    # Prepare corpus documents and IDs (parallel lists, same order as the corpus)
    corpus_docs = [doc['article'] for doc in corpus]
    corpus_ids = [str(doc['id']) for doc in corpus]

    # Tokenize the lower-cased corpus and build the BM25 index
    tokenized_corpus = [tokenizer.tokenize(doc.lower()) for doc in corpus_docs]
    bm25 = BM25Okapi(tokenized_corpus)

    # One record per query
    output = []

    for query in tqdm(test_queries):
        query_id = query['id']
        query_text = query['question']

        relevant_ids = _parse_relevant_ids(query['article_ids'])
        num_relevant = len(relevant_ids)
        num_negatives_needed = CANDIDATES_PER_QUERY - num_relevant

        # Queries get the same lower-casing + tokenization as the corpus
        tokenized_query = tokenizer.tokenize(query_text.lower())
        bm25_scores = bm25.get_scores(tokenized_query)

        scored_docs = _rank_top_k(bm25_scores, corpus_ids, RANKED_LIST_SIZE)

        # Negatives are drawn only from the truncated ranked list, so fewer
        # than `num_negatives_needed` may be returned.
        hard_negatives = _select_hard_negatives(scored_docs, relevant_ids, num_negatives_needed)

        output.append({
            "query_id": query_id,
            "query_text": query_text,
            "relevant_ids": relevant_ids,
            "bm25_ranked_list": scored_docs,
            "hard_negatives": hard_negatives
        })

    # Save JSONL output (one JSON object per line, non-ASCII kept verbatim)
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    output_path = f"{OUTPUT_DIR}/bm25_with_scores_and_ranks_{lang}.jsonl"
    with open(output_path, "w", encoding="utf-8") as f:
        for entry in output:
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")

    print(f"✅ BM25 sampling completed for {lang.upper()} and saved to {output_path}")



🔍 Processing language: FR


100%|██████████| 190/190 [00:12<00:00, 15.12it/s]


✅ BM25 sampling completed for FR and saved to data/bm25_sampling/bm25_with_scores_and_ranks_fr.jsonl

🔍 Processing language: NL


100%|██████████| 190/190 [00:12<00:00, 15.42it/s]

✅ BM25 sampling completed for NL and saved to data/bm25_sampling/bm25_with_scores_and_ranks_nl.jsonl





In [None]:
from datasets import load_from_disk



#corpus_fr = corpus['fr']
#test_fr = test['fr']


### WITH SCORES AND RANKS FOR FRENCH

from datasets import load_from_disk
from rank_bm25 import BM25Okapi
from tqdm import tqdm
import json
import os
from nltk.tokenize.toktok import ToktokTokenizer

# Initialize tokenizer
tokenizer = ToktokTokenizer()

# Sampling parameters for this French-only run.
FR_CANDIDATES_PER_QUERY = 100  # relevant docs + hard negatives targeted per query
FR_RANKED_LIST_SIZE = 200      # number of top BM25 hits kept per query

# Load cleaned corpus and test queries (French as example)
corpus = load_from_disk("../data_processing/data/cleaned_corpus_ds/cleaned_corpus")['fr']
test_queries = load_from_disk("../data_processing/data/cleaned_queries_ds/cleaned_test_queries")['fr']

# Prepare corpus documents and their IDs (parallel lists, same order as the corpus)
corpus_docs = [doc['article'] for doc in corpus]
corpus_ids = [str(doc['id']) for doc in corpus]

# Tokenize the lower-cased corpus with ToktokTokenizer and build the BM25 index
tokenized_corpus = [tokenizer.tokenize(doc.lower()) for doc in corpus_docs]
bm25 = BM25Okapi(tokenized_corpus)

# One record per query
output = []

for query in tqdm(test_queries):

    query_id = query['id']
    query_text = query['question']

    # Relevant article IDs for this query: comma-separated string -> stripped,
    # non-empty ids. Duplicates are dropped (first occurrence wins) so they do
    # not inflate `num_relevant`; a set copy gives O(1) membership tests below.
    relevant_ids = list(dict.fromkeys(
        id_.strip() for id_ in query['article_ids'].split(",") if id_.strip()
    ))
    relevant_lookup = set(relevant_ids)
    num_relevant = len(relevant_ids)

    # How many negatives are needed; never negative, so a query that already
    # has FR_CANDIDATES_PER_QUERY or more relevant ids gets no negatives at all.
    num_negatives_needed = max(0, FR_CANDIDATES_PER_QUERY - num_relevant)

    # Tokenize query text the same way as the corpus
    tokenized_query = tokenizer.tokenize(query_text.lower())

    # BM25 scoring: one score per corpus document, aligned with `corpus_ids`
    bm25_scores = bm25.get_scores(tokenized_query)

    # Sort document indices by score (high to low) and keep the best ones.
    # `sorted` is stable, so equal scores keep their corpus order.
    top_indices = sorted(
        range(len(bm25_scores)), key=lambda idx: bm25_scores[idx], reverse=True
    )[:FR_RANKED_LIST_SIZE]

    # Build the ranked list; ranks are 1-based (rank 1 = highest score)
    scored_docs = [
        {"doc_id": corpus_ids[idx], "score": float(bm25_scores[idx]), "rank": rank}
        for rank, idx in enumerate(top_indices, start=1)
    ]

    # Select negatives: highest-ranked docs that are not relevant. They come
    # only from the truncated ranked list, so fewer than requested may be found.
    hard_negatives = []
    if num_negatives_needed > 0:
        for doc in scored_docs:
            if doc["doc_id"] in relevant_lookup:
                continue
            hard_negatives.append(doc)
            if len(hard_negatives) == num_negatives_needed:
                break

    # Save ranked list + hard negatives
    output.append({
        "query_id": query_id,
        "query_text": query_text,
        "relevant_ids": relevant_ids,
        "bm25_ranked_list": scored_docs,  # top-ranked docs with doc_id, score, rank
        "hard_negatives": hard_negatives  # selected hard negatives only
    })

# Save to JSONL
# NOTE(review): this is the same path the multi-language cell earlier in this
# notebook writes for "fr" (that cell keeps 500 ranked docs per query, this one
# 200). Running both cells in order overwrites that file — confirm which
# cutoff downstream consumers expect.
os.makedirs("data/bm25_sampling", exist_ok=True)
output_path = "data/bm25_sampling/bm25_with_scores_and_ranks_fr.jsonl"

with open(output_path, "w", encoding="utf-8") as f:
    for entry in output:
        f.write(json.dumps(entry, ensure_ascii=False) + "\n")

#print("✅ BM25 sampling complete and saved (with scores and ranks).")



100%|██████████| 190/190 [00:14<00:00, 13.08it/s]
