## Dutch article cleaning

In [None]:
import re
import os

def clean_article_start(text):
    """Strip leading article markers and numbering from a Dutch article text.

    The function removes one leading marker ("Art.", "Artikel", "BIJLAGE", ...),
    then a leading number with trailing punctuation, and finally cuts the text
    at the first uppercase letter that looks like the start of real content.

    Args:
        text: Raw article text (str).

    Returns:
        The text starting at the first accepted uppercase letter, stripped of
        surrounding whitespace; the stripped text itself if no uppercase
        letter is accepted.
    """
    # removing tokens in the start of the article
    # "Artikel" must be tried before "Art\.?": otherwise "Artikel 5" loses only
    # "Art" and the fragment "ikel 5" stays in the text.
    text = re.sub(r"^(Artikel|Art\.?|ANNEXE|DROIT FUTUR|Antérieurement|Voir note sous TITRE|BIJLAGE|Inbreuk op artikel|Voorheen)\s*", "", text, flags=re.IGNORECASE)

    # removing numbers, dots, non-words
    text = re.sub(r"^\d+[^\w]*\s*", "", text)

    # list of common words in the beginning
    forbidden_words = ["Art", "Artikel", "ANNEXE", "DROIT FUTUR", "Antérieurement", "Voir note sous TITRE", "BIJLAGE", "Inbreuk op artikel", "Voorheen"]

    # Locate every occurrence of a forbidden word once (longest alternatives
    # first so "Artikel" wins over "Art"). A capital is skipped only when it
    # lies inside one of these spans; checking a +-10 character window instead
    # would also reject real sentence starts such as "De partijen ...".
    forbidden_pattern = "|".join(
        re.escape(word) for word in sorted(forbidden_words, key=len, reverse=True)
    )
    forbidden_spans = [
        match.span() for match in re.finditer(forbidden_pattern, text, flags=re.IGNORECASE)
    ]

    # scanning for capital letter (str.isupper also accepts accented capitals)
    for start_index, char in enumerate(text):
        if not char.isupper():
            continue

        # excluding capital letter if inside a common word
        if any(begin <= start_index < end for begin, end in forbidden_spans):
            continue

        # the character right after the capital must be whitespace, "*", "'"
        # or a letter (accented letters included); a capital at the very end
        # of the text has no next character and is rejected
        next_char = text[start_index + 1:start_index + 2]
        if not (next_char.isspace() or next_char in ("*", "'") or next_char.isalpha()):
            continue

        return text[start_index:].strip()

    # If nothing found → return original text
    return text.strip()


# ---> REMOVING LONG ARTICLES BEFORE CLEANING <--- The import, the `with` block and the filter below can be taken out when working with the whole corpus
import json  # json.load is used below; this cell itself only imports re and os

with open("long_article_ids.json", "r", encoding="utf-8") as f:
    long_article_ids = json.load(f)

# .copy() makes the filtered frame independent of the original, so adding the
# "article_cleaned" column below does not trigger pandas' SettingWithCopyWarning
df_corpus_nl = df_corpus_nl[~df_corpus_nl["id"].isin(long_article_ids)].copy()

# apply cleaning
df_corpus_nl["article_cleaned"] = df_corpus_nl["article"].apply(clean_article_start)

# making two directories for cleaned corpus and mixed corpus for comparison
os.makedirs("data/original_cleaned_mix_corpus", exist_ok=True)
os.makedirs("data/cleaned_corpus", exist_ok=True)

# mixed file: keeps the original "article" column next to "article_cleaned"
df_corpus_nl.to_csv("data/original_cleaned_mix_corpus/original_cleaned_mix_nl_corpus.csv", index=False)


# cleaned file: same columns as the original corpus, with the cleaned text as "article"
df_corpus_nl_original_format = df_corpus_nl[["id", "reference", "article_cleaned"]].rename(columns={"article_cleaned": "article"})
df_corpus_nl_original_format.to_csv("data/cleaned_corpus/corpus_nl_cleaned.csv", index=False)

print("Created a mixed CSV of original and cleaned Dutch article texts for comparison.")

print("Saved cleaned Dutch corpus as CSV.")

## French article cleaning

In [None]:
import re
import os

def clean_article_start(text):
    """Strip leading article markers and numbering from a French article text.

    The function first blanks out bracketed "(ancien article N ...)" style
    notes anywhere in the text, removes one leading marker ("Art.", "Article",
    "ANNEXE", ...), then a leading number with trailing punctuation, and
    finally cuts the text at the first uppercase letter that looks like the
    start of real content.

    Args:
        text: Raw article text (str).

    Returns:
        The text starting at the first accepted uppercase letter, stripped of
        surrounding whitespace; the stripped text itself if no uppercase
        letter is accepted.
    """
    # removing tokens in the start of the article
    # "ancien art\.?" accepts the abbreviation dot, so "(ancien art. 12)" is
    # removed as well (without the optional dot the digits are never reached).
    text = re.sub(r"[\(\[]\s*(ancien article|ancien art\.?|erronément intitulé art\.?)\s*\d+[^\]\)]*[\)\]]", " ", text, flags=re.IGNORECASE)
    # "Article" must be tried before "Art\.?": otherwise "Article 5" loses only
    # "Art" and the fragment "icle 5" stays in the text.
    text = re.sub(r"^(Article|Art\.?|ANNEXE|DROIT FUTUR|Antérieurement|Voir note sous TITRE|ancien article|Infraction à l'article)\s*", "", text, flags=re.IGNORECASE)

    # removing numbers, dots, non-words
    text = re.sub(r"^\d+[^\w]*\s*", "", text)

    # list of common words in the beginning
    forbidden_words = ["Art", "Article", "ANNEXE", "DROIT FUTUR", "Antérieurement", "Voir note sous TITRE", "ancien article", "Infraction à l'article"]

    # Locate every occurrence of a forbidden word once (longest alternatives
    # first so "Article" wins over "Art"). A capital is skipped only when it
    # lies inside one of these spans; checking a +-10 character window instead
    # would also reject real sentence starts such as "Les parties ...".
    forbidden_pattern = "|".join(
        re.escape(word) for word in sorted(forbidden_words, key=len, reverse=True)
    )
    forbidden_spans = [
        match.span() for match in re.finditer(forbidden_pattern, text, flags=re.IGNORECASE)
    ]

    # scanning for capital letter (str.isupper also accepts "À", "É", ...)
    for start_index, char in enumerate(text):
        if not char.isupper():
            continue

        # excluding capital letter if inside a common word
        if any(begin <= start_index < end for begin, end in forbidden_spans):
            continue

        # the character right after the capital must be whitespace, "*", "'"
        # or a letter (accented letters included, e.g. "Dé..."); a capital at
        # the very end of the text has no next character and is rejected
        next_char = text[start_index + 1:start_index + 2]
        if not (next_char.isspace() or next_char in ("*", "'") or next_char.isalpha()):
            continue

        return text[start_index:].strip()

    # nothing accepted → return the text as it is after prefix removal
    return text.strip()

# ---> REMOVING LONG ARTICLES BEFORE CLEANING <--- The import, the `with` block and the filter below can be taken out when working with the whole corpus
import json  # json.load is used below; this cell itself only imports re and os

with open("long_article_ids.json", "r", encoding="utf-8") as f:
    long_article_ids = json.load(f)

# .copy() makes the filtered frame independent of the original, so adding the
# "article_cleaned" column below does not trigger pandas' SettingWithCopyWarning
df_corpus_fr = df_corpus_fr[~df_corpus_fr["id"].isin(long_article_ids)].copy()
# apply cleaning
df_corpus_fr["article_cleaned"] = df_corpus_fr["article"].apply(clean_article_start)

# making two directories for cleaned corpus and mixed corpus for comparison
os.makedirs("data/original_cleaned_mix_corpus", exist_ok=True)
os.makedirs("data/cleaned_corpus", exist_ok=True)

# mixed file: keeps the original "article" column next to "article_cleaned"
df_corpus_fr.to_csv("data/original_cleaned_mix_corpus/original_cleaned_mix_fr_corpus.csv", index=False)

# cleaned file: same columns as the original corpus, with the cleaned text as "article"
df_corpus_fr_original_format = df_corpus_fr[["id", "reference", "article_cleaned"]].rename(columns={"article_cleaned": "article"})
df_corpus_fr_original_format.to_csv("data/cleaned_corpus/corpus_fr_cleaned.csv", index=False)

print("Created a mixed CSV of original and cleaned French article texts for comparison.")
print("Saved cleaned French corpus as CSV.")

## SAMPLING CODES (BM25)

In [None]:
from datasets import load_from_disk

# Load the cleaned corpus from disk and keep its French part
corpus = load_from_disk("/content/data/cleaned/corpus")
corpus_fr = corpus['fr']

# Load the cleaned test queries from disk and keep their French part
test = load_from_disk("/content/data/cleaned/test")
test_fr = test['fr']


### WITH SCORES AND RANKS FOR FRENCH

from datasets import load_from_disk
from rank_bm25 import BM25Okapi
from tqdm import tqdm
import json
import os
from nltk.tokenize.toktok import ToktokTokenizer

# BM25 ranking of the French corpus for every French test query.
# For each query the full ranked list and the top-ranked non-relevant
# documents ("hard negatives") are written as one JSON object per line.

# Initialize tokenizer
tokenizer = ToktokTokenizer()

# Load cleaned corpus and test queries (French as example)
corpus = load_from_disk("data/cleaned/corpus")['fr']
test_queries = load_from_disk("data/cleaned/test")['fr']

# Prepare corpus documents and their IDs
corpus_docs = [doc['article'] for doc in corpus]
corpus_ids = [str(doc['id']) for doc in corpus]  # str so they compare equal to the ids parsed from 'article_ids'

# Tokenize corpus with ToktokTokenizer
tokenized_corpus = [tokenizer.tokenize(doc.lower()) for doc in corpus_docs]
bm25 = BM25Okapi(tokenized_corpus)

# Prepare output
output = []

for query in tqdm(test_queries):

    query_id = query['id']
    query_text = query['question']

    # Get relevant article IDs for this query (comma-separated string, empty fragments dropped)
    relevant_ids = [id_.strip() for id_ in query['article_ids'].split(",") if id_.strip() != ""]
    relevant_id_set = set(relevant_ids)  # O(1) membership test inside the selection loop
    num_relevant = len(relevant_ids)

    # Relevant docs + negatives should add up to 100; never ask for a negative count
    num_negatives_needed = max(0, 100 - num_relevant)

    # Tokenize query text using ToktokTokenizer
    tokenized_query = tokenizer.tokenize(query_text.lower())

    # BM25 scoring: one score per corpus document, in corpus order
    bm25_scores = bm25.get_scores(tokenized_query)

    # Combine document ID and score; the rank is filled in after sorting
    scored_docs = [
        {"doc_id": doc_id, "score": float(score), "rank": None}
        for doc_id, score in zip(corpus_ids, bm25_scores)
    ]

    # Sort by score (high to low); the sort is stable, so ties keep corpus order
    scored_docs.sort(key=lambda x: x["score"], reverse=True)

    # Add final rank after sorting (1-based: rank 1 = highest score)
    for final_rank, doc in enumerate(scored_docs, start=1):
        doc["rank"] = final_rank

    # Select negatives (skip relevant ids). The limit is checked BEFORE appending,
    # so no negative is added when none is needed.
    hard_negatives = []

    for doc in scored_docs:
        if len(hard_negatives) >= num_negatives_needed:
            break
        if doc["doc_id"] not in relevant_id_set:
            hard_negatives.append(doc)

    # Save full ranked list + hard negatives
    output.append({
        "query_id": query_id,
        "query_text": query_text,
        "relevant_ids": relevant_ids,
        "bm25_ranked_list": scored_docs,  # FULL ranked list with doc_id, score, rank
        "hard_negatives": hard_negatives  # selected hard negatives only
    })

# Save to JSONL
os.makedirs("data/bm25_sampling", exist_ok=True)
output_path = "data/bm25_sampling/bm25_with_scores_and_ranks_fr.jsonl"

with open(output_path, "w", encoding="utf-8") as f:
    for entry in output:
        f.write(json.dumps(entry, ensure_ascii=False) + "\n")

#print("✅ BM25 sampling complete and saved (with scores and ranks).")



FileNotFoundError: [Errno 2] No such file or directory: 'bm25_hard_negatives_fr.jsonl'

In [None]:
### WITH SCORES AND RANKS FOR DUTCH

from datasets import load_from_disk
from rank_bm25 import BM25Okapi
from tqdm import tqdm
import json
import os
from nltk.tokenize.toktok import ToktokTokenizer

# BM25 ranking of the Dutch corpus for every Dutch test query.
# For each query the full ranked list and the top-ranked non-relevant
# documents ("hard negatives") are written as one JSON object per line.

# Initialize tokenizer
tokenizer = ToktokTokenizer()

# Load cleaned corpus and test queries (Dutch)
corpus = load_from_disk("data/cleaned/corpus")['nl']
test_queries = load_from_disk("data/cleaned/test")['nl']

# Prepare corpus documents and their IDs
corpus_docs = [doc['article'] for doc in corpus]
corpus_ids = [str(doc['id']) for doc in corpus]  # str so they compare equal to the ids parsed from 'article_ids'

# Tokenize corpus with ToktokTokenizer
tokenized_corpus = [tokenizer.tokenize(doc.lower()) for doc in corpus_docs]
bm25 = BM25Okapi(tokenized_corpus)

# Prepare output
output = []

for query in tqdm(test_queries):

    query_id = query['id']
    query_text = query['question']

    # Get relevant article IDs for this query (comma-separated string, empty fragments dropped)
    relevant_ids = [id_.strip() for id_ in query['article_ids'].split(",") if id_.strip() != ""]
    relevant_id_set = set(relevant_ids)  # O(1) membership test inside the selection loop
    num_relevant = len(relevant_ids)

    # Relevant docs + negatives should add up to 100; never ask for a negative count
    num_negatives_needed = max(0, 100 - num_relevant)

    # Tokenize query text using ToktokTokenizer
    tokenized_query = tokenizer.tokenize(query_text.lower())

    # BM25 scoring: one score per corpus document, in corpus order
    bm25_scores = bm25.get_scores(tokenized_query)

    # Combine document ID and score; the rank is filled in after sorting
    scored_docs = [
        {"doc_id": doc_id, "score": float(score), "rank": None}
        for doc_id, score in zip(corpus_ids, bm25_scores)
    ]

    # Sort by score (high to low); the sort is stable, so ties keep corpus order
    scored_docs.sort(key=lambda x: x["score"], reverse=True)

    # Add final rank after sorting (1-based: rank 1 = highest score)
    for final_rank, doc in enumerate(scored_docs, start=1):
        doc["rank"] = final_rank

    # Select negatives (skip relevant ids). The limit is checked BEFORE appending,
    # so no negative is added when none is needed.
    hard_negatives = []

    for doc in scored_docs:
        if len(hard_negatives) >= num_negatives_needed:
            break
        if doc["doc_id"] not in relevant_id_set:
            hard_negatives.append(doc)

    # Save full ranked list + hard negatives
    output.append({
        "query_id": query_id,
        "query_text": query_text,
        "relevant_ids": relevant_ids,
        "bm25_ranked_list": scored_docs,  # FULL ranked list with doc_id, score, rank
        "hard_negatives": hard_negatives  # selected hard negatives only
    })

# Save to JSONL
os.makedirs("data/bm25_sampling", exist_ok=True)
output_path = "data/bm25_sampling/bm25_with_scores_and_ranks_nl.jsonl"

with open(output_path, "w", encoding="utf-8") as f:
    for entry in output:
        f.write(json.dumps(entry, ensure_ascii=False) + "\n")

print("BM25 sampling complete and saved (with scores and ranks) for DUTCH.")

In [None]:
### FOR LOADING AND CHECKING


import json

# Load French BM25 results (one JSON object per line; blank lines are skipped)
fr_path = "data/bm25_sampling/bm25_with_scores_and_ranks_fr.jsonl"
with open(fr_path, "r", encoding="utf-8") as f:
    french_data = [json.loads(line) for line in f if line.strip()]

print(f"✅ Loaded {len(french_data)} French BM25 entries")
print("Example:")
print(french_data[0] if french_data else "(no entries)")  # Show first entry

# Load Dutch BM25 results
# The path matches the file written by the Dutch BM25 cell; the former
# "bm25_hard_negatives_nl.jsonl" is not produced anywhere in this notebook.
nl_path = "data/bm25_sampling/bm25_with_scores_and_ranks_nl.jsonl"
with open(nl_path, "r", encoding="utf-8") as f:
    dutch_data = [json.loads(line) for line in f if line.strip()]

print(f"\n✅ Loaded {len(dutch_data)} Dutch BM25 entries")
print("Example:")
print(dutch_data[0] if dutch_data else "(no entries)")  # Show first entry

## The following code removes relevant article IDs that refer to long articles from the test queries (and skips queries with more than 10 relevant articles). It can be used, but I will not use it here.

In [None]:
## Cleaning and saving as CSV and dataset.arrows
from datasets import load_dataset, Dataset, DatasetDict
import pandas as pd
import json
import os


# Fetch the test split of the dataset and pick out both language subsets
ds_test = load_dataset("clips/bBSARD", "test")
test_fr, test_nl = ds_test["fr"], ds_test["nl"]

# Load long article citation map (JSON object with one entry per language)
with open("queries_citing_long_articles.json", "r", encoding="utf-8") as f:
    queries_citing_long_articles = json.load(f)

long_articles_fr, long_articles_nl = (
    queries_citing_long_articles["fr"],
    queries_citing_long_articles["nl"],
)

# Cleaning function
def clean_query_set(test_set, long_article_map, lang_label, max_relevant=10):
    """Filter test queries and strip long-article ids from their relevant ids.

    A query is dropped when it cites more than ``max_relevant`` articles, or
    when no relevant article id is left after removing the long ones.

    Args:
        test_set: Iterable of query mappings with at least the keys "id" and
            "article_ids" (comma-separated id string).
        long_article_map: Mapping from query id (as str) to the article ids of
            that query that are considered too long.
        lang_label: Language name, used only in the progress message.
        max_relevant: Maximum number of relevant ids a query may have to be
            kept (default 10, the value that used to be hard-coded).

    Returns:
        A list of new dicts (the input queries are not modified) whose
        "article_ids" value is the remaining ids joined with ", ".
    """
    cleaned = []
    for query in test_set:
        query_id = str(query["id"])
        # Drop empty fragments ("", "1,,2", trailing comma) so they are neither
        # counted nor kept as ids — same parsing as in the BM25 cells.
        relevant_ids = [id_.strip() for id_ in query["article_ids"].split(",") if id_.strip()]

        if len(relevant_ids) > max_relevant:
            continue

        if query_id in long_article_map:
            # str() so ids stored as JSON numbers still match the string ids above
            long_ids = {str(article_id) for article_id in long_article_map[query_id]}
            relevant_ids = [id_ for id_ in relevant_ids if id_ not in long_ids]

        if not relevant_ids:
            continue

        query_cleaned = dict(query)  # shallow copy: leave the source record untouched
        query_cleaned["article_ids"] = ", ".join(relevant_ids)
        cleaned.append(query_cleaned)
    print(f"Cleaned {len(cleaned)} {lang_label} queries.")
    return cleaned

# Clean both language test sets with their respective long-article maps
cleaned_fr = clean_query_set(test_set=test_fr, long_article_map=long_articles_fr, lang_label="French")
cleaned_nl = clean_query_set(test_set=test_nl, long_article_map=long_articles_nl, lang_label="Dutch")

# CSV export: one file per language
os.makedirs("data/cleaned_queries_csv", exist_ok=True)
df_cleaned_fr = pd.DataFrame(cleaned_fr)
df_cleaned_fr.to_csv("data/cleaned_queries_csv/cleaned_test_queries_fr.csv", index=False)
df_cleaned_nl = pd.DataFrame(cleaned_nl)
df_cleaned_nl.to_csv("data/cleaned_queries_csv/cleaned_test_queries_nl.csv", index=False)

# Dataset export: both languages bundled in a single DatasetDict
ds_cleaned_fr = Dataset.from_list(cleaned_fr)
ds_cleaned_nl = Dataset.from_list(cleaned_nl)
ds_cleaned = DatasetDict({"fr": ds_cleaned_fr, "nl": ds_cleaned_nl})

os.makedirs("data/cleaned_queries_ds", exist_ok=True)
ds_cleaned.save_to_disk("data/cleaned_queries_ds/cleaned_test_queries")
print("Saved cleaned test queries to HuggingFace dataset format.")

## Counting words in the hard negatives to estimate token counts

In [18]:
import json
import pandas as pd
from pathlib import Path

# Per-query word count over the candidate documents listed in the
# hard-negatives files; one CSV per language is written to `output_dir`.

# CONFIG: paths — do NOT change
corpus_paths = {
    "fr": Path("data_processing/data/original_csv/corpus_fr.csv"),
    "nl": Path("data_processing/data/original_csv/corpus_nl.csv")
}
hard_negatives_paths = {
    "fr": Path("sampling_hard_negatives/hard_negatives/hard_negatives_fr.jsonl"),
    "nl": Path("sampling_hard_negatives/hard_negatives/hard_negatives_nl.jsonl")
}

# Output folder
output_dir = Path("hard_negatives_stats")
output_dir.mkdir(parents=True, exist_ok=True)

for lang in ["fr", "nl"]:
    print(f"\nProcessing language: {lang.upper()}")

    # Load corpus CSV
    print(f"Loading original corpus for language: {lang}")
    df_corpus = pd.read_csv(corpus_paths[lang])
    print(f"Loaded {len(df_corpus)} documents from CSV.")

    # Create ID → article mapping (keys are strings)
    id_to_doc = dict(zip(df_corpus['id'].astype(str), df_corpus['article']))
    print("Created ID-to-text mapping.")

    # Load hard negatives JSONL (blank lines, e.g. a trailing newline, are skipped)
    print(f"Loading hard negatives for language: {lang}")
    entries = []
    with open(hard_negatives_paths[lang], encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            entries.append(json.loads(line))

    print(f"Loaded {len(entries)} queries with hard negatives.")

    # Collect per-query total word counts
    results = []

    for entry in entries:
        query_id = entry['query_id']
        # NOTE(review): assumes 'candidate_docs' is a list of document ids — confirm against the JSONL producer
        candidate_ids = entry['candidate_docs']

        total_words = 0
        missing_docs = 0

        for doc_id in candidate_ids:
            # str(): the mapping keys are strings, while ids parsed from JSON may be numbers
            text = id_to_doc.get(str(doc_id), "")
            # an empty CSV cell is read by pandas as NaN (a float): count it as missing instead of crashing
            text = text.strip() if isinstance(text, str) else ""
            if not text:
                missing_docs += 1
                continue
            total_words += len(text.split())  # whitespace-separated words

        results.append({
            "query_id": query_id,
            "total_words": total_words,
            "missing_docs": missing_docs
        })

    df_results = pd.DataFrame(results)
    print("\nPer-query total word counts (first 5 rows):")
    print(df_results.head())

    # Save to CSV
    output_path = output_dir / f"query_word_counts_{lang}.csv"
    df_results.to_csv(output_path, index=False)
    print(f"Saved results to: {output_path}")


Processing language: FR
Loading original corpus for language: fr
Loaded 22417 documents from CSV.
Created ID-to-text mapping.
Loading hard negatives for language: fr
Loaded 203 queries with hard negatives.

Per-query total word counts (first 5 rows):
  query_id  total_words  missing_docs
0        4        22801             0
1        7        24975             0
2       16        13061             0
3       17        18037             0
4       25        14777             0
Saved results to: hard_negatives_stats/query_word_counts_fr.csv

Processing language: NL
Loading original corpus for language: nl
Loaded 22417 documents from CSV.
Created ID-to-text mapping.
Loading hard negatives for language: nl
Loaded 203 queries with hard negatives.

Per-query total word counts (first 5 rows):
  query_id  total_words  missing_docs
0        4        10448             0
1        7        18852             0
2       16        15842             0
3       17        17496             0
4       25    

In [19]:
import pandas as pd
from pathlib import Path

# Summarise the per-query word-count CSVs: the same statistics go to a text
# file and to stdout.

# Paths
output_dir = Path("hard_negatives_stats")
output_txt = output_dir / "query_word_counts_statistics.txt"

with open(output_txt, "w", encoding="utf-8") as f_out:
    for lang in ["fr", "nl"]:
        input_csv = output_dir / f"query_word_counts_{lang}.csv"
        df = pd.read_csv(input_csv)

        # Language banner
        banner = f"=== Statistics for language: {lang.upper()} ==="
        f_out.write(banner + "\n")
        print(banner)

        # describe() yields count / mean / std / min / quartiles / max
        desc_words = df['total_words'].describe()
        desc_missing = df['missing_docs'].describe()

        # Word-count section
        words_heading = "\nTotal Words (in 100 docs per query):"
        f_out.write(words_heading + "\n")
        f_out.write(desc_words.to_string() + "\n")
        print(words_heading)
        print(desc_words)

        # Missing-docs section; the file gets an extra blank line after it
        missing_heading = "\nMissing Docs (in 100 docs per query):"
        f_out.write(missing_heading + "\n")
        f_out.write(desc_missing.to_string() + "\n\n")
        print(missing_heading)
        print(desc_missing)
        print("\n")

print(f"Statistics saved to: {output_txt}")

=== Statistics for language: FR ===

Total Words (in 100 docs per query):
count      203.000000
mean     23812.359606
std       6935.895663
min       6265.000000
25%      18975.500000
50%      23403.000000
75%      28420.500000
max      40998.000000
Name: total_words, dtype: float64

Missing Docs (in 100 docs per query):
count    203.0
mean       0.0
std        0.0
min        0.0
25%        0.0
50%        0.0
75%        0.0
max        0.0
Name: missing_docs, dtype: float64


=== Statistics for language: NL ===

Total Words (in 100 docs per query):
count      203.000000
mean     22563.310345
std       6651.436819
min       8261.000000
25%      17574.000000
50%      22393.000000
75%      26977.000000
max      41883.000000
Name: total_words, dtype: float64

Missing Docs (in 100 docs per query):
count    203.0
mean       0.0
std        0.0
min        0.0
25%        0.0
50%        0.0
75%        0.0
max        0.0
Name: missing_docs, dtype: float64


Statistics saved to: hard_negatives_stat

In [15]:
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# Histogram of the per-query total word counts, one PNG per language.

# Create output folder
output_dir = Path("hard_negatives_stats")
output_dir.mkdir(parents=True, exist_ok=True)

for lang in ["fr", "nl"]:
    # The word-count cell writes these CSVs into `output_dir`, so read them
    # from there rather than from the current working directory.
    input_csv = output_dir / f"query_word_counts_{lang}.csv"

    # Load per-query results
    df = pd.read_csv(input_csv)

    # Plot histogram
    plt.figure(figsize=(10, 6))
    plt.hist(df['total_words'], bins=30, color='skyblue', edgecolor='black')
    plt.title(f"Distribution of Total Words Across Queries ({lang.upper()})")
    plt.xlabel("Total Words in 100 Documents")
    plt.ylabel("Number of Queries")
    plt.grid(axis='y', alpha=0.75)

    # Save, then close the figure so figures do not pile up across iterations
    output_png = output_dir / f"query_word_counts_histogram_{lang}.png"
    plt.savefig(output_png, dpi=150, bbox_inches='tight')
    plt.close()

    print(f"Histogram saved to: {output_png}")

Histogram saved to: hard_negatives_stats/query_word_counts_histogram_fr.png
Histogram saved to: hard_negatives_stats/query_word_counts_histogram_nl.png


In [14]:
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# Bar plot of the total word count per query, one PNG per language.

# Create output folder
output_dir = Path("hard_negatives_stats")
output_dir.mkdir(parents=True, exist_ok=True)

for lang in ["fr", "nl"]:
    # The word-count cell writes these CSVs into `output_dir`, so read them
    # from there rather than from the current working directory.
    input_csv = output_dir / f"query_word_counts_{lang}.csv"

    # Load results
    df = pd.read_csv(input_csv)

    # Sort by query_id BEFORE casting to str: sorting the string form is
    # lexicographic ("10" < "4"). The cast afterwards makes matplotlib treat
    # the ids as categories, i.e. one evenly spaced bar per query.
    df = df.sort_values(by='query_id')
    df['query_id'] = df['query_id'].astype(str)

    # Plot
    plt.figure(figsize=(16, 6))
    plt.bar(df['query_id'], df['total_words'], color='steelblue')
    plt.title(f"Total Words in 100 Candidate Docs per Query ({lang.upper()})")
    plt.xlabel("Query ID")
    plt.ylabel("Total Words")
    plt.xticks(rotation=90, fontsize=6)  # rotate x-axis labels for readability
    plt.tight_layout()

    # Save, then close the figure so figures do not pile up across iterations
    output_png = output_dir / f"query_word_counts_barplot_{lang}.png"
    plt.savefig(output_png, dpi=150, bbox_inches='tight')
    plt.close()

    print(f"Bar plot saved to: {output_png}")

Bar plot saved to: hard_negatives_stats/query_word_counts_barplot_fr.png
Bar plot saved to: hard_negatives_stats/query_word_counts_barplot_nl.png


In [17]:
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# Side-by-side box plot of the per-query total word counts (FR vs NL).

# Create output folder
output_dir = Path("hard_negatives_stats")
output_dir.mkdir(parents=True, exist_ok=True)

# Load both datasets
data = {}
for lang in ["fr", "nl"]:
    # The word-count cell writes these CSVs into `output_dir`, so read them
    # from there rather than from the current working directory.
    input_csv = output_dir / f"query_word_counts_{lang}.csv"
    df = pd.read_csv(input_csv)
    data[lang.upper()] = df['total_words']

# Plot boxplot (vertical is the default orientation)
plt.figure(figsize=(8, 6))
plt.boxplot([data["FR"], data["NL"]],
            patch_artist=True,
            boxprops=dict(facecolor='lightblue', color='black'),
            medianprops=dict(color='red'))
# Name the boxes through the tick labels: the `labels=` keyword of boxplot is
# deprecated in recent Matplotlib releases (presumably the warning seen in this
# cell's output), while xticks works on old and new versions alike.
# Boxes sit at x positions 1..N by default.
plt.xticks([1, 2], ["FR", "NL"])

plt.title("Box Plot of Total Words Across Queries (FR vs NL)")
plt.ylabel("Total Words in 100 Documents")
plt.grid(axis='y', alpha=0.5)

# Save
output_png = output_dir / "query_word_counts_boxplot_fr_nl.png"
plt.savefig(output_png, dpi=150, bbox_inches='tight')
plt.close()

print(f"Box plot saved to: {output_png}")

Box plot saved to: hard_negatives_stats/query_word_counts_boxplot_fr_nl.png


  plt.boxplot([data["FR"], data["NL"]],


## Counting tokens

### LLAMA tokens

In [23]:
# Statistics into one text file (every line is also echoed to stdout)
# NOTE(review): `base_dir` and `dfs` are defined outside this cell — presumably a
# Path and a {language: DataFrame} mapping; confirm against the defining cell.
stats_txt = base_dir / "query_word_counts_and_tokens_llama_statistics.txt"
with open(stats_txt, "w", encoding="utf-8") as f_out:
    for lang in ["fr", "nl"]:
        language_banner = f"=== Statistics for language: {lang.upper()} ==="
        f_out.write(language_banner + "\n")
        print("\n" + language_banner)

        df_lang = dfs[lang]

        # describe() summaries, in the order they are reported
        described_columns = [
            ("Total Tokens", df_lang['total_tokens'].describe()),
            ("Total Words", df_lang['total_words'].describe()),
            ("Missing Docs", df_lang['missing_docs'].describe()),
        ]

        # Grand totals over all queries of this language
        total_tokens_all_queries = df_lang['total_tokens'].sum()
        total_words_all_queries = df_lang['total_words'].sum()

        for section_title, description in described_columns:
            section_heading = f"\n{section_title} (in 100 docs per query):"
            f_out.write(section_heading + "\n")
            print(section_heading)
            f_out.write(description.to_string() + "\n")
            print(description)

        words_line = f"\n*** TOTAL WORDS across all queries: {total_words_all_queries} ***"
        f_out.write(words_line + "\n")
        print(words_line)

        # the file gets a blank line after each language section
        tokens_line = f"*** TOTAL TOKENS across all queries: {total_tokens_all_queries} ***"
        f_out.write(tokens_line + "\n\n")
        print(tokens_line)

print(f"\nStatistics saved to: {stats_txt}")


=== Statistics for language: FR ===

Total Tokens (in 100 docs per query):
count      203.000000
mean     44067.000000
std      12753.935272
min      11470.000000
25%      35263.500000
50%      43301.000000
75%      52735.500000
max      77976.000000
Name: total_tokens, dtype: float64

Total Words (in 100 docs per query):
count      203.000000
mean     23812.359606
std       6935.895663
min       6265.000000
25%      18975.500000
50%      23403.000000
75%      28420.500000
max      40998.000000
Name: total_words, dtype: float64

Missing Docs (in 100 docs per query):
count    203.0
mean       0.0
std        0.0
min        0.0
25%        0.0
50%        0.0
75%        0.0
max        0.0
Name: missing_docs, dtype: float64

*** TOTAL WORDS across all queries: 4833909 ***
*** TOTAL TOKENS across all queries: 8945601 ***

=== Statistics for language: NL ===

Total Tokens (in 100 docs per query):
count      203.000000
mean     47242.541872
std      13846.857202
min      16840.000000
25%      