## Queries with long relevant articles in the original test set
### <<< All queries are considered, no matter how many relevant articles they have. >>>

In [55]:
from datasets import load_dataset
import json

with open("long_article_ids.json", "r", encoding="utf-8") as f:
    long_article_ids = [str(id).strip() for id in json.load(f)]

ds_test = load_dataset("clips/bBSARD", "test")

test_fr = ds_test['fr']
test_nl = ds_test['nl']


# Check which long articles are cited in the test queries (French)
cited_long_articles_fr = []
for query in test_fr:
    relevant_ids = query['article_ids'].split(",")
    relevant_ids = [id_.strip() for id_ in relevant_ids]  # normalize by stripping spaces
    for doc_id in relevant_ids:
        if doc_id in long_article_ids:
            cited_long_articles_fr.append((query['id'], doc_id))

# Check which long articles are cited in the test queries (Dutch)
cited_long_articles_nl = []
for query in test_nl:
    relevant_ids = query['article_ids'].split(",")
    relevant_ids = [id_.strip() for id_ in relevant_ids]  # normalize by stripping spaces
    for doc_id in relevant_ids:
        if doc_id in long_article_ids:
            cited_long_articles_nl.append((query['id'], doc_id))

unique_queries_fr = set(query_id for query_id, doc_id in cited_long_articles_fr)
unique_long_articles_fr = set(doc_id for query_id, doc_id in cited_long_articles_fr)
num_unique_long_articles_fr = len(unique_long_articles_fr)


unique_queries_nl = set(query_id for query_id, doc_id in cited_long_articles_nl)
unique_long_articles_nl = set(doc_id for query_id, doc_id in cited_long_articles_nl)
num_unique_long_articles_nl = len(unique_long_articles_nl)

# Report results
print(f"French test queries citing long articles: {len(cited_long_articles_fr)}")
print("Examples (Query ID, Long Article ID):")
print(cited_long_articles_fr) 
print("\nNumber of French unique queries citing long articles:", len(unique_queries_fr))
print("List of French unique citing long articles:", unique_queries_fr)
print("\nFrench unique long articles cited:", num_unique_long_articles_fr)
print("List of French long articles cited:", unique_long_articles_fr)

print("\n" + "="*60 + "\n")

print(f"Dutch test queries citing long articles: {len(cited_long_articles_nl)}")
print("Examples (Query ID, Long Article ID):")
print(cited_long_articles_nl) 
print("\nNumber of Dutch unique queries citing long articles:", len(unique_queries_nl))
print("List of Dutch queries citing long articles:", unique_queries_nl)
print("\nDutch unique long articles cited:", num_unique_long_articles_nl)
print("List of Dutch long articles cited:", unique_long_articles_nl)

French test queries citing long articles: 41
Examples (Query ID, Long Article ID):
[(634, '13310'), (116, '5958'), (228, '18548'), (506, '13307'), (146, '4587'), (365, '1403'), (783, '4646'), (783, '4693'), (633, '13310'), (721, '6121'), (142, '4587'), (893, '856'), (684, '4756'), (1001, '611'), (649, '6103'), (649, '6104'), (215, '18548'), (877, '856'), (214, '4694'), (686, '4756'), (117, '5958'), (230, '18548'), (875, '856'), (1039, '4694'), (54, '14194'), (54, '14202'), (867, '856'), (132, '13531'), (691, '4756'), (701, '4756'), (220, '4694'), (632, '13310'), (231, '18548'), (219, '18548'), (664, '13056'), (982, '856'), (672, '13056'), (897, '856'), (983, '856'), (616, '4756'), (930, '1490')]

Number of French unique queries citing long articles: 38
List of French unique citing long articles: {897, 132, 649, 142, 783, 1039, 146, 664, 672, 930, 684, 686, 691, 54, 701, 721, 214, 215, 982, 983, 219, 220, 867, 228, 230, 231, 616, 1001, 875, 365, 877, 116, 117, 634, 632, 633, 506, 893}



## Queries with long relevant articles after removing the queries with more that 10 rel. art.
### <<< Only queries with <=10 articles are considered. >>>>

In [53]:
import json
from datasets import load_dataset

# Load long article IDs
with open("long_article_ids.json", "r", encoding="utf-8") as f:
    long_article_ids = [str(id).strip() for id in json.load(f)]

# Load queries with too many relevant articles → skip these
with open("queries_with_many_relevant_articles.json", "r", encoding="utf-8") as f:
    queries_to_remove = json.load(f)

queries_to_remove_fr = set(queries_to_remove["fr"])
queries_to_remove_nl = set(queries_to_remove["nl"])

# Load test sets
ds_test = load_dataset("clips/bBSARD", "test")
test_fr = ds_test['fr']
test_nl = ds_test['nl']

# === French
cited_long_articles_fr = {}

for query in test_fr:
    query_id = query['id']
    if query_id in queries_to_remove_fr:
        continue  # skip queries with too many relevant articles

    relevant_ids = [id_.strip() for id_ in query['article_ids'].split(",")]

    long_cited = [doc_id for doc_id in relevant_ids if doc_id in long_article_ids]

    if long_cited:
        cited_long_articles_fr[str(query_id)] = long_cited

# === Dutch
cited_long_articles_nl = {}

for query in test_nl:
    query_id = query['id']
    if query_id in queries_to_remove_nl:
        continue  # skip queries with too many relevant articles

    relevant_ids = [id_.strip() for id_ in query['article_ids'].split(",")]

    long_cited = [doc_id for doc_id in relevant_ids if doc_id in long_article_ids]

    if long_cited:
        cited_long_articles_nl[str(query_id)] = long_cited

# Combine into one dictionary
cited_long_articles = {
    "fr": cited_long_articles_fr,
    "nl": cited_long_articles_nl
}

with open("queries_citing_long_articles.json", "w", encoding="utf-8") as f:
    json.dump(cited_long_articles, f, ensure_ascii=False, indent=2)

print("Saved queries citing long articles to queries_citing_long_articles.json")
print(f"French queries: {len(cited_long_articles_fr)}")
print(f"Dutch queries: {len(cited_long_articles_nl)}")



print(f"French test queries citing long articles: {len(cited_long_articles_fr)}")
print("Examples (Query ID, Long Article ID):")
print(cited_long_articles_fr) 
print("\nNumber of French unique queries citing long articles:", len(unique_queries_fr))
print("List of French unique citing long articles:", unique_queries_fr)
print("\nFrench unique long articles cited:", num_unique_long_articles_fr)
print("List of French long articles cited:", unique_long_articles_fr)

print("\n" + "="*60 + "\n")

print(f"Dutch test queries citing long articles: {len(cited_long_articles_nl)}")
print("Examples (Query ID, Long Article ID):")
print(cited_long_articles_nl) 
print("\nNumber of Dutch unique queries citing long articles:", len(unique_queries_nl))
print("List of Dutch queries citing long articles:", unique_queries_nl)
print("\nDutch unique long articles cited:", num_unique_long_articles_nl)
print("List of Dutch long articles cited:", unique_long_articles_nl)

Saved queries citing long articles to queries_citing_long_articles.json
French queries: 33
Dutch queries: 33
French test queries citing long articles: 33
Examples (Query ID, Long Article ID):
{'634': ['13310'], '116': ['5958'], '228': ['18548'], '506': ['13307'], '146': ['4587'], '365': ['1403'], '783': ['4646', '4693'], '633': ['13310'], '721': ['6121'], '142': ['4587'], '893': ['856'], '1001': ['611'], '649': ['6103', '6104'], '215': ['18548'], '877': ['856'], '214': ['4694'], '117': ['5958'], '230': ['18548'], '875': ['856'], '1039': ['4694'], '54': ['14194', '14202'], '867': ['856'], '132': ['13531'], '220': ['4694'], '632': ['13310'], '231': ['18548'], '219': ['18548'], '664': ['13056'], '982': ['856'], '672': ['13056'], '897': ['856'], '983': ['856'], '930': ['1490']}

Number of French unique queries citing long articles: 38
List of French unique citing long articles: {897, 132, 649, 142, 783, 1039, 146, 664, 672, 930, 684, 686, 691, 54, 701, 721, 214, 215, 982, 983, 219, 220, 86

## Cleaning the queries data
## Removing article ids of long articles from the relevant articles field
### <<< 1: queries with more than 10 articles will be completely removed >>>
### <<< Long article ids will be removed from the relevant id field. >>>

In [None]:
## Cleaning and saving as CSV and dataset.arrows
from datasets import load_dataset, Dataset, DatasetDict
import pandas as pd
import json
import os


ds_test = load_dataset("clips/bBSARD", "test")
test_fr = ds_test["fr"]
test_nl = ds_test["nl"]

# Load long article citation map
with open("queries_citing_long_articles.json", "r", encoding="utf-8") as f:
    queries_citing_long_articles = json.load(f)

long_articles_fr = queries_citing_long_articles["fr"]
long_articles_nl = queries_citing_long_articles["nl"]

# Cleaning function
def clean_query_set(test_set, long_article_map, lang_label):
    cleaned = []
    for query in test_set:
        query_id = str(query["id"])
        relevant_ids = [id_.strip() for id_ in query["article_ids"].split(",")]

        if len(relevant_ids) > 10:
            continue

        if query_id in long_article_map:
            long_ids = set(long_article_map[query_id])
            relevant_ids = [id_ for id_ in relevant_ids if id_ not in long_ids]

        if len(relevant_ids) == 0:
            continue

        query_cleaned = dict(query)
        query_cleaned["article_ids"] = ", ".join(relevant_ids)
        cleaned.append(query_cleaned)
    print(f"Cleaned {len(cleaned)} {lang_label} queries.")
    return cleaned

cleaned_fr = clean_query_set(test_fr, long_articles_fr, "French")
cleaned_nl = clean_query_set(test_nl, long_articles_nl, "Dutch")

os.makedirs("data/cleaned_queries_csv", exist_ok=True)
pd.DataFrame(cleaned_fr).to_csv("data/cleaned_queries_csv/cleaned_test_queries_fr.csv", index=False)
pd.DataFrame(cleaned_nl).to_csv("data/cleaned_queries_csv/cleaned_test_queries_nl.csv", index=False)

ds_cleaned_fr = Dataset.from_list(cleaned_fr)
ds_cleaned_nl = Dataset.from_list(cleaned_nl)

ds_cleaned = DatasetDict({
    "fr": ds_cleaned_fr,
    "nl": ds_cleaned_nl
})
os.makedirs("data/cleaned_queries_ds", exist_ok=True)
ds_cleaned.save_to_disk("data/cleaned_queries_ds/cleaned_test_queries")
print("Saved cleaned test queries to HuggingFace dataset format.")

Cleaned 190 French queries.
Cleaned 190 Dutch queries.


Saving the dataset (1/1 shards): 100%|██████████| 190/190 [00:00<00:00, 23156.79 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 190/190 [00:00<00:00, 77581.56 examples/s]

Saved cleaned test queries to HuggingFace dataset format.





In [84]:
from datasets import load_from_disk
ds = load_from_disk("data/cleaned_queries_ds/cleaned_test_queries")
print(ds["fr"].shape)
print(ds["nl"].shape)

(190, 6)
(190, 6)


In [85]:
from datasets import load_from_disk
ds = load_from_disk("data/cleaned_queries_ds/cleaned_test_queries")
print(ds["fr"][0])

{'id': 775, 'category': 'Logement', 'subcategory': 'Insalubrité en Wallonie', 'question': "Quels sont les critères communaux d'insalubrité ?", 'extra_description': None, 'article_ids': '11822'}


In [86]:
from datasets import load_from_disk
import random

dataset_path = "data/cleaned_queries_ds/cleaned_test_queries"
ds = load_from_disk(dataset_path)

print(ds)

print("\n--- Random French Queries ---")
fr_samples = random.sample(list(ds["fr"]), k=min(5, len(ds["fr"])))
for i, sample in enumerate(fr_samples, 1):
    print(f"{i}. ID: {sample['id']} | Relevant articles: {sample['article_ids']}")

print("\n--- Random Dutch Queries ---")
nl_samples = random.sample(list(ds["nl"]), k=min(5, len(ds["nl"])))
for i, sample in enumerate(nl_samples, 1):
    print(f"{i}. ID: {sample['id']} | Relevant articles: {sample['article_ids']}")

DatasetDict({
    fr: Dataset({
        features: ['id', 'category', 'subcategory', 'question', 'extra_description', 'article_ids'],
        num_rows: 190
    })
    nl: Dataset({
        features: ['id', 'category', 'subcategory', 'question', 'extra_description', 'article_ids'],
        num_rows: 190
    })
})

--- Random French Queries ---
1. ID: 824 | Relevant articles: 11922
2. ID: 132 | Relevant articles: 5970, 22325
3. ID: 262 | Relevant articles: 1277
4. ID: 172 | Relevant articles: 1863, 1864, 1865
5. ID: 430 | Relevant articles: 1274, 1275, 1276

--- Random Dutch Queries ---
1. ID: 257 | Relevant articles: 1289, 6556
2. ID: 7 | Relevant articles: 2222
3. ID: 861 | Relevant articles: 5559, 5560, 5561, 5562, 5563
4. ID: 493 | Relevant articles: 1082, 891, 5466, 5467, 5468
5. ID: 724 | Relevant articles: 13138


## The following code is only for analysis

In [87]:

'''
############################################################################################################

>>> This script is pre-removal: It is only for checking the statistics in the original data <<<

############################################################################################################
'''


from datasets import load_dataset
import json

ds_corpus = load_dataset("clips/bBSARD", "corpus")
ds_test = load_dataset("clips/bBSARD", "test")

corpus_fr = ds_corpus['fr']
corpus_nl = ds_corpus['nl']

test_fr = ds_test['fr']
test_nl = ds_test['nl']

threshold = 5000  

long_article_ids_fr = set(str(doc['id']) for doc in corpus_fr if len(doc['article']) > threshold)
long_article_ids_nl = set(str(doc['id']) for doc in corpus_nl if len(doc['article']) > threshold)

cited_long_articles_fr = []
for query in test_fr:
    relevant_ids = query['article_ids'].split(",")
    relevant_ids = [id_.strip() for id_ in relevant_ids] 
    for doc_id in relevant_ids:
        if doc_id in long_article_ids_fr:
            cited_long_articles_fr.append((query['id'], doc_id))

cited_long_articles_nl = []
for query in test_nl:
    relevant_ids = query['article_ids'].split(",")
    relevant_ids = [id_.strip() for id_ in relevant_ids] 
    for doc_id in relevant_ids:
        if doc_id in long_article_ids_nl:
            cited_long_articles_nl.append((query['id'], doc_id))

unique_queries_fr = set(query_id for query_id, doc_id in cited_long_articles_fr)
unique_long_articles_fr = set(doc_id for query_id, doc_id in cited_long_articles_fr)
num_unique_long_articles_fr = len(unique_long_articles_fr)


unique_queries_nl = set(query_id for query_id, doc_id in cited_long_articles_nl)
unique_long_articles_nl = set(doc_id for query_id, doc_id in cited_long_articles_nl)
num_unique_long_articles_nl = len(unique_long_articles_nl)

# results
print(f"French test queries citing long articles: {len(cited_long_articles_fr)}")
print("Examples (Query ID, Long Article ID):")
print(cited_long_articles_fr) 
print("\nNumber of French unique queries citing long articles:", len(unique_queries_fr))
print("List of French unique citing long articles:", unique_queries_fr)
print("\nFrench unique long articles cited:", num_unique_long_articles_fr)
print("List of French long articles cited:", unique_long_articles_fr)

print("\n" + "="*60 + "\n")

print(f"Dutch test queries citing long articles: {len(cited_long_articles_nl)}")
print("Examples (Query ID, Long Article ID):")
print(cited_long_articles_nl) 
print("\nNumber of Dutch unique queries citing long articles:", len(unique_queries_nl))
print("List of Dutch queries citing long articles:", unique_queries_nl)
print("\nDutch unique long articles cited:", num_unique_long_articles_nl)
print("List of Dutch long articles cited:", unique_long_articles_nl)

French test queries citing long articles: 38
Examples (Query ID, Long Article ID):
[(634, '13310'), (228, '18548'), (506, '13307'), (146, '4587'), (365, '1403'), (783, '4693'), (633, '13310'), (721, '6121'), (142, '4587'), (893, '856'), (684, '4756'), (1001, '611'), (649, '6103'), (649, '6104'), (215, '18548'), (877, '856'), (214, '4694'), (686, '4756'), (230, '18548'), (875, '856'), (1039, '4694'), (54, '14194'), (54, '14202'), (867, '856'), (132, '13531'), (691, '4756'), (701, '4756'), (220, '4694'), (632, '13310'), (231, '18548'), (219, '18548'), (664, '13056'), (982, '856'), (672, '13056'), (897, '856'), (983, '856'), (616, '4756'), (930, '1490')]

Number of French unique queries citing long articles: 36
List of French unique citing long articles: {897, 132, 649, 142, 783, 1039, 146, 664, 672, 930, 684, 686, 691, 54, 701, 721, 214, 215, 982, 983, 219, 220, 867, 228, 230, 231, 616, 1001, 875, 365, 877, 634, 632, 633, 506, 893}

French unique long articles cited: 18
List of French lo