## LOADING THE DATA

In [47]:
from datasets import load_dataset
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

ds_corpus = load_dataset("clips/bBSARD", "corpus")
ds_test = load_dataset("clips/bBSARD", "test")

corpus_fr = ds_corpus['fr']
test_fr = ds_test['fr']

corpus_nl = ds_corpus['nl']
test_nl = ds_test['nl']

queries_fr = test_fr
queries_nl = test_nl

In [4]:
corpus_fr[500:510]['article']

["Art. 265. Toute convention, écrite ou verbale portant sur une aliénation soumise au droit de préemption conformément à l'article 263 est irréfragablement réputée conclue sous condition suspensive de non-exercice du droit de préemption établi en vertu du présent Titre. . (579)",
 "Art. 266 § 1er. Toute personne physique ou morale, titulaire de droits réels sur tout ou partie d'immeubles bâtis ou non bâtis situés dans un périmètre de préemption qui a l'intention d'aliéner tout ou partie de ces droits réels immobiliers, a l'obligation d'en informer l'administration. Cette déclaration d'intention d'aliéner doit être effectuée dès la diffusion de l'offre d'aliéner, sur quelque support que ce soit. A défaut, le notaire ou l'agent immobilier au sens de l'arrêté royal du 6 septembre 1993 protégeant le titre professionnel et l'exercice de la profession d'agent immobilier, en ce compris les personnes visées à l'article 4 de cet arrêté, chargé de cette aliénation est soumis à la même obligation

## MAKING CSV / LOADING long_article_ids.json

In [48]:
import json

with open("long_article_ids.json", "r", encoding="utf-8") as f:
    long_article_ids = json.load(f)

In [49]:
ds_corpus = load_dataset("clips/bBSARD", "corpus")
ds_test = load_dataset("clips/bBSARD", "test")

corpus_fr = ds_corpus['fr']
corpus_nl = ds_corpus['nl']

test_fr = ds_test['fr']
test_nl = ds_test['nl']

df_corpus_fr = pd.DataFrame(corpus_fr)
df_corpus_nl = pd.DataFrame(corpus_nl)

df_test_fr = pd.DataFrame(test_fr)
df_test_nl = pd.DataFrame(test_nl)

os.makedirs("data/original_csv", exist_ok=True)

df_corpus_fr.to_csv("data/original_csv/corpus_fr.csv", index=False)
df_corpus_nl.to_csv("data/original_csv/corpus_nl.csv", index=False)

df_test_fr.to_csv("data/original_csv/test_fr.csv", index=False)
df_test_nl.to_csv("data/original_csv/test_nl.csv", index=False)

print("Original datasets saved as CSV.")

Original datasets saved as CSV.


In [None]:
# French and Dutch corpus cleaning script (combined version with separate blocks)
import re
import os
import json
import pandas as pd
from datasets import load_dataset, Dataset

# === Load original corpus ===
ds_corpus = load_dataset("clips/bBSARD", "corpus")
corpus_fr = ds_corpus["fr"]
corpus_nl = ds_corpus["nl"]

# === Load long article IDs ===
with open("long_article_ids.json", "r", encoding="utf-8") as f:
    long_article_ids = set(str(id).strip() for id in json.load(f))

# === French Cleaning ===
def clean_article_start_fr(text):
    text = re.sub(r"[\(\[]\s*(ancien article|ancien art|erronément intitulé art\.?)\s*\d+[^\]\)]*[\)\]]", " ", text, flags=re.IGNORECASE)
    text = re.sub(r"^(Art\.?|Article|ANNEXE|DROIT FUTUR|Antérieurement|Voir note sous TITRE|ancien article|Infraction à l'article)\s*", "", text, flags=re.IGNORECASE)
    text = re.sub(r"^\d+[^\w]*\s*", "", text)
    forbidden_words = ["Art", "Article", "ANNEXE", "DROIT FUTUR", "Antérieurement", "Voir note sous TITRE", "ancien article", "Infraction à l'article"]
    for match in re.finditer(r"[A-Z]", text):
        start_index = match.start()
        window_start = max(0, start_index - 10)
        window_text = text[window_start:start_index + 10]
        if any(forbidden.lower() in window_text.lower() for forbidden in forbidden_words):
            continue
        next_chars = text[start_index+1:start_index+3]
        if not re.match(r"[\s*'a-zA-Z]{1,2}", next_chars):
            continue
        return text[start_index:].strip()
    return text.strip()

cleaned_fr = []
original_and_cleaned_fr = []
for entry in corpus_fr:
    if str(entry["id"]) in long_article_ids:
        continue
    cleaned_text = clean_article_start_fr(entry["article"])
    cleaned_entry = {"id": entry["id"], "reference": entry["reference"], "article": cleaned_text}
    cleaned_fr.append(cleaned_entry)
    original_and_cleaned_fr.append({"id": entry["id"], "reference": entry["reference"], "article": entry["article"], "article_cleaned": cleaned_text})

# === Dutch Cleaning ===
def clean_article_start_nl(text):
    text = re.sub(r"^(Art\.?|Artikel|ANNEXE|DROIT FUTUR|Antérieurement|Voir note sous TITRE|BIJLAGE|Inbreuk op artikel|Voorheen)\s*", "", text, flags=re.IGNORECASE)
    text = re.sub(r"^\d+[^\w]*\s*", "", text)
    forbidden_words = ["Art", "Artikel", "ANNEXE", "DROIT FUTUR", "Antérieurement", "Voir note sous TITRE", "BIJLAGE", "Inbreuk op artikel", "Voorheen"]
    for match in re.finditer(r"[A-Z]", text):
        start_index = match.start()
        window_start = max(0, start_index - 10)
        window_text = text[window_start:start_index + 10]
        if any(forbidden.lower() in window_text.lower() for forbidden in forbidden_words):
            continue
        next_chars = text[start_index+1:start_index+3]
        if not re.match(r"[\s*'a-zA-Z]{1,2}", next_chars):
            continue
        return text[start_index:].strip()
    return text.strip()

cleaned_nl = []
original_and_cleaned_nl = []
for entry in corpus_nl:
    if str(entry["id"]) in long_article_ids:
        continue
    cleaned_text = clean_article_start_nl(entry["article"])
    cleaned_entry = {"id": entry["id"], "reference": entry["reference"], "article": cleaned_text}
    cleaned_nl.append(cleaned_entry)
    original_and_cleaned_nl.append({"id": entry["id"], "reference": entry["reference"], "article": entry["article"], "article_cleaned": cleaned_text})

# === Save all outputs ===
os.makedirs("data/cleaned_corpus", exist_ok=True)
os.makedirs("data/original_cleaned_mix_corpus", exist_ok=True)
os.makedirs("data/cleaned_corpus_ds", exist_ok=True)

# CSVs
pd.DataFrame(cleaned_fr).to_csv("data/cleaned_corpus/corpus_fr_cleaned.csv", index=False)
pd.DataFrame(original_and_cleaned_fr).to_csv("data/original_cleaned_mix_corpus/original_cleaned_mix_fr_corpus.csv", index=False)

pd.DataFrame(cleaned_nl).to_csv("data/cleaned_corpus/corpus_nl_cleaned.csv", index=False)
pd.DataFrame(original_and_cleaned_nl).to_csv("data/original_cleaned_mix_corpus/original_cleaned_mix_nl_corpus.csv", index=False)

# datasets.arrows
from datasets import DatasetDict

ds_corpus_cleaned = DatasetDict({
    "fr": Dataset.from_list(cleaned_fr),
    "nl": Dataset.from_list(cleaned_nl)
})

ds_corpus_cleaned.save_to_disk("data/cleaned_corpus_ds/cleaned_corpus")

print("✅ Both French and Dutch corpus cleaned and saved in CSV and HuggingFace formats.")


Saving the dataset (1/1 shards): 100%|██████████| 22033/22033 [00:00<00:00, 2711095.14 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 22033/22033 [00:00<00:00, 3119429.54 examples/s]

✅ Both French and Dutch corpus cleaned and saved in CSV and HuggingFace formats.





## CHECKING THE CLEANING PROCESS for articles (based on length difference between original and cleaned article)

In [None]:
# French articles 
import pandas as pd
import os

input_csv = "data/original_cleaned_mix_corpus/original_cleaned_mix_fr_corpus.csv" 
output_csv = "data/original_cleaned_mix_corpus/cleaning_difference_check_fr.csv"
threshold = 100 

df = pd.read_csv(input_csv)

# lengths of articles (char_count)
df["original_length"] = df["article"].str.len()
df["cleaned_length"] = df["article_cleaned"].str.len()

# length difference between original and cleaned article
df["length_diff"] = df["original_length"] - df["cleaned_length"]

# filtering where length diff. bigger than threshold
df_filtered = df[df["length_diff"] > threshold][["id", "article", "article_cleaned", "original_length", "cleaned_length", "length_diff"]]

df_filtered.to_csv(output_csv, index=False)

print(f"Saved {len(df_filtered)} cases where cleaning removed more than {threshold} characters.")
print(f"Saved to {output_csv}")

Saved 169 cases where cleaning removed more than 100 characters.
Saved to data/original_cleaned_mix_corpus/cleaning_difference_check_fr.csv


In [None]:
# Dutch articles
import pandas as pd
import os

input_csv = "data/original_cleaned_mix_corpus/original_cleaned_mix_nl_corpus.csv"  
output_csv = "data/original_cleaned_mix_corpus/cleaning_difference_check_nl.csv"
threshold = 100 

df = pd.read_csv(input_csv)

# lengths of articles (char_count)
df["original_length"] = df["article"].str.len()
df["cleaned_length"] = df["article_cleaned"].str.len()

# length difference between original and cleaned article
df["length_diff"] = df["original_length"] - df["cleaned_length"]

# filtering where length diff. bigger than threshold
df_filtered = df[df["length_diff"] > threshold][["id", "article", "article_cleaned", "original_length", "cleaned_length", "length_diff"]]

df_filtered.to_csv(output_csv, index=False)

print(f"Saved {len(df_filtered)} cases where cleaning removed more than {threshold} characters.")
print(f"Saved to {output_csv}")

Saved 181 cases where cleaning removed more than 100 characters.
Saved to data/original_cleaned_mix_corpus/cleaning_difference_check_nl.csv


## Checking the size of cleaned files

In [82]:
from datasets import load_from_disk
ds = load_from_disk("data/cleaned_corpus_ds/cleaned_corpus")
print(ds)
print(ds["fr"][0])

DatasetDict({
    fr: Dataset({
        features: ['id', 'reference', 'article'],
        num_rows: 22033
    })
    nl: Dataset({
        features: ['id', 'reference', 'article'],
        num_rows: 22033
    })
})
{'id': 1, 'reference': "Art. 1.1.1 Code Bruxellois de l'Air, du Climat et de la Maîtrise de l'Energie (Livre 1er, Titre 1er)", 'article': "Le présent Code règle une matière visée à l'article 39 de la Constitution."}


In [68]:
import pandas as pd

cor_fr = pd.read_csv('data/cleaned_corpus/corpus_fr_cleaned.csv')
cor_nl = pd.read_csv('data/cleaned_corpus/corpus_nl_cleaned.csv')

print(cor_fr.shape) 
print(cor_nl.shape)


(22033, 3)
(22033, 3)
