## Dutch article cleaning

In [None]:
import re
import os

# Header tokens that may precede the real article text. The list mixes Dutch
# and French tokens, exactly as in the original cleaning rules.
_NL_HEADER_TOKENS = (
    "Art",
    "Artikel",
    "ANNEXE",
    "DROIT FUTUR",
    "Antérieurement",
    "Voir note sous TITRE",
    "BIJLAGE",
    "Inbreuk op artikel",
    "Voorheen",
)

# Longest token first so the regex prefers "Artikel" over its prefix "Art".
_NL_TOKEN_ALTERNATION = "|".join(
    re.escape(token)
    for token in sorted(_NL_HEADER_TOKENS, key=len, reverse=True)
)

# One header token at the very start of the text, matched as a whole word
# (not followed by another letter), plus an optional dot and whitespace.
_NL_LEADING_TOKEN_RE = re.compile(
    r"^(?:" + _NL_TOKEN_ALTERNATION + r")(?![^\W\d_])\.?\s*",
    re.IGNORECASE,
)

# A header token anywhere in the text, matched as a whole word only, so that
# "art" inside "partijen" or "kwartaal" is not mistaken for the token "Art".
_NL_TOKEN_WORD_RE = re.compile(
    r"(?<![^\W\d_])(?:" + _NL_TOKEN_ALTERNATION + r")(?![^\W\d_])",
    re.IGNORECASE,
)

# Leading article number followed by punctuation / whitespace.
_NL_LEADING_NUMBER_RE = re.compile(r"^\d+[^\w]*\s*")

# How many characters before/after a capital are inspected for header tokens.
_NL_WINDOW_RADIUS = 10


def clean_article_start(text):
    """Strip the heading prefix of a Dutch article and return its body.

    The cleaning runs in three steps:

    1. Remove one leading header token (e.g. ``Art.``, ``Artikel``).
    2. Remove a leading number together with the punctuation after it.
    3. Return the text starting at the first capital letter that
       (a) has no whole-word header token lying inside the window of
       10 characters on either side of it, and
       (b) is followed by a letter, whitespace, ``*`` or ``'``.

    If no capital letter qualifies, the text left after steps 1-2 is
    returned.

    Args:
        text: Raw article text. Non-string values (e.g. ``NaN`` for a
            missing cell when used with ``Series.apply``) are returned
            unchanged instead of raising ``TypeError``.

    Returns:
        The cleaned, whitespace-stripped article text, or ``text`` itself
        when it is not a string.
    """
    if not isinstance(text, str):
        return text

    # Step 1 and 2: drop the leading token, then the leading number.
    text = _NL_LEADING_TOKEN_RE.sub("", text, count=1)
    text = _NL_LEADING_NUMBER_RE.sub("", text, count=1)

    # Locate whole-word header tokens once instead of re-scanning per capital.
    token_spans = [found.span() for found in _NL_TOKEN_WORD_RE.finditer(text)]

    # Step 3: look for the first capital letter that starts the real text.
    for start_index, char in enumerate(text):
        # str.isupper() also accepts accented capitals such as "É".
        if not char.isupper():
            continue

        window_start = max(0, start_index - _NL_WINDOW_RADIUS)
        window_end = start_index + _NL_WINDOW_RADIUS

        # Skip capitals that sit in or next to a header token.
        if any(
            window_start <= token_start and token_end <= window_end
            for token_start, token_end in token_spans
        ):
            continue

        # The character right after the capital must look like running
        # text; this rejects enumerators such as "A." or "B)". A capital
        # that is the last character of the text is rejected as well.
        following = text[start_index + 1:start_index + 2]
        if not following:
            continue
        if not (following.isalpha() or following.isspace() or following in "*'"):
            continue

        return text[start_index:].strip()

    # Nothing qualified: return what is left after prefix removal.
    return text.strip()


# json is needed for the id list below; imported here so this cell does not
# depend on an earlier cell having imported it.
import json

# Load the ids of the articles that must be left out of the corpus.
with open("long_article_ids.json", "r", encoding="utf-8") as f:
    long_article_ids = json.load(f)

# Drop the listed articles. The explicit copy makes the filtered frame
# independent of the original, so adding a column below does not trigger
# pandas' SettingWithCopyWarning.
df_corpus_nl = df_corpus_nl[~df_corpus_nl["id"].isin(long_article_ids)].copy()

# apply cleaning
df_corpus_nl["article_cleaned"] = df_corpus_nl["article"].apply(clean_article_start)

# making two directories for cleaned corpus and mixed corpus for comparison
os.makedirs("data/original_cleaned_mix_corpus", exist_ok=True)
os.makedirs("data/cleaned_corpus", exist_ok=True)

# Mixed file: every column, including both "article" and "article_cleaned".
df_corpus_nl.to_csv("data/original_cleaned_mix_corpus/original_cleaned_mix_nl_corpus.csv", index=False)

# Cleaned file: the cleaned text is written under the column name "article"
# next to "id" and "reference".
df_corpus_nl_original_format = df_corpus_nl[["id", "reference", "article_cleaned"]].rename(columns={"article_cleaned": "article"})
df_corpus_nl_original_format.to_csv("data/cleaned_corpus/corpus_nl_cleaned.csv", index=False)

## French article cleaning

In [None]:
import re
import os

# Header tokens that may precede the real article text in the French corpus.
_FR_HEADER_TOKENS = (
    "Art",
    "Article",
    "ANNEXE",
    "DROIT FUTUR",
    "Antérieurement",
    "Voir note sous TITRE",
    "ancien article",
    "Infraction à l'article",
)

# Longest token first so the regex prefers "Article" over its prefix "Art".
_FR_TOKEN_ALTERNATION = "|".join(
    re.escape(token)
    for token in sorted(_FR_HEADER_TOKENS, key=len, reverse=True)
)

# Bracketed back-references such as "(ancien article 12)" anywhere in the text.
_FR_BRACKETED_REFERENCE_RE = re.compile(
    r"[\(\[]\s*(ancien article|ancien art|erronément intitulé art\.?)\s*\d+[^\]\)]*[\)\]]",
    re.IGNORECASE,
)

# One header token at the very start of the text, matched as a whole word
# (not followed by another letter), plus an optional dot and whitespace.
_FR_LEADING_TOKEN_RE = re.compile(
    r"^(?:" + _FR_TOKEN_ALTERNATION + r")(?![^\W\d_])\.?\s*",
    re.IGNORECASE,
)

# A header token anywhere in the text, matched as a whole word only, so that
# "art" inside "parties" or "département" is not mistaken for the token "Art".
_FR_TOKEN_WORD_RE = re.compile(
    r"(?<![^\W\d_])(?:" + _FR_TOKEN_ALTERNATION + r")(?![^\W\d_])",
    re.IGNORECASE,
)

# Leading article number followed by punctuation / whitespace.
_FR_LEADING_NUMBER_RE = re.compile(r"^\d+[^\w]*\s*")

# How many characters before/after a capital are inspected for header tokens.
_FR_WINDOW_RADIUS = 10


def clean_article_start(text):
    """Strip the heading prefix of a French article and return its body.

    The cleaning runs in four steps:

    1. Replace bracketed references like ``(ancien article 12)`` by a space.
    2. Remove one leading header token (e.g. ``Art.``, ``Article``).
    3. Remove a leading number together with the punctuation after it.
    4. Return the text starting at the first capital letter that
       (a) has no whole-word header token lying inside the window of
       10 characters on either side of it, and
       (b) is followed by a letter (accented letters included),
       whitespace, ``*`` or ``'``.

    If no capital letter qualifies, the text left after steps 1-3 is
    returned.

    Args:
        text: Raw article text. Non-string values (e.g. ``NaN`` for a
            missing cell when used with ``Series.apply``) are returned
            unchanged instead of raising ``TypeError``.

    Returns:
        The cleaned, whitespace-stripped article text, or ``text`` itself
        when it is not a string.
    """
    if not isinstance(text, str):
        return text

    # Steps 1-3: bracketed references, leading token, leading number.
    text = _FR_BRACKETED_REFERENCE_RE.sub(" ", text)
    text = _FR_LEADING_TOKEN_RE.sub("", text, count=1)
    text = _FR_LEADING_NUMBER_RE.sub("", text, count=1)

    # Locate whole-word header tokens once instead of re-scanning per capital.
    token_spans = [found.span() for found in _FR_TOKEN_WORD_RE.finditer(text)]

    # Step 4: look for the first capital letter that starts the real text.
    for start_index, char in enumerate(text):
        # str.isupper() also accepts accented capitals such as "É" or "À".
        if not char.isupper():
            continue

        window_start = max(0, start_index - _FR_WINDOW_RADIUS)
        window_end = start_index + _FR_WINDOW_RADIUS

        # Skip capitals that sit in or next to a header token.
        if any(
            window_start <= token_start and token_end <= window_end
            for token_start, token_end in token_spans
        ):
            continue

        # The character right after the capital must look like running
        # text; this rejects enumerators such as "A." or "B)". A capital
        # that is the last character of the text is rejected as well.
        following = text[start_index + 1:start_index + 2]
        if not following:
            continue
        if not (following.isalpha() or following.isspace() or following in "*'"):
            continue

        return text[start_index:].strip()

    # Nothing qualified: return what is left after prefix removal.
    return text.strip()

# json is needed for the id list below; imported here so this cell does not
# depend on an earlier cell having imported it.
import json

# Load the ids of the articles that must be left out of the corpus.
with open("long_article_ids.json", "r", encoding="utf-8") as f:
    long_article_ids = json.load(f)

# Drop the listed articles. The explicit copy makes the filtered frame
# independent of the original, so adding a column below does not trigger
# pandas' SettingWithCopyWarning.
df_corpus_fr = df_corpus_fr[~df_corpus_fr["id"].isin(long_article_ids)].copy()
# apply cleaning
df_corpus_fr["article_cleaned"] = df_corpus_fr["article"].apply(clean_article_start)

# making two directories for cleaned corpus and mixed corpus for comparison
os.makedirs("data/original_cleaned_mix_corpus", exist_ok=True)
os.makedirs("data/cleaned_corpus", exist_ok=True)

# Mixed file: every column, including both "article" and "article_cleaned".
df_corpus_fr.to_csv("data/original_cleaned_mix_corpus/original_cleaned_mix_fr_corpus.csv", index=False)

# Cleaned file: the cleaned text is written under the column name "article"
# next to "id" and "reference".
df_corpus_fr_original_format = df_corpus_fr[["id", "reference", "article_cleaned"]].rename(columns={"article_cleaned": "article"})
df_corpus_fr_original_format.to_csv("data/cleaned_corpus/corpus_fr_cleaned.csv", index=False)