In [None]:
# Print the path of the Python interpreter running this kernel, so it is easy
# to confirm the notebook is using the intended environment.
import sys
print(sys.executable)

In [None]:
from datasets import load_dataset
import nltk
from nltk.tokenize import sent_tokenize
from pathlib import Path
from tqdm import tqdm
import json, re

from helpers.filtering import looks_like_clean_french, remove_invisible_chars

In [None]:
# Download the NLTK "punkt_tab" resource (sentence-tokenizer data; sent_tokenize
# is called with language="french" further down in this notebook).
nltk.download("punkt_tab")

# Destination JSONL file: the extraction cell writes one {"text": ...} object
# per line to this path (relative to the notebook's working directory).
out_file = Path("fr_data.jsonl")

In [None]:
# Open the "20231101.fr" configuration of wikimedia/wikipedia (train split).
# streaming=True asks the datasets library to yield examples while iterating
# instead of materialising the whole split first.
print("Loading French Wikipedia dataset...")
dataset = load_dataset("wikimedia/wikipedia", "20231101.fr", split="train", streaming=True)

In [None]:
# Extract clean French sentences from the streamed Wikipedia articles and write
# them to `out_file` as JSON Lines, one {"text": <lower-cased sentence>} per line.
#
# Reads:  dataset (iterable of dicts with a "text" field), out_file (Path)
# Writes: out_file (overwritten on every run); n_sentences / n_articles / n_words

min_words = 3  # minimum number of whitespace-separated words per kept sentence
n_sentences = 0  # sentences written so far
n_articles = 0  # articles that contributed at least one written sentence
n_words = 0  # total words written, summed over kept sentences

# Only TARGET_SENTENCES is enforced by the loop below. TARGET_ARTICLES and
# TARGET_WORDS are kept as tunables but do not currently stop the extraction.
TARGET_ARTICLES = 10 # 20_000
TARGET_SENTENCES = 100_000
TARGET_WORDS = 10_000

with out_file.open("w", encoding="utf-8") as f_out:
    for example in tqdm(dataset, desc="Processing articles"):

        # Stop pulling articles from the stream once enough sentences are saved.
        if n_sentences >= TARGET_SENTENCES:
            break

        text = example["text"]
        if not text:
            continue

        # Flipped to False after the first sentence of this article is written,
        # so each article is counted at most once in n_articles.
        article_not_seen = True

        sentences = sent_tokenize(text, language="french")

        for sent in sentences:
            if n_sentences >= TARGET_SENTENCES:
                break

            sent = sent.strip()
            # Runs of two or more dashes are replaced by a single space.
            sent = re.sub(r'-{2,}', ' ', sent)
            sent = remove_invisible_chars(sent)

            # Collapse whitespace runs (including newlines) to one space, then
            # strip again: the dash replacement above can introduce a leading
            # or trailing space that the first strip() could not remove.
            sent = re.sub(r"\s+", " ", sent).strip()

            if not sent:
                continue

            word_count = len(sent.split())
            if word_count < min_words:
                continue

            # Project-specific French quality filter (helpers.filtering).
            if not looks_like_clean_french(sent):
                continue

            # Lower-case for storage. Newlines were already collapsed by the
            # whitespace normalisation above, so no further replacement is needed.
            sent = sent.lower()

            # ensure_ascii=False keeps accented characters readable in the file.
            line = json.dumps({"text": sent}, ensure_ascii=False)
            f_out.write(line + "\n")

            n_words += word_count
            n_sentences += 1
            if article_not_seen:
                n_articles += 1
                article_not_seen = False

print(f"Done. Saved {n_articles} articles, {n_words} words, and {n_sentences} sentences to: {out_file}")


![image-2.png](attachment:image-2.png)

In [None]:
from pathlib import Path
import random

# Randomly split the sentence file into train / validation / test JSONL files.
#
# Each non-empty input line is assigned independently: one uniform draw per
# line decides the split, so the resulting sizes are approximately (not
# exactly) proportional to the ratios below. Output files are overwritten.

input_path = Path("fr_data.jsonl")        # your big file
out_dir = Path("../../MAFT/fr")

train_path = out_dir / "train.jsonl"
val_path   = out_dir / "validation.jsonl"
test_path  = out_dir / "test.jsonl"

# ratios: change them if you want
train_ratio = 0.8
val_ratio   = 0.1
test_ratio  = 0.1

# The test split receives whatever is left after train and validation, so the
# three ratios must sum to 1 for test_ratio to actually be honoured.
if abs(train_ratio + val_ratio + test_ratio - 1.0) > 1e-9:
    raise ValueError(
        "train_ratio + val_ratio + test_ratio must sum to 1, got "
        f"{train_ratio + val_ratio + test_ratio}"
    )

# Create the output directory up front: opening the split files for writing
# raises FileNotFoundError when the directory does not exist yet.
out_dir.mkdir(parents=True, exist_ok=True)

random.seed(42)  # for reproducibility

n_train = n_val = n_test = 0

# Upper bound of the validation band in [0, 1); hoisted out of the loop.
val_cutoff = train_ratio + val_ratio

with input_path.open("r", encoding="utf-8") as fin, \
     train_path.open("w", encoding="utf-8") as f_train, \
     val_path.open("w", encoding="utf-8") as f_val, \
     test_path.open("w", encoding="utf-8") as f_test:

    for line in fin:
        line = line.strip()
        if not line:
            # Blank lines are skipped and do not consume a random draw.
            continue

        r = random.random()
        if r < train_ratio:
            f_train.write(line + "\n")
            n_train += 1

        elif r < val_cutoff:
            f_val.write(line + "\n")
            n_val += 1

        else:
            f_test.write(line + "\n")
            n_test += 1

print("Done.")
print("train:", n_train)
print("validation:", n_val)
print("test:", n_test)

![image-3.png](attachment:image-3.png)