In [None]:
%pip install requests beautifulsoup4 pandas openpyxl
import json
import os
import random
import re
import unicodedata


In [None]:

# Paths: input corpus folder and where the generated dataset splits are written
DATA_DIR = "data_source"
OUTPUT_DIR = "."
TRAIN_FILE, VAL_FILE, TEST_FILE = (
    os.path.join(OUTPUT_DIR, split_name)
    for split_name in ("train.jsonl", "val.jsonl", "test.jsonl")
)

def load_texts(data_dir):
    """Load the contents of every ``.txt`` file found under *data_dir*.

    The tree is walked recursively. Directories and file names are visited
    in sorted order so the returned list is identical on every run and on
    every filesystem (``os.walk`` itself makes no ordering promise).

    Args:
        data_dir: Root folder to search.

    Returns:
        A list with one string per ``.txt`` file, files of a folder first,
        then its sub-folders. Empty when *data_dir* does not exist or holds
        no ``.txt`` files.
    """
    texts = []
    for root, dirs, files in os.walk(data_dir):
        # Sorting in place controls the order in which os.walk descends.
        dirs.sort()
        for name in sorted(files):
            if not name.endswith(".txt"):
                continue
            path = os.path.join(root, name)
            # Undecodable bytes are dropped instead of aborting the whole load.
            with open(path, "r", encoding="utf-8", errors="ignore") as infile:
                texts.append(infile.read())
    return texts

def clean_text(text):
    """Normalize raw Tshiluba text into a single clean line.

    Steps:
      1. Compose the text to Unicode NFC so that a base letter followed by a
         combining accent becomes one precomposed letter; otherwise the
         accent would be treated as a foreign character and split the word.
      2. Replace every character that is not a Latin letter (ASCII, or
         U+00C0-U+017E without the math signs U+00D7 and U+00F7), whitespace,
         or one of ``. , ; : ? ! '`` with a space.
      3. Collapse each whitespace run (line breaks included) to one space
         and trim both ends.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text; an empty string when nothing survives.
    """
    text = unicodedata.normalize("NFC", text)
    # keep letters & punctuation; the ranges skip × (U+00D7) and ÷ (U+00F7)
    text = re.sub(r"[^a-zA-ZÀ-ÖØ-öø-ž\s.,;:?!']", " ", text)
    # collapse spaces; this also turns line breaks into single spaces
    text = re.sub(r"\s+", " ", text)
    return text.strip()

def split_sentences(text):
    """Break *text* into sentences at runs of ``.``, ``!`` or ``?``.

    The terminating punctuation is not kept. Each fragment is trimmed, and
    fragments of three characters or fewer are discarded.
    """
    kept = []
    for fragment in re.split(r"[\.!\?]+", text):
        sentence = fragment.strip()
        if len(sentence) > 3:
            kept.append(sentence)
    return kept

In [None]:
# Step 1: Load and preprocess
all_texts = load_texts(DATA_DIR)
cleaned_sentences = [
    sentence
    for raw_text in all_texts
    for sentence in split_sentences(clean_text(raw_text))
]

print(f"Collected {len(cleaned_sentences)} sentences.")

# Step 2: Shuffle reproducibly
# Sorting first makes the starting order independent of the order in which
# the files were read; a dedicated seeded generator then yields the same
# permutation on every run without touching the global `random` state.
RANDOM_SEED = 42
cleaned_sentences.sort()
random.Random(RANDOM_SEED).shuffle(cleaned_sentences)

# Step 3: Split train/val/test (70/15/15)
# NOTE: sentences are not de-duplicated, so an identical sentence may land
# in more than one split.
n = len(cleaned_sentences)
train_split = int(0.7 * n)
val_split = int(0.85 * n)

train_data = cleaned_sentences[:train_split]
val_data = cleaned_sentences[train_split:val_split]
test_data = cleaned_sentences[val_split:]

# Step 4: Save JSONL
def save_jsonl(filename, sentences):
    """Write *sentences* to *filename* as JSON Lines, one record per line.

    Each record carries an ``"id"`` made of the file's base name, an
    underscore and the sentence index, plus the sentence under ``"text"``.
    Non-ASCII characters are written as-is (UTF-8). An existing file is
    overwritten.
    """
    prefix = os.path.basename(filename)
    with open(filename, "w", encoding="utf-8") as out:
        for index, sentence in enumerate(sentences):
            record = {"id": f"{prefix}_{index}", "text": sentence}
            out.write(json.dumps(record, ensure_ascii=False) + "\n")

# Write each split to its own JSONL file, in train / val / test order
for _path, _split in (
    (TRAIN_FILE, train_data),
    (VAL_FILE, val_data),
    (TEST_FILE, test_data),
):
    save_jsonl(_path, _split)

print(f"Saved {len(train_data)} train, {len(val_data)} val, {len(test_data)} test sentences.")
        