### Imports

In [4]:
import re
from pathlib import Path
from tqdm import tqdm

### Directories

In [5]:
raw_dir = Path("../data/raw")
cleaned_dir = Path("../data/cleaned")
cleaned_dir.mkdir(parents=True, exist_ok=True)

### Text Cleaning

In [6]:
# iterate through each raw text file in ../data/raw
for file_path in tqdm(sorted(raw_dir.glob("*_raw.txt")), desc="Cleaning Text Files"):
    lang = file_path.stem.replace("_raw", "")
    cleaned_path = cleaned_dir / f"{lang}_cleaned.txt"

    # read raw text
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()

    # step 1: convert all to lowercase
    text = text.lower()

    # step 2: remove numbers
    # note: we can do this naively and not worry about whether this removes numbers other 
    # than verse numbers, such as numbers incorporated in the verses itself because
    # the main goal is to clean the text for language similarity analysis wherein
    # we will use ngrams and such, so we don't want numbers at all
    text = re.sub(r"\d+", " ", text)

    # step 3: remove punctuation and special characters
    text = re.sub(r"[^a-z\s]", " ", text)

    # step 4: normalize whitespace
    # note: in the processing of the raw text, we already do this but just to be sure
    # we run it again here
    text = re.sub(r"\s+", " ", text).strip()

    # write the cleaned file to ../data/cleaned
    with open(cleaned_path, "w", encoding="utf-8") as f:
        f.write(text)

    # summary
    word_count = len(text.split())
    print(f"{lang:<12} → {word_count:>8,} cleaned words")

print("\nCleaning complete. Files saved in '../data/cleaned/'.")

Cleaning Text Files:  19%|█▉        | 3/16 [00:00<00:01, 11.71it/s]

adasen       →   93,399 cleaned words
bikolano     →   62,269 cleaned words
cebuano      →   67,364 cleaned words
chavacano    →   94,664 cleaned words


Cleaning Text Files:  44%|████▍     | 7/16 [00:00<00:00, 13.69it/s]

english      →   64,130 cleaned words
ilokano      →   57,114 cleaned words
ilonggo      →   73,302 cleaned words
kinaray-a    →   76,518 cleaned words


Cleaning Text Files:  69%|██████▉   | 11/16 [00:00<00:00, 15.20it/s]

masbatenyo   →   70,459 cleaned words
paranan      →   79,391 cleaned words
romblomanon  →   74,130 cleaned words
spanish      →   63,962 cleaned words


Cleaning Text Files:  94%|█████████▍| 15/16 [00:01<00:00, 15.17it/s]

tagalog      →   66,053 cleaned words
tausug       →   93,645 cleaned words
waray        →   70,613 cleaned words
yami         →   83,153 cleaned words


Cleaning Text Files: 100%|██████████| 16/16 [00:01<00:00, 14.28it/s]


Cleaning complete. Files saved in '../data/cleaned/'.



