### Imports

In [1]:
import re
from pathlib import Path
from tqdm import tqdm

### Directories

In [None]:
# Input directory holding the "*_raw.txt" files, and output directory for
# their cleaned counterparts (both relative to the working directory).
raw_dir = Path("../data/raw")
cleaned_dir = Path("../data/cleaned")

# Create the output directory (plus any missing parents) up front; this is a
# no-op when the directory already exists.
cleaned_dir.mkdir(exist_ok=True, parents=True)

### Text Cleaning

In [None]:
# Clean every "<lang>_raw.txt" file in raw_dir and store the result in
# cleaned_dir as "<lang>_cleaned.txt".

# The patterns are compiled once here and reused for every file.
_DIGIT_RUNS = re.compile(r"\d+")
_NON_LETTERS = re.compile(r"[^a-z\s]")
_WHITESPACE_RUNS = re.compile(r"\s+")

# Sorted so the files are processed (and reported) in a stable order.
raw_files = sorted(raw_dir.glob("*_raw.txt"))

for raw_path in tqdm(raw_files, desc="Cleaning Text Files"):
    # The language label is the file stem with "_raw" removed.
    lang = raw_path.stem.replace("_raw", "")
    out_path = cleaned_dir / f"{lang}_cleaned.txt"

    # Step 1: load the raw text and lowercase it.
    text = raw_path.read_text(encoding="utf-8").lower()

    # Step 2: drop every run of digits.
    # This is deliberately naive: it removes verse numbers as well as any
    # numbers that appear inside the verses themselves. That is acceptable
    # because the cleaned text feeds an n-gram based language similarity
    # analysis, where digits are not wanted at all.
    text = _DIGIT_RUNS.sub(" ", text)

    # Step 3: replace punctuation and special characters with spaces.
    # NOTE(review): only ASCII a-z and whitespace survive this step, so
    # accented or non-Latin letters are also turned into spaces -- confirm
    # this is intended for every language in the corpus.
    text = _NON_LETTERS.sub(" ", text)

    # Step 4: collapse whitespace runs to a single space and trim the ends.
    # The raw-text processing already normalizes whitespace; it is repeated
    # here because steps 2 and 3 insert new spaces.
    text = _WHITESPACE_RUNS.sub(" ", text).strip()

    # Persist the cleaned text to ../data/cleaned.
    out_path.write_text(text, encoding="utf-8")

    # Per-language summary line.
    word_count = len(text.split())
    print(f"{lang:<12} → {word_count:>8,} cleaned words")

print("\nCleaning complete. Files saved in '../data/cleaned/'.")

NameError: name 'raw_dir' is not defined