In [None]:
import re
import pandas as pd

# Load lexicon
# Only empty cells are parsed as missing. pandas' default NA markers ("NA",
# "nan", "null", ...) are kept as plain text so that lexicon entries spelled
# that way are not silently converted to NaN.
df = pd.read_csv(
    "bolinao_lexicon_final.csv",
    keep_default_na=False,
    na_values=[""],
)
# Membership set used to validate generated candidates; rows with a missing
# word are dropped so the set holds strings only.
VALID_WORDS = set(df["word"].dropna())

# Word-initial and word-final strings that near_affix() checks for.
AFFIX_PREFIXES = ("ma", "na", "ka", "pa")
AFFIX_SUFFIXES = ("an", "en")
# Characters treated as consonants when searching for consonant clusters.
# The last one is U+2019 (right single quotation mark), written as an escape
# so that a wrong file encoding cannot corrupt it into several characters.
# NOTE(review): if the lexicon also uses the ASCII apostrophe for the same
# sound, it should be added here -- confirm against the data.
CONSONANTS = "bcdfghjklmnpqrstvwxyz\u2019"


def restore_medial_pepet(word):
    """Return candidates with a vowel re-inserted into the first cluster.

    The first pair of adjacent characters from ``CONSONANTS`` is split by
    inserting ``"e"`` and, separately, ``"a"`` between them.

    Args:
        word: The surface form to expand (lower-case; matching is
            case-sensitive).

    Returns:
        A set with the two expanded forms, or an empty set when ``word``
        contains no consonant cluster (so the unchanged input is never
        reported as a candidate of itself).
    """
    # Escape the constant so it is always read as a literal character set.
    consonant_class = f"[{re.escape(CONSONANTS)}]"
    cluster = re.compile(f"({consonant_class})({consonant_class})")

    if cluster.search(word) is None:
        return set()

    # count=1: only the left-most cluster is expanded.
    return {
        cluster.sub(rf"\1{vowel}\2", word, count=1)
        for vowel in ("e", "a")
    }


def restore_glide_suffix(word):
    """Expand a word-final ``yan`` / ``wan`` into its longer spellings.

    A word ending in ``yan`` yields the stem plus ``iyan`` and ``ien``; a
    word ending in ``wan`` yields the stem plus ``owan`` and ``oen``.

    Args:
        word: The surface form to expand.

    Returns:
        A set of two candidate strings, or an empty set when ``word`` ends
        in neither ``yan`` nor ``wan``.
    """
    # Glide ending -> endings that replace it.
    expansions = {
        "yan": ("iyan", "ien"),
        "wan": ("owan", "oen"),
    }
    # A word can end in at most one of the keys, so the first hit is final.
    for glide_ending, replacements in expansions.items():
        if word.endswith(glide_ending):
            base = word[:-len(glide_ending)]
            return {base + ending for ending in replacements}
    return set()


def near_affix(word):
    """Tell whether ``word`` begins or ends like an affixed form.

    Args:
        word: The surface form to inspect.

    Returns:
        ``True`` when ``word`` starts with one of ``AFFIX_PREFIXES``, ends
        with one of ``AFFIX_SUFFIXES``, or ends with ``yan`` or ``wan``;
        ``False`` otherwise.
    """
    if word.startswith(AFFIX_PREFIXES):
        return True
    # The glide endings are tested explicitly so they keep matching even if
    # AFFIX_SUFFIXES is edited.
    return word.endswith(AFFIX_SUFFIXES) or word.endswith(("yan", "wan"))


def undo_vowel_reduction(word):
    """Collect every restoration candidate for ``word``.

    Args:
        word: The surface form to expand.

    Returns:
        The union of ``restore_medial_pepet(word)`` and
        ``restore_glide_suffix(word)`` when ``near_affix(word)`` is true;
        an empty set otherwise.
    """
    restored = set()
    if near_affix(word):
        restored.update(restore_medial_pepet(word))
        restored.update(restore_glide_suffix(word))
    return restored

# Generate restoration candidates for every lexicon word, record all of them,
# and keep the ones that are themselves lexicon entries as validated pairs.
all_generated = []
validated = []

# Column order of the two output tables. Declaring it explicitly keeps the
# header row in the CSVs even when a result list ends up empty.
all_generated_columns = ["word", "generated_candidate", "exists_in_lexicon"]
validated_columns = [
    "word",
    "root_candidate",
    "word_meaning",
    "root_meaning",
    "upos_word",
    "upos_root",
]

# (meaning, upos) of the first row recorded for each word. Built once so the
# loop below does not rescan the whole frame for every validated candidate.
first_entry = {}
for word, meaning, upos in zip(df["word"], df["meaning_english"], df["upos"]):
    if isinstance(word, str):
        first_entry.setdefault(word, (meaning, upos))

for word, meaning, upos in zip(df["word"], df["meaning_english"], df["upos"]):
    # A missing cell is read as a float NaN and has no string methods.
    if not isinstance(word, str):
        continue

    # Sorted so that the row order of the output files is reproducible; set
    # iteration order for strings differs between interpreter runs.
    for candidate in sorted(undo_vowel_reduction(word)):
        in_lexicon = candidate in VALID_WORDS

        all_generated.append({
            "word": word,
            "generated_candidate": candidate,
            "exists_in_lexicon": in_lexicon
        })

        if in_lexicon and candidate != word:
            root_meaning, root_upos = first_entry[candidate]
            validated.append({
                "word": word,
                "root_candidate": candidate,
                "word_meaning": meaning,
                "root_meaning": root_meaning,
                "upos_word": upos,
                "upos_root": root_upos
            })

df_all = pd.DataFrame(all_generated, columns=all_generated_columns)
df_validated = pd.DataFrame(validated, columns=validated_columns)

df_all.to_csv("bolinao_vowel_reduction_all_generated.csv", index=False)
df_validated.to_csv("bolinao_vowel_reduction_validated.csv", index=False)

print("Saved outputs:")
print("- bolinao_vowel_reduction_all_generated.csv")
print("- bolinao_vowel_reduction_validated.csv")

Saved outputs:
- bolinao_vowel_reduction_all_generated.csv
- bolinao_vowel_reduction_validated.csv
