# (Root → Surface) — Bolinao pronoun changes

This notebook implements the **forward** direction (underlying/root → surface) of the constrained morphophonemic analysis from the pronoun analyzer notebook:

- **Input**: an underlying/root form (e.g., `iti`, `sai`, `ko`), or an **explicit token sequence** (e.g., `mo + ya`, `koN + ko`).
- **Output**: one or more **candidate surface forms** with an explicit **rule trace** and weights.

Scope / constraints (matches the analyzer philosophy):

- This is a **constrained surface generator**, not a full phonology engine.
- Lexicalized / honorific contractions (e.g., `mo + ya → ma`) are treated as **stored outcomes**.
- Some transformations are **optional / semi-reversible**, so we may output multiple candidates (including an identity fallback).

In [None]:
import pandas as pd
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Sequence, Tuple


def _norm(s: str) -> str:
    """Return `s` stripped of surrounding whitespace and lower-cased ("" for falsy input)."""
    if not s:
        return ""
    trimmed = s.strip()
    return trimmed.lower()


# Optional: load the lexicon for quick lookup of generated candidates.
# LEXICON_PATH is a relative path to the lexicon CSV.
LEXICON_PATH = "Bolinao Lexicon - bolinao_lexicon_final.csv"
try:
    bolinao_final = pd.read_csv(LEXICON_PATH)
except Exception:
    # Best-effort load: any failure to read the CSV leaves `bolinao_final` as None,
    # and the code that consults the lexicon checks for None before using it.
    bolinao_final = None

In [None]:
@dataclass(frozen=True)
class Candidate:
    """
    One generated surface form together with the rule that produced it.

    Instances are frozen (immutable). The generator builds them internally and
    converts them to plain dicts before returning.
    """

    # Generated surface form.
    surface: str
    # Underlying token(s) the surface was produced from, e.g. ("mo", "ya") or ("iti",).
    underlying_tokens: Tuple[str, ...]
    # POS tag the candidate was generated under.
    pos: str
    # Identifier of the rule that fired, e.g. "LEX_FALLBACK_identity".
    rule: str
    # Rule category; the generator uses "lexicalized" and "morphophonemic".
    rule_type: str
    # Reversibility label; the generator uses "non-reversible" and "semi-reversible".
    reversibility: str
    # Ranking score; the generator sorts candidates by descending weight.
    weight: float
    # Optional free-text explanation of the rule.
    notes: str = ""


def _parse_underlying(user_input: str) -> Tuple[str, ...]:
    """
    Turn user input into a tuple of underlying tokens.

    The input is first normalized with `_norm`. If it contains a "+", it is split
    on "+" and the stripped, non-blank pieces are returned (e.g. "mo + ya" ->
    ("mo", "ya")). Otherwise the normalized string is returned as a 1-tuple
    (e.g. "iti" -> ("iti",)).
    """
    normalized = _norm(user_input)
    if "+" not in normalized:
        return (normalized,)
    pieces = (piece.strip() for piece in normalized.split("+"))
    return tuple(piece for piece in pieces if piece)


# -------------------------------------------------------------------
# Forward (underlying/root → surface) rule tables
# These are the forward counterparts of the rules in the pronoun analyzer notebook.
# Each table is expanded from compact seed data; insertion order follows the seeds.
# -------------------------------------------------------------------

# Lexicalized/honorific contractions: (token, token) -> (surface, weight, note).
_LEXICALIZED_FORWARD: Dict[Tuple[str, ...], Tuple[str, float, str]] = {
    (first, second): (
        contracted,
        0.98,
        f"Lexicalized/honorific contraction: {first} + {second} → {contracted} (stored outcome).",
    )
    for first, second, contracted in (
        ("mo", "ya", "ma"),
        ("ko", "ka", "ta"),
        ("ko", "ya", "kwa"),
        ("mo", "ko", "nako"),
    )
}

# Demonstrative clitic/reduction surfaces (inverse of demo_map in analyzer):
# root -> list of (surface, weight, note).
_DPR_UNDERLYING_TO_SURFACES: Dict[str, List[Tuple[str, float, str]]] = {
    root: [
        (variant, 0.60, "DPr clitic/reduction variant (semi-reversible).")
        for variant in variants
    ]
    for root, variants in (
        ("iti", ("moyti", "modti")),
        ("in", ("moin",)),
        ("isen", ("modsen",)),
        ("taw", ("moytaw",)),
        ("itaw", ("modtaw",)),
    )
}

# koN-/ikon- supportive surfaces (inverse of kon_underlying in analyzer):
# (prefix, pronoun) -> (surface, weight, note).
# Every ikon- surface in this table is the matching kon- surface with a leading "i".
_KON_UNDERLYING_TO_SURFACE: Dict[Tuple[str, ...], Tuple[str, float, str]] = {
    (prefix, pronoun): (lead + kon_surface, 0.85, note)
    for pronoun, kon_surface in (
        ("ko", "kongko"),
        ("ta", "konta"),
        ("tamo", "kontamo"),
        ("mi", "komi"),
        ("mo", "komo"),
        ("moyo", "komoyo"),
        ("na", "kona"),
        ("ra", "konra"),
    )
    for prefix, lead, note in (
        ("kon", "", "koN-/kon- supportive pronoun outcome (semi-reversible)."),
        ("ikon", "i", "ikon- supportive pronoun outcome (semi-reversible)."),
    )
}


def generate_surface_candidates(
    underlying_input: str,
    pos: Optional[str] = None,
    *,
    allow_identity: bool = True,
    allow_intrg_an_prefix: bool = True,
    allow_dpr_variants: bool = True,
    allow_dpr_linker_suffix: bool = False,
    allow_productive_kon_prefixing: bool = False,
    max_results: int = 25,
    _pos_forced: bool = False,
) -> List[Dict[str, Any]]:
    """
    Forward generator that mirrors the rule inventory used in the pronoun analyzer notebook.

    Args:
        underlying_input: A single root ("iti") or an explicit token sequence
            ("mo + ya"). Parsed with `_parse_underlying`, so case and surrounding
            whitespace are ignored.
        pos: A POS tag ("intrg pron", "DPr", "indfpro", "expl"; matched
            case-insensitively) or None/""/"auto"/"any"/"all" to generate across
            all four tags and merge the results. Unrecognized tags are passed
            through unchanged and only trigger the POS-independent rules.
        allow_identity: Include the identity fallback (underlying kept as surface).
        allow_intrg_an_prefix: For "intrg pron", add the optional an- prefixed form.
        allow_dpr_variants: For "DPr", add the stored reduced/cliticized variants.
        allow_dpr_linker_suffix: For "DPr" (only when `allow_dpr_variants` is also
            set), additionally add base + "y" and base + "n".
        allow_productive_kon_prefixing: For a single root, also try the stored
            kon-/ikon- outcomes for that root (at a weight capped at 0.50).
        max_results: Maximum number of candidates returned. Negative values are
            treated as 0; None means no limit.
        _pos_forced: Internal flag set by the auto-POS fan-out; suppresses the
            identity fallback for "intrg pron" and "DPr" so it is emitted only by
            the "indfpro"/"expl" runs.

    Returns:
        A list of dicts with keys "surface", "underlying", "pos", "rule",
        "rule_type", "reversibility", "weight", "notes", sorted by descending
        weight. Empty when the input contains no usable token (e.g. "" or "+").
    """
    # A negative slice bound would drop the *lowest-weight tail* instead of limiting,
    # so clamp it; None keeps plain "no limit" slicing.
    limit = None if max_results is None else max(0, max_results)

    pos_in = (pos or "").strip()
    # Case- and spacing-insensitive key used only for matching; `pos_in` is kept for
    # pass-through of unrecognized tags.
    pos_key = " ".join(pos_in.lower().split())

    if pos_key in ("", "auto", "any", "all"):
        # Generate across the POS values used in the analyzer notebook.
        combined: List[Dict[str, Any]] = []
        for tag in ("intrg pron", "DPr", "indfpro", "expl"):
            combined.extend(
                generate_surface_candidates(
                    underlying_input,
                    tag,
                    allow_identity=allow_identity,
                    allow_intrg_an_prefix=allow_intrg_an_prefix,
                    allow_dpr_variants=allow_dpr_variants,
                    allow_dpr_linker_suffix=allow_dpr_linker_suffix,
                    allow_productive_kon_prefixing=allow_productive_kon_prefixing,
                    max_results=max_results,
                    _pos_forced=True,
                )
            )
        # Deduplicate across POS runs (surface+underlying+rule); keep best weight.
        best: Dict[Tuple[str, str, str], Dict[str, Any]] = {}
        for d in combined:
            k = (d["surface"], d["underlying"], d["rule"])
            if k not in best or d["weight"] > best[k]["weight"]:
                best[k] = d
        merged = sorted(best.values(), key=lambda d: d["weight"], reverse=True)
        return merged[:limit]

    # POS-specific generation below. Map the tag onto its canonical spelling so that
    # e.g. "dpr" or "Intrg Pron" enable the same rules as "DPr" / "intrg pron".
    canonical_tags = {
        "intrg pron": "intrg pron",
        "dpr": "DPr",
        "indfpro": "indfpro",
        "expl": "expl",
    }
    pos = canonical_tags.get(pos_key, pos_in)

    underlying_tokens = _parse_underlying(underlying_input)
    # Separator-only input ("+") parses to () and blank input to ("",); neither has
    # anything to derive from, so return no candidates instead of failing or
    # emitting junk such as a bare "an".
    if not underlying_tokens or not all(underlying_tokens):
        return []

    candidates: List[Candidate] = []

    def add(
        surface: str,
        rule: str,
        rule_type: str,
        reversibility: str,
        weight: float,
        notes: str,
        tokens: Optional[Tuple[str, ...]] = None,
    ) -> None:
        # Record one candidate for the current POS; `tokens` overrides the underlying
        # sequence when a rule supplies its own (productive kon- prefixing).
        candidates.append(
            Candidate(
                surface=surface,
                underlying_tokens=underlying_tokens if tokens is None else tokens,
                pos=pos,
                rule=rule,
                rule_type=rule_type,
                reversibility=reversibility,
                weight=weight,
                notes=notes,
            )
        )

    # 1) Lexicalized/honorific contractions: only fire when the underlying is explicitly a sequence.
    lexicalized = _LEXICALIZED_FORWARD.get(underlying_tokens)
    if lexicalized is not None:
        surface, w, note = lexicalized
        add(surface, f"LEX_CONTRACTION_{surface}", "lexicalized", "non-reversible", w, note)

    # 2) koN-/ikon- supportive forms: can fire on explicit (koN + X)/(ikon + X).
    supportive = _KON_UNDERLYING_TO_SURFACE.get(underlying_tokens)
    if supportive is not None:
        surface, w, note = supportive
        add(surface, f"KON_supportive_{surface}", "morphophonemic", "semi-reversible", w, note)

    # 3) POS-specific forward generation when input is a single root token.
    if len(underlying_tokens) == 1:
        base = underlying_tokens[0]

        # Interrogatives: optional an- prefix (forward counterpart of INTRG_an_prefix_optional_add).
        if allow_intrg_an_prefix and pos == "intrg pron" and not base.startswith("an"):
            add(
                "an" + base,
                "INTRG_an_prefix_optional_add",
                "morphophonemic",
                "semi-reversible",
                0.55,
                "Optional/historical an- prefix (semi-reversible).",
            )

        # Demonstratives: reduced/cliticized variants (inverse of DPR_reduction_*).
        if allow_dpr_variants and pos == "DPr":
            for surface, w, note in _DPR_UNDERLYING_TO_SURFACES.get(base, []):
                add(surface, f"DPR_reduction_{surface}", "morphophonemic", "semi-reversible", w, note)

            # Conservative linker suffix generation (forward counterpart to DPR_linker_y/n).
            if allow_dpr_linker_suffix and len(base) > 1:
                add(
                    base + "y",
                    "DPR_linker_y",
                    "morphophonemic",
                    "semi-reversible",
                    0.35,
                    "Conservative: treat final -y as linker/clitic.",
                )
                add(
                    base + "n",
                    "DPR_linker_n",
                    "morphophonemic",
                    "semi-reversible",
                    0.35,
                    "Conservative: treat final -n as linker/clitic.",
                )

        # Optional: productive koN-/ikon- prefixing from a basic pronoun token.
        if allow_productive_kon_prefixing:
            for pref in ("kon", "ikon"):
                prefixed = _KON_UNDERLYING_TO_SURFACE.get((pref, base))
                if prefixed is None:
                    continue
                surface, w, note = prefixed
                add(
                    surface,
                    f"KON_supportive_{surface}",
                    "morphophonemic",
                    "semi-reversible",
                    # Capped: the prefix is assumed here, not given by the user.
                    min(0.50, w),
                    note + " (generated assuming optional supportive prefixing)",
                    tokens=(pref, base),
                )

    # 4) Identity candidate (always safe; does not claim derivation).
    # In the auto-POS fan-out only the "indfpro"/"expl" runs emit it; the caller's
    # dedupe then collapses those two into one entry.
    if allow_identity and (not _pos_forced or pos in ("indfpro", "expl")):
        add(
            " + ".join(underlying_tokens),
            "LEX_FALLBACK_identity",
            "lexicalized",
            "non-reversible",
            0.10,
            "Identity fallback: keep underlying as surface (no derivation claim).",
        )

    # Deduplicate by (surface, underlying_tokens, rule), keeping the first occurrence.
    seen = set()
    uniq: List[Candidate] = []
    for c in candidates:
        key = (c.surface, c.underlying_tokens, c.rule)
        if key not in seen:
            seen.add(key)
            uniq.append(c)

    # Stable sort: candidates with equal weight keep rule order.
    uniq.sort(key=lambda c: c.weight, reverse=True)

    return [
        {
            "surface": c.surface,
            "underlying": " + ".join(c.underlying_tokens),
            "pos": c.pos,
            "rule": c.rule,
            "rule_type": c.rule_type,
            "reversibility": c.reversibility,
            "weight": float(c.weight),
            "notes": c.notes,
        }
        for c in uniq[:limit]
    ]


def lookup_in_lexicon(surface_forms: Sequence[str]) -> Optional[pd.DataFrame]:
    """
    Return the lexicon rows whose "word" matches any of `surface_forms`.

    Matching is done on normalized text: the surface forms go through `_norm`, and
    the lexicon words are stringified, stripped and lower-cased. Returns None when
    the lexicon was not loaded or has no "word" column; otherwise a freshly
    indexed DataFrame with the lexicon's own columns.
    """
    if bolinao_final is None or "word" not in bolinao_final.columns:
        return None

    wanted = {_norm(form) for form in surface_forms}
    # `assign` works on a copy, so the loaded lexicon itself is never modified.
    tagged = bolinao_final.assign(
        _word_norm=bolinao_final["word"].astype(str).str.strip().str.lower()
    )
    hits = tagged[tagged["_word_norm"].isin(wanted)]
    return hits.drop(columns=["_word_norm"]).reset_index(drop=True)

## How to use

1. Set `root_or_underlying` to either:
   - a single root form like `iti` / `sai` / `ko`, **or**
   - an explicit sequence like `mo + ya` or `koN + ko` (case-insensitive).
2. Set `pos` to either:
   - `"auto"` (recommended if you only input a root), **or**
   - a specific tag: `intrg pron`, `DPr`, `indfpro`, `expl`.
3. Run the next cell to get candidate surface forms and (if the CSV is available) lexicon matches.

Tip: For lexicalized contractions (e.g., `ma`), enter the full underlying sequence (`mo + ya`) because the contraction depends on both tokens.

In [None]:
# --- User inputs ---
# Root or explicit sequence to generate from; examples: "sai", "iti", "mo + ya", "koN + ko".
root_or_underlying = "sain"
# "auto" lets you input only a root and still get candidates.
pos = "auto"

# --- Generation knobs (keep constrained by default) ---
allow_dpr_linker_suffix = False
# Set True if you want candidates like ko -> kongko / ikongko.
allow_productive_kon_prefixing = False

cands = generate_surface_candidates(
    root_or_underlying,
    pos,
    max_results=25,
    allow_dpr_linker_suffix=allow_dpr_linker_suffix,
    allow_productive_kon_prefixing=allow_productive_kon_prefixing,
)

# Presentation settings: display name for the underlying form, and preferred column order.
rename_map = {"underlying": "root word"}
preferred = ["root word", "surface", "rule", "rule_type", "reversibility", "weight", "notes"]

# Build the table, rename for presentation, and hide the POS columns
# (POS is still used internally for generation).
cand_df = (
    pd.DataFrame(cands)
    .rename(columns=rename_map)
    .drop(columns=["pos", "pos of root word"], errors="ignore")
)

# Preferred columns first (when present), then any remaining columns in their existing order.
leading_cols = [col for col in preferred if col in cand_df.columns]
trailing_cols = [col for col in cand_df.columns if col not in preferred]
ordered_cols = leading_cols + trailing_cols
cand_df = cand_df[ordered_cols]

cand_df

Unnamed: 0,root word,surface,rule,rule_type,reversibility,weight,notes
0,sain,ansain,INTRG_an_prefix_optional_add,morphophonemic,semi-reversible,0.55,Optional/historical an- prefix (semi-reversible).
1,sain,sain,LEX_FALLBACK_identity,lexicalized,non-reversible,0.1,Identity fallback: keep underlying as surface ...


In [None]:
# --- Verify surface candidates against the lexicon ---

# Take the surface forms from the generation cell: prefer the displayed table,
# fall back to the raw candidate dicts, and otherwise start empty.
if "cand_df" in globals() and isinstance(cand_df, pd.DataFrame) and "surface" in cand_df.columns:
    surface_forms = cand_df["surface"].astype(str).tolist()
elif "cands" in globals():
    surface_forms = [str(entry.get("surface", "")) for entry in cands]
else:
    surface_forms = []

# Drop blanks, trim, and de-duplicate while keeping first-seen order.
surface_forms = [form.strip() for form in surface_forms if str(form).strip()]
surface_forms_unique = list(dict.fromkeys(surface_forms))

# Make pandas display show full contents (no truncation) for this output only.
_display_opts = {
    "display.max_colwidth": None,
    "display.max_columns": None,
    "display.max_rows": None,
    "display.width": 0,
}
# option_context expects alternating (name, value, name, value, ...) arguments.
_display_args = [part for item in _display_opts.items() for part in item]

if bolinao_final is None:
    print("Lexicon CSV was not loaded (check LEXICON_PATH). Cannot verify surface forms.")
    verified_df = pd.DataFrame(
        {
            "surface": surface_forms_unique,
            "in_lexicon": [False] * len(surface_forms_unique),
        }
    )
    with pd.option_context(*_display_args):
        display(verified_df)
elif "word" not in bolinao_final.columns:
    raise ValueError("Lexicon DataFrame is missing required column: 'word'")
else:
    # Work on a copy so the helper column never lands on the loaded lexicon.
    lex = bolinao_final.copy()
    lex["_word_norm"] = lex["word"].astype(str).str.strip().str.lower()
    in_lex = set(lex["_word_norm"].tolist())

    # Per-surface membership flag.
    verified_df = pd.DataFrame({"surface": surface_forms_unique})
    verified_df["in_lexicon"] = verified_df["surface"].map(_norm).isin(in_lex)

    # Full lexicon rows for the surfaces that were found.
    wanted = {_norm(form) for form in surface_forms_unique}
    matched_rows = (
        lex[lex["_word_norm"].isin(wanted)]
        .drop(columns=["_word_norm"])
        .reset_index(drop=True)
    )

    print(f"Found {int(verified_df['in_lexicon'].sum())} / {len(verified_df)} surface form(s) in the lexicon.")
    with pd.option_context(*_display_args):
        display(verified_df)
        if len(matched_rows) > 0:
            display(matched_rows)

Found 2 / 2 surface form(s) in the lexicon.


Unnamed: 0,surface,in_lexicon
0,ansain,True
1,sain,True


Unnamed: 0,word,part_of_speech,meaning_english,meaning_filipino,sample_bolinao,sample_english,upos
0,ansain,intrg pron,"Identifies a specific grouping of things close to the hearer when preceded by ""no"".",Kung ano.,Ansain say lako' mo?,What are those which you are selling?,PRON
1,sain,pr,"That group or activity near you, the hearer or an activity that happened previously.",Mga iyan.,Sain tamo' a awit mo?,Is it only those things that you will take?,ADP
