In [60]:
"""
DT-LSTM-MARKOV FILTER: MORPHOLOGY PARSER WITH DECISION TREE PRIORS
===================================================================

This notebook implements a morphology parser for Quechua that combines:
1. BiLSTM neural network for boundary prediction
2. Decision Tree (DT) priors based on token-window features
3. Privileged knowledge (K-teacher) for regularization

The parser segments Quechua words into morphemes by predicting boundary positions
between tokens. It uses:
- Gold standard data (Sue Kalt dataset) as the base training data
- Decision Tree classifier trained on token-window features to provide priors
- K-teacher regularization to improve generalization

Key Differences from Markov-LSTM-MarkovFilter:
- Uses Decision Trees instead of HMM for prior generation
- DT priors are based on local token-window features (left/right context)
- HMM priors (in Markov-LSTM) use suffix patterns and forward-backward algorithm

Key Features:
- Model checkpointing: saves/loads models to avoid retraining
- Decision Tree caching: saves/loads DT priors to avoid retraining
- Comprehensive evaluation metrics (precision, recall, F1, exact match)
- Token-window feature extraction for DT prior

All data is read from the 'data' folder and models are saved to the 'models_DT-LSTM-MarkovFilter' folder.
"""

import pandas as pd
import os
import json
import hashlib
import pickle
import torch
import torch.nn as nn

In [61]:
# =========================
# DATA FOLDER CONFIGURATION
# =========================
# All data files should be read from and saved to the data folder
DATA_FOLDER = "data"

# Model folder named after this notebook
MODEL_NAME = "DT-LSTM-MarkovFilter"
MODELS_FOLDER = f"models_{MODEL_NAME}"

# Create models folder if it doesn't exist
os.makedirs(MODELS_FOLDER, exist_ok=True)

# =========================
# LOAD GOLD STANDARD DATA
# =========================
# The gold standard dataset contains high-quality morphological segmentations
# This is the base training data
print("Loading gold standard data...")
gold_df = pd.read_parquet(os.path.join(DATA_FOLDER, "Sue_kalt.parquet"))
gold_df['Word'] = gold_df['word']
gold_df['morph'] = gold_df['morph'].str.replace('-', ' ')  # Normalize separators
gold_df['Morph_split_str'] = gold_df['morph']  # String version
gold_df['Morph_split'] = gold_df['morph'].str.split(' ')  # List version
gold_df = gold_df[['Word', 'Morph_split', 'Morph_split_str']]
gold_df.drop_duplicates(subset='Word', keep='first', inplace=True)
gold_df.dropna(subset=['Word'], inplace=True)
print(f"Loaded {len(gold_df):,} gold standard examples")

Loading gold standard data...
Loaded 6,896 gold standard examples


In [62]:
gold_df.head(50)

Unnamed: 0,Word,Morph_split,Morph_split_str
0,cementerioman,"[cementerio, man]",cementerio man
1,kawsachkananta,"[kawsa, chka, na, n, ta]",kawsa chka na n ta
2,mañakunpis,"[maña, ku, n, pis]",maña ku n pis
3,imaynapichus,"[imayna, pi, chus]",imayna pi chus
4,qipiyuq,"[qipi, yuq]",qipi yuq
5,Quispepis,"[Quispe, pis]",Quispe pis
6,ñichkanmanchá,"[ñi, chka, nman, chá]",ñi chka nman chá
7,qukuni,"[qu, ku, ni]",qu ku ni
8,dejasunpunichu,"[deja, sun, puni, chu]",deja sun puni chu
9,phutikunki,"[phuti, ku, nki]",phuti ku nki


In [63]:
# =========================
# LOAD TEST DATA
# =========================
# Load the test/accuracy evaluation dataset
# This dataset is used for final evaluation of the trained model

acc_df = pd.read_parquet(os.path.join(DATA_FOLDER, "cleaned_data_df.parquet"))

print("="*60)
print("DATASET SUMMARY")
print("="*60)
print(f"Training data shape: {gold_df.shape}")
print(f"Test data shape: {acc_df.shape}")
print(f"Models folder: {MODELS_FOLDER}")
print("="*60)

DATASET SUMMARY
Training data shape: (6896, 3)
Test data shape: (913, 5)
Models folder: models_DT-LSTM-MarkovFilter


In [64]:
graphemes = [
    "ch","ll","rr","tr","kw","ph",  # digraphs/trigraphs
    "a","b","d","e","f","g","h","i","k","l","m","n","ñ","o","p","q",
    "r","s","t","u","v","w","x","y"
]

In [65]:
import re

In [66]:
pattern = re.compile("|".join(sorted(graphemes, key=len, reverse=True)))

def tokenize_morphemes(morphs):
    return [pattern.findall(m.lower()) for m in morphs]

In [67]:
gold_df["Char_split"] = gold_df["Morph_split"].apply(tokenize_morphemes)

In [68]:
vowels = {"a", "i", "e", "o", "u"}

def grapheme_to_cv(grapheme):
    return "V" if grapheme in vowels else "C"

def morphs_to_cv(morphs):
    return [[grapheme_to_cv(g) for g in morph] for morph in morphs]

In [69]:
gold_df["CV_split"] = gold_df["Char_split"].apply(morphs_to_cv)

In [70]:
def cv_to_string(cv_split):
    """Convert nested CV list to dash-separated string."""
    return "-".join("".join(m) for m in cv_split)

In [71]:
str_df = pd.DataFrame()

In [72]:
import numpy as np

In [73]:
str_df["Full_chain"] = gold_df["CV_split"].apply(cv_to_string)

# Create Trimmed_chain, but use NaN if no dash
str_df["Trimmed_chain"] = str_df["Full_chain"].apply(
    lambda x: x.split("-", 1)[1] if "-" in x else np.nan
)

str_df["Word"] = gold_df["Word"]
str_df["Char_split"] = gold_df["Char_split"]
str_df["Morph_split"] = gold_df["Morph_split"]

# Drop rows where Trimmed_chain is NaN
str_df = str_df.dropna(subset=["Trimmed_chain"]).reset_index(drop=True)

In [74]:
# word length
str_df["Word_len"] = str_df["Word"].str.len()

# consonant and vowel count from Full_chain
str_df["Vowel_no"] = str_df["Full_chain"].str.count("V")
str_df["Cons_no"] = str_df["Full_chain"].str.count("C")

# tail consonant and vowel counts (last segment in Full_chain after '-')
str_df["Tail_cons_no"] = str_df["Trimmed_chain"].str.count("C")
str_df["Tail_vowel_no"] = str_df["Trimmed_chain"].str.count("V")

# number of splits from Morph_split
str_df["No_splits"] = str_df["Morph_split"].str.len()

# total y/w count in word
str_df["YW_count"] = str_df["Word"].str.count("[yw]")

# tail y/w count (all morphs except first)
str_df["Tail_YW_count"] = str_df["Morph_split"].apply(
    lambda ms: sum(m.count("y") + m.count("w") for m in ms[1:])
)

In [75]:
str_df.head()

Unnamed: 0,Full_chain,Trimmed_chain,Word,Char_split,Morph_split,Word_len,Vowel_no,Cons_no,Tail_cons_no,Tail_vowel_no,No_splits,YW_count,Tail_YW_count
0,VCVCCVCVV-CVC,CVC,cementerioman,"[[e, m, e, n, t, e, r, i, o], [m, a, n]]","[cementerio, man]",13,6,6,2,1,2,0,0
1,CVCCV-CCV-CV-C-CV,CCV-CV-C-CV,kawsachkananta,"[[k, a, w, s, a], [ch, k, a], [n, a], [n], [t,...","[kawsa, chka, na, n, ta]",14,5,8,5,3,5,1,0
2,CVCV-CV-C-CVC,CV-C-CVC,mañakunpis,"[[m, a, ñ, a], [k, u], [n], [p, i, s]]","[maña, ku, n, pis]",10,4,6,4,2,4,0,0
3,VCVCCV-CV-CVC,CV-CVC,imaynapichus,"[[i, m, a, y, n, a], [p, i], [ch, u, s]]","[imayna, pi, chus]",12,5,6,3,2,3,1,0
4,CVCV-CVC,CVC,qipiyuq,"[[q, i, p, i], [y, u, q]]","[qipi, yuq]",7,3,4,2,1,2,1,1


In [76]:
import ast
import re
import math
import numpy as np
import pandas as pd
from collections import Counter, defaultdict
from sklearn.model_selection import GroupShuffleSplit
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

In [77]:
RANDOM_STATE = 42
END_LABEL = "Ø"
VOWELS = set(list("aeiou"))

In [78]:
def safe_literal_list(obj):
    """Parse '[[p, i],[k, u, n, a],[s]]' → [['p','i'], ...] if string; pass through if already list."""
    if isinstance(obj, list):
        return obj
    if pd.isna(obj):
        return None
    s = str(obj).strip()
    try:
        val = ast.literal_eval(s)
        return val
    except Exception:
        return None

def flatten_char_split(char_split):
    """Flatten list-of-lists of char tokens into a single list of tokens."""
    if not isinstance(char_split, list):
        return None
    out = []
    for seg in char_split:
        if isinstance(seg, list):
            out.extend([str(x) for x in seg])
        else:
            out.append(str(seg))
    return out

def tokens_to_word(tokens):
    """Join tokens to surface word. If tokens are graphemes (like 'ch'), just concatenate."""
    if not tokens:
        return ""
    return "".join(tokens)

In [79]:

def split_chain(chain: str):
    if chain is None:
        return []
    s = str(chain).strip()
    return [] if not s else s.split('-')

def extract_root_and_trimmed(full_chain: str):
    segs = split_chain(full_chain)
    if not segs:
        return "", END_LABEL
    root = segs[0]
    trimmed = '-'.join(segs[1:]) if len(segs) > 1 else END_LABEL
    return root, trimmed

def suffixes_from_trimmed(trimmed: str):
    if trimmed is None or trimmed == END_LABEL or str(trimmed).strip() == "":
        return []
    return str(trimmed).split('-')

In [80]:
NEW_NUM_FEATS = [
    "Word_len", "Vowel_no", "Cons_no",
    "Tail_cons_no", "Tail_vowel_no",
    "No_splits", "YW_count", "Tail_YW_count"
]

In [81]:
def root_cv_features(root_cv: str):
    s = root_cv or ""
    L = len(s)
    feats = {
        "root_cv": s,
        "root_len": L,
        "root_end": s[-1:] if L else "",
        "root_start": s[:1] if L else "",
        "root_suffix2": s[-2:] if L >= 2 else s,
        "root_prefix2": s[:2] if L >= 2 else s,
        "num_C": s.count('C'),
        "num_V": s.count('V'),
        "has_CC": int('CC' in s),
        "has_VV": int('VV' in s),
    }
    for i in range(L-1):
        feats[f"bg_{s[i:i+2]}"] = 1
    for i in range(L-2):
        feats[f"tg_{s[i:i+3]}"] = 1
    return feats

def last_char_features(word: str, k_chars=(1,2,3)):
    feats = {}
    if not word:
        return feats
    w = word
    # raw last n characters
    for k in k_chars:
        s = w[-k:] if len(w) >= k else w
        feats[f"last{k}"] = s
    # last character vowel/consonant
    last = w[-1]
    feats["last_is_vowel"] = int(last in VOWELS)
    feats["last_char"] = last
    # last vowel identity (if any)
    last_vowel = ''
    for ch in reversed(w):
        if ch in VOWELS:
            last_vowel = ch.lower()
            break
    feats["last_vowel"] = last_vowel
    return feats

def last_cluster_features(char_tokens: list, k_clusters=(1,2)):
    feats = {}
    if not char_tokens:
        return feats
    toks = char_tokens
    for k in k_clusters:
        tail = toks[-k:] if len(toks) >= k else toks
        feats[f"lastTok{k}"] = "|".join(tail)  # keep as categorical string
    feats["lastTok1"] = toks[-1]  # ensure always present
    return feats

def cv_tail_features(word: str):
    """Approximate CV tail from raw word if CV_split not present (best-effort)."""
    if not word:
        return {}
    def cv(c):
        return 'V' if c in VOWELS else 'C'
    tail_cv = ''.join(cv(ch) for ch in word[-3:])  # last 3 chars' CV
    return {"tail_cv_approx": tail_cv, "tail_last_cv": tail_cv[-1:]}

def build_features_row(row):
    """Unified per-row feature dict (now also pulls your numeric counters if present)."""
    feats = {}
    # root CV (always)
    feats.update(root_cv_features(row.get("root_cv", "")))

    # surface word / tokens
    word = ""
    char_tokens = None

    if "Char_split" in row and row["Char_split"] is not None:
        cs = safe_literal_list(row["Char_split"])
        toks = flatten_char_split(cs) if cs is not None else None
        char_tokens = toks
        word = tokens_to_word(toks) if toks else ""
    elif "Word" in row and pd.notna(row.get("Word", None)):
        word = str(row["Word"])
    else:
        word = ""

    # end-of-word character features
    feats.update(last_char_features(word, k_chars=(1,2,3)))

    # if we have tokenized clusters (handles digraphs like 'ch')
    if char_tokens:
        feats.update(last_cluster_features(char_tokens, k_clusters=(1,2)))

    # approximate CV tail from raw string (fallback)
    feats.update(cv_tail_features(word))

    # ==== NEW: attach your numeric counters if present on the row ====
    for k in NEW_NUM_FEATS:
        if k in row and pd.notna(row[k]):
            # cast to float so DictVectorizer treats them as numeric
            try:
                feats[k] = float(row[k])
            except Exception:
                # if any stray non-numeric sneaks in, skip silently
                pass

    return feats

In [82]:
def build_dataset(df_in: pd.DataFrame):
    rows = []
    for _, r in df_in.iterrows():
        full = r['Full_chain']
        root, trimmed_auto = extract_root_and_trimmed(full)
        trimmed = r['Trimmed_chain'] if 'Trimmed_chain' in df_in.columns and pd.notna(r['Trimmed_chain']) else trimmed_auto
        suffixes = suffixes_from_trimmed(trimmed)

        row = {
            "full_chain": full,
            "root_cv": root,
            "trimmed": trimmed if trimmed else END_LABEL,
            "suffixes": suffixes,
            "suffix_len": len(suffixes),
        }
        # carry optional richer columns if present
        for opt in ("Word","Char_split","CV_split"):
            if opt in df_in.columns:
                row[opt] = r[opt]

        # ==== NEW: carry through your precomputed numeric columns ====
        for k in NEW_NUM_FEATS:
            if k in df_in.columns:
                row[k] = r[k]

        rows.append(row)
    return pd.DataFrame(rows)

In [83]:
def dicts_from_df(df: pd.DataFrame, add_prev=None):
    """Turn rows into feature dicts. add_prev = {'y_step1': 'CV', ...} keys present in df or to inject."""
    feat_dicts = []
    for _, r in df.iterrows():
        base = build_features_row(r)
        if add_prev:
            for k in add_prev:
                if k in r and pd.notna(r[k]):
                    base[k] = r[k]
        feat_dicts.append(base)
    return feat_dicts

In [84]:
def vec_fit_transform(feat_dicts):
    vec = DictVectorizer(sparse=True)
    X = vec.fit_transform(feat_dicts)
    return X, vec

def vec_transform(vec, feat_dicts):
    return vec.transform(feat_dicts)

In [85]:
def grouped_split(df, train_size=0.8, seed=RANDOM_STATE):
    gss = GroupShuffleSplit(n_splits=1, train_size=train_size, random_state=seed)
    groups = df['root_cv'].astype(str).values
    tr_idx, te_idx = next(gss.split(df, groups=groups))
    return df.iloc[tr_idx].reset_index(drop=True), df.iloc[te_idx].reset_index(drop=True)

In [86]:
def topN_labels_by_freq(y, Ns=(16,25,37,57,103)):
    ctr = Counter(y)
    most_common = ctr.most_common()
    return {N: set([lab for lab,_ in most_common[:N]]) for N in Ns}, ctr

def eval_subsets(y_true, y_pred, labels_by_topN):
    out = {}
    y_true = np.array(y_true); y_pred = np.array(y_pred)
    for N, labelset in labels_by_topN.items():
        idx = [i for i,lab in enumerate(y_true) if lab in labelset]
        if not idx:
            out[N] = {"accuracy": np.nan, "f1_macro": np.nan, "f1_weighted": np.nan, "support": 0}
            continue
        yt, yp = y_true[idx], y_pred[idx]
        out[N] = {
            "accuracy": accuracy_score(yt, yp),
            "f1_macro": f1_score(yt, yp, average='macro', zero_division=0),
            "f1_weighted": f1_score(yt, yp, average='weighted', zero_division=0),
            "support": len(idx),
        }
    return out

def print_subset_metrics(name, d):
    print(f"\n== {name}: Top-N subsets ==")
    for N in sorted(d.keys()):
        m = d[N]
        print(f"Top-{N:>3} (n={m['support']:>4}): Acc={m['accuracy']:.3f} | F1_mac={m['f1_macro']:.3f} | F1_wt={m['f1_weighted']:.3f}")


In [87]:
def make_classifier(kind="tree", **kwargs):
    if kind == "rf":
        return RandomForestClassifier(
            n_estimators=80, max_depth=10, min_samples_leaf=5,
            random_state=RANDOM_STATE, n_jobs=-1
        )
    # default: simple tree
    return DecisionTreeClassifier(
        criterion="entropy",
        max_depth=kwargs.get("max_depth", 6),
        min_samples_leaf=kwargs.get("min_samples_leaf", 10),
        random_state=RANDOM_STATE
    )

In [88]:
def run_single_shot(df_all, clf_kind="tree"):
    df_tr, df_te = grouped_split(df_all, train_size=0.8)
    # vectorize
    Xtr_dicts = dicts_from_df(df_tr)
    Xtr, vec = vec_fit_transform(Xtr_dicts)
    ytr = df_tr['trimmed'].astype(str).values

    clf = make_classifier(clf_kind)
    clf.fit(Xtr, ytr)

    Xte = vec_transform(vec, dicts_from_df(df_te))
    yte = df_te['trimmed'].astype(str).values
    yhat = clf.predict(Xte)

    acc = accuracy_score(yte, yhat)
    f1m = f1_score(yte, yhat, average='macro', zero_division=0)
    f1w = f1_score(yte, yhat, average='weighted', zero_division=0)

    print("=== Single-shot classifier ===")
    print(f"Test: Acc={acc:.3f} | F1_macro={f1m:.3f} | F1_weighted={f1w:.3f}")

    labels_by_topN, _ = topN_labels_by_freq(df_tr['trimmed'].astype(str).values)
    subset = eval_subsets(yte, yhat, labels_by_topN)
    print_subset_metrics("Single-shot", subset)

    # gentle warning if we had no surface info
    if ("Word" not in df_all.columns) and ("Char_split" not in df_all.columns):
        print("\n[warn] No 'Word' or 'Char_split' columns found. Using root-only features. "
              "Add surface columns for better results.")

    return {"clf": clf, "vec": vec, "test_df": df_te, "test_pred": yhat}

In [89]:
def train_length_classifier(df_tr, clf_kind="tree"):
    # target: suffix_len binned: 1,2,3,4+  (map 4..6 to '4+')
    ylen = []
    for n in df_tr['suffix_len'].values:
        ylen.append(str(n) if n in (1,2,3) else "4+")
    ylen = np.array(ylen)

    X_dicts = dicts_from_df(df_tr)
    X, vec = vec_fit_transform(X_dicts)
    clf = make_classifier(clf_kind, max_depth=5, min_samples_leaf=10)
    clf.fit(X, ylen)
    return clf, vec

def make_step_frame(df, step):
    # y_step = suffix at position 'step' from end, or END if none
    y = []
    for sufs in df['suffixes']:
        if len(sufs) >= step:
            y.append(sufs[-step])
        else:
            y.append(END_LABEL)
    df2 = df.copy()
    df2[f"y_step{step}"] = y
    return df2

def run_sequential(df_all, clf_kind="tree", max_steps_cap=5):
    df_tr, df_te = grouped_split(df_all, train_size=0.8)

    # 1) Length-first
    len_clf, len_vec = train_length_classifier(df_tr, clf_kind=clf_kind)

    # determine max steps to train (≤ cap)
    max_steps = min(max_steps_cap, 4)  # we only need up to 4 because 4+ bucket
    print(f"\n=== Length-first + Sequential ===\nTraining up to {max_steps} steps (last→first)")

    # 2) Train step-wise trees (teacher forcing for previous predictions)
    step_vecs, step_clfs = {}, {}
    prev_cols = []
    for step in range(1, max_steps+1):
        df_step = make_step_frame(df_tr, step)
        X_dicts = dicts_from_df(df_step, add_prev=set(prev_cols))
        X, vec = vec_fit_transform(X_dicts)
        y = df_step[f"y_step{step}"].astype(str).values

        clf = make_classifier(clf_kind, max_depth=6, min_samples_leaf=8)
        clf.fit(X, y)

        step_vecs[step] = vec
        step_clfs[step] = clf
        prev_cols.append(f"y_step{step}")

    # 3) Inference
    gold_full = df_te['trimmed'].astype(str).values
    preds_full = []

    # diagnostics per-step
    per_step_gold = defaultdict(list)
    per_step_pred = defaultdict(list)

    # Pre-compute length predictions
    Xlen = vec_transform(len_vec, dicts_from_df(df_te))
    ylen_pred = len_clf.predict(Xlen)

    for i, r in df_te.iterrows():
        # predicted number of suffixes to output
        k_str = ylen_pred[i]
        K = 4 if k_str == "4+" else int(k_str)

        prev_preds = []
        # Build base feat (constant for this word; we do not peel in features)
        base_row = r.to_dict()

        for step in range(1, K+1):
            # feature dict with previous predicted labels injected
            feat = build_features_row(base_row)
            for j, lab in enumerate(prev_preds, start=1):
                feat[f"y_step{j}"] = lab

            X_one = vec_transform(step_vecs[step], [feat])
            yhat = step_clfs[step].predict(X_one)[0]

            # collect step metrics (gold at this step)
            gold_suffixes = r['suffixes']
            ygold = gold_suffixes[-step] if len(gold_suffixes) >= step else END_LABEL
            per_step_gold[step].append(ygold)
            per_step_pred[step].append(yhat)

            if yhat == END_LABEL:
                break
            prev_preds.append(yhat)

        # reconstruct chain (earliest→latest)
        pred_chain = '-'.join(reversed(prev_preds)) if prev_preds else END_LABEL
        preds_full.append(pred_chain)

    # Evaluate exact chain
    acc = accuracy_score(gold_full, preds_full)
    f1m = f1_score(gold_full, preds_full, average='macro', zero_division=0)
    f1w = f1_score(gold_full, preds_full, average='weighted', zero_division=0)

    print(f"Test: Acc={acc:.3f} | F1_macro={f1m:.3f} | F1_weighted={f1w:.3f}")

    labels_by_topN, _ = topN_labels_by_freq(df_tr['trimmed'].astype(str).values)
    subset = eval_subsets(gold_full, preds_full, labels_by_topN)
    print_subset_metrics("Sequential", subset)

    # per-step metrics (sanity)
    for step in range(1, max_steps+1):
        if len(per_step_gold[step]) == 0:
            continue
        ys = np.array(per_step_gold[step]); ps = np.array(per_step_pred[step])
        a = accuracy_score(ys, ps)
        fm = f1_score(ys, ps, average='macro', zero_division=0)
        fw = f1_score(ys, ps, average='weighted', zero_division=0)
        print(f"Step {step}: Acc={a:.3f} | F1_macro={fm:.3f} | F1_weighted={fw:.3f}")

    # gentle warning if no surface columns
    if ("Word" not in df_all.columns) and ("Char_split" not in df_all.columns):
        print("\n[warn] No 'Word' or 'Char_split' found. Sequential features can’t see surface endings; "
              "add them to boost Step1/Step2 substantially.")

    return {"len_clf": len_clf, "len_vec": len_vec,
            "step_clfs": step_clfs, "step_vecs": step_vecs,
            "test_df": df_te, "test_pred": preds_full}

In [90]:
# =========================
# SUFFIX CLASSIFIER SAVING/LOADING FUNCTIONS
# =========================
# These functions handle saving and loading the single-shot and sequential
# classifiers (RandomForest/DecisionTree) that predict suffixes

def generate_suffix_classifier_id(str_df, clf_kind="tree"):
    """
    Generate a unique identifier for suffix classifiers based on data and classifier type.
    
    Args:
        str_df: DataFrame used for training
        clf_kind: Type of classifier ("tree" or "rf")
    
    Returns:
        A string identifier (hash) for the classifiers
    """
    # Create a dictionary of parameters
    params_dict = {
        'clf_kind': clf_kind,
        'df_shape': str_df.shape if str_df is not None else (0, 0),
        'df_columns': sorted(str_df.columns.tolist()) if str_df is not None else []
    }
    
    # Convert to JSON string and hash it
    params_str = json.dumps(params_dict, sort_keys=True)
    classifier_id = hashlib.md5(params_str.encode()).hexdigest()[:16]
    return classifier_id

def save_suffix_classifiers(single, seq, classifier_id, data_folder=DATA_FOLDER):
    """
    Save the single-shot and sequential suffix classifiers.
    
    Args:
        single: Dictionary with single-shot classifier results
        seq: Dictionary with sequential classifier results
        classifier_id: Unique identifier for these classifiers
        data_folder: Folder to save classifiers in
    """
    classifier_dir = os.path.join(data_folder, f"suffix_classifiers_{classifier_id}")
    os.makedirs(classifier_dir, exist_ok=True)
    
    # Save single-shot classifier
    if single is not None:
        single_path = os.path.join(classifier_dir, "single.pkl")
        with open(single_path, "wb") as f:
            pickle.dump(single, f)
    
    # Save sequential classifier
    if seq is not None:
        seq_path = os.path.join(classifier_dir, "seq.pkl")
        with open(seq_path, "wb") as f:
            pickle.dump(seq, f)
    
    # Save metadata
    metadata_path = os.path.join(classifier_dir, "metadata.json")
    with open(metadata_path, "w") as f:
        json.dump({
            'classifier_id': classifier_id,
            'clf_kind': single.get('clf').__class__.__name__ if single and 'clf' in single else 'unknown'
        }, f, indent=2)
    
    print(f"Suffix classifiers saved to {classifier_dir}")
    return classifier_dir

def load_suffix_classifiers(classifier_id, data_folder=DATA_FOLDER):
    """
    Load the single-shot and sequential suffix classifiers.
    
    Args:
        classifier_id: Unique identifier for the classifiers
        data_folder: Folder where classifiers are saved
    
    Returns:
        Tuple of (single, seq) or (None, None) if not found
    """
    classifier_dir = os.path.join(data_folder, f"suffix_classifiers_{classifier_id}")
    
    if not os.path.exists(classifier_dir):
        return None, None
    
    single_path = os.path.join(classifier_dir, "single.pkl")
    seq_path = os.path.join(classifier_dir, "seq.pkl")
    
    single = None
    seq = None
    
    if os.path.exists(single_path):
        with open(single_path, "rb") as f:
            single = pickle.load(f)
    
    if os.path.exists(seq_path):
        with open(seq_path, "rb") as f:
            seq = pickle.load(f)
    
    if single is not None or seq is not None:
        print(f"Suffix classifiers loaded from {classifier_dir}")
    
    return single, seq

def run_all(str_df, clf_kind="tree"):
    """
    Train or load suffix prediction classifiers (single-shot and sequential).
    
    This function will:
    1. Check if classifiers with the same parameters already exist
    2. If found, load them and return them (skipping training)
    3. If not found, train new classifiers and save them
    
    Args:
        str_df: DataFrame with morphological data
        clf_kind: "tree" (default) or "rf" (small RandomForest for extra lift).
                  Your `str_df` can be:
                  - minimal: ['Full_chain'] or ['Full_chain','Trimmed_chain']
                  - richer: add ['Word','Char_split','CV_split'] for much better results
    
    Returns:
        Tuple of (single, seq) dictionaries containing classifiers and results
    """
    # Generate classifier identifier
    classifier_id = generate_suffix_classifier_id(str_df, clf_kind=clf_kind)
    
    # Try to load existing classifiers
    print(f"Checking for existing suffix classifiers with ID: {classifier_id}")
    single, seq = load_suffix_classifiers(classifier_id, data_folder=DATA_FOLDER)
    
    if single is not None and seq is not None:
        print(f"✅ Found existing classifiers! Loading from data folder.")
        return single, seq
    
    # Classifiers don't exist, need to train
    print(f"No existing classifiers found. Training new classifiers...")
    
    df_all = build_dataset(str_df)
    print("Total samples:", len(df_all))
    print("Unique Trimmed_chain:", df_all['trimmed'].nunique())
    print("Suffix length distribution:", df_all['suffix_len'].value_counts().sort_index().to_dict())

    single = run_single_shot(df_all, clf_kind=clf_kind)
    seq = run_sequential(df_all, clf_kind=clf_kind, max_steps_cap=5)
    
    # Save the trained classifiers
    print(f"\nSaving trained suffix classifiers with ID: {classifier_id}")
    save_suffix_classifiers(single, seq, classifier_id, data_folder=DATA_FOLDER)
    
    return single, seq

In [91]:
single, seq = run_all(str_df, clf_kind="tree")

Checking for existing suffix classifiers with ID: eeab9cd2ca1ef3f6
Suffix classifiers loaded from data\suffix_classifiers_eeab9cd2ca1ef3f6
✅ Found existing classifiers! Loading from data folder.


In [92]:
single, seq = run_all(str_df, clf_kind="rf")

Checking for existing suffix classifiers with ID: b04651ccd6358c94
Suffix classifiers loaded from data\suffix_classifiers_b04651ccd6358c94
✅ Found existing classifiers! Loading from data folder.


In [93]:
import ast, re, numpy as np, pandas as pd, torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import precision_recall_fscore_support

In [94]:
RNG = 42
torch.manual_seed(RNG)
np.random.seed(RNG)

NEW_NUM_FEATS = [
    "Word_len", "Vowel_no", "Cons_no",
    "Tail_cons_no", "Tail_vowel_no",
    "No_splits", "YW_count", "Tail_YW_count"
]


In [95]:
# =========================
# DATA PREPROCESSING HELPER FUNCTIONS
# =========================
# These functions convert DataFrame rows into training samples with tokens,
# boundary labels, and privileged features

def safe_list(x):
    """
    Safely convert a string representation of a list to an actual list.
    Handles various formats that pandas might use when storing lists.
    
    Args:
        x: Either a list or a string representation of a list
    
    Returns:
        A Python list
    """
    if isinstance(x, list): 
        return x
    s = str(x)
    try:
        return ast.literal_eval(s)
    except Exception:
        # Try alternative format conversion for nested lists
        s2 = s.replace("[[", "[['").replace("]]", "']]").replace("], [", "'],['").replace(", ", "','")
        return ast.literal_eval(s2)

def flatten(list_of_lists):
    """
    Flatten a nested list structure into a single list.
    
    Args:
        list_of_lists: A list containing sublists (e.g., [[a,b], [c], [d,e]])
    
    Returns:
        A flattened list (e.g., [a, b, c, d, e])
    """
    out = []
    for seg in list_of_lists: 
        out.extend(seg)
    return [str(t) for t in out]

def extract_priv_features_from_row(row, feat_names):
    """
    Extract privileged (numeric) features from a DataFrame row.
    
    Privileged features are features that are available during training
    but not during inference (e.g., word length, vowel count, etc.).
    These are used by the K-teacher regularizer.
    
    Args:
        row: A pandas Series (DataFrame row)
        feat_names: List of feature column names to extract
    
    Returns:
        List of feature values as floats
    """
    vec = []
    for k in feat_names:
        # Get value from row, defaulting to 0.0 if missing or NaN
        val = row[k] if (k in row and pd.notna(row[k])) else 0.0
        try: 
            vec.append(float(val))
        except Exception: 
            vec.append(0.0)  # Default to 0.0 if conversion fails
    return vec

def build_samples_with_priv(df, feat_names=NEW_NUM_FEATS):
    """
    Convert DataFrame rows into training samples with tokens, labels, and privileged features.
    
    Each sample contains:
    - tokens: List of character-level tokens (graphemes) for the word
    - y: Binary labels indicating boundary positions (1 = boundary, 0 = no boundary)
    - priv: Privileged numeric features (word length, vowel count, etc.)
    
    Args:
        df: DataFrame with columns including 'Char_split' and feature columns
        feat_names: List of privileged feature names to extract
    
    Returns:
        List of dictionaries, each containing:
          {"tokens": [...], "y": [0/1, ...], "priv": [f1, ..., fF]}
    """
    rows = []
    for _, r in df.iterrows():
        # Get character-level splits (list of lists of graphemes per morpheme)
        cs = safe_list(r["Char_split"])
        
        # Flatten to get all tokens in sequence
        toks = flatten(cs)
        
        # Calculate boundary positions based on morpheme lengths
        lens = [len(seg) for seg in cs]  # Length of each morpheme in tokens
        cut_idxs = set(np.cumsum(lens)[:-1].tolist())  # Cumulative positions where boundaries occur
        
        # Create binary labels: 1 if boundary after token i, 0 otherwise
        y = [1 if (i+1) in cut_idxs else 0 for i in range(len(toks)-1)]
        
        # Extract privileged features
        priv = extract_priv_features_from_row(r, feat_names)
        
        rows.append({"tokens": toks, "y": y, "priv": priv})
    return rows

In [96]:
# =========================
# DECISION TREE PRIOR FUNCTIONS
# =========================
# These functions create and use a Decision Tree classifier to provide boundary priors
# The DT is trained on token-window features (left/right context around each boundary)

def featurize_window(tokens, i, k_left=2, k_right=2):
    """
    Extract features for a token-window around position i.
    
    Creates features based on the local context (left and right tokens) around
    a potential boundary position. This includes:
    - Token identities (L1, L2, R1, R2)
    - CV patterns (consonant/vowel classification)
    - Character-level features
    
    Args:
        tokens: List of token strings
        i: Position index (boundary is after token i)
        k_left: Number of left context tokens to include
        k_right: Number of right context tokens to include
    
    Returns:
        Dictionary of feature name -> value mappings
    """
    feats = {}
    # Left context tokens (before the boundary)
    for k in range(1, k_left+1):
        idx = i-(k-1)
        feats[f"L{k}"] = tokens[idx] if idx >= 0 else "<BOS>"  # Beginning of sequence
    
    # Right context tokens (after the boundary)
    for k in range(1, k_right+1):
        idx = i+k
        feats[f"R{k}"] = tokens[idx] if idx < len(tokens) else "<EOS>"  # End of sequence
    
    # Helper function to classify characters as vowels or consonants
    def is_vowel(ch): 
        return ch.lower() in "aeiouáéíóú"
    
    # Extract CV (consonant/vowel) patterns from immediate neighbors
    L1 = feats["L1"]
    R1 = feats["R1"]
    feats["L1_cv"] = 'V' if is_vowel(L1[-1]) else 'C'  # Last char of left token
    feats["R1_cv"] = 'V' if (R1 != "<EOS>" and is_vowel(R1[0])) else 'C'  # First char of right token
    
    # Character-level features
    feats["L1_last"] = L1[-1]  # Last character of left token
    feats["R1_first"] = R1[0] if R1 != "<EOS>" else "<EOS>"  # First character of right token
    
    return feats

def train_dt_prior(samples, max_depth=6, min_leaf=8):
    """
    Train a Decision Tree classifier to predict boundary probabilities.
    
    The DT learns patterns in token-window features that indicate where
    morpheme boundaries are likely to occur. This prior is then used to
    guide the LSTM model during training.
    
    Args:
        samples: List of training samples with tokens and boundary labels
        max_depth: Maximum depth of the decision tree
        min_leaf: Minimum samples required in a leaf node
    
    Returns:
        Tuple of (DecisionTreeClassifier, DictVectorizer)
        - clf: Trained decision tree classifier
        - vec: Fitted vectorizer for converting feature dicts to vectors
    """
    # Extract features and labels for all boundary positions
    Xdict, y = [], []
    for s in samples:
        T = len(s["tokens"])
        # For each potential boundary position (between tokens)
        for i in range(T-1):
            # Extract window features around position i
            Xdict.append(featurize_window(s["tokens"], i))
            # Label: 1 if boundary exists, 0 otherwise
            y.append(s["y"][i])
    
    # Convert feature dictionaries to sparse matrix format
    vec = DictVectorizer(sparse=True)
    X = vec.fit_transform(Xdict)
    
    # Train Decision Tree classifier
    clf = DecisionTreeClassifier(
        criterion="entropy",  # Use entropy for splitting
        max_depth=max_depth,
        min_samples_leaf=min_leaf,
        random_state=RNG  # For reproducibility
    )
    clf.fit(X, y)
    
    print(f"Trained DT prior: {clf.tree_.node_count} nodes, depth={clf.tree_.max_depth}")
    return clf, vec

def prior_probs_for_sample(clf, vec, tokens):
    """
    Get boundary probabilities from Decision Tree for a tokenized word.
    
    Args:
        clf: Trained DecisionTreeClassifier
        vec: Fitted DictVectorizer
        tokens: List of token strings for a word
    
    Returns:
        List of probabilities (one per potential boundary position)
    """
    if clf is None or vec is None or len(tokens) <= 1:
        # Default to 0.5 (uncertain) if no prior available
        return [0.5] * (max(len(tokens)-1, 0))
    
    # Extract features for each boundary position
    Xd = [featurize_window(tokens, i) for i in range(len(tokens)-1)]
    X = vec.transform(Xd)  # Convert to feature matrix
    
    # Get probability predictions from DT
    proba = clf.predict_proba(X)  # Returns [P(no_boundary), P(boundary)]
    return proba[:, 1].tolist()  # Return probability of boundary (class 1)

In [97]:
def train_k_teacher_priv(samples, feat_dim):
    """
    Train a regressor to predict K (number of cuts) from priv feature vector.
    """
    X = np.array([s["priv"] for s in samples], dtype=float)   # (N, F)
    y = np.array([int(np.sum(s["y"])) for s in samples], dtype=float)
    reg = DecisionTreeRegressor(max_depth=6, min_samples_leaf=10, random_state=RNG)
    reg.fit(X, y)
    return reg

def predict_k_hat_priv(reg, priv_batch):
    # priv_batch: (B, F) float tensor
    with torch.no_grad():
        k = reg.predict(priv_batch.cpu().numpy())
    return torch.tensor(k, dtype=torch.float32, device=priv_batch.device)

In [98]:
def build_vocab(samples, min_freq=1):
    from collections import Counter
    ctr = Counter()
    for s in samples: ctr.update(s["tokens"])
    vocab = {"<PAD>":0, "<UNK>":1}
    for t,c in sorted(ctr.items(), key=lambda x: (-x[1], x[0])):
        if c>=min_freq and t not in vocab:
            vocab[t] = len(vocab)
    return vocab

class SegDataset(Dataset):
    def __init__(self, samples, vocab, dt_clf=None, dt_vec=None, feat_dim=0):
        self.samples = samples
        self.vocab = vocab
        self.dt_clf = dt_clf
        self.dt_vec = dt_vec
        self.feat_dim = feat_dim
    def __len__(self): return len(self.samples)
    def __getitem__(self, idx):
        s = self.samples[idx]
        tokens = s["tokens"]
        ids = [self.vocab.get(t, self.vocab["<UNK>"]) for t in tokens]
        y = s["y"]  # length T-1
        prior = prior_probs_for_sample(self.dt_clf, self.dt_vec, tokens)
        priv = s["priv"] if self.feat_dim>0 else []
        return {"ids": ids, "y": y, "prior": prior, "priv": priv, "tokens": tokens}

def collate(batch):
    maxT = max(len(b["ids"]) for b in batch)
    maxB = maxT-1
    B = len(batch)

    ids = torch.full((B, maxT), 0, dtype=torch.long)
    mask_tok = torch.zeros((B, maxT), dtype=torch.bool)
    y = torch.full((B, maxB), -100, dtype=torch.long)
    prior = torch.zeros((B, maxB), dtype=torch.float32)
    mask_b = torch.zeros((B, maxB), dtype=torch.bool)

    feat_dim = len(batch[0]["priv"]) if isinstance(batch[0]["priv"], list) else 0
    priv = torch.zeros((B, feat_dim), dtype=torch.float32) if feat_dim>0 else None

    for i, b in enumerate(batch):
        T = len(b["ids"])
        ids[i,:T] = torch.tensor(b["ids"], dtype=torch.long)
        mask_tok[i,:T] = True
        if T>1:
            L = T-1
            y[i,:L] = torch.tensor(b["y"], dtype=torch.long)
            p = b["prior"] if len(b["prior"])==L else [0.5]*L
            prior[i,:L] = torch.tensor(p, dtype=torch.float32)
            mask_b[i,:L] = True
        if feat_dim>0:
            priv[i] = torch.tensor(b["priv"], dtype=torch.float32)

    return {
        "ids": ids, "mask_tok": mask_tok,
        "y": y, "prior": prior, "mask_b": mask_b,
        "priv": priv  # (B, F) or None
    }

In [99]:
class BiLSTMTagger(nn.Module):
    def __init__(self, vocab_size, emb_dim=16, hidden_size=64, num_layers=2,
                 use_prior=True, dropout=0.1, freeze_emb=False, fuse_mode="logit_add"):
        super().__init__()
        self.use_prior = use_prior
        self.fuse_mode = fuse_mode
        self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        if freeze_emb:
            for p in self.emb.parameters(): p.requires_grad = False
        lstm_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(
            input_size=emb_dim, hidden_size=hidden_size//2,
            num_layers=num_layers, dropout=lstm_dropout,
            bidirectional=True, batch_first=True
        )
        in_mlp = hidden_size + (1 if (use_prior and fuse_mode=="concat") else 0)
        self.boundary_mlp = nn.Sequential(
            nn.Linear(in_mlp, hidden_size // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size // 2, 2)
        )
        if use_prior and fuse_mode == "logit_add":
            self.alpha = nn.Parameter(torch.tensor(1.0))

    def forward(self, ids, prior, mask_tok):
        emb = self.emb(ids)
        h, _ = self.lstm(emb)          # (B,T,H)
        left = h[:, :-1, :]            # (B,T-1,H)
        if self.use_prior and self.fuse_mode == "concat":
            feat = torch.cat([left, prior.unsqueeze(-1)], dim=-1)
            return self.boundary_mlp(feat)
        logits = self.boundary_mlp(left)
        if self.use_prior and self.fuse_mode == "logit_add":
            eps = 1e-6
            p = prior.clamp(eps, 1-eps)
            prior_logit = torch.log(p) - torch.log(1-p)
            logits[..., 1] = logits[..., 1] + self.alpha * prior_logit
        return logits


In [100]:
def boundary_metrics_from_lists(probs_list, gold_list, thr=0.5):
    if not probs_list: return 0.0,0.0,0.0
    p = torch.cat([t for t in probs_list if t.numel()>0], dim=0).numpy()
    g = torch.cat([t for t in gold_list if t.numel()>0], dim=0).numpy()
    pred = (p >= thr).astype(int)
    P,R,F1,_ = precision_recall_fscore_support(g, pred, average='binary', zero_division=0)
    return P,R,F1

def exact_match_rate_from_lists(probs_list, gold_list, thr=0.5):
    if not probs_list: return 0.0
    em=[]
    for p,g in zip(probs_list, gold_list):
        if g.numel()==0: em.append(1.0)
        else:
            pred = (p.numpy() >= thr).astype(int)
            em.append(float(np.array_equal(pred, g.numpy())))
    return float(np.mean(em))

@torch.no_grad()
def predict(model, loader):
    model.eval()
    probs_list, gold_list = [], []
    for batch in loader:
        logits = model(batch["ids"], batch["prior"], batch["mask_tok"])
        probs = torch.softmax(logits, dim=-1)[..., 1]      # (B,T-1)
        y = batch["y"]; mask = batch["mask_b"]
        B = probs.shape[0]
        for b in range(B):
            L = int(mask[b].sum().item())
            if L==0:
                probs_list.append(torch.empty(0))
                gold_list.append(torch.empty(0, dtype=torch.long))
            else:
                probs_list.append(probs[b,:L].cpu())
                gold_list.append(y[b,:L].cpu())
    return probs_list, gold_list

In [101]:
criterion_ce  = nn.CrossEntropyLoss()
criterion_bce = nn.BCEWithLogitsLoss(reduction="mean")
mse = nn.MSELoss(reduction="mean")

def train_epoch(model, loader, opt, lambda_prior=0.1, lambda_k=0.1, k_reg=None):
    model.train()
    tot=0; n=0
    for batch in loader:
        ids, prior, y, mask_b = batch["ids"], batch["prior"], batch["y"], batch["mask_b"]
        priv = batch["priv"]  # (B,F) or None

        logits = model(ids, prior, batch["mask_tok"])    # (B,T-1,2)
        logits_flat = logits[mask_b]                     # (N,2)
        y_true = y[mask_b]                               # (N,)

        # (1) CE on gold boundaries
        loss = criterion_ce(logits_flat, y_true)

        # (2) Optional: distill toward DT prior on cut-logit
        if lambda_prior > 0:
            cut_logit = logits[..., 1]                   # (B,T-1)
            prior_flat = prior[mask_b]                   # (N,)
            loss_pr = criterion_bce(cut_logit[mask_b], prior_flat)
            loss = loss + lambda_prior * loss_pr

        # (3) K-regularizer using privileged K-hat
        if (lambda_k > 0) and (k_reg is not None) and (priv is not None):
            with torch.no_grad():
                k_hat = predict_k_hat_priv(k_reg, priv)  # (B,)
            # expected number of cuts from model = sum(sigmoid(cut_logit))
            cut_logit = logits[..., 1]                   # (B,T-1)
            p_cut = torch.sigmoid(cut_logit)             # (B,T-1)
            exp_K = p_cut.sum(dim=1)                     # (B,)
            loss_k = mse(exp_K, k_hat)
            loss = loss + lambda_k * loss_k

        opt.zero_grad(); loss.backward(); opt.step()
        tot += loss.item(); n += 1
    return tot/max(n,1)

def split_train_test(samples, test_ratio=0.2):
    n = len(samples); idx = np.arange(n); np.random.shuffle(idx)
    cut = int(n*(1-test_ratio))
    tr = [samples[i] for i in idx[:cut]]
    te = [samples[i] for i in idx[cut:]]
    return tr, te

def best_threshold_for_exact(probs_list, gold_list, grid=None):
    if grid is None: grid = np.linspace(0.3, 0.9, 61)
    best_thr, best_em, best_f1 = 0.5, -1.0, 0.0
    p_all = np.concatenate([t.numpy() for t in probs_list if t.numel()>0], axis=0)
    g_all = np.concatenate([t.numpy() for t in gold_list  if t.numel()>0], axis=0)
    for thr in grid:
        ems=[]
        for p,g in zip(probs_list, gold_list):
            if g.numel()==0: ems.append(1.0); continue
            ems.append(float(np.array_equal((p.numpy()>=thr).astype(int), g.numpy())))
        em = float(np.mean(ems))
        pred_all = (p_all>=thr).astype(int)
        P,R,F1,_ = precision_recall_fscore_support(g_all, pred_all, average='binary', zero_division=0)
        if em>best_em or (np.isclose(em,best_em) and F1>best_f1):
            best_thr, best_em, best_f1 = thr, em, F1
    print(f"[Exact-opt threshold] thr={best_thr:.3f} | exact={best_em:.3f} | boundaryF1={best_f1:.3f}")
    return best_thr

In [102]:
# =========================
# MODEL AND DECISION TREE SAVING/LOADING FUNCTIONS
# =========================
# These functions handle saving and loading trained models and Decision Trees
# to avoid retraining. Models are saved to a folder named after the notebook.

def generate_model_id(df, epochs, use_prior, fuse_mode, lambda_prior, lambda_k, 
                     batch_size, hparams, max_depth=6, min_leaf=8):
    """
    Generate a unique identifier for a model based on its training parameters.
    This ensures that models with the same parameters can be reused.
    
    Args:
        All training parameters that affect the model
        max_depth: Decision Tree max depth
        min_leaf: Decision Tree min samples per leaf
    
    Returns:
        A string identifier (hash) for the model
    """
    # Create a dictionary of all parameters
    params_dict = {
        'epochs': epochs,
        'use_prior': use_prior,
        'fuse_mode': fuse_mode,
        'lambda_prior': lambda_prior,
        'lambda_k': lambda_k,
        'batch_size': batch_size,
        'hparams': hparams,
        'max_depth': max_depth,
        'min_leaf': min_leaf,
        'df_shape': df.shape if df is not None else (0, 0)
    }
    
    # Convert to JSON string and hash it
    params_str = json.dumps(params_dict, sort_keys=True)
    model_id = hashlib.md5(params_str.encode()).hexdigest()[:16]
    return model_id

def save_dt_prior(dt_clf, dt_vec, model_id, models_folder=MODELS_FOLDER):
    """
    Save the Decision Tree prior (classifier and vectorizer).
    
    Args:
        dt_clf: Trained DecisionTreeClassifier
        dt_vec: Fitted DictVectorizer
        model_id: Unique identifier for this model
        models_folder: Folder to save models in
    """
    model_dir = os.path.join(models_folder, model_id)
    os.makedirs(model_dir, exist_ok=True)
    
    # Save Decision Tree classifier
    dt_clf_path = os.path.join(model_dir, "dt_clf.pkl")
    with open(dt_clf_path, "wb") as f:
        pickle.dump(dt_clf, f)
    
    # Save DictVectorizer
    dt_vec_path = os.path.join(model_dir, "dt_vec.pkl")
    with open(dt_vec_path, "wb") as f:
        pickle.dump(dt_vec, f)
    
    print(f"Decision Tree prior saved to {model_dir}")

def load_dt_prior(model_id, models_folder=MODELS_FOLDER):
    """
    Load the Decision Tree prior (classifier and vectorizer).
    
    Args:
        model_id: Unique identifier for the model
        models_folder: Folder where models are saved
    
    Returns:
        Tuple of (dt_clf, dt_vec) or (None, None) if not found
    """
    model_dir = os.path.join(models_folder, model_id)
    
    dt_clf_path = os.path.join(model_dir, "dt_clf.pkl")
    dt_vec_path = os.path.join(model_dir, "dt_vec.pkl")
    
    if not os.path.exists(dt_clf_path) or not os.path.exists(dt_vec_path):
        return None, None
    
    with open(dt_clf_path, "rb") as f:
        dt_clf = pickle.load(f)
    
    with open(dt_vec_path, "rb") as f:
        dt_vec = pickle.load(f)
    
    print(f"Decision Tree prior loaded from {model_dir}")
    return dt_clf, dt_vec

def save_model(model, vocab, out, model_id, models_folder=MODELS_FOLDER):
    """
    Save a trained model and its associated artifacts.
    
    Args:
        model: The trained PyTorch model
        vocab: Vocabulary dictionary
        out: Dictionary containing dt_clf, dt_vec, k_teacher, best_thr, etc.
        model_id: Unique identifier for this model
        models_folder: Folder to save models in
    """
    model_dir = os.path.join(models_folder, model_id)
    os.makedirs(model_dir, exist_ok=True)
    
    # Save model state
    model_path = os.path.join(model_dir, "model.pt")
    torch.save(model.state_dict(), model_path)
    
    # Save vocabulary
    vocab_path = os.path.join(model_dir, "vocab.pkl")
    with open(vocab_path, "wb") as f:
        pickle.dump(vocab, f)
    
    # Save Decision Tree prior separately (if present)
    if out.get("dt_clf") is not None and out.get("dt_vec") is not None:
        save_dt_prior(out["dt_clf"], out["dt_vec"], model_id, models_folder)
    
    # Save other artifacts (k_teacher, best_thr, etc.) - exclude dt_clf/dt_vec as they're saved separately
    artifacts = {k: v for k, v in out.items() if k not in ["dt_clf", "dt_vec"]}
    artifacts_path = os.path.join(model_dir, "artifacts.pkl")
    with open(artifacts_path, "wb") as f:
        pickle.dump(artifacts, f)
    
    # Save metadata (parameters used)
    metadata_path = os.path.join(model_dir, "metadata.json")
    with open(metadata_path, "w") as f:
        json.dump({
            'model_id': model_id,
            'vocab_size': len(vocab),
            'model_name': MODEL_NAME
        }, f, indent=2)
    
    print(f"Model saved to {model_dir}")
    return model_dir

def load_model(model_id, models_folder=MODELS_FOLDER, vocab_size=None):
    """
    Load a trained model and its associated artifacts.
    
    Args:
        model_id: Unique identifier for the model
        models_folder: Folder where models are saved
        vocab_size: Vocabulary size (needed to reconstruct model architecture)
    
    Returns:
        Dictionary with 'vocab', 'out', 'dt_clf', 'dt_vec', 'model_state_path', 'model_dir' or None if not found
    """
    model_dir = os.path.join(models_folder, model_id)
    
    if not os.path.exists(model_dir):
        return None
    
    # Load vocabulary
    vocab_path = os.path.join(model_dir, "vocab.pkl")
    if not os.path.exists(vocab_path):
        return None
    
    with open(vocab_path, "rb") as f:
        vocab = pickle.load(f)
    
    # Load Decision Tree prior
    dt_clf, dt_vec = load_dt_prior(model_id, models_folder)
    
    # Load other artifacts
    artifacts_path = os.path.join(model_dir, "artifacts.pkl")
    if not os.path.exists(artifacts_path):
        return None
    
    with open(artifacts_path, "rb") as f:
        artifacts = pickle.load(f)
    
    # Combine artifacts with DT prior
    out = {**artifacts, "dt_clf": dt_clf, "dt_vec": dt_vec}
    
    # Load model state
    model_path = os.path.join(model_dir, "model.pt")
    if not os.path.exists(model_path):
        return None
    
    print(f"Model artifacts loaded from {model_dir}")
    return {
        'vocab': vocab,
        'out': out,
        'dt_clf': dt_clf,
        'dt_vec': dt_vec,
        'model_state_path': model_path,
        'model_dir': model_dir
    }


In [103]:
# ===================================================================
# MAIN TRAINING FUNCTION WITH MODEL AND DECISION TREE CHECKPOINTING
# ===================================================================
# This function trains a morphology parser model. It checks if a model with
# the same parameters already exists and loads it instead of retraining.
# It also saves/loads the Decision Tree prior to avoid retraining it.

def run_segmentation_with_privK(
    df,
    epochs=15,
    use_prior=True,
    fuse_mode="logit_add",
    lambda_prior=0.1,     # DT prior distillation weight
    lambda_k=0.2,         # privileged K-regularizer weight (try 0.1~0.4)
    batch_size=64,
    hparams=None,
    max_depth=6,          # Decision Tree max depth
    min_leaf=8            # Decision Tree min samples per leaf
):
    """
    Train or load a morphology parser model with Decision Tree priors.
    
    This function will:
    1. Check if a model with the same parameters already exists
    2. If found, load it and return it (skipping training)
    3. If not found, train a new model and save it
    4. Also saves/loads the Decision Tree prior separately
    
    Args:
        df: Training DataFrame
        epochs: Number of training epochs
        use_prior: Whether to use Decision Tree prior
        fuse_mode: How to fuse prior with model predictions
        lambda_prior: Weight for prior distillation loss
        lambda_k: Weight for K-regularizer loss
        batch_size: Training batch size
        hparams: Model hyperparameters dictionary
        max_depth: Decision Tree maximum depth
        min_leaf: Decision Tree minimum samples per leaf
    
    Returns:
        Tuple of (model, vocab, out_dict)
    """
    if hparams is None:
        hparams = dict(emb_dim=16, hidden_size=64, num_layers=2,
                       dropout=0.25, lr=1e-3, weight_decay=1e-4, freeze_emb=False)
    
    # Generate model identifier based on parameters
    model_id = generate_model_id(
        df, epochs, use_prior, fuse_mode, lambda_prior, lambda_k, 
        batch_size, hparams, max_depth=max_depth, min_leaf=min_leaf
    )
    
    # Try to load existing model
    print(f"Checking for existing model with ID: {model_id}")
    loaded = load_model(model_id, models_folder=MODELS_FOLDER)
    
    if loaded is not None:
        print(f"✅ Found existing model! Loading from {loaded['model_dir']}")
        # Reconstruct model architecture
        vocab = loaded['vocab']
        out = loaded['out']
        dt_clf = loaded['dt_clf']
        dt_vec = loaded['dt_vec']
        model_state_path = loaded['model_state_path']
        
        model = BiLSTMTagger(
            vocab_size=len(vocab),
            emb_dim=hparams.get("emb_dim", 16),
            hidden_size=hparams.get("hidden_size", 64),
            num_layers=hparams.get("num_layers", 2),
            use_prior=(use_prior and fuse_mode!="none"),
            dropout=hparams.get("dropout", 0.25),
            freeze_emb=hparams.get("freeze_emb", False),
            fuse_mode=fuse_mode
        )
        
        # Load model weights
        model.load_state_dict(torch.load(model_state_path))
        model.eval()
        
        print("Model and Decision Tree loaded successfully. Skipping training.")
        return model, vocab, out
    
    # Model doesn't exist, need to train
    print(f"No existing model found. Training new model...")
    
    # Build samples with privileged numeric features
    samples = build_samples_with_priv(df, feat_names=NEW_NUM_FEATS)
    train_s, test_s = split_train_test(samples, 0.2)

    # DT prior (token-window) trained on TRAIN ONLY
    dt_clf, dt_vec = (None, None)
    if use_prior:
        # Check if DT prior exists separately (for cases where only DT needs to be reused)
        dt_clf, dt_vec = load_dt_prior(model_id, models_folder=MODELS_FOLDER)
        if dt_clf is None or dt_vec is None:
            print("Training new Decision Tree prior...")
            dt_clf, dt_vec = train_dt_prior(train_s, max_depth=max_depth, min_leaf=min_leaf)
        else:
            print("Using existing Decision Tree prior.")

    # K-teacher (privileged) on TRAIN ONLY
    feat_dim = len(NEW_NUM_FEATS)
    k_reg = train_k_teacher_priv(train_s, feat_dim=feat_dim)

    # Build vocabulary from training data
    vocab = build_vocab(train_s, min_freq=1)

    # Create datasets and data loaders
    train_ds = SegDataset(train_s, vocab, dt_clf, dt_vec, feat_dim=feat_dim)
    test_ds  = SegDataset(test_s,  vocab, dt_clf, dt_vec, feat_dim=feat_dim)
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True,  collate_fn=collate)
    test_loader  = DataLoader(test_ds,  batch_size=batch_size, shuffle=False, collate_fn=collate)

    # Initialize BiLSTM model
    model = BiLSTMTagger(
        vocab_size=len(vocab),
        emb_dim=hparams.get("emb_dim", 16),
        hidden_size=hparams.get("hidden_size", 64),
        num_layers=hparams.get("num_layers", 2),
        use_prior=(use_prior and fuse_mode!="none"),
        dropout=hparams.get("dropout", 0.25),
        freeze_emb=hparams.get("freeze_emb", False),
        fuse_mode=fuse_mode
    )

    # Initialize optimizer
    opt = torch.optim.AdamW(model.parameters(), lr=hparams.get("lr", 1e-3), weight_decay=hparams.get("weight_decay", 1e-4))

    # Training loop
    final_probs_list, final_gold_list = None, None
    for ep in range(1, epochs+1):
        loss = train_epoch(model, train_loader, opt, lambda_prior=lambda_prior, lambda_k=lambda_k, k_reg=k_reg)
        probs_list, gold_list = predict(model, test_loader)
        P,R,F1 = boundary_metrics_from_lists(probs_list, gold_list, thr=0.5)
        EM = exact_match_rate_from_lists(probs_list, gold_list, thr=0.5)
        print(f"Epoch {ep:02d} | loss={loss:.4f} | boundary P/R/F1={P:.3f}/{R:.3f}/{F1:.3f} | exact={EM:.3f}")
        final_probs_list, final_gold_list = probs_list, gold_list

    # Find best threshold for exact match rate
    best_thr = best_threshold_for_exact(final_probs_list, final_gold_list)

    # Prepare output dictionary
    out = {
        "probs_list": final_probs_list,
        "gold_list": final_gold_list,
        "dt_clf": dt_clf, 
        "dt_vec": dt_vec,
        "k_teacher": k_reg,
        "best_thr": best_thr
    }
    
    # Save the trained model and Decision Tree
    print(f"\nSaving trained model with ID: {model_id}")
    save_model(model, vocab, out, model_id, models_folder=MODELS_FOLDER)

    return model, vocab, out

In [104]:
def tokenize_with_vocab(word: str, vocab: dict, max_token_len: int = 4):
    i, toks = 0, []
    while i < len(word):
        matched = None
        Lmax = min(max_token_len, len(word)-i)
        for L in range(Lmax, 0, -1):
            seg = word[i:i+L]
            if seg in vocab:
                matched = seg; break
        toks.append(matched if matched else word[i])
        i += len(toks[-1])
    return toks

@torch.no_grad()
def segment_tokens(model, vocab, tokens, dt_clf=None, dt_vec=None, thr=0.5):
    ids = torch.tensor([[vocab.get(t, vocab["<UNK>"]) for t in tokens]], dtype=torch.long)
    mask_tok = torch.ones_like(ids, dtype=torch.bool)
    T = len(tokens)
    if T<=1: return "".join(tokens), np.array([])
    prior_list = prior_probs_for_sample(dt_clf, dt_vec, tokens)
    prior = torch.tensor([prior_list], dtype=torch.float32)
    logits = model(ids, prior, mask_tok)
    probs = torch.softmax(logits, dim=-1)[0, :, 1].cpu().numpy()
    cuts = (probs >= thr).astype(int)
    out=[]
    for i, tok in enumerate(tokens):
        out.append(tok)
        if i < T-1 and cuts[i]==1: out.append("-")
    return "".join(out), probs

In [105]:
best = {
  "emb_dim": 16, "hidden_size": 64, "num_layers": 2,
  "dropout": 0.25, "lr": 0.001, "weight_decay": 0.0001, "freeze_emb": False,
}

In [106]:
model, vocab, out = run_segmentation_with_privK(
    str_df,
    epochs=50,
    use_prior=True,          # keep DT window prior
    fuse_mode="logit_add",   # or "concat" / "none" / "logit_add"
    lambda_prior=0.1,        # DT prior distillation
    lambda_k=0.2,            # privileged K-regularizer (uses NEW_NUM_FEATS)
    batch_size=64,
    hparams=best
)

Checking for existing model with ID: b4157b221fb77816
Decision Tree prior loaded from models_DT-LSTM-MarkovFilter\b4157b221fb77816
Model artifacts loaded from models_DT-LSTM-MarkovFilter\b4157b221fb77816
✅ Found existing model! Loading from models_DT-LSTM-MarkovFilter\b4157b221fb77816
Model and Decision Tree loaded successfully. Skipping training.


In [107]:
word = "pikunas"
tokens = tokenize_with_vocab(word, vocab, max_token_len=4)
thr = out.get("best_thr", 0.5)
seg_string, boundary_probs = segment_tokens(model, vocab, tokens, dt_clf=out["dt_clf"], dt_vec=out["dt_vec"], thr=thr)
print("Tokens:", tokens)
print("Boundary probs:", np.round(boundary_probs, 3).tolist())
print(f"Segmentation (thr={thr:.3f}):", seg_string)

Tokens: ['p', 'i', 'k', 'u', 'n', 'a', 's']
Boundary probs: [0.0, 0.6949999928474426, 0.0, 0.9259999990463257, 0.0, 0.9929999709129333]
Segmentation (thr=0.430): pi-ku-na-s


In [108]:
import numpy as np
from typing import List, Set, Tuple

# ---------- helpers to turn segs into boundary sets (char offsets) ----------
def offsets_from_morphemes(morphs: List[str]) -> Set[int]:
    # boundaries after each morph except the last
    offs = []
    s = 0
    for i, m in enumerate(morphs):
        s += len(m)
        if i < len(morphs) - 1:
            offs.append(s)
    return set(offs)

def offsets_from_tokens_and_mask(tokens: List[str], mask01: np.ndarray) -> Set[int]:
    # boundaries after token i where mask01[i]==1, measured in character offsets
    offs = set()
    cum = 0
    for i, t in enumerate(tokens):
        cum += len(t)
        if i < len(tokens) - 1 and mask01[i] == 1:
            offs.add(cum)
    return offs

def f1_from_sets(pred: Set[int], gold: Set[int]) -> Tuple[float, float, float, int, int, int]:
    tp = len(pred & gold)
    fp = len(pred - gold)
    fn = len(gold - pred)
    P = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    R = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    F1 = 2 * P * R / (P + R) if (P + R) > 0 else 0.0
    return P, R, F1, tp, fp, fn

def normalize_gold_variants(gold_variants):
    """
    Convert gold_variants to a list format, handling numpy arrays and nested structures.
    """
    if gold_variants is None:
        return []
    
    # If it's a numpy array, convert to list
    if isinstance(gold_variants, np.ndarray):
        gold_variants = gold_variants.tolist()
    
    # If it's already a list, ensure nested elements are also lists (not numpy arrays)
    if isinstance(gold_variants, list):
        normalized = []
        for variant in gold_variants:
            if isinstance(variant, np.ndarray):
                normalized.append(variant.tolist())
            elif isinstance(variant, list):
                # Recursively normalize nested lists
                normalized.append([item.tolist() if isinstance(item, np.ndarray) else item for item in variant])
            else:
                normalized.append(variant)
        return normalized
    
    return []

# ===================================================================
# NEW CODE: Suffix Validator Function
# ===================================================================

def is_segmentation_valid(
    segmentation: list[str],
    allowed_suffixes: set[str]
) -> bool:
    """
    Checks if a segmentation is valid based on a list of allowed suffixes.

    The first morpheme is assumed to be the root and is ignored. All subsequent
    morphemes must be in the `allowed_suffixes` set.

    Args:
        segmentation (list[str]): The predicted segmentation, e.g., ['pay', 'kunaq'].
        allowed_suffixes (set[str]): A set of valid suffix strings.

    Returns:
        bool: True if the segmentation is valid, False otherwise.
    """
    if len(segmentation) <= 1:
        # A word with no splits is always valid.
        return True

    # Check every morpheme starting from the second one.
    for morpheme in segmentation[1:]:
        if morpheme not in allowed_suffixes:
            return False  # Found a suffix that is not in the allowed list.

    return True

# ---------- main evaluation ----------
# ===================================================================
# MODIFIED CODE: Evaluation function with a rejection step
# ===================================================================
def evaluate_with_rejection(
    df, model, vocab, out,
    allowed_suffixes: list[str], # <-- New required argument
    max_token_len=4,
    use_tuned_thr=True,
    show_sample=5
):
    dt_clf, dt_vec = out["dt_clf"], out["dt_vec"]
    thr = float(out.get("best_thr", 0.5)) if use_tuned_thr else 0.5

    # Convert the list to a set for fast lookups
    allowed_suffixes_set = set(allowed_suffixes)

    total_tp = total_fp = total_fn = 0
    exact_hits = 0
    n_eval = 0
    rejection_count = 0  # <-- Counter for rejected predictions
    examples = []

    for _, row in df.iterrows():
        word = str(row["Word"])
        gold_variants = row["Gold"]

        # Normalize gold_variants (convert numpy arrays to lists)
        gold_variants = normalize_gold_variants(gold_variants)

        if not isinstance(gold_variants, list) or len(gold_variants) == 0:
            continue

        # 1. Get the model's prediction
        toks = tokenize_with_vocab(word, vocab, max_token_len=max_token_len)
        seg_string, probs = segment_tokens(model, vocab, toks, dt_clf=dt_clf, dt_vec=dt_vec, thr=thr)
        predicted_morphs = seg_string.split('-')

        # 2. Validate the prediction using the suffix list
        if is_segmentation_valid(predicted_morphs, allowed_suffixes_set):
            # VALID: Score it normally
            mask01 = (probs >= thr).astype(int)
            pred_set = offsets_from_tokens_and_mask(toks, mask01)
        else:
            # REJECTED: Treat as a complete failure (no boundaries found)
            rejection_count += 1
            pred_set = set() # An empty set means 0 true positives and 0 false positives.

        # 3. Compare with gold standard (this part is the same)
        gold_sets = [offsets_from_morphemes(gv) for gv in gold_variants]
        if any(pred_set == gs for gs in gold_sets):
            exact_hits += 1

        best = max((f1_from_sets(pred_set, gs) + (gs,) for gs in gold_sets), key=lambda z: z[2])
        P, R, F1, tp, fp, fn, best_gs = best

        total_tp += tp
        total_fp += fp
        total_fn += fn
        n_eval += 1

        if len(examples) < show_sample:
            # reconstruct a nice gold string for the best variant
            best_morphs = None
            for gv in gold_variants:
                if offsets_from_morphemes(gv) == best_gs:
                    best_morphs = gv; break
            gold_str = "-".join(best_morphs) if best_morphs else "(ambig)"
            examples.append({
                "word": word,
                "tokens": toks,
                "pred_seg": seg_string,
                "gold_best": gold_str,
                "P": round(P,3), "R": round(R,3), "F1": round(F1,3)
            })

    # --- Final Metrics ---
    micro_P = total_tp / (total_tp + total_fp) if (total_tp + total_fp) > 0 else 0.0
    micro_R = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0.0
    micro_F1 = 2 * micro_P * micro_R / (micro_P + micro_R) if (micro_P + micro_R) > 0 else 0.0
    exact_rate = exact_hits / n_eval if n_eval > 0 else 0.0

    print(f"Evaluated {n_eval} words")
    print(f"Predictions Rejected by Suffix Validator: {rejection_count} ({rejection_count/n_eval:.2%})")
    print("--- Final Scores (with rejections counted as failures) ---")
    print(f"Boundary (micro)  P/R/F1 = {micro_P:.3f}/{micro_R:.3f}/{micro_F1:.3f}")
    print(f"Exact-match rate  = {exact_rate:.3f}")
    if examples:
        print("\nSample predictions:")
        for ex in examples:
            print(f"- {ex['word']}\n  tokens: {ex['tokens']}\n  pred  : {ex['pred_seg']}\n  gold  : {ex['gold_best']}\n  P/R/F1: {ex['P']}/{ex['R']}/{ex['F1']}\n")

    # ... (code for printing examples remains the same) ...
    return { "micro_f1": micro_F1, "exact_match_rate": exact_rate, "rejection_count": rejection_count }

# ===================================================================
# REVISED CODE: Evaluation function that ignores rejected predictions
# ===================================================================

def evaluate_and_ignore_rejected(
    df, model, vocab, out,
    allowed_suffixes: list[str], # Required argument for the validator
    max_token_len=4,
    use_tuned_thr=True,
    show_sample=5
):
    dt_clf, dt_vec = out["dt_clf"], out["dt_vec"]
    thr = float(out.get("best_thr", 0.5)) if use_tuned_thr else 0.5
    allowed_suffixes_set = set(allowed_suffixes)

    total_tp = total_fp = total_fn = 0
    exact_hits = 0
    
    n_total_words = 0      # Counts all words we attempt to evaluate
    n_evaluated_words = 0  # Counts only words with valid, scored predictions
    rejection_count = 0
    examples = []

    for _, row in df.iterrows():
        word = str(row["Word"])
        gold_variants = row["Gold"]

        # Normalize gold_variants (convert numpy arrays to lists)
        gold_variants = normalize_gold_variants(gold_variants)

        if not isinstance(gold_variants, list) or len(gold_variants) == 0:
            continue
        
        n_total_words += 1

        # 1. Get the model's prediction
        toks = tokenize_with_vocab(word, vocab, max_token_len=max_token_len)
        seg_string, probs = segment_tokens(model, vocab, toks, dt_clf=dt_clf, dt_vec=dt_vec, thr=thr)
        predicted_morphs = seg_string.split('-')

        # 2. Validate the prediction. If invalid, ignore this row and continue.
        if not is_segmentation_valid(predicted_morphs, allowed_suffixes_set):
            rejection_count += 1
            continue  # <-- KEY CHANGE: Skip the rest of the loop for this word

        # --- If we reach this point, the prediction is valid and will be scored ---
        n_evaluated_words += 1
        
        # 3. Score the valid prediction
        mask01 = (probs >= thr).astype(int)
        pred_set = offsets_from_tokens_and_mask(toks, mask01)
        gold_sets = [offsets_from_morphemes(gv) for gv in gold_variants]

        if any(pred_set == gs for gs in gold_sets):
            exact_hits += 1

        best = max((f1_from_sets(pred_set, gs) + (gs,) for gs in gold_sets), key=lambda z: z[2])
        P, R, F1, tp, fp, fn, best_gs = best

        total_tp += tp
        total_fp += fp
        total_fn += fn

        if len(examples) < show_sample:
            # reconstruct a nice gold string for the best variant
            best_morphs = None
            for gv in gold_variants:
                if offsets_from_morphemes(gv) == best_gs:
                    best_morphs = gv; break
            gold_str = "-".join(best_morphs) if best_morphs else "(ambig)"
            examples.append({
                "word": word,
                "tokens": toks,
                "pred_seg": seg_string,
                "gold_best": gold_str,
                "P": round(P,3), "R": round(R,3), "F1": round(F1,3)
            })

    # --- Final Metrics ---
    # Note: Denominators now use n_evaluated_words, which is smaller than n_total_words
    micro_P = total_tp / (total_tp + total_fp) if (total_tp + total_fp) > 0 else 0.0
    micro_R = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0.0
    micro_F1 = 2 * micro_P * micro_R / (micro_P + micro_R) if (micro_P + micro_R) > 0 else 0.0
    exact_rate = exact_hits / n_evaluated_words if n_evaluated_words > 0 else 0.0

    print(f"Attempted to evaluate {n_total_words} words")
    print(f"Predictions Rejected by Suffix Validator: {rejection_count} ({rejection_count/n_total_words:.2%})")
    print(f"Final scores are based on the remaining {n_evaluated_words} valid predictions.")
    print("--- Final Scores (on non-rejected predictions only) ---")
    print(f"Boundary (micro)  P/R/F1 = {micro_P:.3f}/{micro_R:.3f}/{micro_F1:.3f}")
    print(f"Exact-match rate  = {exact_rate:.3f}")

    if examples:
        print("\nSample predictions:")
        for ex in examples:
            print(f"- {ex['word']}\n  tokens: {ex['tokens']}\n  pred  : {ex['pred_seg']}\n  gold  : {ex['gold_best']}\n  P/R/F1: {ex['P']}/{ex['R']}/{ex['F1']}\n")
    return { "micro_f1": micro_F1, "exact_match_rate": exact_rate, "rejection_count": rejection_count }

In [109]:
# Remove words with length > 16
# acc_df = acc_df[acc_df['Word'].str.len() <= 14].reset_index(drop=True)

# Remove rows where all gold variants have only one morpheme
# acc_df = acc_df[acc_df['Gold'].apply(lambda variants: any(len(variant) > 1 for variant in variants))].reset_index(drop=True)

In [110]:
acc_df.head()

Unnamed: 0,Word,Gold,Morph_split,Morph_split_str,Filename
0,unupas,"[[unu, pas]]","[unu, pas]",unu pas,For_Annotation_1_LS.csv
1,umankus,"[[uma, nku, s]]","[uma, nku, s]",uma nku s,For_Annotation_1_LS.csv
2,hikurin,"[[hikuri, n]]","[hikuri, n]",hikuri n,For_Annotation_1_LS.csv
3,sutipi,"[[suti, pi]]","[suti, pi]",suti pi,For_Annotation_1_LS.csv
4,pikunas,"[[pi, kuna, s]]","[pi, kuna, s]",pi kuna s,For_Annotation_1_LS.csv


In [111]:
def read_suffixes(filename):
    suffixes = []
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            # Split into number and suffix part
            parts = line.split(maxsplit=1)
            if len(parts) == 2:
                _, suffix = parts
                suffixes.append(suffix[1:])
    return suffixes


filename = "data\suffixesCQ-Anettte-Rios_LS.txt"  # <-- replace with your file name
suffix_list = read_suffixes(filename)

In [112]:
# 3. Call the NEW evaluation function with your suffix list
print("\n--- Evaluating with Post-Processing Rejection Filter ---")
results_with_rejection = evaluate_with_rejection(
    acc_df,              # The test dataframe
    model, vocab, out,   # The trained model and its artifacts
    allowed_suffixes=suffix_list, # Your list of rules!
    show_sample=8
)


--- Evaluating with Post-Processing Rejection Filter ---
Evaluated 913 words
Predictions Rejected by Suffix Validator: 265 (29.03%)
--- Final Scores (with rejections counted as failures) ---
Boundary (micro)  P/R/F1 = 0.838/0.603/0.702
Exact-match rate  = 0.470

Sample predictions:
- unupas
  tokens: ['u', 'n', 'u', 'p', 'a', 's']
  pred  : unupa-s
  gold  : unu-pas
  P/R/F1: 0.0/0.0/0.0

- umankus
  tokens: ['u', 'm', 'a', 'n', 'k', 'u', 's']
  pred  : uma-nku-s
  gold  : uma-nku-s
  P/R/F1: 1.0/1.0/1.0

- hikurin
  tokens: ['h', 'i', 'k', 'u', 'r', 'i', 'n']
  pred  : hiku-ri-n
  gold  : hikuri-n
  P/R/F1: 0.5/1.0/0.667

- sutipi
  tokens: ['s', 'u', 't', 'i', 'p', 'i']
  pred  : suti-pi
  gold  : suti-pi
  P/R/F1: 1.0/1.0/1.0

- pikunas
  tokens: ['p', 'i', 'k', 'u', 'n', 'a', 's']
  pred  : pi-ku-na-s
  gold  : pi-kuna-s
  P/R/F1: 0.667/1.0/0.8

- atipaq
  tokens: ['a', 't', 'i', 'p', 'a', 'q']
  pred  : atipaq
  gold  : ati-paq
  P/R/F1: 0.0/0.0/0.0

- tomani
  tokens: ['t', 'o',

In [113]:
# 3. Call the NEW evaluation function with your suffix list
print("\n--- Evaluating with Post-Processing Rejection Filter ---")
results_with_rejection = evaluate_and_ignore_rejected(
    acc_df,              # The test dataframe
    model, vocab, out,   # The trained model and its artifacts
    allowed_suffixes=suffix_list, # Your list of rules!
    show_sample=8
)


--- Evaluating with Post-Processing Rejection Filter ---
Attempted to evaluate 913 words
Predictions Rejected by Suffix Validator: 265 (29.03%)
Final scores are based on the remaining 648 valid predictions.
--- Final Scores (on non-rejected predictions only) ---
Boundary (micro)  P/R/F1 = 0.838/0.899/0.867
Exact-match rate  = 0.648

Sample predictions:
- unupas
  tokens: ['u', 'n', 'u', 'p', 'a', 's']
  pred  : unupa-s
  gold  : unu-pas
  P/R/F1: 0.0/0.0/0.0

- umankus
  tokens: ['u', 'm', 'a', 'n', 'k', 'u', 's']
  pred  : uma-nku-s
  gold  : uma-nku-s
  P/R/F1: 1.0/1.0/1.0

- hikurin
  tokens: ['h', 'i', 'k', 'u', 'r', 'i', 'n']
  pred  : hiku-ri-n
  gold  : hikuri-n
  P/R/F1: 0.5/1.0/0.667

- sutipi
  tokens: ['s', 'u', 't', 'i', 'p', 'i']
  pred  : suti-pi
  gold  : suti-pi
  P/R/F1: 1.0/1.0/1.0

- pikunas
  tokens: ['p', 'i', 'k', 'u', 'n', 'a', 's']
  pred  : pi-ku-na-s
  gold  : pi-kuna-s
  P/R/F1: 0.667/1.0/0.8

- atipaq
  tokens: ['a', 't', 'i', 'p', 'a', 'q']
  pred  : atipa

In [114]:
import numpy as np
from typing import List, Set, Tuple

# ---------- helpers to turn segs into boundary sets (char offsets) ----------
def offsets_from_morphemes(morphs: List[str]) -> Set[int]:
    # boundaries after each morph except the last
    offs = []
    s = 0
    for i, m in enumerate(morphs):
        s += len(m)
        if i < len(morphs) - 1:
            offs.append(s)
    return set(offs)

def offsets_from_tokens_and_mask(tokens: List[str], mask01: np.ndarray) -> Set[int]:
    # boundaries after token i where mask01[i]==1, measured in character offsets
    offs = set()
    cum = 0
    for i, t in enumerate(tokens):
        cum += len(t)
        if i < len(tokens) - 1 and mask01[i] == 1:
            offs.add(cum)
    return offs

def f1_from_sets(pred: Set[int], gold: Set[int]) -> Tuple[float, float, float, int, int, int]:
    tp = len(pred & gold)
    fp = len(pred - gold)
    fn = len(gold - pred)
    P = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    R = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    F1 = 2 * P * R / (P + R) if (P + R) > 0 else 0.0
    return P, R, F1, tp, fp, fn

def normalize_gold_variants(gold_variants):
    """
    Convert gold_variants to a list format, handling numpy arrays and nested structures.
    """
    if gold_variants is None:
        return []
    
    # If it's a numpy array, convert to list
    if isinstance(gold_variants, np.ndarray):
        gold_variants = gold_variants.tolist()
    
    # If it's already a list, ensure nested elements are also lists (not numpy arrays)
    if isinstance(gold_variants, list):
        normalized = []
        for variant in gold_variants:
            if isinstance(variant, np.ndarray):
                normalized.append(variant.tolist())
            elif isinstance(variant, list):
                # Recursively normalize nested lists
                normalized.append([item.tolist() if isinstance(item, np.ndarray) else item for item in variant])
            else:
                normalized.append(variant)
        return normalized
    
    return []

# ---------- main evaluation ----------
def evaluate_on_gold_df(df, model, vocab, out, max_token_len=4, use_tuned_thr=True, show_sample=5):
    dt_clf, dt_vec = out["dt_clf"], out["dt_vec"]
    thr = float(out.get("best_thr", 0.5)) if use_tuned_thr else 0.5

    total_tp = total_fp = total_fn = 0
    exact_hits = 0
    n_eval = 0
    examples = []

    for _, row in df.iterrows():
        word = str(row["Word"])
        gold_variants = row["Gold"]  # e.g., [['pi','kuna','s'], ['pi','ku','nas']]

        # Normalize gold_variants (convert numpy arrays to lists)
        gold_variants = normalize_gold_variants(gold_variants)

        # skip if no gold
        if not isinstance(gold_variants, list) or len(gold_variants) == 0:
            continue

        # tokenize & predict
        toks = tokenize_with_vocab(word, vocab, max_token_len=max_token_len)
        seg_string, probs = segment_tokens(model, vocab, toks, dt_clf=dt_clf, dt_vec=dt_vec, thr=thr)
        mask01 = (probs >= thr).astype(int)
        pred_set = offsets_from_tokens_and_mask(toks, mask01)

        # build gold sets for all variants
        gold_sets = [offsets_from_morphemes(gv) for gv in gold_variants]

        # exact match if we match ANY gold variant
        if any(pred_set == gs for gs in gold_sets):
            exact_hits += 1

        # choose the gold variant that gives best F1 for this word
        best = max((f1_from_sets(pred_set, gs) + (gs,) for gs in gold_sets), key=lambda z: z[2])
        P, R, F1, tp, fp, fn, best_gs = best

        total_tp += tp; total_fp += fp; total_fn += fn
        n_eval += 1

        if len(examples) < show_sample:
            # reconstruct a nice gold string for the best variant
            best_morphs = None
            for gv in gold_variants:
                if offsets_from_morphemes(gv) == best_gs:
                    best_morphs = gv; break
            gold_str = "-".join(best_morphs) if best_morphs else "(ambig)"
            examples.append({
                "word": word,
                "tokens": toks,
                "pred_seg": seg_string,
                "gold_best": gold_str,
                "P": round(P,3), "R": round(R,3), "F1": round(F1,3)
            })

    # micro metrics
    micro_P = total_tp / (total_tp + total_fp) if (total_tp + total_fp) > 0 else 0.0
    micro_R = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0.0
    micro_F1 = 2 * micro_P * micro_R / (micro_P + micro_R) if (micro_P + micro_R) > 0 else 0.0
    exact_rate = exact_hits / n_eval if n_eval > 0 else 0.0

    print(f"Evaluated {n_eval} words")
    print(f"Boundary (micro)  P/R/F1 = {micro_P:.3f}/{micro_R:.3f}/{micro_F1:.3f}")
    print(f"Exact-match rate  = {exact_rate:.3f}")
    if examples:
        print("\nSample predictions:")
        for ex in examples:
            print(f"- {ex['word']}\n  tokens: {ex['tokens']}\n  pred  : {ex['pred_seg']}\n  gold  : {ex['gold_best']}\n  P/R/F1: {ex['P']}/{ex['R']}/{ex['F1']}\n")

    return {
        "n_eval": n_eval,
        "micro_precision": micro_P,
        "micro_recall": micro_R,
        "micro_f1": micro_F1,
        "exact_match_rate": exact_rate,
        "examples": examples
    }

In [115]:
results = evaluate_on_gold_df(
    acc_df,                     # your concatenated DataFrame with Word + Gold (list of variants)
    model, vocab, out,      # from training
    max_token_len=4,        # must match your tokenize scheme
    use_tuned_thr=True,     # use the best threshold found on dev
    show_sample=8           # print a few qualitative examples
)

Evaluated 913 words
Boundary (micro)  P/R/F1 = 0.787/0.845/0.815
Exact-match rate  = 0.541

Sample predictions:
- unupas
  tokens: ['u', 'n', 'u', 'p', 'a', 's']
  pred  : unupa-s
  gold  : unu-pas
  P/R/F1: 0.0/0.0/0.0

- umankus
  tokens: ['u', 'm', 'a', 'n', 'k', 'u', 's']
  pred  : uma-nku-s
  gold  : uma-nku-s
  P/R/F1: 1.0/1.0/1.0

- hikurin
  tokens: ['h', 'i', 'k', 'u', 'r', 'i', 'n']
  pred  : hiku-ri-n
  gold  : hikuri-n
  P/R/F1: 0.5/1.0/0.667

- sutipi
  tokens: ['s', 'u', 't', 'i', 'p', 'i']
  pred  : suti-pi
  gold  : suti-pi
  P/R/F1: 1.0/1.0/1.0

- pikunas
  tokens: ['p', 'i', 'k', 'u', 'n', 'a', 's']
  pred  : pi-ku-na-s
  gold  : pi-kuna-s
  P/R/F1: 0.667/1.0/0.8

- atipaq
  tokens: ['a', 't', 'i', 'p', 'a', 'q']
  pred  : atipaq
  gold  : ati-paq
  P/R/F1: 0.0/0.0/0.0

- tomani
  tokens: ['t', 'o', 'm', 'a', 'n', 'i']
  pred  : toma-ni
  gold  : toma-ni
  P/R/F1: 1.0/1.0/1.0

- rantiq
  tokens: ['r', 'a', 'n', 't', 'i', 'q']
  pred  : rantiq
  gold  : ranti-q
  P/R/F

In [118]:
# ===================================================================
# DEMONSTRATION: Decision Tree Prior Processing with Actual Model
# ===================================================================
import numpy as np

def prior_probs_for_sample_verbose(clf, vec, tokens):
    """
    Get boundary probabilities from Decision Tree with detailed verbose output
    showing feature extraction and prediction process.
    """
    if clf is None or vec is None or len(tokens) <= 1:
        return [0.5] * (max(len(tokens)-1, 0))
    
    print(f"\n{'='*70}")
    print(f"DT PRIOR PROCESSING: '{''.join(tokens)}' (tokens: {tokens})")
    print(f"{'='*70}")
    
    # Show DT configuration
    print(f"\nDecision Tree Configuration:")
    print(f"  Number of nodes: {clf.tree_.node_count}")
    print(f"  Max depth: {clf.tree_.max_depth}")
    print(f"  Number of features: {clf.n_features_in_}")
    
    # Extract features for each boundary position
    print(f"\n{'─'*70}")
    print("FEATURE EXTRACTION - For each boundary position")
    print(f"{'─'*70}")
    
    Xd = []
    for i in range(len(tokens)-1):
        feats = featurize_window(tokens, i)
        Xd.append(feats)
        
        print(f"\n  Position {i} (boundary after token '{tokens[i]}'):")
        print(f"    Left context: L1='{feats.get('L1', 'N/A')}', L2='{feats.get('L2', 'N/A')}'")
        print(f"    Right context: R1='{feats.get('R1', 'N/A')}', R2='{feats.get('R2', 'N/A')}'")
        print(f"    CV patterns: L1_cv='{feats.get('L1_cv', 'N/A')}', R1_cv='{feats.get('R1_cv', 'N/A')}'")
        print(f"    Characters: L1_last='{feats.get('L1_last', 'N/A')}', R1_first='{feats.get('R1_first', 'N/A')}'")
    
    # Convert to feature matrix
    X = vec.transform(Xd)
    
    print(f"\n{'─'*70}")
    print("DECISION TREE PREDICTION")
    print(f"{'─'*70}")
    
    # Get probability predictions
    proba = clf.predict_proba(X)  # Returns [P(no_boundary), P(boundary)]
    
    print(f"\n  For each boundary position, DT outputs:")
    print(f"    [P(no_boundary), P(boundary)]")
    
    priors = []
    for i in range(len(tokens)-1):
        p_no_boundary = proba[i, 0]
        p_boundary = proba[i, 1]
        priors.append(p_boundary)
        
        print(f"\n  Position {i} (after '{tokens[i]}'):")
        print(f"    P(no_boundary) = {p_no_boundary:.4f}")
        print(f"    P(boundary) = {p_boundary:.4f}")
        
        # Show which path through the tree (simplified)
        # We can't easily show the full path, but we can show the prediction
        prediction = clf.predict(X[i:i+1])[0]
        print(f"    Prediction: {'BOUNDARY' if prediction == 1 else 'NO BOUNDARY'}")
    
    # Final summary
    print(f"\n{'─'*70}")
    print("FINAL OUTPUT")
    print(f"{'─'*70}")
    print(f"Word: '{''.join(tokens)}'")
    print(f"Tokens: {tokens}")
    print(f"\nBoundary probabilities:")
    print(f"  {' '.join([f'{p:.3f}' for p in priors])}")
    print(f"\nVisualization:")
    print(f"  {' '.join(tokens)}")
    print(f"  {' '.join([' ' if p < 0.3 else '|' if p < 0.7 else '||' for p in priors])}")
    print(f"  {' '.join([f'{p:.2f}' for p in priors])}")
    
    return priors

# Try to use existing model if available
try:
    if 'out' in globals() and 'dt_clf' in out and 'dt_vec' in out and 'model' in globals() and 'vocab' in globals():
        dt_clf = out['dt_clf']
        dt_vec = out['dt_vec']
        thr = out.get("best_thr", 0.5)
        print("✅ Using existing Decision Tree prior from loaded model")
        
        # Process words from dataset - only output for correct segmentations
        # Try words from acc_df first, then gold_df if needed
        max_words_to_show = 3  # Maximum number of correct segmentations to display
        correct_count = 0
        total_count = 0
        
        # Combine words from both dataframes (acc_df first, then gold_df)
        words_to_try = []
        if 'acc_df' in globals() and len(acc_df) > 0:
            words_to_try.extend(acc_df['Word'].tolist())
        if 'gold_df' in globals() and len(gold_df) > 0:
            # Add words from gold_df that aren't already in the list
            gold_words = gold_df['Word'].tolist()
            words_to_try.extend([w for w in gold_words if w not in words_to_try])
        
        if len(words_to_try) == 0:
            print("⚠️  No words found in acc_df or gold_df")
        else:
            print(f"🔍 Searching through {len(words_to_try)} words for correct segmentations...")
            
            for word in words_to_try:
                try:
                    total_count += 1
                    
                    # Get predicted segmentation
                    tokens = tokenize_with_vocab(word, vocab, max_token_len=4)
                    seg_string, probs = segment_tokens(model, vocab, tokens, dt_clf=dt_clf, dt_vec=dt_vec, thr=thr)
                    predicted_morphs = seg_string.split('-')
                    
                    # Normalize predicted morphs to lowercase
                    pred_normalized = [m.lower().strip() for m in predicted_morphs if m.strip()]
                    
                    # Get gold segmentation from acc_df (test data)
                    gold_row = acc_df[acc_df['Word'] == word] if 'acc_df' in globals() else pd.DataFrame()
                    if len(gold_row) == 0:
                        # Try gold_df as fallback
                        gold_row = gold_df[gold_df['Word'] == word] if 'gold_df' in globals() else pd.DataFrame()
                        if len(gold_row) == 0:
                            continue  # Skip silently if word not found
                        # Use Morph_split from gold_df
                        gold_morphs = gold_row['Morph_split'].iloc[0]
                        if not isinstance(gold_morphs, list):
                            gold_morphs = list(gold_morphs) if hasattr(gold_morphs, '__iter__') else [str(gold_morphs)]
                        gold_variants = [gold_morphs]
                    else:
                        # Use Gold column from acc_df (list of variants)
                        gold_variants_raw = gold_row['Gold'].iloc[0]
                        # Normalize gold_variants (handle numpy arrays, nested structures)
                        gold_variants = normalize_gold_variants(gold_variants_raw)
                        if not isinstance(gold_variants, list) or len(gold_variants) == 0:
                            continue  # Skip silently if no valid gold variants
                    
                    # Check if prediction matches any gold variant exactly
                    is_correct = False
                    matched_gold = None
                    for gold_variant in gold_variants:
                        if not isinstance(gold_variant, list):
                            gold_variant = list(gold_variant) if hasattr(gold_variant, '__iter__') else [str(gold_variant)]
                        gold_normalized = [m.lower().strip() for m in gold_variant if m.strip()]
                        if pred_normalized == gold_normalized:
                            is_correct = True
                            matched_gold = gold_variant
                            break
                    
                    if is_correct:
                        correct_count += 1
                        # Only output verbose information for correct segmentations
                        priors = prior_probs_for_sample_verbose(dt_clf, dt_vec, tokens)
                        print(f"\n✅ CORRECT SEGMENTATION: '{word}'")
                        print(f"   Predicted: {seg_string}")
                        print(f"   Gold:      {'-'.join(matched_gold)}")
                        print("\n" + "="*70 + "\n")
                        
                        # Stop after finding max_words_to_show correct segmentations
                        if correct_count >= max_words_to_show:
                            break
                    # Silently skip incorrect segmentations
                            
                except Exception as e:
                    # Silently skip errors, continue to next word
                    continue
            
            print(f"\n📊 Summary: Found {correct_count} correct segmentation(s) out of {total_count} words checked.")
            if correct_count == 0:
                print("   No correct segmentations found. Try checking more words or adjusting the threshold.")
        
    else:
        print("⚠️  No model found in memory.")
        print("\nTo use this demonstration:")
        print("1. First run your model training/loading cell")
        print("2. Then run this cell again")
        print("\nAlternatively, you can manually specify:")
        print("  dt_clf = out['dt_clf']")
        print("  dt_vec = out['dt_vec']")
        print("  tokens = ['p', 'i', 'k', 'u', 'n', 'a', 's']")
        print("  priors = prior_probs_for_sample_verbose(dt_clf, dt_vec, tokens)")
except NameError as e:
    print(f"❌ {e}")
    print("\nTo use this demonstration:")
    print("1. First run your model training/loading cell")
    print("2. Then run this cell again")

✅ Using existing Decision Tree prior from loaded model
🔍 Searching through 7713 words for correct segmentations...

DT PRIOR PROCESSING: 'umankus' (tokens: ['u', 'm', 'a', 'n', 'k', 'u', 's'])

Decision Tree Configuration:
  Number of nodes: 45
  Max depth: 6
  Number of features: 170

──────────────────────────────────────────────────────────────────────
FEATURE EXTRACTION - For each boundary position
──────────────────────────────────────────────────────────────────────

  Position 0 (boundary after token 'u'):
    Left context: L1='u', L2='<BOS>'
    Right context: R1='m', R2='a'
    CV patterns: L1_cv='V', R1_cv='C'
    Characters: L1_last='u', R1_first='m'

  Position 1 (boundary after token 'm'):
    Left context: L1='m', L2='u'
    Right context: R1='a', R2='n'
    CV patterns: L1_cv='C', R1_cv='V'
    Characters: L1_last='m', R1_first='a'

  Position 2 (boundary after token 'a'):
    Left context: L1='a', L2='m'
    Right context: R1='n', R2='k'
    CV patterns: L1_cv='V', R1_c