In [1]:
"""
SEGMENTER-MORFESSOR: BILSTM WITH MORFESSOR ENSEMBLE PRIORS
===========================================================

This notebook implements a grapheme-level BiLSTM model for morphological segmentation
of Quechua words that uses Morfessor ensemble models as an additional feature channel.
Unlike segmenter.ipynb which only uses BiLSTM, this notebook integrates unsupervised
Morfessor segmentations as boundary probability features.

Key Features:
- Grapheme-level tokenization (recognizes Quechua multigraphs like "ch", "ll", "rr")
- BiLSTM architecture with additional Morfessor feature channel
- Morfessor ensemble: trains N differently-seeded Morfessor Baseline models
- Boundary probabilities from Morfessor ensemble are used as features alongside embeddings
- Binary classification: predicts boundary (1) or no boundary (0) at each grapheme position
- Comprehensive evaluation metrics (precision, recall, F1, exact match, split-count accuracy)
- Model checkpointing to avoid redundant training (saves both BiLSTM and Morfessor models)

Key Differences from segmenter.ipynb:
- Adds Morfessor ensemble boundary probabilities as an additional feature channel
- The BiLSTM model concatenates embeddings with Morfessor features before LSTM processing
- Morfessor models are trained unsupervised on word surface forms
- Both Morfessor ensemble and BiLSTM models are saved/loaded for reproducibility

All data is read from the 'data' folder and models are saved to the 'models_segmenter-morfessor' folder.
"""

import ast
import os
import json
import hashlib
import pickle
import pandas as pd

In [2]:
# =========================
# DATA FOLDER CONFIGURATION
# =========================
# All data files should be read from and saved to the data folder
DATA_FOLDER = "data"

# Model folder named after this notebook
MODEL_NAME = "segmenter-morfessor"
MODELS_FOLDER = f"models_{MODEL_NAME}"

# Create folders if they don't exist
os.makedirs(DATA_FOLDER, exist_ok=True)
os.makedirs(MODELS_FOLDER, exist_ok=True)

# =========================
# LOAD GOLD STANDARD DATA
# =========================
# The gold standard dataset contains high-quality morphological segmentations
# This is the base training data for the BiLSTM+Morfessor model
print("Loading gold standard data...")
gold_df = pd.read_parquet(os.path.join(DATA_FOLDER, "Sue_kalt.parquet"))
gold_df['Word'] = gold_df['word']
gold_df['morph'] = gold_df['morph'].str.replace('-', ' ')  # Normalize separators
gold_df['Morph_split_str'] = gold_df['morph']  # String version
gold_df['Morph_split'] = gold_df['morph'].str.split(' ')  # List version
gold_df = gold_df[['Word', 'Morph_split', 'Morph_split_str']]
gold_df = gold_df.drop_duplicates(subset=['Word']).reset_index(drop=True)
gold_df = gold_df.dropna(subset=['Word'])
print(f"Loaded {len(gold_df):,} gold standard examples")

Loading gold standard data...
Loaded 6,896 gold standard examples


In [3]:
gold_df.head()

Unnamed: 0,Word,Morph_split,Morph_split_str
0,cementerioman,"[cementerio, man]",cementerio man
1,kawsachkananta,"[kawsa, chka, na, n, ta]",kawsa chka na n ta
2,mañakunpis,"[maña, ku, n, pis]",maña ku n pis
3,imaynapichus,"[imayna, pi, chus]",imayna pi chus
4,qipiyuq,"[qipi, yuq]",qipi yuq


In [4]:
gold_df.shape

(6896, 3)

In [5]:
# %%  (put this near your imports)
import unicodedata, regex as re
import string

# >>> CHANGED: unify apostrophes to a single codepoint for ejectives, etc.
APOSTROPHE_CHARS = {"'", "’", "ʼ", "‛", "`"}
STD_APOS = "\u02BC"  # ʼ

# build a translation table that deletes punctuation
_EXTRA_PUNCT = "±，“”‘’"   # add any more special symbols you want stripped
_DELETE = str.maketrans("", "", string.punctuation + _EXTRA_PUNCT)

def normalize_text(s: str) -> str:
    # NFC compose; lowercase; unify apostrophes
    s = unicodedata.normalize("NFC", str(s)).lower()
    s = "".join(STD_APOS if ch in APOSTROPHE_CHARS else ch for ch in s)
    # remove punctuation (ASCII + extras) and strip whitespace
    s = s.translate(_DELETE).strip()
    return s

# >>> CHANGED: Quechua multigraph inventory (extend if your corpus has more)
QUECHUA_MULTIGRAPHS = [
    "ch"+STD_APOS, "k"+STD_APOS, "p"+STD_APOS, "q"+STD_APOS, "t"+STD_APOS,  # ejectives (optional)
    "ch", "ph", "qh", "kh", "ll", "rr", "sh",
]
MG_SET = set(QUECHUA_MULTIGRAPHS)
MAX_MG = max((len(mg) for mg in QUECHUA_MULTIGRAPHS), default=1)

def to_graphemes_quechua(s: str) -> list[str]:
    """Greedy longest-match multigraph fusion; fallback to Unicode grapheme clusters (\X)."""
    s = normalize_text(s)
    tokens, i, n = [], 0, len(s)
    while i < n:
        match = None
        for L in range(MAX_MG, 1, -1):
            if i + L <= n:
                cand = s[i:i+L]
                if cand in MG_SET:
                    match = cand
                    break
        if match:
            tokens.append(match)
            i += len(match)
        else:
            m = re.match(r"\X", s[i:])  # single Unicode grapheme cluster
            g = m.group(0)
            tokens.append(g)
            i += len(g)
    return tokens


In [6]:
# =========================
# GRAPHEME TOKENIZATION
# =========================
# Convert words and morphemes to grapheme sequences
# This recognizes Quechua multigraphs (e.g., "ch", "ll", "rr", "sh") as single tokens
# The tokenization is used for both input to the model and for computing boundary labels

gold_df['token_seq'] = gold_df['Word'].apply(lambda w: to_graphemes_quechua(w))
gold_df['morph_token_splits'] = gold_df['Morph_split'].apply(
    lambda var: [to_graphemes_quechua(m) for m in var]
)

In [7]:
gold_df.head()

Unnamed: 0,Word,Morph_split,Morph_split_str,token_seq,morph_token_splits
0,cementerioman,"[cementerio, man]",cementerio man,"[c, e, m, e, n, t, e, r, i, o, m, a, n]","[[c, e, m, e, n, t, e, r, i, o], [m, a, n]]"
1,kawsachkananta,"[kawsa, chka, na, n, ta]",kawsa chka na n ta,"[k, a, w, s, a, ch, k, a, n, a, n, t, a]","[[k, a, w, s, a], [ch, k, a], [n, a], [n], [t,..."
2,mañakunpis,"[maña, ku, n, pis]",maña ku n pis,"[m, a, ñ, a, k, u, n, p, i, s]","[[m, a, ñ, a], [k, u], [n], [p, i, s]]"
3,imaynapichus,"[imayna, pi, chus]",imayna pi chus,"[i, m, a, y, n, a, p, i, ch, u, s]","[[i, m, a, y, n, a], [p, i], [ch, u, s]]"
4,qipiyuq,"[qipi, yuq]",qipi yuq,"[q, i, p, i, y, u, q]","[[q, i, p, i], [y, u, q]]"


In [8]:
def get_boundary_labels_tokens(tokens: list[str], morph_tokens: list[list[str]]) -> list[int]:
    labels = [0] * len(tokens)
    idx = 0
    # all but last morpheme end in a boundary
    for mt in morph_tokens[:-1]:
        idx += len(mt)
        if 0 < idx <= len(tokens):
            labels[idx-1] = 1
    return labels

gold_df['boundary_labels'] = gold_df.apply(
    lambda row: get_boundary_labels_tokens(row['token_seq'], row['morph_token_splits']),
    axis=1
)

# (Optional diagnostics)
gold_df['num_morphemes'] = gold_df['Morph_split'].apply(len)
gold_df['word_len_tokens'] = gold_df['token_seq'].apply(len)

gold_df['char_seq'] = gold_df['token_seq']

In [9]:
# =========================
# MORFESSOR ENSEMBLE BOUNDARY FEATURIZER
# =========================
# Uses N differently-seeded Morfessor Baseline models; each gives a hard segmentation.
# Boundary *probability* at position t is the fraction of models that place a boundary at t.
# Maps boundaries to your tokenization from `to_graphemes_quechua(...)`.
# 
# The Morfessor ensemble provides unsupervised boundary probabilities that serve as
# an additional feature channel for the BiLSTM model, helping it learn better
# segmentation patterns by leveraging unsupervised morphological analysis.

from dataclasses import dataclass
from typing import List, Dict, Tuple
import random
import morfessor
import numpy as np
import torch
import torch.nn as nn

@dataclass
class MorfessorConfig:
    n_models: int = 5
    seed_base: int = 123
    corpus_min_count: int = 1   # keep all words
    lowercase: bool = True

class MorfessorBoundaryFeaturizer:
    def __init__(self, cfg: MorfessorConfig):
        self.cfg = cfg
        self.models = []
        self._fitted = False

    def _build_model(self, seed: int):
        # Baseline trainer
        io = morfessor.MorfessorIO()
        model = morfessor.BaselineModel()
        # baseline config tweaks (optional): cost weights etc.
        random.seed(seed)
        np.random.seed(seed)
        return model

    def fit(self, words: List[str]):
        # Train N models with different random shuffles / seeds
        words = [w.lower() if self.cfg.lowercase else w for w in words]
        uniq = list(set(words))
        rng = np.random.default_rng(self.cfg.seed_base)
        self.models = []

        for i in range(self.cfg.n_models):
            seed = self.cfg.seed_base + i
            model = self._build_model(seed)
            # shuffle corpus
            shuffled = uniq.copy()
            rng.shuffle(shuffled)
            # train: Morfessor expects (frequency, word) tuples
            # use integer counts; also drop the count_modifier so ints stay ints
            train_data = [(1, w) for w in shuffled]  # uniform integer frequency
            model.load_data(train_data)
            model.train_batch()  # default algorithm='recursive' is fine
            # (optional) you can also try: model.train_batch(algorithm='viterbi')
            self.models.append(model)

        self._fitted = True

    @staticmethod
    def _boundaries_from_segments(word: str, segments: List[str]) -> List[int]:
        """Return hard boundary vector over raw characters: 1 at segment ends (except final)."""
        # mark char positions at the end of each segment
        b = [0] * len(word)
        pos = 0
        for seg_i, seg in enumerate(segments[:-1]):  # all but last seg end
            pos += len(seg)
            if 0 <= pos - 1 < len(word):
                b[pos - 1] = 1
        return b

    @staticmethod
    def _char_to_token_boundaries(word: str, tokens: List[str], char_bound: List[int]) -> List[float]:
        """
        Map char-level boundaries to your tokenization.
        We reconstruct the normalized word from tokens to ensure lengths match.
        """
        # Rebuild the normalized string from tokens; this is what your model actually sees.
        rebuilt = "".join(tokens)
        if len(rebuilt) != len(char_bound):
            # Fallback: if the original word length differs, recompute hard boundaries on `rebuilt`
            # by resegmenting `rebuilt` so positions align.
            # In practice, you can also just pad/truncate; here we do a safe clamp.
            char_bound = (char_bound[:len(rebuilt)] + [0]*(len(rebuilt) - len(char_bound)))[:len(rebuilt)]

        token_ends = []
        p = 0
        for tok in tokens:
            p += len(tok)
            token_ends.append(p - 1)  # last char index for this token

        probs = [0.0] * len(tokens)
        for t_i, char_end in enumerate(token_ends[:-1]):  # all but last token may host a boundary
            probs[t_i] = float(char_bound[char_end])
        return probs

    def boundary_probs_for_word(self, word: str, tokens: List[str]) -> List[float]:
        assert self._fitted, "Call fit() first."
        word_ = word.lower() if self.cfg.lowercase else word

        # collect hard boundary vectors from each model, map to token space, then average
        per_model_probs = []
        for m in self.models:
            segs, _ = m.viterbi_segment(word_)
            char_b = self._boundaries_from_segments(word_, segs)
            per_model_probs.append(self._char_to_token_boundaries(word_, tokens, char_b))

        # average across ensemble
        # (lengths match tokens; if any model failed weirdly, fall back to zeros)
        if not per_model_probs:
            return [0.0] * len(tokens)
        probs = np.mean(np.array(per_model_probs), axis=0).tolist()
        return probs

    def boundary_probs_for_words(self, words: List[str], tok_lists: List[List[str]]) -> List[List[float]]:
        return [self.boundary_probs_for_word(w, toks) for w, toks in zip(words, tok_lists)]

# ====== DATASET WITH MORFESSOR CHANNEL ======
class CharBoundaryDatasetMorf(torch.utils.data.Dataset):
    def __init__(self, df, morf_featurizer: MorfessorBoundaryFeaturizer, stoi):
        self.words = df["Word"].tolist()
        self.x = df["char_seq"].tolist()             # tokens from your pipeline
        self.y = df["boundary_labels"].tolist()
        # precompute morfessor boundary probabilities per word
        self.morf = morf_featurizer.boundary_probs_for_words(
            self.words, self.x
        )

        self.stoi = stoi

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        # returns tokens, labels, morf_probs
        return self.x[idx], self.y[idx], self.morf[idx]

def pad_batch_with_morf(batch, pad_id=0):
    # batch: List[(List[str], List[int], List[float])]
    seqs, labels, morf = zip(*batch)

    x_ids = [[stoi.get(t, stoi["<UNK>"]) for t in s] for s in seqs]
    y_ids = [lab for lab in labels]
    m_probs = [mp for mp in morf]

    lengths = [len(x) for x in x_ids]
    maxlen = max(lengths)

    x_pad = [xi + [pad_id]*(maxlen - len(xi)) for xi in x_ids]
    y_pad = [yi + [0]*(maxlen - len(yi)) for yi in y_ids]        # masked as 0
    m_pad = [mi + [0.0]*(maxlen - len(mi)) for mi in m_probs]
    mask  = [[1]*len(xi) + [0]*(maxlen - len(xi)) for xi in x_ids]

    return (
        torch.LongTensor(x_pad),          # x
        torch.FloatTensor(y_pad),         # targets
        torch.BoolTensor(mask),           # mask
        torch.LongTensor(lengths),        # lengths
        torch.FloatTensor(m_pad).unsqueeze(-1),  # morf channel: (B,T,1)
    )

# ====== BiLSTM with extra feature channel ======
class BiLSTMBoundaryWithMorf(nn.Module):
    def __init__(self, vocab_size: int, emb_dim: int = 16, hidden_size: int = 16,
                 num_layers: int = 1, dropout: float = 0.1, freeze_emb: bool = False,
                 extra_feat_dim: int = 1):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        if freeze_emb:
            for p in self.emb.parameters():
                p.requires_grad = False

        self.in_dim = emb_dim + extra_feat_dim
        self.lstm = nn.LSTM(
            input_size=self.in_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            bidirectional=True,
            batch_first=True,
        )
        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(hidden_size * 2, 1)

    def forward(self, x_ids, lengths, extra_feats):
        # x_ids: (B,T) Long
        # extra_feats: (B,T,extra_feat_dim) float
        emb = self.emb(x_ids)                     # (B,T,E)
        x_in = torch.cat([emb, extra_feats], -1)  # (B,T,E+F)

        packed = nn.utils.rnn.pack_padded_sequence(
            x_in, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        packed_out, _ = self.lstm(packed)
        out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)
        out = self.dropout(out)
        logits = self.out(out).squeeze(-1)
        return logits

In [10]:
from torch.utils.data import Dataset, DataLoader

# ---- If you haven't already: ensure you have `gold_df` with columns:
# 'char_seq' (List[str]) and 'boundary_labels' (List[int]) ----
# e.g., produced by your earlier cells: gold_df['char_seq'] ... boundary labels ...  :contentReference[oaicite:1]{index=1}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# ==== 1) Build character vocab (embedding-friendly, not one-hot) ====
# %%  (unchanged API, but works on token lists)
PAD, UNK = "<PAD>", "<UNK>"

def build_vocab(seqs: List[List[str]]):
    toks = {t for seq in seqs for t in seq}
    itos = [PAD, UNK] + sorted(toks)
    stoi = {t:i for i,t in enumerate(itos)}
    return stoi, itos

stoi, itos = build_vocab(gold_df["char_seq"].tolist())

def encode(seq: List[str]) -> List[int]:
    return [stoi.get(t, stoi[UNK]) for t in seq]

def encode_labels(labels: List[int]) -> List[int]:
    return labels

# =========================
# MORFESSOR SAVING/LOADING FUNCTIONS
# =========================
# Functions to save and load Morfessor ensemble models to avoid retraining

def generate_morfessor_id(n_models, seed_base, lowercase):
    """
    Generate a unique identifier for a Morfessor ensemble based on its configuration.
    
    Args:
        n_models: Number of Morfessor models in the ensemble
        seed_base: Base seed for random initialization
        lowercase: Whether to lowercase words
    
    Returns:
        A string identifier (hash) for the Morfessor ensemble
    """
    params_dict = {
        'n_models': n_models,
        'seed_base': seed_base,
        'lowercase': lowercase
    }
    params_str = json.dumps(params_dict, sort_keys=True)
    morfessor_id = hashlib.md5(params_str.encode()).hexdigest()[:16]
    return morfessor_id

def save_morfessor_ensemble(morf_featurizer, morfessor_id, models_folder=MODELS_FOLDER):
    """
    Save Morfessor ensemble models to the models folder.
    
    Args:
        morf_featurizer: Trained MorfessorBoundaryFeaturizer
        morfessor_id: Unique identifier for this Morfessor ensemble
        models_folder: Folder to save models in
    """
    morfessor_dir = os.path.join(models_folder, f"morfessor_{morfessor_id}")
    os.makedirs(morfessor_dir, exist_ok=True)
    
    # Save each Morfessor model in the ensemble
    for i, model in enumerate(morf_featurizer.models):
        model_path = os.path.join(morfessor_dir, f"morfessor_model_{i}.pkl")
        with open(model_path, 'wb') as f:
            pickle.dump(model, f)
    
    # Save configuration
    config_path = os.path.join(morfessor_dir, "morfessor_config.json")
    with open(config_path, "w") as f:
        json.dump({
            'n_models': morf_featurizer.cfg.n_models,
            'seed_base': morf_featurizer.cfg.seed_base,
            'lowercase': morf_featurizer.cfg.lowercase,
            'morfessor_id': morfessor_id
        }, f, indent=2)
    
    print(f"Morfessor ensemble saved to {morfessor_dir}")
    return morfessor_dir

def load_morfessor_ensemble(morfessor_id, models_folder=MODELS_FOLDER):
    """
    Load Morfessor ensemble models from the models folder.
    
    Args:
        morfessor_id: Unique identifier for the Morfessor ensemble
        models_folder: Folder where models are saved
    
    Returns:
        MorfessorBoundaryFeaturizer instance or None if not found
    """
    morfessor_dir = os.path.join(models_folder, f"morfessor_{morfessor_id}")
    config_path = os.path.join(morfessor_dir, "morfessor_config.json")
    
    if not os.path.exists(config_path):
        return None
    
    # Load configuration
    with open(config_path, "r") as f:
        config_data = json.load(f)
    
    # Recreate config and featurizer
    m_cfg = MorfessorConfig(
        n_models=config_data['n_models'],
        seed_base=config_data['seed_base'],
        lowercase=config_data['lowercase']
    )
    morf = MorfessorBoundaryFeaturizer(m_cfg)
    
    # Load each model in the ensemble
    morf.models = []
    for i in range(m_cfg.n_models):
        model_path = os.path.join(morfessor_dir, f"morfessor_model_{i}.pkl")
        if os.path.exists(model_path):
            with open(model_path, 'rb') as f:
                morf.models.append(pickle.load(f))
        else:
            print(f"Warning: Morfessor model {i} not found at {model_path}")
            return None
    
    morf._fitted = True
    print(f"Morfessor ensemble loaded from {morfessor_dir}")
    return morf

# =========================
# FIT OR LOAD MORFESSOR ENSEMBLE
# =========================
# Train Morfessor ensemble on training words (unsupervised)
# Or load existing ensemble if already trained with same configuration

# Morfessor configuration
m_cfg = MorfessorConfig(n_models=5, seed_base=123, lowercase=True)

# Generate Morfessor ensemble identifier
morfessor_id = generate_morfessor_id(m_cfg.n_models, m_cfg.seed_base, m_cfg.lowercase)

# Try to load existing Morfessor ensemble
print(f"Checking for existing Morfessor ensemble with ID: {morfessor_id}")
morf = load_morfessor_ensemble(morfessor_id, models_folder=MODELS_FOLDER)

if morf is None:
    print(f"No existing Morfessor ensemble found. Training new ensemble...")
    morf = MorfessorBoundaryFeaturizer(m_cfg)
    morf.fit(gold_df["Word"].tolist())  # Unsupervised; uses only surface forms
    # Save the trained ensemble
    save_morfessor_ensemble(morf, morfessor_id, models_folder=MODELS_FOLDER)
    print(f"Morfessor ensemble training complete! Saved with ID: {morfessor_id}")
else:
    print(f"Using existing Morfessor ensemble (ID: {morfessor_id})")

Device: cuda
Checking for existing Morfessor ensemble with ID: a11bda6186f98db1
Morfessor ensemble loaded from models_segmenter-morfessor\morfessor_a11bda6186f98db1
Using existing Morfessor ensemble (ID: a11bda6186f98db1)


In [11]:
# =========================
# LOAD TEST DATA
# =========================
# Load the test/accuracy evaluation dataset
print("Loading test data...")
acc_df = pd.read_parquet(os.path.join(DATA_FOLDER, "cleaned_data_df.parquet"))
print(f"Loaded {len(acc_df):,} test examples")

Loading test data...
Loaded 913 test examples


In [12]:
# =========================
# TRAIN/VALIDATION SPLIT
# =========================
# Split data into 90% training and 10% validation
# Create datasets with Morfessor features precomputed
rng = np.random.default_rng(42)
indices = np.arange(len(gold_df))
rng.shuffle(indices)
split = int(0.9*len(indices))
train_idx, val_idx = indices[:split], indices[split:]

train_df = gold_df.iloc[train_idx].reset_index(drop=True)
val_df   = gold_df.iloc[val_idx].reset_index(drop=True)

print(f"Training samples: {len(train_df):,}")
print(f"Validation samples: {len(val_df):,}")

# Create datasets with Morfessor boundary probabilities precomputed
train_ds = CharBoundaryDatasetMorf(train_df, morf, stoi)
val_ds   = CharBoundaryDatasetMorf(val_df,   morf, stoi)

BATCH_SIZE = 64
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True,
                          collate_fn=pad_batch_with_morf)
val_loader   = DataLoader(val_ds,   batch_size=BATCH_SIZE, shuffle=False,
                          collate_fn=pad_batch_with_morf)

# ==== 4) Loss (masked BCEWithLogits) & Optimizer ====
# %%  (compute once on train_df)
def count_pos_neg(df_):
    pos = sum(sum(lbls) for lbls in df_['boundary_labels'])
    total = sum(len(seq) for seq in df_['char_seq'])
    neg = total - pos
    return pos, neg

pos, neg = count_pos_neg(gold_df)
pos_weight_value = float(neg) / max(float(pos), 1.0)

def masked_bce_loss(logits, targets, mask):
    loss_fn = nn.BCEWithLogitsLoss(reduction="none",
                                   pos_weight=torch.tensor(pos_weight_value, device=logits.device))
    loss_per_token = loss_fn(logits, targets) * mask.float()
    denom = mask.float().sum().clamp_min(1.0)
    return loss_per_token.sum() / denom

# ==== 5) Metrics ====
def sigmoid(x): return 1 / (1 + torch.exp(-x))

def boundary_f1(logits, targets, mask, threshold=0.5):
    with torch.no_grad():
        probs = torch.sigmoid(logits)
        preds = (probs >= threshold).long()
        t = targets.long()
        m = mask.long()

        tp = ((preds == 1) & (t == 1) & (m == 1)).sum().item()
        fp = ((preds == 1) & (t == 0) & (m == 1)).sum().item()
        fn = ((preds == 0) & (t == 1) & (m == 1)).sum().item()

        prec = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        rec  = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1   = 2*prec*rec / (prec + rec) if (prec + rec) > 0 else 0.0
        return prec, rec, f1

def predict_boundaries_with_morf(words: List[str], model, stoi, morf_featurizer,
                                 threshold=0.5, device=device):
    model.eval()
    token_lists = [to_graphemes_quechua(w) for w in words]

    # ids + lengths + mask
    x_ids = [[stoi.get(t, stoi["<UNK>"]) for t in toks] for toks in token_lists]
    lengths = [len(x) for x in x_ids]
    maxlen = max(lengths) if lengths else 0
    pad_id = stoi["<PAD>"]
    x_pad = [xi + [pad_id]*(maxlen - len(xi)) for xi in x_ids]
    mask = [[1]*len(xi) + [0]*(maxlen - len(xi)) for xi in x_ids]

    # Morfessor probs per word → pad
    morf_probs = morf_featurizer.boundary_probs_for_words(words, token_lists)
    morf_pad = [mp + [0.0]*(maxlen - len(mp)) for mp in morf_probs]
    morf_feat = torch.FloatTensor(morf_pad).unsqueeze(-1)

    x = torch.LongTensor(x_pad).to(device)
    lengths_t = torch.LongTensor(lengths).to(device)
    mask_t = torch.BoolTensor(mask).to(device)
    morf_feat = morf_feat.to(device)

    with torch.no_grad():
        logits = model(x, lengths_t, morf_feat)
        probs = torch.sigmoid(logits)
        preds = (probs >= threshold) & mask_t

    out = []
    for i, L in enumerate(lengths):
        out.append(preds[i, :L].int().tolist())
    return out

def apply_boundaries_tokens(tokens: list[str], boundary_labels: List[int]) -> List[str]:
    """Reconstruct morphemes from token list and boundary labels, returning strings."""
    segs, start = [], 0
    for i, b in enumerate(boundary_labels):
        if b == 1:
            segs.append("".join(tokens[start:i+1]))
            start = i+1
    if start < len(tokens):
        segs.append("".join(tokens[start:]))
    return segs

def normalize_gold_variants(gold_variants):
    """
    Convert gold_variants to a list format, handling numpy arrays and nested structures.
    """
    if gold_variants is None:
        return []
    
    # If it's a numpy array, convert to list
    if isinstance(gold_variants, np.ndarray):
        gold_variants = gold_variants.tolist()
    
    # If it's already a list, ensure nested elements are also lists (not numpy arrays)
    if isinstance(gold_variants, list):
        normalized = []
        for variant in gold_variants:
            if isinstance(variant, np.ndarray):
                normalized.append(variant.tolist())
            elif isinstance(variant, list):
                # Recursively normalize nested lists
                normalized.append([item.tolist() if isinstance(item, np.ndarray) else item for item in variant])
            else:
                normalized.append(variant)
        return normalized
    
    return []

def evaluate_accuracy_morf(df, model, stoi, morf_featurizer, device="cpu", threshold=0.5):
    """
    Accuracy = proportion of words where predicted segmentation == any gold variant,
    using the Morfessor feature channel at inference time.
    """
    all_words = df["Word"].tolist()
    all_gold  = df["Gold"].tolist()

    all_boundaries = predict_boundaries_with_morf(
        all_words, model, stoi, morf_featurizer, threshold=threshold, device=device
    )

    correct = 0
    for word, gold_variants, boundary_labels in zip(all_words, all_gold, all_boundaries):
        # Normalize gold variants
        gold_variants = normalize_gold_variants(gold_variants)
        toks = to_graphemes_quechua(word)
        predicted = apply_boundaries_tokens(toks, boundary_labels)
        if any(predicted == variant for variant in gold_variants):
            correct += 1
    return correct / len(all_words) if all_words else 0.0

Training samples: 6,206
Validation samples: 690


In [13]:
# =========================
# BILSTM MODEL CHECKPOINTING FUNCTIONS
# =========================
# Functions to save and load BiLSTM models to avoid redundant training

def generate_model_id(vocab_size, emb_dim, hidden_size, num_layers, dropout, 
                      freeze_emb, extra_feat_dim, lr, weight_decay, morfessor_id, epochs):
    """
    Generate a unique identifier for a BiLSTM model based on its hyperparameters.
    
    Args:
        vocab_size: Size of the vocabulary
        emb_dim: Embedding dimension
        hidden_size: LSTM hidden size
        num_layers: Number of LSTM layers
        dropout: Dropout rate
        freeze_emb: Whether embeddings are frozen
        extra_feat_dim: Dimension of extra features (Morfessor channel)
        lr: Learning rate
        weight_decay: Weight decay
        morfessor_id: ID of the Morfessor ensemble used
        epochs: Number of training epochs
    
    Returns:
        A string identifier (hash) for the model
    """
    params_dict = {
        'vocab_size': vocab_size,
        'emb_dim': emb_dim,
        'hidden_size': hidden_size,
        'num_layers': num_layers,
        'dropout': dropout,
        'freeze_emb': freeze_emb,
        'extra_feat_dim': extra_feat_dim,
        'lr': lr,
        'weight_decay': weight_decay,
        'morfessor_id': morfessor_id,
        'epochs': epochs
    }
    params_str = json.dumps(params_dict, sort_keys=True)
    model_id = hashlib.md5(params_str.encode()).hexdigest()[:16]
    return model_id

def save_model_checkpoint(model, stoi, itos, model_id, models_folder=MODELS_FOLDER, 
                         suffix="", best_metric_value=None):
    """
    Save BiLSTM model checkpoint to the models folder.
    
    Args:
        model: Trained BiLSTM model
        stoi: String-to-index vocabulary mapping
        itos: Index-to-string vocabulary mapping
        model_id: Unique identifier for this model
        models_folder: Folder to save models in
        suffix: Optional suffix for the checkpoint filename (e.g., "_best_f1", "_best_acc")
        best_metric_value: Optional best metric value to save
    """
    model_dir = os.path.join(models_folder, f"model_{model_id}")
    os.makedirs(model_dir, exist_ok=True)
    
    checkpoint_path = os.path.join(model_dir, f"bilstm_morfessor{suffix}.pt")
    checkpoint = {
        "model_state": model.state_dict(),
        "stoi": stoi,
        "itos": itos,
        "model_id": model_id,
        "morfessor_id": morfessor_id
    }
    if best_metric_value is not None:
        checkpoint["best_metric_value"] = best_metric_value
    
    torch.save(checkpoint, checkpoint_path)
    print(f"Model checkpoint saved to {checkpoint_path}")
    return checkpoint_path

def load_model_checkpoint(model_id, models_folder=MODELS_FOLDER, suffix=""):
    """
    Load BiLSTM model checkpoint from the models folder.
    
    Args:
        model_id: Unique identifier for the model
        models_folder: Folder where models are saved
        suffix: Optional suffix for the checkpoint filename
    
    Returns:
        Dictionary with model_state, stoi, itos, and other saved data, or None if not found
    """
    model_dir = os.path.join(models_folder, f"model_{model_id}")
    checkpoint_path = os.path.join(model_dir, f"bilstm_morfessor{suffix}.pt")
    
    if not os.path.exists(checkpoint_path):
        return None
    
    # Load checkpoint with weights_only=False to allow numpy objects and other trusted data
    # These are our own saved checkpoints, so they're trusted
    checkpoint = torch.load(checkpoint_path, map_location="cpu", weights_only=False)
    print(f"Model checkpoint loaded from {checkpoint_path}")
    return checkpoint

# =========================
# MODEL CONFIGURATION
# =========================
# Define BiLSTM model hyperparameters
# These parameters determine the model architecture and training behavior

# Model architecture parameters
VOCAB_SIZE = len(itos)
EMB_DIM = 64
HIDDEN_SIZE = 128
NUM_LAYERS = 2
DROPOUT = 0.3
FREEZE_EMB = True  # Set False to fine-tune embeddings
EXTRA_FEAT_DIM = 1  # Morfessor feature channel dimension

# Training parameters
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 1e-4
EPOCHS = 20

# Generate model ID based on hyperparameters
model_id = generate_model_id(
    vocab_size=VOCAB_SIZE,
    emb_dim=EMB_DIM,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYERS,
    dropout=DROPOUT,
    freeze_emb=FREEZE_EMB,
    extra_feat_dim=EXTRA_FEAT_DIM,
    lr=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    morfessor_id=morfessor_id,
    epochs=EPOCHS
)

print(f"Model ID: {model_id}")
print(f"Using Morfessor ensemble ID: {morfessor_id}")

# Check if model already exists
print(f"Checking for existing model with ID: {model_id}")
checkpoint = load_model_checkpoint(model_id, models_folder=MODELS_FOLDER, suffix="_best_f1")

if checkpoint is not None:
    print(f"Found existing model! Loading checkpoint...")
    # Model will be loaded in the training cell
    model_loaded = True
    saved_stoi = checkpoint["stoi"]
    saved_itos = checkpoint["itos"]
else:
    print(f"No existing model found. Will train new model.")
    model_loaded = False
    saved_stoi = None
    saved_itos = None

# =========================
# CREATE MODEL AND OPTIMIZER
# =========================
# Initialize BiLSTM model with Morfessor feature channel
model = BiLSTMBoundaryWithMorf(
    vocab_size=VOCAB_SIZE,
    emb_dim=EMB_DIM,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYERS,
    dropout=DROPOUT,
    freeze_emb=FREEZE_EMB,
    extra_feat_dim=EXTRA_FEAT_DIM
).to(device)

# If model was loaded, restore its state
if model_loaded:
    model.load_state_dict(checkpoint["model_state"])
    print("Model state restored from checkpoint")
    # Use saved vocabularies if they match
    if saved_stoi == stoi and saved_itos == itos:
        print("Vocabulary matches saved checkpoint")
    else:
        print("Warning: Vocabulary mismatch with saved checkpoint. Using current vocabulary.")

optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

Model ID: 1c35f73b7670b1bf
Using Morfessor ensemble ID: a11bda6186f98db1
Checking for existing model with ID: 1c35f73b7670b1bf
Model checkpoint loaded from models_segmenter-morfessor\model_1c35f73b7670b1bf\bilstm_morfessor_best_f1.pt
Found existing model! Loading checkpoint...
Model state restored from checkpoint
Vocabulary matches saved checkpoint


In [14]:
# =========================
# TRAINING LOOP
# =========================
# Train the BiLSTM model with Morfessor features
# Skip training if model was already loaded from checkpoint

if model_loaded:
    print("Model already trained and loaded from checkpoint. Skipping training.")
    print("To retrain, delete the model checkpoint or change hyperparameters.")
    # Load best metrics from checkpoint if available
    best_val_f1 = checkpoint.get("best_metric_value", 0.0)
    best_val_acc = checkpoint.get("best_metric_value", 0.0)
    print(f"Best F1 from checkpoint: {best_val_f1:.4f}")
    print(f"Best Accuracy from checkpoint: {best_val_acc:.4f}")
else:
    print("Starting training...")
    best_val_f1  = 0.0
    best_val_acc = 0.0

    for epoch in range(1, EPOCHS+1):
        # ===== Training Phase =====
        model.train()
        total_loss = 0.0
        total_tokens = 0
        for x, y, mask, lengths, morf_feat in train_loader:
            x = x.to(device); y = y.to(device); mask = mask.to(device)
            lengths = lengths.to(device); morf_feat = morf_feat.to(device)

            # Forward pass: model predicts boundary probabilities
            logits = model(x, lengths, morf_feat)
            loss = masked_bce_loss(logits, y, mask)

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Gradient clipping
            optimizer.step()

            total_loss += loss.item() * mask.sum().item()
            total_tokens += mask.sum().item()

        train_loss = total_loss / max(total_tokens, 1)

        # ===== Validation Phase =====
        # Evaluate on validation set (F1 on token-level boundaries)
        model.eval()
        val_loss, val_tokens = 0.0, 0
        all_prec, all_rec, all_f1 = [], [], []
        with torch.no_grad():
            for x, y, mask, lengths, morf_feat in val_loader:
                x = x.to(device); y = y.to(device); mask = mask.to(device)
                lengths = lengths.to(device); morf_feat = morf_feat.to(device)

                logits = model(x, lengths, morf_feat)
                loss = masked_bce_loss(logits, y, mask)

                val_loss += loss.item() * mask.sum().item()
                val_tokens += mask.sum().item()

                # Compute token-level boundary metrics
                p, r, f = boundary_f1(logits, y, mask, threshold=0.5)
                all_prec.append(p); all_rec.append(r); all_f1.append(f)

        val_loss = val_loss / max(val_tokens, 1)
        prec = np.mean(all_prec) if all_prec else 0.0
        rec  = np.mean(all_rec) if all_rec else 0.0
        f1   = np.mean(all_f1) if all_f1 else 0.0

        # ===== Whole-Word Segmentation Accuracy =====
        # Evaluate on test set using Morfessor-aware prediction
        acc = evaluate_accuracy_morf(acc_df, model, stoi, morf, device=device, threshold=0.5)

        print(f"Epoch {epoch:02d} | train_loss={train_loss:.4f}  val_loss={val_loss:.4f}  "
              f"P={prec:.3f} R={rec:.3f} F1={f1:.3f}  Acc={acc:.3f}")

        # ===== Save Best Models =====
        # Save checkpoint for best F1 score
        if f1 > best_val_f1:
            best_val_f1 = f1
            save_model_checkpoint(
                model, stoi, itos, model_id, 
                models_folder=MODELS_FOLDER, 
                suffix="_best_f1",
                best_metric_value=best_val_f1
            )
            print("  ↳ saved checkpoint by F1 (best so far)")

        # Save checkpoint for best Accuracy
        if acc > best_val_acc:
            best_val_acc = acc
            save_model_checkpoint(
                model, stoi, itos, model_id,
                models_folder=MODELS_FOLDER,
                suffix="_best_acc",
                best_metric_value=best_val_acc
            )
            print("  ↳ saved checkpoint by Accuracy (best so far)")

    print(f"\nTraining complete!")
    print(f"Best validation F1: {best_val_f1:.4f}")
    print(f"Best validation Accuracy: {best_val_acc:.4f}")

Model already trained and loaded from checkpoint. Skipping training.
To retrain, delete the model checkpoint or change hyperparameters.
Best F1 from checkpoint: 0.9566
Best Accuracy from checkpoint: 0.9566


In [15]:
test_words = ["rikuchkani", "pikunas", "ñichkanchus"]
pred_b = predict_boundaries_with_morf(test_words, model, stoi, morf, threshold=0.5)
for w, b in zip(test_words, pred_b):
    toks = to_graphemes_quechua(w)
    print(w, b, "->", apply_boundaries_tokens(toks, b))

rikuchkani [0, 0, 0, 1, 0, 0, 1, 0, 0] -> ['riku', 'chka', 'ni']
pikunas [0, 1, 0, 0, 0, 1, 0] -> ['pi', 'kuna', 's']
ñichkanchus [0, 1, 0, 0, 1, 1, 0, 0, 0] -> ['ñi', 'chka', 'n', 'chus']


In [16]:
# =========================
# LOAD TRAINED MODEL CHECKPOINT
# =========================
# Load the best model checkpoint using the new checkpointing system

print(f"Loading model with ID: {model_id}")

# Try loading best accuracy checkpoint first, then fall back to best F1
ckpt = load_model_checkpoint(model_id, models_folder=MODELS_FOLDER, suffix="_best_acc")

if ckpt is None:
    raise FileNotFoundError(
        f"Model checkpoint not found for model_id: {eval_model_id}\n"
        f"Please ensure the model has been trained first, or check that the hyperparameters match."
    )

stoi, itos = ckpt["stoi"], ckpt["itos"]

model.load_state_dict(ckpt["model_state"])
model.eval()
print("Model loaded successfully!")

# ==== 4) Evaluation ====
# ==== 4) Evaluation (token-space boundary precision/recall/F1) ====

def boundary_positions_from_labels(labels, L=None):
    """
    Convert per-token boundary labels to boundary positions 0..L-2.
    We ignore any 'boundary' at the last token index.
    """
    if not labels:
        return set()
    if L is None:
        L = len(labels)
    upto = min(L - 1, len(labels))
    return {i for i in range(upto) if labels[i] == 1}

def boundary_positions_from_morpheme_tokens(morpheme_token_lists):
    """
    Given a list of morphemes, each as a list of grapheme tokens,
    return boundary positions (end-of-morpheme token indices) excluding the last morpheme.
    """
    pos = set()
    acc = 0
    for k, toks in enumerate(morpheme_token_lists):
        acc += len(toks)
        if k < len(morpheme_token_lists) - 1:
            pos.add(acc - 1)
    return pos

def prf_from_sets(pred_set, gold_set):
    tp = len(pred_set & gold_set)
    fp = len(pred_set - gold_set)
    fn = len(gold_set - pred_set)

    if tp + fp == 0:
        precision = 1.0 if (tp + fn == 0) else 0.0
    else:
        precision = tp / (tp + fp)

    if tp + fn == 0:
        recall = 1.0 if (tp + fp == 0) else 0.0
    else:
        recall = tp / (tp + fn)

    if precision + recall == 0:
        f1 = 1.0 if (tp + fp + fn) == 0 else 0.0
    else:
        f1 = 2 * precision * recall / (precision + recall)

    return tp, fp, fn, precision, recall, f1

def best_variant_metrics_token_space(word_tokens, pred_boundary_labels, gold_variants):
    """
    Compare predicted token-boundaries to each gold variant (converted to token boundaries).
    Return metrics for the gold variant that maximizes F1.
    """
    pred_b = boundary_positions_from_labels(pred_boundary_labels, L=len(word_tokens))

    best = None
    for variant in gold_variants:
        # Tokenize each morpheme with the same grapheme tokenizer
        variant_token_lists = [to_graphemes_quechua(m) for m in variant]
        gold_b = boundary_positions_from_morpheme_tokens(variant_token_lists)
        tp, fp, fn, P, R, F1 = prf_from_sets(pred_b, gold_b)
        key = (F1, tp, -fn, -fp)  # tie-breakers
        if (best is None) or (key > best[0]):
            best = (key, gold_b, tp, fp, fn, P, R, F1)

    if best is None:
        gold_b = set()
        tp, fp, fn, P, R, F1 = prf_from_sets(pred_b, gold_b)
        return pred_b, gold_b, tp, fp, fn, P, R, F1

    _, gold_b, tp, fp, fn, P, R, F1 = best
    return pred_b, gold_b, tp, fp, fn, P, R, F1

def is_correct_prediction(predicted, gold_variants):
    return any(predicted == variant for variant in gold_variants)

def split_count_metrics(predicted_segments, gold_variants):
    """
    Returns a dictionary with boolean flags:
    - exact: same split count as any gold variant
    - plus1: one more split than any gold variant
    - minus1: one less split than any gold variant
    - plusminus1: within ±1 split difference of any gold variant
    """
    pred_count = len(predicted_segments)
    gold_counts = [len(gold) for gold in gold_variants]

    exact = any(pred_count == g for g in gold_counts)
    plus1 = any(pred_count == g + 1 for g in gold_counts)
    minus1 = any(pred_count == g - 1 for g in gold_counts)
    plusminus1 = any(abs(pred_count - g) <= 1 for g in gold_counts)

    return {
        "Exact": exact,
        "+1": plus1,
        "-1": minus1,
        "±1": plusminus1
    }

def normalize_gold_variants(gold_variants):
    """
    Convert gold_variants to a list format, handling numpy arrays and nested structures.
    """
    if gold_variants is None:
        return []
    
    # If it's a numpy array, convert to list
    if isinstance(gold_variants, np.ndarray):
        gold_variants = gold_variants.tolist()
    
    # If it's already a list, ensure nested elements are also lists (not numpy arrays)
    if isinstance(gold_variants, list):
        normalized = []
        for variant in gold_variants:
            if isinstance(variant, np.ndarray):
                normalized.append(variant.tolist())
            elif isinstance(variant, list):
                # Recursively normalize nested lists
                normalized.append([item.tolist() if isinstance(item, np.ndarray) else item for item in variant])
            else:
                normalized.append(variant)
        return normalized
    
    return []


# ----- Batch predict -----
all_words = acc_df["Word"].tolist()
all_gold = acc_df["Gold"]

all_boundaries = predict_boundaries_with_morf(
    all_words, model, stoi, morf, threshold=0.5, device=device
)

records = []
micro_tp = micro_fp = micro_fn = 0
macro_Ps, macro_Rs, macro_F1s = [], [], []
exact_flags = []
split_exact_flags = []
split_plus1_flags = []
split_minus1_flags = []
split_pm1_flags = []
overlap_flags = []

for word, gold_variants, boundary_labels in zip(all_words, all_gold, all_boundaries):
    # Normalize gold variants
    gold_variants = normalize_gold_variants(gold_variants)
    
    # Tokenize word into graphemes
    toks = to_graphemes_quechua(word)

    # Build predicted segmentation from token-level labels
    predicted_segments = apply_boundaries_tokens(toks, boundary_labels)

    # Exact-match accuracy
    correct_exact = is_correct_prediction(predicted_segments, gold_variants)

    # (2) Split count metrics
    split_metrics = split_count_metrics(predicted_segments, gold_variants)

    # (3) Overlap
    overlap = correct_exact and split_metrics["Exact"]

    # Boundary metrics in token space (pick best gold per word)
    pred_b, gold_b_chosen, tp, fp, fn, P, R, F1 = best_variant_metrics_token_space(
        toks, boundary_labels, gold_variants
    )

    records.append({
        "Word": word,
        "Prediction": predicted_segments,
        "Gold": gold_variants,
        "PredBoundaries(tok_idx)": sorted(pred_b),
        "GoldBoundaries(Chosen tok_idx)": sorted(gold_b_chosen),
        "TP": tp, "FP": fp, "FN": fn,
        "P_word": P, "R_word": R, "F1_word": F1,
        "CorrectExactSeg": correct_exact,
        "CorrectSplitCount": split_metrics["Exact"],
        "SplitCount+1": split_metrics["+1"],
        "SplitCount-1": split_metrics["-1"],
        "SplitCount±1": split_metrics["±1"],
        "OverlapExactAndSplit": overlap
    })

    micro_tp += tp
    micro_fp += fp
    micro_fn += fn
    macro_Ps.append(P)
    macro_Rs.append(R)
    macro_F1s.append(F1)
    exact_flags.append(correct_exact)
    split_exact_flags.append(split_metrics["Exact"])
    split_plus1_flags.append(split_metrics["+1"])
    split_minus1_flags.append(split_metrics["-1"])
    split_pm1_flags.append(split_metrics["±1"])
    overlap_flags.append(overlap)

results_df = pd.DataFrame(records)

# Exact segmentation accuracy
accuracy = results_df["CorrectExactSeg"].mean()

# Micro (global) metrics
if micro_tp + micro_fp == 0:
    P_micro = 1.0 if micro_tp + micro_fn == 0 else 0.0
else:
    P_micro = micro_tp / (micro_tp + micro_fp)

if micro_tp + micro_fn == 0:
    R_micro = 1.0 if micro_tp + micro_fp == 0 else 0.0
else:
    R_micro = micro_tp / (micro_tp + micro_fn)

if P_micro + R_micro == 0:
    F1_micro = 1.0 if (micro_tp + micro_fp + micro_fn) == 0 else 0.0
else:
    F1_micro = 2 * P_micro * R_micro / (P_micro + R_micro)

# Macro (average of per-word scores)
P_macro = float(pd.Series(macro_Ps).mean()) if macro_Ps else 0.0
R_macro = float(pd.Series(macro_Rs).mean()) if macro_Rs else 0.0
F1_macro = float(pd.Series(macro_F1s).mean()) if macro_F1s else 0.0

exact_accuracy = np.mean(exact_flags)
split_exact_acc = np.mean(split_exact_flags)
split_plus1_acc = np.mean(split_plus1_flags)
split_minus1_acc = np.mean(split_minus1_flags)
split_pm1_acc = np.mean(split_pm1_flags)
overlap_accuracy = np.mean(overlap_flags)

print("=== Segmentation and Split Count Metrics ===")
print(f"Exact segmentation accuracy:  {exact_accuracy:.4f}")
print(f"Split-count (Exact):          {split_exact_acc:.4f}")
print(f"Split-count (+1):             {split_plus1_acc:.4f}")
print(f"Split-count (−1):             {split_minus1_acc:.4f}")
print(f"Split-count (±1):             {split_pm1_acc:.4f}")
print(f"Overlap (Exact ∩ Split):      {overlap_accuracy:.4f}")

print("Boundary metrics (token space):")
print(f"  Micro  - P: {P_micro:.4f}  R: {R_micro:.4f}  F1: {F1_micro:.4f}")
print(f"  Macro  - P: {P_macro:.4f}  R: {R_macro:.4f}  F1: {F1_macro:.4f}")

# Save enriched results
results_output_path = os.path.join(DATA_FOLDER, "bilstm_morfessor_eval_results.csv")
results_df.to_csv(results_output_path, index=False)



Loading model with ID: 1c35f73b7670b1bf
Model checkpoint loaded from models_segmenter-morfessor\model_1c35f73b7670b1bf\bilstm_morfessor_best_acc.pt
Model loaded successfully!
=== Segmentation and Split Count Metrics ===
Exact segmentation accuracy:  0.5509
Split-count (Exact):          0.6429
Split-count (+1):             0.2267
Split-count (−1):             0.0909
Split-count (±1):             0.9595
Overlap (Exact ∩ Split):      0.5509
Boundary metrics (token space):
  Micro  - P: 0.7962  R: 0.8835  F1: 0.8376
  Macro  - P: 0.8129  R: 0.8668  F1: 0.8207


In [17]:
results_df.head(50)

Unnamed: 0,Word,Prediction,Gold,PredBoundaries(tok_idx),GoldBoundaries(Chosen tok_idx),TP,FP,FN,P_word,R_word,F1_word,CorrectExactSeg,CorrectSplitCount,SplitCount+1,SplitCount-1,SplitCount±1,OverlapExactAndSplit
0,unupas,"[unupa, s]","[[unu, pas]]",[4],[2],0,1,1,0.0,0.0,0.0,False,True,False,False,True,False
1,umankus,"[uma, nku, s]","[[uma, nku, s]]","[2, 5]","[2, 5]",2,0,0,1.0,1.0,1.0,True,True,False,False,True,True
2,hikurin,"[hiku, ri, n]","[[hikuri, n]]","[3, 5]",[5],1,1,0,0.5,1.0,0.666667,False,False,True,False,True,False
3,sutipi,"[suti, pi]","[[suti, pi]]",[3],[3],1,0,0,1.0,1.0,1.0,True,True,False,False,True,True
4,pikunas,"[pi, kuna, s]","[[pi, kuna, s]]","[1, 5]","[1, 5]",2,0,0,1.0,1.0,1.0,True,True,False,False,True,True
5,atipaq,"[ati, pa, q]","[[ati, paq], [ati, pa, q]]","[2, 4]","[2, 4]",2,0,0,1.0,1.0,1.0,True,True,True,False,True,True
6,tomani,"[toma, ni]","[[toma, ni]]",[3],[3],1,0,0,1.0,1.0,1.0,True,True,False,False,True,True
7,rantiq,"[ranti, q]","[[ranti, q]]",[4],[4],1,0,0,1.0,1.0,1.0,True,True,False,False,True,True
8,imakunas,"[ima, kuna, s]","[[ima, kuna, s]]","[2, 6]","[2, 6]",2,0,0,1.0,1.0,1.0,True,True,False,False,True,True
9,chiqaq,"[chiqa, q]",[[chiqaq]],[3],[],0,1,0,0.0,0.0,0.0,False,False,True,False,True,False
