In [27]:
"""
SEGMENTER-OLD: CHARACTER-LEVEL BILSTM MORPHOLOGY PARSER
========================================================

This notebook implements a character-level BiLSTM model for morphological segmentation
of Quechua words. It predicts boundary positions at the character level, marking where
morpheme boundaries occur within words.

Key Features:
- Character-level tokenization (each character is a token)
- BiLSTM architecture for sequence labeling
- Binary classification: predicts boundary (1) or no boundary (0) at each character position
- Comprehensive evaluation metrics (precision, recall, F1, exact match, split-count accuracy)

This is an older/alternative approach compared to the token-window based models in
DT-LSTM-MarkovFilter.ipynb and Markov-LSTM-MarkovFilter.ipynb.

All data is read from the 'data' folder and models are saved to the 'models_segmenter-old' folder.
"""

import ast
import pandas as pd
import os
import json
import hashlib
import pickle

In [28]:
# =========================
# DATA FOLDER CONFIGURATION
# =========================
# All data files should be read from and saved to the data folder
DATA_FOLDER = "data"

# Model folder named after this notebook
MODEL_NAME = "segmenter-old"
MODELS_FOLDER = f"models_{MODEL_NAME}"

# Create folders if they don't exist
os.makedirs(DATA_FOLDER, exist_ok=True)
os.makedirs(MODELS_FOLDER, exist_ok=True)

# =========================
# LOAD GOLD STANDARD DATA
# =========================
# The gold standard dataset contains high-quality morphological segmentations
# This is the base training data for the character-level BiLSTM model
print("Loading gold standard data...")
gold_df = pd.read_parquet(os.path.join(DATA_FOLDER, "Sue_kalt.parquet"))
gold_df['Word'] = gold_df['word']
# Normalize separators: hyphens become spaces (literal replacement, not a regex)
gold_df['morph'] = gold_df['morph'].str.replace('-', ' ', regex=False)
gold_df['Morph_split_str'] = gold_df['morph']  # String version
gold_df['Morph_split'] = gold_df['morph'].str.split(' ')  # List version
# Build the final frame as an explicit, independent copy without in-place
# mutation: later cells add columns to gold_df, which can trigger pandas'
# SettingWithCopyWarning when the frame is a column slice of another frame.
# Rows with a missing segmentation are dropped together with rows missing the
# word itself, since a null 'Morph_split' cannot be measured with len() later.
gold_df = (
    gold_df[['Word', 'Morph_split', 'Morph_split_str']]
    .drop_duplicates(subset='Word', keep='first')
    .dropna(subset=['Word', 'Morph_split_str'])
    .copy()
)
print(f"Loaded {len(gold_df):,} gold standard examples")


Loading gold standard data...
Loaded 6,896 gold standard examples


In [29]:
gold_df.head()


Unnamed: 0,Word,Morph_split,Morph_split_str
0,cementerioman,"[cementerio, man]",cementerio man
1,kawsachkananta,"[kawsa, chka, na, n, ta]",kawsa chka na n ta
2,mañakunpis,"[maña, ku, n, pis]",maña ku n pis
3,imaynapichus,"[imayna, pi, chus]",imayna pi chus
4,qipiyuq,"[qipi, yuq]",qipi yuq


In [30]:
gold_df.shape


(6896, 3)

In [31]:
# =========================
# FEATURE EXTRACTION
# =========================
# Extract basic features for analysis and potential use in the model
# Per-word summary features: morpheme count and character length.
gold_df['num_morphemes'] = gold_df['Morph_split'].map(len)
gold_df['word_len'] = gold_df['Word'].map(len)


In [32]:
gold_df.head()


Unnamed: 0,Word,Morph_split,Morph_split_str,num_morphemes,word_len
0,cementerioman,"[cementerio, man]",cementerio man,2,13
1,kawsachkananta,"[kawsa, chka, na, n, ta]",kawsa chka na n ta,5,14
2,mañakunpis,"[maña, ku, n, pis]",maña ku n pis,4,10
3,imaynapichus,"[imayna, pi, chus]",imayna pi chus,3,12
4,qipiyuq,"[qipi, yuq]",qipi yuq,2,7


In [33]:
# =========================
# BOUNDARY LABEL GENERATION
# =========================
# Convert morpheme splits into character-level boundary labels
# Labels mark the end position of each morpheme (except the last one)

def get_boundary_labels(word, split):
    """
    Generate binary boundary labels for a word given its morpheme split.

    Args:
        word: The full word string
        split: List of morphemes (e.g., ['kawsa', 'chka', 'na', 'n', 'ta'])

    Returns:
        List of binary labels (0=no boundary, 1=boundary) for each character position.
        The label at position i indicates if there's a boundary after character i.
        The last character never receives a boundary label.
    """
    labels = [0] * len(word)
    idx = 0
    # Mark boundaries after each morpheme (except the last one)
    for morpheme in split[:-1]:
        idx += len(morpheme)
        # Only offsets strictly inside the word are boundaries. The lower bound
        # matters: with a leading empty morpheme idx stays 0, and labels[idx - 1]
        # would wrap around to labels[-1] and wrongly mark the final character.
        if 0 < idx < len(word):
            labels[idx - 1] = 1  # Boundary at the end of this morpheme
    return labels


In [34]:
# =========================
# PREPARE TRAINING DATA
# =========================
# Convert words to character sequences and generate boundary labels
# This prepares the data for the character-level BiLSTM model

# Character sequence per word (one list element per character)
gold_df['char_seq'] = gold_df['Word'].map(list)
# One 0/1 boundary label per character, derived from the gold morpheme split
gold_df['boundary_labels'] = [
    get_boundary_labels(word, morphemes)
    for word, morphemes in zip(gold_df['Word'], gold_df['Morph_split'])
]


In [35]:
# =========================
# PYTORCH IMPORTS AND SETUP
# =========================
# Import libraries for neural network training and data handling

import math
import random
import numpy as np
from typing import List, Tuple

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Set device (GPU if available, otherwise CPU)
# `device` is a notebook-level global: the training loop, the first
# predict_boundaries definition and load_model_checkpoint (map_location) read it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# =========================
# VOCABULARY CONSTRUCTION
# =========================
# Build character-level vocabulary for embedding layer
# Each unique character gets an integer ID

# Special tokens: padding and unknown-character placeholders
PAD = "<PAD>"
UNK = "<UNK>"

def build_vocab(seqs: List[List[str]]):
    """
    Create the character vocabulary used by the embedding layer.

    Args:
        seqs: List of character sequences (each sequence is a list of characters)

    Returns:
        Tuple of (stoi, itos):
        - stoi: Dictionary mapping character to integer ID
        - itos: List mapping integer ID to character; PAD is ID 0, UNK is ID 1,
          followed by the observed characters in sorted order
    """
    # Gather every distinct character seen in any sequence
    unique_chars = set()
    for seq in seqs:
        unique_chars.update(seq)

    # Special tokens first so that PAD is always ID 0 and UNK is always ID 1
    itos = [PAD, UNK]
    itos.extend(sorted(unique_chars))

    stoi = {}
    for index, ch in enumerate(itos):
        stoi[ch] = index
    return stoi, itos

# Build vocabulary from all character sequences in the gold data
# NOTE: this runs before the train/validation split below, so the vocabulary
# also covers characters that only occur in validation words.
stoi, itos = build_vocab(gold_df["char_seq"].tolist())
# The printed size includes the two special tokens (PAD and UNK)
print(f"Vocabulary size: {len(itos)} characters")

def encode(seq: List[str]) -> List[int]:
    """Map each character to its vocabulary ID; unseen characters map to the UNK ID."""
    ids = []
    for ch in seq:
        ids.append(stoi.get(ch, stoi[UNK]))
    return ids

def encode_labels(labels: List[int]) -> List[int]:
    """
    Identity encoder for boundary labels.

    Labels are already 0/1, so they are returned as-is (the same list object,
    not a copy). Kept as a function so label encoding mirrors `encode`.
    """
    return labels

# =========================
# DATASET AND DATALOADER
# =========================
# PyTorch Dataset and DataLoader for batching and padding sequences

class CharBoundaryDataset(Dataset):
    """
    Dataset of (character sequence, boundary labels) pairs.

    Reads the "char_seq" and "boundary_labels" columns of the given DataFrame
    once at construction time and serves the rows by position.
    """

    def __init__(self, df):
        # x: character sequences, y: matching boundary labels
        self.x, self.y = (
            df[column].tolist() for column in ("char_seq", "boundary_labels")
        )

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        sample = (self.x[idx], self.y[idx])
        return sample

def pad_batch(batch, pad_id=0):
    """
    DataLoader collate function: encode a batch and right-pad it to a common length.

    Args:
        batch: List of (character_sequence, boundary_labels) tuples
        pad_id: ID used to pad the character IDs (default: 0, the PAD token)

    Returns:
        Tuple of tensors:
        - Padded character IDs, LongTensor (B, T)
        - Padded boundary labels, FloatTensor (B, T); padding is 0 and is meant
          to be masked out by the loss
        - Validity mask, BoolTensor (B, T): True for real characters
        - Original sequence lengths, LongTensor (B,)
    """
    char_seqs, label_seqs = zip(*batch)
    encoded = [encode(chars) for chars in char_seqs]
    targets = [encode_labels(lab) for lab in label_seqs]
    lengths = [len(ids) for ids in encoded]
    width = max(lengths)  # T: length of the longest sequence in this batch

    x_rows, y_rows, mask_rows = [], [], []
    for ids, tgt in zip(encoded, targets):
        n_pad = width - len(ids)
        x_rows.append(ids + [pad_id] * n_pad)
        y_rows.append(tgt + [0] * (width - len(tgt)))
        mask_rows.append([1] * len(ids) + [0] * n_pad)

    x_tensor = torch.LongTensor(x_rows)
    y_tensor = torch.FloatTensor(y_rows)  # BCE expects float targets
    mask_tensor = torch.BoolTensor(mask_rows)
    length_tensor = torch.LongTensor(lengths)
    return (x_tensor, y_tensor, mask_tensor, length_tensor)

# =========================
# TRAIN/VALIDATION SPLIT
# =========================
# Split data into 90% training and 10% validation
# Shuffle row positions with a dedicated, seeded generator so the split is
# the same on every run (independent of numpy's global random state).
rng = np.random.default_rng(42)  # Fixed seed for reproducibility
indices = np.arange(len(gold_df))
rng.shuffle(indices)  # in-place shuffle of the position array
split = int(0.9*len(indices))  # first 90% of shuffled positions -> training
train_idx, val_idx = indices[:split], indices[split:]

# Positional selection (.iloc), so gaps in gold_df's index do not matter
train_df = gold_df.iloc[train_idx].reset_index(drop=True)
val_df   = gold_df.iloc[val_idx].reset_index(drop=True)

print(f"Training samples: {len(train_df):,}")
print(f"Validation samples: {len(val_df):,}")

train_ds = CharBoundaryDataset(train_df)
val_ds   = CharBoundaryDataset(val_df)

# NOTE: BATCH_SIZE is assigned again (same value) in the training cell; the
# loaders below are built with the value set here.
BATCH_SIZE = 64
# Only the training loader shuffles; validation order stays fixed
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, collate_fn=pad_batch)
val_loader   = DataLoader(val_ds,   batch_size=BATCH_SIZE, shuffle=False, collate_fn=pad_batch)

# =========================
# BILSTM MODEL ARCHITECTURE
# =========================
# Character-level BiLSTM for boundary prediction

class BiLSTMBoundary(nn.Module):
    """
    Bidirectional LSTM model for character-level boundary prediction.

    Architecture:
    1. Character embeddings (emb_dim dimensions, index 0 reserved for padding)
    2. Bidirectional LSTM (hidden_size per direction)
    3. Dropout for regularization (applied to the LSTM output)
    4. Linear output layer (one boundary logit per position)

    The model processes sequences character-by-character and outputs a logit
    for each position indicating the probability of a boundary after that character.
    """
    def __init__(self, vocab_size: int, emb_dim: int = 16, hidden_size: int = 16, num_layers: int = 1, dropout: float = 0.1):
        super().__init__()
        # Character embedding layer
        self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=0)

        # Bidirectional LSTM
        self.lstm = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            bidirectional=True,  # Process sequence in both directions
            batch_first=True,
        )
        self.dropout = nn.Dropout(dropout)
        # Output layer: 2*hidden_size because bidirectional LSTM concatenates forward/backward
        self.out = nn.Linear(hidden_size * 2, 1)  # Binary classification per time-step

    def forward(self, x, lengths):
        """
        Forward pass through the model.

        Args:
            x: Input character IDs (B, T) - Long tensor
            lengths: Actual length of each sequence (B,) - Long tensor;
                every length must be at least 1 (required by sequence packing)

        Returns:
            logits: Boundary prediction logits (B, T) - Float tensor.
            T always equals x.size(1), even when x is padded beyond the
            longest sequence in the batch.
        """
        emb = self.emb(x)  # (B, T, E) - Embed characters

        # Pack sequences to ignore padding during LSTM processing
        packed = nn.utils.rnn.pack_padded_sequence(emb, lengths.cpu(), batch_first=True, enforce_sorted=False)
        packed_out, _ = self.lstm(packed)
        # total_length pins the unpacked time dimension to the input's T. Without
        # it the output is only max(lengths) long, which no longer lines up with
        # targets/masks whenever the batch was padded past its longest sequence.
        out, _ = nn.utils.rnn.pad_packed_sequence(
            packed_out, batch_first=True, total_length=x.size(1)
        )  # (B, T, 2H)

        out = self.dropout(out)
        logits = self.out(out).squeeze(-1)  # (B, T) - One logit per character position
        return logits

# =========================
# LOSS FUNCTION
# =========================
# Masked binary cross-entropy loss (ignores padding positions)

def masked_bce_loss(logits, targets, mask):
    """
    Binary cross-entropy averaged over valid (non-padded) positions only.

    Args:
        logits: Model predictions (B, T)
        targets: Ground truth labels (B, T)
        mask: Boolean mask indicating valid positions (B, T)

    Returns:
        Scalar loss value: sum of per-position losses at valid positions
        divided by the number of valid positions (at least 1)
    """
    valid = mask.float()
    per_position = nn.functional.binary_cross_entropy_with_logits(
        logits, targets, reduction="none"
    )
    masked_total = (per_position * valid).sum()  # padding contributes zero
    n_valid = valid.sum().clamp_min(1.0)  # guard against an all-padding batch
    return masked_total / n_valid

# =========================
# EVALUATION METRICS
# =========================
# Functions to compute precision, recall, and F1 score for boundary prediction

def boundary_f1(logits, targets, mask, threshold=0.5):
    """
    Precision, recall and F1 of thresholded boundary predictions.

    Args:
        logits: Model predictions (B, T)
        targets: Ground truth labels (B, T)
        mask: Boolean mask indicating valid positions (B, T)
        threshold: Probability threshold for binary classification (default: 0.5)

    Returns:
        Tuple of (precision, recall, f1_score); each is 0.0 when its
        denominator is zero
    """
    def _ratio(numerator, denominator):
        # Safe division: undefined ratios are reported as 0.0
        return numerator / denominator if denominator > 0 else 0.0

    with torch.no_grad():
        predicted_pos = torch.sigmoid(logits) >= threshold
        gold = targets.long()
        valid = mask.long() == 1

        # Confusion counts restricted to valid (non-padded) positions
        tp = (predicted_pos & (gold == 1) & valid).sum().item()
        fp = (predicted_pos & (gold == 0) & valid).sum().item()
        fn = (~predicted_pos & (gold == 1) & valid).sum().item()

    prec = _ratio(tp, tp + fp)
    rec = _ratio(tp, tp + fn)
    if (prec + rec) > 0:
        f1 = 2*prec*rec / (prec + rec)
    else:
        f1 = 0.0
    return prec, rec, f1

# ==== 7) Inference: predict boundaries and reconstruct morphemes ====
def predict_boundaries(words: List[str], model, stoi, threshold=0.5) -> List[List[int]]:
    """
    Predict per-character boundary labels for a list of words.

    Args:
        words: Words to segment
        model: Trained boundary model, called as model(x, lengths) and
            expected to return logits of shape (B, T)
        stoi: Character-to-ID vocabulary containing the PAD and UNK tokens
        threshold: Probability at or above which a position counts as a boundary

    Returns:
        One list of 0/1 labels per input word, each as long as its word.
        An empty input list gives [], and an empty word gives [].
    """
    words = list(words)
    if not words:
        # max() below would fail on an empty batch
        return []

    model.eval()
    # Run on whatever device the model lives on instead of assuming the
    # notebook-global `device`; fall back to CPU for parameter-free models.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    unk_id, pad_id = stoi[UNK], stoi[PAD]
    x_ids = [[stoi.get(c, unk_id) for c in w] for w in words]

    out = [[] for _ in words]
    # Zero-length sequences cannot be packed by the LSTM, so only non-empty
    # words are sent through the model; empty words keep their [] result.
    non_empty = [i for i, ids in enumerate(x_ids) if ids]
    if not non_empty:
        return out

    lengths = [len(x_ids[i]) for i in non_empty]
    maxlen = max(lengths)
    x_pad = [x_ids[i] + [pad_id]*(maxlen - len(x_ids[i])) for i in non_empty]

    x = torch.LongTensor(x_pad).to(run_device)
    lengths_t = torch.LongTensor(lengths).to(run_device)

    with torch.no_grad():
        logits = model(x, lengths_t)
        preds = torch.sigmoid(logits) >= threshold

    # Trim padding and convert to 0/1; slicing to the true length makes an
    # explicit padding mask unnecessary.
    for row, (i, n) in enumerate(zip(non_empty, lengths)):
        out[i] = preds[row, :n].int().tolist()
    return out

def apply_boundaries(word: str, boundary_labels: List[int]) -> List[str]:
    """
    Cut a word into morphemes at the labelled boundaries.

    A label of 1 at position i marks the *end* of a morpheme at character i
    (same convention as get_boundary_labels). Any characters left after the
    last boundary form the final segment.
    """
    # Exclusive end offsets of every segment that is closed by a boundary label
    cut_points = [pos + 1 for pos, flag in enumerate(boundary_labels) if flag == 1]

    segs = []
    begin = 0
    for end in cut_points:
        segs.append(word[begin:end])
        begin = end
    if begin < len(word):
        segs.append(word[begin:])
    return segs

# Example usage:
# test_words = ["rikuchkani", "pikunas", "ñichkanchus"]
# pred_b = predict_boundaries(test_words, model, stoi, threshold=0.5)
# for w, b in zip(test_words, pred_b):
#     print(w, b, "->", apply_boundaries(w, b))



Device: cuda
Vocabulary size: 55 characters
Training samples: 6,206
Validation samples: 690


In [36]:
# =========================
# MODEL CHECKPOINTING FUNCTIONS
# =========================
# Functions to save and load trained models to avoid retraining

def generate_model_id(emb_dim, hidden_size, num_layers, dropout, epochs, batch_size, lr, weight_decay):
    """
    Derive a deterministic identifier from the training configuration.

    Args:
        emb_dim, hidden_size, num_layers, dropout, epochs, batch_size, lr,
        weight_decay: Training hyperparameters

    Returns:
        The first 16 hex characters of the MD5 digest of the key-sorted JSON
        encoding of the hyperparameters plus the current vocabulary size
        (len(itos), read from the notebook-global vocabulary)
    """
    hyperparams = dict(
        emb_dim=emb_dim,
        hidden_size=hidden_size,
        num_layers=num_layers,
        dropout=dropout,
        epochs=epochs,
        batch_size=batch_size,
        lr=lr,
        weight_decay=weight_decay,
        vocab_size=len(itos),
    )
    serialized = json.dumps(hyperparams, sort_keys=True)
    digest = hashlib.md5(serialized.encode()).hexdigest()
    return digest[:16]

def save_model_checkpoint(model, stoi, itos, model_id, models_folder=MODELS_FOLDER):
    """
    Write a model checkpoint and a small metadata file under models_folder/model_id.

    Args:
        model: Trained BiLSTMBoundary model
        stoi: String-to-index vocabulary dictionary
        itos: Index-to-string vocabulary list
        model_id: Unique identifier for this model
        models_folder: Folder to save models in

    Returns:
        Path of the directory the checkpoint was written to
    """
    target_dir = os.path.join(models_folder, model_id)
    os.makedirs(target_dir, exist_ok=True)

    # Weights and vocabulary are stored together so inference can rebuild both
    payload = {
        "model_state": model.state_dict(),
        "stoi": stoi,
        "itos": itos
    }
    torch.save(payload, os.path.join(target_dir, "bilstm_char_boundary.pt"))

    # Human-readable sidecar describing the checkpoint
    metadata = {
        'model_id': model_id,
        'vocab_size': len(itos),
        'model_name': MODEL_NAME
    }
    with open(os.path.join(target_dir, "metadata.json"), "w") as handle:
        json.dump(metadata, handle, indent=2)

    print(f"Model checkpoint saved to {target_dir}")
    return target_dir

def load_model_checkpoint(model_id, models_folder=MODELS_FOLDER):
    """
    Read a previously saved checkpoint for the given model identifier.

    Args:
        model_id: Unique identifier for the model
        models_folder: Folder where models are saved

    Returns:
        None when no checkpoint file exists; otherwise a dictionary with
        'model_state', 'stoi', 'itos', 'checkpoint_path' and 'model_dir'
    """
    source_dir = os.path.join(models_folder, model_id)
    ckpt_file = os.path.join(source_dir, "bilstm_char_boundary.pt")

    if os.path.exists(ckpt_file):
        # NOTE(review): torch.load unpickles the file; only load checkpoints
        # from a trusted location.
        stored = torch.load(ckpt_file, map_location=device)
        print(f"Model checkpoint loaded from {source_dir}")
        result = {key: stored[key] for key in ('model_state', 'stoi', 'itos')}
        result['checkpoint_path'] = ckpt_file
        result['model_dir'] = source_dir
        return result

    return None

# =========================
# MODEL INITIALIZATION AND TRAINING
# =========================
# Initialize model and optimizer, then train (or load if already trained)

# Model hyperparameters
# Model hyperparameters
EMB_DIM = 16
HIDDEN_SIZE = 32
NUM_LAYERS = 2
DROPOUT = 0.3
EPOCHS = 35
BATCH_SIZE = 64
LR = 1e-3
WEIGHT_DECAY = 1e-4

# Generate model identifier (hash of the hyperparameters and vocabulary size)
model_id = generate_model_id(EMB_DIM, HIDDEN_SIZE, NUM_LAYERS, DROPOUT, EPOCHS, BATCH_SIZE, LR, WEIGHT_DECAY)

# Try to load existing model
print(f"Checking for existing model with ID: {model_id}")
loaded = load_model_checkpoint(model_id, models_folder=MODELS_FOLDER)

if loaded is not None:
    print(f"✅ Found existing model! Loading from {loaded['model_dir']}")
    # Use the vocabulary stored with the checkpoint so IDs match the weights
    stoi = loaded['stoi']
    itos = loaded['itos']
    model = BiLSTMBoundary(vocab_size=len(itos), emb_dim=EMB_DIM, hidden_size=HIDDEN_SIZE,
                           num_layers=NUM_LAYERS, dropout=DROPOUT).to(device)
    model.load_state_dict(loaded['model_state'])
    model.eval()
    print("Model loaded successfully. Skipping training.")
else:
    print("No existing model found. Training new model...")

    # Seed PyTorch so weight initialisation, dropout and the training loader's
    # shuffling are repeatable (same seed value as the train/validation split).
    torch.manual_seed(42)

    # Initialize model and optimizer
    model = BiLSTMBoundary(vocab_size=len(itos), emb_dim=EMB_DIM, hidden_size=HIDDEN_SIZE,
                           num_layers=NUM_LAYERS, dropout=DROPOUT).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)

    # =========================
    # TRAINING LOOP
    # =========================
    # Train the model for specified number of epochs
    # Save checkpoint whenever validation F1 improves

    best_val_f1 = 0.0

    for epoch in range(1, EPOCHS+1):
        model.train()
        total_loss = 0.0
        total_tokens = 0
        for x, y, mask, lengths in train_loader:
            x = x.to(device)
            y = y.to(device)
            mask = mask.to(device)
            lengths = lengths.to(device)

            logits = model(x, lengths)
            loss = masked_bce_loss(logits, y, mask)

            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            # Weight each batch's mean loss by its token count so the epoch
            # figure is a per-token average
            total_loss += loss.item() * mask.sum().item()
            total_tokens += mask.sum().item()

        train_loss = total_loss / max(total_tokens, 1)

        # ---- Validation phase
        model.eval()
        val_loss, val_tokens = 0.0, 0
        all_prec, all_rec, all_f1 = [], [], []
        with torch.no_grad():
            for x, y, mask, lengths in val_loader:
                x = x.to(device)
                y = y.to(device)
                mask = mask.to(device)
                lengths = lengths.to(device)

                logits = model(x, lengths)
                loss = masked_bce_loss(logits, y, mask)
                val_loss += loss.item() * mask.sum().item()
                val_tokens += mask.sum().item()

                p, r, f = boundary_f1(logits, y, mask, threshold=0.5)
                all_prec.append(p); all_rec.append(r); all_f1.append(f)

        val_loss = val_loss / max(val_tokens, 1)
        # P/R/F1 are unweighted means of the per-batch scores
        prec = np.mean(all_prec) if all_prec else 0.0
        rec  = np.mean(all_rec)  if all_rec  else 0.0
        f1   = np.mean(all_f1)   if all_f1   else 0.0

        print(f"Epoch {epoch:02d} | train_loss={train_loss:.4f}  val_loss={val_loss:.4f}  P={prec:.3f} R={rec:.3f} F1={f1:.3f}")

        # Keep best model based on validation F1
        if f1 > best_val_f1:
            best_val_f1 = f1
            save_model_checkpoint(model, stoi, itos, model_id, models_folder=MODELS_FOLDER)
            print("  ↳ saved checkpoint (best F1 so far)")

    print(f"\nTraining complete! Best validation F1: {best_val_f1:.4f}")

    # At this point `model` holds the last epoch's weights, which are not
    # necessarily the best ones. Restore the best checkpoint (if any epoch
    # produced one) so both branches leave the same model in memory.
    best_checkpoint = load_model_checkpoint(model_id, models_folder=MODELS_FOLDER)
    if best_checkpoint is not None:
        model.load_state_dict(best_checkpoint['model_state'])
    model.eval()


Checking for existing model with ID: 6112ccdaef2e0c54
Model checkpoint loaded from models_segmenter-old\6112ccdaef2e0c54
✅ Found existing model! Loading from models_segmenter-old\6112ccdaef2e0c54
Model loaded successfully. Skipping training.


In [37]:
import pandas as pd
import re
import torch

# =========================
# LOAD TEST DATA
# =========================
# Load the test/accuracy evaluation dataset
print("Loading test data...")
df = pd.read_parquet(os.path.join(DATA_FOLDER, "cleaned_data_df.parquet"))
print(f"Loaded {len(df):,} test examples")

# Fail early with a clear message: the evaluation below reads both columns
missing_cols = {"Word", "Gold"} - set(df.columns)
if missing_cols:
    raise KeyError(f"Test data is missing required column(s): {sorted(missing_cols)}")

# =========================
# LOAD TRAINED MODEL
# =========================
# Load the best model checkpoint from the models folder
# Use the same model ID that was generated during training

# Model hyperparameters (must match training)
EMB_DIM = 16
HIDDEN_SIZE = 32
NUM_LAYERS = 2
DROPOUT = 0.3
EPOCHS = 35
BATCH_SIZE = 64
LR = 1e-3
WEIGHT_DECAY = 1e-4

# Generate the same model ID
model_id = generate_model_id(EMB_DIM, HIDDEN_SIZE, NUM_LAYERS, DROPOUT, EPOCHS, BATCH_SIZE, LR, WEIGHT_DECAY)

# Load checkpoint
loaded = load_model_checkpoint(model_id, models_folder=MODELS_FOLDER)
if loaded is None:
    raise FileNotFoundError(f"Model checkpoint not found. Please train the model first (model_id: {model_id})")

stoi, itos = loaded["stoi"], loaded["itos"]
# Rebuild the network with the same configuration as training, including the
# dropout rate (inactive in eval mode, but it keeps the two cells consistent).
# The model is left on the CPU here; it is not moved to `device`.
model = BiLSTMBoundary(vocab_size=len(itos), emb_dim=EMB_DIM, hidden_size=HIDDEN_SIZE,
                       num_layers=NUM_LAYERS, dropout=DROPOUT)
model.load_state_dict(loaded["model_state"])
model.eval()
print("Model loaded successfully for evaluation.")

# =========================
# EVALUATION HELPER FUNCTIONS
# =========================
# Functions for predicting boundaries and evaluating segmentation accuracy

def predict_boundaries(words, model, stoi, threshold=0.5):
    """
    Predict per-character boundary labels for a list of words.

    NOTE: this definition replaces the earlier predict_boundaries in this
    notebook once the cell has run.

    Args:
        words: Words to segment
        model: Trained boundary model, called as model(x, lengths) and
            expected to return logits of shape (B, T)
        stoi: Character-to-ID vocabulary containing "<PAD>" and "<UNK>"
        threshold: Probability at or above which a position counts as a boundary

    Returns:
        One list of 0/1 labels per input word, each as long as its word.
        An empty input list gives [], and an empty word gives [].
    """
    words = list(words)
    if not words:
        # max() below would fail on an empty batch
        return []

    model.eval()
    # Build inputs on the model's own device so the function works whether the
    # model was left on the CPU or moved to a GPU.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    unk_id, pad_id = stoi["<UNK>"], stoi["<PAD>"]
    x_ids = [[stoi.get(c, unk_id) for c in w] for w in words]

    out = [[] for _ in words]
    # Zero-length sequences cannot be packed by the LSTM, so only non-empty
    # words are sent through the model; empty words keep their [] result.
    non_empty = [i for i, ids in enumerate(x_ids) if ids]
    if not non_empty:
        return out

    lengths = [len(x_ids[i]) for i in non_empty]
    maxlen = max(lengths)
    x_pad = [x_ids[i] + [pad_id] * (maxlen - len(x_ids[i])) for i in non_empty]

    x = torch.LongTensor(x_pad).to(run_device)
    lengths_t = torch.LongTensor(lengths)

    with torch.no_grad():
        logits = model(x, lengths_t)
        preds = torch.sigmoid(logits) >= threshold

    # Slice each row back to its true length (drops padding positions)
    for row, (i, n) in enumerate(zip(non_empty, lengths)):
        out[i] = preds[row, :n].int().tolist()
    return out

def apply_boundaries(word, boundary_labels):
    """
    Split `word` into segments using per-character boundary labels.

    A label equal to 1 at position i closes a segment after character i.
    Whatever remains after the last boundary becomes the final segment.
    """
    pieces = []
    seg_start = 0
    for pos in range(len(boundary_labels)):
        if boundary_labels[pos] != 1:
            continue
        seg_end = pos + 1
        pieces.append(word[seg_start:seg_end])
        seg_start = seg_end

    # Trailing characters (if any) that were not closed by a boundary label
    tail = word[seg_start:]
    if tail:
        pieces.append(tail)
    return pieces


# =========================
# EVALUATION METRICS FUNCTIONS
# =========================
# Functions to compute various evaluation metrics for morphological segmentation

def is_correct_prediction(predicted, gold_variants):
    """
    Exact-match check of a predicted segmentation against the gold variants.

    Args:
        predicted: List of predicted morphemes
        gold_variants: List of gold segmentation variants

    Returns:
        True as soon as one gold variant equals the prediction, otherwise False
        (also False when there are no variants)
    """
    for variant in gold_variants:
        if predicted == variant:
            return True
    return False

def boundary_positions_from_labels(labels):
    """
    Collect the indices whose per-character label is 1, as a set.

    Only indices 0..L-2 (positions between characters) are considered; a label
    on the final character is ignored. Empty input gives an empty set.
    """
    if not labels:
        return set()
    # Drop the last label: there is no boundary after the final character
    return {pos for pos, flag in enumerate(labels[:-1]) if flag == 1}

def boundary_positions_from_segments(segments):
    """
    Turn a segment list into the set of boundary positions it implies.

    Each position is the index of the last character of a segment; the end of
    the final segment is not a boundary and is left out.
    """
    last_char_indices = []
    consumed = 0
    for seg in segments:
        consumed += len(seg)
        last_char_indices.append(consumed - 1)
    # Every segment end except the final one is an internal boundary
    return set(last_char_indices[:-1])

def prf_from_sets(pred_set, gold_set):
    """
    Compare predicted and gold boundary sets.

    Returns:
        (tp, fp, fn, precision, recall, f1). When both sets are empty all three
        scores are defined as 1.0; any other zero denominator gives 0.0.
    """
    tp = len(pred_set & gold_set)
    fp = len(pred_set - gold_set)
    fn = len(gold_set - pred_set)

    # Nothing predicted and nothing expected: count as a perfect match
    if tp + fp + fn == 0:
        return tp, fp, fn, 1.0, 1.0, 1.0

    precision = tp / (tp + fp) if tp + fp != 0 else 0.0
    recall = tp / (tp + fn) if tp + fn != 0 else 0.0
    if precision + recall == 0:
        f1 = 0.0
    else:
        f1 = 2 * precision * recall / (precision + recall)
    return tp, fp, fn, precision, recall, f1

def best_variant_metrics(pred_boundaries, gold_variants):
    """
    Score the prediction against every gold variant and keep the best one.

    Variants are ranked by F1, then by more true positives, fewer false
    negatives and fewer false positives; the earliest variant wins ties.
    With no variants at all, the prediction is scored against an empty
    boundary set.

    Returns: (best_gold_boundaries, tp, fp, fn, P, R, F1)
    """
    scored = []
    for variant in gold_variants:
        gold_b = boundary_positions_from_segments(variant)
        tp, fp, fn, P, R, F1 = prf_from_sets(pred_boundaries, gold_b)
        rank = (F1, tp, -fn, -fp)  # tie-breakers
        scored.append((rank, (gold_b, tp, fp, fn, P, R, F1)))

    if not scored:
        empty_gold = set()
        tp, fp, fn, P, R, F1 = prf_from_sets(pred_boundaries, empty_gold)
        return empty_gold, tp, fp, fn, P, R, F1

    # max() returns the first entry among equal ranks, i.e. the earliest variant
    _, winner = max(scored, key=lambda entry: entry[0])
    return winner

def normalize_gold_variants(gold_variants):
    """
    Return the gold variants as plain Python lists.

    None, and any value that is neither a list nor a numpy array, gives [].
    A numpy array is converted with .tolist(). Inside the resulting list, array
    variants are converted with .tolist(), list variants have their array
    items converted (one level deep), and anything else is kept unchanged.
    """
    def _unwrap(obj):
        # numpy array -> list; everything else passes through untouched
        return obj.tolist() if isinstance(obj, np.ndarray) else obj

    if gold_variants is None:
        return []

    gold_variants = _unwrap(gold_variants)
    if not isinstance(gold_variants, list):
        return []

    normalized = []
    for variant in gold_variants:
        if isinstance(variant, np.ndarray):
            cleaned = variant.tolist()
        elif isinstance(variant, list):
            cleaned = [_unwrap(item) for item in variant]
        else:
            cleaned = variant
        normalized.append(cleaned)
    return normalized

# =========================
# EVALUATION ON TEST SET
# =========================
# Predict boundaries for all test words and compute evaluation metrics

# Batch predict boundaries for all test words
# All test words are encoded and sent through the model as one single batch
all_words = df["Word"].tolist()
all_boundaries = predict_boundaries(all_words, model, stoi, threshold=0.5)

# Initialize metrics accumulators
results = []
micro_tp = micro_fp = micro_fn = 0  # Micro-averaged metrics (global counts)
macro_Ps, macro_Rs, macro_F1s = [], [], []  # Macro-averaged metrics (per-word averages)

# Evaluate each word
for word, gold_variants, boundary_labels in zip(all_words, df["Gold"], all_boundaries):
    # Normalize gold_variants (convert numpy arrays to lists)
    gold_variants = normalize_gold_variants(gold_variants)

    # Per-word predicted boundaries from labels (ignore last index)
    pred_b = boundary_positions_from_labels(boundary_labels)
    # Choose the best gold variant for boundary comparison
    gold_b, tp, fp, fn, P, R, F1 = best_variant_metrics(pred_b, gold_variants)

    # Also compute the predicted segment list for the exact-match accuracy
    predicted_segments = apply_boundaries(word, boundary_labels)
    correct = is_correct_prediction(predicted_segments, gold_variants)

    # One record per word; "Gold" stores the already-normalized variants
    results.append({
        "Word": word,
        "Prediction": predicted_segments,
        "Gold": gold_variants,
        "PredBoundaries": sorted(pred_b),
        "GoldBoundaries(Chosen)": sorted(gold_b),
        "TP": tp, "FP": fp, "FN": fn,
        "P_word": P, "R_word": R, "F1_word": F1,
        "CorrectExactSeg": correct
    })

    # Counts come from the best-matching gold variant of each word
    micro_tp += tp
    micro_fp += fp
    micro_fn += fn
    macro_Ps.append(P)
    macro_Rs.append(R)
    macro_F1s.append(F1)

results_df = pd.DataFrame(results)

# Exact segmentation accuracy: share of words whose segment list equals a gold variant
accuracy = results_df["CorrectExactSeg"].mean()

# Micro metrics (global)
# Zero-denominator convention matches prf_from_sets: 1.0 when there is
# nothing to predict and nothing predicted, otherwise 0.0.
if micro_tp + micro_fp == 0:
    P_micro = 1.0 if micro_tp + micro_fn == 0 else 0.0
else:
    P_micro = micro_tp / (micro_tp + micro_fp)
if micro_tp + micro_fn == 0:
    R_micro = 1.0 if micro_tp + micro_fp == 0 else 0.0
else:
    R_micro = micro_tp / (micro_tp + micro_fn)
if P_micro + R_micro == 0:
    F1_micro = 1.0 if (micro_tp + micro_fp + micro_fn) == 0 else 0.0
else:
    F1_micro = 2 * P_micro * R_micro / (P_micro + R_micro)

# Macro metrics (average of per-word scores)
P_macro = float(pd.Series(macro_Ps).mean()) if macro_Ps else 0.0
R_macro = float(pd.Series(macro_Rs).mean()) if macro_Rs else 0.0
F1_macro = float(pd.Series(macro_F1s).mean()) if macro_F1s else 0.0

print(f"Exact segmentation accuracy: {accuracy:.4f}")
print("Boundary metrics:")
print(f"  Micro  - P: {P_micro:.4f}  R: {R_micro:.4f}  F1: {F1_micro:.4f}")
print(f"  Macro  - P: {P_macro:.4f}  R: {R_macro:.4f}  F1: {F1_macro:.4f}")

# =========================
# SPLIT-COUNT ACCURACY METRICS
# =========================
# Additional metrics that measure how close the predicted number of morphemes
# is to the gold standard, even if the exact segmentation differs

def split_count_metrics(predicted_segments, gold_variants):
    """
    Compare the predicted morpheme count with the count of each gold variant.

    Args:
        predicted_segments: sequence of predicted morphemes for one word.
        gold_variants: iterable of gold segmentations, each a sequence of morphemes.

    Returns:
        dict of booleans keyed by:
        - "Exact": prediction has as many morphemes as at least one gold variant
        - "+1": prediction has exactly one morpheme more than at least one gold variant
        - "-1": prediction has exactly one morpheme fewer than at least one gold variant
        - "±1": prediction's count is within one of at least one gold variant
        Every flag is False when gold_variants is empty.
    """
    n_pred = len(predicted_segments)
    # Signed differences (predicted count minus gold count) over all gold variants.
    gaps = {n_pred - len(variant) for variant in gold_variants}

    return {
        "Exact": 0 in gaps,
        "+1": 1 in gaps,
        "-1": -1 in gaps,
        "±1": not gaps.isdisjoint((-1, 0, 1)),
    }


# ---- Annotate each result record with its split-count flags ----
for record in results:
    # Gold variants are normalized first (e.g. numpy arrays -> lists) before counting.
    counts = split_count_metrics(
        record["Prediction"],
        normalize_gold_variants(record["Gold"]),
    )

    # Keys are added in a fixed order so the resulting DataFrame columns stay stable.
    record.update({
        "CorrectSplitCount": counts["Exact"],
        "SplitCount+1": counts["+1"],
        "SplitCount-1": counts["-1"],
        "SplitCount±1": counts["±1"],
        # True only when the exact segmentation AND the morpheme count are both right.
        "OverlapExactAndSplit": record["CorrectExactSeg"] and counts["Exact"],
    })

# Per-metric flag lists (one entry per record) used by the aggregation step.
split_exact_flags = [r["CorrectSplitCount"] for r in results]
split_plus1_flags = [r["SplitCount+1"] for r in results]
split_minus1_flags = [r["SplitCount-1"] for r in results]
split_pm1_flags = [r["SplitCount±1"] for r in results]
overlap_flags = [r["OverlapExactAndSplit"] for r in results]


# ---- Aggregate metrics: share of records for which each flag is True ----
(
    split_exact_acc,
    split_plus1_acc,
    split_minus1_acc,
    split_pm1_acc,
    overlap_accuracy,
) = (
    np.mean(flags)
    for flags in (
        split_exact_flags,
        split_plus1_flags,
        split_minus1_flags,
        split_pm1_flags,
        overlap_flags,
    )
)

# ---- Print summary ----
print("\n=== Split-count metrics ===")
print(f"Split-count (Exact):          {split_exact_acc:.4f}")
print(f"Split-count (+1):             {split_plus1_acc:.4f}")
print(f"Split-count (−1):             {split_minus1_acc:.4f}")
print(f"Split-count (±1):             {split_pm1_acc:.4f}")
print(f"Overlap (Exact ∩ Split):      {overlap_accuracy:.4f}")

# =========================
# SAVE EVALUATION RESULTS
# =========================
# Rebuild the frame so it includes the split-count columns added to each record,
# then write it as CSV (without the index column) into the data folder.
results_df = pd.DataFrame(results)
results_output_path = os.path.join(DATA_FOLDER, "bilstm_eval_results.csv")
results_df.to_csv(results_output_path, index=False)
print(f"\nEvaluation results saved to {results_output_path}")



Loading test data...
Loaded 913 test examples
Model checkpoint loaded from models_segmenter-old\6112ccdaef2e0c54
Model loaded successfully for evaluation.
Exact segmentation accuracy: 0.5268
Boundary metrics:
  Micro  - P: 0.7963  R: 0.8397  F1: 0.8174
  Macro  - P: 0.8072  R: 0.8279  F1: 0.7993

=== Split-count metrics ===
Split-count (Exact):          0.6440
Split-count (+1):             0.1906
Split-count (−1):             0.1391
Split-count (±1):             0.9726
Overlap (Exact ∩ Split):      0.5268

Evaluation results saved to data\bilstm_eval_results.csv


In [38]:
# Preview the first 50 rows of the evaluation results table.
results_df.head(50)


Unnamed: 0,Word,Prediction,Gold,PredBoundaries,GoldBoundaries(Chosen),TP,FP,FN,P_word,R_word,F1_word,CorrectExactSeg,CorrectSplitCount,SplitCount+1,SplitCount-1,SplitCount±1,OverlapExactAndSplit
0,unupas,"[unupa, s]","[[unu, pas]]",[4],[2],0,1,1,0.0,0.0,0.0,False,True,False,False,True,False
1,umankus,"[uma, nku, s]","[[uma, nku, s]]","[2, 5]","[2, 5]",2,0,0,1.0,1.0,1.0,True,True,False,False,True,True
2,hikurin,"[hiku, ri, n]","[[hikuri, n]]","[3, 5]",[5],1,1,0,0.5,1.0,0.666667,False,False,True,False,True,False
3,sutipi,"[suti, pi]","[[suti, pi]]",[3],[3],1,0,0,1.0,1.0,1.0,True,True,False,False,True,True
4,pikunas,"[pi, kuna, s]","[[pi, kuna, s]]","[1, 5]","[1, 5]",2,0,0,1.0,1.0,1.0,True,True,False,False,True,True
5,atipaq,"[ati, paq]","[[ati, paq], [ati, pa, q]]",[2],[2],1,0,0,1.0,1.0,1.0,True,True,False,True,True,True
6,tomani,"[toma, ni]","[[toma, ni]]",[3],[3],1,0,0,1.0,1.0,1.0,True,True,False,False,True,True
7,rantiq,"[ranti, q]","[[ranti, q]]",[4],[4],1,0,0,1.0,1.0,1.0,True,True,False,False,True,True
8,imakunas,"[ima, kuna, s]","[[ima, kuna, s]]","[2, 6]","[2, 6]",2,0,0,1.0,1.0,1.0,True,True,False,False,True,True
9,chiqaq,"[chiqa, q]",[[chiqaq]],[4],[],0,1,0,0.0,0.0,0.0,False,False,True,False,True,False
