In [1]:
!pip install datasets seqeval torch tqdm

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=c3466bed497384eed88127527ca5066ff9652293f242f116ed03dac24e98a086
  Stored in directory: /root/.cache/pip/wheels/5f/b8/73/0b2c1a76b701a677653dd79ece07cfabd7457989dbfbdcd8d7
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [3]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fa.300.vec.gz
!gunzip cc.fa.300.vec.gz

--2025-11-22 11:14:22--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fa.300.vec.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 18.154.144.13, 18.154.144.87, 18.154.144.102, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|18.154.144.13|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1258183862 (1.2G) [binary/octet-stream]
Saving to: ‘cc.fa.300.vec.gz’


2025-11-22 11:14:32 (126 MB/s) - ‘cc.fa.300.vec.gz’ saved [1258183862/1258183862]



In [15]:
# -------------------------------
# 1. Imports
# -------------------------------
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from seqeval.metrics import classification_report, f1_score
from sklearn.preprocessing import LabelEncoder
from collections import Counter
from tqdm import tqdm
import numpy as np
import random

# -------------------------------
# 2. Set seeds
# -------------------------------
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
set_seed()

# -------------------------------
# 3. Load PEYMA dataset
# -------------------------------
dataset = load_dataset("AliFartout/PEYMA-ARMAN-Mixed")
train_data = dataset["train"]
val_data   = dataset["validation"]
test_data  = dataset["test"]

# -------------------------------
# 4. Label normalization to IOB2 format
# -------------------------------
def normalize_tags(tags):
    """Convert tags to IOB2 format (B-ORG, I-ORG) for seqeval compatibility"""
    return [t.replace("_", "-") for t in tags]

# CRITICAL: Normalize and collect labels at the same time
all_words, all_chars, all_labels = set(), set(), set()
normalized_data = {"train": [], "validation": [], "test": []}

for split_name, split in [("train", train_data), ("validation", val_data), ("test", test_data)]:
    for sample in split:
        tokens = sample["tokens"]
        normalized_tags = normalize_tags(sample["ner_tags_names"])

        # Store normalized version
        normalized_data[split_name].append({
            "tokens": tokens,
            "ner_tags_names": normalized_tags
        })

        # Collect vocabulary
        for w, t in zip(tokens, normalized_tags):
            all_words.add(w)
            all_labels.add(t)
            all_chars.update(list(w))

# Replace original data with normalized data
train_data = normalized_data["train"]
val_data = normalized_data["validation"]
test_data = normalized_data["test"]

# -------------------------------
# 5. Build vocabularies
# -------------------------------

word2id = {"<PAD>":0, "<UNK>":1}
for w in all_words:
    word2id[w] = len(word2id)

char2id = {"<PAD>":0, "<UNK>":1}
for ch in all_chars:
    char2id[ch] = len(char2id)

# Build label encoder AFTER normalization
# Ensure "O" label is included
all_labels.add("O")  # Critical: add O label if missing
label_encoder = LabelEncoder()
label_encoder.fit(sorted(list(all_labels)))  # Sort for consistency

# Create mapping for quick inverse transform - NOW CORRECT
id2label = {i: label for i, label in enumerate(label_encoder.classes_)}
print(f"Sample labels: {list(id2label.values())[:10]}")  # Verify format
print(f"Total labels: {len(id2label)}")
print(f"Has O label: {'O' in id2label.values()}")
print(f"Label distribution in training: {Counter([l for sample in train_data for l in sample['ner_tags_names']]).most_common(5)}")

# -------------------------------
# 6. Load FastText embeddings
# -------------------------------
EMB_DIM = 300
fasttext_path = "/content/cc.fa.300.vec"

print("Loading FastText vectors...")
fasttext = {}
with open(fasttext_path, "r", encoding="utf-8", errors="ignore") as f:
    next(f)  # skip header
    for line in tqdm(f, total=2000000):
        parts = line.rstrip().split(" ")
        if len(parts) < EMB_DIM + 1:
            continue
        fasttext[parts[0]] = np.asarray(parts[1:], dtype="float32")

embedding_matrix = np.zeros((len(word2id), EMB_DIM), dtype="float32")
oov_count = 0
for word, idx in word2id.items():
    if word in fasttext:
        embedding_matrix[idx] = fasttext[word]
    else:
        embedding_matrix[idx] = np.random.normal(scale=0.1, size=(EMB_DIM,))
        oov_count += 1
print(f"OOV words: {oov_count} / {len(word2id)}")
embedding_matrix = torch.tensor(embedding_matrix)

# -------------------------------
# 7. Dataset + batching
# -------------------------------
class NERDataset(Dataset):
    def __init__(self, split):
        self.data = split

    def __getitem__(self, idx):
        tokens = self.data[idx]["tokens"]
        labels = self.data[idx]["ner_tags_names"]

        word_ids = torch.tensor([word2id.get(w, 1) for w in tokens])
        char_ids = [torch.tensor([char2id.get(c, 1) for c in w]) for w in tokens]
        label_ids = torch.tensor(label_encoder.transform(labels))

        return word_ids, char_ids, label_ids

    def __len__(self):
        return len(self.data)

def pad_batch(batch):
    word_seqs, char_seqs, label_seqs = zip(*batch)
    max_len = max(len(w) for w in word_seqs)

    padded_words, padded_labels, padded_chars, masks = [], [], [], []

    # Use -100 as padding label (standard in PyTorch for ignoring in loss)
    PAD_LABEL = -100

    for w, cseq, l in zip(word_seqs, char_seqs, label_seqs):
        pad_len = max_len - len(w)
        padded_words.append(torch.cat([w, torch.zeros(pad_len, dtype=torch.long)]))
        padded_labels.append(torch.cat([l, torch.full((pad_len,), PAD_LABEL, dtype=torch.long)]))

        # Create mask: 1 for real tokens, 0 for padding
        mask = torch.cat([torch.ones(len(w)), torch.zeros(pad_len)])
        masks.append(mask)

        cseq = cseq + [torch.zeros(1, dtype=torch.long)] * pad_len
        padded_chars.append(cseq)

    max_char_len = max(len(c) for seq in padded_chars for c in seq)
    final_chars = []
    for seq in padded_chars:
        padded = []
        for c in seq:
            pad_len = max_char_len - len(c)
            padded.append(torch.cat([c, torch.zeros(pad_len, dtype=torch.long)]))
        final_chars.append(torch.stack(padded))

    return (torch.stack(padded_words),
            torch.stack(final_chars),
            torch.stack(padded_labels),
            torch.stack(masks))

train_loader = DataLoader(NERDataset(train_data), batch_size=16, shuffle=True, collate_fn=pad_batch)
val_loader   = DataLoader(NERDataset(val_data), batch_size=16, collate_fn=pad_batch)
test_loader  = DataLoader(NERDataset(test_data), batch_size=16, collate_fn=pad_batch)

# -------------------------------
# 8. Improved CNN + BiLSTM model
# -------------------------------
class CNN_BiLSTM_NER(nn.Module):
    def __init__(self, embedding_matrix, char_vocab, num_labels, dropout=0.5):
        super().__init__()
        vocab_size, emb_dim = embedding_matrix.shape

        # Unfreeze embeddings to allow fine-tuning
        self.word_emb = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        self.word_dropout = nn.Dropout(dropout)

        # Character-level CNN
        self.char_emb = nn.Embedding(char_vocab, 30, padding_idx=0)
        self.char_cnn = nn.Conv1d(30, 100, kernel_size=3, padding=1)
        self.char_dropout = nn.Dropout(dropout)

        # 2-layer BiLSTM
        self.lstm = nn.LSTM(
            input_size=emb_dim + 100,
            hidden_size=256,
            num_layers=2,
            bidirectional=True,
            batch_first=True,
            dropout=dropout
        )

        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(512, num_labels)

    def forward(self, words, chars):
        # Word embeddings
        word_embed = self.word_dropout(self.word_emb(words))

        # Character embeddings and CNN
        B, L, C = chars.shape
        chars = chars.long().view(B*L, C)
        char_emb = self.char_emb(chars).transpose(1, 2)
        char_feat = torch.max(torch.relu(self.char_cnn(char_emb)), dim=2).values
        char_feat = self.char_dropout(char_feat.view(B, L, 100))

        # Combine word and character features
        combined = torch.cat([word_embed, char_feat], dim=-1)

        # BiLSTM
        lstm_out, _ = self.lstm(combined)
        lstm_out = self.dropout(lstm_out)

        return self.fc(lstm_out)

# -------------------------------
# 9. Prepare weighted loss
# -------------------------------
# Calculate class weights for imbalanced data
all_train_labels = []
for sample in train_data:
    all_train_labels.extend(sample["ner_tags_names"])

label_counts = Counter(all_train_labels)
total = sum(label_counts.values())

# Calculate inverse frequency weights
weights = []
for label in label_encoder.classes_:
    if label in label_counts:
        # Use square root of inverse frequency for smoother weighting
        weight = (total / label_counts[label]) ** 0.5
    else:
        weight = 1.0
    weights.append(weight)

weights_tensor = torch.tensor(weights, dtype=torch.float)

# Normalize weights so O label isn't too dominant
max_weight = max(weights)
weights_tensor = weights_tensor / max_weight

print(f"Label weights (first 10): {weights_tensor[:10].tolist()}")
print(f"O label weight: {weights_tensor[list(label_encoder.classes_).index('O') if 'O' in label_encoder.classes_ else -1]}")

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

model = CNN_BiLSTM_NER(embedding_matrix, len(char2id), len(label_encoder.classes_)).to(device)
criterion = nn.CrossEntropyLoss(weight=weights_tensor.to(device), ignore_index=-100)  # Ignore padding
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=2)

# -------------------------------
# 10. Helper function to convert predictions to labels
# -------------------------------
def convert_to_labels(predictions, labels):
    """Convert tensor predictions to label lists, filtering padding"""
    pred_labels = []
    true_labels = []

    for pred_seq, true_seq in zip(predictions, labels):
        pred_seq = pred_seq.cpu().tolist()
        true_seq = true_seq.cpu().tolist()

        # Filter out padding (label -100)
        pred_tags = []
        true_tags = []
        for p, t in zip(pred_seq, true_seq):
            if t == -100:  # Skip padding
                continue
            pred_tags.append(id2label[p])
            true_tags.append(id2label[t])

        if pred_tags:  # Only add non-empty sequences
            pred_labels.append(pred_tags)
            true_labels.append(true_tags)

    return pred_labels, true_labels

# -------------------------------
# 11. Training with validation F1
# -------------------------------
best_val_f1 = 0
patience_counter = 0
patience = 5
num_epochs = 20

print("\nStarting training...")
print(f"Total parameters: {sum(p.numel() for p in model.parameters()):,}")
print(f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}\n")

for epoch in range(num_epochs):
    # Training
    model.train()
    train_loss = 0
    for words, chars, labels, masks in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Train]"):
        words, chars, labels = words.to(device), chars.to(device), labels.to(device)

        optimizer.zero_grad()
        logits = model(words, chars)
        loss = criterion(logits.view(-1, logits.size(-1)), labels.view(-1))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
        optimizer.step()

        train_loss += loss.item()

    train_loss /= len(train_loader)

    # Validation
    model.eval()
    val_loss = 0
    y_val_true, y_val_pred = [], []

    with torch.no_grad():
        for words, chars, labels, masks in tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Val]"):
            words, chars, labels = words.to(device), chars.to(device), labels.to(device)
            logits = model(words, chars)
            loss = criterion(logits.view(-1, logits.size(-1)), labels.view(-1))
            val_loss += loss.item()

            pred = logits.argmax(-1)
            pred_labels, true_labels = convert_to_labels(pred, labels)
            y_val_pred.extend(pred_labels)
            y_val_true.extend(true_labels)

    val_loss /= len(val_loader)
    val_f1 = f1_score(y_val_true, y_val_pred, average="micro")

    print(f"Epoch {epoch+1:02d} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Val F1: {val_f1:.4f}")

    # Learning rate scheduling
    scheduler.step(val_f1)

    # Early stopping & checkpoint
    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        torch.save(model.state_dict(), "best_model.pt")
        patience_counter = 0
        print(f"  → New best model saved! (F1: {val_f1:.4f})")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"\nEarly stopping triggered after {epoch+1} epochs!")
            break

print(f"\nTraining completed. Best validation F1: {best_val_f1:.4f}")

# -------------------------------
# 12. Test evaluation
# -------------------------------
print("\nLoading best model for test evaluation...")
model.load_state_dict(torch.load("best_model.pt"))
model.eval()

y_true, y_pred = [], []
with torch.no_grad():
    for words, chars, labels, masks in tqdm(test_loader, desc="Testing"):
        words, chars, labels = words.to(device), chars.to(device), labels.to(device)
        logits = model(words, chars)
        pred = logits.argmax(-1)

        pred_labels, true_labels = convert_to_labels(pred, labels)
        y_pred.extend(pred_labels)
        y_true.extend(true_labels)

print("\n" + "="*80)
print("FINAL TEST RESULTS")
print("="*80)
print("\nClassification Report:")
print(classification_report(y_true, y_pred, digits=4))
print("\nAggregate Scores:")
print(f"  Micro F1:    {f1_score(y_true, y_pred, average='micro'):.4f}")
print(f"  Macro F1:    {f1_score(y_true, y_pred, average='macro'):.4f}")
print(f"  Weighted F1: {f1_score(y_true, y_pred, average='weighted'):.4f}")
print("="*80)

Sample labels: [np.str_('B-DAT'), np.str_('B-EVE'), np.str_('B-FAC'), np.str_('B-LOC'), np.str_('B-MON'), np.str_('B-ORG'), np.str_('B-PCT'), np.str_('B-PER'), np.str_('B-PRO'), np.str_('B-TIM')]
Total labels: 21
Has O label: True
Label distribution in training: [('O', 745941), ('I-ORG', 21230), ('B-ORG', 15719), ('B-LOC', 12976), ('B-PER', 11335)]
Loading FastText vectors...


100%|██████████| 2000000/2000000 [02:40<00:00, 12426.33it/s]


OOV words: 7493 / 29408
Label weights (first 10): [0.3843235373497009, 0.4012308716773987, 0.40794211626052856, 0.13079950213432312, 0.7055195569992065, 0.11884038150310516, 0.9135570526123047, 0.13994769752025604, 0.3593672811985016, 1.0]
O label weight: 0.01725139655172825
Using device: cuda

Starting training...
Total parameters: 11,770,777
Trainable parameters: 11,770,777



Epoch 1/20 [Train]: 100%|██████████| 1649/1649 [01:04<00:00, 25.38it/s]
Epoch 1/20 [Val]: 100%|██████████| 206/206 [00:07<00:00, 26.88it/s]


Epoch 01 | Train Loss: 0.7104 | Val Loss: 0.3488 | Val F1: 0.6298
  → New best model saved! (F1: 0.6298)


Epoch 2/20 [Train]: 100%|██████████| 1649/1649 [01:04<00:00, 25.57it/s]
Epoch 2/20 [Val]: 100%|██████████| 206/206 [00:06<00:00, 30.24it/s]


Epoch 02 | Train Loss: 0.2648 | Val Loss: 0.2449 | Val F1: 0.6955
  → New best model saved! (F1: 0.6955)


Epoch 3/20 [Train]: 100%|██████████| 1649/1649 [01:04<00:00, 25.67it/s]
Epoch 3/20 [Val]: 100%|██████████| 206/206 [00:05<00:00, 34.54it/s]


Epoch 03 | Train Loss: 0.1637 | Val Loss: 0.2051 | Val F1: 0.7263
  → New best model saved! (F1: 0.7263)


Epoch 4/20 [Train]: 100%|██████████| 1649/1649 [01:05<00:00, 25.19it/s]
Epoch 4/20 [Val]: 100%|██████████| 206/206 [00:05<00:00, 38.52it/s]


Epoch 04 | Train Loss: 0.1148 | Val Loss: 0.1830 | Val F1: 0.7906
  → New best model saved! (F1: 0.7906)


Epoch 5/20 [Train]: 100%|██████████| 1649/1649 [01:05<00:00, 25.03it/s]
Epoch 5/20 [Val]: 100%|██████████| 206/206 [00:05<00:00, 36.36it/s]


Epoch 05 | Train Loss: 0.0887 | Val Loss: 0.1968 | Val F1: 0.7867


Epoch 6/20 [Train]: 100%|██████████| 1649/1649 [01:06<00:00, 24.97it/s]
Epoch 6/20 [Val]: 100%|██████████| 206/206 [00:06<00:00, 30.97it/s]


Epoch 06 | Train Loss: 0.0713 | Val Loss: 0.1775 | Val F1: 0.8199
  → New best model saved! (F1: 0.8199)


Epoch 7/20 [Train]: 100%|██████████| 1649/1649 [01:04<00:00, 25.64it/s]
Epoch 7/20 [Val]: 100%|██████████| 206/206 [00:07<00:00, 29.17it/s]


Epoch 07 | Train Loss: 0.0514 | Val Loss: 0.1885 | Val F1: 0.8348
  → New best model saved! (F1: 0.8348)


Epoch 8/20 [Train]: 100%|██████████| 1649/1649 [01:04<00:00, 25.65it/s]
Epoch 8/20 [Val]: 100%|██████████| 206/206 [00:06<00:00, 33.36it/s]


Epoch 08 | Train Loss: 0.0476 | Val Loss: 0.1788 | Val F1: 0.8236


Epoch 9/20 [Train]: 100%|██████████| 1649/1649 [01:04<00:00, 25.49it/s]
Epoch 9/20 [Val]: 100%|██████████| 206/206 [00:05<00:00, 39.00it/s]


Epoch 09 | Train Loss: 0.0402 | Val Loss: 0.1832 | Val F1: 0.8410
  → New best model saved! (F1: 0.8410)


Epoch 10/20 [Train]: 100%|██████████| 1649/1649 [01:06<00:00, 24.89it/s]
Epoch 10/20 [Val]: 100%|██████████| 206/206 [00:05<00:00, 37.90it/s]


Epoch 10 | Train Loss: 0.0346 | Val Loss: 0.2008 | Val F1: 0.8561
  → New best model saved! (F1: 0.8561)


Epoch 11/20 [Train]: 100%|██████████| 1649/1649 [01:05<00:00, 25.33it/s]
Epoch 11/20 [Val]: 100%|██████████| 206/206 [00:06<00:00, 30.68it/s]


Epoch 11 | Train Loss: 0.0297 | Val Loss: 0.2235 | Val F1: 0.8567
  → New best model saved! (F1: 0.8567)


Epoch 12/20 [Train]: 100%|██████████| 1649/1649 [01:04<00:00, 25.59it/s]
Epoch 12/20 [Val]: 100%|██████████| 206/206 [00:07<00:00, 28.69it/s]


Epoch 12 | Train Loss: 0.0290 | Val Loss: 0.2354 | Val F1: 0.8632
  → New best model saved! (F1: 0.8632)


Epoch 13/20 [Train]: 100%|██████████| 1649/1649 [01:04<00:00, 25.52it/s]
Epoch 13/20 [Val]: 100%|██████████| 206/206 [00:06<00:00, 32.28it/s]


Epoch 13 | Train Loss: 0.0253 | Val Loss: 0.2376 | Val F1: 0.8471


Epoch 14/20 [Train]: 100%|██████████| 1649/1649 [01:04<00:00, 25.48it/s]
Epoch 14/20 [Val]: 100%|██████████| 206/206 [00:05<00:00, 37.88it/s]


Epoch 14 | Train Loss: 0.0233 | Val Loss: 0.2661 | Val F1: 0.8391


Epoch 15/20 [Train]: 100%|██████████| 1649/1649 [01:06<00:00, 24.89it/s]
Epoch 15/20 [Val]: 100%|██████████| 206/206 [00:05<00:00, 38.76it/s]


Epoch 15 | Train Loss: 0.0216 | Val Loss: 0.2606 | Val F1: 0.8521


Epoch 16/20 [Train]: 100%|██████████| 1649/1649 [01:05<00:00, 25.29it/s]
Epoch 16/20 [Val]: 100%|██████████| 206/206 [00:06<00:00, 32.00it/s]


Epoch 16 | Train Loss: 0.0142 | Val Loss: 0.2487 | Val F1: 0.8653
  → New best model saved! (F1: 0.8653)


Epoch 17/20 [Train]: 100%|██████████| 1649/1649 [01:04<00:00, 25.38it/s]
Epoch 17/20 [Val]: 100%|██████████| 206/206 [00:07<00:00, 28.94it/s]


Epoch 17 | Train Loss: 0.0110 | Val Loss: 0.2669 | Val F1: 0.8640


Epoch 18/20 [Train]: 100%|██████████| 1649/1649 [01:04<00:00, 25.40it/s]
Epoch 18/20 [Val]: 100%|██████████| 206/206 [00:07<00:00, 28.12it/s]


Epoch 18 | Train Loss: 0.0087 | Val Loss: 0.2812 | Val F1: 0.8792
  → New best model saved! (F1: 0.8792)


Epoch 19/20 [Train]: 100%|██████████| 1649/1649 [01:04<00:00, 25.47it/s]
Epoch 19/20 [Val]: 100%|██████████| 206/206 [00:06<00:00, 32.83it/s]


Epoch 19 | Train Loss: 0.0076 | Val Loss: 0.2791 | Val F1: 0.8671


Epoch 20/20 [Train]: 100%|██████████| 1649/1649 [01:04<00:00, 25.38it/s]
Epoch 20/20 [Val]: 100%|██████████| 206/206 [00:05<00:00, 38.04it/s]


Epoch 20 | Train Loss: 0.0076 | Val Loss: 0.2877 | Val F1: 0.8730

Training completed. Best validation F1: 0.8792

Loading best model for test evaluation...


Testing: 100%|██████████| 206/206 [00:07<00:00, 27.73it/s]



FINAL TEST RESULTS

Classification Report:
              precision    recall  f1-score   support

         DAT     0.5425    0.7486    0.6291       179
         EVE     0.8843    0.9817    0.9304       218
         FAC     0.8552    1.0000    0.9219       124
         LOC     0.8615    0.9224    0.8909      1855
         MON     0.7778    0.8235    0.8000        51
         ORG     0.8709    0.9164    0.8931      2010
         PCT     0.7576    0.9259    0.8333        27
         PER     0.8749    0.9429    0.9076      1558
         PRO     0.8328    0.9929    0.9058       281
         TIM     0.5357    0.5556    0.5455        27

   micro avg     0.8530    0.9250    0.8875      6330
   macro avg     0.7793    0.8810    0.8258      6330
weighted avg     0.8557    0.9250    0.8885      6330


Aggregate Scores:
  Micro F1:    0.8875
  Macro F1:    0.8258
  Weighted F1: 0.8885


In [16]:
# Inspect prediction errors
import random

print("="*80)
print("ANALYZING PREDICTION ERRORS")
print("="*80)

model.eval()
error_examples = []

with torch.no_grad():
    for words, chars, labels, masks in test_loader:
        words, chars, labels = words.to(device), chars.to(device), labels.to(device)
        logits = model(words, chars)
        pred = logits.argmax(-1)

        # Convert to labels
        for i, (pred_seq, true_seq, word_seq) in enumerate(zip(pred, labels, words)):
            pred_seq = pred_seq.cpu().tolist()
            true_seq = true_seq.cpu().tolist()
            word_seq = word_seq.cpu().tolist()

            # Get actual tokens (reverse lookup in word2id)
            id2word = {v: k for k, v in word2id.items()}
            tokens = [id2word.get(w, '<UNK>') for w in word_seq]

            pred_tags = []
            true_tags = []
            token_list = []
            has_error = False

            for p, t, tok in zip(pred_seq, true_seq, tokens):
                if t == -100:  # Skip padding
                    continue
                pred_tag = id2label[p]
                true_tag = id2label[t]

                if pred_tag != true_tag:
                    has_error = True

                pred_tags.append(pred_tag)
                true_tags.append(true_tag)
                token_list.append(tok)

            if has_error and len(token_list) > 0:
                error_examples.append({
                    'tokens': token_list,
                    'predicted': pred_tags,
                    'true': true_tags
                })

print(f"\nFound {len(error_examples)} sequences with errors")
print("\n" + "="*80)
print("RANDOM ERROR EXAMPLES (10 samples)")
print("="*80)

# Show 10 random error examples
for idx, example in enumerate(random.sample(error_examples, min(10, len(error_examples))), 1):
    print(f"\n--- Example {idx} ---")
    print(f"Tokens:    {' '.join(example['tokens'][:20])}")  # Show first 20 tokens
    print(f"Predicted: {' '.join(example['predicted'][:20])}")
    print(f"True:      {' '.join(example['true'][:20])}")

    # Highlight differences
    errors = []
    for i, (tok, pred, true) in enumerate(zip(example['tokens'][:20], example['predicted'][:20], example['true'][:20])):
        if pred != true:
            errors.append(f"  Position {i}: '{tok}' → Predicted: {pred}, True: {true}")

    if errors:
        print("Errors:")
        for err in errors[:5]:  # Show max 5 errors per example
            print(err)

print("\n" + "="*80)
print("ERROR PATTERN ANALYSIS")
print("="*80)

# Analyze common error patterns
all_errors = []
for ex in error_examples:
    for tok, pred, true in zip(ex['tokens'], ex['predicted'], ex['true']):
        if pred != true:
            all_errors.append((true, pred))

error_counts = Counter(all_errors)
print("\nMost common errors (True → Predicted):")
for (true_label, pred_label), count in error_counts.most_common(20):
    print(f"  {true_label:15s} → {pred_label:15s} : {count:4d} times")

ANALYZING PREDICTION ERRORS

Found 568 sequences with errors

RANDOM ERROR EXAMPLES (10 samples)

--- Example 1 ---
Tokens:    تازه یا خشک کرده » ، « زعفران در بسته\u200cبندی بیش از 30 گرم » و « انجیر ، تازه
Predicted: O O O O O O O O O O O O O O O O O B-PRO O O
True:      O O O O O O O O O O O O O O O O O O O O
Errors:
  Position 17: 'انجیر' → Predicted: B-PRO, True: O

--- Example 2 ---
Tokens:    برنامه نیستان رادیو فرهنگ ، برنامه\u200cای با موضوع موسیقی ایرانی است که هر روز ساعت 14:30 پخش می\u200cشود .
Predicted: O O O I-PER O O O O O O O O O O B-TIM I-TIM O O O
True:      O O B-ORG I-ORG O O O O O O O O O O B-TIM I-TIM O O O
Errors:
  Position 2: 'رادیو' → Predicted: O, True: B-ORG
  Position 3: 'فرهنگ' → Predicted: I-PER, True: I-ORG

--- Example 3 ---
Tokens:    این استودیو شامل چند واحد تولید ، یک شهربازی و مرکز آموزش خواهد بود .
Predicted: O O O O O O O O O O B-ORG I-ORG O O O
True:      O O O O O O O O O O O O O O O
Errors:
  Position 10: 'مرکز' → Predicted: B-ORG, True: O
  