In [2]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=ceb77ff71ee4f8aacfd3be44333522c8a0dc9848fcf7fec8d4a33c623540d987
  Stored in directory: /root/.cache/pip/wheels/5f/b8/73/0b2c1a76b701a677653dd79ece07cfabd7457989dbfbdcd8d7
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [3]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from seqeval.metrics import classification_report, f1_score
from sklearn.preprocessing import LabelEncoder
from collections import Counter
from tqdm import tqdm
import numpy as np
import random

In [4]:
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
set_seed()

In [5]:
dataset = load_dataset("AliFartout/PEYMA-ARMAN-Mixed")
train_data = dataset["train"]
val_data   = dataset["validation"]
test_data  = dataset["test"]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/3.31M [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/431k [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/423k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/26384 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3296 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3296 [00:00<?, ? examples/s]

In [6]:
def normalize_tags(tags):
    """Convert tags to IOB2 format (B-ORG, I-ORG) for seqeval compatibility"""
    return [t.replace("_", "-") for t in tags]

# CRITICAL: Normalize and collect labels at the same time
all_words, all_chars, all_labels = set(), set(), set()
normalized_data = {"train": [], "validation": [], "test": []}

for split_name, split in [("train", train_data), ("validation", val_data), ("test", test_data)]:
    for sample in split:
        tokens = sample["tokens"]
        normalized_tags = normalize_tags(sample["ner_tags_names"])

        # Store normalized version
        normalized_data[split_name].append({
            "tokens": tokens,
            "ner_tags_names": normalized_tags
        })

        # Collect vocabulary
        for w, t in zip(tokens, normalized_tags):
            all_words.add(w)
            all_labels.add(t)
            all_chars.update(list(w))

# Replace original data with normalized data
train_data = normalized_data["train"]
val_data = normalized_data["validation"]
test_data = normalized_data["test"]

In [7]:
word2id = {"<PAD>":0, "<UNK>":1}
for w in all_words:
    word2id[w] = len(word2id)

char2id = {"<PAD>":0, "<UNK>":1}
for ch in all_chars:
    char2id[ch] = len(char2id)

# Build label encoder AFTER normalization
all_labels.add("O")  # Ensure O label exists
label_encoder = LabelEncoder()
label_encoder.fit(sorted(list(all_labels)))

# Create mapping for quick inverse transform
id2label = {i: label for i, label in enumerate(label_encoder.classes_)}
print(f"Sample labels: {list(id2label.values())[:10]}")
print(f"Total labels: {len(id2label)}")
print(f"Has O label: {'O' in id2label.values()}")


Sample labels: [np.str_('B-DAT'), np.str_('B-EVE'), np.str_('B-FAC'), np.str_('B-LOC'), np.str_('B-MON'), np.str_('B-ORG'), np.str_('B-PCT'), np.str_('B-PER'), np.str_('B-PRO'), np.str_('B-TIM')]
Total labels: 21
Has O label: True


In [9]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fa.300.vec.gz
!gunzip cc.fa.300.vec.gz

--2025-11-23 10:55:38--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fa.300.vec.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 3.163.189.108, 3.163.189.14, 3.163.189.51, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|3.163.189.108|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1258183862 (1.2G) [binary/octet-stream]
Saving to: ‘cc.fa.300.vec.gz’


2025-11-23 10:55:55 (72.9 MB/s) - ‘cc.fa.300.vec.gz’ saved [1258183862/1258183862]



In [10]:
#  Load FastText embeddings
# -------------------------------
EMB_DIM = 300
fasttext_path = "/content/cc.fa.300.vec"

print("Loading FastText vectors...")
fasttext = {}
with open(fasttext_path, "r", encoding="utf-8", errors="ignore") as f:
    next(f)  # skip header
    for line in tqdm(f, total=2000000):
        parts = line.rstrip().split(" ")
        if len(parts) < EMB_DIM + 1:
            continue
        fasttext[parts[0]] = np.asarray(parts[1:], dtype="float32")

embedding_matrix = np.zeros((len(word2id), EMB_DIM), dtype="float32")
oov_count = 0
for word, idx in word2id.items():
    if word in fasttext:
        embedding_matrix[idx] = fasttext[word]
    else:
        embedding_matrix[idx] = np.random.normal(scale=0.1, size=(EMB_DIM,))
        oov_count += 1
print(f"OOV words: {oov_count} / {len(word2id)}")
embedding_matrix = torch.tensor(embedding_matrix)


Loading FastText vectors...


100%|██████████| 2000000/2000000 [02:36<00:00, 12775.17it/s]


OOV words: 7493 / 29408


In [11]:
class NERDataset(Dataset):
    def __init__(self, split):
        self.data = split

    def __getitem__(self, idx):
        tokens = self.data[idx]["tokens"]
        labels = self.data[idx]["ner_tags_names"]

        word_ids = torch.tensor([word2id.get(w, 1) for w in tokens])
        char_ids = [torch.tensor([char2id.get(c, 1) for c in w]) for w in tokens]
        label_ids = torch.tensor(label_encoder.transform(labels))

        return word_ids, char_ids, label_ids

    def __len__(self):
        return len(self.data)

def pad_batch(batch):
    word_seqs, char_seqs, label_seqs = zip(*batch)
    max_len = max(len(w) for w in word_seqs)

    padded_words, padded_labels, padded_chars, masks = [], [], [], []

    # Use -100 as padding label
    PAD_LABEL = -100

    for w, cseq, l in zip(word_seqs, char_seqs, label_seqs):
        pad_len = max_len - len(w)
        padded_words.append(torch.cat([w, torch.zeros(pad_len, dtype=torch.long)]))
        padded_labels.append(torch.cat([l, torch.full((pad_len,), PAD_LABEL, dtype=torch.long)]))

        # Create mask: 1 for real tokens, 0 for padding
        mask = torch.cat([torch.ones(len(w)), torch.zeros(pad_len)])
        masks.append(mask)

        cseq = cseq + [torch.zeros(1, dtype=torch.long)] * pad_len
        padded_chars.append(cseq)

    max_char_len = max(len(c) for seq in padded_chars for c in seq)
    final_chars = []
    for seq in padded_chars:
        padded = []
        for c in seq:
            pad_len = max_char_len - len(c)
            padded.append(torch.cat([c, torch.zeros(pad_len, dtype=torch.long)]))
        final_chars.append(torch.stack(padded))

    return (torch.stack(padded_words),
            torch.stack(final_chars),
            torch.stack(padded_labels),
            torch.stack(masks))

train_loader = DataLoader(NERDataset(train_data), batch_size=16, shuffle=True, collate_fn=pad_batch)
val_loader   = DataLoader(NERDataset(val_data), batch_size=16, collate_fn=pad_batch)
test_loader  = DataLoader(NERDataset(test_data), batch_size=16, collate_fn=pad_batch)


In [12]:
class CRF(nn.Module):
    """Conditional Random Field layer for sequence tagging"""

    def __init__(self, num_tags, batch_first=True):
        super(CRF, self).__init__()
        self.num_tags = num_tags
        self.batch_first = batch_first

        # Transition parameters: transitions[i, j] = score of transitioning from tag j to tag i
        self.transitions = nn.Parameter(torch.randn(num_tags, num_tags))

        # Start and end transitions
        self.start_transitions = nn.Parameter(torch.randn(num_tags))
        self.end_transitions = nn.Parameter(torch.randn(num_tags))

    def forward(self, emissions, mask):
        """
        Compute the log likelihood of the given sequence of tags.
        Args:
            emissions: (batch_size, seq_length, num_tags)
            mask: (batch_size, seq_length)
        Returns:
            log_likelihood
        """
        if self.batch_first:
            emissions = emissions.transpose(0, 1)  # (seq_length, batch_size, num_tags)
            mask = mask.transpose(0, 1)  # (seq_length, batch_size)

        return emissions, mask

    def decode(self, emissions, mask):
        """
        Find the most likely tag sequence using Viterbi algorithm.
        Args:
            emissions: (batch_size, seq_length, num_tags)
            mask: (batch_size, seq_length)
        Returns:
            best_tags: (batch_size, seq_length)
        """
        if self.batch_first:
            emissions = emissions.transpose(0, 1)  # (seq_length, batch_size, num_tags)
            mask = mask.transpose(0, 1)  # (seq_length, batch_size)

        return self._viterbi_decode(emissions, mask)

    def _viterbi_decode(self, emissions, mask):
        """Viterbi algorithm for finding best path"""
        seq_length, batch_size, num_tags = emissions.shape

        # Initialize scores with start transitions
        score = self.start_transitions.unsqueeze(0) + emissions[0]
        history = []

        # Forward pass
        for i in range(1, seq_length):
            # Broadcast and add transitions
            broadcast_score = score.unsqueeze(2)  # (batch, num_tags, 1)
            broadcast_emissions = emissions[i].unsqueeze(1)  # (batch, 1, num_tags)

            # Compute scores for all possible transitions
            next_score = broadcast_score + self.transitions.unsqueeze(0) + broadcast_emissions

            # Find best previous tag for each current tag
            next_score, indices = next_score.max(dim=1)

            # Apply mask
            score = torch.where(mask[i].unsqueeze(1).bool(), next_score, score)
            history.append(indices)

        # Add end transitions
        score = score + self.end_transitions.unsqueeze(0)

        # Backtrack to find best path
        best_tags_list = []
        _, best_last_tag = score.max(dim=1)
        best_tags = [best_last_tag]

        for hist in reversed(history):
            best_last_tag = hist.gather(1, best_last_tag.unsqueeze(1)).squeeze(1)
            best_tags.append(best_last_tag)

        # Reverse to get forward direction
        best_tags.reverse()
        best_tags = torch.stack(best_tags, dim=0).transpose(0, 1)  # (batch, seq_length)

        return best_tags

    def neg_log_likelihood(self, emissions, tags, mask):
        """
        Compute negative log likelihood loss.
        Args:
            emissions: (batch_size, seq_length, num_tags)
            tags: (batch_size, seq_length)
            mask: (batch_size, seq_length)
        """
        if self.batch_first:
            emissions = emissions.transpose(0, 1)  # (seq_length, batch_size, num_tags)
            tags = tags.transpose(0, 1)
            mask = mask.transpose(0, 1)

        # Compute score of gold sequence
        gold_score = self._compute_gold_score(emissions, tags, mask)

        # Compute partition function (sum of all possible sequences)
        forward_score = self._compute_forward_score(emissions, mask)

        # NLL = - (gold_score - log(partition))
        return (forward_score - gold_score).mean()

    def _compute_gold_score(self, emissions, tags, mask):
        """Compute score of the gold tag sequence"""
        seq_length, batch_size = tags.shape

        # Start transition
        score = self.start_transitions[tags[0]]

        # Emission scores
        score += emissions[0].gather(1, tags[0].unsqueeze(1)).squeeze(1)

        # Transition scores
        for i in range(1, seq_length):
            # Transition from tags[i-1] to tags[i]
            trans_score = self.transitions[tags[i], tags[i-1]]
            emit_score = emissions[i].gather(1, tags[i].unsqueeze(1)).squeeze(1)

            # Apply mask
            score = score + torch.where(mask[i].bool(), trans_score + emit_score, torch.zeros_like(trans_score))

        # End transition (use last valid tag per sequence)
        last_tag_indices = mask.long().sum(0) - 1
        last_tags = tags.gather(0, last_tag_indices.unsqueeze(0)).squeeze(0)
        score = score + self.end_transitions[last_tags]

        return score

    def _compute_forward_score(self, emissions, mask):
        """Compute partition function using forward algorithm"""
        seq_length, batch_size, num_tags = emissions.shape

        # Initialize with start transitions
        score = self.start_transitions.unsqueeze(0) + emissions[0]

        # Forward pass
        for i in range(1, seq_length):
            broadcast_score = score.unsqueeze(2)  # (batch, num_tags, 1)
            broadcast_emissions = emissions[i].unsqueeze(1)  # (batch, 1, num_tags)

            # Log-sum-exp trick for numerical stability
            next_score = broadcast_score + self.transitions.unsqueeze(0) + broadcast_emissions
            next_score = torch.logsumexp(next_score, dim=1)

            # Apply mask
            score = torch.where(mask[i].unsqueeze(1).bool(), next_score, score)

        # Add end transitions
        score = score + self.end_transitions.unsqueeze(0)

        # Log-sum-exp over all final tags
        return torch.logsumexp(score, dim=1)

In [13]:
class CNN_BiLSTM_CRF(nn.Module):
    def __init__(self, embedding_matrix, char_vocab, num_labels, dropout=0.5):
        super().__init__()
        vocab_size, emb_dim = embedding_matrix.shape

        # Word embeddings (fine-tunable)
        self.word_emb = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        self.word_dropout = nn.Dropout(dropout)

        # Character-level CNN
        self.char_emb = nn.Embedding(char_vocab, 30, padding_idx=0)
        self.char_cnn = nn.Conv1d(30, 100, kernel_size=3, padding=1)
        self.char_dropout = nn.Dropout(dropout)

        # 2-layer BiLSTM
        self.lstm = nn.LSTM(
            input_size=emb_dim + 100,
            hidden_size=256,
            num_layers=2,
            bidirectional=True,
            batch_first=True,
            dropout=dropout
        )

        self.dropout = nn.Dropout(dropout)

        # Emission layer (LSTM output to tag scores)
        self.hidden2tag = nn.Linear(512, num_labels)

        # CRF layer
        self.crf = CRF(num_labels, batch_first=True)

    def forward(self, words, chars, mask):
        """
        Forward pass - returns emissions for CRF
        """
        # Word embeddings
        word_embed = self.word_dropout(self.word_emb(words))

        # Character embeddings and CNN
        B, L, C = chars.shape
        chars = chars.long().view(B*L, C)
        char_emb = self.char_emb(chars).transpose(1, 2)
        char_feat = torch.max(torch.relu(self.char_cnn(char_emb)), dim=2).values
        char_feat = self.char_dropout(char_feat.view(B, L, 100))

        # Combine word and character features
        combined = torch.cat([word_embed, char_feat], dim=-1)

        # BiLSTM
        lstm_out, _ = self.lstm(combined)
        lstm_out = self.dropout(lstm_out)

        # Emissions (tag scores)
        emissions = self.hidden2tag(lstm_out)

        return emissions

    def loss(self, words, chars, tags, mask):
        """
        Compute CRF loss
        """
        emissions = self.forward(words, chars, mask)
        return self.crf.neg_log_likelihood(emissions, tags, mask)

    def decode(self, words, chars, mask):
        """
        Decode best tag sequence using Viterbi
        """
        emissions = self.forward(words, chars, mask)
        return self.crf.decode(emissions, mask)


In [14]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

model = CNN_BiLSTM_CRF(embedding_matrix, len(char2id), len(label_encoder.classes_)).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=2)

# -------------------------------
# 11. Helper function to convert predictions to labels
# -------------------------------
def convert_to_labels(predictions, labels):
    """Convert tensor predictions to label lists, filtering padding"""
    pred_labels = []
    true_labels = []

    for pred_seq, true_seq in zip(predictions, labels):
        pred_seq = pred_seq.cpu().tolist()
        true_seq = true_seq.cpu().tolist()

        # Filter out padding (label -100)
        pred_tags = []
        true_tags = []
        for p, t in zip(pred_seq, true_seq):
            if t == -100:  # Skip padding
                continue
            pred_tags.append(id2label[p])
            true_tags.append(id2label[t])

        if pred_tags:  # Only add non-empty sequences
            pred_labels.append(pred_tags)
            true_labels.append(true_tags)

    return pred_labels, true_labels


Using device: cuda


In [15]:
best_val_f1 = 0
patience_counter = 0
patience = 5
num_epochs = 20

print("\nStarting training...")
print(f"Total parameters: {sum(p.numel() for p in model.parameters()):,}")
print(f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}\n")

for epoch in range(num_epochs):
    # Training
    model.train()
    train_loss = 0

    for words, chars, labels, masks in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Train]"):
        words = words.to(device)
        chars = chars.to(device)
        labels_input = labels.clone()
        # Replace -100 with 0 for CRF (CRF doesn't handle -100)
        labels_input[labels == -100] = 0
        labels_input = labels_input.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        loss = model.loss(words, chars, labels_input, masks)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
        optimizer.step()

        train_loss += loss.item()

    train_loss /= len(train_loader)

    # Validation
    model.eval()
    val_loss = 0
    y_val_true, y_val_pred = [], []

    with torch.no_grad():
        for words, chars, labels, masks in tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Val]"):
            words = words.to(device)
            chars = chars.to(device)
            labels_input = labels.clone()
            labels_input[labels == -100] = 0
            labels_input = labels_input.to(device)
            masks = masks.to(device)

            loss = model.loss(words, chars, labels_input, masks)
            val_loss += loss.item()

            # Decode using Viterbi
            pred = model.decode(words, chars, masks)

            pred_labels, true_labels = convert_to_labels(pred, labels)
            y_val_pred.extend(pred_labels)
            y_val_true.extend(true_labels)

    val_loss /= len(val_loader)
    val_f1 = f1_score(y_val_true, y_val_pred, average="micro")

    print(f"Epoch {epoch+1:02d} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Val F1: {val_f1:.4f}")

    # Learning rate scheduling
    scheduler.step(val_f1)

    # Early stopping & checkpoint
    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        torch.save(model.state_dict(), "best_crf_model.pt")
        patience_counter = 0
        print(f"  → New best model saved! (F1: {val_f1:.4f})")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"\nEarly stopping triggered after {epoch+1} epochs!")
            break

print(f"\nTraining completed. Best validation F1: {best_val_f1:.4f}")



Starting training...
Total parameters: 11,771,260
Trainable parameters: 11,771,260



Epoch 1/20 [Train]: 100%|██████████| 1649/1649 [03:18<00:00,  8.32it/s]
Epoch 1/20 [Val]: 100%|██████████| 206/206 [00:13<00:00, 15.06it/s]


Epoch 01 | Train Loss: 4.1592 | Val Loss: -2.0704 | Val F1: 0.7171
  → New best model saved! (F1: 0.7171)


Epoch 2/20 [Train]: 100%|██████████| 1649/1649 [03:12<00:00,  8.57it/s]
Epoch 2/20 [Val]: 100%|██████████| 206/206 [00:13<00:00, 15.42it/s]


Epoch 02 | Train Loss: -5.6906 | Val Loss: -8.6015 | Val F1: 0.7641
  → New best model saved! (F1: 0.7641)


Epoch 3/20 [Train]: 100%|██████████| 1649/1649 [03:13<00:00,  8.52it/s]
Epoch 3/20 [Val]: 100%|██████████| 206/206 [00:12<00:00, 15.89it/s]


Epoch 03 | Train Loss: -12.4124 | Val Loss: -14.2462 | Val F1: 0.8090
  → New best model saved! (F1: 0.8090)


Epoch 4/20 [Train]: 100%|██████████| 1649/1649 [03:15<00:00,  8.43it/s]
Epoch 4/20 [Val]: 100%|██████████| 206/206 [00:13<00:00, 15.10it/s]


Epoch 04 | Train Loss: -18.7015 | Val Loss: -19.4535 | Val F1: 0.8309
  → New best model saved! (F1: 0.8309)


Epoch 5/20 [Train]: 100%|██████████| 1649/1649 [03:12<00:00,  8.57it/s]
Epoch 5/20 [Val]: 100%|██████████| 206/206 [00:13<00:00, 15.50it/s]


Epoch 05 | Train Loss: -24.7782 | Val Loss: -24.4402 | Val F1: 0.8399
  → New best model saved! (F1: 0.8399)


Epoch 6/20 [Train]: 100%|██████████| 1649/1649 [03:13<00:00,  8.52it/s]
Epoch 6/20 [Val]: 100%|██████████| 206/206 [00:12<00:00, 16.32it/s]


Epoch 06 | Train Loss: -30.7464 | Val Loss: -29.4695 | Val F1: 0.8553
  → New best model saved! (F1: 0.8553)


Epoch 7/20 [Train]: 100%|██████████| 1649/1649 [03:10<00:00,  8.66it/s]
Epoch 7/20 [Val]: 100%|██████████| 206/206 [00:13<00:00, 15.68it/s]


Epoch 07 | Train Loss: -36.4962 | Val Loss: -34.4118 | Val F1: 0.8656
  → New best model saved! (F1: 0.8656)


Epoch 8/20 [Train]: 100%|██████████| 1649/1649 [03:10<00:00,  8.67it/s]
Epoch 8/20 [Val]: 100%|██████████| 206/206 [00:13<00:00, 15.23it/s]


Epoch 08 | Train Loss: -42.1090 | Val Loss: -38.9710 | Val F1: 0.8694
  → New best model saved! (F1: 0.8694)


Epoch 9/20 [Train]: 100%|██████████| 1649/1649 [03:11<00:00,  8.63it/s]
Epoch 9/20 [Val]: 100%|██████████| 206/206 [00:13<00:00, 15.27it/s]


Epoch 09 | Train Loss: -47.7877 | Val Loss: -43.5208 | Val F1: 0.8680


Epoch 10/20 [Train]: 100%|██████████| 1649/1649 [03:10<00:00,  8.66it/s]
Epoch 10/20 [Val]: 100%|██████████| 206/206 [00:13<00:00, 15.41it/s]


Epoch 10 | Train Loss: -53.3075 | Val Loss: -48.3207 | Val F1: 0.8744
  → New best model saved! (F1: 0.8744)


Epoch 11/20 [Train]: 100%|██████████| 1649/1649 [03:07<00:00,  8.79it/s]
Epoch 11/20 [Val]: 100%|██████████| 206/206 [00:13<00:00, 15.62it/s]


Epoch 11 | Train Loss: -58.7779 | Val Loss: -52.6760 | Val F1: 0.8725


Epoch 12/20 [Train]: 100%|██████████| 1649/1649 [03:09<00:00,  8.69it/s]
Epoch 12/20 [Val]: 100%|██████████| 206/206 [00:13<00:00, 15.30it/s]


Epoch 12 | Train Loss: -64.3169 | Val Loss: -57.7546 | Val F1: 0.8775
  → New best model saved! (F1: 0.8775)


Epoch 13/20 [Train]: 100%|██████████| 1649/1649 [03:09<00:00,  8.70it/s]
Epoch 13/20 [Val]: 100%|██████████| 206/206 [00:13<00:00, 15.35it/s]


Epoch 13 | Train Loss: -69.7876 | Val Loss: -61.9846 | Val F1: 0.8801
  → New best model saved! (F1: 0.8801)


Epoch 14/20 [Train]: 100%|██████████| 1649/1649 [03:08<00:00,  8.76it/s]
Epoch 14/20 [Val]: 100%|██████████| 206/206 [00:12<00:00, 16.25it/s]


Epoch 14 | Train Loss: -75.2045 | Val Loss: -66.3234 | Val F1: 0.8823
  → New best model saved! (F1: 0.8823)


Epoch 15/20 [Train]: 100%|██████████| 1649/1649 [03:08<00:00,  8.74it/s]
Epoch 15/20 [Val]: 100%|██████████| 206/206 [00:13<00:00, 15.39it/s]


Epoch 15 | Train Loss: -80.5831 | Val Loss: -71.3915 | Val F1: 0.8800


Epoch 16/20 [Train]: 100%|██████████| 1649/1649 [03:08<00:00,  8.75it/s]
Epoch 16/20 [Val]: 100%|██████████| 206/206 [00:13<00:00, 15.45it/s]


Epoch 16 | Train Loss: -85.8682 | Val Loss: -74.4305 | Val F1: 0.8779


Epoch 17/20 [Train]: 100%|██████████| 1649/1649 [03:06<00:00,  8.82it/s]
Epoch 17/20 [Val]: 100%|██████████| 206/206 [00:13<00:00, 15.47it/s]


Epoch 17 | Train Loss: -91.1835 | Val Loss: -79.4352 | Val F1: 0.8801


Epoch 18/20 [Train]: 100%|██████████| 1649/1649 [03:08<00:00,  8.75it/s]
Epoch 18/20 [Val]: 100%|██████████| 206/206 [00:13<00:00, 15.42it/s]


Epoch 18 | Train Loss: -95.6122 | Val Loss: -82.2686 | Val F1: 0.8898
  → New best model saved! (F1: 0.8898)


Epoch 19/20 [Train]: 100%|██████████| 1649/1649 [03:08<00:00,  8.76it/s]
Epoch 19/20 [Val]: 100%|██████████| 206/206 [00:12<00:00, 16.85it/s]


Epoch 19 | Train Loss: -98.5758 | Val Loss: -84.0296 | Val F1: 0.8886


Epoch 20/20 [Train]: 100%|██████████| 1649/1649 [03:08<00:00,  8.74it/s]
Epoch 20/20 [Val]: 100%|██████████| 206/206 [00:13<00:00, 15.40it/s]


Epoch 20 | Train Loss: -101.5111 | Val Loss: -86.5059 | Val F1: 0.8891

Training completed. Best validation F1: 0.8898


In [16]:
print("\nLoading best model for test evaluation...")
model.load_state_dict(torch.load("best_crf_model.pt"))
model.eval()

y_true, y_pred = [], []
with torch.no_grad():
    for words, chars, labels, masks in tqdm(test_loader, desc="Testing"):
        words = words.to(device)
        chars = chars.to(device)
        masks = masks.to(device)

        # Decode using Viterbi
        pred = model.decode(words, chars, masks)

        pred_labels, true_labels = convert_to_labels(pred, labels)
        y_pred.extend(pred_labels)
        y_true.extend(true_labels)

print("\n" + "="*80)
print("FINAL TEST RESULTS (CNN-BiLSTM-CRF)")
print("="*80)
print("\nClassification Report:")
print(classification_report(y_true, y_pred, digits=4))
print("\nAggregate Scores:")
print(f"  Micro F1:    {f1_score(y_true, y_pred, average='micro'):.4f}")
print(f"  Macro F1:    {f1_score(y_true, y_pred, average='macro'):.4f}")
print(f"  Weighted F1: {f1_score(y_true, y_pred, average='weighted'):.4f}")
print("="*80)


Loading best model for test evaluation...


Testing: 100%|██████████| 206/206 [00:08<00:00, 23.52it/s]



FINAL TEST RESULTS (CNN-BiLSTM-CRF)

Classification Report:
              precision    recall  f1-score   support

         DAT     0.5792    0.7151    0.6400       179
         EVE     0.9559    0.9954    0.9753       218
         FAC     0.8732    1.0000    0.9323       124
         LOC     0.9137    0.9245    0.9191      1855
         MON     0.6935    0.8431    0.7611        51
         ORG     0.8699    0.9050    0.8871      2010
         PCT     0.8519    0.8519    0.8519        27
         PER     0.9342    0.9198    0.9269      1558
         PRO     0.8978    1.0000    0.9461       281
         TIM     0.5333    0.5926    0.5614        27

   micro avg     0.8889    0.9161    0.9023      6330
   macro avg     0.8103    0.8747    0.8401      6330
weighted avg     0.8917    0.9161    0.9033      6330


Aggregate Scores:
  Micro F1:    0.9023
  Macro F1:    0.8401
  Weighted F1: 0.9033
