# Lab 07-04: Named Entity Recognition with RNN, LSTM, and GRU
## Comparing Recurrent Architectures for Sequence Labeling

**Task:** Named Entity Recognition (NER)  
**Dataset:** Synthetic NER dataset with tags: PER, ORG, LOC  
**Models:** Vanilla RNN, LSTM, GRU  

## Part 1: Setup and Import Libraries

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter, defaultdict
import time
from tqdm import tqdm
from sklearn.metrics import classification_report, confusion_matrix

# Seed NumPy and PyTorch with the same value so repeated runs draw the same
# random numbers.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use the GPU when PyTorch reports one as available, otherwise the CPU.
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)
print(f"Using device: {device}")
print(f"PyTorch version: {torch.__version__}")

## Part 2: Data Preparation

We'll use a synthetic NER dataset with the BIO tagging scheme:
- **B-X**: Beginning of entity type X
- **I-X**: Inside entity type X
- **O**: Outside any entity

In [None]:
# Sample NER dataset (BIO format)
# Each entry is a (tokens, tags) pair of parallel lists: tags[i] is the BIO
# label of tokens[i]. Entity types used are PER, ORG and LOC; every other
# token is tagged "O". There are 15 hand-written sentences in total.
sample_data = [
    (["Apple", "is", "looking", "at", "buying", "U.K.", "startup", "for", "$1", "billion"],
     ["B-ORG", "O", "O", "O", "O", "B-LOC", "O", "O", "O", "O"]),
    
    (["Tim", "Cook", "is", "the", "CEO", "of", "Apple", "Inc."],
     ["B-PER", "I-PER", "O", "O", "O", "O", "B-ORG", "I-ORG"]),
    
    (["Google", "was", "founded", "in", "California"],
     ["B-ORG", "O", "O", "O", "B-LOC"]),
    
    (["John", "Smith", "works", "at", "Microsoft", "in", "Seattle"],
     ["B-PER", "I-PER", "O", "O", "B-ORG", "O", "B-LOC"]),
    
    # Multi-token location: one B-LOC followed by I-LOC continuation tags
    (["The", "meeting", "will", "be", "in", "New", "York", "City"],
     ["O", "O", "O", "O", "O", "B-LOC", "I-LOC", "I-LOC"]),
    
    (["Barack", "Obama", "was", "born", "in", "Hawaii"],
     ["B-PER", "I-PER", "O", "O", "O", "B-LOC"]),
    
    (["Amazon", "delivers", "packages", "worldwide"],
     ["B-ORG", "O", "O", "O"]),
    
    (["Paris", "is", "the", "capital", "of", "France"],
     ["B-LOC", "O", "O", "O", "O", "B-LOC"]),
    
    (["Elon", "Musk", "founded", "Tesla", "and", "SpaceX"],
     ["B-PER", "I-PER", "O", "B-ORG", "O", "B-ORG"]),
    
    (["The", "United", "Nations", "is", "based", "in", "Geneva"],
     ["O", "B-ORG", "I-ORG", "O", "O", "O", "B-LOC"]),
    
    (["Mark", "Zuckerberg", "created", "Facebook"],
     ["B-PER", "I-PER", "O", "B-ORG"]),
    
    (["London", "is", "in", "England"],
     ["B-LOC", "O", "O", "B-LOC"]),
    
    (["IBM", "has", "offices", "in", "Tokyo"],
     ["B-ORG", "O", "O", "O", "B-LOC"]),
    
    (["Bill", "Gates", "works", "with", "Microsoft"],
     ["B-PER", "I-PER", "O", "O", "B-ORG"]),
    
    (["The", "company", "is", "based", "in", "Silicon", "Valley"],
     ["O", "O", "O", "O", "O", "B-LOC", "I-LOC"]),
]

# Replicate the hand-written sentences to get a larger corpus, then shuffle.
# NOTE: because the corpus consists of exact duplicates, the same sentences
# end up in the train, validation and test splits.
all_data = sample_data * 20  # 300 samples
np.random.shuffle(all_data)

# 70% train / 15% validation / remainder test
train_size = int(0.7 * len(all_data))
val_size = int(0.15 * len(all_data))
val_end = train_size + val_size

train_data = all_data[:train_size]
val_data = all_data[train_size:val_end]
test_data = all_data[val_end:]

for split_name, split in (("Train", train_data), ("Val", val_data), ("Test", test_data)):
    print(f"{split_name} samples: {len(split)}")

# Show the first two training sentences with their tags
print(f"\nSample:")
for words, tags in train_data[:2]:
    print(f"  Words: {' '.join(words)}")
    print(f"  Tags:  {' '.join(tags)}")

In [None]:
# Build vocabularies
class Vocab:
    """Two-way lookup tables between words/tags and integer ids.

    Index 0 is reserved for "<PAD>" in both the word and the tag tables;
    index 1 of the word table is "<UNK>".
    """

    def __init__(self):
        special_words = ("<PAD>", "<UNK>")
        self.word2idx = {token: i for i, token in enumerate(special_words)}
        self.idx2word = dict(enumerate(special_words))
        self.tag2idx = {"<PAD>": 0}
        self.idx2tag = {0: "<PAD>"}

    def build(self, data):
        """Fill the tables from an iterable of (words, tags) pairs.

        Words and tags are numbered in sorted order, words starting at 2
        and tags starting at 1, after the reserved entries. A short summary
        is printed once the tables are filled.
        """
        seen_words, seen_tags = set(), set()
        for sentence_words, sentence_tags in data:
            seen_words |= set(sentence_words)
            seen_tags |= set(sentence_tags)

        # Ids 0 and 1 are taken by <PAD> and <UNK>
        for offset, word in enumerate(sorted(seen_words)):
            word_id = offset + 2
            self.word2idx[word] = word_id
            self.idx2word[word_id] = word

        # Id 0 is taken by <PAD>
        for offset, tag in enumerate(sorted(seen_tags)):
            tag_id = offset + 1
            self.tag2idx[tag] = tag_id
            self.idx2tag[tag_id] = tag

        print(f"Vocabulary size: {len(self.word2idx)}")
        print(f"Tag set size: {len(self.tag2idx)}")
        print(f"Tags: {list(self.tag2idx.keys())}")

# The vocabulary is built from the training split only.
vocab = Vocab()
vocab.build(train_data)

In [None]:
# Dataset class
class NERDataset(Dataset):
    """Map-style dataset that turns (words, tags) pairs into id tensors.

    Args:
        data: sequence of (words, tags) pairs, where both are lists of str.
        vocab: object exposing ``word2idx`` and ``tag2idx`` dictionaries;
            ``word2idx`` must contain the key "<UNK>".
    """

    def __init__(self, data, vocab):
        self.data = data
        self.vocab = vocab

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(word_ids, tag_ids)`` as 1-D ``torch.long`` tensors.

        Words missing from the vocabulary map to the "<UNK>" id. A tag
        missing from ``tag2idx`` raises ``KeyError``.
        """
        words, tags = self.data[idx]

        unk_id = self.vocab.word2idx["<UNK>"]
        word_ids = [self.vocab.word2idx.get(w, unk_id) for w in words]
        tag_ids = [self.vocab.tag2idx[t] for t in tags]

        # An explicit dtype keeps empty sentences as integer tensors;
        # torch.tensor([]) would otherwise default to float.
        return (
            torch.tensor(word_ids, dtype=torch.long),
            torch.tensor(tag_ids, dtype=torch.long),
        )

def collate_fn(batch):
    """Merge (word_ids, tag_ids) samples of differing length into one batch.

    Returns:
        A tuple ``(words, tags, lengths)``: ``words`` and ``tags`` are padded
        with 0 on the right up to the longest sample and have the batch
        dimension first; ``lengths`` holds the unpadded length of each sample.
    """
    word_seqs, tag_seqs = map(list, zip(*batch))

    # True lengths are taken before padding so they can be used for packing
    lengths = torch.tensor(list(map(len, word_seqs)))

    padded_words = pad_sequence(word_seqs, batch_first=True, padding_value=0)
    padded_tags = pad_sequence(tag_seqs, batch_first=True, padding_value=0)
    return padded_words, padded_tags, lengths

# Wrap each split in a dataset and a batching dataloader
BATCH_SIZE = 32

train_dataset, val_dataset, test_dataset = (
    NERDataset(split, vocab) for split in (train_data, val_data, test_data)
)

# Only the training loader reshuffles; val/test keep a fixed order
_loader_kwargs = dict(batch_size=BATCH_SIZE, collate_fn=collate_fn)
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **_loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **_loader_kwargs)

for split_name, loader in (("Train", train_loader), ("Val", val_loader), ("Test", test_loader)):
    print(f"{split_name} batches: {len(loader)}")

## Part 3: Model Implementations

### 3.1. Vanilla RNN for NER

In [None]:
class RNN_NER(nn.Module):
    """
    Vanilla RNN for Named Entity Recognition

    Architecture:
    Embedding → RNN → Linear → Output tags

    Args:
        vocab_size: number of rows in the embedding table.
        tag_size: number of output classes per token.
        embedding_dim: size of each word embedding.
        hidden_dim: size of the recurrent hidden state.
        num_layers: number of stacked recurrent layers.
        dropout: dropout probability applied to the embeddings and to the
            recurrent outputs; also used between recurrent layers when
            ``num_layers > 1``.
    """

    def __init__(self, vocab_size, tag_size, embedding_dim=100, hidden_dim=128,
                 num_layers=1, dropout=0.3):
        super().__init__()

        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        # Embedding layer (index 0 is the padding token)
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        # Vanilla RNN layer; inter-layer dropout is only passed for stacked
        # RNNs, matching the `num_layers > 1` guard below
        self.rnn = nn.RNN(
            embedding_dim,
            hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0
        )

        self.dropout = nn.Dropout(dropout)

        # Output layer
        self.fc = nn.Linear(hidden_dim, tag_size)

    def forward(self, sentences, lengths):
        """
        Args:
            sentences: [batch_size, seq_len] padded word ids
            lengths: [batch_size] unpadded length of each sentence

        Returns:
            outputs: [batch_size, seq_len, tag_size]
        """
        # Embedding
        embeds = self.embedding(sentences)  # [batch, seq_len, embed_dim]
        embeds = self.dropout(embeds)

        # Pack so the RNN skips padded positions; lengths must be on the CPU
        packed_embeds = pack_padded_sequence(
            embeds, lengths.cpu(), batch_first=True, enforce_sorted=False
        )

        packed_rnn_out, _ = self.rnn(packed_embeds)

        # Unpack back to the input width. Without total_length the result is
        # only as wide as the longest sequence in the batch, which breaks the
        # documented output shape when the input is padded further.
        rnn_out, _ = pad_packed_sequence(
            packed_rnn_out, batch_first=True, total_length=sentences.size(1)
        )

        rnn_out = self.dropout(rnn_out)

        # Output layer
        outputs = self.fc(rnn_out)  # [batch, seq_len, tag_size]

        return outputs

### 3.2. LSTM for NER

In [None]:
class LSTM_NER(nn.Module):
    """
    LSTM for Named Entity Recognition

    Architecture:
    Embedding → LSTM → Linear → Output tags

    Args:
        vocab_size: number of rows in the embedding table.
        tag_size: number of output classes per token.
        embedding_dim: size of each word embedding.
        hidden_dim: size of the recurrent hidden state.
        num_layers: number of stacked recurrent layers.
        dropout: dropout probability applied to the embeddings and to the
            recurrent outputs; also used between recurrent layers when
            ``num_layers > 1``.
    """

    def __init__(self, vocab_size, tag_size, embedding_dim=100, hidden_dim=128,
                 num_layers=1, dropout=0.3):
        super().__init__()

        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        # Embedding layer (index 0 is the padding token)
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        # LSTM layer; inter-layer dropout is only passed for stacked LSTMs,
        # matching the `num_layers > 1` guard below
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0
        )

        self.dropout = nn.Dropout(dropout)

        # Output layer
        self.fc = nn.Linear(hidden_dim, tag_size)

    def forward(self, sentences, lengths):
        """
        Args:
            sentences: [batch_size, seq_len] padded word ids
            lengths: [batch_size] unpadded length of each sentence

        Returns:
            outputs: [batch_size, seq_len, tag_size]
        """
        # Embedding
        embeds = self.embedding(sentences)  # [batch, seq_len, embed_dim]
        embeds = self.dropout(embeds)

        # Pack so the LSTM skips padded positions; lengths must be on the CPU
        packed_embeds = pack_padded_sequence(
            embeds, lengths.cpu(), batch_first=True, enforce_sorted=False
        )

        # The final (hidden, cell) state is not needed for per-token tagging
        packed_lstm_out, _ = self.lstm(packed_embeds)

        # Unpack back to the input width. Without total_length the result is
        # only as wide as the longest sequence in the batch, which breaks the
        # documented output shape when the input is padded further.
        lstm_out, _ = pad_packed_sequence(
            packed_lstm_out, batch_first=True, total_length=sentences.size(1)
        )

        lstm_out = self.dropout(lstm_out)

        # Output layer
        outputs = self.fc(lstm_out)  # [batch, seq_len, tag_size]

        return outputs

### 3.3. GRU for NER

In [None]:
class GRU_NER(nn.Module):
    """
    GRU for Named Entity Recognition

    Architecture:
    Embedding → GRU → Linear → Output tags

    Args:
        vocab_size: number of rows in the embedding table.
        tag_size: number of output classes per token.
        embedding_dim: size of each word embedding.
        hidden_dim: size of the recurrent hidden state.
        num_layers: number of stacked recurrent layers.
        dropout: dropout probability applied to the embeddings and to the
            recurrent outputs; also used between recurrent layers when
            ``num_layers > 1``.
    """

    def __init__(self, vocab_size, tag_size, embedding_dim=100, hidden_dim=128,
                 num_layers=1, dropout=0.3):
        super().__init__()

        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        # Embedding layer (index 0 is the padding token)
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        # GRU layer; inter-layer dropout is only passed for stacked GRUs,
        # matching the `num_layers > 1` guard below
        self.gru = nn.GRU(
            embedding_dim,
            hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0
        )

        self.dropout = nn.Dropout(dropout)

        # Output layer
        self.fc = nn.Linear(hidden_dim, tag_size)

    def forward(self, sentences, lengths):
        """
        Args:
            sentences: [batch_size, seq_len] padded word ids
            lengths: [batch_size] unpadded length of each sentence

        Returns:
            outputs: [batch_size, seq_len, tag_size]
        """
        # Embedding
        embeds = self.embedding(sentences)  # [batch, seq_len, embed_dim]
        embeds = self.dropout(embeds)

        # Pack so the GRU skips padded positions; lengths must be on the CPU
        packed_embeds = pack_padded_sequence(
            embeds, lengths.cpu(), batch_first=True, enforce_sorted=False
        )

        packed_gru_out, _ = self.gru(packed_embeds)

        # Unpack back to the input width. Without total_length the result is
        # only as wide as the longest sequence in the batch, which breaks the
        # documented output shape when the input is padded further.
        gru_out, _ = pad_packed_sequence(
            packed_gru_out, batch_first=True, total_length=sentences.size(1)
        )

        gru_out = self.dropout(gru_out)

        # Output layer
        outputs = self.fc(gru_out)  # [batch, seq_len, tag_size]

        return outputs

## Part 4: Training and Evaluation Functions

In [None]:
def train_epoch(model, dataloader, criterion, optimizer, device):
    """Train for one epoch.

    Args:
        model: module called as ``model(sentences, lengths)`` that returns
            per-token scores of shape [batch, seq_len, tag_size].
        dataloader: iterable yielding ``(sentences, tags, lengths)`` batches.
        criterion: loss applied to flattened scores and flattened tags.
        optimizer: optimizer stepped once per batch.
        device: device the batch tensors are moved to.

    Returns:
        ``(avg_loss, accuracy)``: the mean of the per-batch losses and the
        token accuracy over positions whose tag id is not 0 (padding).
        Both are 0 when the dataloader yields no batches.
    """
    model.train()
    total_loss = 0.0
    correct = 0
    total = 0
    num_batches = 0

    for sentences, tags, lengths in tqdm(dataloader, desc="Training", leave=False):
        sentences = sentences.to(device)
        tags = tags.to(device)
        lengths = lengths.to(device)

        optimizer.zero_grad()

        # Forward pass
        outputs = model(sentences, lengths)  # [batch, seq_len, tag_size]

        # Flatten for the loss; reshape also copes with non-contiguous tensors
        outputs_flat = outputs.reshape(-1, outputs.shape[-1])  # [batch*seq_len, tag_size]
        tags_flat = tags.reshape(-1)  # [batch*seq_len]

        loss = criterion(outputs_flat, tags_flat)

        # Backward pass
        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)

        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

        # Calculate accuracy (excluding padding, whose tag id is 0)
        predictions = outputs.argmax(dim=-1)  # [batch, seq_len]
        mask = tags != 0
        correct += ((predictions == tags) & mask).sum().item()
        total += mask.sum().item()

    # Count batches instead of calling len(dataloader): this avoids a
    # ZeroDivisionError on an empty loader and works without __len__.
    avg_loss = total_loss / num_batches if num_batches > 0 else 0.0
    accuracy = correct / total if total > 0 else 0

    return avg_loss, accuracy

def evaluate(model, dataloader, criterion, device, vocab):
    """Evaluate a model without updating its weights.

    Args:
        model: module called as ``model(sentences, lengths)`` that returns
            per-token scores of shape [batch, seq_len, tag_size].
        dataloader: iterable yielding ``(sentences, tags, lengths)`` batches.
        criterion: loss applied to flattened scores and flattened tags.
        device: device the batch tensors are moved to.
        vocab: unused here; kept so existing callers keep working.

    Returns:
        ``(avg_loss, accuracy, all_predictions, all_targets)``: the mean of
        the per-batch losses, the token accuracy over positions whose tag id
        is not 0, and flat lists of predicted / true tag ids restricted to
        the unpadded part of every sentence. Loss and accuracy are 0 when
        the dataloader yields no batches.
    """
    model.eval()
    total_loss = 0.0
    correct = 0
    total = 0
    num_batches = 0

    all_predictions = []
    all_targets = []

    with torch.no_grad():
        for sentences, tags, lengths in tqdm(dataloader, desc="Evaluating", leave=False):
            sentences = sentences.to(device)
            tags = tags.to(device)
            lengths = lengths.to(device)

            # Forward pass
            outputs = model(sentences, lengths)

            # Flatten for the loss; reshape also copes with non-contiguous tensors
            outputs_flat = outputs.reshape(-1, outputs.shape[-1])
            tags_flat = tags.reshape(-1)

            loss = criterion(outputs_flat, tags_flat)
            total_loss += loss.item()
            num_batches += 1

            # Calculate accuracy (excluding padding, whose tag id is 0)
            predictions = outputs.argmax(dim=-1)
            mask = tags != 0
            correct += ((predictions == tags) & mask).sum().item()
            total += mask.sum().item()

            # Keep only the real (unpadded) tokens of each sentence. The
            # lengths are converted to Python ints once per batch so slicing
            # does not index with a device tensor on every row.
            for row, length in enumerate(lengths.tolist()):
                pred_tags = predictions[row, :length].cpu().numpy()
                true_tags = tags[row, :length].cpu().numpy()
                all_predictions.extend(pred_tags)
                all_targets.extend(true_tags)

    # Count batches instead of calling len(dataloader): this avoids a
    # ZeroDivisionError on an empty loader and works without __len__.
    avg_loss = total_loss / num_batches if num_batches > 0 else 0.0
    accuracy = correct / total if total > 0 else 0

    return avg_loss, accuracy, all_predictions, all_targets

## Part 5: Training All Models

Let's train all three models and compare their performance!

In [None]:
# Architecture settings shared by all three models
VOCAB_SIZE, TAG_SIZE = len(vocab.word2idx), len(vocab.tag2idx)
EMBEDDING_DIM = 100
HIDDEN_DIM = 128
NUM_LAYERS = 2
DROPOUT = 0.3

# Optimisation settings
N_EPOCHS = 30
LEARNING_RATE = 0.001

# Positions whose target is tag id 0 (padding) do not contribute to the loss
criterion = nn.CrossEntropyLoss(ignore_index=0)

# Filled in by the training loop, keyed by model name
results = {}

# One freshly initialised model per architecture, built in RNN, LSTM, GRU order
_model_args = (VOCAB_SIZE, TAG_SIZE, EMBEDDING_DIM, HIDDEN_DIM, NUM_LAYERS, DROPOUT)
models_to_train = {
    label: model_cls(*_model_args)
    for label, model_cls in (('RNN', RNN_NER), ('LSTM', LSTM_NER), ('GRU', GRU_NER))
}

print("Model Comparison")
for name, model in models_to_train.items():
    n_params = sum(param.numel() for param in model.parameters())
    print(f"{name:10} - Parameters: {n_params:,}")

In [None]:
# Train each model, keep the checkpoint with the best validation accuracy,
# then score that checkpoint on the test set.
for model_name, model in models_to_train.items():
    print(f"Training {model_name} Model")

    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
    checkpoint_path = f'{model_name.lower()}_ner_best.pt'

    history = {
        'train_loss': [],
        'train_acc': [],
        'val_loss': [],
        'val_acc': []
    }

    # Start below any reachable accuracy so the first epoch always writes a
    # checkpoint. Starting at 0 with a strict ">" would skip saving when the
    # validation accuracy stays at 0, and the load below would then fail or
    # pick up a stale file left by an earlier run.
    best_val_acc = float('-inf')
    training_time = 0

    for epoch in range(N_EPOCHS):
        print(f"Epoch {epoch+1}/{N_EPOCHS}")

        start_time = time.time()

        # Train
        train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device)

        # Validate
        val_loss, val_acc, _, _ = evaluate(model, val_loader, criterion, device, vocab)

        # NOTE: the measured time covers both the training and the validation pass
        epoch_time = time.time() - start_time
        training_time += epoch_time

        history['train_loss'].append(train_loss)
        history['train_acc'].append(train_acc)
        history['val_loss'].append(val_loss)
        history['val_acc'].append(val_acc)

        print(f"  Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}")
        print(f"  Val Loss:   {val_loss:.4f} | Val Acc:   {val_acc:.4f}")
        print(f"  Time: {epoch_time:.2f}s")

        # Save best model
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), checkpoint_path)
            print(f"  Saved best model (Val Acc: {best_val_acc:.4f})")

    # Test on best model; map_location keeps the load working when the
    # checkpoint tensors were saved from a different device
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    test_loss, test_acc, test_preds, test_targets = evaluate(model, test_loader, criterion, device, vocab)

    # Store results
    results[model_name] = {
        'history': history,
        'best_val_acc': best_val_acc,
        'test_acc': test_acc,
        'test_loss': test_loss,
        'training_time': training_time,
        'predictions': test_preds,
        'targets': test_targets,
        'n_params': sum(p.numel() for p in model.parameters())
    }

    print(f"{model_name} Training Complete!")
    print(f"  Best Val Acc: {best_val_acc:.4f}")
    print(f"  Test Acc: {test_acc:.4f}")
    print(f"  Total Training Time: {training_time:.2f}s")

## Part 6: Visualization and Comparison

In [None]:
# Draw the four training curves (loss/accuracy x train/val) on a 2x2 grid,
# with one line per model in every panel.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

epochs = range(1, N_EPOCHS + 1)

# (grid position, history key, y-axis label, panel title)
panels = [
    ((0, 0), 'train_loss', 'Loss', 'Training Loss Comparison'),
    ((0, 1), 'val_loss', 'Loss', 'Validation Loss Comparison'),
    ((1, 0), 'train_acc', 'Accuracy', 'Training Accuracy Comparison'),
    ((1, 1), 'val_acc', 'Accuracy', 'Validation Accuracy Comparison'),
]

for position, metric, y_label, title in panels:
    ax = axes[position]
    for model_name in ['RNN', 'LSTM', 'GRU']:
        curve = results[model_name]['history'][metric]
        ax.plot(epochs, curve, label=model_name, linewidth=2, marker='o', markersize=3)
    ax.set_xlabel('Epoch', fontsize=12)
    ax.set_ylabel(y_label, fontsize=12)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
# plt.savefig('rnn_lstm_gru_comparison.png', dpi=150)
plt.show()

In [None]:
# Text summary: one row per model
print("MODEL COMPARISON SUMMARY")
print(f"{'Model':<10} {'Params':<12} {'Train Time':<15} {'Best Val Acc':<15} {'Test Acc':<12}")

for model_name in ['RNN', 'LSTM', 'GRU']:
    res = results[model_name]
    row = (
        f"{model_name:<10} {res['n_params']:>10,}  "
        f"{res['training_time']:>12.2f}s  "
        f"{res['best_val_acc']:>12.4f}  "
        f"{res['test_acc']:>10.4f}"
    )
    print(row)

# Bar charts: test accuracy, training time and parameter count side by side
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

model_names = ['RNN', 'LSTM', 'GRU']
colors = ['#3498db', '#e74c3c', '#2ecc71']

test_accs = [results[name]['test_acc'] for name in model_names]
train_times = [results[name]['training_time'] for name in model_names]
n_params = [results[name]['n_params'] for name in model_names]

# (values, y label, title, label offset above the bar, label formatter,
#  extra text kwargs, y-axis limits or None)
bar_panels = [
    (test_accs, 'Accuracy', 'Test Accuracy Comparison', 0.01,
     lambda v: f'{v:.4f}', {}, [0, 1.0]),
    (train_times, 'Time (seconds)', 'Training Time Comparison', 1,
     lambda v: f'{v:.1f}s', {}, None),
    (n_params, 'Number of Parameters', 'Model Size Comparison', 500,
     lambda v: f'{v:,}', {'fontsize': 9}, None),
]

for ax, (values, y_label, title, offset, fmt, text_kwargs, y_limits) in zip(axes, bar_panels):
    bars = ax.bar(model_names, values, color=colors, alpha=0.7, edgecolor='black')
    ax.set_ylabel(y_label, fontsize=12)
    ax.set_title(title, fontsize=14, fontweight='bold')
    if y_limits is not None:
        ax.set_ylim(y_limits)
    # Write each value just above its bar
    for bar, value in zip(bars, values):
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height + offset,
                fmt(value), ha='center', va='bottom', fontweight='bold', **text_kwargs)
    ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
# plt.savefig('model_metrics_comparison.png', dpi=150)
plt.show()

## Part 7: Detailed Analysis

Let's analyze the predictions from each model in more detail.

In [None]:
# Classification report and confusion matrix for each model

# Real tag names only. <PAD> is an output class of the models, so it can be
# predicted, but it is not a label we want to score. The list does not depend
# on the model, so it is built once.
tags_list = sorted(tag for tag in vocab.tag2idx if tag != '<PAD>')

for model_name in ['RNN', 'LSTM', 'GRU']:
    print(f"Classification Report - {model_name}")

    preds = results[model_name]['predictions']
    targets = results[model_name]['targets']

    # Convert indices to tag names
    pred_tags = [vocab.idx2tag[p] for p in preds]
    true_tags = [vocab.idx2tag[t] for t in targets]

    # Restrict the report to the same labels as the confusion matrix below,
    # so a stray <PAD> prediction does not add a row to one but not the other
    print(classification_report(true_tags, pred_tags, labels=tags_list, zero_division=0))

    # Confusion matrix
    cm = confusion_matrix(true_tags, pred_tags, labels=tags_list)

    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=tags_list, yticklabels=tags_list)
    plt.title(f'Confusion Matrix - {model_name}', fontsize=14, fontweight='bold')
    plt.ylabel('True Label', fontsize=12)
    plt.xlabel('Predicted Label', fontsize=12)
    plt.tight_layout()
    # plt.savefig(f'confusion_matrix_{model_name.lower()}.png', dpi=150)
    plt.show()

## Part 8: Sample Predictions

Let's test all models on some example sentences.

In [None]:
def predict_sentence(model, sentence, vocab, device):
    """Predict NER tags for a sentence.

    Args:
        model: module called as ``model(sentences, lengths)`` that returns
            per-token scores of shape [batch, seq_len, tag_size].
        sentence: raw text; tokens are obtained by whitespace splitting.
        vocab: object exposing ``word2idx`` (with an "<UNK>" entry) and
            ``idx2tag``.
        device: device the input tensors are created on.

    Returns:
        A list of ``(word, tag)`` pairs, one per token. An empty or
        whitespace-only sentence yields an empty list.
    """
    # Tokenize (simple space split)
    words = sentence.split()

    # A zero-length sequence cannot be packed for the recurrent layer, so
    # answer directly instead of letting the model raise.
    if not words:
        return []

    model.eval()

    # Convert to indices; unknown words share the <UNK> id
    unk_id = vocab.word2idx["<UNK>"]
    word_ids = [vocab.word2idx.get(w, unk_id) for w in words]

    # Batch of one sentence
    sentence_tensor = torch.tensor([word_ids], dtype=torch.long).to(device)
    length_tensor = torch.tensor([len(word_ids)], dtype=torch.long).to(device)

    # Predict
    with torch.no_grad():
        outputs = model(sentence_tensor, length_tensor)
        predictions = outputs.argmax(dim=-1)[0, :len(words)]

    # Convert to tag names
    pred_tags = [vocab.idx2tag[tag_id] for tag_id in predictions.tolist()]

    return list(zip(words, pred_tags))

# Load best models: rebuild each architecture and restore its saved weights
best_models = {}
for model_name, model_class in [('RNN', RNN_NER), ('LSTM', LSTM_NER), ('GRU', GRU_NER)]:
    model = model_class(VOCAB_SIZE, TAG_SIZE, EMBEDDING_DIM, HIDDEN_DIM, NUM_LAYERS, DROPOUT)
    # map_location lets a checkpoint written from a GPU be restored on a
    # CPU-only machine (and the other way round)
    state_dict = torch.load(f'{model_name.lower()}_ner_best.pt', map_location=device)
    model.load_state_dict(state_dict)
    model = model.to(device)
    best_models[model_name] = model

# Test sentences
test_sentences = [
    "Apple is looking at buying U.K. startup",
    "Tim Cook is the CEO of Apple Inc.",
    "Google was founded in California",
    "John Smith works at Microsoft in Seattle",
    "Barack Obama was born in Hawaii"
]

print("SAMPLE PREDICTIONS COMPARISON")

for sent in test_sentences:
    print(f"\n Sentence: {sent}")

    # Get predictions from all models
    all_predictions = {}
    for model_name, model in best_models.items():
        predictions = predict_sentence(model, sent, vocab, device)
        all_predictions[model_name] = predictions

    # Display in table format: one row per word, one column per model
    words = sent.split()
    print(f"{'Word':<15} {'RNN':<15} {'LSTM':<15} {'GRU':<15}")

    for i, word in enumerate(words):
        rnn_tag = all_predictions['RNN'][i][1]
        lstm_tag = all_predictions['LSTM'][i][1]
        gru_tag = all_predictions['GRU'][i][1]

        # Highlight entities: any tag other than 'O' gets an arrow prefix
        rnn_display = f"➤ {rnn_tag}" if rnn_tag != 'O' else rnn_tag
        lstm_display = f"➤ {lstm_tag}" if lstm_tag != 'O' else lstm_tag
        gru_display = f"➤ {gru_tag}" if gru_tag != 'O' else gru_tag

        print(f"{word:<15} {rnn_display:<15} {lstm_display:<15} {gru_display:<15}")