# NLP501 - Lab 07: Sentiment Analysis with RNN và LSTM

## 0: Setup và Import Libraries

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import re
import os
import urllib.request
import tarfile
from tqdm import tqdm
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns

# Reproducibility: seed the NumPy and PyTorch RNGs from one shared constant.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

## 1: Data Preparation

In [None]:
def download_imdb_dataset(data_dir='./data'):
    """
    Download and extract the IMDB review dataset unless it is already present.

    Args:
        data_dir: directory that holds (or will hold) the 'aclImdb' folder;
            created if missing
    Returns:
        path to the extracted 'aclImdb' directory
    """
    url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
    filepath = os.path.join(data_dir, "aclImdb_v1.tar.gz")
    dataset_dir = os.path.join(data_dir, 'aclImdb')

    # exist_ok avoids the race between a separate exists() check and creation
    os.makedirs(data_dir, exist_ok=True)

    if os.path.exists(dataset_dir):
        print("Dataset already exists.")
        return dataset_dir

    print("Downloading IMDB dataset...")
    try:
        urllib.request.urlretrieve(url, filepath)
        print("Extracting...")
        with tarfile.open(filepath, 'r:gz') as tar:
            if hasattr(tarfile, 'data_filter'):
                # The 'data' filter refuses members that would escape data_dir
                # (absolute paths, '..', unsafe links)
                tar.extractall(data_dir, filter='data')
            else:
                # Older Python without extraction filters
                tar.extractall(data_dir)
    finally:
        # Never leave a partial or already-extracted archive behind
        if os.path.exists(filepath):
            os.remove(filepath)
    print("Done!")

    return dataset_dir

# Fetch the dataset into the default './data' directory (no-op when it is
# already there) and keep the path of the extracted 'aclImdb' folder.
data_path = download_imdb_dataset()

In [None]:
def load_imdb_data(data_path, split='train'):
    """
    Load IMDB reviews and their sentiment labels from disk.

    Reads every '.txt' file under <data_path>/<split>/pos and
    <data_path>/<split>/neg. Files are visited in sorted filename order so
    the result does not depend on the filesystem's directory ordering.

    Args:
        data_path: root directory of the extracted dataset
        split: sub-directory to read, e.g. 'train' or 'test'
    Returns:
        (texts, labels): list of raw review strings and a parallel list of
        labels, 1 for 'pos' and 0 for 'neg'
    """
    texts = []
    labels = []

    for label_type, label in (('pos', 1), ('neg', 0)):
        dir_path = os.path.join(data_path, split, label_type)

        # os.listdir returns entries in arbitrary order; sort so the sample
        # order (and everything seeded downstream) is reproducible
        for filename in sorted(os.listdir(dir_path)):
            if not filename.endswith('.txt'):
                continue
            with open(os.path.join(dir_path, filename), 'r', encoding='utf-8') as f:
                texts.append(f.read())
            labels.append(label)

    return texts, labels

# Read both splits fully into memory as parallel (texts, labels) lists.
print("Loading training data...")
train_texts, train_labels = load_imdb_data(data_path, 'train')
print(f"Training samples: {len(train_texts)}")

print("Loading test data...")
test_texts, test_labels = load_imdb_data(data_path, 'test')
print(f"Test samples: {len(test_texts)}")

In [None]:
def preprocess_text(text):
    """
    Normalize a raw review and split it into word tokens.

    Steps: lowercase, replace HTML tags with a space, replace every
    character other than a-z, 0-9 and whitespace with a space, then split
    on whitespace.

    Args:
        text: raw text string
    Returns:
        list of tokens
    """
    lowered = text.lower()
    without_tags = re.sub(r'<[^>]+>', ' ', lowered)
    alnum_only = re.sub(r'[^a-z0-9\s]', ' ', without_tags)

    # split() with no argument also collapses runs of whitespace
    return alnum_only.split()

# Smoke test: the <br /> tag, punctuation and upper-case letters should all
# be normalized away, leaving lowercase alphanumeric tokens.
sample = "This is a <br />GREAT movie! I loved it... 10/10"
print(f"Original: {sample}")
print(f"Preprocessed: {preprocess_text(sample)}")

In [None]:
class Vocabulary:
    """
    Two-way mapping between words and integer indices.

    Index 0 is reserved for '<PAD>' and index 1 for '<UNK>'; corpus words
    receive indices from 2 upward, most frequent first.
    """

    def __init__(self, max_vocab_size=25000, min_freq=2):
        self.max_vocab_size = max_vocab_size
        self.min_freq = min_freq

        special_tokens = ('<PAD>', '<UNK>')
        self.word2idx = {token: index for index, token in enumerate(special_tokens)}
        self.idx2word = dict(enumerate(special_tokens))
        self.word_freq = Counter()

    def build_vocab(self, texts):
        """
        Count token frequencies over texts and assign indices to the kept words.

        A word is kept when its count is at least min_freq; the kept words
        are then cut to max_vocab_size - 2 entries to leave room for the
        two special tokens.
        """
        print("Counting word frequencies...")
        for raw_text in tqdm(texts):
            self.word_freq.update(preprocess_text(raw_text))

        # most_common() orders by descending count and keeps first-seen
        # order among equal counts
        ranked_words = [
            token for token, count in self.word_freq.most_common()
            if count >= self.min_freq
        ]

        kept_words = ranked_words[:self.max_vocab_size - 2]

        for position, token in enumerate(kept_words):
            index = position + 2  # 0 and 1 belong to <PAD> and <UNK>
            self.word2idx[token] = index
            self.idx2word[index] = token

        print(f"Vocabulary built: {len(self.word2idx)} words")

    def text_to_indices(self, text):
        """
        Tokenize text and map each token to its index ('<UNK>' when unknown).
        """
        indices = []
        for token in preprocess_text(text):
            indices.append(self.word2idx.get(token, self.word2idx['<UNK>']))
        return indices

    def __len__(self):
        # Size includes the two special tokens
        return len(self.word2idx)

# Build the vocabulary from the training split only, so words that appear
# solely in the test split map to <UNK>.
print("Building vocabulary...")
vocab = Vocabulary(max_vocab_size=25000, min_freq=2)
vocab.build_vocab(train_texts)
print(f"Vocabulary size: {len(vocab)}")

# Smoke test: show the index sequence produced for a short sentence
sample_text = "This movie is great!"
print(f"\nSample: '{sample_text}'")
print(f"Indices: {vocab.text_to_indices(sample_text)}")

In [None]:
class IMDBDataset(Dataset):
    """
    PyTorch Dataset for IMDB reviews.

    Every text is converted to vocabulary indices once, at construction
    time, and truncated to at most max_length indices. A text that yields
    no tokens is stored as a single '<UNK>' index so that no sample has
    length zero.

    Args:
        texts: iterable of raw review strings
        labels: sequence of labels, parallel to texts
        vocab: object providing text_to_indices(text); its word2idx['<UNK>']
            is read only when a text produces no tokens
        max_length: maximum number of indices kept per review
    """

    def __init__(self, texts, labels, vocab, max_length=256):
        self.labels = labels
        self.vocab = vocab
        self.max_length = max_length

        self.sequences = []
        for text in tqdm(texts, desc="Converting texts"):
            indices = vocab.text_to_indices(text)[:max_length]
            if not indices:
                # pack_padded_sequence rejects zero-length samples, so keep a
                # one-token placeholder instead of an empty sequence
                indices = [vocab.word2idx['<UNK>']]
            self.sequences.append(indices)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """Return (index tensor of dtype long, label tensor of dtype float)."""
        return (
            torch.tensor(self.sequences[idx], dtype=torch.long),
            torch.tensor(self.labels[idx], dtype=torch.float)
        )

In [None]:
def collate_fn(batch):
    """
    Merge a list of (sequence, label) samples into padded batch tensors.

    Args:
        batch: list of (sequence tensor, label tensor) pairs
    Returns:
        padded_sequences: sequences right-padded with 0 (the PAD index),
            batch dimension first
        labels: the label tensors stacked into one tensor
        lengths: length of each sequence before padding
    """
    seqs, targets = map(list, zip(*batch))

    # Record the true lengths before padding hides them
    lengths = torch.tensor(list(map(len, seqs)))

    padded = pad_sequence(seqs, batch_first=True, padding_value=0)

    return padded, torch.stack(targets), lengths

In [None]:
# Hyperparameters
MAX_LENGTH = 256  # reviews are truncated to this many token indices
BATCH_SIZE = 64

# Create datasets (both use the vocabulary built from the training split)
print("Creating training dataset...")
train_dataset = IMDBDataset(train_texts, train_labels, vocab, MAX_LENGTH)

print("Creating test dataset...")
test_dataset = IMDBDataset(test_texts, test_labels, vocab, MAX_LENGTH)

# Create dataloaders; collate_fn pads each batch to its longest sequence.
# Only the training loader shuffles.
train_loader = DataLoader(
    train_dataset, 
    batch_size=BATCH_SIZE, 
    shuffle=True, 
    collate_fn=collate_fn
)

test_loader = DataLoader(
    test_dataset, 
    batch_size=BATCH_SIZE, 
    shuffle=False, 
    collate_fn=collate_fn
)

print(f"\nTrain batches: {len(train_loader)}")
print(f"Test batches: {len(test_loader)}")

## 2: RNN Model

In [None]:
class RNNClassifier(nn.Module):
    """
    Sentiment classifier built on nn.RNN.

    Pipeline: embedding -> RNN over the packed batch -> dropout on the last
    layer's final hidden state -> linear layer -> sigmoid.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim,
                 n_layers=1, dropout=0.5, pad_idx=0):
        super().__init__()

        # Token lookup table; pad_idx is registered as the padding index
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)

        # Dropout inside the RNN is only requested when layers are stacked
        recurrent_dropout = dropout if n_layers > 1 else 0
        self.rnn = nn.RNN(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            batch_first=True,
            dropout=recurrent_dropout,
        )

        # Regularization applied to the sequence summary before the classifier
        self.dropout = nn.Dropout(dropout)

        # Classifier head
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text, text_lengths):
        """
        Compute sigmoid scores for a padded batch.

        Args:
            text: [batch_size, seq_len] token indices
            text_lengths: [batch_size] true length of each sequence
        Returns:
            sigmoid outputs with dimension 1 squeezed: [batch_size] when
            output_dim is 1
        """
        # [batch_size, seq_len, embedding_dim]
        embedded = self.embedding(text)

        # Packing makes the RNN stop at each sequence's true length;
        # the lengths are moved to the CPU before packing
        packed = pack_padded_sequence(
            embedded,
            text_lengths.cpu(),
            batch_first=True,
            enforce_sorted=False,
        )

        # hidden: [n_layers, batch_size, hidden_dim]
        _, hidden = self.rnn(packed)

        # Final hidden state of the top layer: [batch_size, hidden_dim]
        last_state = hidden[-1]

        scores = torch.sigmoid(self.fc(self.dropout(last_state)))
        return scores.squeeze(1)

## 3: LSTM Model

In [None]:
class LSTMClassifier(nn.Module):
    """
    Sentiment classifier built on nn.LSTM, optionally bidirectional.

    Pipeline: embedding -> LSTM over the packed batch -> dropout on the last
    layer's final hidden state(s) -> linear layer -> sigmoid.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim,
                 n_layers=1, bidirectional=False, dropout=0.5, pad_idx=0):
        super().__init__()

        self.bidirectional = bidirectional
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        # Token lookup table; pad_idx is registered as the padding index
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)

        # Dropout inside the LSTM is only requested when layers are stacked
        recurrent_dropout = dropout if n_layers > 1 else 0
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            batch_first=True,
            bidirectional=bidirectional,
            dropout=recurrent_dropout,
        )

        # Regularization applied to the sequence summary before the classifier
        self.dropout = nn.Dropout(dropout)

        # The classifier sees both directions' states when bidirectional
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)

    def forward(self, text, text_lengths):
        """
        Compute sigmoid scores for a padded batch.

        Args:
            text: [batch_size, seq_len] token indices
            text_lengths: [batch_size] true length of each sequence
        Returns:
            sigmoid outputs with dimension 1 squeezed: [batch_size] when
            output_dim is 1
        """
        # [batch_size, seq_len, embedding_dim]
        embedded = self.embedding(text)

        # The lengths are moved to the CPU before packing
        packed = pack_padded_sequence(
            embedded,
            text_lengths.cpu(),
            batch_first=True,
            enforce_sorted=False,
        )

        # hidden: [n_layers * num_directions, batch_size, hidden_dim];
        # the cell state is not used
        _, (hidden, _cell) = self.lstm(packed)

        if not self.bidirectional:
            # Top layer's final state: [batch_size, hidden_dim]
            summary = hidden[-1]
        else:
            # Top layer's forward (hidden[-2]) and backward (hidden[-1])
            # states side by side: [batch_size, hidden_dim * 2]
            summary = torch.cat((hidden[-2], hidden[-1]), dim=1)

        scores = torch.sigmoid(self.fc(self.dropout(summary)))
        return scores.squeeze(1)

In [None]:
# Model hyperparameters (shared by all three architectures)
VOCAB_SIZE = len(vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1  # one sigmoid output per review
N_LAYERS = 2
DROPOUT = 0.5
PAD_IDX = vocab.word2idx['<PAD>']

# Create models. The LSTM and BiLSTM instances built here are used for the
# parameter counts below; the LSTM/BiLSTM training cells later in this file
# construct fresh instances under the same names.
rnn_model = RNNClassifier(
    VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM,
    N_LAYERS, DROPOUT, PAD_IDX
).to(device)

lstm_model = LSTMClassifier(
    VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM,
    N_LAYERS, bidirectional=False, dropout=DROPOUT, pad_idx=PAD_IDX
).to(device)

bilstm_model = LSTMClassifier(
    VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM,
    N_LAYERS, bidirectional=True, dropout=DROPOUT, pad_idx=PAD_IDX
).to(device)

# Total parameter count of each model (every tensor in parameters())
print(f"RNN parameters: {sum(p.numel() for p in rnn_model.parameters()):,}")
print(f"LSTM parameters: {sum(p.numel() for p in lstm_model.parameters()):,}")
print(f"BiLSTM parameters: {sum(p.numel() for p in bilstm_model.parameters()):,}")

## 4: Training và Evaluation

In [None]:
def train_epoch(model, dataloader, optimizer, criterion, device):
    """
    Run one optimization pass over dataloader.

    Predictions at or above 0.5 are counted as the positive class when
    computing accuracy.

    Args:
        model: module called as model(sequences, lengths)
        dataloader: yields (sequences, labels, lengths) batches
        optimizer: optimizer stepping model's parameters
        criterion: loss called as criterion(predictions, labels)
        device: device the batch tensors are moved to
    Returns:
        (mean of the per-batch losses, fraction of correctly classified samples)
    """
    model.train()
    running_loss = 0
    n_correct = 0
    n_seen = 0

    for sequences, labels, lengths in tqdm(dataloader, desc="Training"):
        sequences = sequences.to(device)
        labels = labels.to(device)
        lengths = lengths.to(device)

        optimizer.zero_grad()

        predictions = model(sequences, lengths)
        loss = criterion(predictions, labels)
        loss.backward()

        # Cap the global gradient norm at 1.0 before the update
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        # Bookkeeping for the epoch-level metrics
        running_loss += loss.item()
        hits = (predictions >= 0.5).float() == labels
        n_correct += hits.sum().item()
        n_seen += labels.size(0)

    return running_loss / len(dataloader), n_correct / n_seen

In [None]:
def evaluate(model, dataloader, criterion, device):
    """
    Score model on dataloader with gradients disabled.

    Args:
        model: module called as model(sequences, lengths)
        dataloader: yields (sequences, labels, lengths) batches
        criterion: loss called as criterion(predictions, labels)
        device: device the batch tensors are moved to
    Returns:
        (mean of the per-batch losses, accuracy, predicted labels, true
        labels); the last two are flat lists with one entry per sample
    """
    model.eval()
    running_loss = 0
    collected_preds, collected_labels = [], []

    with torch.no_grad():
        for sequences, labels, lengths in tqdm(dataloader, desc="Evaluating"):
            sequences = sequences.to(device)
            labels = labels.to(device)
            lengths = lengths.to(device)

            predictions = model(sequences, lengths)
            running_loss += criterion(predictions, labels).item()

            # Threshold at 0.5 to obtain hard 0/1 labels
            hard_labels = (predictions >= 0.5).float()
            collected_preds.extend(hard_labels.cpu().numpy())
            collected_labels.extend(labels.cpu().numpy())

    mean_loss = running_loss / len(dataloader)
    accuracy = accuracy_score(collected_labels, collected_preds)
    return mean_loss, accuracy, collected_preds, collected_labels

In [None]:
def train_model(model, train_loader, test_loader, n_epochs, lr, device, model_name="model"):
    """
    Train model with Adam and binary cross-entropy for n_epochs epochs.

    After every epoch the model is scored on test_loader. Whenever that
    accuracy exceeds the best seen so far, the weights are saved to
    '<model_name>_best.pt'. Note that the checkpoint is therefore selected
    using test_loader itself.

    Returns:
        history: dict of per-epoch lists under 'train_loss', 'train_acc',
        'test_loss' and 'test_acc'
    """
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    history = {key: [] for key in ('train_loss', 'train_acc', 'test_loss', 'test_acc')}
    best_acc = 0

    for epoch in range(n_epochs):
        print(f"\n{'='*50}")
        print(f"Epoch {epoch+1}/{n_epochs}")
        print(f"{'='*50}")

        train_loss, train_acc = train_epoch(model, train_loader, optimizer, criterion, device)
        test_loss, test_acc, _, _ = evaluate(model, test_loader, criterion, device)

        # history's keys were inserted in this same order
        epoch_metrics = (train_loss, train_acc, test_loss, test_acc)
        for key, value in zip(history, epoch_metrics):
            history[key].append(value)

        print(f"\nTrain Loss: {train_loss:.4f} | Train Acc: {train_acc*100:.2f}%")
        print(f"Test Loss: {test_loss:.4f} | Test Acc: {test_acc*100:.2f}%")

        if not test_acc > best_acc:
            continue

        # New best: checkpoint the weights
        best_acc = test_acc
        torch.save(model.state_dict(), f'{model_name}_best.pt')
        print(f"✓ Saved best model with accuracy: {best_acc*100:.2f}%")

    return history

In [None]:
# Training settings shared by all model runs in this notebook
N_EPOCHS = 5
LEARNING_RATE = 0.001

# Train the RNN; the best checkpoint is written to 'rnn_best.pt'
print("Training RNN Model")
rnn_history = train_model(
    rnn_model, train_loader, test_loader,
    N_EPOCHS, LEARNING_RATE, device, "rnn"
)

In [None]:
# Reinitialize and train LSTM: a fresh instance replaces the one created
# earlier, so re-running this cell starts from new weights.
lstm_model = LSTMClassifier(
    VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM,
    N_LAYERS, bidirectional=False, dropout=DROPOUT, pad_idx=PAD_IDX
).to(device)

# The best checkpoint is written to 'lstm_best.pt'
print("Training LSTM Model")
lstm_history = train_model(
    lstm_model, train_loader, test_loader,
    N_EPOCHS, LEARNING_RATE, device, "lstm"
)

In [None]:
# Reinitialize and train BiLSTM: a fresh instance replaces the one created
# earlier, so re-running this cell starts from new weights.
bilstm_model = LSTMClassifier(
    VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM,
    N_LAYERS, bidirectional=True, dropout=DROPOUT, pad_idx=PAD_IDX
).to(device)

# The best checkpoint is written to 'bilstm_best.pt'
print("Training Bidirectional LSTM Model")
bilstm_history = train_model(
    bilstm_model, train_loader, test_loader,
    N_EPOCHS, LEARNING_RATE, device, "bilstm"
)

## 5: Visualization và Analysis

In [None]:
def plot_training_history(histories, labels):
    """
    Plot loss and accuracy curves of several training runs side by side.

    Train curves are dashed and test curves solid; each run keeps one color
    across both panels. The palette repeats when there are more runs than
    colors.

    Args:
        histories: list of dicts with per-epoch lists under 'train_loss',
            'test_loss', 'train_acc' and 'test_acc'
        labels: display name for each history, in the same order
    """
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    colors = ['#e74c3c', '#3498db', '#2ecc71']

    # (history key suffix, value multiplier, y-axis label, panel title);
    # accuracies are stored as fractions and shown as percentages
    panels = (
        ('loss', 1, 'Loss', 'Training & Test Loss'),
        ('acc', 100, 'Accuracy (%)', 'Training & Test Accuracy'),
    )

    for ax, (metric, scale, ylabel, title) in zip(axes, panels):
        for i, (hist, label) in enumerate(zip(histories, labels)):
            # Wrap around instead of indexing past the end of the palette
            color = colors[i % len(colors)]
            train_values = [value * scale for value in hist[f'train_{metric}']]
            test_values = [value * scale for value in hist[f'test_{metric}']]
            epochs = range(1, len(train_values) + 1)
            ax.plot(epochs, train_values, '--', color=color, label=f'{label} Train')
            ax.plot(epochs, test_values, '-', color=color, label=f'{label} Test')
        ax.set_xlabel('Epoch')
        ax.set_ylabel(ylabel)
        ax.set_title(title)
        ax.legend()
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    # plt.savefig('training_comparison.png', dpi=150)
    plt.show()

# Compare the three trained models; labels are given in the same order as
# the histories.
plot_training_history(
    [rnn_history, lstm_history, bilstm_history],
    ['RNN', 'LSTM', 'BiLSTM']
)

In [None]:
# Summary table: parameter count and the highest per-epoch test accuracy
# recorded in each model's history.
print("MODEL COMPARISON SUMMARY")
print(f"{'Model':<15} {'Parameters':>15} {'Best Test Acc':>15}")
print(f"{'RNN':<15} {sum(p.numel() for p in rnn_model.parameters()):>15,} {max(rnn_history['test_acc'])*100:>14.2f}%")
print(f"{'LSTM':<15} {sum(p.numel() for p in lstm_model.parameters()):>15,} {max(lstm_history['test_acc'])*100:>14.2f}%")
print(f"{'BiLSTM':<15} {sum(p.numel() for p in bilstm_model.parameters()):>15,} {max(bilstm_history['test_acc'])*100:>14.2f}%")

## 6: Extensions — GRU Classifier and Pretrained GloVe Embeddings

In [None]:
class GRUClassifier(nn.Module):
    """
    Sentiment classifier built on nn.GRU, optionally bidirectional.

    Pipeline: embedding -> GRU over the packed batch -> dropout on the last
    layer's final hidden state(s) -> linear layer -> sigmoid.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim,
                 n_layers=1, bidirectional=False, dropout=0.5, pad_idx=0):
        super().__init__()

        self.bidirectional = bidirectional

        # Token lookup table; pad_idx is registered as the padding index
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)

        # Dropout inside the GRU is only requested when layers are stacked
        recurrent_dropout = dropout if n_layers > 1 else 0
        self.gru = nn.GRU(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            batch_first=True,
            bidirectional=bidirectional,
            dropout=recurrent_dropout,
        )

        # Regularization applied to the sequence summary before the classifier
        self.dropout = nn.Dropout(dropout)

        # The classifier sees both directions' states when bidirectional
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)

    def forward(self, text, text_lengths):
        """
        Compute sigmoid scores for a padded batch.

        Args:
            text: [batch_size, seq_len] token indices
            text_lengths: [batch_size] true length of each sequence
        Returns:
            sigmoid outputs with dimension 1 squeezed: [batch_size] when
            output_dim is 1
        """
        embedded = self.embedding(text)

        # The lengths are moved to the CPU before packing
        packed = pack_padded_sequence(
            embedded,
            text_lengths.cpu(),
            batch_first=True,
            enforce_sorted=False,
        )

        # The GRU returns only a hidden state (no separate cell state)
        _, hidden = self.gru(packed)

        if not self.bidirectional:
            summary = hidden[-1]
        else:
            # Last two entries of hidden, concatenated along the feature axis
            summary = torch.cat((hidden[-2], hidden[-1]), dim=1)

        scores = torch.sigmoid(self.fc(self.dropout(summary)))
        return scores.squeeze(1)

# Instantiate a bidirectional GRU with the shared hyperparameters and report
# its size; this cell does not train it.
gru_model = GRUClassifier(
    VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM,
    N_LAYERS, bidirectional=True, dropout=DROPOUT, pad_idx=PAD_IDX
).to(device)

print(f"GRU parameters: {sum(p.numel() for p in gru_model.parameters()):,}")
# NOTE(review): the ~25% figure presumably refers to the recurrent layers
# only (the embedding and fc layers are the same size for both) — confirm
# the "2 gates instead of 3" wording against the nn.GRU / nn.LSTM docs.
print("Note: GRU has ~25% fewer parameters than LSTM due to having 2 gates instead of 3")

In [None]:
def load_glove_embeddings(glove_path, vocab, embedding_dim=100):
    """
    Build an embedding matrix for vocab from a pretrained GloVe text file.

    The file is streamed line by line and only vectors for words that are
    in the vocabulary are parsed, so the full GloVe table is never held in
    memory. Words without a GloVe vector keep a small random
    initialization; the '<PAD>' row is zero.

    Args:
        glove_path: path to a GloVe text file such as glove.6B.100d.txt
            (one word followed by its vector components per line)
        vocab: Vocabulary object (len(vocab) and vocab.word2idx are used)
        embedding_dim: embedding dimension; must match the file

    Returns:
        embedding_matrix: numpy array [vocab_size, embedding_dim]

    Raises:
        ValueError: if a vector for a vocabulary word does not have
            embedding_dim components
    """
    # Initialize with small random values
    embedding_matrix = np.random.randn(len(vocab), embedding_dim) * 0.01

    word2idx = vocab.word2idx

    # Zero the PAD row (index 0 when the vocabulary has no '<PAD>' entry)
    embedding_matrix[word2idx.get('<PAD>', 0)] = 0.0

    matched = set()
    with open(glove_path, 'r', encoding='utf-8') as f:
        for line in tqdm(f, desc="Loading GloVe"):
            values = line.split()
            if not values:
                continue  # tolerate blank lines
            word = values[0]
            idx = word2idx.get(word)
            if idx is None:
                continue  # not in our vocabulary: skip parsing the vector
            if len(values) - 1 != embedding_dim:
                raise ValueError(
                    f"GloVe vector for {word!r} has {len(values) - 1} "
                    f"components, expected {embedding_dim}"
                )
            embedding_matrix[idx] = np.asarray(values[1:], dtype='float32')
            # A set keeps the count right if a word occurs on several lines
            matched.add(idx)

    found = len(matched)
    print(f"Found {found}/{len(vocab)} words in GloVe")
    return embedding_matrix

# Usage (uncomment after downloading GloVe, e.g. glove.6B.100d.txt):
# embeddings = load_glove_embeddings('glove.6B.100d.txt', vocab, EMBEDDING_DIM)
# bilstm_model.embedding.weight.data.copy_(torch.from_numpy(embeddings))  # or any other model defined above