## Install and import dependencies

## Part 0. Dataset Preparation

In [1]:
from datasets import load_dataset
dataset = load_dataset("rotten_tomatoes")
train_dataset = dataset['train'] 
validation_dataset = dataset['validation']
test_dataset = dataset['test']

  from .autonotebook import tqdm as notebook_tqdm


## Part 2 : Model Training & Evaluation - RNN

In [2]:
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pickle
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from nltk.tokenize import word_tokenize
from sklearn.metrics import accuracy_score
import torch.optim as optim
import spacy
import random

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Load the embedding matrix and word_to_index from the pickle file
with open("updated_embedding_matrix.pkl", "rb") as f:
    data = pickle.load(f)
    embedding_matrix = data["embeddings"]
    word_to_index = data["word_to_index"]

# Convert embedding_matrix to a NumPy array and a PyTorch tensor
embedding_matrix_array = np.array(embedding_matrix)
embedding_matrix_tensor = torch.tensor(embedding_matrix_array, dtype=torch.float32)

print(f"Loaded embedding matrix with shape: {embedding_matrix_array.shape}")
print(f"Vocabulary size (word_to_index): {len(word_to_index)}")

Loaded embedding matrix with shape: (16633, 300)
Vocabulary size (word_to_index): 16633


In [4]:
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True  # Ensures reproducibility in CUDA operations
    torch.backends.cudnn.benchmark = False     # Disables some optimizations to ensure determinism

# Set the seed
set_seed(42)

In [5]:
pre_tokenized_train_texts = []
for sentence in train_dataset['text']:
    # Tokenize the sentence using spaCy and store tokens as a list of strings
    tokens = [token.text for token in nlp(sentence.lower())]
    pre_tokenized_train_texts.append(tokens)

In [6]:
# Pre-tokenize validation and test sets
pre_tokenized_validation_texts = [[token.text for token in nlp(sentence.lower())] for sentence in validation_dataset['text']]
pre_tokenized_test_texts = [[token.text for token in nlp(sentence.lower())] for sentence in test_dataset['text']]


In [7]:
# Prepare Dataset for PyTorch
class SentimentDataset(Dataset):
    def __init__(self, tokenized_texts, labels, vocab, embedding_matrix, max_len=30):
        self.texts = tokenized_texts
        self.labels = labels
        self.vocab = word_to_index
        self.embedding_matrix = embedding_matrix
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokens = self.texts[idx]
        label = self.labels[idx]
        vectorized_text = self.vectorize(tokens)
        return torch.tensor(vectorized_text), torch.tensor(label)

    def vectorize(self, tokens):
        vectorized = [self.vocab.get(token, self.vocab['<UNK>']) for token in tokens]

        # Check for out-of-range indices
        for index in vectorized:
            if index >= len(self.embedding_matrix):
                raise ValueError(f"Index {index} is out of range for the embedding matrix.")
                
        # Pad or truncate to max_len
        if len(vectorized) < self.max_len:
            vectorized += [self.vocab['<PAD>']] * (self.max_len - len(vectorized))
        else:
            vectorized = vectorized[:self.max_len]
        return vectorized


## 3.5: Improvement - Multihead Self Attention LSTM

In [8]:
import torch.nn.functional as F
class MultiHeadSelfAttention(nn.Module):
    def __init__(self, hidden_dim, num_heads):
        super(MultiHeadSelfAttention, self).__init__()
        assert hidden_dim % num_heads == 0, "Hidden dimension must be divisible by the number of heads"
        self.hidden_dim = hidden_dim  # Store hidden_dim as an instance variable
        self.num_heads = num_heads
        self.dim_per_head = hidden_dim // num_heads
        
        # Define linear layers for query, key, and value
        self.query_layer = nn.Linear(hidden_dim, hidden_dim)
        self.key_layer = nn.Linear(hidden_dim, hidden_dim)
        self.value_layer = nn.Linear(hidden_dim, hidden_dim)
        
        # Linear layer to combine heads
        self.fc = nn.Linear(hidden_dim, hidden_dim)

    def forward(self, encoder_outputs):
        batch_size = encoder_outputs.size(0)
        
        # Linear projections
        query = self.query_layer(encoder_outputs)  # [batch_size, seq_len, hidden_dim]
        key = self.key_layer(encoder_outputs)
        value = self.value_layer(encoder_outputs)
        
        # Split into multiple heads and reshape for attention calculation
        query = query.view(batch_size, -1, self.num_heads, self.dim_per_head).transpose(1, 2)
        key = key.view(batch_size, -1, self.num_heads, self.dim_per_head).transpose(1, 2)
        value = value.view(batch_size, -1, self.num_heads, self.dim_per_head).transpose(1, 2)
        
        # Scaled Dot-Product Attention
        attn_weights = torch.matmul(query, key.transpose(-2, -1)) / (self.dim_per_head ** 0.5)
        attn_weights = F.softmax(attn_weights, dim=-1)
        
        # Compute weighted sum of values
        context = torch.matmul(attn_weights, value)
        
        # Concatenate heads and pass through final linear layer
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, self.hidden_dim)
        return self.fc(context)
    
    
class BiLSTMWithSelfAttention(nn.Module):
    def __init__(self, embedding_matrix, hidden_size, output_size, dropout_rate=0.3, num_heads=4):
        super(BiLSTMWithSelfAttention, self).__init__()
        # Load pre-trained embeddings
        self.embedding = nn.Embedding.from_pretrained(torch.tensor(embedding_matrix, dtype=torch.float32), freeze=False)
        
        # Define the bidirectional LSTM layer
        self.lstm = nn.LSTM(embedding_matrix.shape[1], hidden_size, num_layers=1, 
                            bidirectional=True, batch_first=True)
        
        # Multi-head self-attention layer
        self.self_attn = MultiHeadSelfAttention(hidden_size * 2, num_heads=num_heads)
        
        # Fully connected layer to output the final prediction
        self.fc = nn.Linear(hidden_size * 2, output_size)
        
        # Dropout layer
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        # Embed the input
        embedded = self.embedding(x)  # [batch_size, seq_len, embedding_dim]
        
        # Pass the embeddings through the LSTM
        lstm_out, _ = self.lstm(embedded)  # lstm_out: [batch_size, seq_len, hidden_size * 2]
        
        # Apply multi-head self-attention on the LSTM outputs
        context = self.self_attn(lstm_out)  # [batch_size, seq_len, hidden_size * 2]
        
        # Use mean pooling over the sequence length for final sentence representation
        pooled_context = torch.mean(context, dim=1)  # [batch_size, hidden_size * 2]
        
        # Pass through fully connected layer
        output = self.fc(self.dropout(pooled_context))  # [batch_size, output_size]
        
        return output

In [9]:
# Prepare DataLoader
def create_data_loader(dataset, batch_size):
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)

train_dataset_instance = SentimentDataset(pre_tokenized_train_texts, train_dataset['label'], word_to_index, embedding_matrix)
val_dataset_instance = SentimentDataset(pre_tokenized_validation_texts, validation_dataset['label'], word_to_index, embedding_matrix)
test_dataset_instance = SentimentDataset(pre_tokenized_test_texts, test_dataset['label'], word_to_index, embedding_matrix)

In [10]:
def evaluate(model, data_loader):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for data, labels in data_loader:
            output = model.forward(data)
            probs = torch.sigmoid(output)  # Apply sigmoid to get probabilities
            predicted = (probs >= 0.5).long()  # Convert probabilities to binary predictions
            all_preds.extend(predicted.cpu().numpy().flatten().tolist())
            all_labels.extend(labels.cpu().numpy().tolist())
    acc = accuracy_score(all_labels, all_preds)
    return acc

In [11]:
# Train and validate function
def train_and_validate(model, train_loader, val_loader, optimizer, criterion, max_epochs=100, convergence_threshold=0.001):
    best_val_acc = 0
    epochs_without_improvement = 0
    
    for epoch in range(max_epochs):
        model.train()
        running_loss = 0
        for data, target in train_loader:
            optimizer.zero_grad()
            output = model(data).squeeze(1)
            loss = criterion(output, target.float())
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        
        val_acc = evaluate(model, val_loader)
        print(f"Epoch {epoch+1}/{max_epochs}, Loss: {running_loss/len(train_loader)}, Val Accuracy: {val_acc}")
        
        # Check for improvement
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            epochs_without_improvement = 0  # Reset counter
        else:
            epochs_without_improvement += 1
            
        # Check for convergence
        if epochs_without_improvement >= 10:  # Convergence condition (no improvement for 5 epochs)
            print("Convergence reached, stopping training.")
            break
            
    return best_val_acc, epoch 


In [12]:
#Training the model with the best hyperparameters
batch_size = 64
lr=0.001
model = BiLSTMWithSelfAttention(embedding_matrix_array, hidden_size=128, output_size=1)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

train_loader = create_data_loader(train_dataset_instance, batch_size)
val_loader = create_data_loader(val_dataset_instance, batch_size)
                
# Train and validate
val_acc, epochs_used = train_and_validate(model, train_loader, val_loader, optimizer, criterion)
print(f"Validation Accuracy: {val_acc}, over {epochs_used} epochs")

Epoch 1/100, Loss: 0.5467946498251673, Val Accuracy: 0.7382739212007504
Epoch 2/100, Loss: 0.3475748131524271, Val Accuracy: 0.7786116322701688
Epoch 3/100, Loss: 0.18891150587752684, Val Accuracy: 0.775797373358349
Epoch 4/100, Loss: 0.07896038220583726, Val Accuracy: 0.7729831144465291
Epoch 5/100, Loss: 0.03400338272526756, Val Accuracy: 0.7626641651031895
Epoch 6/100, Loss: 0.01741032857871823, Val Accuracy: 0.7711069418386491
Epoch 7/100, Loss: 0.004463940029463221, Val Accuracy: 0.7851782363977486
Epoch 8/100, Loss: 0.0027898363919782797, Val Accuracy: 0.7654784240150094
Epoch 9/100, Loss: 0.019749614002363674, Val Accuracy: 0.7570356472795498
Epoch 10/100, Loss: 0.00684902294606625, Val Accuracy: 0.7701688555347092
Epoch 11/100, Loss: 0.005605665637678829, Val Accuracy: 0.775797373358349
Epoch 12/100, Loss: 0.0006463904098225938, Val Accuracy: 0.7692307692307693
Epoch 13/100, Loss: 6.428473893834901e-05, Val Accuracy: 0.7701688555347092
Epoch 14/100, Loss: 3.0413369074828193e-05

In [13]:
# Step 7: Evaluate on Test Set
test_loader = create_data_loader(test_dataset_instance, batch_size)
test_acc = evaluate(model, test_loader)
print(f"Test Accuracy: {test_acc}")

# Report the configuration
print(f"Final Configuration:\nEpochs: {epochs_used}\nLearning Rate: {lr}\nOptimizer: Adam\nBatch Size: {batch_size}")

Test Accuracy: 0.7861163227016885
Final Configuration:
Epochs: 16
Learning Rate: 0.001
Optimizer: Adam
Batch Size: 64
