## Install and import dependencies

## Part 0. Dataset Preparation

In [1]:
from datasets import load_dataset
dataset = load_dataset("rotten_tomatoes")
train_dataset = dataset['train'] 
validation_dataset = dataset['validation']
test_dataset = dataset['test']

  from .autonotebook import tqdm as notebook_tqdm


## Part 2 : Model Training & Evaluation - RNN

In [2]:
%pip install scikit-learn




In [3]:
import pickle
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from nltk.tokenize import word_tokenize
from sklearn.metrics import accuracy_score
import torch.optim as optim
import spacy
import random

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Load the embedding matrix and word_to_index from the pickle file
with open("updated_embedding_matrix.pkl", "rb") as f:
    data = pickle.load(f)
    embedding_matrix = data["embeddings"]
    word_to_index = data["word_to_index"]

# Convert embedding_matrix to a NumPy array and a PyTorch tensor
embedding_matrix_array = np.array(embedding_matrix)
embedding_matrix_tensor = torch.tensor(embedding_matrix_array, dtype=torch.float32)

print(f"Loaded embedding matrix with shape: {embedding_matrix_array.shape}")
print(f"Vocabulary size (word_to_index): {len(word_to_index)}")

Loaded embedding matrix with shape: (16633, 300)
Vocabulary size (word_to_index): 16633


In [4]:
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True  # Ensures reproducibility in CUDA operations
    torch.backends.cudnn.benchmark = False     # Disables some optimizations to ensure determinism

# Set the seed
set_seed(42)

In [5]:
pre_tokenized_train_texts = []
for sentence in train_dataset['text']:
    # Tokenize the sentence using spaCy and store tokens as a list of strings
    tokens = [token.text for token in nlp(sentence.lower())]
    pre_tokenized_train_texts.append(tokens)

In [6]:
# Pre-tokenize validation and test sets
pre_tokenized_validation_texts = [[token.text for token in nlp(sentence.lower())] for sentence in validation_dataset['text']]
pre_tokenized_test_texts = [[token.text for token in nlp(sentence.lower())] for sentence in test_dataset['text']]


In [7]:
# Prepare Dataset for PyTorch
class SentimentDataset(Dataset):
    def __init__(self, tokenized_texts, labels, vocab, embedding_matrix, max_len=30):
        self.texts = tokenized_texts
        self.labels = labels
        self.vocab = word_to_index
        self.embedding_matrix = embedding_matrix
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokens = self.texts[idx]
        label = self.labels[idx]
        vectorized_text = self.vectorize(tokens)
        return torch.tensor(vectorized_text), torch.tensor(label)

    def vectorize(self, tokens):
        vectorized = [self.vocab.get(token, self.vocab['<UNK>']) for token in tokens]

        # Check for out-of-range indices
        for index in vectorized:
            if index >= len(self.embedding_matrix):
                raise ValueError(f"Index {index} is out of range for the embedding matrix.")
                
        # Pad or truncate to max_len
        if len(vectorized) < self.max_len:
            vectorized += [self.vocab['<PAD>']] * (self.max_len - len(vectorized))
        else:
            vectorized = vectorized[:self.max_len]
        return vectorized


## 3.5: Improvement - Multihead Self Attention BiGRU

In [9]:
import torch.nn.functional as F

class MultiHeadSelfAttention(nn.Module):
    def __init__(self, embed_size, num_heads):
        super(MultiHeadSelfAttention, self).__init__()
        self.num_heads = num_heads
        self.embed_size = embed_size
        self.head_dim = embed_size // num_heads

        assert self.head_dim * num_heads == embed_size, "Embedding size must be divisible by number of heads"

        self.values = nn.Linear(embed_size, embed_size)
        self.keys = nn.Linear(embed_size, embed_size)
        self.queries = nn.Linear(embed_size, embed_size)
        self.fc_out = nn.Linear(embed_size, embed_size)

    def forward(self, values, keys, query, mask=None):
        N = query.shape[0]
        value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]

        # Split the embedding into num_heads different pieces
        values = values.reshape(N, value_len, self.num_heads, self.head_dim).transpose(1, 2)
        keys = keys.reshape(N, key_len, self.num_heads, self.head_dim).transpose(1, 2)
        query = query.reshape(N, query_len, self.num_heads, self.head_dim).transpose(1, 2)

        energy = torch.einsum("nqhd,nkhd->nhqk", [query, keys])  # (N, num_heads, query_len, key_len)

        if mask is not None:
            energy = energy.masked_fill(mask == 0, float("-1e20"))

        attention = torch.softmax(energy / (self.embed_size ** (1 / 2)), dim=-1)  # (N, num_heads, query_len, key_len)

        out = torch.einsum("nhql,nlhd->nqhd", [attention, values])  # (N, query_len, num_heads, head_dim)
        out = out.transpose(1, 2).reshape(N, query_len, self.num_heads * self.head_dim)  # (N, query_len, embed_size)

        out = self.fc_out(out)
        return out


class BiGRUWithSelfAttention(nn.Module):
    def __init__(self, embedding_matrix, hidden_size, output_size, num_heads=8, dropout=0.1):
        super(BiGRUWithSelfAttention, self).__init__()
        
        # Embedding layer
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        
        # Bi-GRU Layer
        self.bigru = nn.GRU(input_size=embedding_matrix.size(1), hidden_size=hidden_size,
                            num_layers=1, bidirectional=True, batch_first=True, dropout=dropout)

        # Multi-Head Self Attention Layer
        self.attention = MultiHeadSelfAttention(embed_size=hidden_size * 2, num_heads=num_heads)

        # Fully connected output layer
        self.fc = nn.Linear(hidden_size * 2, output_size)

    def forward(self, x):
        # Get the embedding for the input sequence
        embedded = self.embedding(x)
        
        # Pass through BiGRU
        gru_out, _ = self.bigru(embedded)

        # Pass through Multi-Head Attention
        attention_out = self.attention(gru_out, gru_out, gru_out)

        # Pass through a fully connected layer for final output
        output = self.fc(attention_out[:, -1, :])  # Take the last output for classification

        return output


In [10]:
# Prepare DataLoader
def create_data_loader(dataset, batch_size):
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)

train_dataset_instance = SentimentDataset(pre_tokenized_train_texts, train_dataset['label'], word_to_index, embedding_matrix)
val_dataset_instance = SentimentDataset(pre_tokenized_validation_texts, validation_dataset['label'], word_to_index, embedding_matrix)
test_dataset_instance = SentimentDataset(pre_tokenized_test_texts, test_dataset['label'], word_to_index, embedding_matrix)

In [11]:
def evaluate(model, data_loader):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for data, labels in data_loader:
            output = model.forward(data)
            probs = torch.sigmoid(output)  # Apply sigmoid to get probabilities
            predicted = (probs >= 0.5).long()  # Convert probabilities to binary predictions
            all_preds.extend(predicted.cpu().numpy().flatten().tolist())
            all_labels.extend(labels.cpu().numpy().tolist())
    acc = accuracy_score(all_labels, all_preds)
    return acc

In [19]:
def train_and_validate(model, train_loader, val_loader, optimizer, criterion, max_epochs=100, convergence_threshold=0.001):
    best_val_acc = 0
    epochs_without_improvement = 0
    prev_val_acc = 0  # Track the previous validation accuracy

    for epoch in range(max_epochs):
        model.train()
        running_loss = 0
        for data, target in train_loader:
            optimizer.zero_grad()
            output = model(data)  # Unpack model output (assuming model returns output and attention weights)
            output = output.squeeze(1)  # Squeeze to remove any unwanted dimensions (if needed)
            loss = criterion(output, target.float())
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        
        val_acc = evaluate(model, val_loader)
        print(f"Epoch {epoch+1}/{max_epochs}, Loss: {running_loss/len(train_loader)}, Val Accuracy: {val_acc}")

        # Check for improvement in accuracy or loss
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            epochs_without_improvement = 0  # Reset counter
        else:
            epochs_without_improvement += 1
        
        # Check for convergence in terms of accuracy improvement (threshold for small change)
        if abs(val_acc - prev_val_acc) < convergence_threshold:
            epochs_without_improvement += 1
            print(f"Small change in validation accuracy ({abs(val_acc - prev_val_acc):.6f}), increasing patience.")
        
        prev_val_acc = val_acc  # Update the previous validation accuracy
        
        # Check for convergence (no improvement for 10 epochs or small changes)
        if epochs_without_improvement >= 10:  # Convergence condition
            print("Convergence reached, stopping training.")
            break
            
    return best_val_acc, epoch


In [20]:
#Training the model with the best hyperparameters
batch_size = 64
lr=0.001
model = BiGRUWithSelfAttention(embedding_matrix_tensor, hidden_size=128, output_size=1)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

train_loader = create_data_loader(train_dataset_instance, batch_size)
val_loader = create_data_loader(val_dataset_instance, batch_size)
                
# Train and validate
val_acc, epochs_used = train_and_validate(model, train_loader, val_loader, optimizer, criterion)
print(f"Validation Accuracy: {val_acc}, over {epochs_used} epochs")

Epoch 1/100, Loss: 0.6221631649714797, Val Accuracy: 0.7551594746716698
Epoch 2/100, Loss: 0.385480981042136, Val Accuracy: 0.7682926829268293
Epoch 3/100, Loss: 0.2059583786819408, Val Accuracy: 0.7682926829268293
Small change in validation accuracy (0.000000), increasing patience.
Epoch 4/100, Loss: 0.08824793556566114, Val Accuracy: 0.7598499061913696
Epoch 5/100, Loss: 0.03770583792269897, Val Accuracy: 0.7382739212007504
Epoch 6/100, Loss: 0.027207952549632416, Val Accuracy: 0.7664165103189493
Epoch 7/100, Loss: 0.014696399301834362, Val Accuracy: 0.7673545966228893
Small change in validation accuracy (0.000938), increasing patience.
Epoch 8/100, Loss: 0.012700410767626921, Val Accuracy: 0.7542213883677298
Epoch 9/100, Loss: 0.009156539890139342, Val Accuracy: 0.7354596622889306
Epoch 10/100, Loss: 0.011639100531215602, Val Accuracy: 0.7626641651031895
Convergence reached, stopping training.
Validation Accuracy: 0.7682926829268293, over 9 epochs


In [21]:
# Step 7: Evaluate on Test Set
test_loader = create_data_loader(test_dataset_instance, batch_size)
test_acc = evaluate(model, test_loader)
print(f"Test Accuracy: {test_acc}")

# Report the configuration
print(f"Final Configuration:\nEpochs: {epochs_used}\nLearning Rate: {lr}\nOptimizer: Adam\nBatch Size: {batch_size}")

Test Accuracy: 0.7804878048780488
Final Configuration:
Epochs: 9
Learning Rate: 0.001
Optimizer: Adam
Batch Size: 64
