## Install and import dependencies

In [1]:
# %pip install torch gensim datasets nltk

In [1]:
import os
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import spacy
import random
import pickle

from torch.utils.data import DataLoader, Dataset

from sklearn.metrics import accuracy_score

from datasets import load_dataset


  from .autonotebook import tqdm as notebook_tqdm


## Part 0. Dataset Preparation

In [2]:
from datasets import load_dataset
dataset = load_dataset("rotten_tomatoes")
train_dataset = dataset['train'] 
validation_dataset = dataset['validation']
test_dataset = dataset['test']

In [3]:
#Number of sentences in each set 
print(f"Size of training set: {train_dataset.num_rows} sentences")
print(f"Size of validation set: {validation_dataset.num_rows} sentences")
print(f"Size of test set: {test_dataset.num_rows} sentences")

Size of training set: 8530 sentences
Size of validation set: 1066 sentences
Size of test set: 1066 sentences


In [4]:
# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Load the embedding matrix and word_to_index from the pickle file
with open("updated_embedding_matrix.pkl", "rb") as f:
    data = pickle.load(f)
    embedding_matrix = data["embeddings"]
    word_to_index = data["word_to_index"]

# Convert embedding_matrix to a NumPy array and a PyTorch tensor
embedding_matrix_array = np.array(embedding_matrix)
embedding_matrix_tensor = torch.tensor(embedding_matrix_array, dtype=torch.float32)

print(f"Loaded embedding matrix with shape: {embedding_matrix_array.shape}")
print(f"Vocabulary size (word_to_index): {len(word_to_index)}")

Loaded embedding matrix with shape: (16633, 300)
Vocabulary size (word_to_index): 16633


tokenise train, validation, test dataset

In [5]:
pre_tokenized_train_texts = []
for sentence in train_dataset['text']:
    # Tokenize the sentence using spaCy and store tokens as a list of strings
    tokens = [token.text for token in nlp(sentence.lower())]
    pre_tokenized_train_texts.append(tokens)

In [6]:
# Pre-tokenize validation and test sets
pre_tokenized_validation_texts = [[token.text for token in nlp(sentence.lower())] for sentence in validation_dataset['text']]
pre_tokenized_test_texts = [[token.text for token in nlp(sentence.lower())] for sentence in test_dataset['text']]

In [None]:
# #tokenize sentences 
# train_tokenized = []
# for sentence in train_dataset['text']:
#     train_tokenized.append(word_tokenize(sentence.lower()))

# print('sample sentence:', train_tokenized[0],'\n')

# #build vocabulary
# vocab = {"<PAD>", "<UNK>"} #include a padding and unknown token for future processing
# vocab.update(word for sentence in train_tokenized for word in sentence)

# print("Number of words in the vocabulary(including padding and unknown tokens):", len(vocab))
# print("Number of words in the vocabulary:" , len(vocab)-2)


sample sentence: ['the', 'rock', 'is', 'destined', 'to', 'be', 'the', '21st', 'century', "'s", 'new', '``', 'conan', '``', 'and', 'that', 'he', "'s", 'going', 'to', 'make', 'a', 'splash', 'even', 'greater', 'than', 'arnold', 'schwarzenegger', ',', 'jean-claud', 'van', 'damme', 'or', 'steven', 'segal', '.'] 

Number of words in the vocabulary(including padding and unknown tokens): 18031
Number of words in the vocabulary: 18029


## Part 3 
# 3.Keeping the above two adjustments, replace your simple RNN model in Part 2 wioth a biLSTM model and biGRU model.

In [None]:
# import torch
# import torch.nn as nn
# import torch.optim as optim
# from torch.utils.data import DataLoader, Dataset
# import numpy as np
# from nltk.tokenize import word_tokenize
# from sklearn.metrics import accuracy_score

In [None]:
# embedding_matrix=np.load("embedding_matrix.npy",allow_pickle='TRUE').item()
# # display(embedding_matrix)

In [7]:
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True  # Ensures reproducibility in CUDA operations
    torch.backends.cudnn.benchmark = False     # Disables some optimizations to ensure determinism

# Set the seed
set_seed(42)

In [8]:
class SentimentDataset(Dataset):
    def __init__(self, tokenized_texts, labels : list[int], vocab : set, embedding_matrix : dict, max_len=30):
        self.texts = tokenized_texts
        self.labels  = labels
        self.vocab = word_to_index
        self.embedding_matrix : dict = embedding_matrix
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        vectorized_text = self.vectorize(text)
        return torch.tensor(vectorized_text, dtype=torch.long), torch.tensor(label, dtype=torch.float)
            
    def vectorize(self, tokens):
        # Convert tokens to their corresponding index in the vocabulary
        vectorized = [self.vocab.get(token, self.vocab['<UNK>']) for token in tokens]

        # Pad or truncate to max_len
        if len(vectorized) < self.max_len:
            vectorized += [self.vocab['<PAD>']] * (self.max_len - len(vectorized))
        else:
            vectorized = vectorized[:self.max_len]
        return vectorized

    def build_vocab_dict(self, vocab : set):
        if "<PAD>" in vocab : vocab.remove("<PAD>")
        if "<UNK>" in vocab : vocab.remove("<UNK>")
        vocab_dict = {word: idx for idx, word in enumerate(vocab)}
        vocab_dict['<PAD>'] = len(vocab_dict) # Add padding token
        vocab_dict['<UNK>'] = len(vocab_dict) # Add unknown token
        return vocab_dict


In [None]:
# # Converting the Vocab set to dictionary 
# def build_vocab_dict(vocab_set):
#     # Create the vocabulary dictionary without <PAD> and <UNK>
#     vocab_set.discard("<PAD>")
#     vocab_set.discard("<UNK>")
#     vocab_dict = {word: idx for idx, word in enumerate(vocab_set, start=2)}

#     # Check for <PAD> and <UNK> existence and assign them fixed indices if they are present
#     if "<PAD>" not in vocab_dict:
#         vocab_dict["<PAD>"] = 0  # Index for padding token
#     if "<UNK>" not in vocab_dict:
#         vocab_dict["<UNK>"] = 1  # Index for unknown token
    
#     #add the <PAD> and <UNK> back to the vocab
#     vocab_set.add("<PAD>")
#     vocab_set.add("<UNK>")

# biGRU Model

In [9]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

In [None]:
class SentimentBiGRU(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim, output_dim, dropout_rate):
        super(SentimentBiGRU, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)  
        self.hidden_dim = hidden_dim
        
        # Bidirectional GRU layer
        self.gru = nn.GRU(embedding_matrix.size(1), hidden_dim, num_layers=1, bidirectional=True, batch_first=True)
        
        # Fully connected output layer
        self.fc = nn.Linear(hidden_dim * 2, output_dim)  # Output size is 2 * hidden_dim for bidirectional
        self.dropout = nn.Dropout(dropout_rate)  # Dropout layer for regularization
        self.sigmoid = nn.Sigmoid()  # Sigmoid activation for binary classification
        

    def forward(self, x):
        embedded = self.embedding(x)  # Shape: [batch_size, sequence_length, embedding_dim]
        gru_out, _ = self.gru(embedded)  # gru_out shape: [batch_size, sequence_length, hidden_dim * 2]
        # Concatenate last hidden states from forward and backward GRUs
        out = torch.cat((gru_out[:, -1, :self.hidden_dim], gru_out[:, 0, self.hidden_dim:]), dim=1)  # Shape: [batch_size, hidden_dim * 2]
        out = self.dropout(out)  # Apply dropout
        return self.fc(out)  # Final output

In [11]:

# embedding_matrix : dict[ str , np.ndarray]= np.load("embedding_matrix.npy",allow_pickle='TRUE').item()
# embedding_matrix_values = np.array(list(embedding_matrix.values()), dtype=np.float32)
# embedding_matrix_tensor = torch.tensor(embedding_matrix_values, dtype=torch.float32)

# Create dataset instances
train_dataset = SentimentDataset(pre_tokenized_train_texts, train_dataset['label'], word_to_index, embedding_matrix)
valid_dataset = SentimentDataset(pre_tokenized_validation_texts, validation_dataset['label'], word_to_index, embedding_matrix)
test_dataset = SentimentDataset(pre_tokenized_test_texts, test_dataset['label'], word_to_index, embedding_matrix)
# Create data loaders
train_iterator = DataLoader(train_dataset, batch_size=32, shuffle=True)
valid_iterator = DataLoader(valid_dataset, batch_size=32, shuffle=False)
test_iterator = DataLoader(test_dataset, batch_size=32, shuffle=False)


In [12]:
def train(model, iterator, optimizer, criterion):
    model.train()
    epoch_loss = 0
    for X_batch, y_batch in iterator:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)  # Move to GPU
        optimizer.zero_grad()
        output = model(X_batch).squeeze(1)
        loss = criterion(output, y_batch.float())
        loss.backward()
        # for param in model.parameters():
        #     if param.grad is not None:  # Ensure the gradient is not None
        #         print(f"Gradient norm for {param.shape}: {param.grad.data.norm()}")
        optimizer.step()
        # print("model.parameters():",model.parameters())
        epoch_loss += loss.item()
    return epoch_loss / len(iterator)


In [13]:
def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for X_batch, y_batch in iterator:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)  # Move to GPU
            output = model(X_batch).squeeze(1)
            
            probs = model.sigmoid(output)
            prediction = (probs >= 0.5).float()
            
            loss = criterion(output, y_batch.float())
            epoch_loss += loss.item()
            
            all_preds.extend(prediction.tolist())
            all_labels.extend(y_batch.tolist())
    accuracy = accuracy_score(all_labels, all_preds)
    return accuracy, epoch_loss / len(iterator)


In [23]:
def train_and_validate(num_epochs, model, train_iterator, valid_iterator, optimizer, criterion, scheduler):
    for epoch in range(num_epochs):
        train_loss = train(model, train_iterator, optimizer, criterion)
        accuracy , valid_loss = evaluate(model, valid_iterator, criterion)
        scheduler.step(valid_loss)

        if (epoch==0):
            best_acc = accuracy
            epochs_without_improvement = 0
        print(f'Epoch {epoch + 1}: Train Loss = {train_loss:.3f}, Accuracy = {accuracy:.3f}, Val Loss = {valid_loss:.3f} Learning Rate: {scheduler.optimizer.param_groups[0]["lr"]:.6f}')

        if accuracy > best_acc:
            best_acc = accuracy
            epochs_without_improvement = 0  # Reset counter
        else:
            epochs_without_improvement += 1
            
        # Check for convergence
        if epochs_without_improvement >= 6:  # Convergence condition (no improvement for 4 epochs)
            print("Convergence reached, stopping training.")
            break

    return accuracy

In [18]:
hidden_dim = 128  # Adjust as needed
output_dim = 1  # Binary sentiment classification
dropout_rate = 0.3  # Adjust as needed

criterion = nn.BCEWithLogitsLoss()
model = SentimentBiGRU(embedding_matrix_tensor, hidden_dim, output_dim, dropout_rate=0.5).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5)

In [19]:
# Now you can run your training loop
train_and_validate(30, model, train_iterator, valid_iterator, optimizer, criterion, scheduler)

Epoch 1: Train Loss = 0.548, Accuracy = 0.780, Val Loss = 0.470 Learning Rate: 0.001000
Epoch 2: Train Loss = 0.323, Accuracy = 0.780, Val Loss = 0.492 Learning Rate: 0.001000
Epoch 3: Train Loss = 0.150, Accuracy = 0.765, Val Loss = 0.562 Learning Rate: 0.001000
Epoch 4: Train Loss = 0.057, Accuracy = 0.755, Val Loss = 0.857 Learning Rate: 0.001000
Epoch 5: Train Loss = 0.025, Accuracy = 0.750, Val Loss = 0.892 Learning Rate: 0.001000
Epoch 6: Train Loss = 0.009, Accuracy = 0.741, Val Loss = 1.338 Learning Rate: 0.001000
Convergence reached, stopping training.


## Hyper-parameter training

In [24]:
import itertools

# Define the hyper-parameter grid
hidden_dims = [64, 128]
learning_rates = [0.001]
dropout_rates = [0.3, 0.5]
batch_sizes = [32, 64]
output_dim = 1 
best_acc = 0
best_hidden_dim = 0
best_dropout_rate = 0
best_bs = 0

# Iterate over all combinations of hyper-parameters
for hidden_dim, lr, dropout_rate, bs in itertools.product(hidden_dims, learning_rates, dropout_rates, batch_sizes):
    print(f'Training with hidden_dim={hidden_dim}, lr={lr}, dropout_rate={dropout_rate}, batch_size={bs}')
    
    model_hyper = SentimentBiGRU(embedding_matrix_tensor, hidden_dim, output_dim, dropout_rate).to(device)
    criterion_hyper = nn.BCEWithLogitsLoss()
    optimizer_hyper = optim.Adam(model_hyper.parameters(), lr=lr)
    scheduler_hyper = optim.lr_scheduler.ReduceLROnPlateau(optimizer_hyper, mode='min', factor=0.1, patience=4)

    train_iterator_hyper = DataLoader(train_dataset, bs, shuffle=True)
    valid_iterator_hyper = DataLoader(valid_dataset, bs, shuffle=False)

    accuracy = train_and_validate(25, model_hyper, train_iterator_hyper, valid_iterator_hyper, optimizer_hyper, criterion_hyper, scheduler_hyper)

    if accuracy > best_acc:
        best_acc = accuracy
        best_hidden_dim = hidden_dim
        best_dropout_rate = dropout_rate
        best_bs = bs


print(f'Best accuracy: {best_acc:.3f} with hidden_dim={best_hidden_dim}, dropout_rate={best_dropout_rate}, batch_size={best_bs}')

Training with hidden_dim=64, lr=0.001, dropout_rate=0.3, batch_size=32
Epoch 1: Train Loss = 0.563, Accuracy = 0.747, Val Loss = 0.495 Learning Rate: 0.001000
Epoch 2: Train Loss = 0.330, Accuracy = 0.765, Val Loss = 0.489 Learning Rate: 0.001000
Epoch 3: Train Loss = 0.156, Accuracy = 0.765, Val Loss = 0.644 Learning Rate: 0.001000
Epoch 4: Train Loss = 0.066, Accuracy = 0.757, Val Loss = 0.770 Learning Rate: 0.001000
Epoch 5: Train Loss = 0.026, Accuracy = 0.757, Val Loss = 0.970 Learning Rate: 0.001000
Epoch 6: Train Loss = 0.011, Accuracy = 0.754, Val Loss = 1.103 Learning Rate: 0.001000
Epoch 7: Train Loss = 0.008, Accuracy = 0.756, Val Loss = 1.243 Learning Rate: 0.000100
Epoch 8: Train Loss = 0.003, Accuracy = 0.752, Val Loss = 1.260 Learning Rate: 0.000100
Epoch 9: Train Loss = 0.002, Accuracy = 0.750, Val Loss = 1.288 Learning Rate: 0.000100
Convergence reached, stopping training.
Training with hidden_dim=64, lr=0.001, dropout_rate=0.3, batch_size=64
Epoch 1: Train Loss = 0.59

## Training with best parameters 
hidden_dim=64
dropout_rate=0.3
batch_size=64

In [43]:
hidden_dim_best = 128
lr=0.001
dropout_rate_best=0.3
bs_best=32
output_dim = 1 

model_best = SentimentBiGRU(embedding_matrix_tensor, hidden_dim_best, output_dim, dropout_rate_best).to(device)
criterion_best = nn.BCEWithLogitsLoss()
optimizer_best = optim.Adam(model_best.parameters(), lr)
scheduler_best = optim.lr_scheduler.ReduceLROnPlateau(optimizer_best, mode='min', factor=0.1, patience=5)

train_iterator_best = DataLoader(train_dataset, bs_best)
valid_iterator_best = DataLoader(valid_dataset, bs_best)

train_and_validate(25, model_best, train_iterator_best, valid_iterator_best, optimizer_best, criterion_best, scheduler_best)


Epoch 1: Train Loss = 0.283, Accuracy = 0.500, Val Loss = 2.815 Learning Rate: 0.001000
Epoch 2: Train Loss = 0.446, Accuracy = 0.500, Val Loss = 2.534 Learning Rate: 0.001000
Epoch 3: Train Loss = 0.426, Accuracy = 0.500, Val Loss = 2.457 Learning Rate: 0.001000
Epoch 4: Train Loss = 0.424, Accuracy = 0.500, Val Loss = 2.444 Learning Rate: 0.001000
Epoch 5: Train Loss = 0.422, Accuracy = 0.500, Val Loss = 2.255 Learning Rate: 0.001000
Epoch 6: Train Loss = 0.354, Accuracy = 0.500, Val Loss = 2.639 Learning Rate: 0.001000
Convergence reached, stopping training.


0.5

## Testing

In [27]:
def test(model, iterator, criterion):
    model.eval()
    epoch_loss = 0
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for X_batch, y_batch in iterator:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)  # Move to GPU
            output = model(X_batch).squeeze(1)
            
            probs = model.sigmoid(output)
            prediction = (probs >= 0.5).float()
            
            loss = criterion(output, y_batch.float())

            epoch_loss += loss.item()
            all_preds.extend(prediction.tolist())
            all_labels.extend(y_batch.tolist())
            
    accuracy = accuracy_score(all_labels, all_preds)
    print(f"Accuracy = {accuracy:.3f}")

In [None]:
test(model_best, test_iterator, criterion_best)

Accuracy = 0.775


## BiGRU with an Attention Layer

In [36]:
import torch.nn.functional as F

class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super(Attention, self).__init__()
        self.hidden_dim = hidden_dim
        self.attn = nn.Linear(hidden_dim, hidden_dim)

    def forward(self, hidden, encoder_outputs):
        """
        hidden: [batch_size, hidden_dim] (last hidden state of the decoder or a step in the sequence)
        encoder_outputs: [batch_size, seq_len, hidden_dim] (outputs from the encoder)
        """
        # Compute the attention weights using the query (hidden state) and keys (encoder outputs)
        attn_weights = torch.matmul(encoder_outputs, hidden.unsqueeze(2)).squeeze(2) # [batch_size, seq_len]
        
        # Apply softmax to get the attention weights
        attn_weights = F.softmax(attn_weights, dim=1)
        
        # Compute the weighted sum of the encoder outputs (values)
        context = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs).squeeze(1) # [batch_size, hidden_dim]
        
        return context

In [37]:

class BiGRUWithAttention(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim, output_dim, dropout_rate):
        super(BiGRUWithAttention, self).__init__()
        # Load pre-trained embeddings
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        
        # Define the GRU layer (bidirectional)
        self.gru = nn.GRU(embedding_matrix.size(1), hidden_dim, num_layers=1, bidirectional=True, batch_first=True)
        
        # Attention layer
        self.attn = Attention(hidden_dim * 2)
        
        # Fully connected layer
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        
        # Dropout layer
        self.dropout = nn.Dropout(dropout_rate)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # Embed the input
        embedded = self.embedding(x) # [batch_size, seq_len, embedding_dim]
        
        # Get the outputs and hidden states from the GRU
        gru_out, _ = self.gru(embedded) # gru_out: [batch_size, seq_len, hidden_dim * 2]
        
        # Use the last hidden state of the GRU as the query for the attention mechanism
        last_hidden = gru_out[:, -1, :] # [batch_size, 2 * hidden_dim]
        
        # Apply attention to the GRU outputs
        context = self.attn(last_hidden, gru_out) # context: [batch_size, 2 * hidden_dim]
        
        # Pass the context vector through a fully connected layer
        output = self.fc(context) # [batch_size, output_dim]
        
        return output

In [38]:
hidden_dim_attn = 128
lr_attn=0.001
dropout_rate_attn=0.3
bs_attn=32
output_dim_attn = 1 


model_attn= BiGRUWithAttention(embedding_matrix_tensor, hidden_dim_attn, output_dim_attn, dropout_rate_attn).to(device)
criterion_attn = nn.BCEWithLogitsLoss()
optimizer_attn = torch.optim.Adam(model_attn.parameters(), lr=1e-3)
scheduler_attn = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer_attn, mode='min', factor=0.1, patience=4)

In [39]:
train_and_validate(20, model_attn, train_iterator,valid_iterator, optimizer_attn, criterion_attn, scheduler_attn)

Epoch 1: Train Loss = 0.546, Accuracy = 0.772, Val Loss = 0.474 Learning Rate: 0.001000
Epoch 2: Train Loss = 0.350, Accuracy = 0.771, Val Loss = 0.492 Learning Rate: 0.001000
Epoch 3: Train Loss = 0.217, Accuracy = 0.777, Val Loss = 0.565 Learning Rate: 0.001000
Epoch 4: Train Loss = 0.133, Accuracy = 0.770, Val Loss = 0.674 Learning Rate: 0.001000
Epoch 5: Train Loss = 0.075, Accuracy = 0.752, Val Loss = 0.928 Learning Rate: 0.001000
Epoch 6: Train Loss = 0.043, Accuracy = 0.763, Val Loss = 0.903 Learning Rate: 0.000100
Epoch 7: Train Loss = 0.024, Accuracy = 0.763, Val Loss = 0.947 Learning Rate: 0.000100
Epoch 8: Train Loss = 0.021, Accuracy = 0.761, Val Loss = 0.995 Learning Rate: 0.000100
Epoch 9: Train Loss = 0.018, Accuracy = 0.762, Val Loss = 1.032 Learning Rate: 0.000100
Convergence reached, stopping training.


0.7617260787992496

In [40]:
test(model_attn, test_iterator, criterion_attn)

Accuracy = 0.781
