## Install and import dependencies

## Part 0. Dataset Preparation

In [41]:
from datasets import load_dataset
dataset = load_dataset("rotten_tomatoes")
train_dataset = dataset['train'] 
validation_dataset = dataset['validation']
test_dataset = dataset['test']

## Part 2 : Model Training & Evaluation - RNN

In [4]:
%pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.5.2-cp312-cp312-macosx_12_0_arm64.whl.metadata (13 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.5.2-cp312-cp312-macosx_12_0_arm64.whl (11.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.0/11.0 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scikit-learn
Successfully installed scikit-learn-1.5.2 threadpoolctl-3.5.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [67]:
import pickle
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from nltk.tokenize import word_tokenize
from sklearn.metrics import accuracy_score
import torch.optim as optim
import spacy
import random

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Load the embedding matrix and word_to_index from the pickle file
with open("base_embedding_matrix.pkl", "rb") as f:
    data = pickle.load(f)
    embedding_matrix = data["embeddings"]
    word_to_index = data["word_to_index"]

# Convert embedding_matrix to a NumPy array and a PyTorch tensor
embedding_matrix_array = np.array(embedding_matrix)
embedding_matrix_tensor = torch.tensor(embedding_matrix_array, dtype=torch.float32)

print(f"Loaded embedding matrix with shape: {embedding_matrix_array.shape}")
print(f"Vocabulary size (word_to_index): {len(word_to_index)}")

Loaded embedding matrix with shape: (16633, 300)
Vocabulary size (word_to_index): 16633


In [68]:
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True  # Ensures reproducibility in CUDA operations
    torch.backends.cudnn.benchmark = False     # Disables some optimizations to ensure determinism

# Set the seed
set_seed(42)

In [69]:
pre_tokenized_train_texts = []
for sentence in train_dataset['text']:
    # Tokenize the sentence using spaCy and store tokens as a list of strings
    tokens = [token.text for token in nlp(sentence.lower())]
    pre_tokenized_train_texts.append(tokens)

In [70]:
# Pre-tokenize validation and test sets
pre_tokenized_validation_texts = [[token.text for token in nlp(sentence.lower())] for sentence in validation_dataset['text']]
pre_tokenized_test_texts = [[token.text for token in nlp(sentence.lower())] for sentence in test_dataset['text']]


In [71]:
# Prepare Dataset for PyTorch
class SentimentDataset(Dataset):
    def __init__(self, tokenized_texts, labels, vocab, embedding_matrix, max_len=30):
        self.texts = tokenized_texts
        self.labels = labels
        self.vocab = word_to_index
        self.embedding_matrix = embedding_matrix
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokens = self.texts[idx]
        label = self.labels[idx]
        vectorized_text = self.vectorize(tokens)
        return torch.tensor(vectorized_text), torch.tensor(label)

    def vectorize(self, tokens):
        vectorized = [self.vocab.get(token, self.vocab['<UNK>']) for token in tokens]

        # Check for out-of-range indices
        for index in vectorized:
            if index >= len(self.embedding_matrix):
                raise ValueError(f"Index {index} is out of range for the embedding matrix.")
                
        # Pad or truncate to max_len
        if len(vectorized) < self.max_len:
            vectorized += [self.vocab['<PAD>']] * (self.max_len - len(vectorized))
        else:
            vectorized = vectorized[:self.max_len]
        return vectorized


In [72]:
# Create the RNN Model
class RNNModel(nn.Module):
    def __init__(self, embedding_matrix, hidden_size, output_size, num_layers=2, bidirectional=False):
        super(RNNModel, self).__init__()
        vocab_size, embedding_dim = embedding_matrix.shape
        
        # Define the embedding layer with pretrained embeddings, frozen
        self.embedding = nn.Embedding.from_pretrained(torch.tensor(embedding_matrix, dtype=torch.float32), freeze=True)
        
        # Define RNN layer 
        self.rnn = nn.RNN(embedding_dim, hidden_size, num_layers, bidirectional=bidirectional, batch_first=True)
        
        # Output layer
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # Forward pass
        x = self.embedding(x)
        out, _ = self.rnn(x)
        # Using Average Pooling
        out = torch.mean(out, dim=1)
        out = self.fc(out)
        return out

In [73]:
# Prepare DataLoader
def create_data_loader(dataset, batch_size):
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)

train_dataset_instance = SentimentDataset(pre_tokenized_train_texts, train_dataset['label'], word_to_index, embedding_matrix)
val_dataset_instance = SentimentDataset(pre_tokenized_validation_texts, validation_dataset['label'], word_to_index, embedding_matrix)
test_dataset_instance = SentimentDataset(pre_tokenized_test_texts, test_dataset['label'], word_to_index, embedding_matrix)

In [74]:
def evaluate(model, data_loader):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for data, labels in data_loader:
            output = model.forward(data)
            probs = torch.sigmoid(output)  # Apply sigmoid to get probabilities
            predicted = (probs >= 0.5).long()  # Convert probabilities to binary predictions
            all_preds.extend(predicted.cpu().numpy().flatten().tolist())
            all_labels.extend(labels.cpu().numpy().tolist())
    acc = accuracy_score(all_labels, all_preds)
    return acc

In [75]:
# Train and validate function
def train_and_validate(model, train_loader, val_loader, optimizer, criterion, max_epochs=100, convergence_threshold=0.001):
    best_val_acc = 0
    epochs_without_improvement = 0
    
    for epoch in range(max_epochs):
        model.train()
        running_loss = 0
        for data, target in train_loader:
            optimizer.zero_grad()
            output = model(data).squeeze(1)
            loss = criterion(output, target.float())
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        
        val_acc = evaluate(model, val_loader)
        print(f"Epoch {epoch+1}/{max_epochs}, Loss: {running_loss/len(train_loader)}, Val Accuracy: {val_acc}")
        
        # Check for improvement
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            epochs_without_improvement = 0  # Reset counter
        else:
            epochs_without_improvement += 1
            
        # Check for convergence
        if epochs_without_improvement >= 10:  # Convergence condition (no improvement for 5 epochs)
            print("Convergence reached, stopping training.")
            break
            
    return best_val_acc, epoch 


In [76]:
# Hyperparameter tuning
learning_rates = [0.001, 0.01]
batch_sizes = [32, 64]
hidden_size = 128
optimizers = ['adam', 'sgd', 'rmsprop']  

vocab_size, embedding_dim = embedding_matrix_array.shape
output_size = 1  

best_val_acc = 0
best_hyperparams = {}

for lr in learning_rates:
    for bs in batch_sizes:
            for opt in optimizers:
                print("Training with the following hyperparameters:")
                print(f"Learning Rate: {lr}, Batch Size: {bs}, Hidden Size: {hidden_size}, Optimizer: {opt}")
                # Initialize model, criterion
                model = RNNModel(embedding_matrix_array, hidden_size=128, output_size=1)
                criterion = nn.BCEWithLogitsLoss()

                # Initialize optimizer based on the selected type
                if opt == 'adam':
                    optimizer = optim.Adam(model.parameters(), lr=lr)
                elif opt == 'sgd':
                    optimizer = optim.SGD(model.parameters(), lr=lr)
                elif opt == 'rmsprop':
                    optimizer = optim.RMSprop(model.parameters(), lr=lr)

                train_loader = create_data_loader(train_dataset_instance, bs)
                val_loader = create_data_loader(val_dataset_instance, bs)
                
                # Train and validate
                val_acc, epochs_used = train_and_validate(model, train_loader, val_loader, optimizer, criterion)
                print(f"Learning Rate: {lr}, Batch Size: {bs} Optimizer: {opt}, Validation Accuracy: {val_acc}")

                # Update best parameters
                if val_acc > best_val_acc:
                    best_val_acc = val_acc
                    best_hyperparams = {
                        'learning_rate': lr,
                        'batch_size': bs,
                        'optimizer': opt
                    }
                    best_epochs = epochs_used

# Print the best configuration
print(f"Best Model Configuration: {best_hyperparams} with Validation Accuracy: {best_val_acc} over {best_epochs} epochs")

Training with the following hyperparameters:
Learning Rate: 0.001, Batch Size: 32, Hidden Size: 128, Optimizer: adam
Epoch 1/100, Loss: 0.5837534061085419, Val Accuracy: 0.7213883677298312
Epoch 2/100, Loss: 0.5237110727065512, Val Accuracy: 0.6894934333958724
Epoch 3/100, Loss: 0.5254920146215275, Val Accuracy: 0.7307692307692307
Epoch 4/100, Loss: 0.5011564123719819, Val Accuracy: 0.7373358348968105
Epoch 5/100, Loss: 0.48441201649355087, Val Accuracy: 0.7504690431519699
Epoch 6/100, Loss: 0.4728004155534037, Val Accuracy: 0.7514071294559099
Epoch 7/100, Loss: 0.46884569704309387, Val Accuracy: 0.7532833020637899
Epoch 8/100, Loss: 0.44855630604292124, Val Accuracy: 0.7410881801125704
Epoch 9/100, Loss: 0.4370223778798786, Val Accuracy: 0.7607879924953096
Epoch 10/100, Loss: 0.4207631019728907, Val Accuracy: 0.7626641651031895
Epoch 11/100, Loss: 0.40114425345976257, Val Accuracy: 0.7429643527204502
Epoch 12/100, Loss: 0.38232182931810726, Val Accuracy: 0.7448405253283302
Epoch 13/10

In [78]:
#Training the model with the best hyperparameters
batch_size = 32
lr=0.001
model = RNNModel(embedding_matrix_array, hidden_size=128, output_size=1)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

train_loader = create_data_loader(train_dataset_instance, batch_size)
val_loader = create_data_loader(val_dataset_instance, batch_size)
                
# Train and validate
val_acc, epochs_used = train_and_validate(model, train_loader, val_loader, optimizer, criterion)
print(f"Validation Accuracy: {val_acc}, over {epochs_used} epochs")

Epoch 1/100, Loss: 0.576576258136092, Val Accuracy: 0.7213883677298312
Epoch 2/100, Loss: 0.521774230155159, Val Accuracy: 0.7298311444652908
Epoch 3/100, Loss: 0.5068495542034228, Val Accuracy: 0.6454033771106942
Epoch 4/100, Loss: 0.5062251841084341, Val Accuracy: 0.7373358348968105
Epoch 5/100, Loss: 0.5072163670473777, Val Accuracy: 0.7204502814258912
Epoch 6/100, Loss: 0.48705723156196795, Val Accuracy: 0.7401500938086304
Epoch 7/100, Loss: 0.46627063574862393, Val Accuracy: 0.7542213883677298
Epoch 8/100, Loss: 0.45223133143191035, Val Accuracy: 0.7542213883677298
Epoch 9/100, Loss: 0.4401021102506123, Val Accuracy: 0.7495309568480301
Epoch 10/100, Loss: 0.42683407340603374, Val Accuracy: 0.7354596622889306
Epoch 11/100, Loss: 0.4197735022962763, Val Accuracy: 0.7514071294559099
Epoch 12/100, Loss: 0.40088874347201003, Val Accuracy: 0.7354596622889306
Epoch 13/100, Loss: 0.37567835331856086, Val Accuracy: 0.7560975609756098
Epoch 14/100, Loss: 0.3614028098989515, Val Accuracy: 0.

In [80]:
# Step 7: Evaluate on Test Set
test_loader = create_data_loader(test_dataset_instance, batch_size)
test_acc = evaluate(model, test_loader)
print(f"Test Accuracy: {test_acc}")

# Report the configuration
print(f"Final Configuration:\nEpochs: {epochs_used}\nLearning Rate: {lr}\nOptimizer: Adam\nBatch Size: {batch_size}")

Test Accuracy: 0.7326454033771107
Final Configuration:
Epochs: 22
Learning Rate: 0.001
Optimizer: Adam
Batch Size: 32


In [28]:
# Step 8: Get a sample sentence from the test set and predict
import random
# Select a random index from the test dataset
random_index = random.randint(0, len(test_dataset) - 1)

# Get the corresponding sentence and its label from the test dataset
sample_sentence = test_dataset[random_index]['text']  # Assuming the dataset contains a 'text' field
true_label = test_dataset[random_index]['label']  # Assuming there's a label field

# Tokenize the sample sentence
sample_tokens = word_tokenize(sample_sentence.lower())

# Convert tokens to indices
sample_indices = []
for token in sample_tokens:
    if token in vocab:
        sample_indices.append(list(vocab).index(token))
    else:
        sample_indices.append(list(vocab).index("<UNK>"))
sample_tensor = torch.tensor(sample_indices).unsqueeze(0)  # Add batch dimension
# Make prediction using the model
model.eval()  # Set the model to evaluation mode
with torch.no_grad():  # No need to compute gradients during inference
    output = model(sample_tensor)  # Pass the tensor to the model
    _, predicted = torch.max(output, 1)  # Get the index of the max log-probability

# Map predicted index to sentiment label
sentiment_labels = ['negative', 'positive']  # Adjust according to your label encoding
predicted_label = sentiment_labels[predicted.item()]

# Print results
print(f"Sample Sentence: '{sample_sentence}'")
print(f"True Label: {true_label}")
print(f"Predicted Label: {predicted_label}")

Sample Sentence: 'directed in a flashy , empty sub-music video style by a director so self-possessed he actually adds a period to his first name'
True Label: negative
Predicted Label: negative
