In [57]:
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim

from nltk.tokenize import word_tokenize
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score

In [34]:
class SentimentDataset_33(Dataset):
    def __init__(self, texts : list[str], labels : list[int], vocab : set, embedding_matrix : dict, max_len=30):
        self.texts : list[str] = texts
        self.labels : list[int] = labels
        self.vocab : dict = self.build_vocab_dict(vocab)  # function to build vocabulary
        self.embedding_matrix : dict = embedding_matrix
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        tokenized_text = word_tokenize(text.lower())
        vectorized_text = self.vectorize(tokenized_text)
        return torch.tensor(vectorized_text, dtype=torch.long), torch.tensor(label, dtype=torch.float)

    def vectorize(self, tokens):
        # Convert tokens to their corresponding index in the vocabulary
        vectorized = [self.vocab.get(token, self.vocab['<UNK>']) for token in tokens]

        # Pad or truncate to max_len
        if len(vectorized) < self.max_len:
            vectorized += [self.vocab['<PAD>']] * (self.max_len - len(vectorized))
        else:
            vectorized = vectorized[:self.max_len]
        return vectorized

    def build_vocab_dict(self, vocab : set):
        if "<PAD>" in vocab : vocab.remove("<PAD>")
        if "<UNK>" in vocab : vocab.remove("<UNK>")
        vocab_dict = {word: idx for idx, word in enumerate(vocab)}
        print(len(vocab_dict))
        vocab_dict['<PAD>'] = len(vocab_dict) # Add padding token
        print(len(vocab_dict))
        vocab_dict['<UNK>'] = len(vocab_dict) # Add unknown token
        print(len(vocab_dict))
        return vocab_dict

In [35]:
from datasets import load_dataset
dataset = load_dataset("rotten_tomatoes")
train_dataset = dataset['train'] 
validation_dataset = dataset['validation']
test_dataset = dataset['test']

In [36]:
train_tokenized = []
for sentence in train_dataset['text']:
    train_tokenized.append(word_tokenize(sentence.lower()))
vocab = {"<PAD>", "<UNK>"} #include a padding and unknown token for future processing
vocab.update(word for sentence in train_tokenized for word in sentence)

print("Number of words in the vocabulary(including padding and unknown tokens):", len(vocab))
print("Number of words in the vocabulary:" , len(vocab)-2)

Number of words in the vocabulary(including padding and unknown tokens): 18031
Number of words in the vocabulary: 18029


In [37]:
# Prepare your datasets
train_texts_33 : list[str] = train_dataset['text']  # List of training texts
train_labels_33 : list[int] = train_dataset['label']  # Corresponding labels for training texts
valid_texts_33 : list[str]= validation_dataset['text']  # List of validation texts
valid_labels_33 : list[int] = validation_dataset['label']  # Corresponding labels for validation texts
vocab_33 : set = vocab
embedding_matrix_33 : dict[ str , np.ndarray]= np.load("embedding_matrix.npy",allow_pickle='TRUE').item()
embedding_matrix_values = np.array(list(embedding_matrix_33.values()), dtype=np.float32)
embedding_matrix_tensor = torch.tensor(embedding_matrix_values, dtype=torch.float32)
embedding_matrix_tensor = torch.tensor(embedding_matrix_values, dtype=torch.float32)

train_dataset_33 : SentimentDataset_33 = SentimentDataset_33(train_texts_33, train_labels_33, vocab_33, embedding_matrix_33)
valid_dataset_33 : SentimentDataset_33 = SentimentDataset_33(valid_texts_33, valid_labels_33, vocab_33, embedding_matrix_33)

18029
18030
18031
18029
18030
18031


In [64]:
def train_33(model, iterator):
    model.train()
    epoch_loss = 0
    for X_batch, y_batch in iterator:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)  # Move to GPU
        optimizer.zero_grad()
        output = model(X_batch).squeeze(1)
        loss = criterion(output, y_batch.float())
        loss.backward()
        # for param in model.parameters():
        #     if param.grad is not None:  # Ensure the gradient is not None
        #         print(f"Gradient norm for {param.shape}: {param.grad.data.norm()}")
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(iterator)

def evaluate_33(model, iterator):
    model.eval()
    epoch_loss = 0
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for X_batch, y_batch in iterator:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)  # Move to GPU
            output = model(X_batch).squeeze(1)
            probs = model.sigmoid(output)
            prediction = (probs >= 0.5).float()
            
            loss = criterion(output, y_batch.float())
            epoch_loss += loss.item()
            all_preds.extend(prediction.tolist())
            all_labels.extend(y_batch.tolist())
    accuracy = accuracy_score(all_labels, all_preds)
    return accuracy, epoch_loss / len(iterator)

def train_and_validate_33(num_epochs, model, train_iterator, valid_iterator):
    best_val_acc = 0
    for epoch in range(num_epochs):
        train_loss = train_33(model, train_iterator)
        accuracy , valid_loss = evaluate_33(model, valid_iterator)
        print(f'Epoch {epoch + 1}: Train Loss = {train_loss:.3f}, Accuracy = {accuracy:.3f}, Val Loss = {valid_loss:.3f}')

        if accuracy > best_val_acc:
            best_val_acc = accuracy
            epochs_without_improvement = 0  # Reset counter
        else:
            epochs_without_improvement += 1
            
        # Check for convergence
        if epochs_without_improvement >= 10:  # Convergence condition (no improvement for 5 epochs)
            print("Convergence reached, stopping training.")
            break
        
    return best_val_acc, epochs_without_improvement

## AttentionBiLSTM

In [61]:
class AttentionBiLSTM(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim, output_dim, dropout_rate):
        super(AttentionBiLSTM, self).__init__()
        # Load pre-trained embeddings with trainable parameter
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        self.lstm = nn.LSTM(embedding_matrix.size(1), hidden_dim, num_layers=1, 
                            bidirectional=True, batch_first=True)
        
        # Attention layer
        self.attention = nn.Linear(hidden_dim * 2, 1)
        
        # Fully connected layer and dropout
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout_rate)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        # Embedding layer
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)  # LSTM output for all time steps
        
        # Attention weights
        attn_weights = torch.softmax(self.attention(lstm_out), dim=1)
        # Weighted sum of LSTM outputs
        weighted_output = torch.sum(attn_weights * lstm_out, dim=1)
        
        # Dropout and fully connected layer
        out = self.dropout(weighted_output)
        return self.fc(out)  # For classification, logits are returned

In [None]:
learning_rates = [0.001, 0.01]
batch_sizes = [32, 64]
dropout_rates = [0.3, 0.5]
hidden_size = 128
output_dim = 1

best_val_acc = 0
best_hyperparams = {}
#switch to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
for lr in learning_rates:
    for bs in batch_sizes:
        for dr in dropout_rates:
            print("Training with the following hyperparameters:")
            print(f"Learning Rate: {lr}, Batch Size: {bs}, Dropout Rate: {dr}, Hidden Size: {hidden_size}")
            model = AttentionBiLSTM(embedding_matrix_tensor, 128, output_dim, dr).to(device)
            optimizer = optim.Adam(model.parameters(), lr=lr)
            criterion = nn.BCEWithLogitsLoss()

            train_iterator_33 = DataLoader(train_dataset_33, bs)
            valid_iterator_33 = DataLoader(valid_dataset_33, bs)
                
            # Train and validate
            val_acc, epochs_used = train_and_validate_33(30, model, train_iterator_33, valid_iterator_33)
            print(f"Learning Rate: {lr}, Batch Size: {bs}, Validation Accuracy: {val_acc}")

            # Update best parameters
            if val_acc > best_val_acc:
                best_val_acc = val_acc
                best_hyperparams = {
                    'learning_rate': lr,
                    'batch_size': bs
                }
                best_epochs = epochs_used

# Print the best configuration
print(f"Best Model Configuration: {best_hyperparams} with Validation Accuracy: {best_val_acc} over {best_epochs} epochs")

Training with the following hyperparameters:
Learning Rate: 0.001, Batch Size: 32, Hidden Size: 128
Epoch 1: Train Loss = 0.336, Accuracy = 0.500, Val Loss = 2.604
Epoch 2: Train Loss = 0.275, Accuracy = 0.539, Val Loss = 1.483
Epoch 3: Train Loss = 0.056, Accuracy = 0.670, Val Loss = 1.270
Epoch 4: Train Loss = 0.012, Accuracy = 0.702, Val Loss = 1.274
Epoch 5: Train Loss = 0.006, Accuracy = 0.708, Val Loss = 1.340
Epoch 6: Train Loss = 0.003, Accuracy = 0.718, Val Loss = 1.381
Epoch 7: Train Loss = 0.002, Accuracy = 0.718, Val Loss = 1.431
Epoch 8: Train Loss = 0.002, Accuracy = 0.725, Val Loss = 1.511
Epoch 9: Train Loss = 0.001, Accuracy = 0.714, Val Loss = 1.612
Epoch 10: Train Loss = 0.001, Accuracy = 0.717, Val Loss = 1.685
Epoch 11: Train Loss = 0.001, Accuracy = 0.720, Val Loss = 1.782
Epoch 12: Train Loss = 0.000, Accuracy = 0.723, Val Loss = 1.722
Epoch 13: Train Loss = 0.000, Accuracy = 0.720, Val Loss = 1.764
Epoch 14: Train Loss = 0.000, Accuracy = 0.720, Val Loss = 1.802

KeyboardInterrupt: 