## Install and import dependencies

In [27]:
# %pip install torch datasets  

In [28]:
import os
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import spacy
import random
import pickle

from torch.utils.data import DataLoader, Dataset

from sklearn.metrics import accuracy_score

from datasets import load_dataset


## Part 0. Dataset Preparation

In [29]:
from datasets import load_dataset
dataset = load_dataset("rotten_tomatoes")
train_dataset = dataset['train'] 
validation_dataset = dataset['validation']
test_dataset = dataset['test']

In [30]:
#Number of sentences in each set 
print(f"Size of training set: {train_dataset.num_rows} sentences")
print(f"Size of validation set: {validation_dataset.num_rows} sentences")
print(f"Size of test set: {test_dataset.num_rows} sentences")

Size of training set: 8530 sentences
Size of validation set: 1066 sentences
Size of test set: 1066 sentences


In [None]:
# # Load spaCy model
# nlp = spacy.load("en_core_web_sm")

# # Load the embedding matrix and word_to_index from the pickle file
# with open("updated_embedding_matrix.pkl", "rb") as f:
#     data = pickle.load(f)
#     embedding_matrix = data["embeddings"]
#     word_to_index = data["word_to_index"]

# # Convert embedding_matrix to a NumPy array and a PyTorch tensor
# embedding_matrix_array = np.array(embedding_matrix)
# embedding_matrix_tensor = torch.tensor(embedding_matrix_array, dtype=torch.float32)

# print(f"Loaded embedding matrix with shape: {embedding_matrix_array.shape}")
# print(f"Vocabulary size (word_to_index): {len(word_to_index)}")

Loaded embedding matrix with shape: (16633, 300)
Vocabulary size (word_to_index): 16633


tokenise train, validation, test dataset

In [None]:
# pre_tokenized_train_texts = []
# for sentence in train_dataset['text']:
#     # Tokenize the sentence using spaCy and store tokens as a list of strings
#     tokens = [token.text for token in nlp(sentence.lower())]
#     pre_tokenized_train_texts.append(tokens)

In [None]:
# # Pre-tokenize validation and test sets
# pre_tokenized_validation_texts = [[token.text for token in nlp(sentence.lower())] for sentence in validation_dataset['text']]
# pre_tokenized_test_texts = [[token.text for token in nlp(sentence.lower())] for sentence in test_dataset['text']]


## Part 3.3.Keeping the above two adjustments, replace your simple RNN model in Part 2 wioth a biLSTM model and biGRU model.

# biLSTM Model

preparing the train dataset. text -> word index

In [122]:
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True  # Ensures reproducibility in CUDA operations
    torch.backends.cudnn.benchmark = False     # Disables some optimizations to ensure determinism

# Set the seed
set_seed(42)

In [35]:
class SentimentDataset(Dataset):
    def __init__(self, tokenized_texts, labels : list[int], vocab : set, embedding_matrix : dict, max_len=30):
        self.texts = tokenized_texts
        self.labels  = labels
        self.vocab = word_to_index
        self.embedding_matrix : dict = embedding_matrix
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        vectorized_text = self.vectorize(text)
        return torch.tensor(vectorized_text, dtype=torch.long), torch.tensor(label, dtype=torch.float)
            
    def vectorize(self, tokens):
        # Convert tokens to their corresponding index in the vocabulary
        vectorized = [self.vocab.get(token, self.vocab['<UNK>']) for token in tokens]

        # Pad or truncate to max_len
        if len(vectorized) < self.max_len:
            vectorized += [self.vocab['<PAD>']] * (self.max_len - len(vectorized))
        else:
            vectorized = vectorized[:self.max_len]
        return vectorized

    def build_vocab_dict(self, vocab : set):
        if "<PAD>" in vocab : vocab.remove("<PAD>")
        if "<UNK>" in vocab : vocab.remove("<UNK>")
        vocab_dict = {word: idx for idx, word in enumerate(vocab)}
        vocab_dict['<PAD>'] = len(vocab_dict) # Add padding token
        vocab_dict['<UNK>'] = len(vocab_dict) # Add unknown token
        return vocab_dict


make the bilstm

In [None]:
class SentimentBiLSTM(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim, output_dim, dropout_rate):
        super(SentimentBiLSTM, self).__init__()
        # Load pre-trained embeddings
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        self.lstm = nn.LSTM(embedding_matrix.size(1), hidden_dim, num_layers=1, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout_rate)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        out = torch.cat((lstm_out[:, -1, :self.lstm.hidden_size], lstm_out[:, 0, self.lstm.hidden_size:]), dim=1)
        out = self.dropout(out)
        return self.fc(out)

### Training, evaluation functions

training function

In [37]:
def train(model, iterator, optimizer, criterion):
    model.train()
    epoch_loss = 0
    for X_batch, y_batch in iterator:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)  # Move to GPU
        optimizer.zero_grad()
        output = model(X_batch).squeeze(1)
        loss = criterion(output, y_batch.float())
        loss.backward()
        # for param in model.parameters():
        #     if param.grad is not None:  # Ensure the gradient is not None
        #         print(f"Gradient norm for {param.shape}: {param.grad.data.norm()}")
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(iterator)


evaluation function

In [38]:
def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for X_batch, y_batch in iterator:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)  # Move to GPU
            output = model(X_batch).squeeze(1)
            
            probs = model.sigmoid(output)
            prediction = (probs >= 0.5).float()
            
            loss = criterion(output, y_batch.float())

            epoch_loss += loss.item()
            all_preds.extend(prediction.tolist())
            all_labels.extend(y_batch.tolist())
    accuracy = accuracy_score(all_labels, all_preds)
    return accuracy, epoch_loss / len(iterator)


train and validate function

In [None]:
def train_and_validate(num_epochs, model, train_iterator, valid_iterator, optimizer, criterion, scheduler):
    for epoch in range(num_epochs):
        train_loss = train(model, train_iterator, optimizer, criterion)
        accuracy , valid_loss = evaluate(model, valid_iterator, criterion)
        scheduler.step(valid_loss)
        
        if (epoch==0):
                best_acc = accuracy
                epochs_without_improvement = 0
        print(f'Epoch {epoch + 1}: Train Loss = {train_loss:.3f}, Accuracy = {accuracy:.3f}, Val Loss = {valid_loss:.3f} Learning Rate: {scheduler.optimizer.param_groups[0]["lr"]:.6f}')

        if accuracy > best_acc:
            best_acc = accuracy
            epochs_without_improvement = 0  # Reset counter
        else:
            epochs_without_improvement += 1
            
        # Check for convergence
        if epochs_without_improvement >= 6:  # Convergence condition (no improvement for 4 epochs)
            print("Convergence reached, stopping training.")
            break

    return accuracy

create model and run the train loop

In [40]:

# embedding_matrix : dict[ str , np.ndarray]= np.load("embedding_matrix.npy",allow_pickle='TRUE').item()
# embedding_matrix_values = np.array(list(embedding_matrix.values()), dtype=np.float32)
# embedding_matrix_tensor = torch.tensor(embedding_matrix_values, dtype=torch.float32)

# Create dataset instances
train_dataset = SentimentDataset(pre_tokenized_train_texts, train_dataset['label'], word_to_index, embedding_matrix)
valid_dataset = SentimentDataset(pre_tokenized_validation_texts, validation_dataset['label'], word_to_index, embedding_matrix)
test_dataset = SentimentDataset(pre_tokenized_test_texts, test_dataset['label'], word_to_index, embedding_matrix)
# Create data loaders
train_iterator = DataLoader(train_dataset, batch_size=32, shuffle=True)
valid_iterator = DataLoader(valid_dataset, batch_size=32, shuffle=False)
test_iterator = DataLoader(test_dataset, batch_size=32, shuffle=False)


In [41]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

In [51]:
hidden_dim = 128  # Adjust as needed
output_dim = 1  # Binary sentiment classification
model = SentimentBiLSTM(embedding_matrix_tensor, hidden_dim, output_dim, dropout_rate=0.0).to(device)

In [52]:
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=4)
# Now you can run your training loop
train_and_validate(25, model, train_iterator, valid_iterator, optimizer, criterion, scheduler)

Epoch 1: Train Loss = 0.559, Accuracy = 0.743, Val Loss = 0.524 Learning Rate: 0.001000
Epoch 2: Train Loss = 0.353, Accuracy = 0.778, Val Loss = 0.475 Learning Rate: 0.001000
Epoch 3: Train Loss = 0.177, Accuracy = 0.767, Val Loss = 0.644 Learning Rate: 0.001000
Epoch 4: Train Loss = 0.071, Accuracy = 0.768, Val Loss = 0.768 Learning Rate: 0.001000
Epoch 5: Train Loss = 0.027, Accuracy = 0.770, Val Loss = 0.916 Learning Rate: 0.001000
Epoch 6: Train Loss = 0.016, Accuracy = 0.772, Val Loss = 0.955 Learning Rate: 0.001000
Epoch 7: Train Loss = 0.006, Accuracy = 0.759, Val Loss = 1.035 Learning Rate: 0.000100
Epoch 8: Train Loss = 0.003, Accuracy = 0.771, Val Loss = 1.185 Learning Rate: 0.000100
Epoch 9: Train Loss = 0.001, Accuracy = 0.773, Val Loss = 1.252 Learning Rate: 0.000100
Epoch 10: Train Loss = 0.001, Accuracy = 0.772, Val Loss = 1.313 Learning Rate: 0.000100
Convergence reached, stopping training.


test

## Hyper-parameter Training

In [125]:
import itertools

# Define the hyper-parameter grid
hidden_dims = [64, 128]
learning_rates = [0.001]
dropout_rates = [0.3, 0.5]
batch_sizes = [32, 64]
output_dim = 1 
best_acc = 0
best_hidden_dim = 0
best_dropout_rate = 0
best_bs = 0

# Iterate over all combinations of hyper-parameters
for hidden_dim, lr, dropout_rate, bs in itertools.product(hidden_dims, learning_rates, dropout_rates, batch_sizes):
    print(f'Training with hidden_dim={hidden_dim}, lr={lr}, dropout_rate={dropout_rate}, batch_size={bs}')
    
    model_hyper = SentimentBiLSTM(embedding_matrix_tensor, hidden_dim, output_dim, dropout_rate).to(device)
    criterion_hyper = nn.BCEWithLogitsLoss()
    optimizer_hyper = optim.Adam(model_hyper.parameters(), lr=lr)
    scheduler_hyper = optim.lr_scheduler.ReduceLROnPlateau(optimizer_hyper, mode='min', factor=0.1, patience=4)

    train_iterator_hyper = DataLoader(train_dataset, bs, shuffle=True)
    valid_iterator_hyper = DataLoader(valid_dataset, bs, shuffle=False)

    accuracy = train_and_validate(25, model_hyper, train_iterator_hyper, valid_iterator_hyper, optimizer_hyper, criterion_hyper, scheduler_hyper)

    if accuracy > best_acc:
        best_acc = accuracy
        best_hidden_dim = hidden_dim
        best_dropout_rate = dropout_rate
        best_bs = bs


print(f'Best accuracy: {best_acc:.3f} with hidden_dim={best_hidden_dim}, dropout_rate={best_dropout_rate}, batch_size={best_bs}')

Training with hidden_dim=64, lr=0.001, dropout_rate=0.3, batch_size=32
Epoch 1: Train Loss = 0.581, Accuracy = 0.775, Val Loss = 0.483 Learning Rate: 0.001000
Epoch 2: Train Loss = 0.358, Accuracy = 0.778, Val Loss = 0.494 Learning Rate: 0.001000
Epoch 3: Train Loss = 0.180, Accuracy = 0.771, Val Loss = 0.547 Learning Rate: 0.001000
Epoch 4: Train Loss = 0.075, Accuracy = 0.760, Val Loss = 0.785 Learning Rate: 0.001000
Epoch 5: Train Loss = 0.034, Accuracy = 0.761, Val Loss = 0.958 Learning Rate: 0.001000
Epoch 6: Train Loss = 0.020, Accuracy = 0.762, Val Loss = 1.000 Learning Rate: 0.000100
Epoch 7: Train Loss = 0.007, Accuracy = 0.757, Val Loss = 1.102 Learning Rate: 0.000100
Epoch 8: Train Loss = 0.005, Accuracy = 0.758, Val Loss = 1.154 Learning Rate: 0.000100
Convergence reached, stopping training.
Training with hidden_dim=64, lr=0.001, dropout_rate=0.3, batch_size=64
Epoch 1: Train Loss = 0.605, Accuracy = 0.735, Val Loss = 0.533 Learning Rate: 0.001000
Epoch 2: Train Loss = 0.39

## Training with best parameters : 
hidden_dim=128, 

lr=0.001, 

dropout_rate=0.3, 

batch_size=64

In [74]:
hidden_dim_best = 128
lr_best=0.001
dropout_rate_best=0.3
bs_best=64
output_dim = 1 

model_best = SentimentBiLSTM(embedding_matrix_tensor, hidden_dim_best, output_dim, dropout_rate_best).to(device)
criterion_best = nn.BCEWithLogitsLoss()
optimizer_best = optim.Adam(model_best.parameters(), lr_best)
scheduler_best = optim.lr_scheduler.ReduceLROnPlateau(optimizer_best, mode='min', factor=0.1, patience=4)

train_iterator_best = DataLoader(train_dataset, bs_best, shuffle=True)
valid_iterator_best = DataLoader(valid_dataset, bs_best, shuffle=False)

train_and_validate(25, model_best, train_iterator_best, valid_iterator_best, optimizer_best, criterion_best, scheduler_best)


Epoch 1: Train Loss = 0.577, Accuracy = 0.726, Val Loss = 0.530 Learning Rate: 0.001000
Epoch 2: Train Loss = 0.387, Accuracy = 0.761, Val Loss = 0.477 Learning Rate: 0.001000
Epoch 3: Train Loss = 0.234, Accuracy = 0.768, Val Loss = 0.527 Learning Rate: 0.001000
Epoch 4: Train Loss = 0.122, Accuracy = 0.767, Val Loss = 0.648 Learning Rate: 0.001000
Epoch 5: Train Loss = 0.056, Accuracy = 0.760, Val Loss = 0.948 Learning Rate: 0.001000
Epoch 6: Train Loss = 0.025, Accuracy = 0.762, Val Loss = 0.911 Learning Rate: 0.001000
Epoch 7: Train Loss = 0.017, Accuracy = 0.759, Val Loss = 1.151 Learning Rate: 0.000100
Convergence reached, stopping training.


0.7589118198874296

## Testing

In [47]:
def test(model, iterator, criterion):
    model.eval()
    epoch_loss = 0
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for X_batch, y_batch in iterator:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)  # Move to GPU
            output = model(X_batch).squeeze(1)
            
            probs = model.sigmoid(output)
            prediction = (probs >= 0.5).float()
            
            loss = criterion(output, y_batch.float())

            epoch_loss += loss.item()
            all_preds.extend(prediction.tolist())
            all_labels.extend(y_batch.tolist())
            
    accuracy = accuracy_score(all_labels, all_preds)
    print(f"Accuracy = {accuracy:.3f}")



In [53]:
test(model, test_iterator, criterion)

Accuracy = 0.788


In [64]:
test(model, test_iterator, criterion)

Accuracy = 0.788


In [76]:
test(model_best, test_iterator, criterion_best)

Accuracy = 0.797


## BiLSTM with an Attention Layer

In [86]:
import torch.nn.functional as F

class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super(Attention, self).__init__()
        self.hidden_dim = hidden_dim
        self.attn = nn.Linear(hidden_dim, hidden_dim)

    def forward(self, hidden, encoder_outputs):
        """
        hidden: [batch_size, hidden_dim] (last hidden state of the decoder or a step in the sequence)
        encoder_outputs: [batch_size, seq_len, hidden_dim] (outputs from the encoder)
        """
        # Compute the attention weights using the query (hidden state) and keys (encoder outputs)
        attn_weights = torch.matmul(encoder_outputs, hidden.unsqueeze(2)).squeeze(2)
        
        # Apply softmax to get the attention weights
        attn_weights = F.softmax(attn_weights, dim=1)
        
        # Compute the weighted sum of the encoder outputs (values)
        context = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs).squeeze(1)
        
        return context

In [87]:
class BiLSTMWithAttention(nn.Module):
    
    
    def __init__(self, embedding_matrix, hidden_dim, output_dim, dropout_rate):
        super(BiLSTMWithAttention, self).__init__()
        # Load pre-trained embeddings
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        self.lstm = nn.LSTM(embedding_matrix.size(1), hidden_dim, num_layers=1, bidirectional=True, batch_first=True)
        self.attn = Attention(hidden_dim * 2)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout_rate)
        self.sigmoid = nn.Sigmoid()

        
    def forward(self, x):
        # Embed the input
        embedded = self.embedding(x) #[B, S, E]
        
        # Get the outputs and hidden states from the LSTM
        lstm_out,_= self.lstm(embedded)  # lstm_out: [batch_size, seq_len, hidden_dim * 2]
        
        # Use the last hidden state of the LSTM as the query for the attention mechanism
        last_hidden = lstm_out[:, -1, :] #[B, 2 * H]
        
        # Apply attention to the LSTM outputs
        context = self.attn(last_hidden, lstm_out) #[B, 2 * H], [B, S]
        
        # Pass the context vector through a fully connected layer
        output = self.fc(context) #[B, output_dim]
        
        return output


In [118]:
hidden_dim_attn = 128
lr_attn=0.001
dropout_rate_attn=0.3
bs_attn=64
output_dim = 1 


model_attn= BiLSTMWithAttention(embedding_matrix_tensor, hidden_dim_attn, output_dim, dropout_rate_attn ).to(device)
criterion_attn = nn.BCEWithLogitsLoss()
optimizer_attn = torch.optim.Adam(model_attn.parameters(), lr=1e-3)
scheduler_attn = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer_attn, mode='min', factor=0.1, patience=4)

train_iterator_attn = DataLoader(train_dataset, bs_attn, shuffle=True)
valid_iterator_attn = DataLoader(valid_dataset, bs_attn, shuffle=False)

In [119]:
train_and_validate(2, model_attn, train_iterator_attn,valid_iterator_attn, optimizer_attn, criterion_attn, scheduler_attn)

Epoch 1: Train Loss = 0.561, Accuracy = 0.768, Val Loss = 0.496 Learning Rate: 0.001000
Epoch 2: Train Loss = 0.377, Accuracy = 0.730, Val Loss = 0.542 Learning Rate: 0.001000


0.7298311444652908

In [120]:
test(model_attn, test_iterator, criterion_attn)

Accuracy = 0.728


In [96]:
train_and_validate(2, model_attn, train_iterator_attn,valid_iterator_attn, optimizer_attn, criterion_attn, scheduler_attn)
test(model_attn, test_iterator, criterion_attn)

Epoch 1: Train Loss = 0.009, Accuracy = 0.765, Val Loss = 1.267 Learning Rate: 0.000010
Epoch 2: Train Loss = 0.008, Accuracy = 0.768, Val Loss = 1.277 Learning Rate: 0.000010
Accuracy = 0.784


In [126]:
import itertools

# Define the hyper-parameter grid
hidden_dims = [64, 128]
learning_rates = [0.001]
dropout_rates = [0.3, 0.5]
batch_sizes = [32, 64]
output_dim = 1 
best_acc_attn = 0
best_hidden_dim_attn = 0
best_dropout_rate_attn = 0
best_bs_attn = 0

# Iterate over all combinations of hyper-parameters
for hidden_dim, lr, dropout_rate, bs in itertools.product(hidden_dims, learning_rates, dropout_rates, batch_sizes):
    print(f'Training with hidden_dim={hidden_dim}, lr={lr}, dropout_rate={dropout_rate}, batch_size={bs}')
    
    model_attn_hyper = SentimentBiLSTM(embedding_matrix_tensor, hidden_dim, output_dim, dropout_rate).to(device)
    criterion_attn_hyper = nn.BCEWithLogitsLoss()
    optimizer_attn_hyper = optim.Adam(model_attn_hyper.parameters(), lr=0.001)
    scheduler_attn_hyper = optim.lr_scheduler.ReduceLROnPlateau(optimizer_attn_hyper, mode='min', factor=0.1, patience=4)

    train_iterator_attn_hyper = DataLoader(train_dataset, bs, shuffle=True)
    valid_iterator_attn_hyper = DataLoader(valid_dataset, bs, shuffle=False)

    accuracy = train_and_validate(5, model_attn_hyper, train_iterator_attn_hyper, valid_iterator_attn_hyper, optimizer_attn_hyper, criterion_attn_hyper, scheduler_attn_hyper)

    if accuracy > best_acc_attn:
        best_acc_attn = accuracy
        best_hidden_dim_attn = hidden_dim
        best_dropout_rate_attn = dropout_rate
        best_bs_attn = bs


print(f'Best accuracy: {best_acc_attn:.3f} with hidden_dim={best_hidden_dim_attn}, dropout_rate={best_dropout_rate_attn}, batch_size={best_bs_attn}')

Training with hidden_dim=64, lr=0.001, dropout_rate=0.3, batch_size=32
Epoch 1: Train Loss = 0.592, Accuracy = 0.754, Val Loss = 0.508 Learning Rate: 0.001000
Epoch 2: Train Loss = 0.366, Accuracy = 0.765, Val Loss = 0.500 Learning Rate: 0.001000
Epoch 3: Train Loss = 0.196, Accuracy = 0.772, Val Loss = 0.542 Learning Rate: 0.001000
Epoch 4: Train Loss = 0.086, Accuracy = 0.748, Val Loss = 0.749 Learning Rate: 0.001000
Epoch 5: Train Loss = 0.037, Accuracy = 0.752, Val Loss = 1.038 Learning Rate: 0.001000
Training with hidden_dim=64, lr=0.001, dropout_rate=0.3, batch_size=64
Epoch 1: Train Loss = 0.595, Accuracy = 0.740, Val Loss = 0.512 Learning Rate: 0.001000
Epoch 2: Train Loss = 0.384, Accuracy = 0.758, Val Loss = 0.493 Learning Rate: 0.001000
Epoch 3: Train Loss = 0.231, Accuracy = 0.755, Val Loss = 0.535 Learning Rate: 0.001000
Epoch 4: Train Loss = 0.127, Accuracy = 0.752, Val Loss = 0.689 Learning Rate: 0.001000
Epoch 5: Train Loss = 0.058, Accuracy = 0.750, Val Loss = 0.796 Le

In [127]:
hidden_dim_attn_best = 128
lr_attn=0.001
dropout_rate_attn_best=0.3
bs_attn_best=32
output_dim = 1 


model_attn_best= BiLSTMWithAttention(embedding_matrix_tensor, hidden_dim_attn_best, output_dim, dropout_rate_attn_best ).to(device)
criterion_attn_best = nn.BCEWithLogitsLoss()
optimizer_attn_best = torch.optim.Adam(model_attn_best.parameters(), lr=1e-3)
scheduler_attn_best = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer_attn_best, mode='min', factor=0.1, patience=4)

train_iterator_attn_best = DataLoader(train_dataset, bs_attn_best, shuffle=True)
valid_iterator_attn_best = DataLoader(valid_dataset, bs_attn_best, shuffle=False)

In [128]:
train_and_validate(5, model_attn_best, train_iterator_attn_best, valid_iterator_attn_best, optimizer_attn_best, criterion_attn_best, scheduler_attn_best)

Epoch 1: Train Loss = 0.559, Accuracy = 0.762, Val Loss = 0.479 Learning Rate: 0.001000
Epoch 2: Train Loss = 0.346, Accuracy = 0.787, Val Loss = 0.463 Learning Rate: 0.001000
Epoch 3: Train Loss = 0.191, Accuracy = 0.782, Val Loss = 0.586 Learning Rate: 0.001000
Epoch 4: Train Loss = 0.097, Accuracy = 0.778, Val Loss = 0.685 Learning Rate: 0.001000
Epoch 5: Train Loss = 0.050, Accuracy = 0.783, Val Loss = 1.036 Learning Rate: 0.001000


0.7833020637898687

In [129]:
test(model_attn_best, test_iterator, criterion_attn_best)

Accuracy = 0.791
