## Install and import dependencies

In [1]:
# %pip install torch gensim datasets nltk

In [1]:
import os
import nltk
nltk.download("all")


import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import gensim.downloader as api

from datasets import load_dataset
from gensim.models import KeyedVectors
from nltk.tokenize import word_tokenize
from torch.utils.data import Dataset, DataLoader


[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /Users/mythilimulani/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /Users/mythilimulani/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /Users/mythilimulani/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /Users/mythilimulani/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /Users/mythilimulani/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagg

## Part 0. Dataset Preparation

In [2]:
from datasets import load_dataset
dataset = load_dataset("rotten_tomatoes")
train_dataset = dataset['train'] 
validation_dataset = dataset['validation']
test_dataset = dataset['test']

In [5]:
#tokenize sentences 
train_tokenized = []
for sentence in train_dataset['text']:
    train_tokenized.append(word_tokenize(sentence.lower()))

print('sample sentence:', train_tokenized[0],'\n')

#build vocabulary
vocab = {"<PAD>", "<UNK>"} #include a padding and unknown token for future processing
vocab.update(word for sentence in train_tokenized for word in sentence)

print("Number of words in the vocabulary(including padding and unknown tokens):", len(vocab))
print("Number of words in the vocabulary:" , len(vocab)-2)


sample sentence: ['the', 'rock', 'is', 'destined', 'to', 'be', 'the', '21st', 'century', "'s", 'new', '``', 'conan', '``', 'and', 'that', 'he', "'s", 'going', 'to', 'make', 'a', 'splash', 'even', 'greater', 'than', 'arnold', 'schwarzenegger', ',', 'jean-claud', 'van', 'damme', 'or', 'steven', 'segal', '.'] 

Number of words in the vocabulary(including padding and unknown tokens): 18031
Number of words in the vocabulary: 18029


## Part 3 
# 3.Keeping the above two adjustments, replace your simple RNN model in Part 2 wioth a biLSTM model and biGRU model.

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
from nltk.tokenize import word_tokenize
from sklearn.metrics import accuracy_score

In [7]:
embedding_matrix_33=np.load("embedding_matrix.npy",allow_pickle='TRUE').item()
# display(embedding_matrix_33)

preparing the train dataset. text -> word index

In [8]:
class SentimentDataset_33(Dataset):
    def __init__(self, texts : list[str], labels : list[int], vocab : set, embedding_matrix : dict, max_len=30):
        self.texts : list[str] = texts
        self.labels : list[int] = labels
        self.vocab : dict = self.build_vocab_dict(vocab)  # function to build vocabulary
        self.embedding_matrix : dict = embedding_matrix
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        tokenized_text = word_tokenize(text.lower())
        vectorized_text = self.vectorize(tokenized_text)
        return torch.tensor(vectorized_text, dtype=torch.long), torch.tensor(label, dtype=torch.float)

    def vectorize(self, tokens):
        # Convert tokens to their corresponding index in the vocabulary
        vectorized = [self.vocab.get(token, self.vocab['<UNK>']) for token in tokens]

        # Pad or truncate to max_len
        if len(vectorized) < self.max_len:
            vectorized += [self.vocab['<PAD>']] * (self.max_len - len(vectorized))
        else:
            vectorized = vectorized[:self.max_len]
        return vectorized

    def build_vocab_dict(self, vocab : set):
        if "<PAD>" in vocab : vocab.remove("<PAD>")
        if "<UNK>" in vocab : vocab.remove("<UNK>")
        vocab_dict = {word: idx for idx, word in enumerate(vocab)}
        vocab_dict['<PAD>'] = len(vocab_dict) # Add padding token
        vocab_dict['<UNK>'] = len(vocab_dict) # Add unknown token
        return vocab_dict


start making the rnn

In [9]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

In [10]:
# Converting the Vocab set to dictionary 
def build_vocab_dict(vocab_set):
    # Create the vocabulary dictionary without <PAD> and <UNK>
    vocab_set.discard("<PAD>")
    vocab_set.discard("<UNK>")
    vocab_dict = {word: idx for idx, word in enumerate(vocab_set, start=2)}

    # Check for <PAD> and <UNK> existence and assign them fixed indices if they are present
    if "<PAD>" not in vocab_dict:
        vocab_dict["<PAD>"] = 0  # Index for padding token
    if "<UNK>" not in vocab_dict:
        vocab_dict["<UNK>"] = 1  # Index for unknown token
    
    #add the <PAD> and <UNK> back to the vocab
    vocab_set.add("<PAD>")
    vocab_set.add("<UNK>")

# biGRU Model

In [11]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

In [12]:
class SentimentBiGRU(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim, output_dim, dropout_rate):
        super(SentimentBiGRU, self).__init__()
        
        # Using pretrained embeddings
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)  # Freeze if you don't want to update
        self.hidden_dim = hidden_dim
        
        # Bidirectional GRU layer
        self.gru = nn.GRU(embedding_matrix.size(1), hidden_dim, num_layers=1, bidirectional=True, batch_first=True)
        
        # Fully connected output layer
        self.fc = nn.Linear(hidden_dim * 2, output_dim)  # Output size is 2 * hidden_dim for bidirectional
        self.dropout = nn.Dropout(dropout_rate)  # Dropout layer for regularization
        self.sigmoid = nn.Sigmoid()  # Sigmoid activation for binary classification
        

    def forward(self, x):
        # Embedding layer
        embedded = self.embedding(x)  # Shape: [batch_size, sequence_length, embedding_dim]

        # Pass through GRU layer
        gru_out, _ = self.gru(embedded)  # gru_out shape: [batch_size, sequence_length, hidden_dim * 2]
        
        # Concatenate last hidden states from forward and backward GRUs
        out = torch.cat((gru_out[:, -1, :self.hidden_dim], gru_out[:, 0, self.hidden_dim:]), dim=1)  # Shape: [batch_size, hidden_dim * 2]
        
        out = self.dropout(out)  # Apply dropout
        return self.fc(out)  # Final output

In [13]:
train_texts_33 : list[str] = train_dataset['text']  # List of training texts
train_labels_33 : list[int] = train_dataset['label']  # Corresponding labels for training texts
valid_texts_33 : list[str]= validation_dataset['text']  # List of validation texts
valid_labels_33 : list[int] = validation_dataset['label']  # Corresponding labels for validation texts
test_texts_33 : list[str] = test_dataset['text']  # List of test texts
test_labels_33 : list[int] = test_dataset['label']  # Corresponding labels for test texts
vocab_33 : set = vocab  # Your vocabulary list
embedding_matrix_33 : dict[ str , np.ndarray]= np.load("embedding_matrix.npy",allow_pickle='TRUE').item()
embedding_matrix_values = np.array(list(embedding_matrix_33.values()), dtype=np.float32)
embedding_matrix_tensor = torch.tensor(embedding_matrix_values, dtype=torch.float32)
#embedding_layer = nn.Embedding.from_pretrained(embedding_matrix_tensor, freeze=True)


# Create dataset instances
train_dataset_33 : SentimentDataset_33 = SentimentDataset_33(train_texts_33, train_labels_33, vocab_33, embedding_matrix_33)
valid_dataset_33 : SentimentDataset_33 = SentimentDataset_33(valid_texts_33, valid_labels_33, vocab_33, embedding_matrix_33)
test_dataset_33 : SentimentDataset_33 = SentimentDataset_33(test_texts_33, test_labels_33, vocab_33, embedding_matrix_33)

# Create data loaders
train_iterator_33 = DataLoader(train_dataset_33, batch_size=32, shuffle=True)
valid_iterator_33 = DataLoader(valid_dataset_33, batch_size=32, shuffle=False)
test_iterator_33 = DataLoader(test_dataset_33, batch_size=32, shuffle=False)


In [14]:
def train_33(model, iterator, optimizer, criterion):
    model.train()
    epoch_loss = 0
    for X_batch, y_batch in iterator:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)  # Move to GPU
        optimizer.zero_grad()
        output = model(X_batch).squeeze(1)
        loss = criterion(output, y_batch.float())
        loss.backward()
        # for param in model.parameters():
        #     if param.grad is not None:  # Ensure the gradient is not None
        #         print(f"Gradient norm for {param.shape}: {param.grad.data.norm()}")
        optimizer.step()
        # print("model.parameters():",model.parameters())
        epoch_loss += loss.item()
    return epoch_loss / len(iterator)


In [15]:
def evaluate_33(model, iterator, criterion):
    model.eval()
    epoch_loss = 0
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for X_batch, y_batch in iterator:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)  # Move to GPU
            output = model(X_batch).squeeze(1)
            
            probs = model.sigmoid(output)
            prediction = (probs >= 0.5).float()
            
            loss = criterion(output, y_batch.float())
            epoch_loss += loss.item()
            
            all_preds.extend(prediction.tolist())
            all_labels.extend(y_batch.tolist())
    accuracy = accuracy_score(all_labels, all_preds)
    return accuracy, epoch_loss / len(iterator)


In [16]:
def train_and_validate_33(num_epochs, model, train_iterator, valid_iterator, optimizer, criterion, scheduler):
    for epoch in range(num_epochs):
        train_loss = train_33(model, train_iterator, optimizer, criterion)
        accuracy , valid_loss = evaluate_33(model, valid_iterator, criterion)
        scheduler.step(valid_loss)
        if (epoch==0):
                best_val_loss = valid_loss
                epochs_without_improvement = 0
        print(f'Epoch {epoch + 1}: Train Loss = {train_loss:.3f}, Accuracy = {accuracy:.3f}, Val Loss = {valid_loss:.3f} Learning Rate: {scheduler.optimizer.param_groups[0]["lr"]:.6f}')

        if valid_loss < best_val_loss:
            best_val_loss = valid_loss
            epochs_without_improvement = 0  # Reset counter
        else:
            epochs_without_improvement += 1
            
        # Check for convergence
        if epochs_without_improvement >= 8:  # Convergence condition (no improvement for 4 epochs)
            print("Convergence reached, stopping training.")
            break

In [17]:
hidden_dim = 128  # Adjust as needed
output_dim = 1  # Binary sentiment classification
dropout_rate = 0.3  # Adjust as needed

criterion = nn.BCEWithLogitsLoss()
model_33 = SentimentBiGRU(embedding_matrix_tensor, hidden_dim, output_dim, dropout_rate=0.5).to(device)
optimizer = optim.Adam(model_33.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5)

In [18]:
# Now you can run your training loop
train_and_validate_33(30, model_33, train_iterator_33, valid_iterator_33, optimizer, criterion, scheduler)

Epoch 1: Train Loss = 0.625, Accuracy = 0.758, Val Loss = 0.504 Learning Rate: 0.001000
Epoch 2: Train Loss = 0.333, Accuracy = 0.746, Val Loss = 0.552 Learning Rate: 0.001000
Epoch 3: Train Loss = 0.139, Accuracy = 0.745, Val Loss = 0.697 Learning Rate: 0.001000
Epoch 4: Train Loss = 0.061, Accuracy = 0.743, Val Loss = 0.876 Learning Rate: 0.001000
Epoch 5: Train Loss = 0.032, Accuracy = 0.745, Val Loss = 1.049 Learning Rate: 0.001000
Epoch 6: Train Loss = 0.014, Accuracy = 0.731, Val Loss = 1.300 Learning Rate: 0.001000
Epoch 7: Train Loss = 0.005, Accuracy = 0.733, Val Loss = 1.522 Learning Rate: 0.000100
Epoch 8: Train Loss = 0.001, Accuracy = 0.732, Val Loss = 1.596 Learning Rate: 0.000100
Convergence reached, stopping training.


## Hyper-parameter training

In [None]:
# import itertools
# # Define the hyper-parameter grid
# hidden_dims = [64, 128]
# dropout_rates = [0.3, 0.5]
# lr = 0.001
# batch_sizes = [32, 64]
# output_dim = 1 

# # Iterate over all combinations of hyper-parameters
# for hidden_dim, dropout_rate, bs in itertools.product(hidden_dims, dropout_rates, batch_sizes):
#     print(f'Training with hidden_dim={hidden_dim}, dropout_rate={dropout_rate}, batch_size={bs}')
    
#     model_33 = SentimentBiGRU(embedding_matrix_tensor, hidden_dim, output_dim, dropout_rate).to(device)
#     criterion = nn.BCEWithLogitsLoss()
#     optimizer = optim.Adam(model_33.parameters(), lr=lr)
#     scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5)

#     train_iterator_33 = DataLoader(train_dataset_33, bs)
#     valid_iterator_33 = DataLoader(valid_dataset_33, bs)

#     train_and_validate_33(20, model_33, train_iterator_33, valid_iterator_33, optimizer, criterion, scheduler)

Training with hidden_dim=64, dropout_rate=0.3, batch_size=32
Epoch 1: Train Loss = 0.332, Accuracy = 0.500, Val Loss = 2.345 Learning Rate: 0.001000
Epoch 2: Train Loss = 0.597, Accuracy = 0.500, Val Loss = 1.909 Learning Rate: 0.001000
Epoch 3: Train Loss = 0.597, Accuracy = 0.500, Val Loss = 1.736 Learning Rate: 0.001000
Epoch 4: Train Loss = 0.539, Accuracy = 0.500, Val Loss = 1.548 Learning Rate: 0.001000
Epoch 5: Train Loss = 0.443, Accuracy = 0.500, Val Loss = 1.746 Learning Rate: 0.001000
Epoch 6: Train Loss = 0.353, Accuracy = 0.501, Val Loss = 1.649 Learning Rate: 0.001000
Epoch 7: Train Loss = 0.303, Accuracy = 0.544, Val Loss = 1.526 Learning Rate: 0.001000
Epoch 8: Train Loss = 0.235, Accuracy = 0.553, Val Loss = 1.387 Learning Rate: 0.001000
Epoch 9: Train Loss = 0.192, Accuracy = 0.576, Val Loss = 1.331 Learning Rate: 0.001000
Epoch 10: Train Loss = 0.157, Accuracy = 0.584, Val Loss = 1.289 Learning Rate: 0.001000
Epoch 11: Train Loss = 0.126, Accuracy = 0.595, Val Loss =

## Training with best parameters 
hidden_dim=64
dropout_rate=0.3
batch_size=64

In [28]:
hidden_dim = 64
lr=0.001
dropout_rate=0.3
bs=64
output_dim = 1 

model_33 = SentimentBiGRU(embedding_matrix_tensor, hidden_dim, output_dim, dropout_rate).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model_33.parameters(), lr=lr)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=4)

train_iterator_33 = DataLoader(train_dataset_33, bs)
valid_iterator_33 = DataLoader(valid_dataset_33, bs)

train_and_validate_33(25, model_33, train_iterator_33, valid_iterator_33, optimizer, criterion, scheduler)


Epoch 1: Train Loss = 0.753, Accuracy = 0.500, Val Loss = 1.265 Learning Rate: 0.001000
Epoch 2: Train Loss = 0.878, Accuracy = 0.499, Val Loss = 0.695 Learning Rate: 0.001000
Epoch 3: Train Loss = 0.700, Accuracy = 0.542, Val Loss = 0.685 Learning Rate: 0.001000
Epoch 4: Train Loss = 0.642, Accuracy = 0.546, Val Loss = 0.759 Learning Rate: 0.001000
Epoch 5: Train Loss = 0.550, Accuracy = 0.568, Val Loss = 0.889 Learning Rate: 0.001000
Epoch 6: Train Loss = 0.426, Accuracy = 0.596, Val Loss = 0.902 Learning Rate: 0.001000
Epoch 7: Train Loss = 0.354, Accuracy = 0.614, Val Loss = 0.880 Learning Rate: 0.001000
Epoch 8: Train Loss = 0.270, Accuracy = 0.623, Val Loss = 0.879 Learning Rate: 0.000100
Epoch 9: Train Loss = 0.283, Accuracy = 0.647, Val Loss = 0.775 Learning Rate: 0.000100
Epoch 10: Train Loss = 0.193, Accuracy = 0.666, Val Loss = 0.756 Learning Rate: 0.000100
Epoch 11: Train Loss = 0.156, Accuracy = 0.687, Val Loss = 0.757 Learning Rate: 0.000100
Convergence reached, stopping 

## Testing

In [19]:
# Step 8: Get a sample sentence from the test set and predict
import random
# Select a random index from the test dataset
random_index = random.randint(0, len(test_dataset) - 1)

# Get the corresponding sentence and its label from the test dataset
sample_sentence = test_dataset[random_index]['text']  # Assuming the dataset contains a 'text' field
true_label = test_dataset[random_index]['label']  # Assuming there's a label field
# Tokenize the sample sentence
sample_tokens = word_tokenize(sample_sentence.lower())

vocab_33 = build_vocab_dict(vocab)
# Convert tokens to indices
sample_indices = []
for token in sample_tokens:
    if token in vocab:
        sample_indices.append(vocab_33[token])
    else:
        sample_indices.append(vocab_33['<UNK>'])

sample_tensor = torch.tensor(sample_indices).unsqueeze(0)  # Add batch dimension
sample_tensor = sample_tensor.to(device)  # Move to GPU if available
# Make prediction using the model
model_33.eval()  # Set the model to evaluation mode
with torch.no_grad():  # No need to compute gradients during inference
    output = model_33(sample_tensor)  # Pass the tensor to the model
    print(output)
    probs = model_33.sigmoid(output)
    predicted = (probs >= 0.5)
    print(predicted.item())

# Map predicted index to sentiment label
sentiment_labels = ['negative', 'positive']  # Adjust according to your label encoding
predicted_label = sentiment_labels[predicted]

# Print results
print(f"Sample Sentence: '{sample_sentence}'")
print(f"True Label: {sentiment_labels[true_label]}")
print(f"Predicted Label: {predicted_label}")

TypeError: 'NoneType' object is not subscriptable

In [20]:
def test_33(model, iterator, criterion):
    model.eval()
    epoch_loss = 0
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for X_batch, y_batch in iterator:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)  # Move to GPU
            output = model(X_batch).squeeze(1)
            
            probs = model.sigmoid(output)
            prediction = (probs >= 0.5).float()
            
            loss = criterion(output, y_batch.float())

            epoch_loss += loss.item()
            all_preds.extend(prediction.tolist())
            all_labels.extend(y_batch.tolist())
            
    accuracy = accuracy_score(all_labels, all_preds)
    print(f"Accuracy = {accuracy:.3f}")

In [21]:
test_33(model_33, test_iterator_33, criterion)

Accuracy = 0.750


## BiGRU with an Attention Layer

In [22]:
import torch.nn.functional as F

class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super(Attention, self).__init__()
        self.hidden_dim = hidden_dim
        self.attn = nn.Linear(hidden_dim, hidden_dim)

    def forward(self, hidden, encoder_outputs):
        """
        hidden: [batch_size, hidden_dim] (last hidden state of the decoder or a step in the sequence)
        encoder_outputs: [batch_size, seq_len, hidden_dim] (outputs from the encoder)
        """
        # Compute the attention weights using the query (hidden state) and keys (encoder outputs)
        attn_weights = torch.matmul(encoder_outputs, hidden.unsqueeze(2)).squeeze(2) # [batch_size, seq_len]
        
        # Apply softmax to get the attention weights
        attn_weights = F.softmax(attn_weights, dim=1)
        
        # Compute the weighted sum of the encoder outputs (values)
        context = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs).squeeze(1) # [batch_size, hidden_dim]
        
        return context, attn_weights

In [23]:

class BiGRUWithAttention(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim, output_dim, dropout_rate):
        super(BiGRUWithAttention, self).__init__()
        # Load pre-trained embeddings
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        
        # Define the GRU layer (bidirectional)
        self.gru = nn.GRU(embedding_matrix.size(1), hidden_dim, num_layers=1, bidirectional=True, batch_first=True)
        
        # Attention layer
        self.attn = Attention(hidden_dim * 2)
        
        # Fully connected layer
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        
        # Dropout layer
        self.dropout = nn.Dropout(dropout_rate)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # Embed the input
        embedded = self.embedding(x) # [batch_size, seq_len, embedding_dim]
        
        # Get the outputs and hidden states from the GRU
        gru_out, _ = self.gru(embedded) # gru_out: [batch_size, seq_len, hidden_dim * 2]
        
        # Use the last hidden state of the GRU as the query for the attention mechanism
        last_hidden = gru_out[:, -1, :] # [batch_size, 2 * hidden_dim]
        
        # Apply attention to the GRU outputs
        context, attn_weights = self.attn(last_hidden, gru_out) # context: [batch_size, 2 * hidden_dim]
        
        # Pass the context vector through a fully connected layer
        output = self.fc(context) # [batch_size, output_dim]
        
        return output

In [24]:
hidden_dim_attn = 128
lr_attn=0.001
dropout_rate_attn=0.3
bs_attn=64
output_dim_attn = 1 


model_attn= BiGRUWithAttention(embedding_matrix_tensor, hidden_dim_attn, output_dim_attn, dropout_rate_attn).to(device)
criterion_attn = nn.BCEWithLogitsLoss()
optimizer_attn = torch.optim.Adam(model_attn.parameters(), lr=1e-3)
scheduler_attn = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer_attn, mode='min', factor=0.1, patience=4)

In [25]:
train_and_validate_33(20, model_attn, train_iterator_33,valid_iterator_33, optimizer_attn, criterion_attn, scheduler_attn)

Epoch 1: Train Loss = 0.626, Accuracy = 0.732, Val Loss = 0.543 Learning Rate: 0.001000
Epoch 2: Train Loss = 0.378, Accuracy = 0.723, Val Loss = 0.556 Learning Rate: 0.001000
Epoch 3: Train Loss = 0.220, Accuracy = 0.735, Val Loss = 0.615 Learning Rate: 0.001000
Epoch 4: Train Loss = 0.125, Accuracy = 0.736, Val Loss = 0.717 Learning Rate: 0.001000
Epoch 5: Train Loss = 0.073, Accuracy = 0.748, Val Loss = 0.739 Learning Rate: 0.001000
Epoch 6: Train Loss = 0.053, Accuracy = 0.744, Val Loss = 0.981 Learning Rate: 0.000100
Epoch 7: Train Loss = 0.030, Accuracy = 0.746, Val Loss = 1.049 Learning Rate: 0.000100
Epoch 8: Train Loss = 0.025, Accuracy = 0.743, Val Loss = 1.077 Learning Rate: 0.000100
Convergence reached, stopping training.


In [26]:
test_33(model_attn, test_iterator_33, criterion_attn)

Accuracy = 0.770
