In [14]:
%pip install portalocker>=2.0.0 torchdata torchtext

Note: you may need to restart the kernel to use updated packages.


In [15]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter, OrderedDict
import numpy as np
from torchtext.datasets import IMDB
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset
from torch.utils.data import DataLoader
from torchtext.vocab import vocab


AttributeError: partially initialized module 'torchvision' has no attribute 'extension' (most likely due to a circular import)

In [None]:
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cpu


In [None]:
# Define the LSTMModel class, which inherits from nn.Module
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class, hidden_size=100):
        super().__init__()  # Initialize the parent class (nn.Module)

        # Define the embedding layer, which converts input tokens into dense vectors
        self.embedding = nn.Embedding(vocab_size, embed_dim, sparse=False)

        # Define a bidirectional LSTM layer for processing the embedded sequences
        self.lstm = nn.LSTM(
            input_size=embed_dim,  # Size of the input (embedding dimension)
            hidden_size=hidden_size,  # Number of units in each LSTM cell
            num_layers=1,  # Use a single LSTM layer
            batch_first=True,  # Input and output tensors are provided as (batch_size, seq_len, features)
            bidirectional=True  # Make the LSTM bidirectional
        )

        # Define a fully connected layer for classification
        # Since the LSTM is bidirectional, the hidden_size is doubled
        self.fc = nn.Linear(in_features=hidden_size * 2, out_features=num_class)

        # Initialize the weights of the model
        self.init_weights()

    # Method to initialize weights of the embedding and fully connected layers
    def init_weights(self):
        initrange = 0.5  # Define the range for uniform weight initialization
        self.embedding.weight.data.uniform_(-initrange, initrange)  # Initialize embedding weights
        self.fc.weight.data.uniform_(-initrange, initrange)  # Initialize fully connected layer weights
        self.fc.bias.data.zero_()  # Initialize the bias of the fully connected layer to zero

    # Define the forward pass of the model
    def forward(self, text):
        # Convert input text (token indices) into dense embeddings
        embedded = self.embedding(text)

        # Pass the embeddings through the bidirectional LSTM layer
        lstm_out, (hn, cn) = self.lstm(embedded)

        # Reshape the LSTM output to flatten the sequence dimension for the fully connected layer
        # Since the LSTM is bidirectional, the last dimension is doubled
        lstm_out = torch.cat((hn[-2,:,:], hn[-1,:,:]), dim=1)

        # Pass the flattened output through the fully connected layer to get the final predictions
        result = self.fc(lstm_out)

        return result


In [None]:
def tokenize(text):
    return text.lower().split()

train_iter, test_iter = IMDB()

counter = Counter()
# Turn every single review into a sequence of words, that can then be added to a counter.
for label, text in train_iter:
    tokenized_text = tokenize(text)
    counter.update(tokenized_text)

min_freq = 10
max_words = 100000
#Adjust the amount of tokens to the maximal defined amount, ranking them according to their occurences in the text
ordered_counter = OrderedDict(counter.most_common(max_words))
specials = ["<unk>"]

# Build a vocabulary from the tokenized data, with a maximum of `max_tokens` tokens, and set never seen before words to the unknown index.
vocab = vocab(ordered_dict=ordered_counter, min_freq=min_freq, specials=specials)
vocab.set_default_index(vocab[specials[0]])

# Function to process and collate a batch of data
def collate_batch(batch):
    label_list, text_list = [], []
    for _label, _text in batch:
        if _label == 1: # This version of the IMDB dataset har 1 for a negative review and 2 for a positive, adjust to make 0 indexed.
            label = 0
        elif _label == 2:
            label = 1
        else:
            raise ValueError("Not a correct review.") # Check if out schema is correct, catches potential edge cases if wrong IMDB dataset is used.
        label_list.append(label)
        processed_text = vocab(tokenize(_text))
        if len(processed_text) > 200: #This limits every review to have a maximal length of 200 words, also pads short reviews with zeros for consistent length
          processed_text = processed_text[:200]
        else:
          for i in range(200 - len(processed_text)):
            processed_text.append(0)
        text_list.append(processed_text)
    label_list = torch.tensor(label_list, dtype=torch.int64) #Tensor convertions to make dataset work with pytorch models
    text_list = torch.tensor(text_list,dtype=torch.int64)

    return label_list.to(device), text_list.to(device)

# Load the IMDB dataset as iterators
train_iter, test_iter = IMDB()

# Convert iterators to map-style datasets for DataLoader compatibility
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)

# Create DataLoaders for training and testing with batch processing and custom collate function
train_dataloader = DataLoader(
    train_dataset, batch_size=64, shuffle=True, collate_fn=collate_batch
)

test_dataloader = DataLoader(
    test_dataset, batch_size=64, shuffle=False, collate_fn=collate_batch
)

# Determine the number of classes from the training dataset (e.g., positive/negative reviews)
num_class = len(set([label for (label, text) in train_iter]))  # Should be 2 for binary classification

# Calculate the size of the vocabulary based on the tokens identified earlier
vocab_size = len(vocab)  # Example size: 100683 tokens

# Define the size of the embedding vectors and initialize the RNN model
emsize = 256  # Embedding size for each token
model = LSTMModel(vocab_size, emsize, num_class, hidden_size=100).to(device)  # Move the model to the device (e.g., GPU)

NameError: name 'IMDB' is not defined

In [None]:
import time

# Function to train the model using the given dataloader
def train(dataloader):
    model.train()
    total_acc, total_count = 0,0
    total_loss = 0
    log_interval = 100
    start_time = time.time()

    for idx, (label, text) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted_label = model(text)
        loss = criterion(predicted_label, label)
        total_loss += loss.item() * label.shape[0]
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1) #Clip gradients to protect us from exploding gradients.
        optimizer.step()
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)

    return total_acc / total_count, total_loss / total_count

# Function to evaluate the model's performance on a validation or test dataset
def evaluate(dataloader):
    model.eval()
    total_acc, total_count = 0, 0
    total_loss = 0

    with torch.no_grad():
        for idx, (label, text) in enumerate(dataloader):
            predicted_label = model(text)
            loss = criterion(predicted_label, label)
            total_loss += loss.item() * label.shape[0]
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc / total_count, total_loss / total_count

In [None]:
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset

# Hyperparameters
EPOCHS = 30  # Number of epochs to train the model
LR = 0.001  # Learning rate for the optimizer
BATCH_SIZE = 64  # Number of samples per batch for training

# Define the loss function (CrossEntropyLoss) for classification tasks
criterion = torch.nn.CrossEntropyLoss()

# Define the optimizer (Stochastic Gradient Descent) with the specified learning rate
optimizer = optim.Adam(model.parameters(), lr=LR)

# Define a learning rate scheduler that decays the learning rate by a factor of 0.1 every epoch
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

# Variable to track the best validation accuracy
total_accu = None

# Load the IMDB dataset and convert iterators to map-style datasets
train_iter, test_iter = IMDB()
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)

# Create DataLoaders for training, validation, and testing
train_dataloader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch
)
test_dataloader = DataLoader(
    test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch
)

# Training loop over the specified number of epochs
for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()  # Record the start time of the epoch

    # Train the model for one epoch and get the training accuracy and loss
    accu_train, loss_train = train(train_dataloader)

    # Evaluate the model on the validation set and get the validation accuracy and loss
    accu_test, loss_test = evaluate(test_dataloader)

    # Adjust the learning rate if the validation accuracy does not improve
    if total_accu is not None and total_accu > accu_test:
        old_LR = scheduler.optimizer.param_groups[0]['lr']
        scheduler.step()  # Decay the learning rate
        new_LR = scheduler.optimizer.param_groups[0]['lr']
        print(f"Adjusting learning rate from {old_LR} --> {new_LR}")
    else:
        total_accu = accu_test  # Update the best validation accuracy

    # Print the results for the current epoch
    print("-" * 58)
    print(
        "| end of epoch {:3d} | train loss {:6.3f} | "
        "test loss {:6.3f} ".format(
            epoch, loss_train, loss_test
        )
    )
    print(
        "| time: {:9.2f}s | train acc. {:6.3f} | "
        "test acc. {:6.3f} ".format(
            time.time() - epoch_start_time, accu_train, accu_test
        )
    )
    print("-" * 58)


----------------------------------------------------------
| end of epoch   1 | train loss  0.596 | test loss  0.490 
| time:      9.55s | train acc.  0.682 | test acc.  0.786 
----------------------------------------------------------
----------------------------------------------------------
| end of epoch   2 | train loss  0.385 | test loss  0.449 
| time:      9.50s | train acc.  0.841 | test acc.  0.805 
----------------------------------------------------------
----------------------------------------------------------
| end of epoch   3 | train loss  0.259 | test loss  0.447 
| time:      9.58s | train acc.  0.903 | test acc.  0.817 
----------------------------------------------------------
Adjusting learning rate from 0.001 --> 0.0009000000000000001
----------------------------------------------------------
| end of epoch   4 | train loss  0.173 | test loss  0.525 
| time:      9.58s | train acc.  0.940 | test acc.  0.810 
------------------------------------------------------