# Lab sheet - 9 (04 - 05 APR 2025)

# Setup and Import Dependencies
First, we will import the necessary libraries and modules.

In [None]:
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import matplotlib.pyplot as plt

# Load the MNIST Dataset
We will use torchvision to load the MNIST dataset. `ToTensor` converts each image to a tensor and scales its pixel values from [0, 255] to [0, 1] (i.e. it divides by 255); `Normalize` with mean 0.5 and standard deviation 0.5 then rescales the values to [-1, 1].

In [None]:
# Preprocessing applied to every image as it is loaded:
#   ToTensor  -> converts the image to a float tensor with values in [0, 1]
#   Normalize -> (x - 0.5) / 0.5, which rescales [0, 1] to [-1, 1]
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5,), std=(0.5,)),
])

# MNIST train / test splits (downloaded into ./data when not already there)
trainset = datasets.MNIST(
    root='./data', train=True, download=True, transform=transform
)
testset = datasets.MNIST(
    root='./data', train=False, download=True, transform=transform
)

# Mini-batch iterators: training batches are shuffled, test order is fixed
trainloader = DataLoader(dataset=trainset, batch_size=64, shuffle=True)
testloader = DataLoader(dataset=testset, batch_size=64, shuffle=False)



In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Pull a single mini-batch from the training loader.
# Note: use the built-in next(); calling dataiter.next() no longer works.
dataiter = iter(trainloader)
images, labels = next(dataiter)
images = images.numpy()

# Select the sample at index 9 and drop any size-1 axes before plotting
img = images[9].squeeze()

# One 5x5-inch figure holding a single grayscale plot
fig, ax = plt.subplots(figsize=(5, 5))
ax.imshow(img, cmap='gray')

# Define the RNN Model
We will define a simple RNN architecture for digit classification. The RNN will take 28 rows of the 28x28 image as sequential input (each row treated as a time-step).

In [None]:
class RNN_Model(nn.Module):
    """Single-layer vanilla RNN followed by a linear classification head.

    Args:
        input_size: number of features fed to the RNN at each time-step.
        hidden_size: dimensionality of the recurrent hidden state.
        num_classes: number of logits produced by the linear head.
    """

    def __init__(self, input_size=28, hidden_size=128, num_classes=10):
        super().__init__()
        # batch_first=True: tensors are laid out as (batch, seq_len, features)
        self.rnn = nn.RNN(
            input_size=input_size,
            hidden_size=hidden_size,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def forward(self, x):
        """Map x of shape (batch, seq_len, input_size) to (batch, num_classes) logits."""
        hidden_seq, _final_hidden = self.rnn(x)
        # Classify from the RNN output at the final time-step only
        last_step = hidden_seq[:, -1, :]
        return self.fc(last_step)


# LSTM
A common variant on the vanilla RNN is the Long Short-Term Memory (LSTM) RNN. Vanilla RNNs can be tough to train on long sequences due to vanishing and exploding gradients caused by repeated matrix multiplication. LSTMs mitigate this problem by replacing the simple update rule of the vanilla RNN with a gating mechanism as follows.

Similar to the vanilla RNN, at each timestep we receive an input $x_t\in\mathbb{R}^D$ and the previous hidden state $h_{t-1}\in\mathbb{R}^H$; the LSTM also maintains an $H$-dimensional *cell state*, so we also receive the previous cell state $c_{t-1}\in\mathbb{R}^H$. The learnable parameters of the LSTM are an *input-to-hidden* matrix $W_x\in\mathbb{R}^{4H\times D}$, a *hidden-to-hidden* matrix $W_h\in\mathbb{R}^{4H\times H}$ and a *bias vector* $b\in\mathbb{R}^{4H}$.

At each timestep we first compute an *activation vector* $a\in\mathbb{R}^{4H}$ as $a=W_xx_t + W_hh_{t-1}+b$. We then divide this into four vectors $a_i,a_f,a_o,a_g\in\mathbb{R}^H$ where $a_i$ consists of the first $H$ elements of $a$, $a_f$ is the next $H$ elements of $a$, etc. We then compute the *input gate* $i\in\mathbb{R}^H$, *forget gate* $f\in\mathbb{R}^H$, *output gate* $o\in\mathbb{R}^H$ and *block input* $g\in\mathbb{R}^H$ as

$$
i = \sigma(a_i) \hspace{2pc}
f = \sigma(a_f) \hspace{2pc}
o = \sigma(a_o) \hspace{2pc}
g = \tanh(a_g)
$$

where $\sigma$ is the sigmoid function and $\tanh$ is the hyperbolic tangent, both applied elementwise.

Finally we compute the next cell state $c_t$ and next hidden state $h_t$ as

$$
c_{t} = f\odot c_{t-1} + i\odot g \hspace{4pc}
h_t = o\odot\tanh(c_t)
$$

where $\odot$ is the elementwise product of vectors.

In the rest of the notebook we will use PyTorch's built-in `nn.LSTM` layer, which implements this update rule, and apply it to MNIST digit classification.

In the code, we assume that data is stored in batches so that $X_t \in \mathbb{R}^{N\times D}$, and we work with *transposed* versions of the parameters, $W_x \in \mathbb{R}^{D \times 4H}$ and $W_h \in \mathbb{R}^{H\times 4H}$, so that the activations $A \in \mathbb{R}^{N\times 4H}$ can be computed efficiently as $A = X_t W_x + H_{t-1} W_h + b$ (with $b$ broadcast across the $N$ rows).

# Define the LSTM Model
Now, we will define the LSTM model, which will improve upon the RNN by addressing the vanishing gradient problem.

In [None]:
class LSTM_Model(nn.Module):
    """Stacked LSTM followed by a linear classification head.

    Args:
        input_size: number of features fed to the LSTM at each time-step.
        hidden_size: dimensionality of the hidden and cell states.
        num_layers: number of stacked LSTM layers.
        num_classes: number of logits produced by the linear head.
    """

    def __init__(self, input_size=28, hidden_size=128, num_layers=2, num_classes=10):
        super().__init__()

        # Recurrent encoder; batch_first=True means (batch, seq_len, features)
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )

        # Linear head mapping a hidden state to class logits
        self.fc = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def forward(self, x):
        """Map x of shape (batch, seq_len, input_size) to (batch, num_classes) logits."""
        # Only the final hidden states are needed; the per-step outputs and
        # the final cell states are discarded.
        _, (final_hidden, _) = self.lstm(x)

        # final_hidden[-1] is the last entry along the first axis, i.e. the
        # final hidden state of the top LSTM layer.
        top_layer_state = final_hidden[-1]
        return self.fc(top_layer_state)



# Training and Evaluation Functions
We will write the training and evaluation functions, which can be reused for both models.

In [None]:
def train(model, trainloader, criterion, optimizer, num_epochs=5):
    """Train `model` and print the mean loss and accuracy after every epoch.

    Args:
        model: network called on batches reshaped to (batch, 28, 28).
        trainloader: iterable yielding (inputs, labels) batches. It does not
            need to support len(); batches are counted while iterating.
        criterion: loss function called as criterion(outputs, labels).
        optimizer: optimizer whose step() updates the model parameters.
        num_epochs: number of passes over `trainloader`.

    An epoch that yields no batches is reported with nan loss and accuracy
    instead of raising ZeroDivisionError.
    """
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        correct = 0
        total = 0
        num_batches = 0
        for inputs, labels in trainloader:
            # Reshape to (batch, seq_len, input_size): each of the 28 image
            # rows becomes one time-step of 28 features.
            inputs = inputs.view(-1, 28, 28)
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            num_batches += 1

            # Predicted class = index of the largest logit. detach() keeps
            # this bookkeeping out of autograd (replaces the discouraged
            # `.data` attribute).
            predicted = outputs.detach().argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

        # Guard the averages so an empty epoch does not divide by zero
        avg_loss = running_loss / num_batches if num_batches else float('nan')
        accuracy = 100 * correct / total if total else float('nan')
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {avg_loss:.4f}, Accuracy: {accuracy:.2f}%')


Evaluation Function:

In [None]:
def evaluate(model, testloader):
    """Print the classification accuracy of `model` over `testloader`.

    Args:
        model: network called on batches reshaped to (batch, 28, 28). It is
            switched to eval mode and left in that mode.
        testloader: iterable yielding (inputs, labels) batches.

    If `testloader` yields no samples the accuracy is reported as nan
    instead of raising ZeroDivisionError.
    """
    model.eval()
    correct = 0
    total = 0
    # No gradients are needed for inference
    with torch.no_grad():
        for inputs, labels in testloader:
            # Reshape to (batch, seq_len, input_size) as expected by the models
            inputs = inputs.view(-1, 28, 28)
            # Predicted class = index of the largest logit
            predicted = model(inputs).argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total if total else float('nan')
    print(f'Test Accuracy: {accuracy:.2f}%')


# Model Training and Evaluation
Now, let's train and evaluate both the RNN and LSTM models on the MNIST dataset.

# Training RNN Model:

In [None]:
# Build the vanilla-RNN classifier together with its loss and optimizer
rnn_model = RNN_Model()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=rnn_model.parameters(), lr=1e-3)

# Fit the RNN on the training split for 5 epochs
train(
    model=rnn_model,
    trainloader=trainloader,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=5,
)

# Report the RNN's accuracy on the test split
evaluate(model=rnn_model, testloader=testloader)


# Training LSTM Model:

In [None]:
# Initialize the model, loss function, and optimizer for LSTM.
# The loss is created here as well so this cell runs on its own, without
# relying on `criterion` having been defined by the RNN cell.
lstm_model = LSTM_Model()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(lstm_model.parameters(), lr=0.001)

# Train the LSTM model
train(lstm_model, trainloader, criterion, optimizer, num_epochs=5)

# Evaluate the LSTM model
evaluate(lstm_model, testloader)
