In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [8]:
batch_size = 64

train_dataset = datasets.MNIST(root = "dataset/", train = True, transform=transforms.ToTensor(), download= True)
train_loader = DataLoader(dataset = train_dataset, batch_size= batch_size, shuffle= True)

test_dataset = datasets.MNIST(root = "dataset/", train = False, transform=transforms.ToTensor(), download= True)
test_loader = DataLoader(dataset = test_dataset, batch_size= batch_size, shuffle= True)

# Hyperparameters
mnist : Nx1x28x28 ➡️ 28 sequences with 28 features each

In [9]:
# features
input_size = 28
sequence_length = 28
num_layers = 2
# #nodes in each time step
hidden_size = 256
num_classes = 10
batch_size = 64
num_epochs = 2
learning_rate = 0.001

loss_fn = nn.CrossEntropyLoss()


# RNN (or GRU or LSTM)

In [32]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size * sequence_length, num_classes)

    def forward(self, x):
        # initial hidden state
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
        # initial cell state for LSTM
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
        # _ is the hidden state
        # out, _ = self.rnn(x, h0)
        out, _ = self.lstm(x, (h0, c0))
        # keep the batch and concatenate else
        out = out.reshape(out.shape[0], -1)
        out = self.fc(out)
        return out

model = RNN(input_size, hidden_size, num_layers, num_classes).to(device)
x = torch.randn(batch_size, sequence_length, input_size).to(device)
print(model(x).shape)


before torch.Size([64, 28, 256])
after torch.Size([64, 7168])
torch.Size([64, 10])


# BRNN

In [43]:
class BRNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(BRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        # multiply by 2 for bidirectional
        self.fc = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, x):
        # initial hidden state (multiply by 2 for bidirectional)
        h0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(device)
        # initial cell state for LSTM (multiply by 2 for bidirectional)
        c0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(device)
        # _ is the hidden state
        # out, _ = self.rnn(x, h0)
        out, _ = self.lstm(x, (h0, c0))
        # keep the batch and concatenate else
        # out = out.reshape(out.shape[0], -1)
        print(out.shape)
        out = self.fc(out[:, -1, :])
        print(out.shape)
        return out

model = BRNN(input_size, hidden_size, num_layers, num_classes).to(device)
x = torch.randn(batch_size, sequence_length, input_size).to(device)
print(model(x).shape)

torch.Size([64, 28, 512])
torch.Size([64, 10])
torch.Size([64, 10])


# Train and Test Functions

In [12]:
def train(dataloader, model, num_epochs, loss_fn, learning_rate):
    
    # optimizer
    optimizer = optim.Adam(model.parameters(), lr = learning_rate)

    losses = []
    for epoch in range(num_epochs):
        for batch, (X, y )in enumerate(dataloader):
            # to cuda
            X = X.to(device = device).squeeze(1)
            y = y.to(device = device)

            # forward
            pred = model(X)
            loss = loss_fn(pred, y)

            # Backpropagation
            # set all gradients to zero on each batch
            optimizer.zero_grad()
            loss.backward()
            # gradient descent or adam step
            optimizer.step()

            losses.append(loss.item())

            if batch % 50 == 0:
                print(f"Epoch {epoch} batch {batch} loss {losses[-1]:.4f}")
            
    return losses

In [13]:
def test(dataloader, model, loss_fn):
    
    size = len(dataloader.dataset)
    num_batches = len(dataloader)

    model.eval()
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:

            X = X.to(device).squeeze(1)
            y = y.to(device)

            pred = model(X)
            test_loss += loss_fn(pred, y).item()

            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [14]:
alpha = torch.randn((3,2,1,4,2,1))
print(alpha.shape)
alpha = alpha.squeeze(2)
print(alpha.shape)

torch.Size([3, 2, 1, 4, 2, 1])
torch.Size([3, 2, 4, 2, 1])


In [38]:
modelRNN = RNN(input_size, hidden_size, num_layers, num_classes).to(device)
# Train the model
history = train(train_loader, modelRNN, num_epochs, loss_fn, learning_rate)
# Test the model
test(train_loader, modelRNN, loss_fn)
test(test_loader, modelRNN, loss_fn)

In [42]:
modelBRNN = BRNN(input_size, hidden_size, num_layers, num_classes).to(device)
# Train the model
history = train(train_loader, modelBRNN, num_epochs, loss_fn, learning_rate)
# Test the model
test(train_loader, modelBRNN, loss_fn)
test(test_loader, modelBRNN, loss_fn)

Epoch 0 batch 0 loss 2.3115
Epoch 0 batch 50 loss 1.3887
Epoch 0 batch 100 loss 0.9739
Epoch 0 batch 150 loss 0.5187
Epoch 0 batch 200 loss 0.8073
Epoch 0 batch 250 loss 0.3293
Epoch 0 batch 300 loss 0.4694
Epoch 0 batch 350 loss 0.3742
Epoch 0 batch 400 loss 0.3503
Epoch 0 batch 450 loss 0.4475
Epoch 0 batch 500 loss 0.2085
Epoch 0 batch 550 loss 0.2572
Epoch 0 batch 600 loss 0.3773
Epoch 0 batch 650 loss 0.0615
Epoch 0 batch 700 loss 0.0900
Epoch 0 batch 750 loss 0.1113
Epoch 0 batch 800 loss 0.0798
Epoch 0 batch 850 loss 0.1928
Epoch 0 batch 900 loss 0.1801
Epoch 1 batch 0 loss 0.3189
Epoch 1 batch 50 loss 0.1912
Epoch 1 batch 100 loss 0.0282
Epoch 1 batch 150 loss 0.0743
Epoch 1 batch 200 loss 0.0736
Epoch 1 batch 250 loss 0.1415
Epoch 1 batch 300 loss 0.1539
Epoch 1 batch 350 loss 0.2108
Epoch 1 batch 400 loss 0.1859
Epoch 1 batch 450 loss 0.1486
Epoch 1 batch 500 loss 0.0213
Epoch 1 batch 550 loss 0.0472
Epoch 1 batch 600 loss 0.0660
Epoch 1 batch 650 loss 0.0954
Epoch 1 batch 70