### Loading MNIST Train Dataset

In [1]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.datasets as dsets

In [2]:
torch.cuda.is_available()

True

In [3]:
cuda = torch.device('cuda')
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")

In [4]:
train_dataset = dsets.MNIST(root='./data', 
                            train=True, 
                            transform=transforms.ToTensor(),
                            download=True)

test_dataset = dsets.MNIST(root='./data', 
                           train=False, 
                           transform=transforms.ToTensor())

In [5]:
print(train_dataset.data.size())
print(train_dataset.targets.size())
print(test_dataset.data.size())
print(test_dataset.targets.size())

torch.Size([60000, 28, 28])
torch.Size([60000])
torch.Size([10000, 28, 28])
torch.Size([10000])


### Make Dataset Iterable

In [6]:
batch_size = 100
n_iters = 4000
num_epochs = n_iters / (len(train_dataset) / batch_size)
num_epochs = int(num_epochs)

train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=batch_size, 
                                           shuffle=True)

test_loader = torch.utils.data.DataLoader(dataset=test_dataset, 
                                          batch_size=batch_size, 
                                          shuffle=False)

### Create Model Class

In [7]:
class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super(LSTMModel, self).__init__()
        # Hidden dimensions
        self.hidden_dim = hidden_dim

        # Number of hidden layers
        self.layer_dim = layer_dim

        # Building your LSTM
        # batch_first=True causes input/output tensors to be of shape
        # (batch_dim, seq_dim, feature_dim)
        self.lstm = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True)

        # Readout layer
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # Initialize hidden state with zeros
        h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).to(device)

        # Initialize cell state
        c0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).to(device)

        # 28 time steps
        # We need to detach as we are doing truncated backpropagation through time (BPTT)
        # If we don't, we'll backprop all the way to the start even after going through another batch
        out, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))

        # Index hidden state of last time step
        # out.size() --> 100, 28, 100
        # out[:, -1, :] --> 100, 100 --> just want last time step hidden states! 
        out = self.fc(out[:, -1, :]) 
        # out.size() --> 100, 10
        return out

### Instantiate our LSTM model

In [8]:
input_dim = 28
hidden_dim = 100
layer_dim = 1
output_dim = 10

model = LSTMModel(input_dim, hidden_dim, layer_dim, output_dim).to(device)

### Cross Entry Loss Function

In [9]:
criterion = nn.CrossEntropyLoss()

### Instantiate Optimizer Class

In [10]:
learning_rate = 0.01

optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [11]:
len(list(model.parameters()))

6

In [12]:
for i in range(len(list(model.parameters()))):
    print(list(model.parameters())[i].size())

torch.Size([400, 28])
torch.Size([400, 100])
torch.Size([400])
torch.Size([400])
torch.Size([10, 100])
torch.Size([10])


In [16]:
learning_rate = 0.1

In [18]:
# Number of steps to unroll
seq_dim = 28
num_epochs = 50

iter = 0
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        # Load images as a torch tensor with gradient accumulation abilities
        images = images.view(-1, seq_dim, input_dim).to(device)    # .cuda()     # requires_grad_()
        labels = labels.to(device)

        if (iter+1) % 2000 == 0:
            learning_rate *= 0.3
            optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
        
        # Clear gradients w.r.t. parameters
        optimizer.zero_grad()

        # Forward pass to get output/logits
        # outputs.size() --> 100, 10
        outputs = model(images)

        # Calculate Loss: softmax --> cross entropy loss
        loss = criterion(outputs, labels)

        # Getting gradients w.r.t. parameters
        loss.backward()

        # Updating parameters
        optimizer.step()

        iter += 1

        if iter % 500 == 0:
            # Calculate Accuracy
            correct = 0
            total = 0
            # Iterate through test dataset
            for images, labels in test_loader:
                # Resize images
                images = images.view(-1, seq_dim, input_dim).to(device)   # .cuda()
                labels = labels.to(device)

                # Forward pass only to get logits/output
                outputs = model(images)

                # Get predictions from the maximum value
                _, predicted = torch.max(outputs.data, 1)

                # Total number of labels
                total += labels.size(0)

                # Total correct predictions
                #######################
                #  USE GPU FOR MODEL  #
                #######################
                if torch.cuda.is_available():
                    correct += (predicted.cpu() == labels.cpu()).sum()
                else:
                    correct += (predicted == labels).sum()


            accuracy = 100 * correct / total

            # Print Loss
            print(f'Iteration {epoch}: {iter}/{n_iters}. Loss: {loss.item()}. Accuracy: {accuracy}, learning_rate={"%.8f" % learning_rate}.')

Iteration 0: 500/4000. Loss: 0.9481704831123352. Accuracy: 65, learning_rate=0.03000000.
Iteration 1: 1000/4000. Loss: 0.6778351664543152. Accuracy: 78, learning_rate=0.03000000.
Iteration 2: 1500/4000. Loss: 0.573559582233429. Accuracy: 84, learning_rate=0.03000000.
Iteration 3: 2000/4000. Loss: 0.3222646713256836. Accuracy: 89, learning_rate=0.00900000.
Iteration 4: 2500/4000. Loss: 0.3601502478122711. Accuracy: 92, learning_rate=0.00900000.
Iteration 4: 3000/4000. Loss: 0.2521718740463257. Accuracy: 92, learning_rate=0.00900000.
Iteration 5: 3500/4000. Loss: 0.1871059089899063. Accuracy: 92, learning_rate=0.00900000.
Iteration 6: 4000/4000. Loss: 0.1089683324098587. Accuracy: 93, learning_rate=0.00270000.
Iteration 7: 4500/4000. Loss: 0.17405270040035248. Accuracy: 93, learning_rate=0.00270000.
Iteration 8: 5000/4000. Loss: 0.11825156956911087. Accuracy: 93, learning_rate=0.00270000.
Iteration 9: 5500/4000. Loss: 0.15355783700942993. Accuracy: 94, learning_rate=0.00270000.
Iteration

In [44]:
correct = 0
total = 0
# Iterate through test dataset
for images, labels in test_loader:
    # Resize images
    images = images.view(-1, seq_dim, input_dim).to(device)   # .cuda()
    labels = labels.to(device)

    # Forward pass only to get logits/output
    outputs = model(images)
    #print(outputs)

    # Get predictions from the maximum value
    _, predicted = torch.max(outputs.data, 1)

    # Total number of labels
    total += labels.size(0)

    # Total correct predictions
    #######################
    #  USE GPU FOR MODEL  #
    #######################
    if torch.cuda.is_available():
        correct += (predicted.cpu() == labels.cpu()).sum()
    else:
        correct += (predicted == labels).sum()
        
    break

In [None]:
cnt = 0
for images, labels in test_loader:
    cnt += 1
    print(cnt, images.shape, labels.shape)
    images = images.view(-1, seq_dim, input_dim).to(device)
    print(cnt, images.shape, labels.shape)
    

In [65]:
images.shape

torch.Size([100, 28, 28])

In [None]:
outputs.data

In [None]:
torch.max(outputs.data, 1)