#### Imports

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import os
import sys
sys.path.append("..")

In [2]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

In [3]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision

In [4]:
from tqdm import tqdm

In [5]:
from bptt_tgeb_mnist_architecture import *

#### Test for CUDA

In [6]:
## Detect CUDA once; `train_on_gpu` and `dev` are read by later cells.
train_on_gpu = torch.cuda.is_available()

if train_on_gpu:
    ## CUDA is usable: report it and target the GPU
    print('GPU found, training on GPU')
    dev = torch.device('cuda')
else:
    ## no CUDA: report it and fall back to the CPU
    print('No GPU, training on CPU')
    dev = torch.device('cpu')

No GPU, training on CPU


#### Load MNIST

In [7]:
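## NOTE: keep batch_size = 1 for now -- the training loop flattens each batch into a single row with view(1, -1) and averages over len(loader).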

def load_mnist(batch_size=1, shuffle_train=True):
    """Build the MNIST train and test data loaders.

    Images are converted to tensors and normalised with mean 0.5 and std 0.5.
    Both splits are read from "../data" and downloaded there if missing.

    Args:
        batch_size: batch size used by both loaders.
        shuffle_train: whether the training loader shuffles; the test loader never does.

    Returns:
        Tuple ``(train_loader, test_loader)``.
    """
    tv_transforms = torchvision.transforms
    preprocess = tv_transforms.Compose([
        tv_transforms.ToTensor(),
        tv_transforms.Normalize((0.5,), (0.5,)),
    ])

    ## Build both datasets first (train, then test), exactly one per split
    train_data, test_data = (
        torchvision.datasets.MNIST("../data", train=is_train, download=True, transform=preprocess)
        for is_train in (True, False)
    )

    make_loader = torch.utils.data.DataLoader
    loader_for_train = make_loader(train_data, batch_size=batch_size, shuffle=shuffle_train)
    loader_for_test = make_loader(test_data, batch_size=batch_size, shuffle=False)
    return loader_for_train, loader_for_test

In [8]:
## Instantiate both loaders; the arguments spell out load_mnist's defaults so the
## batch_size = 1 requirement is visible at the call site.
mnist_train_loader, mnist_test_loader = load_mnist(batch_size=1, shuffle_train=True)

#### Architectural initialisations

In [9]:
## Number of classes: sets the row count of the gating matrices, the size of the
## per-class counter, and how many times each image is repeated as network input.
n_classes = 10

In [10]:
## Layer sizes handed to RNNModule: (input, hidden, output).
## input_dim is the length of one flattened image (presumably 28 x 28 pixels).
input_dim, hidden_dim, output_dim = 784, 100, 1

In [11]:
## Gating matrix (hidden-to-hidden): one row of +/-1 entries per class.
## Even columns hold a random sign pattern; each odd column holds the
## negation of the even column at the same position in the pattern.
half_dim = hidden_dim // 2
tvec_hh = torch.zeros(n_classes, hidden_dim)
for class_idx in range(n_classes):
    ## fresh draw per class: {0, 1} mapped to {-1, +1}
    signs = 2 * torch.randint(0, 2, (1, half_dim)).float() - 1
    tvec_hh[class_idx, 0::2] = signs
    tvec_hh[class_idx, 1::2] = signs.neg()

In [12]:
## Gating matrix (input-to-hidden): one row of +/-1 entries per class.
## Even columns hold a random sign pattern; each odd column holds the
## negation of the even column at the same position in the pattern.
half_dim = hidden_dim // 2
tvec_ih = torch.zeros(n_classes, hidden_dim)
for class_idx in range(n_classes):
    ## fresh draw per class: {0, 1} mapped to {-1, +1}
    signs = 2 * torch.randint(0, 2, (1, half_dim)).float() - 1
    tvec_ih[class_idx, 0::2] = signs
    tvec_ih[class_idx, 1::2] = signs.neg()

In [13]:
# tvec_ih = tvec_hh

#### Architecture

In [14]:
## Build the recurrent cell from the layer sizes and the two gating matrices, then
## wrap it in the RNN driver. RNNModule and RNN are not defined in this notebook;
## presumably they come from the `bptt_tgeb_mnist_architecture` star import -- confirm.
cell = RNNModule(input_dim, hidden_dim, output_dim, tvec_ih, tvec_hh)
rnn = RNN(cell)

In [15]:
# cell.to(dev)
# rnn.to(dev)

#### Loss

In [16]:
def compute_loss(ys, ts):
    """Half the sum of squared differences between outputs and targets.

    Args:
        ys: tensor of network outputs.
        ts: tensor of targets, broadcastable against ``ys``.

    Returns:
        Scalar tensor ``0.5 * sum((ys - ts) ** 2)``.
    """
    residual = ys - ts
    sum_of_squares = residual.pow(2).sum()
    return 0.5 * sum_of_squares

In [17]:
## Cross-entropy criterion; the training loop applies it to the network outputs
## reshaped to a single row with view(1, -1) and the integer class label.
criterion_ce = nn.CrossEntropyLoss()

#### Clip micro gradients

In [18]:
def clip_micro_grads(grad_tensor, minVal=-1e-7, maxVal=1e-7):
    """Zero out, in place, every entry that clamping to [minVal, maxVal] leaves unchanged.

    Args:
        grad_tensor: tensor to modify in place.
        minVal: lower clamp bound.
        maxVal: upper clamp bound.

    Returns:
        The same ``grad_tensor`` object, after modification.
    """
    clamped = torch.clamp(grad_tensor, minVal, maxVal)
    ## an entry equal to its clamped value already lies inside the band
    inside_band = grad_tensor.eq(clamped)
    grad_tensor.masked_fill_(inside_band, 0)
    return grad_tensor

#### Training loop

In [19]:
## Tensors handed to the optimiser: only the cell's Whh and Woh are listed.
params = [rnn.cell.Whh, rnn.cell.Woh]

In [20]:
## Plain SGD over the tensors collected in `params`, with learning rate 1e-3.
optimizer = optim.SGD(params, lr=1e-3)

In [21]:
## Number of passes over the data loader made by each training cell below.
epochs = 10

In [22]:
## Metric buffers with one slot per epoch
train_losses, train_acc = np.zeros(epochs), np.zeros(epochs)

## Per-class counter, indexed by the true label inside the training loop
acc_classes = np.zeros(n_classes)

In [23]:
# a = ys.float().view(1, -1)
# b = torch.LongTensor([6])

# criterion = nn.CrossEntropyLoss()
# ll = criterion(a, b)
# print(ll)

In [24]:
## Per-epoch history for this run. The buffers are (re)allocated inside the cell so it can be
## re-run on its own and always indexes into correctly sized arrays.
train_losses = np.zeros(epochs)
train_acc = np.zeros(epochs)

## NOTE(review): this loop is reported as "Training" but iterates `mnist_test_loader`;
## `mnist_train_loader` is not used by this cell -- confirm this is intended.
n_batches = len(mnist_test_loader)  ## equals the number of samples while batch_size == 1

for e in range(epochs):

    running_loss = 0
    running_acc = 0

    for image, label in tqdm(mnist_test_loader):

        ## Clear older gradients
        optimizer.zero_grad()

        ## Flatten the image to a single row, then repeat that row n_classes times
        image = torch.squeeze(image).view(1,-1)
        xs = image.repeat(n_classes,1)

        hp = torch.zeros(cell.hid_dim) ## very first hidden state is the zero vector
        ts = torch.LongTensor(label)

        ## NOTE(review): only the inputs are moved to CUDA here; the `cell.to(dev)` /
        ## `rnn.to(dev)` calls are commented out earlier in the notebook -- confirm the
        ## model ends up on the same device before running on a GPU machine.
        if train_on_gpu:
            xs, hp, ts = xs.cuda(), hp.cuda(), ts.cuda()

        ## Forward pass; all outputs are flattened to one row of scores for the criterion
        ys, hs = rnn.forward(xs, hp)
        loss = criterion_ce(ys.float().view(1, -1),ts)

        ## Compute gradients w/ Backprop (autograd)
        loss.backward()

        ## update weights
        optimizer.step()

        ## update loss
        running_loss += loss.item()

        ## check if sample is correctly classified
        pred_class = torch.argmax(ys)
        true_class = ts
        if (pred_class-true_class) == 0:
            running_acc +=1
        ## NOTE(review): incremented for every sample (not only correct ones) and never
        ## reset, so this holds per-class sample counts accumulated over all epochs.
        ## Move it inside the `if` above if per-class correct counts were intended.
        acc_classes[int(true_class[0])] += 1

    ## Record this epoch's metrics by index instead of rebinding the history arrays
    epoch_loss = running_loss/n_batches
    epoch_acc = running_acc/n_batches
    train_losses[e] = epoch_loss
    train_acc[e] = epoch_acc
    train_loss = running_loss  ## summed loss of the latest epoch, kept for backward compatibility
    print(f"Training loss: {epoch_loss}")
    print(f"Training acc: {epoch_acc}")

100%|██████████| 10000/10000 [00:35<00:00, 285.26it/s]


Training loss: 2.291307710003853
Training acc: 0.1707


100%|██████████| 10000/10000 [00:34<00:00, 288.47it/s]


Training loss: 2.2598904163360594
Training acc: 0.2317


100%|██████████| 10000/10000 [00:35<00:00, 284.68it/s]


Training loss: 2.209179785335064
Training acc: 0.3146


100%|██████████| 10000/10000 [00:36<00:00, 274.80it/s]


Training loss: 2.152346224117279
Training acc: 0.4047


100%|██████████| 10000/10000 [00:35<00:00, 281.22it/s]


Training loss: 2.08814946488142
Training acc: 0.4458


100%|██████████| 10000/10000 [00:36<00:00, 276.39it/s]


Training loss: 2.0108334921598434
Training acc: 0.4811


100%|██████████| 10000/10000 [00:35<00:00, 278.15it/s]


Training loss: 1.926364913713932
Training acc: 0.5347


100%|██████████| 10000/10000 [00:36<00:00, 276.22it/s]


Training loss: 1.8523283343315124
Training acc: 0.6371


100%|██████████| 10000/10000 [00:50<00:00, 196.45it/s]


Training loss: 1.8013119679808616
Training acc: 0.7062


100%|██████████| 10000/10000 [00:35<00:00, 281.54it/s]

Training loss: 1.767165866100788
Training acc: 0.7351





In [25]:
## Per-epoch history for this run. The buffers are (re)allocated inside the cell so it can be
## re-run on its own and always indexes into correctly sized arrays.
train_losses = np.zeros(epochs)
train_acc = np.zeros(epochs)

## NOTE(review): this loop is reported as "Training" but iterates `mnist_test_loader`;
## `mnist_train_loader` is not used by this cell -- confirm this is intended.
n_batches = len(mnist_test_loader)  ## equals the number of samples while batch_size == 1

for e in range(epochs):

    running_loss = 0
    running_acc = 0

    for image, label in tqdm(mnist_test_loader):

        ## Clear older gradients
        optimizer.zero_grad()

        ## Flatten the image to a single row, then repeat that row n_classes times
        image = torch.squeeze(image).view(1,-1)
        xs = image.repeat(n_classes,1)

        hp = torch.zeros(cell.hid_dim) ## very first hidden state is the zero vector
        ts = torch.LongTensor(label)

        ## NOTE(review): only the inputs are moved to CUDA here; the `cell.to(dev)` /
        ## `rnn.to(dev)` calls are commented out earlier in the notebook -- confirm the
        ## model ends up on the same device before running on a GPU machine.
        if train_on_gpu:
            xs, hp, ts = xs.cuda(), hp.cuda(), ts.cuda()

        ## Forward pass; all outputs are flattened to one row of scores for the criterion
        ys, hs = rnn.forward(xs, hp)
        loss = criterion_ce(ys.float().view(1, -1),ts)

        ## Compute gradients w/ Backprop (autograd)
        loss.backward()

        ## update weights
        optimizer.step()

        ## update loss
        running_loss += loss.item()

        ## check if sample is correctly classified
        pred_class = torch.argmax(ys)
        true_class = ts
        if (pred_class-true_class) == 0:
            running_acc +=1
        ## NOTE(review): incremented for every sample (not only correct ones) and never
        ## reset, so this holds per-class sample counts accumulated over all epochs.
        ## Move it inside the `if` above if per-class correct counts were intended.
        acc_classes[int(true_class[0])] += 1

    ## Record this epoch's metrics by index instead of rebinding the history arrays
    epoch_loss = running_loss/n_batches
    epoch_acc = running_acc/n_batches
    train_losses[e] = epoch_loss
    train_acc[e] = epoch_acc
    train_loss = running_loss  ## summed loss of the latest epoch, kept for backward compatibility
    print(f"Training loss: {epoch_loss}")
    print(f"Training acc: {epoch_acc}")

100%|██████████| 10000/10000 [00:31<00:00, 312.91it/s]


Training loss: 1.7425547768354417
Training acc: 0.7533


100%|██████████| 10000/10000 [00:32<00:00, 304.43it/s]


Training loss: 1.7228998099803925
Training acc: 0.7666


100%|██████████| 10000/10000 [00:34<00:00, 293.52it/s]


Training loss: 1.7060010840296744
Training acc: 0.7816


100%|██████████| 10000/10000 [00:34<00:00, 292.93it/s]


Training loss: 1.6906853364944459
Training acc: 0.7965


100%|██████████| 10000/10000 [00:35<00:00, 282.37it/s]


Training loss: 1.6772167379021645
Training acc: 0.8083


100%|██████████| 10000/10000 [00:34<00:00, 288.71it/s]


Training loss: 1.667074357664585
Training acc: 0.8165


100%|██████████| 10000/10000 [00:36<00:00, 276.25it/s]


Training loss: 1.659704866719246
Training acc: 0.8205


100%|██████████| 10000/10000 [00:35<00:00, 279.12it/s]


Training loss: 1.6515677145957948
Training acc: 0.8292


100%|██████████| 10000/10000 [00:38<00:00, 259.06it/s]


Training loss: 1.6453031437993049
Training acc: 0.8353


100%|██████████| 10000/10000 [00:35<00:00, 282.01it/s]

Training loss: 1.6402118409991264
Training acc: 0.8359





In [26]:
## Per-epoch history for this run. The buffers are (re)allocated inside the cell so it can be
## re-run on its own and always indexes into correctly sized arrays.
train_losses = np.zeros(epochs)
train_acc = np.zeros(epochs)

## NOTE(review): this loop is reported as "Training" but iterates `mnist_test_loader`;
## `mnist_train_loader` is not used by this cell -- confirm this is intended.
n_batches = len(mnist_test_loader)  ## equals the number of samples while batch_size == 1

for e in range(epochs):

    running_loss = 0
    running_acc = 0

    for image, label in tqdm(mnist_test_loader):

        ## Clear older gradients
        optimizer.zero_grad()

        ## Flatten the image to a single row, then repeat that row n_classes times
        image = torch.squeeze(image).view(1,-1)
        xs = image.repeat(n_classes,1)

        hp = torch.zeros(cell.hid_dim) ## very first hidden state is the zero vector
        ts = torch.LongTensor(label)

        ## NOTE(review): only the inputs are moved to CUDA here; the `cell.to(dev)` /
        ## `rnn.to(dev)` calls are commented out earlier in the notebook -- confirm the
        ## model ends up on the same device before running on a GPU machine.
        if train_on_gpu:
            xs, hp, ts = xs.cuda(), hp.cuda(), ts.cuda()

        ## Forward pass; all outputs are flattened to one row of scores for the criterion
        ys, hs = rnn.forward(xs, hp)
        loss = criterion_ce(ys.float().view(1, -1),ts)

        ## Compute gradients w/ Backprop (autograd)
        loss.backward()

        ## update weights
        optimizer.step()

        ## update loss
        running_loss += loss.item()

        ## check if sample is correctly classified
        pred_class = torch.argmax(ys)
        true_class = ts
        if (pred_class-true_class) == 0:
            running_acc +=1
        ## NOTE(review): incremented for every sample (not only correct ones) and never
        ## reset, so this holds per-class sample counts accumulated over all epochs.
        ## Move it inside the `if` above if per-class correct counts were intended.
        acc_classes[int(true_class[0])] += 1

    ## Record this epoch's metrics by index instead of rebinding the history arrays
    epoch_loss = running_loss/n_batches
    epoch_acc = running_acc/n_batches
    train_losses[e] = epoch_loss
    train_acc[e] = epoch_acc
    train_loss = running_loss  ## summed loss of the latest epoch, kept for backward compatibility
    print(f"Training loss: {epoch_loss}")
    print(f"Training acc: {epoch_acc}")

100%|██████████| 10000/10000 [00:35<00:00, 282.39it/s]


Training loss: 1.6355363038301467
Training acc: 0.8414


100%|██████████| 10000/10000 [00:36<00:00, 276.65it/s]


Training loss: 1.6306881885528564
Training acc: 0.8466


100%|██████████| 10000/10000 [00:46<00:00, 215.30it/s]


Training loss: 1.6278311962127685
Training acc: 0.8448


100%|██████████| 10000/10000 [01:00<00:00, 165.96it/s]


Training loss: 1.6234499951958656
Training acc: 0.8511


100%|██████████| 10000/10000 [00:37<00:00, 267.33it/s]


Training loss: 1.6210652653932571
Training acc: 0.8513


100%|██████████| 10000/10000 [00:55<00:00, 180.55it/s]


Training loss: 1.616035195529461
Training acc: 0.8573


100%|██████████| 10000/10000 [00:34<00:00, 287.60it/s]


Training loss: 1.6129517855882645
Training acc: 0.8639


100%|██████████| 10000/10000 [00:38<00:00, 262.75it/s]


Training loss: 1.6099500394940376
Training acc: 0.8624


100%|██████████| 10000/10000 [00:35<00:00, 283.64it/s]


Training loss: 1.6084320532679557
Training acc: 0.8631


100%|██████████| 10000/10000 [00:57<00:00, 173.16it/s]

Training loss: 1.6052231096506118
Training acc: 0.8669



