In [1]:
import sys
sys.path.append("../src")
import torch
import matplotlib.pyplot as plt
import numpy as np
import torchvision
import torch.nn.functional as F

import glob
import os
from datetime import datetime
import time
import math
from tqdm import tqdm

from itertools import repeat
from torch.nn.parameter import Parameter
import collections
import matplotlib
from torch_utils import *
from models import *
from visualization import *
# matplotlib.use('Agg')

In [2]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda', index=0)

In [3]:
transform = torchvision.transforms.Compose([torchvision.transforms.ToTensor(), 
                                            torchvision.transforms.Normalize(mean=(0.0,), std=(1.0,))])

mnist_dset_train = torchvision.datasets.MNIST('./data', train=True, transform=transform, target_transform=None, download=True)
train_loader = torch.utils.data.DataLoader(mnist_dset_train, batch_size=20, shuffle=True, num_workers=0)

mnist_dset_test = torchvision.datasets.MNIST('./data', train=False, transform=transform, target_transform=None, download=True)
test_loader = torch.utils.data.DataLoader(mnist_dset_test, batch_size=20, shuffle=False, num_workers=0)

In [4]:
activation_type = "tanh"
architecture = [784, 500, 500, 10]

x,y = next(iter(train_loader))
x = x.view(x.size(0),-1).to(device).T
y_one_hot = F.one_hot(y, 10).to(device).T
lambda_ = 0.9999
epsilon = 0.1#0.5
neural_lr_start = 0.1/15
neural_lr_stop = 0.0001
neural_lr_rule = "constant"
neural_lr_decay_multiplier = 0.01
neural_dynamic_iterations = 50

lr_start = {'ff' : 0.01, "fb": 0.001}
# lr_start = {'ff' : 0.001, "fb": 0.001}

model = CorInfoMaxV3(architecture, lambda_, epsilon, activation_type)

In [7]:
neurons = model.fast_forward(x)

In [9]:
layers = [x] + neurons

In [12]:
layers[1].shape

torch.Size([500, 20])

In [22]:
torch.norm(F.softmax(x, 0)[:,0] - F.softmax(x[:,0], 0))

tensor(0., device='cuda:0')

In [30]:
sigmaX = F.softmax(x[:,0], 0)
J1 = (torch.diag(sigmaX) - torch.outer(sigmaX, sigmaX))
J1.shape

torch.Size([784, 784])

In [31]:
sigmaX = F.softmax(x, 0)
J2 = torch.diag_embed(sigmaX.T) - torch.einsum('ij, ik -> ijk', sigmaX.T, sigmaX.T)
J2.shape

torch.Size([20, 784, 784])

In [32]:
torch.norm(J1 - J2[0])

tensor(0., device='cuda:0')

In [37]:
sigmaX.T.unsqueeze(2).shape

torch.Size([20, 784, 1])

In [42]:
J1.shape

torch.Size([784, 784])

In [47]:
J1 @ F.softmax(x[:,0], 0).T

tensor([-3.6357e-07, -3.6357e-07, -3.6357e-07, -3.6357e-07, -3.6357e-07,
        -3.6357e-07, -3.6357e-07, -3.6357e-07, -3.6357e-07, -3.6357e-07,
        -3.6357e-07, -3.6357e-07, -3.6357e-07, -3.6357e-07, -3.6357e-07,
        -3.6357e-07, -3.6357e-07, -3.6357e-07, -3.6357e-07, -3.6357e-07,
        -3.6357e-07, -3.6357e-07, -3.6357e-07, -3.6357e-07, -3.6357e-07,
        -3.6357e-07, -3.6357e-07, -3.6357e-07, -3.6357e-07, -3.6357e-07,
        -3.6357e-07, -3.6357e-07, -3.6357e-07, -3.6357e-07, -3.6357e-07,
        -3.6357e-07, -3.6357e-07, -3.6357e-07, -3.6357e-07, -3.6357e-07,
        -3.6357e-07, -3.6357e-07, -3.6357e-07, -3.6357e-07, -3.6357e-07,
        -3.6357e-07, -3.6357e-07, -3.6357e-07, -3.6357e-07, -3.6357e-07,
        -3.6357e-07, -3.6357e-07, -3.6357e-07, -3.6357e-07, -3.6357e-07,
        -3.6357e-07, -3.6357e-07, -3.6357e-07, -3.6357e-07, -3.6357e-07,
        -3.6357e-07, -3.6357e-07, -3.6357e-07, -3.6357e-07, -3.6357e-07,
        -3.6357e-07, -3.6357e-07, -3.6357e-07, -3.6

In [45]:
torch.norm(torch.bmm(J2, sigmaX.T.unsqueeze(2))[0], J1 @ F.softmax(x[:,0], 0).T)

tensor(0.0002, device='cuda:0')

In [46]:
torch.bmm(J2, sigmaX.T.unsqueeze(2)).shape

torch.Size([20, 784, 1])

In [16]:
(torch.diag_embed(x.T) - torch.einsum('ij, ik -> ijk', x.T, x.T)).shape

torch.Size([20, 784, 784])

In [6]:
trn_acc_list = []
tst_acc_list = []

n_epochs = 10
lr = lr_start
for epoch_ in range(n_epochs):
    lr = {'ff' : lr_start['ff'] * (0.9)**epoch_, 'fb' : lr_start['fb'] * (0.9)**epoch_}
    for idx, (x, y) in tqdm(enumerate(train_loader)):
        x, y = x.to(device), y.to(device)
        #x = activation_inverse(x.view(x.size(0),-1).T, activation_type)
        x = x.view(x.size(0),-1).T
        y_one_hot = F.one_hot(y, 10).to(device).T
        y_one_hot = 0.94 * y_one_hot + 0.03 * torch.ones(*y_one_hot.shape, device = device)
        _ = model.batch_step( x, y_one_hot, lr, neural_lr_start, neural_lr_stop, neural_lr_rule,
                              neural_lr_decay_multiplier, neural_dynamic_iterations, optimizer = "sgd"
                            )

    trn_acc = evaluatePC(model, train_loader, device, apply_activation_inverse = False, activation_type = activation_type, printing = False)
    tst_acc = evaluatePC(model, test_loader, device, apply_activation_inverse = False, activation_type = activation_type, printing = False)
    trn_acc_list.append(trn_acc)
    tst_acc_list.append(tst_acc)
    
    print("Epoch : {}, Train Accuracy : {}, Test Accuracy : {}".format(epoch_+1, trn_acc, tst_acc))

3000it [03:17, 15.18it/s]
1it [00:00,  9.25it/s]

Epoch : 1, Train Accuracy : 0.8418, Test Accuracy : 0.8501


153it [00:09, 15.99it/s]


KeyboardInterrupt: 

In [None]:
plot_convergence_plot(trn_acc_list, xlabel = 'Number of Epochs', ylabel = 'Accuracy %',
                      title = 'PC Train Accuracy w.r.t. Epochs', 
                      figsize = (12,8), fontsize = 25, linewidth = 3)

In [None]:
plot_convergence_plot(tst_acc_list, xlabel = 'Number of Epochs', ylabel = 'Accuracy %',
                      title = 'PC Test Accuracy w.r.t. Epochs', 
                      figsize = (12,8), fontsize = 25, linewidth = 3)

In [None]:
plt.figure(figsize = (7,7))
plt.imshow(torch2numpy(model.B[0]["weight"]))

In [None]:
plt.figure(figsize = (7,7))
plt.imshow(torch2numpy(model.Bsigma[1]["weight"]))

In [None]:
model.Bsigma[1]["weight"]