In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import new_alg_v2 as na2
import matplotlib.pyplot as plt
import math
from torchvision.datasets import MNIST, FashionMNIST
from torchvision import transforms
from tqdm import tqdm
import torch.optim as optim
import statistics

torch.set_default_dtype(torch.float64)

In [2]:
def define_dataloaders(n_examples_train, n_examples_test, batch_size, classes=np.arange(10)):
    # MNIST data, batch training
    #batch_size = n_examples_train

    # Choose the classes (at most 10)
    assert max(classes) <= 9

    # Transformation for the images
    transform = transforms.Compose([transforms.ToTensor(),
                                  transforms.Normalize((0.5,), (0.5,)),
                                  ])
    trainset = MNIST(data_dir, download=True, train=True, transform=transform)
    testset = MNIST(data_dir, download=True, train=False, transform=transform)

    # Obtain training and test data. 
    # Note that both datasets are sorted, but the train and test loaders will shuffle them during training.
    n_examples_tt = [n_examples_train, n_examples_test]
    for i_d, (n_examples_i, dataset) in enumerate(zip(n_examples_tt, [trainset, testset])):
        n_per_class = n_examples_i // len(classes)
        data_orig = dataset.data.detach().clone()
        targets_orig = dataset.targets.detach().clone()
        for i_c, class_i in enumerate(classes):
            mask = targets_orig == class_i
            i0 = i_c * n_per_class
            i1 = (i_c+1) * n_per_class
            dataset.data[i0:i1] = data_orig[mask][:n_per_class]
            dataset.targets[i0:i1] = targets_orig[mask][:n_per_class]
        # Fill the remaining slots with random classes from the available choices
        n_remain = n_examples_i - i1 
        for i in range(n_remain):
            class_i = np.random.choice(classes)
            mask = targets_orig == class_i
            idx_i = np.random.choice(torch.where(mask)[0].cpu())
            dataset.data[i1+i] = data_orig[idx_i]
            dataset.targets[i1+i] = targets_orig[idx_i]

        # Cut off
        dataset.data = dataset.data[:n_examples_i]
        dataset.targets = dataset.targets[:n_examples_i]

    # Batch-loader
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=0)
    testloader = torch.utils.data.DataLoader(testset, batch_size=n_examples_test, shuffle=False, num_workers=0)

    return trainloader, testloader

In [3]:
data_dir = '/files/'
batch_size = 512

# Transformation for the images
transform = transforms.Compose([transforms.ToTensor(),
                              transforms.Normalize((0.5,), (0.5,)),
                              ])
trainset = MNIST(data_dir, download=True, train=True, transform=transform)
testset = MNIST(data_dir, download=True, train=False, transform=transform)

trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=0)
testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size, shuffle=True, num_workers=0)

In [4]:
def calc_output_dim(input_H, input_W, kernel_size, pooling):

    # calc the parameters of the conv1 out put:
    H_out_conv1= math.floor(input_H-kernel_size+1)
    W_out_conv1= math.floor(input_W-kernel_size+1)

    # calc the dim after the pooling:
    pool_H_out= math.floor(((H_out_conv1-pooling)/pooling)+1)
    pool_W_out= math.floor(((W_out_conv1-pooling)/pooling)+1)

    # calc the parameters of the conv2 out put:
    H_out_conv2= math.floor(pool_H_out-kernel_size+1)
    W_out_conv2= math.floor(pool_W_out-kernel_size+1)

    # calc the dim after the pooling:
    pool2_H_out= math.floor(((H_out_conv2-pooling)/pooling)+1)
    pool2_W_out= math.floor(((W_out_conv2-pooling)/pooling)+1)

    # clac the dim of nn.Linear
    final_dim=pool2_H_out*pool2_W_out
    return final_dim

In [5]:
class CNN(nn.Module):
    def __init__(self, n_ch, n_classes, final_dim):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(n_ch, 1, kernel_size=kern_size, bias=False)
        self.conv2 = nn.Conv2d(1, 1, kernel_size=kern_size, bias=False)
        self.fc1 = nn.Linear(final_dim, final_dim, bias=False)
        self.fc2= nn.Linear(final_dim, n_classes, bias=False)

    def forward(self, x):
        final_dim = self.fc1.in_features
        x =torch.tanh( F.max_pool2d(self.conv1(x),pool_size))
        x = torch.tanh(F.max_pool2d(self.conv2(x),pool_size))
        x = x.view(-1, final_dim)
        x = torch.tanh(self.fc1(x))
        x= self.fc2(x)
        return x

In [6]:
def train(net, eta, n_epoch, train_loader, optim_alg = 'NA'):
    if optim_alg == 'GD':
        optimizer = optim.SGD(net.parameters(), lr=eta, momentum=0)
    elif optim_alg == 'ADAM':
        optimizer = optim.Adam(net.parameters(), lr=eta)
    elif optim_alg == 'NA':
        optimizer = na2.new_alg(net.parameters(), lr=eta)
    train_losses = []
    train_accuracy = []
    net.train()
    loss = nn.CrossEntropyLoss()
    for epoch in range(n_epoch):
        for j,(data,target) in enumerate(train_loader):
            optimizer.zero_grad()
            py_hat = net(data)
            objective = loss(py_hat, torch.flatten(target))
            train_losses.append(float(objective))
            objective.backward()
            optimizer.step()
            #if (j%50) == 0:
            correct = 0
            with torch.no_grad():
                output = net(data)
                pred = output.data.max(1, keepdim=True)[1]
                correct += pred.eq(target.data.view_as(pred)).sum()
            #train_accuracy.append(100. * correct / batch_size)
            train_accuracy.append(100. * correct / len(data))

    return train_losses, train_accuracy

In [7]:
def LWC(net, test_loader, conv1_init, conv2_init, fc1_init, fc2_init):
    final_loss = test(net, test_loader)

    conv1_final = net.conv1.weight.clone()
    conv2_final = net.conv2.weight.clone()
    fc1_final = net.fc1.weight.clone()
    fc2_final = net.fc2.weight.clone()

    #init weights conv1
    with torch.no_grad():
      net.conv1.weight[:] = conv1_init
    L_c1 = test(net, test_loader)

    #init weights conv2
    with torch.no_grad():
      net.conv1.weight[:] = conv1_final
      net.conv2.weight[:] = conv2_init
    L_c2 = test(net, test_loader)

    #init weights fc1
    with torch.no_grad():
      net.conv2.weight[:] = conv2_final
      net.fc1.weight[:] = fc1_init
    L_fc1 = test(net, test_loader)

    #init weights fc2
    with torch.no_grad():
      net.fc1.weight[:] = fc1_final
      net.fc2.weight[:] = fc2_init
    L_fc2 = test(net, test_loader)

    with torch.no_grad():
      net.fc2.weight[:] = fc2_final
  
    denom = 4*final_loss - (L_c1 + L_c2 + L_fc1 + L_fc2)
    Xl_conv1 = (final_loss - L_c1)/denom
    Xl_conv2 = (final_loss - L_c2)/denom
    Xl_fc1 = (final_loss - L_fc1)/denom
    Xl_fc2 = (final_loss - L_fc2)/denom
    return np.array([Xl_conv1, Xl_conv2, Xl_fc1, Xl_fc2])

In [8]:
def train_(net, eta, n_epoch, train_loader, test_loader, conv1_init, conv2_init, fc1_init, fc2_init, optim_alg = 'NA'):
    if optim_alg == 'GD':
        optimizer = optim.SGD(net.parameters(), lr=eta, momentum=0)
    elif optim_alg == 'ADAM':
        optimizer = optim.Adam(net.parameters(), lr=eta)
    elif optim_alg == 'NA':
        optimizer = na2.new_alg(net.parameters(), lr=eta)
    train_losses = []
    train_accuracy = []
    LWCs = []

    net.train()
    loss = nn.CrossEntropyLoss()
    for epoch in range(n_epoch):
        for j,(data,target) in enumerate(train_loader):
            optimizer.zero_grad()
            py_hat = net(data)
            objective = loss(py_hat, torch.flatten(target))
            train_losses.append(float(objective))
            objective.backward()
            optimizer.step()
            correct = 0
            with torch.no_grad():
                output = net(data)
                pred = output.data.max(1, keepdim=True)[1]
                correct += pred.eq(target.data.view_as(pred)).sum()
            train_accuracy.append(100. * correct / len(data))
            if (j%10) == 0:
              LWCs.append(LWC(net, test_loader, conv1_init, conv2_init, fc1_init, fc2_init))

    return train_losses, train_accuracy, LWCs

In [9]:
def test(net, test_loader):
    loss = nn.CrossEntropyLoss()
    with torch.no_grad():
        for j,(data, target) in enumerate(test_loader):
            output = net(data)
            test_loss = loss(output, torch.flatten(target))

    return float(test_loss)

In [10]:
# parameters
n_s = batch_size # number of sample
n_ch = 1 # number of input channels 
d0 = 28 # dim0 of input image
d1 = 28 # dim1 of input image
fc2_in= 10
n_epoch = 1
output_dim = n_s
kern_size=5
pool_size=2
n_classes = 10
lr = 1e-2
batch_size = 1024
DS_size = 60000
iterations = 10
alg = 'NA'

final_dim = calc_output_dim(d0, d1, kern_size, pool_size)

In [11]:
def xl_calc_fullt_trained_nn(eta, n_ch, d0, d1, kern_size, pool_size, train_loader, test_loader, optim_alg, n_epoch):

    #define the net and save init params
    final_dim = calc_output_dim(d0, d1, kern_size, pool_size)
    network = CNN(n_ch, n_classes, final_dim)
    conv1_init = network.conv1.weight.clone()
    conv2_init = network.conv2.weight.clone()
    fc1_init = network.fc1.weight.clone()
    fc2_init = network.fc2.weight.clone()

    #train and save init and final loss
    initial_loss = test(network, test_loader)
    train_losses, train_accuracy, LWCs = train_(network, eta, n_epoch, train_loader, test_loader, conv1_init, conv2_init, fc1_init, fc2_init, initial_loss, optim_alg)
    final_loss = test(network, test_loader)

    #save final net params
    conv1_final = network.conv1.weight.clone()
    conv2_final = network.conv2.weight.clone()
    fc1_final = network.fc1.weight.clone()
    fc2_final = network.fc2.weight.clone()

    #init weights conv1
    with torch.no_grad():
      network.conv1.weight[:] = conv1_init
    loss_conv1 = test(network, test_loader)
        
    #init weights conv2
    with torch.no_grad():
      network.conv1.weight[:] = conv1_final
      network.conv2.weight[:] = conv2_init
    loss_conv2 = test(network, test_loader)
    
    #init weights fc1
    with torch.no_grad():
      network.conv2.weight[:] = conv2_final
      network.fc1.weight[:] = fc1_init
    loss_fc1 = test(network, test_loader)

    #init weights fc2
    with torch.no_grad():
      network.fc1.weight[:] = fc1_final
      network.fc2.weight[:] = fc2_init

    loss_fc2 = test(network, test_loader)
    dL = final_loss - initial_loss

    #Xl_conv1 = (final_loss - loss_conv1)/dL
    #Xl_conv2 = (final_loss - loss_conv2)/dL
    #Xl_fc1 = (final_loss-loss_fc1)/dL
    #Xl_fc2 = (final_loss-loss_fc2)/dL

    # alternative calc of layer contribution
    denom = 4*final_loss - (loss_conv1 + loss_conv2 + loss_fc1 + loss_fc2)
    Xl_conv1 = (final_loss - loss_conv1)/denom
    Xl_conv2 = (final_loss - loss_conv2)/denom
    Xl_fc1 = (final_loss - loss_fc1)/denom
    Xl_fc2 = (final_loss - loss_fc2)/denom

    return [Xl_conv1, Xl_conv2, Xl_fc1 ,Xl_fc2], train_loss, train_accuracy, LWCs
    

In [12]:
def layer_cont_avg(batch_size, alg, iter, n_epoch):

    trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=0)
    testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size, shuffle=True, num_workers=0)
    train_losses = []
    train_accuracies = []
    LWCs_trough_training = []
    layer_cont = np.zeros((iter,4))
    for k in range(iter):
        Xls, train_loss, train_accuracy, LWCs = xl_calc_fullt_trained_nn(lr, n_ch, d0, d1, kern_size, pool_size, trainloader, testloader, alg, n_epoch)
        layer_cont[k,:] = np.array(Xls)
        train_losses.append(train_loss)
        train_accuracies.append(train_accuracy)
        

    stds = []
    avgs = []
    for j in range(4):
        stds.append(np.std(layer_cont[:,j]))
        avgs.append(np.sum(layer_cont[:,j])/iter)

    return train_loss, train_accuracy, stds, avgs, LWCs

In [13]:
def plot_res(train_loss, train_accuracy, stds, avgs, batch_size, alg):
    fig = plt.figure(figsize=(18, 5))
    if alg == 'NA':
        title = 'New Alg, batch size ' + str(batch_size)
    elif alg == 'GD':
        title = 'GD, batch size ' + str(batch_size)
    elif alg == 'ADAM':
        title = 'Adam, batch size ' + str(batch_size)
    fig.suptitle(title, fontsize=25)

    ax = fig.add_subplot(1,3,1)
    ax.bar(['1','2','3','4'], avgs, yerr = stds, width = 0.4, align='center', alpha=0.5, ecolor='black', capsize=5)
    ax.set_title('layer contribution', fontsize=15)

    x = list(range(len(train_loss)))
    ax = fig.add_subplot(1,3,2)
    plt.plot(x,train_loss)
    ax.set_title('loss vs training steps', fontsize=15)

    x = list(range(len(train_accuracy)))
    ax = fig.add_subplot(1,3,3)
    plt.plot(x,train_accuracy)
    ax.set_title('accuracy vs training steps', fontsize=15)

    plt.tight_layout()

In [14]:
batch_size = 1024
n_epoch = round((1875*batch_size)/60000)
train_loss, train_accuracy, stds1024, avgs1024, L_c1, L_c2, L_fc1, L_fc2 = layer_cont_avg(batch_size, alg, iterations, n_epoch)
plot_res(train_loss, train_accuracy, stds1024, avgs1024, batch_size, alg)

R = [L_c1, L_c2, L_fc1, L_fc2]
fig = plt.figure(figsize=(18, 10))
fig.suptitle('layer contributions through training', fontsize=25)
R_x = range(len(L_fc1))
for i in range(4):
    plt.plot(R_x, R[i])
plt.legend(['l1', 'l2', 'l3', 'l4'])

TypeError: train_() takes from 9 to 10 positional arguments but 11 were given

In [None]:
print(len(L_c1), len(L_c2), len(L_fc1), len(L_fc2))
R = [L_c1, L_c2, L_fc1, L_fc2]
fig = plt.figure(figsize=(18, 10))
fig.suptitle('GD - layer contributions average and std', fontsize=25)
R_x = range(len(L_fc1))
for i in range(4):
    plt.plot(R_x, R[i])
plt.legend(['l1', 'l2', 'l3', 'l4'])

In [None]:
batch_size = 512
n_epoch = round((1875*batch_size)/60000)
train_loss, train_accuracy, stds512, avgs512, L_c1, L_c2, L_fc1, L_fc2 = layer_cont_avg(batch_size, alg, iterations, n_epoch)
plot_res(train_loss, train_accuracy, stds512, avgs512, batch_size, alg)

R = [L_c1, L_c2, L_fc1, L_fc2]
fig = plt.figure(figsize=(18, 10))
fig.suptitle('layer contributions through training', fontsize=25)
R_x = range(len(L_fc1))
for i in range(4):
    plt.plot(R_x, R[i])
plt.legend(['l1', 'l2', 'l3', 'l4'])

In [None]:
batch_size = 256
n_epoch = round((1875*batch_size)/60000)
train_loss, train_accuracy, stds256, avgs256, L_c1, L_c2, L_fc1, L_fc2 = layer_cont_avg(batch_size, alg, iterations, n_epoch)
plot_res(train_loss, train_accuracy, stds256, avgs256, batch_size, alg)

R = [L_c1, L_c2, L_fc1, L_fc2]
fig = plt.figure(figsize=(18, 10))
fig.suptitle('layer contributions through training', fontsize=25)
R_x = range(len(L_fc1))
for i in range(4):
    plt.plot(R_x, R[i])
plt.legend(['l1', 'l2', 'l3', 'l4'])

In [None]:
batch_size = 128
n_epoch = round((1875*batch_size)/60000)
train_loss, train_accuracy, stds128, avgs128, L_c1, L_c2, L_fc1, L_fc2 = layer_cont_avg(batch_size, alg, iterations, n_epoch)
plot_res(train_loss, train_accuracy, stds128, avgs128, batch_size, alg)

R = [L_c1, L_c2, L_fc1, L_fc2]
fig = plt.figure(figsize=(18, 10))
fig.suptitle('layer contributions through training', fontsize=25)
R_x = range(len(L_fc1))
for i in range(4):
    plt.plot(R_x, R[i])
plt.legend(['l1', 'l2', 'l3', 'l4'])

In [None]:
batch_size = 64
n_epoch = round((1875*batch_size)/60000)
train_loss, train_accuracy, stds64, avgs64, L_c1, L_c2, L_fc1, L_fc2 = layer_cont_avg(batch_size, alg, iterations, n_epoch)
plot_res(train_loss, train_accuracy, stds64, avgs64, batch_size, alg)

R = [L_c1, L_c2, L_fc1, L_fc2]
fig = plt.figure(figsize=(18, 10))
fig.suptitle('layer contributions through training', fontsize=25)
R_x = range(len(L_fc1))
for i in range(4):
    plt.plot(R_x, R[i])
plt.legend(['l1', 'l2', 'l3', 'l4'])

In [None]:
batch_size = 32
n_epoch = round((1875*batch_size)/60000)
train_loss, train_accuracy, stds32, avgs32, L_c1, L_c2, L_fc1, L_fc2 = layer_cont_avg(batch_size, alg, iterations, n_epoch)
plot_res(train_loss, train_accuracy, stds32, avgs32, batch_size, alg)

R = [L_c1, L_c2, L_fc1, L_fc2]
fig = plt.figure(figsize=(18, 10))
fig.suptitle('layer contributions through training', fontsize=25)
R_x = range(len(L_fc1))
for i in range(4):
    plt.plot(R_x, R[i])
plt.legend(['l1', 'l2', 'l3', 'l4'])

In [None]:
# results
batch_sizes = [32, 64 ,128 ,256 ,512 ,1024]
layer_cont = np.array([avgs32, avgs64, avgs128, avgs256, avgs512, avgs1024]).T
layer_cont_std = np.array([stds32, stds64, stds128, stds256, stds512, stds1024]).T


fig = plt.figure(figsize=(18, 10))

fig.suptitle('New Alg - layer contributions average and std', fontsize=25)
for i in range(4):
    plt.plot(batch_sizes,layer_cont[i,:])
    plt.fill_between(batch_sizes, layer_cont[i,:]-layer_cont_std[i,:], layer_cont[i,:]+layer_cont_std[i,:], alpha=0.2)    
plt.legend(['l1', 'l2', 'l3', 'l4'])

In [None]:
# results
fig = plt.figure(figsize=(18, 10))
fig.suptitle('New Alg - layer contributions average and std', fontsize=25)
colors = ['C0', 'C1', 'C4', 'C3']
for i in range(4):
    ax = fig.add_subplot(2,2,i+1)
    plt.plot(batch_sizes,layer_cont[i,:], colors[i])
    plt.fill_between(batch_sizes, layer_cont[i,:]-layer_cont_std[i,:], layer_cont[i,:]+layer_cont_std[i,:], facecolor = colors[i], alpha=0.2)
    title = 'L' + str(i+1)
    ax.set_title(title, fontsize=15)
    plt.ylim(-0.2, 1)


---