In [None]:
#Implementation of FCN cases in the paper "are all layers created equal?"

In [1]:
import numpy as np
import torch 
import matplotlib.pyplot as plt
import torchvision
from time import time
from torchvision import datasets, transforms
from torch import nn,optim

In [2]:
transform = transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.5,),(0.5,)), ])

In [3]:
trainset = datasets.MNIST('./',download = True, train = True, transform = transform)

In [4]:
valset = datasets.MNIST('./',download= True, train= False, transform = transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size = 64,shuffle = True)
valloader = torch.utils.data.DataLoader(valset, batch_size = 64, shuffle = True)

In [5]:
dataiter = iter(trainloader)
images, labels = dataiter.next()

print(images.shape)
print(labels.shape)

torch.Size([64, 1, 28, 28])
torch.Size([64])


In [6]:
#initialization function
def init_weights(m):
    if type(m) == nn.Linear:
        torch.nn.init.xavier_uniform_(m.weight)
        m.bias.data.fill_(0.01)



In [7]:
#define the model 
inputsize = 784
hiddensizes = [256,256,256]
outputsize = 10
net = nn.Sequential(nn.Linear(inputsize,hiddensizes[0]),nn.ReLU(),nn.Linear(hiddensizes[0],hiddensizes[1]),nn.ReLU(),
                    nn.Linear(hiddensizes[1],hiddensizes[2]),nn.ReLU(),nn.Linear(hiddensizes[2],outputsize),nn.LogSoftmax(dim=1))
net.apply(init_weights)
w_init= list(net.parameters())
torch.save(net.state_dict(),'weights_i.pth')
print(net)

Sequential(
  (0): Linear(in_features=784, out_features=256, bias=True)
  (1): ReLU()
  (2): Linear(in_features=256, out_features=256, bias=True)
  (3): ReLU()
  (4): Linear(in_features=256, out_features=256, bias=True)
  (5): ReLU()
  (6): Linear(in_features=256, out_features=10, bias=True)
  (7): LogSoftmax()
)


In [8]:
#
criterion = nn.NLLLoss()
images, labels = next(iter(trainloader))
images = images.view(images.shape[0],-1)
logps = (net(images))
loss = criterion(logps,labels)

In [9]:
optimizer = optim.SGD(net.parameters(),lr = 0.003, momentum = 0.9 )
time0 = time()
epochs =16
for e in range(epochs):
    running_loss = 0
    for images,labels in trainloader:
        images = images.view(images.shape[0],-1)
        optimizer.zero_grad()
        output = net(images)
        loss = criterion(output,labels)
        loss.backward()
        optimizer.step()
        running_loss+=loss.item()
#     else:
    print("Epoch {} - Training loss: {}".format(e, running_loss/len(trainloader)))
#     print(running_loss)
            
    print("\n Training time in minutes = ", (time()-time0)/60)

Epoch 0 - Training loss: 0.41311991288622557

 Training time in minutes =  0.31357962687810265
Epoch 1 - Training loss: 0.1938161818223245

 Training time in minutes =  0.5853533585866292
Epoch 2 - Training loss: 0.1435377686254696

 Training time in minutes =  0.8783549110094706
Epoch 3 - Training loss: 0.11363233616754317

 Training time in minutes =  1.1872979005177815
Epoch 4 - Training loss: 0.09497234638013852

 Training time in minutes =  1.4982877930005392
Epoch 5 - Training loss: 0.0808761963578088

 Training time in minutes =  1.8090701540311178
Epoch 6 - Training loss: 0.06877056525440327

 Training time in minutes =  2.1140501936276754
Epoch 7 - Training loss: 0.06088526386182819

 Training time in minutes =  2.421694445610046
Epoch 8 - Training loss: 0.05279280053975899

 Training time in minutes =  2.732470198472341
Epoch 9 - Training loss: 0.04566774734962128

 Training time in minutes =  3.0414255460103354
Epoch 10 - Training loss: 0.04079383348739311

 Training time in

In [10]:
torch.save(net.state_dict(),'weights_f.pth')

In [11]:
#testing
correct_count, all_count = 0, 0
for images,labels in valloader:
    for i in range(len(labels)):
        img = images[i].view(1, 784)
        with torch.no_grad():
            logps = net(img)

    
        ps = torch.exp(logps)
        probab = list(ps.numpy()[0])
        pred_label = probab.index(max(probab))
        true_label = labels.numpy()[i]
        if(true_label == pred_label):
              correct_count += 1
        all_count += 1

print("Number Of Images Tested =", all_count)
print("\nModel Accuracy =", (correct_count/all_count))

Number Of Images Tested = 10000

Model Accuracy = 0.9772


In [20]:
wi = torch.load('weights_i.pth')
wf= torch.load('weights_f.pth')
wf['0.bias'] = wi['0.bias']
wf['0.weight'] = wi['0.weight']
torch.save(wf,'weights_exp1.pth')


In [21]:
#testing with layer 1 re-initialized 
#testing
cpt = [0,2,4,8,16]
ctr = 0

for i in range(16):
    
    for j in cpt:
        if (j == ctr):
            #run testing
            net.load_state_dict(torch.load('weights_exp1.pth'))
            correct_count, all_count = 0, 0
            for images,labels in valloader:
                for i in range(len(labels)):
                    img = images[i].view(1, 784)
                    with torch.no_grad():
                        logps = net(img)
                    ps = torch.exp(logps)
                    probab = list(ps.numpy()[0])
                    pred_label = probab.index(max(probab))
                    true_label = labels.numpy()[i]
                    if(true_label == pred_label):
                          correct_count += 1
                    all_count += 1
            print("checkpoint=",ctr,"\nModel Accuracy =", (correct_count/all_count))

    
    #run training
    running_loss = 0
    for images,labels in trainloader:
        images = images.view(images.shape[0],-1)
        optimizer.zero_grad()
        output = net(images)
        loss = criterion(output,labels)
        loss.backward()
        optimizer.step()
        running_loss+=loss.item()
        torch.save(net.state_dict(),'weights_exp1.pth')    
                
    ctr = ctr + 1
            

checkpoint= 0 
Model Accuracy = 0.5299
checkpoint= 2 
Model Accuracy = 0.9708
checkpoint= 4 
Model Accuracy = 0.97
checkpoint= 8 
Model Accuracy = 0.9772


In [22]:
#changing layer 2 weights
wf= torch.load('weights_f.pth')
wf['2.bias'] = wi['2.bias']
wf['2.weight'] = wi['2.weight']
torch.save(wf,'weights_exp2.pth')
#testing with layer 1 re-initialized 
#testing
cpt = [0,2,4,8,16]
ctr = 0

for i in range(16):
    
    for j in cpt:
        if (j == ctr):
            #run testing
            net.load_state_dict(torch.load('weights_exp2.pth'))
            correct_count, all_count = 0, 0
            for images,labels in valloader:
                for i in range(len(labels)):
                    img = images[i].view(1, 784)
                    with torch.no_grad():
                        logps = net(img)
                    ps = torch.exp(logps)
                    probab = list(ps.numpy()[0])
                    pred_label = probab.index(max(probab))
                    true_label = labels.numpy()[i]
                    if(true_label == pred_label):
                          correct_count += 1
                    all_count += 1
            print("checkpoint=",ctr,"\nModel Accuracy =", (correct_count/all_count))

    
    #run training
    running_loss = 0
    for images,labels in trainloader:
        images = images.view(images.shape[0],-1)
        optimizer.zero_grad()
        output = net(images)
        loss = criterion(output,labels)
        loss.backward()
        optimizer.step()
        running_loss+=loss.item()
        torch.save(net.state_dict(),'weights_exp2.pth')    
                
    ctr = ctr + 1
            

checkpoint= 0 
Model Accuracy = 0.9463
checkpoint= 2 
Model Accuracy = 0.9775
checkpoint= 4 
Model Accuracy = 0.9723
checkpoint= 8 
Model Accuracy = 0.9791


In [24]:
#changing layer 3 weights
wf= torch.load('weights_f.pth')
wf['4.bias'] = wi['4.bias']
wf['4.weight'] = wi['4.weight']
torch.save(wf,'weights_exp3.pth')
#testing with layer 3 re-initialized 
cpt = [0,2,4,8,16]
ctr = 0

for i in range(16):
    
    for j in cpt:
        if (j == ctr):
            #run testing
            net.load_state_dict(torch.load('weights_exp3.pth'))
            correct_count, all_count = 0, 0
            for images,labels in valloader:
                for i in range(len(labels)):
                    img = images[i].view(1, 784)
                    with torch.no_grad():
                        logps = net(img)
                    ps = torch.exp(logps)
                    probab = list(ps.numpy()[0])
                    pred_label = probab.index(max(probab))
                    true_label = labels.numpy()[i]
                    if(true_label == pred_label):
                          correct_count += 1
                    all_count += 1
            print("checkpoint=",ctr,"\nModel Accuracy =", (correct_count/all_count))

    
    #run training
    running_loss = 0
    for images,labels in trainloader:
        images = images.view(images.shape[0],-1)
        optimizer.zero_grad()
        output = net(images)
        loss = criterion(output,labels)
        loss.backward()
        optimizer.step()
        running_loss+=loss.item()
        torch.save(net.state_dict(),'weights_exp3.pth')    
                
    ctr = ctr + 1
            

checkpoint= 0 
Model Accuracy = 0.9528
checkpoint= 2 
Model Accuracy = 0.979
checkpoint= 4 
Model Accuracy = 0.9787
checkpoint= 8 
Model Accuracy = 0.9786


In [25]:
#changing layer 4 weights
wf= torch.load('weights_f.pth')
wf['6.bias'] = wi['6.bias']
wf['6.weight'] = wi['6.weight']
torch.save(wf,'weights_exp4.pth')
#testing with layer 4 re-initialized 
cpt = [0,2,4,8,16]
ctr = 0

for i in range(16):
    
    for j in cpt:
        if (j == ctr):
            #run testing
            net.load_state_dict(torch.load('weights_exp4.pth'))
            correct_count, all_count = 0, 0
            for images,labels in valloader:
                for i in range(len(labels)):
                    img = images[i].view(1, 784)
                    with torch.no_grad():
                        logps = net(img)
                    ps = torch.exp(logps)
                    probab = list(ps.numpy()[0])
                    pred_label = probab.index(max(probab))
                    true_label = labels.numpy()[i]
                    if(true_label == pred_label):
                          correct_count += 1
                    all_count += 1
            print("checkpoint=",ctr,"\nModel Accuracy =", (correct_count/all_count))

    
    #run training
    running_loss = 0
    for images,labels in trainloader:
        images = images.view(images.shape[0],-1)
        optimizer.zero_grad()
        output = net(images)
        loss = criterion(output,labels)
        loss.backward()
        optimizer.step()
        running_loss+=loss.item()
        torch.save(net.state_dict(),'weights_exp4.pth')    
                
    ctr = ctr + 1

checkpoint= 0 
Model Accuracy = 0.9493
checkpoint= 2 
Model Accuracy = 0.9785
checkpoint= 4 
Model Accuracy = 0.9792
checkpoint= 8 
Model Accuracy = 0.9779


In [26]:
#checking for re-randomization performance
#re-randomizing layer 1 weights
wf= torch.load('weights_f.pth')
torch.nn.init.xavier_uniform_(wf['0.weight'])
wf['0.bias'].data.fill_(0.01)

torch.save(wf,'weights_exp_rerand_1.pth')
#testing with layer 1 re-initialized 
#testing
net.load_state_dict(torch.load('weights_exp_rerand_1.pth'))
correct_count, all_count = 0, 0
for images,labels in valloader:
    for i in range(len(labels)):
        img = images[i].view(1, 784)
        with torch.no_grad():
            logps = net(img)

    
        ps = torch.exp(logps)
        probab = list(ps.numpy()[0])
        pred_label = probab.index(max(probab))
        true_label = labels.numpy()[i]
        if(true_label == pred_label):
              correct_count += 1
        all_count += 1

print("Number Of Images Tested =", all_count)
print("\nModel Accuracy =", (correct_count/all_count))

Number Of Images Tested = 10000

Model Accuracy = 0.0758


In [27]:
#checking for re-randomization performance
#re-randomizing layer 2 weights
wf= torch.load('weights_f.pth')
torch.nn.init.xavier_uniform_(wf['2.weight'])
wf['2.bias'].data.fill_(0.01)

torch.save(wf,'weights_exp_rerand_2.pth')
#testing with layer 1 re-initialized 
#testing
net.load_state_dict(torch.load('weights_exp_rerand_2.pth'))
correct_count, all_count = 0, 0
for images,labels in valloader:
    for i in range(len(labels)):
        img = images[i].view(1, 784)
        with torch.no_grad():
            logps = net(img)

    
        ps = torch.exp(logps)
        probab = list(ps.numpy()[0])
        pred_label = probab.index(max(probab))
        true_label = labels.numpy()[i]
        if(true_label == pred_label):
              correct_count += 1
        all_count += 1

print("Number Of Images Tested =", all_count)
print("\nModel Accuracy =", (correct_count/all_count))

Number Of Images Tested = 10000

Model Accuracy = 0.1045


In [28]:
#checking for re-randomization performance
#re-randomizing layer 3 weights
wf= torch.load('weights_f.pth')
torch.nn.init.xavier_uniform_(wf['4.weight'])
wf['4.bias'].data.fill_(0.01)

torch.save(wf,'weights_exp_rerand_3.pth')
#testing with layer 1 re-initialized 
#testing
net.load_state_dict(torch.load('weights_exp_rerand_3.pth'))
correct_count, all_count = 0, 0
for images,labels in valloader:
    for i in range(len(labels)):
        img = images[i].view(1, 784)
        with torch.no_grad():
            logps = net(img)

    
        ps = torch.exp(logps)
        probab = list(ps.numpy()[0])
        pred_label = probab.index(max(probab))
        true_label = labels.numpy()[i]
        if(true_label == pred_label):
              correct_count += 1
        all_count += 1

print("Number Of Images Tested =", all_count)
print("\nModel Accuracy =", (correct_count/all_count))

Number Of Images Tested = 10000

Model Accuracy = 0.2064


In [30]:
#checking for re-randomization performance
#re-randomizing layer 4 weights
wf= torch.load('weights_f.pth')
torch.nn.init.xavier_uniform_(wf['6.weight'])
wf['6.bias'].data.fill_(0.01)

torch.save(wf,'weights_exp_rerand_4.pth')
#testing with layer 1 re-initialized 
#testing
net.load_state_dict(torch.load('weights_exp_rerand_3.pth'))
correct_count, all_count = 0, 0
for images,labels in valloader:
    for i in range(len(labels)):
        img = images[i].view(1, 784)
        with torch.no_grad():
            logps = net(img)

    
        ps = torch.exp(logps)
        probab = list(ps.numpy()[0])
        pred_label = probab.index(max(probab))
        true_label = labels.numpy()[i]
        if(true_label == pred_label):
              correct_count += 1
        all_count += 1

print("Number Of Images Tested =", all_count)
print("\nModel Accuracy =", (correct_count/all_count))

Number Of Images Tested = 10000

Model Accuracy = 0.2064


In [31]:
wi = torch.load('weights_i.pth')
wf= torch.load('weights_f.pth')
wf['0.bias'] = wi['0.bias']
wf['0.weight'] = wi['0.weight']
torch.save(wf,'weights_exp1.pth')

wi = torch.load('weights_i.pth')
wf= torch.load('weights_f.pth')
wf['2.bias'] = wi['2.bias']
wf['2.weight'] = wi['2.weight']
torch.save(wf,'weights_exp2.pth')

wi = torch.load('weights_i.pth')
wf= torch.load('weights_f.pth')
wf['4.bias'] = wi['4.bias']
wf['4.weight'] = wi['4.weight']
torch.save(wf,'weights_exp3.pth')

wi = torch.load('weights_i.pth')
wf= torch.load('weights_f.pth')
wf['6.bias'] = wi['6.bias']
wf['6.weight'] = wi['6.weight']
torch.save(wf,'weights_exp4.pth')

In [32]:
#L2 norm between learned and re initialized weights of l1
w_reinit_1= torch.load('weights_exp1.pth')
w_reinit_l1 = w_reinit_1['0.weight'].resize(1,784*256)
wf= torch.load('weights_f.pth')
wf_0=wf['0.weight'].resize(1,784*256)
pdist = torch.nn.PairwiseDistance(p=2)

print('Reinit distance L1',pdist(wf_0,w_reinit_l1).item())

#L norm between learned and re initialized weights of l2
w_reinit_2= torch.load('weights_exp2.pth')
w_reinit_l2 = w_reinit_2['2.weight'].resize(1,256*256)
wf= torch.load('weights_f.pth')
wf_2=wf['2.weight'].resize(1,256*256)

print('Reinit distance L2',pdist(wf_2,w_reinit_l2).item())

#L2 norm between learned and re initialized weights of l3
w_reinit_3= torch.load('weights_exp3.pth')
w_reinit_l3 = w_reinit_3['4.weight'].resize(1,256*256)
wf= torch.load('weights_f.pth')
wf_3=wf['4.weight'].resize(1,256*256)

print('Reinit distance L3',pdist(wf_3,w_reinit_l3).item())

#L2 norm between learned and re initialized weights of l4
w_reinit_4= torch.load('weights_exp4.pth')
w_reinit_l4 = w_reinit_4['6.weight'].resize(1,256*10)
wf= torch.load('weights_f.pth')

wf_4=wf['6.weight'].resize(1,256*10)

print('Reinit distance L4',pdist(wf_4,w_reinit_l4).item())



Reinit distance L1 6.6157546043396
Reinit distance L2 4.034752368927002
Reinit distance L3 3.7830379009246826
Reinit distance L4 3.2720789909362793




Unlike as shown in the paper the layer 1 which affects final accuracy the most also accounts for the largest L2 norm difference in initial and final weights is largest as expected from 

In [27]:
#checking the same for re-randomized weights of layer 1
w_rerand_1= torch.load('weights_exp_rerand_1.pth')
w_rerand_l1 = w_rerand_1['0.weight'].resize(1,784*256)
wf= torch.load('weights_f.pth')

wf_0=wf['0.weight'].resize(1,784*256)
pdist = torch.nn.PairwiseDistance(p=2)

pdist(wf_0,w_rerand_l1).item()

28.204221725463867

In [32]:
#checking the same for re-randomized weights of layer 2
w_rerand_2= torch.load('weights_exp_rerand_2.pth')
w_rerand_l2 = w_rerand_2['2.weight'].resize(1,256*256)
wf= torch.load('weights_f.pth')

wf_2=wf['2.weight'].resize(1,256*256)
pdist = torch.nn.PairwiseDistance(p=2)

pdist(wf_2,w_rerand_l2).item()

22.968156814575195

In [33]:
#checking the same for re-randomized weights of layer 3
w_rerand_3= torch.load('weights_exp_rerand_3.pth')
w_rerand_l3 = w_rerand_3['4.weight'].resize(1,256*256)
wf= torch.load('weights_f.pth')

wf_3=wf['4.weight'].resize(1,256*256)
pdist = torch.nn.PairwiseDistance(p=2)

pdist(wf_3,w_rerand_l3).item()

22.963138580322266

In [36]:
#checking the same for re-randomized weights of layer 4
w_rerand_4= torch.load('weights_exp_rerand_4.pth')
w_rerand_l4 = w_rerand_4['6.weight'].resize(1,10*256)
wf= torch.load('weights_f.pth')

wf_4=wf['6.weight'].resize(1,10*256)
pdist = torch.nn.PairwiseDistance(p=2)

pdist(wf_4,w_rerand_l4).item()

7.329382419586182

    Here as well, the one change from the trend explained in the paper is that layer 1 has the highest change 
    in weight in the rerandomized case. Layer 2 and 3 having a larger distance than layer 4 holds true.

In [33]:
#infinity norm distance layer 1 reinitialization
w_reinit_1= torch.load('weights_exp1.pth')
w_reinit_l1 = w_reinit_1['0.weight'].resize(1,784*256)
wf= torch.load('weights_f.pth')
wf_0=wf['0.weight'].resize(1,784*256)

print('Infinity norm distance for l1',torch.max(torch.abs((w_reinit_l1 - wf_0))).item())

#Linf norm between learned and re initialized weights of l2
w_reinit_2= torch.load('weights_exp2.pth')
w_reinit_l2 = w_reinit_2['2.weight'].resize(1,256*256)
wf= torch.load('weights_f.pth')
wf_2=wf['2.weight'].resize(1,256*256)
print('Infinity norm distance for l2',torch.max(torch.abs((w_reinit_l2 - wf_2))).item())

# l3
w_reinit_3= torch.load('weights_exp3.pth')
w_reinit_l3 = w_reinit_3['4.weight'].resize(1,256*256)
wf= torch.load('weights_f.pth')
wf_3=wf['4.weight'].resize(1,256*256)
print('Infinity norm distance for l3',torch.max(torch.abs((w_reinit_l3 - wf_3))).item())


#Linf norm between learned and re initialized weights of l4
w_reinit_4= torch.load('weights_exp4.pth')
w_reinit_l4 = w_reinit_4['6.weight'].resize(1,256*10)
wf= torch.load('weights_f.pth')
wf_4=wf['6.weight'].resize(1,256*10)
print('Infinity nirm distance l4',torch.max(torch.abs((w_reinit_l4 - wf_4))).item())


Infinity norm distance for l1 0.15976768732070923
Infinity norm distance for l2 0.12841816246509552
Infinity norm distance for l3 0.11122281104326248
Infinity nirm distance l4 0.26753726601600647


Layer 4 has the largest Linf norm as shown in the paper. Layer 1 has a smaller Linf norm than layer 2 and 3

In [32]:
#Linf norm between learned and re randomized weights 
#re-randomized weights of layer 1
w_rerand_1= torch.load('weights_exp_rerand_1.pth')
w_rerand_l1 = w_rerand_1['0.weight'].resize(1,784*256)
wf= torch.load('weights_f.pth')

wf_0=wf['0.weight'].resize(1,784*256)
print('Linf norm layer 1:',torch.max(torch.abs((w_rerand_l1 - wf_0))).item()
)

# re-randomized weights of layer 2
w_rerand_2= torch.load('weights_exp_rerand_2.pth')
w_rerand_l2 = w_rerand_2['2.weight'].resize(1,256*256)
wf= torch.load('weights_f.pth')

wf_2=wf['2.weight'].resize(1,256*256)
print('Linf norm layer 2:',torch.max(torch.abs((w_rerand_l2 - wf_2))).item()
)

#re-randomized weights of layer 3
w_rerand_3= torch.load('weights_exp_rerand_3.pth')
w_rerand_l3 = w_rerand_3['4.weight'].resize(1,256*256)
wf= torch.load('weights_f.pth')

wf_3=wf['4.weight'].resize(1,256*256)
print('Linf norm layer 3:',torch.max(torch.abs((w_rerand_l3 - wf_3))).item()
)

#checking the same for re-randomized weights of layer 4
w_rerand_4= torch.load('weights_exp_rerand_4.pth')
w_rerand_l4 = w_rerand_4['6.weight'].resize(1,10*256)
wf= torch.load('weights_f.pth')

wf_4=wf['6.weight'].resize(1,10*256)
print('Linf norm layer 4:',torch.max(torch.abs((w_rerand_l4 - wf_4))).item()
)




Linf norm layer 1: 0.2124536633491516
Linf norm layer 2: 0.26174092292785645
Linf norm layer 3: 0.2511447072029114
Linf norm layer 4: 0.42163801193237305


Unlike as shown in the paper (which is layer 2,3, and 4 have similar linf norm and l1 has a low norm), we see Layer 4 has a considerably larger Linf norm.

Use np for norm rechecking 
Train for 50 epochs on google collab
Affine spaces of parameters


In [22]:
#L2 norm trend across layers
from numpy import linalg as LA
w_reinit_1= torch.load('weights_exp1.pth')
w_reinit_2= torch.load('weights_exp2.pth')
w_reinit_3= torch.load('weights_exp3.pth')
w_reinit_4= torch.load('weights_exp4.pth')

w_reinit_l1 = w_reinit_1['0.weight'].resize(1,784*256).numpy()
w_reinit_l2 = w_reinit_2['2.weight'].resize(1,256*256).numpy()
w_reinit_l3 = w_reinit_3['4.weight'].resize(1,256*256).numpy()
w_reinit_l4 = w_reinit_4['6.weight'].resize(1,10*256).numpy()

wf= torch.load('weights_f.pth')

wf_0=wf['0.weight'].resize(1,784*256).numpy()
wf_2=wf['2.weight'].resize(1,256*256).numpy()
wf_3=wf['4.weight'].resize(1,256*256).numpy()
wf_4=wf['6.weight'].resize(1,10*256).numpy()
print('layer  1',LA.norm(w_reinit_l1-wf_0))
print('layer  2',LA.norm(w_reinit_l2-wf_2))
print('layer  3',LA.norm(w_reinit_l3-wf_3))
print('layer  4',LA.norm(w_reinit_l4-wf_4))


layer  1 28.278309
layer  2 22.891495
layer  3 22.930168
layer  4 7.3277464


In [44]:
#Linf norm trend across layers
from numpy import linalg as LA
w_reinit_1= torch.load('weights_exp1.pth')
w_reinit_2= torch.load('weights_exp2.pth')
w_reinit_3= torch.load('weights_exp3.pth')
w_reinit_4= torch.load('weights_exp4.pth')

w_reinit_l1 = w_reinit_1['0.weight'].resize(1,784*256).numpy()
w_reinit_l2 = w_reinit_2['2.weight'].resize(1,256*256).numpy()
w_reinit_l3 = w_reinit_3['4.weight'].resize(1,256*256).numpy()
w_reinit_l4 = w_reinit_4['6.weight'].resize(1,10*256).numpy()

wf= torch.load('weights_f.pth')

wf_0=wf['0.weight'].resize(1,784*256).numpy()
wf_2=wf['2.weight'].resize(1,256*256).numpy()
wf_3=wf['4.weight'].resize(1,256*256).numpy()
wf_4=wf['6.weight'].resize(1,10*256).numpy()
print('layer  1: linalg.norm value',LA.norm((w_reinit_l1-wf_0),np.inf), '; max(abs()) value',np.max(np.abs(w_reinit_l1-wf_0)))
print('layer  2 linalg.norm value',LA.norm(w_reinit_l2-wf_2,np.inf),'; max(abs()) value',np.max(np.abs(w_reinit_l2-wf_2)))
print('layer  3 linalg.norm value',LA.norm(w_reinit_l3-wf_3,np.inf),'; max(abs()) value',np.max(np.abs(w_reinit_l3-wf_3)))
print('layer  4 linalg.norm value',LA.norm(w_reinit_l4-wf_4,np.inf),'; max(abs()) value',np.max(np.abs(w_reinit_l4-wf_4)))


layer  1: linalg.norm value 10331.423 ; max(abs()) value 0.22720364
layer  2 linalg.norm value 4785.814 ; max(abs()) value 0.24625365
layer  3 linalg.norm value 4787.505 ; max(abs()) value 0.24873625
layer  4 linalg.norm value 302.1077 ; max(abs()) value 0.4389783


In [7]:
#same robustness experiment with five layers
#define the model 
inputsize = 784
hiddensizes = [256,256,256,256]
outputsize = 10
net5 = nn.Sequential(nn.Linear(inputsize,hiddensizes[0]),nn.ReLU(),nn.Linear(hiddensizes[0],hiddensizes[1]),nn.ReLU(),
                    nn.Linear(hiddensizes[1],hiddensizes[2]),nn.ReLU(),nn.Linear(hiddensizes[2],hiddensizes[3]),nn.ReLU(),nn.Linear(hiddensizes[3],outputsize),nn.LogSoftmax(dim=1))
net5.apply(init_weights)
w_init= list(net5.parameters())
torch.save(net5.state_dict(),'weights5_i.pth')
print(net5)

Sequential(
  (0): Linear(in_features=784, out_features=256, bias=True)
  (1): ReLU()
  (2): Linear(in_features=256, out_features=256, bias=True)
  (3): ReLU()
  (4): Linear(in_features=256, out_features=256, bias=True)
  (5): ReLU()
  (6): Linear(in_features=256, out_features=256, bias=True)
  (7): ReLU()
  (8): Linear(in_features=256, out_features=10, bias=True)
  (9): LogSoftmax()
)


In [9]:
criterion = nn.NLLLoss()
images, labels = next(iter(trainloader))
images = images.view(images.shape[0],-1)
logps = (net5(images))
loss = criterion(logps,labels)

In [10]:
optimizer = optim.SGD(net5.parameters(),lr = 0.003, momentum = 0.9 )
time0 = time()
epochs =5
for e in range(epochs):
    running_loss = 0
    for images,labels in trainloader:
        images = images.view(images.shape[0],-1)
        optimizer.zero_grad()
        output = net5(images)
        loss = criterion(output,labels)
        loss.backward()
        optimizer.step()
        running_loss+=loss.item()
#     else:
    print("Epoch {} - Training loss: {}".format(e, running_loss/len(trainloader)))
#     print(running_loss)
            
    print("\n Training time in minutes = ", (time()-time0)/60)

Epoch 0 - Training loss: 0.4248684939068518

 Training time in minutes =  0.20493873755137126
Epoch 1 - Training loss: 0.18384488870395715

 Training time in minutes =  0.4071003278096517
Epoch 2 - Training loss: 0.13241435576683042

 Training time in minutes =  0.6055184006690979
Epoch 3 - Training loss: 0.10105363108805501

 Training time in minutes =  0.8018073519070943
Epoch 4 - Training loss: 0.08550828020634459

 Training time in minutes =  0.9952053149541219


In [11]:
torch.save(net5.state_dict(),'weights_f5.pth')

In [16]:
#Testing reinitialization performance by changing different layers
wi=torch.load('weights5_i.pth')
wf= torch.load('weights_f5.pth')
wf['0.bias'] = wi['0.bias']
wf['0.weight'] = wi['0.weight']
torch.save(wf,'weights5_exp1.pth')
#testing
net5.load_state_dict(torch.load('weights5_exp1.pth'))
correct_count, all_count = 0, 0
for images,labels in valloader:
    for i in range(len(labels)):
        img = images[i].view(1, 784)
        with torch.no_grad():
            logps = net5(img)

    
        ps = torch.exp(logps)
        probab = list(ps.numpy()[0])
        pred_label = probab.index(max(probab))
        true_label = labels.numpy()[i]
        if(true_label == pred_label):
              correct_count += 1
        all_count += 1

print("Number Of Images Tested =", all_count)
print("\nLayer 1 reinit Model Accuracy =", (correct_count/all_count))

wi=torch.load('weights5_i.pth')
wf= torch.load('weights_f5.pth')
wf['2.bias'] = wi['2.bias']
wf['2.weight'] = wi['2.weight']
torch.save(wf,'weights5_exp2.pth')
#testing
net5.load_state_dict(torch.load('weights5_exp2.pth'))
correct_count, all_count = 0, 0
for images,labels in valloader:
    for i in range(len(labels)):
        img = images[i].view(1, 784)
        with torch.no_grad():
            logps = net5(img)

    
        ps = torch.exp(logps)
        probab = list(ps.numpy()[0])
        pred_label = probab.index(max(probab))
        true_label = labels.numpy()[i]
        if(true_label == pred_label):
              correct_count += 1
        all_count += 1

#print("Number Of Images Tested =", all_count)
print("\nLayer 2 reinit Model Accuracy =", (correct_count/all_count))


wi=torch.load('weights5_i.pth')
wf= torch.load('weights_f5.pth')
wf['4.bias'] = wi['4.bias']
wf['4.weight'] = wi['4.weight']
torch.save(wf,'weights5_exp3.pth')
#testing
net5.load_state_dict(torch.load('weights5_exp3.pth'))
correct_count, all_count = 0, 0
for images,labels in valloader:
    for i in range(len(labels)):
        img = images[i].view(1, 784)
        with torch.no_grad():
            logps = net5(img)

    
        ps = torch.exp(logps)
        probab = list(ps.numpy()[0])
        pred_label = probab.index(max(probab))
        true_label = labels.numpy()[i]
        if(true_label == pred_label):
              correct_count += 1
        all_count += 1

#print("Number Of Images Tested =", all_count)
print("\nLayer 3 reinit Model Accuracy =", (correct_count/all_count))

wi=torch.load('weights5_i.pth')
wf= torch.load('weights_f5.pth')
wf['6.bias'] = wi['6.bias']
wf['6.weight'] = wi['6.weight']
torch.save(wf,'weights5_exp4.pth')
#testing
net5.load_state_dict(torch.load('weights5_exp4.pth'))
correct_count, all_count = 0, 0
for images,labels in valloader:
    for i in range(len(labels)):
        img = images[i].view(1, 784)
        with torch.no_grad():
            logps = net5(img)

    
        ps = torch.exp(logps)
        probab = list(ps.numpy()[0])
        pred_label = probab.index(max(probab))
        true_label = labels.numpy()[i]
        if(true_label == pred_label):
              correct_count += 1
        all_count += 1

# print("Number Of Images Tested =", all_count)
print("\nLayer 4 reinit Model Accuracy =", (correct_count/all_count))

wi=torch.load('weights5_i.pth')
wf= torch.load('weights_f5.pth')
wf['8.bias'] = wi['8.bias']
wf['8.weight'] = wi['8.weight']
torch.save(wf,'weights5_exp5.pth')
#testing
net5.load_state_dict(torch.load('weights5_exp4.pth'))
correct_count, all_count = 0, 0
for images,labels in valloader:
    for i in range(len(labels)):
        img = images[i].view(1, 784)
        with torch.no_grad():
            logps = net5(img)

    
        ps = torch.exp(logps)
        probab = list(ps.numpy()[0])
        pred_label = probab.index(max(probab))
        true_label = labels.numpy()[i]
        if(true_label == pred_label):
              correct_count += 1
        all_count += 1

# print("Number Of Images Tested =", all_count)
print("\nLayer 5 reinit Model Accuracy =", (correct_count/all_count))


Number Of Images Tested = 10000

Layer 1 reinit Model Accuracy = 0.5523

Layer 2 reinit Model Accuracy = 0.9483

Layer 3 reinit Model Accuracy = 0.9542

Layer 4 reinit Model Accuracy = 0.9541

Layer 5 reinit Model Accuracy = 0.9541


In [20]:
#L2 norm trend across layers
from numpy import linalg as LA
w_reinit_1= torch.load('weights5_exp1.pth')
w_reinit_2= torch.load('weights5_exp2.pth')
w_reinit_3= torch.load('weights5_exp3.pth')
w_reinit_4= torch.load('weights5_exp4.pth')
w_reinit_5= torch.load('weights5_exp5.pth')

w_reinit_l1 = w_reinit_1['0.weight'].resize(1,784*256).numpy()
w_reinit_l2 = w_reinit_2['2.weight'].resize(1,256*256).numpy()
w_reinit_l3 = w_reinit_3['4.weight'].resize(1,256*256).numpy()
w_reinit_l4 = w_reinit_4['6.weight'].resize(1,256*256).numpy()
w_reinit_l5 = w_reinit_5['8.weight'].resize(1,10*256).numpy()


wf= torch.load('weights_f5.pth')

wf_0=wf['0.weight'].resize(1,784*256).numpy()
wf_2=wf['2.weight'].resize(1,256*256).numpy()
wf_3=wf['4.weight'].resize(1,256*256).numpy()
wf_4=wf['6.weight'].resize(1,256*256).numpy()
wf_5=wf['8.weight'].resize(1,10*256).numpy()


print('layer  1',LA.norm(w_reinit_l1-wf_0))
print('layer  2',LA.norm(w_reinit_l2-wf_2))
print('layer  3',LA.norm(w_reinit_l3-wf_3))
print('layer  4',LA.norm(w_reinit_l4-wf_4))
print('layer  5',LA.norm(w_reinit_l5-wf_5))

layer  1 4.4859114
layer  2 2.5516217
layer  3 2.2833698
layer  4 2.2638535
layer  5 1.962611


In [24]:
#Linf norm trend across layers
from numpy import linalg as LA
w_reinit_1= torch.load('weights5_exp1.pth')
w_reinit_2= torch.load('weights5_exp2.pth')
w_reinit_3= torch.load('weights5_exp3.pth')
w_reinit_4= torch.load('weights5_exp4.pth')
w_reinit_5= torch.load('weights5_exp5.pth')

w_reinit_l1 = w_reinit_1['0.weight'].resize(1,784*256).numpy()
w_reinit_l2 = w_reinit_2['2.weight'].resize(1,256*256).numpy()
w_reinit_l3 = w_reinit_3['4.weight'].resize(1,256*256).numpy()
w_reinit_l4 = w_reinit_4['6.weight'].resize(1,256*256).numpy()
w_reinit_l5 = w_reinit_5['8.weight'].resize(1,10*256).numpy()

wf= torch.load('weights_f5.pth')

wf_0=wf['0.weight'].resize(1,784*256).numpy()
wf_2=wf['2.weight'].resize(1,256*256).numpy()
wf_3=wf['4.weight'].resize(1,256*256).numpy()
wf_4=wf['6.weight'].resize(1,256*256).numpy()
wf_5=wf['8.weight'].resize(1,10*256).numpy()

print('layer  1: linalg.norm value',LA.norm((w_reinit_l1-wf_0),np.inf), '; max(abs()) value',np.max(np.abs(w_reinit_l1-wf_0)))
print('layer  2 linalg.norm value',LA.norm(w_reinit_l2-wf_2,np.inf),'; max(abs()) value',np.max(np.abs(w_reinit_l2-wf_2)))
print('layer  3 linalg.norm value',LA.norm(w_reinit_l3-wf_3,np.inf),'; max(abs()) value',np.max(np.abs(w_reinit_l3-wf_3)))
print('layer  4 linalg.norm value',LA.norm(w_reinit_l4-wf_4,np.inf),'; max(abs()) value',np.max(np.abs(w_reinit_l4-wf_4)))

print('layer  5 linalg.norm value',LA.norm(w_reinit_l5-wf_5,np.inf),'; max(abs()) value',np.max(np.abs(w_reinit_l5-wf_5)))

layer  1: linalg.norm value 1153.2687 ; max(abs()) value 0.14283744
layer  2 linalg.norm value 407.02307 ; max(abs()) value 0.08732679
layer  3 linalg.norm value 374.83377 ; max(abs()) value 0.08170561
layer  4 linalg.norm value 361.94498 ; max(abs()) value 0.08172127
layer  5 linalg.norm value 67.206696 ; max(abs()) value 0.17559001
