# <center>Implementing SVRG in a state-of-the-art CNN model</center>

##### Import Pytorch and other useful librairies

In [6]:
from matplotlib import pyplot as plt
plt.gray()
import math
import numpy as np
import pandas as pd

import torch
import torchvision.datasets as datasets
import torch.nn.functional as F
from torch import nn
from torch import optim
import copy

<Figure size 432x288 with 0 Axes>

In [14]:
loss_func = F.cross_entropy

def accuracy(Y_hat, Y):
    preds = torch.argmax(Y_hat, dim=1)
    return (preds == Y).float().mean()

##### Load and preprocess dataset

In [83]:
#import data
mnist_trainset = datasets.MNIST(root='../data', train=True, download=True, transform=None)
mnist_testset = datasets.MNIST(root='../data', train=False, download=True, transform=None)

#load trainset into tensors
train_loader = torch.utils.data.DataLoader(mnist_trainset, batch_size=1, shuffle=True)
X_train = train_loader.dataset.data
Y_train = train_loader.dataset.targets

#load testset into tensors
test_loader = torch.utils.data.DataLoader(mnist_testset, batch_size=10000, shuffle=False)
X_test = test_loader.dataset.data
Y_test = test_loader.dataset.targets

#scale data to [0:1] and convert to float32
X_train = (X_train.to(dtype=torch.float32) / X_train.max().to(dtype=torch.float32))
X_test = (X_test.to(dtype=torch.float32) / X_test.max().to(dtype=torch.float32))

#Flatten train and test data
X_train = X_train.reshape(X_train.shape[0],1,28,28)
X_test = X_test.reshape(X_test.shape[0],1,28,28)

print("Train examples : ",X_train.shape[0])
print("Test examples : ",X_test.shape[0])
print("Nb of features : ",X_train.shape[1])

Train examples :  60000
Test examples :  10000
Nb of features :  1


In [97]:
class SimpleNet(nn.Module):
    
    def __init__(self):
        super().__init__()
        self.linear1 = nn.Linear(784,100)
        self.linear2 = nn.Linear(100,10)
        self.linear1_snapshot = nn.Linear(784,100)
        self.linear2_snapshot = nn.Linear(100,10)
        
        self.number_params = 4 
        
        self.mu = [None] * self.number_params
        
        self.copy_snapshot()
 
    def forward(self, x):
        x = torch.sigmoid(self.linear1(x))
        x = torch.softmax(self.linear2(x),1)
        return x
    
    def forward_snapshot(self, x):
        x = torch.sigmoid(self.linear1_snapshot(x))
        x = torch.softmax(self.linear2_snapshot(x),1)
        return x
    
    def copy_snapshot(self):
        params = list(self.parameters())
        for i in range(self.number_params):
            params[i+self.number_params].data.copy_(params[i])

        i=0
        for param in self.parameters():
            if (i < self.number_params) :
                self.mu[i] = torch.zeros(param.shape)
                i+=1

    def update_SGD(self, lr=1):
        params = list(self.parameters())
        for i in range(self.number_params):
            params[i].data.copy_(params[i].data - lr * params[i].grad.data)

    def update_SVRG(self,lr):
        params = list(self.parameters())
        for i in range(self.number_params):
            params[i].data.copy_(params[i].data - lr * (params[i].grad.data - params[i+self.number_params].grad.data + self.mu[i].data))       

    def update_mu(self,n,batch_size):
        params = list(self.parameters())
        for i in range(len(self.mu)):
            self.mu[i].data.copy_(self.mu[i].data + params[i+self.number_params].grad.data / (n/batch_size))
                            
    def fit_SVRG(self,optimizer,epochs,warm_epochs,n,batch_size,lr):
        params = list(self.parameters())
        
        n = X_train.shape[0]
        self.train()
        
        #Warm start
        for epoch in range(warm_epochs):
            for i in range((n - 1) // batch_size + 1):
                optimizer.zero_grad()
                X = X_train[ i * batch_size : (i+1) * batch_size ]
                Y = Y_train[ i * batch_size : (i+1) * batch_size ]
                pred = self.forward( X )
                loss = loss_func( pred , Y )
                loss.backward()
                self.update_SGD(0.1)
                
            print(epoch,"\t",loss.item())

        self.copy_snapshot()
    
        for epoch in range(epochs):
            self.train()
            #update mu
            for i in range((n - 1) // batch_size + 1):
                optimizer.zero_grad()
                X = X_train[ i * batch_size : (i+1) * batch_size ]
                Y = Y_train[ i * batch_size : (i+1) * batch_size ]
                pred = self.forward_snapshot( X )
                loss_snapshot = loss_func( pred , Y )

                loss_snapshot.backward()

                self.update_mu(n,batch_size)
            
            
            for m in range(5):
                for i in range((n - 1) // batch_size + 1):
                    optimizer.zero_grad()
                    
                    #Snapshot gradient computation
                    X = X_train[ i * batch_size : (i+1) * batch_size ]
                    Y = Y_train[ i * batch_size : (i+1) * batch_size ]
                    X = self.forward_snapshot( X )
                    loss_snapshot = loss_func( X , Y )
                    loss_snapshot.backward()
                    #'real' gradient computation
                    X = X_train[ i * batch_size : (i+1) * batch_size ]
                    Y = Y_train[ i * batch_size : (i+1) * batch_size ]
                    X = self.forward( X )
                    loss = loss_func( X , Y )
                    loss.backward()
                    self.update_SVRG(lr)
                    
                print(epoch * 2 + m+warm_epochs,"\t",loss.item())
            
            self.copy_snapshot()
            with torch.no_grad():
                self.eval()
                print("Test set \t", round(accuracy( self.forward(X_test.reshape(-1,784)) , Y_test).item(),3))

    def fit_SGD(self,optimizer,epochs,batch_size,lr):
        n = X_train.shape[0]
       
        for epoch in range(epochs):
            self.train()
            for i in range((n - 1) // batch_size + 1):
                optimizer.zero_grad()
                X = X_train[ i * batch_size : (i+1) * batch_size ]
                Y = Y_train[ i * batch_size : (i+1) * batch_size ]
                pred = self.forward( X )
                loss = loss_func( pred , Y )
                loss.backward()
                self.update_SGD(lr)
                
            print(epoch,"\t",loss.item())

            with torch.no_grad():
                self.eval()
                print("Test set \t", round(accuracy( self.forward(X_test.reshape(-1,784)) , Y_test).item(),3))

In [103]:
X_train = X_train.reshape(60000,784)

simple_mod = SimpleNet()
opt = optim.SGD(simple_mod.parameters(), lr=10)
epochs = 5
warm_epochs = 5
batch_size = 60
learning_rate = 0.5

#simple_mod.fit_SGD(opt,epochs,batch_size,learning_rate)
simple_mod.fit_SVRG(opt,epochs,warm_epochs,X_train.shape[0],batch_size,learning_rate)

0 	 2.152837038040161
1 	 1.9950679540634155
2 	 1.9186946153640747
3 	 1.84818696975708
4 	 1.7446726560592651
5 	 1.6252782344818115
6 	 1.5430420637130737
7 	 1.5173792839050293
8 	 1.499821424484253
9 	 1.4892243146896362
Test set 	 0.931
7 	 1.4814927577972412
8 	 1.4791216850280762
9 	 1.4777387380599976
10 	 1.4771500825881958
11 	 1.4767204523086548
Test set 	 0.945
9 	 1.4711124897003174
10 	 1.470357894897461
11 	 1.4697270393371582
12 	 1.4691221714019775
13 	 1.4685349464416504
Test set 	 0.952
11 	 1.4695355892181396
12 	 1.4691652059555054
13 	 1.4688200950622559
14 	 1.4684512615203857
15 	 1.4680485725402832
Test set 	 0.956
13 	 1.4669963121414185
14 	 1.4667611122131348
15 	 1.4665948152542114
16 	 1.4664931297302246
17 	 1.466439127922058
Test set 	 0.962


##### Define the CNN architecture

In [191]:
class NeuralNet(nn.Module):
    
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 24, kernel_size=5, stride=1, padding=2)
        self.max1 = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.bn1 = nn.BatchNorm2d(24)
        self.conv2 = nn.Conv2d(24, 48, kernel_size=5, stride=1, padding=2)
        self.max2 = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.bn2 = nn.BatchNorm2d(48)
        self.conv3 = nn.Conv2d(48, 64, kernel_size=5, stride=1, padding=2)
        self.max3 = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.bn3 = nn.BatchNorm2d(64)
        self.linear4 = nn.Linear(64*3*3,256)
        self.bn4 = nn.BatchNorm1d(256)
        self.linear5 = nn.Linear(256,10)
        
        self.number_params = 18
        
        self.mu = [None] * self.number_params
        
        self.copy_snapshot()
 
    def forward(self, x):
        #print("--------FORWARD---------")
        x = torch.relu(self.conv1(x))
        #print("conv1 :" , x.shape)
        x = self.max1(x)
        x = self.bn1(x)
        #print("max1 :" , x.shape)
        x = torch.relu(self.conv2(x))
        #print("conv2 :" , x.shape)
        x = self.max2(x)
        x = self.bn2(x)
        #print("max2 :" , x.shape)
        x = torch.relu(self.conv3(x))
        #print("conv3 :" , x.shape)
        x = self.max3(x)
        x = self.bn3(x)
        #print("max3 :" , x.shape)
        x = self.linear4(torch.relu(x.reshape(x.shape[0],-1)))
        #print("linear4 :" , x.shape)
        x = self.bn4(x)
        x = self.linear5(torch.softmax(x,1))
        #print("linear5 :" , x.shape)
        return x
    
    def forward_snapshot(self, x):
        #print("--------FORWARD---------")
        x = torch.relu(self.conv1_snapshot(x))
        #print("conv1 :" , x.shape)
        x = self.max1(x)
        x = self.bn1_snapshot(x)
        #print("max1 :" , x.shape)
        x = torch.relu(self.conv2_snapshot(x))
        #print("conv2 :" , x.shape)
        x = self.max2(x)
        x = self.bn2_snapshot(x)
        #print("max2 :" , x.shape)
        x = torch.relu(self.conv3_snapshot(x))
        #print("conv3 :" , x.shape)
        x = self.max3(x)
        x = self.bn3_snapshot(x)
        #print("max3 :" , x.shape)
        x = self.linear4_snapshot(torch.relu(x.reshape(x.shape[0],-1)))
        #print("linear4 :" , x.shape)
        x = self.bn4_snapshot(x)
        x = self.linear5_snapshot(torch.softmax(x,1))
        #print("linear5 :" , x.shape)
        return x
    
    def copy_snapshot(self):
        self.conv1_snapshot = copy.deepcopy(self.conv1)
        self.bn1_snapshot = copy.deepcopy(self.bn1)
        self.conv2_snapshot = copy.deepcopy(self.conv2)
        self.bn2_snapshot = copy.deepcopy(self.bn2)
        self.conv3_snapshot = copy.deepcopy(self.conv3)
        self.bn3_snapshot = copy.deepcopy(self.bn3)
        self.linear4_snapshot = copy.deepcopy(self.linear4)
        self.bn4_snapshot = copy.deepcopy(self.bn4)
        self.linear5_snapshot = copy.deepcopy(self.linear5)

        i=0
        for param in self.parameters():
            if (i < self.number_params) :
                self.mu[i] = torch.zeros(param.shape)
                i+=1

    def update_SGD(self, lr=1):
        params = list(self.parameters())
        for i in range(self.number_params):
            params[i].data.copy_(params[i].data - lr * params[i].grad.data)

    def update_SVRG(self,lr):
        params = list(self.parameters())
        for i in range(self.number_params):
            params[i].data.copy_(params[i].data - lr * (params[i].grad.data - params[i+self.number_params].grad.data + self.mu[i].data))       

    def update_mu(self,batch_size):
        params = list(self.parameters())
        for i in range(len(self.mu)):
            self.mu[i].data.copy_(self.mu[i].data + params[i+self.number_params].grad.data / 5)
                            
    def fit_SVRG(self,optimizer,epochs,warm_epochs,batch_size,lr):
        n = X_train.shape[0]
        model.train()
        
        #Warm start
        for epoch in range(warm_epochs):
            for i in range((n - 1) // batch_size + 1):
                optimizer.zero_grad()
                X = X_train[ i * batch_size : (i+1) * batch_size ]
                Y = Y_train[ i * batch_size : (i+1) * batch_size ]
                pred = self.forward( X )
                loss = loss_func( pred , Y )
                loss.backward()
                self.update_SGD(10)
                
            print(epoch,"\t",loss.item())

        self.copy_snapshot()
    
        for epoch in range(epochs):
            model.train()
            #update mu
            for i in range((n - 1) // batch_size + 1):
                optimizer.zero_grad()
                X = X_train[ i * batch_size : (i+1) * batch_size ]
                Y = Y_train[ i * batch_size : (i+1) * batch_size ]
                pred = self.forward_snapshot( X )
                loss_snapshot = loss_func( pred , Y )
                loss_snapshot.backward()
                self.update_mu(batch_size)
            
            
            for m in range(5):
                for i in range((n - 1) // batch_size + 1):
                    optimizer.zero_grad()
                    
                    #Snapshot gradient computation
                    X = X_train[ i * batch_size : (i+1) * batch_size ]
                    Y = Y_train[ i * batch_size : (i+1) * batch_size ]
                    pred = self.forward_snapshot( X )
                    loss_snapshot = loss_func( pred , Y )
                    loss_snapshot.backward()
                    
                    #'real' gradient computation
                    X = X_train[ i * batch_size : (i+1) * batch_size ]
                    Y = Y_train[ i * batch_size : (i+1) * batch_size ]
                    pred = self.forward( X )
                    loss = loss_func( pred , Y )
                    loss.backward()
                    self.update_SVRG(lr)
                    
           
                print(epoch * 2 + m+warm_epochs,"\t",loss.item())
            
            self.copy_snapshot()
            with torch.no_grad():
                model.eval()
                print("Test set \t", round(accuracy( model.forward(X_test) , Y_test).item(),3))

    def fit_SGD(self,optimizer,epochs,batch_size,lr):
        n = X_train.shape[0]
       
        for epoch in range(epochs):
            model.train()
            for i in range((n - 1) // batch_size + 1):
                optimizer.zero_grad()
                X = X_train[ i * batch_size : (i+1) * batch_size ]
                Y = Y_train[ i * batch_size : (i+1) * batch_size ]
                pred = self.forward( X )
                loss = loss_func( pred , Y )
                loss.backward()
                self.update_SGD(lr)
                
            print(epoch,"\t",loss.item())

            with torch.no_grad():
                model.eval()
                print("Test set \t", round(accuracy( model.forward(X_test) , Y_test).item(),3))

In [192]:
model = NeuralNet()
opt = optim.SGD(model.parameters(), lr=10)
epochs = 2
warm_epochs = 5
batch_size = 100
learning_rate = 0.1

model.fit_SVRG(opt,epochs,warm_epochs,batch_size,learning_rate)

0 	 2.327699661254883
1 	 2.311671495437622
2 	 2.2557857036590576
3 	 1.9566709995269775
4 	 1.9679818153381348
5 	 1.7659927606582642
6 	 1.854134202003479
7 	 1.9519540071487427
8 	 2.063992977142334
9 	 2.209808588027954
Test set 	 0.171
7 	 2.315607786178589
8 	 2.581749200820923
9 	 3.146223545074463
10 	 4.1037116050720215
11 	 5.4671831130981445
Test set 	 0.096


###### Load, Preprocess and predict test set from Kaggle

In [104]:
#Load data from CSV
test = pd.read_csv('../data/MNIST/test.csv')
test_tensor = torch.tensor(test.values)

#Preprocess
test_tensor = (test_tensor.to(dtype=torch.float32) / test_tensor.max().to(dtype=torch.float32))
test_tensor = test_tensor.reshape(test_tensor.shape[0],1,28,28)

#Predict
test_tensor = model.forward(test_tensor)
test_tensor = test_tensor.argmax(1)

##### Save predictions to a csv file

In [19]:
#Convert to a numpy array
arr = test_tensor.numpy()

# write CSV
np.savetxt('../data/MNIST/predictions.csv', arr)

NameError: name 'test_tensor' is not defined

In [78]:
for i in range(len(model.mu)):
    print(model.mu[i].shape)

torch.Size([24, 1, 5, 5])
torch.Size([24])
torch.Size([48, 24, 5, 5])
torch.Size([48])
torch.Size([64, 48, 5, 5])
torch.Size([64])
torch.Size([256, 576])
torch.Size([256])
torch.Size([10, 256])
torch.Size([10])
torch.Size([24, 1, 5, 5])
torch.Size([24])
torch.Size([48, 24, 5, 5])
torch.Size([48])
torch.Size([64, 48, 5, 5])
torch.Size([64])
torch.Size([256, 576])
torch.Size([256])
torch.Size([10, 256])
torch.Size([10])


In [147]:
for param in model.parameters():
    print(param.shape)

torch.Size([24, 1, 5, 5])
torch.Size([24])
torch.Size([24])
torch.Size([24])
torch.Size([48, 24, 5, 5])
torch.Size([48])
torch.Size([48])
torch.Size([48])
torch.Size([64, 48, 5, 5])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([256, 576])
torch.Size([256])
torch.Size([256])
torch.Size([256])
torch.Size([10, 256])
torch.Size([10])
torch.Size([24, 1, 5, 5])
torch.Size([24])
torch.Size([24])
torch.Size([24])
torch.Size([48, 24, 5, 5])
torch.Size([48])
torch.Size([48])
torch.Size([48])
torch.Size([64, 48, 5, 5])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([256, 576])
torch.Size([256])
torch.Size([256])
torch.Size([256])
torch.Size([10, 256])
torch.Size([10])


In [83]:
params = list(model.parameters())
params[10].grad.data     


AttributeError: 'NoneType' object has no attribute 'data'