# Optimizers
+ SGD
+ Momentum
+ Nestrov
+ AdaGrad
+ RMS Prop
+ Adam

#### Note:
+ this notebook is written in python3


In [0]:
import numpy as np
import os
import pandas as pd
from sklearn.model_selection import train_test_split

## Activation functions

In [0]:
class Activations:
# utility functions
    @staticmethod
    def relu(x):
        return np.maximum(0,x)

    @staticmethod
    def d_relu(x):
        return np.array(x>=0, dtype=np.float)
    
    @staticmethod
    def sigmoid(x):
        return (np.tanh(x/2)+1)/2 

    @staticmethod
    def d_sigmoid(x):
        sig = Activations.sigmoid(x)
        return sig*(1-sig)
    
    
    @staticmethod
    def tanh(x):
        return np.tanh(x)

    @staticmethod
    def d_tanh(x):
        tanhx = Activations.tanh(x)
        return (1+tanhx)*(1-tanhx)
    
    @staticmethod
    def softmax(x): 
        ex = np.exp(x)
        return (ex.T/np.sum(ex, axis=1)).T
    
    @staticmethod
    def d_softmax(x): # returns 2D array
        p = Activations.softmax(x)
        return p*(1-p)
    

### Utility Functions 
+ categorical crossentropy function
+ function to convert labels to onehot vectors (also returns the unique labels - useful for the order of classes in the onehot vector)

In [0]:
class utils:
    @staticmethod
    def categorical_crossentropy(y_true, y_pred):
        return -np.sum(y_true*np.log(y_pred))
    
    @staticmethod
    def to_categorical(labels, num_classes=None):
        y_unique = np.unique(labels)
        if num_classes==None: num_classes = len(y_unique)
        assert num_classes==len(y_unique), 'num_classes is not same as classes found in labels array'
        y_onehot = np.reshape(labels, newshape=(-1,1)) == y_unique
        return y_onehot.astype(np.float), y_unique

## Optimizer Algorithms
Reference: Chapter 8, DeepLearningBook

In [0]:
class optimizer(object):
    def __init__(self): pass
    def interim_update(self, weights): return weights 
    def update(self, weights, gradients): pass

######################################################################################################   
class SGD(optimizer):
    def __init__(self, start_lr, final_lr, decay_iters):
        self.name = 'SGD'
        self.start_lr = start_lr
        self.final_lr = final_lr
        self.lr = start_lr
        self.decay_iters = decay_iters # iteration number till which learning rate should linearly decay
        self.iter_count = 1
        super().interim_update
    
    def update(self, weights, gradients):
        new_weights = weights - (self.lr * gradients)
        alpha = np.minimum(1, self.iter_count/self.decay_iters)
        self.lr = (1-alpha)*self.start_lr + alpha*self.final_lr
        self.iter_count += 1
        return new_weights

######################################################################################################      
class SGD_Momentum(optimizer):
    def __init__(self, lr, alpha):
        self.name = 'sgd_momentum'
        self.lr = lr
        self.alpha = alpha
        self.velocity = 0 #initial velocity
        super().interim_update
        
    def update(self, weights, gradients):
        self.velocity = self.alpha*self.velocity - self.lr*gradients
        new_weights = weights + self.velocity
        return new_weights
    
######################################################################################################   
class SGD_NesterovMomentum(optimizer):
    def __init__(self, lr, alpha):
        self.name = 'sgd_nesterov_momentum'
        self.lr = lr
        self.alpha = alpha
        self.velocity = 0 #initial velocity
        
    def interim_update(self, weights):
        interim_weights = weights + self.alpha*self.velocity
        return interim_weights
    
    def update(self, weights, gradients):
        self.velocity = self.alpha*self.velocity - self.lr*gradients
        new_weights = weights + self.velocity
        return new_weights
######################################################################################################   
class AdaGrad(optimizer):
    def __init__(self, lr, epsilon=None):
        self.name = 'adagrad'
        self.lr = lr
        self.epsilon = 1e-7 if epsilon==None else epsilon
        self.cum_sq_grad = 0
        super().interim_update
        
    def update(self, weights, gradients):
        self.cum_sq_grad = self.cum_sq_grad + gradients**2
        d_weights = -self.lr*gradients/(self.epsilon + np.sqrt(self.cum_sq_grad))
        new_weights = weights + d_weights
        return new_weights
    
######################################################################################################   
class RMSProp(optimizer):
    def __init__(self, lr, decay_rate,  epsilon=None):
        self.name = 'rmsprop'
        self.lr = lr
        self.decay_rate = decay_rate
        self.epsilon = 1e-6 if epsilon==None else epsilon
        self.cum_sq_grad = 0
        super().interim_update
        
    def update(self, weights, gradients):
        self.cum_sq_grad = self.decay_rate*self.cum_sq_grad + (1-self.decay_rate)*gradients**2
        d_weights = -self.lr*gradients/np.sqrt(self.cum_sq_grad + self.epsilon)
        new_weights = weights + d_weights
        return new_weights

######################################################################################################  
# reference: Algorithm 8.7 of Deeplearningbook 
class Adam(optimizer):
    def __init__(self, lr, beta1, beta2, epsilon=None):
        self.name = 'adam'
        self.lr = lr
        self.beta1, self.beta2 = beta1, beta2
        self.epsilon = 1e-8 if epsilon==None else epsilon #suggested default 1e-8
        self.s = self.r = 0 # intialize first and second moments to zero
        self.t = 0 # time step
        super().interim_update
        
    def update(self, weights, gradients):
        self.t += 1
        self.s = self.beta1*self.s + (1-self.beta1)*gradients
        self.r = self.beta2*self.r + (1-self.beta2)*(gradients**2)
        s_hat = self.s/(1-self.beta1**self.t)
        r_hat = self.r/(1-self.beta2**self.t)
        d_weights = -self.lr*s_hat/(np.sqrt(r_hat)+self.epsilon)
        new_weights = weights + d_weights
        return new_weights

## Multilayer Perceptron 
In this implementation one can define arbitrary number of hidden layers with desired activation functions - gradients are computed with respect to those activation functions but only for crossentropy loss function. 

In [0]:
class MLP(object):
    def __init__(self, dims, activations):
        self.dims = dims
        self.input_dim = self.dims[0]
        self.hidden_dims = self.dims[1:-1]
        self.output_dim = self.dims[-1]
        self.weights = [np.random.normal(0, 0.5, size=(self.dims[i+1], self.dims[i])) for i in range(len(self.dims)-1)]
        self.biases = [np.random.normal(0, 0.05, size=(self.dims[i+1])) for i in range(len(self.dims)-1)]
        self.activations = activations
        self.d_func = []
        for func in self.activations:
            if func==Activations.sigmoid: self.d_func.append(Activations.d_sigmoid)
            elif func==Activations.relu: self.d_func.append(Activations.d_relu)
            elif func==Activations.tanh: self.d_func.append(Activations.d_tanh)
            elif func==Activations.softmax: self.d_func.append(Activations.d_softmax)
        
    def _layer(self, x_batch, W, b, activation=None):
        if activation==None: activation= lambda a:a
        return activation(np.matmul(x_batch, W.T)+b)
    
    def forward(self, batch):
        z = batch
        layers = [z]
        act_maps = [z]
        for n in range(len(self.hidden_dims)):
            z = np.matmul(z, self.weights[n].T) + self.biases[n]
            layers.append(z)
            z = self.activations[n](z)
            act_maps.append(z)
        logits = np.matmul(z, self.weights[-1].T) + self.biases[-1]
        layers.append(logits)
        act_maps.append(self.activations[-1](logits))
        return layers, act_maps
        
    # gradients are comupted for crossentropy loss 
    def back_propogate(self, batch, layers, act_maps, y_true, weights, biases):
        
        m = len(batch)
        W_grad = [np.zeros_like(w) for w in weights]
        bias_grad = [np.zeros_like(b) for b in biases]
        
        delta = -(y_true/act_maps[-1]) * self.d_func[-1](layers[-1])
        
        for n in np.arange(1, len(W_grad)+1):
            bias_grad[-n] = np.mean(delta, axis=0) / m
            W_grad[-n] = np.tensordot(delta, act_maps[-n-1], axes=[0,0]) / m
            delta = np.matmul(delta, weights[-n]) * self.d_func[-n](layers[-n-1])
        return W_grad, bias_grad
    
    def train(self, x_train, y_train, epochs, batch_size, optimizer, opt_kwargs, shuffle=True):
        epoch = 1
        N = len(x_train)
        
        # optimizers 
        w_opt, bias_opt = [optimizer(**opt_kwargs) for _ in range(len(self.weights))], [optimizer(**opt_kwargs) for _ in range(len(self.biases))]
        
        while(epoch <= epochs):
            if shuffle:
                indices = np.arange(N)
                np.random.shuffle(indices)
                x_train, y_train = x_train[indices], y_train[indices]
            loss = 0
            for batch in np.arange(0, N, batch_size):
                X, Y = x_train[batch:batch+batch_size], y_train[batch:batch+batch_size]
                
                layers, act_maps = self.forward(X)
                loss += utils.categorical_crossentropy(Y, act_maps[-1])
                weights = [w_opt[i].interim_update(self.weights[i]) for i in range(len(self.weights))]
                biases = [bias_opt[i].interim_update(self.biases[i]) for i in range(len(self.biases))]
                w_gradients, bias_gradients = self.back_propogate(batch=X, layers=layers, act_maps=act_maps, y_true=Y, weights=weights, biases=biases)
                self.weights = [w_opt[i].update(weights[i], w_gradients[i]) for i in range(len(self.weights))]
                self.biases =  [bias_opt[i].update(biases[i], bias_gradients[i]) for i in range(len(self.biases))]
                       
            print('Epoch:', epoch, ' Loss:', loss)
            epoch += 1
        print('Done Training')
        
    def predict(self, x):
        z = x
        for n in range(len(self.hidden_dims)):
            print(self.weights[n].shape)
            z = np.matmul(z, self.weights[n].T) + self.biases[n]
            z = self.activations[n](z)
        logits = np.matmul(z, self.weights[-1].T) + self.biases[-1]
        return self.activations[-1](logits)

## Training and testing

#### Dataset 
Dataset chosen - iris dataset from UCI dataset repository - http://archive.ics.uci.edu/ml/datasets/Iris

In [0]:
data_dir = 'Iris'
df = pd.read_table(os.path.join(data_dir, 'iris.data'), sep=',', header=None)
x_data = np.asarray(df.iloc[:, 0:4]).astype(np.float)
y_data = np.asarray(df.iloc[:, -1])
y_onehot, unique_labels = utils.to_categorical(y_data)


In [0]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_onehot, test_size=0.1)

### Training with different optimizers
Training of same network with different optimizers 

### SGD

In [0]:
model1 = MLP(dims=[4, 10, 10, 3], activations = [Activations.sigmoid]*2+[Activations.softmax])

In [35]:
model1.train(x_train=x_train, y_train=y_train, epochs=500, batch_size=1, optimizer=SGD, opt_kwargs=dict(start_lr=0.01, final_lr=0.001, decay_iters=500))

Epoch: 1  Loss: 184.3920572684449
Epoch: 2  Loss: 158.75679717503942
Epoch: 3  Loss: 151.70341203909936
Epoch: 4  Loss: 149.9981535627772
Epoch: 5  Loss: 149.35691803717154
Epoch: 6  Loss: 149.07869660186984
Epoch: 7  Loss: 148.7495101754555
Epoch: 8  Loss: 148.50396877798866
Epoch: 9  Loss: 148.3090703084251
Epoch: 10  Loss: 148.07269495348112
Epoch: 11  Loss: 147.92906433644353
Epoch: 12  Loss: 147.83338598033498
Epoch: 13  Loss: 147.7910366344366
Epoch: 14  Loss: 147.65218713595428
Epoch: 15  Loss: 147.54235261931598
Epoch: 16  Loss: 147.49275775062554
Epoch: 17  Loss: 147.47926958255175
Epoch: 18  Loss: 147.3441336290091
Epoch: 19  Loss: 147.3312866402224
Epoch: 20  Loss: 147.34488679747537
Epoch: 21  Loss: 147.29710773504297
Epoch: 22  Loss: 147.27424232308442
Epoch: 23  Loss: 147.28109776134048
Epoch: 24  Loss: 147.25997010537364
Epoch: 25  Loss: 147.29478219509087
Epoch: 26  Loss: 147.2451000285618
Epoch: 27  Loss: 147.2013882861328
Epoch: 28  Loss: 147.22635193610898
Epoch: 29 

### SGD Momentum

In [0]:
model2 = MLP(dims=[4, 10, 10, 3], activations = [Activations.sigmoid]*2+[Activations.softmax])

In [38]:
model2.train(x_train=x_train, y_train=y_train, epochs=500, batch_size=10, optimizer=SGD_Momentum, opt_kwargs=dict(lr=0.001, alpha=0.9))

Epoch: 1  Loss: 165.83219722578602
Epoch: 2  Loss: 164.13202673298537
Epoch: 3  Loss: 162.1595235088416
Epoch: 4  Loss: 160.30915289286173
Epoch: 5  Loss: 158.71538122696666
Epoch: 6  Loss: 157.34061098856264
Epoch: 7  Loss: 156.0001579054844
Epoch: 8  Loss: 154.81985364512218
Epoch: 9  Loss: 153.97098042640593
Epoch: 10  Loss: 153.1765048207699
Epoch: 11  Loss: 152.44832511407972
Epoch: 12  Loss: 151.86905777591866
Epoch: 13  Loss: 151.3557174969172
Epoch: 14  Loss: 150.92714554458763
Epoch: 15  Loss: 150.52287962587937
Epoch: 16  Loss: 150.0892389756402
Epoch: 17  Loss: 149.792239423721
Epoch: 18  Loss: 149.57093281925418
Epoch: 19  Loss: 149.35830412020638
Epoch: 20  Loss: 149.13819849089188
Epoch: 21  Loss: 149.04387593875393
Epoch: 22  Loss: 148.91568504845114
Epoch: 23  Loss: 148.84772090796892
Epoch: 24  Loss: 148.83495706914778
Epoch: 25  Loss: 148.80218112162618
Epoch: 26  Loss: 148.73504793912014
Epoch: 27  Loss: 148.63544387896448
Epoch: 28  Loss: 148.56200444353118
Epoch: 2

### SGD Nesterov Momentum

In [0]:
model3 = MLP(dims=[4, 10, 10, 3], activations = [Activations.sigmoid]*2+[Activations.softmax])

In [52]:
model3.train(x_train=x_train, y_train=y_train, epochs=500, batch_size=10, optimizer=SGD_NesterovMomentum, opt_kwargs=dict(lr=0.001, alpha=0.9))

Epoch: 1  Loss: 186.5513057303786
Epoch: 2  Loss: 181.42287995213582
Epoch: 3  Loss: 175.49224478881428
Epoch: 4  Loss: 169.72368171033756
Epoch: 5  Loss: 165.5460499552294
Epoch: 6  Loss: 161.81334510026406
Epoch: 7  Loss: 158.99711475429794
Epoch: 8  Loss: 156.95192388397282
Epoch: 9  Loss: 155.0855441783096
Epoch: 10  Loss: 153.88577343572695
Epoch: 11  Loss: 152.98651932430124
Epoch: 12  Loss: 151.68812065987606
Epoch: 13  Loss: 151.22726391245016
Epoch: 14  Loss: 150.77591889270963
Epoch: 15  Loss: 150.26367390278136
Epoch: 16  Loss: 149.81456498716773
Epoch: 17  Loss: 149.39323472197796
Epoch: 18  Loss: 148.8002856769523
Epoch: 19  Loss: 148.08487906765333
Epoch: 20  Loss: 147.67919642100455
Epoch: 21  Loss: 147.39080057178592
Epoch: 22  Loss: 147.1207807521418
Epoch: 23  Loss: 146.79324537404378
Epoch: 24  Loss: 146.55183344633505
Epoch: 25  Loss: 146.3932664419168
Epoch: 26  Loss: 146.27148179237284
Epoch: 27  Loss: 146.24235292520632
Epoch: 28  Loss: 146.1328504820067
Epoch: 2

### AdaGrad

In [0]:
model4 = MLP(dims=[4, 10, 10, 3], activations = [Activations.sigmoid]*2+[Activations.softmax])

In [55]:
model4.train(x_train=x_train, y_train=y_train, epochs=500, batch_size=10, optimizer=AdaGrad, opt_kwargs=dict(lr=0.001))

Epoch: 1  Loss: 177.99231313676864
Epoch: 2  Loss: 177.70943886929197
Epoch: 3  Loss: 177.4946158609255
Epoch: 4  Loss: 177.3045792520855
Epoch: 5  Loss: 177.13756383557583
Epoch: 6  Loss: 176.91721584537888
Epoch: 7  Loss: 176.87553315152954
Epoch: 8  Loss: 176.79874891735344
Epoch: 9  Loss: 176.68926552507963
Epoch: 10  Loss: 176.58572067480384
Epoch: 11  Loss: 176.45188182680675
Epoch: 12  Loss: 176.40415384023527
Epoch: 13  Loss: 176.328183326931
Epoch: 14  Loss: 176.24601002521086
Epoch: 15  Loss: 176.15330572402735
Epoch: 16  Loss: 176.07834137397023
Epoch: 17  Loss: 176.03787820558435
Epoch: 18  Loss: 176.00223660686652
Epoch: 19  Loss: 175.9465330659627
Epoch: 20  Loss: 175.82808478268754
Epoch: 21  Loss: 175.75567493420414
Epoch: 22  Loss: 175.73330886834356
Epoch: 23  Loss: 175.63324815069143
Epoch: 24  Loss: 175.5886769276922
Epoch: 25  Loss: 175.54598182023662
Epoch: 26  Loss: 175.45962636004535
Epoch: 27  Loss: 175.4519124184603
Epoch: 28  Loss: 175.3802075073928
Epoch: 29

### RMSProp

In [0]:
model5 = MLP(dims=[4, 10, 10, 3], activations = [Activations.sigmoid]*2+[Activations.softmax])

In [86]:
model5.train(x_train=x_train, y_train=y_train, epochs=300, batch_size=10, optimizer=RMSProp, opt_kwargs=dict(lr=0.0001, decay_rate=0.9))

Epoch: 1  Loss: 167.110803620579
Epoch: 2  Loss: 167.12202473612962
Epoch: 3  Loss: 167.10331929657605
Epoch: 4  Loss: 167.1290522717776
Epoch: 5  Loss: 167.13856895136772
Epoch: 6  Loss: 167.1333415510698
Epoch: 7  Loss: 167.1365047628325
Epoch: 8  Loss: 167.17409052950396
Epoch: 9  Loss: 167.17088186813604
Epoch: 10  Loss: 167.19333168281258
Epoch: 11  Loss: 167.1894346826619
Epoch: 12  Loss: 167.21136486020055
Epoch: 13  Loss: 167.20914640595083
Epoch: 14  Loss: 167.23520867233688
Epoch: 15  Loss: 167.1736722137018
Epoch: 16  Loss: 167.24617396677002
Epoch: 17  Loss: 167.20446513095078
Epoch: 18  Loss: 167.2402698116144
Epoch: 19  Loss: 167.25962141726126
Epoch: 20  Loss: 167.28766838290503
Epoch: 21  Loss: 167.2736322103703
Epoch: 22  Loss: 167.2461533983442
Epoch: 23  Loss: 167.2716214395055
Epoch: 24  Loss: 167.29896395050056
Epoch: 25  Loss: 167.2621278009352
Epoch: 26  Loss: 167.300800504341
Epoch: 27  Loss: 167.2982536906611
Epoch: 28  Loss: 167.29218348719726
Epoch: 29  Loss:

### Adam

In [0]:
model6  = MLP(dims=[4,10,10,3], activations=[Activations.sigmoid]*3+[Activations.softmax])

In [88]:
model6.train(x_train=x_train, y_train=y_train, epochs=500, batch_size=20, optimizer=Adam, opt_kwargs=dict(lr=0.0001, beta1=0.7, beta2=0.9, epsilon=1e-8))

Epoch: 1  Loss: 207.1020570460366
Epoch: 2  Loss: 207.0074043157225
Epoch: 3  Loss: 206.9243005314945
Epoch: 4  Loss: 206.82114386369096
Epoch: 5  Loss: 206.72721879592413
Epoch: 6  Loss: 206.63416828907998
Epoch: 7  Loss: 206.54732060657034
Epoch: 8  Loss: 206.45256281431406
Epoch: 9  Loss: 206.34767352204537
Epoch: 10  Loss: 206.26062306723995
Epoch: 11  Loss: 206.16291916652747
Epoch: 12  Loss: 206.0746615399479
Epoch: 13  Loss: 205.96340755148495
Epoch: 14  Loss: 205.85929167445082
Epoch: 15  Loss: 205.74980562100384
Epoch: 16  Loss: 205.67432143115653
Epoch: 17  Loss: 205.57095076863715
Epoch: 18  Loss: 205.47213103942445
Epoch: 19  Loss: 205.38243111251896
Epoch: 20  Loss: 205.28650745898597
Epoch: 21  Loss: 205.19975247457955
Epoch: 22  Loss: 205.09622472849443
Epoch: 23  Loss: 204.99481768785375
Epoch: 24  Loss: 204.90515218619856
Epoch: 25  Loss: 204.81166033983772
Epoch: 26  Loss: 204.69656577343855
Epoch: 27  Loss: 204.5851596864015
Epoch: 28  Loss: 204.48857212047116
Epoch: