For classification problems, we do not need to use MSE error like regression problems, there is a better loss function called the softmax cross entropy loss
Softmax : is partial to the max of all values in a sequence. So, this leads to steeper gradients while training the model. also, it normalizes the sequence returning the probability values. \
Softmax is coupled with cross entropy loss which penalizes the model when we have lower probability values for target being 1 and higher probability values for target being 0. \
softmax(x) = $\frac{e^{x_i}}{\sum _i e^{x_i}}$ \
Cross entropy loss = $(y_i \times \log{(softmax(x))}) \times ((1-y_i) \times \log{(1-softmax(x))})$

In [277]:
import numpy as np
from typing import Callable, List, Tuple, Dict

In [278]:
class Operation(object):
    '''
    Base class for an operation in a neural network
    '''
    def __init__(self):
        pass
    
    def forward(self, input_ : np.ndarray):
        
        '''
        Stores input in the self.input attribute. 
        store output of forward computation is self.output attribute
        '''

        self.input_ = input_
        self.output = self._output()
        return self.output
    
    def backward(self, output_grad : np.ndarray) -> np.ndarray:
        '''
        Calls the self._input_grad() function
        '''
        assert(self.output.shape == output_grad.shape)
        self.input_grad = self._input_grad(output_grad)
        assert(self.input_grad.shape == self.input_.shape)
        return self.input_grad
    
    def _output(self) -> np.ndarray:
        '''
        the output method must be defined for each operation
        '''
        raise NotImplementedError()
    
    def _input_grad(self,output_grad : np.ndarray) -> np.ndarray:
        '''
        the input_grad method must be defined for each operation
        '''
        raise NotImplementedError()



In [279]:
class ParamOperation(Operation):
    def __init__(self,param : np.ndarray) -> np.ndarray:
        super().__init__()
        self.param = param
    
    def backward(self, output_grad : np.ndarray) -> np.ndarray:
        assert(self.output.shape == output_grad.shape)
    
        self.input_grad = self._input_grad(output_grad)
        self.param_grad = self._param_grad(output_grad)
        assert(self.input_grad.shape == self.input_.shape)
        assert(self.param_grad.shape == self.param.shape)

        return self.input_grad
    
    def _param_grad(self,output_grad : np.ndarray) -> np.ndarray :
        raise NotImplementedError()
  

In [280]:
class weightMultiply(ParamOperation):
    def __init__(self, W : np.ndarray):
        super().__init__(W)
    
    def _output(self) -> np.ndarray :
        return np.dot(self.input_, self.param)
        
    def _input_grad(self,output_grad : np.ndarray) -> np.ndarray:
        return np.dot(output_grad, np.transpose(self.param, (1,0)))
    
    def _param_grad(self,output_grad : np.ndarray) -> np.ndarray:
        return np.dot(np.transpose(self.input_,(1,0)), output_grad)
    

In [281]:
# Addition of bias term
class BiasAdd(ParamOperation):
    def __init__(self,B:np.ndarray):
        assert B.shape[0] == 1
        super().__init__(B)
    
    def _output(self):
        return self.input_ + self.param
    
    def _input_grad(self, output_grad : np.ndarray) -> np.ndarray :
        return np.ones_like(self.input_) *  output_grad
    
    def _param_grad(self, output_grad : np.ndarray) -> np.ndarray :
        param_grad = np.ones_like(self.param) * output_grad
        return np.sum(param_grad, axis=0).reshape(1, self.param.shape[1])

In [282]:
# sigmoid activation layer

class Sigmoid(Operation):
    def __init__(self) -> None:
        super().__init__()
    
    def _output(self) -> np.ndarray:
        return (1.0/(1.0+np.exp(-1.0 * self.input_)))
    
    def _input_grad(self, output_grad : np.ndarray) -> np.ndarray:
        sigmoid_backward = self.output * (1-self.output) # derivative of sigmoid(x) = sigmoid(x) * (1-sigmoid(x))
        return (sigmoid_backward * output_grad)

class Linear(Operation):
    def __init__(self) -> None :
        super().__init__()
    
    def _output(self) -> np.ndarray:
        return self.input_
    
    def _input_grad(self, output_grad:np.ndarray) -> np.ndarray:
        return output_grad


In [283]:
# abstract layer class
class Layer(object):
    def __init__(self, neurons : int) :
        self.first = True # first layer or not
        self.neurons = neurons
        self.params : List[np.ndarray] = []
        self.param_grads : List[np.ndarray] = []
        self.operations : List[Operation] = []


    
    def _setup_layer(self, input_ : np.ndarray) -> None:
        raise NotImplementedError() # to be filled in derived class
    
    def forward(self, input_ : np.ndarray) -> np.ndarray : 
        if self.first:
            self._setup_layer(input_)
            self.first = False
            
        self.input_ = input_
        for operation in self.operations: 
            input_ = operation.forward(input_)
        
        self.output = input_
        return self.output
    
    def backward(self, output_grad : np.ndarray) -> np.ndarray :
        assert(self.output.shape == output_grad.shape)

        for operation in reversed(self.operations):
            output_grad = operation.backward(output_grad)
        
        input_grad = output_grad
        self._param_grads()
        return input_grad
    
    def _param_grads(self):
        self.param_grads = []
        for operation in self.operations:
            if issubclass(operation.__class__, ParamOperation):
                self.param_grads.append(operation.param_grad)
            
    def _params(self):
        self.params = []
        for operation in self.operations:
            if issubclass(operation.__class__, ParamOperation):
                self.params.append(operation.param)

        
        
        
    


In [284]:
class Dense(Layer):
    def __init__(self, neurons : int, activation : Operation = Sigmoid()) -> None:
        super().__init__(neurons)
        self.activation = activation

    
    def _setup_layer(self, input_:np.ndarray) -> None:
        if self.seed:
            np.random.seed(self.seed)
        
        self.params = []

        self.params.append(np.random.randn(input_.shape[1],self.neurons))
        self.params.append(np.random.randn(1,self.neurons))
        self.operations = [weightMultiply(self.params[0]),
                           BiasAdd(self.params[1]),
                           self.activation]
        
        return None


        

In [285]:
class Loss(object):
    '''
    Loss function of the neural network
    '''
    def __init__(self):
        pass

    def forward(self, prediction : np.ndarray, target : np.ndarray) -> float :
        assert(prediction.shape == target.shape)
        self.prediction = prediction
        self.target = target
        loss_value = self._output()
        return loss_value
    
    def backward(self) -> np.ndarray :
        self.input_grad = self._input_grad()
        assert(self.input_grad.shape == self.prediction.shape)
        return self.input_grad
    
    def _output(self) -> float:
        raise NotImplementedError()
    
    def _input_grad(self) -> np.ndarray:
        raise NotImplementedError()


In [286]:
# Mean squared loss : a subclass of Loss class

class MeanSquaredLoss(Loss):
    def __init__(self):
        super().__init__()
    
    def _output(self) -> float:
        return np.sum(np.power((self.prediction-self.target),2)) / self.prediction.shape[0]
    
    def _input_grad(self) -> np.ndarray:
        return 2.0 * (self.prediction-self.target) / self.prediction.shape[0]
    

In [287]:
# neural network class
class NeuralNetwork(object):
   def __init__(self,layers : List[Layer],
                loss : Loss, seed : float = 1):
      self.layers = layers
      self.loss = loss
      self.seed = seed
      for layer in self.layers:
         setattr(layer, "seed", self.seed)
    
   def forward(self, x_batch : np.ndarray) -> np.ndarray:
      x_out = x_batch
      for layer in self.layers:
         x_out = layer.forward(x_out)
      
      return x_out

   def backward(self, loss_grad : np.ndarray) -> None:
      grad = loss_grad
      for layer in reversed(self.layers):
         grad = layer.backward(grad)
   
   def train_batch(self, X_batch : np.ndarray, y_batch : np.ndarray) -> float:
      predictions = self.forward(X_batch)
      loss = self.loss.forward(predictions, y_batch)
      loss_grad = self.loss.backward()
      self.backward(loss_grad)
      return loss
   
   def params(self):
      for layer in self.layers:
         yield from layer.params # get the parameters from the generator instead of writing a loop and getting the params for the batch

   def param_grads(self):
      for layer in self.layers:
         yield from layer.param_grads
         


   



In [288]:
# next is the optimizer class
# while training, we need to update the parameters based on gradients

class Optimizer(object):
    def __init__(self,lr = 0.001):
        self.lr = lr
        self.first = True
    
    def step(self) -> None :
        pass



In [240]:
class SGD(Optimizer):
    def __init__(self, lr : float = 0.001):
        super().__init__(lr)

    
    def step(self) -> None:
        for (param,param_grad) in zip(self.net.params(), self.net.param_grads()):
            param -= self.lr * param_grad

In [289]:
#helper functions
def permute_data(X, y):
    perm = np.random.permutation(X.shape[0])
    return X[perm], y[perm]

def mae(preds: np.ndarray, actuals: np.ndarray):
    '''
    Compute mean absolute error.
    '''
    return np.mean(np.abs(preds - actuals))

def rmse(preds: np.ndarray, actuals: np.ndarray):
    '''
    Compute root mean squared error.
    '''
    return np.sqrt(np.mean(np.power(preds - actuals, 2)))

In [290]:
#Trainer class -> the class which implements the training loop
from typing import Tuple
from copy import deepcopy
class Trainer(object):
    def __init__(self, net : NeuralNetwork, optim : Optimizer) -> None:
        self.net = net
        self.optim = optim
        self.best_loss = np.inf # initial value for best loss
        setattr(self.optim, "net",self.net)
    
    def generate_batches(self,X:np.ndarray, y:np.ndarray, batch_size : int = 32) -> Tuple[np.ndarray] :
        assert X.shape[0] == y.shape[0] # shape check
        N = X.shape[0]
        for i in range(0,N,batch_size):
            X_batch, y_batch = X[i:i+batch_size], y[i:i+batch_size]
            yield X_batch, y_batch
    
    def fit(self, X_train : np.ndarray, y_train : np.ndarray, 
            X_test : np.ndarray, y_test : np.ndarray, batch_size : int = 32,
            epochs : int = 100, eval_every : int = 100, seed : int = 42, restart : bool  = True) -> None:
        np.random.seed(seed)
        if restart:
            for layer in self.net.layers:
                layer.first = True
            
            self.best_loss = np.inf
        
        for i in range(epochs):
            if (i+1)%eval_every == 0:
                last_model = deepcopy(self.net)
            
            X_train, y_train = permute_data(X_train, y_train)
            batch_generator = self.generate_batches(X_train, y_train)
            for ii, (X_batch, y_batch) in enumerate(batch_generator):
                self.net.train_batch(X_batch, y_batch)
                self.optim.step()
            
            if (i+1)%eval_every == 0:
                test_preds = self.net.forward(X_test)
                loss = self.net.loss.forward(test_preds, y_test)

                if loss < self.best_loss:
                    print(f"Validation loss after {i+1} epochs is {loss:.3f}")
                    self.best_loss = loss
                else:
                    self.net = last_model
                    print(f"""Loss increased after epoch {i+1}, final loss was {self.best_loss:.3f}, using the model from epoch {i+1-eval_every}""")
                    setattr(self.optim, "net", self.net)
                    break
                

all the building blocks from the previous notebook are copied

In [291]:
def normalize(a : np.ndarray) -> np.ndarray :
    other = 1-a
    return np.concatenate([a,1-a], axis=1)

In [292]:
def unnormalize(a : np.ndarray):
    return a[np.newaxis,0]

In [293]:
from scipy import special
def softmax(x: np.ndarray, axis=None) -> np.ndarray:
    return np.exp(x - special.logsumexp(x, axis=axis, keepdims=True))

In [294]:
# Softmax cross entropy class
class SoftmaxCrossEntropy(Loss):
    def __init__(self, eps: float = 1e-9) -> None:
        super().__init__()
        self.eps = eps
        self.single_class = False

    def _output(self) -> float:

        # if the network is just outputting probabilities
        # of just belonging to one class:
        if self.target.shape[1] == 0:
            self.single_class = True

        # if "single_class", apply the "normalize" operation defined above:
        if self.single_class:
            self.prediction, self.target = normalize(self.prediction), normalize(
                self.target
            )

        # applying the softmax function to each row (observation)
        softmax_preds = softmax(self.prediction, axis=1)

        # clipping the softmax output to prevent numeric instability
        self.softmax_preds = np.clip(softmax_preds, self.eps, 1 - self.eps)

        # actual loss computation
        softmax_cross_entropy_loss = -1.0 * self.target * np.log(self.softmax_preds) - (
            1.0 - self.target
        ) * np.log(1 - self.softmax_preds)

        return np.sum(softmax_cross_entropy_loss) / self.prediction.shape[0]

    def _input_grad(self) -> np.ndarray:

        # if "single_class", "un-normalize" probabilities before returning gradient:
        if self.single_class:
            return unnormalize(self.softmax_preds - self.target)
        else:
            return (self.softmax_preds - self.target) / self.prediction.shape[0]


Sigmoid activation function : introduces non-linearity to the model, but it has a downside, the gradients are flat when we reach extremes, near 0 or 1. So, the model does not have gradients to learn. as an alternative, another activation function called Rectified linear unit (ReLU) is used which has opposite strengths and weaknesses. the weakness is that it draws an arbitrary distinction between 0 and 1 and thus is not smooth , but it is compensated by other techniques. It produces larger gradients on average compared to sigmoid. 0.5 compared to 0.25 of sigmoid. 

there is another activation function which is an intermediate between the two, which is the tanh activation function. it is used pretty often in deep learning models. the max gradient is 1 compared to sigmoid's 0.25 and also the function's differential is quite calculable.

Activation functions have their own strenghs and drawbacks. To correct for Relu's drawbacks, there is the Leaky Relu which allows a little negative slope and thus allows flow of gradients backwards better. RELU6 caps the positive value at 6 introducing further non-linearity, but these are complex functions and for simple models, these are not required. for complex deeper models, these maybe employed after experimentation.

In [295]:
class Tanh(Operation):
    def __init__(self) -> None:
        super().__init__()
    
    def _output(self) -> np.ndarray:
        return np.tanh(self.input_)
    
    def _input_grad(self, output_grad : np.ndarray) -> np.ndarray:
        tanh_backward = 1 - self.output * self.output # derivative of tanh(x) = (1-tanh(x)^2)
        return (tanh_backward * output_grad)

To illustrate the benefit of using tanh activation along with Softmax cross entropy for classification problems, we use the famous MNIST digit classification dataset

In [355]:
from tensorflow.keras.datasets import mnist
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

In [356]:
train_images.shape, train_labels.shape

((60000, 28, 28), (60000,))

In [357]:
# preprocessing on images and labels : Feature scaling for images and one-hot encoding for labels
train_images = train_images.reshape(-1,28*28)
test_images = test_images.reshape(-1,28*28)

# we do not scale the dataset to have mean =0 and unit variance using individual image means and variances , this would distort the image
# instead we use the overall mean and variance
# this way we have one mean and one variance for all pixels in image and they all get transformed by the same amout preserving the structural integrity
# if we had used mean of pixel 1 across all images to normalize pixel1, it would alter pixel1 differently and pixel2 would be altered differently thereby destroying the intra-image structure

X_train = (train_images - np.mean(train_images)) / np.std(train_images)
X_test = (test_images - np.mean(test_images)) / np.std(test_images)

In [358]:
# one hot encoding labels

y_train = np.zeros((train_labels.size, train_labels.max()+1))
y_train[np.arange(train_labels.size), train_labels] = 1

y_test = np.zeros((test_labels.size, train_labels.max()+1))
y_test[np.arange(test_labels.size), test_labels] = 1


In [359]:
X_train.shape, y_train.shape
X_test.shape,y_test.shape

((10000, 784), (10000, 10))

let us build a model with MSE loss function and sigmoid activation function. \ 
the number of neurons chosen is our choice. but the thumb rule is to have n = $\sqrt{n_{in} * n_{out}}$. So, h = sqrt(784*10) = 89

In [65]:
model1 = NeuralNetwork(layers = [Dense(neurons = 89, activation = Tanh()), Dense(10, activation=Sigmoid())],loss=MeanSquaredLoss(),seed=42)


In [69]:
optimizer = SGD(0.1)

In [70]:
trainer = Trainer(model1, optimizer)
trainer.fit(X_train, y_train, X_test,y_test,batch_size=64,epochs=50, eval_every=10,seed=42)

Validation loss after 10 epochs is 0.586
Validation loss after 20 epochs is 0.440
Validation loss after 30 epochs is 0.362
Validation loss after 40 epochs is 0.347
Validation loss after 50 epochs is 0.342


In [301]:
def calc_accuracy_model(model, test_set):
    return print(
        '''The model validation accuracy is: {0:.2f}%'''.format(
            np.equal(np.argmax(model.forward(test_set), axis=1), test_labels).sum()
            * 100.0
            / test_set.shape[0]
        )
    )

In [71]:
calc_accuracy_model(model1, X_test)

The model validation accuracy is: 73.79%


In [72]:
model2 = NeuralNetwork(
    layers = [Dense(neurons=89, activation=Tanh()), Dense(neurons=10,activation=Linear())],
    loss = SoftmaxCrossEntropy(), seed=42
)

In [73]:
optimizer = SGD(0.1)
trainer = Trainer(model2, optimizer)
trainer.fit(X_train, y_train, X_test,y_test,batch_size=64,epochs=50, eval_every=10,seed=42)

Validation loss after 10 epochs is 0.548
Validation loss after 20 epochs is 0.514
Validation loss after 30 epochs is 0.508
Validation loss after 40 epochs is 0.507
Loss increased after epoch 50, final loss was 0.507, using the model from epoch 40


In [74]:
calc_accuracy_model(model2, X_test)

The model validation accuracy is: 91.87%


with sigmoid activation function + MSE , we got an accuracy of 73.9 percent after 50 epochs. but for the softmax cross entropy loss, we see an accuracy of 91.87 percent which is way higher than the one with MSE. So, take-away lesson is that a careful selection of loss function can help train the model more effectively.

Momentum : So, we have been using the gradients to update the weights. But, we can incorporate momentum into the weight update. it is based on the concept in physics that velocity does not depend on the applied forces, but also the past velocities. Hence we keep track of the change in gradients in the past and use that to update the gradients. \
Mathematical equation : 
update = $del _t + \mu \times del _{t-1} + \mu^2 \times del_{t-2} $

In [360]:
class SGDMomentum(Optimizer):
    def __init__(
        self, lr: float = 0.01, momentum: float = 0.9
    ) -> None:
        super().__init__(lr)
        self.momentum = momentum

    def step(self) -> None:
        if self.first:
            self.velocities = [np.zeros_like(param) for param in self.net.params()]
            self.first = False

        for param, param_grad, velocity in zip(
            self.net.params(), self.net.param_grads(), self.velocities
        ):
            self._update_rule(param=param, grad=param_grad, velocity=velocity)

    def _update_rule(self, **kwargs) -> None:

        # Update velocity
        kwargs["velocity"] *= self.momentum
        kwargs["velocity"] += self.lr * kwargs["grad"]

        # Use this to update parameters
        kwargs["param"] -= kwargs["velocity"]


In [303]:
# model with same specs as before but with SGDMomentum optimzier instead of SGD
model3 = NeuralNetwork(
    layers=[
        Dense(neurons=89, activation=Tanh()),
        Dense(neurons=10, activation=Linear()),
    ],
    loss=SoftmaxCrossEntropy(),
    seed=42,
)

In [107]:
optimizer2 = SGDMomentum(lr = 0.1, momentum = 0.9)

In [108]:
trainer = Trainer(model3, optimizer2)
trainer.fit(X_train, y_train, X_test,y_test,batch_size=64,epochs=50, eval_every=10,seed=42)

Validation loss after 10 epochs is 0.537
Validation loss after 20 epochs is 0.358
Validation loss after 30 epochs is 0.304
Loss increased after epoch 40, final loss was 0.304, using the model from epoch 30


In [109]:
calc_accuracy_model(model3, X_test)

The model validation accuracy is: 94.91%


with the addition of momentum in our gradient update rule, we see an improvement in accuracy from 91.87% to 94.91 % which is quite a jump. \
We will now try to set an adaptible learning rate instead of a fixed learning rate and see if there is an improvement.

In [361]:
# the next step is to implement learning rate decay
# this is incorporated inside the optimizer
class Optimizer2(object):
    def __init__(self, initial_lr : float = 0.01, final_lr : float = 0., decay_type : str = "exponential"):
        self.lr = initial_lr
        self.final_lr = final_lr
        self.decay_type = decay_type
        self.first = True
    def step(self):
        pass

    def _setup_decay(self) -> None:
        if not self.decay_type:
            return
        elif self.decay_type == "exponential":
            self.decay_per_epoch = np.power(self.final_lr/self.lr, 1.0/(self.max_epochs-1))
        
        elif self.decay_type == "linear":
            self.decay_per_epoch = (self.lr - self.final_lr) / (self.max_epochs - 1)
    
    def _decay_lr(self) -> None : 
        if not self.decay_type:
            return
        elif self.decay_type == "exponential":
            self.lr *= self.decay_per_epoch
        elif self.decay_type == "linear":
            self.lr -= self.decay_per_epoch
        


We have the new optimizer class, we will now have to modify other classes a bit

In [362]:
class SGD2(Optimizer2):
    def __init__(self, lr : float = 0.001, final_lr : float = 0., decay_type : str = "exponential"):
        super().__init__(lr, final_lr, decay_type)

    
    def step(self) -> None:
        for (param,param_grad) in zip(self.net.params(), self.net.param_grads()):
            param -= self.lr * param_grad

In [363]:
class SGDMomentum2(Optimizer2):
    def __init__(
        self, lr: float = 0.01, final_lr : float = 0. , decay_type : str = "exponential", momentum: float = 0.9
    ) -> None:
        super().__init__(lr, final_lr,decay_type)
        self.momentum = momentum

    def step(self) -> None:
        if self.first:
            self.velocities = [np.zeros_like(param) for param in self.net.params()]
            self.first = False

        for param, param_grad, velocity in zip(
            self.net.params(), self.net.param_grads(), self.velocities
        ):
            self._update_rule(param=param, grad=param_grad, velocity=velocity)

    def _update_rule(self, **kwargs) -> None:

        # Update velocity
        kwargs["velocity"] *= self.momentum
        kwargs["velocity"] += self.lr * kwargs["grad"]

        # Use this to update parameters
        kwargs["param"] -= kwargs["velocity"]

In [307]:
#Trainer class -> the class which implements the training loop
from typing import Tuple
from copy import deepcopy
class Trainer2(object):
    def __init__(self, net : NeuralNetwork, optim : Optimizer2) -> None:
        self.net = net
        self.optim = optim
        self.best_loss = np.inf # initial value for best loss
        setattr(self.optim, "net",self.net)

    
    def generate_batches(self,X:np.ndarray, y:np.ndarray, batch_size : int = 32) -> Tuple[np.ndarray] :
        assert X.shape[0] == y.shape[0] # shape check
        N = X.shape[0]
        for i in range(0,N,batch_size):
            X_batch, y_batch = X[i:i+batch_size], y[i:i+batch_size]
            yield X_batch, y_batch
    
    def fit(self, X_train : np.ndarray, y_train : np.ndarray, 
            X_test : np.ndarray, y_test : np.ndarray, batch_size : int = 32,
            epochs : int = 100, eval_every : int = 100, seed : int = 42, restart : bool  = True) -> None:
        np.random.seed(seed)
        setattr(self.optim, "max_epochs", epochs)
        self.optim._setup_decay()
        if restart:
            for layer in self.net.layers:
                layer.first = True
            
            self.best_loss = np.inf
        
        for i in range(epochs):
            if (i+1)%eval_every == 0:
                last_model = deepcopy(self.net)
            
            X_train, y_train = permute_data(X_train, y_train)
            batch_generator = self.generate_batches(X_train, y_train)
            for ii, (X_batch, y_batch) in enumerate(batch_generator):
                self.net.train_batch(X_batch, y_batch)
                self.optim.step()
            if self.optim.final_lr:
                self.optim._decay_lr()
            
            if (i+1)%eval_every == 0:
                test_preds = self.net.forward(X_test)
                loss = self.net.loss.forward(test_preds, y_test)

                if loss < self.best_loss:
                    print(f"Validation loss after {i+1} epochs is {loss:.3f}")
                    self.best_loss = loss
                else:
                    self.net = last_model
                    print(f"""Loss increased after epoch {i+1}, final loss was {self.best_loss:.3f}, using the model from epoch {i+1-eval_every}""")
                    setattr(self.optim, "net", self.net)
                    break
                

In [308]:
optimizer = SGDMomentum2(0.15,0.05,"linear", 0.9)

In [309]:
trainer = Trainer2(model3, optimizer)
trainer.fit(X_train, y_train, X_test,y_test,batch_size=64,epochs=50, eval_every=10,seed=42)

KeyboardInterrupt: 

In [60]:
calc_accuracy_model(model3, X_test)

The model validation accuracy is: 96.02%


In [381]:
optimizer2 = SGDMomentum2(0.2,0.05,"exponential", 0.9)

In [311]:
trainer = Trainer2(model3, optimizer2)
trainer.fit(X_train, y_train, X_test,y_test,batch_size=64,epochs=50, eval_every=10,seed=42)

Validation loss after 10 epochs is 0.783
Validation loss after 20 epochs is 0.504
Validation loss after 30 epochs is 0.385
Validation loss after 40 epochs is 0.345
Validation loss after 50 epochs is 0.330


In [312]:
calc_accuracy_model(model3, X_test)

The model validation accuracy is: 95.24%


with an adaptible learning rate, we see that there is a reduction in validation loss with both linear and exponential decay, Also the accuracy has increased to 96.02 percent which is cool

We have added two optimizations to the model : momentum in gradient update, learning rate decay. the next optimization which helps deep learning model perform better is better weight initialization. currently we are initializing the weight of to have unit variance. in mnist dataset, we are also normalizing the inputs. but, after the first matrix multiplication, assuming our input features are independent, we see that the var_out = var_in1 + var_in2 + ... var_nin. \
So, if we have 785 input values, after the first layer matrix multiply and bias add, we get standard deviation to be about 28. So, the values after matrix multiplication operation are spread out and these are then passed to activation functions which squash these values within a range , say -1 to 1. So, most of the values are around 1 or -1 and this is undesirable [especially for larger deep learning models.]

there are quite a few methods to solve this problem. one of the prominent ones is clever initialization of weights. this way, we see that the spread of values problem can be mitigated. We also need to worry about the variance of gradients flowing back from the end. 

if the input neurons is $n_{in}$ and the number of output neurons is $n_{out}$, to scale the variance during forward pass, we need to multiply the scale factor $\frac{1}{n_{in}}$ and during the backward pass, the gradients need to be scaled by $\frac{1}{n_{out}}$. 

In [313]:
class Dense2(Layer):

    def __init__(self, neurons : int, activation : Operation = Sigmoid(), weight_init = "glorot") -> None:
        super().__init__(neurons)
        self.activation = activation
        self.weight_init = weight_init


    
    def _setup_layer(self, input_:np.ndarray) -> None:
        if self.seed:
            np.random.seed(self.seed)

        if self.weight_init == "glorot":
            scale = 2 / (input_.shape[1] + self.neurons)
        else:
            scale = 1.0
        
        self.params = []

        self.params.append(scale * np.random.randn(input_.shape[1],self.neurons))
        self.params.append(np.random.randn(1,self.neurons))
        self.operations = [weightMultiply(self.params[0]),
                           BiasAdd(self.params[1]),
                           self.activation]

        
        return None
    

model with glorot weight initialization

In [314]:
model4 = NeuralNetwork(
    layers=[
        Dense2(neurons=89, activation=Tanh(), weight_init = "glorot" ),
        Dense2(neurons=10, activation=Linear(), weight_init = "glorot"),
    ],
    loss=SoftmaxCrossEntropy(),
    seed=42,
)

In [315]:
trainer = Trainer2(model4, optimizer2)
trainer.fit(X_train, y_train, X_test,y_test,batch_size=64,epochs=50, eval_every=10,seed=42)

Validation loss after 10 epochs is 0.246
Validation loss after 20 epochs is 0.217
Loss increased after epoch 30, final loss was 0.217, using the model from epoch 20


In [316]:
calc_accuracy_model(model4, X_test)

The model validation accuracy is: 96.90%


we see that the loss has reduced to 0.217 and the accuracy has increased to 96.55 percent with the improvements in weight initialization

There is one final optimization : Dropout \
When we try to build a deeper model with more number of layers, neurons per layer etc, the performance worsens. this is because the model overfits on the training data and thus does not generalize well to unseen data. To prevent this, we do something called dropout. i.e we dropout a proportion of neurons from learning during training. this way, there is less probability of the model overfitting on the data. 

During inference, we do not want a proportion of neurons to miss out from firing, so we use all the neurons. But since we disabled p percent neurons from participating during training, the magnitude of values being passed forward was M * (1-p) instead of usual M. To simulate this drop in magnitude, we multiple by (1-p) with all neurons.

let us now code all the classes with support of SGD Momentum, Weight initialization, learning rate decay and Dropout


In [365]:
class Operation_final(object):
    '''
    Base class for an operation in a neural network
    '''
    def __init__(self):
        pass
    
    def forward(self, input_ : np.ndarray, inference : bool = False):
        
        '''
        Stores input in the self.input attribute. 
        store output of forward computation is self.output attribute
        '''
        self.input_ = input_

        self.output = self._output(inference)
        return self.output
    
    def backward(self, output_grad : np.ndarray) -> np.ndarray:
        '''
        Calls the self._input_grad() function
        '''
        assert(self.output.shape == output_grad.shape)
        self.input_grad = self._input_grad(output_grad)
        assert(self.input_grad.shape == self.input_.shape)
        return self.input_grad
    
    def _output(self, inference : bool = False) -> np.ndarray:
        '''
        the output method must be defined for each operation
        '''
        raise NotImplementedError()
    
    def _input_grad(self,output_grad : np.ndarray) -> np.ndarray:
        '''
        the input_grad method must be defined for each operation
        '''
        raise NotImplementedError()

In [366]:
class ParamOperation_final(Operation_final):
    def __init__(self,param : np.ndarray) -> np.ndarray:
        super().__init__()
        self.param = param
    
    def backward(self, output_grad : np.ndarray) -> np.ndarray:
        assert(self.output.shape == output_grad.shape)
    
        self.input_grad = self._input_grad(output_grad)
        self.param_grad = self._param_grad(output_grad)
        assert(self.input_grad.shape == self.input_.shape)
        assert(self.param_grad.shape == self.param.shape)

        return self.input_grad
    
    def _param_grad(self,output_grad : np.ndarray) -> np.ndarray :
        raise NotImplementedError()

In [367]:
class weightMultiply_final(ParamOperation_final):
    def __init__(self, W : np.ndarray):
        super().__init__(W)
    
    def _output(self, inference : bool = False) -> np.ndarray :
        return np.dot(self.input_, self.param)
        
    def _input_grad(self,output_grad : np.ndarray) -> np.ndarray:
        return np.dot(output_grad, np.transpose(self.param, (1,0)))
    
    def _param_grad(self,output_grad : np.ndarray) -> np.ndarray:
        return np.dot(np.transpose(self.input_,(1,0)), output_grad)

In [368]:
# Addition of bias term
class BiasAdd_final(ParamOperation_final):
    def __init__(self,B:np.ndarray):
        assert B.shape[0] == 1
        super().__init__(B)
    
    def _output(self, inference : bool = False):
        return self.input_ + self.param
    
    def _input_grad(self, output_grad : np.ndarray) -> np.ndarray :
        return np.ones_like(self.input_) *  output_grad
    
    def _param_grad(self, output_grad : np.ndarray) -> np.ndarray :
        param_grad = np.ones_like(self.param) * output_grad
        return np.sum(param_grad, axis=0).reshape(1, self.param.shape[1])

In [369]:
# sigmoid activation layer

class Sigmoid_final(Operation_final):
    def __init__(self) -> None:
        super().__init__()
    
    def _output(self, inference : bool = False) -> np.ndarray:
        return (1.0/(1.0+np.exp(-1.0 * self.input_)))
    
    def _input_grad(self, output_grad : np.ndarray) -> np.ndarray:
        sigmoid_backward = self.output * (1-self.output) # derivative of sigmoid(x) = sigmoid(x) * (1-sigmoid(x))
        return (sigmoid_backward * output_grad)

class Linear_final(Operation_final):
    def __init__(self) -> None :
        super().__init__()
    
    def _output(self, inference : bool = False) -> np.ndarray:
        return self.input_
    
    def _input_grad(self, output_grad:np.ndarray) -> np.ndarray:
        return output_grad
    
class Tanh_final(Operation_final):
    def __init__(self) -> None:
        super().__init__()
    
    def _output(self, inference : bool = False) -> np.ndarray:
        return np.tanh(self.input_)
    
    def _input_grad(self, output_grad : np.ndarray) -> np.ndarray:
        tanh_backward = 1 - self.output * self.output # derivative of tanh(x) = (1-tanh(x)^2)
        return (tanh_backward * output_grad)

In [370]:
class Dropout(Operation_final):
    def __init__(self, keep_prob : float = 0.8):
        super().__init__()
        self.keep_prob = keep_prob
    
    def _output(self, inference : bool = False) -> np.ndarray:
        if inference : 
            return self.input_ * self.keep_prob
        else:
            self.mask = np.random.binomial(1, self.keep_prob, size=self.input_.shape)
            return self.input_ * self.mask 
    
    def _input_grad(self,output_grad : np.ndarray) -> np.ndarray:
        return output_grad * self.mask

In [371]:
# abstract layer class
class Layer_final(object):
    def __init__(self, neurons : int, dropout : float = 1.) :
        self.first = True # first layer or not
        self.neurons = neurons
        self.params : List[np.ndarray] = []
        self.param_grads : List[np.ndarray] = []
        self.operations : List[Operation_final] = []
        self.dropout = dropout


    
    def _setup_layer(self, input_ : np.ndarray) -> None:
        raise NotImplementedError() # to be filled in derived class
    
    def forward(self, input_ : np.ndarray, inference : bool = False) -> np.ndarray : 
        if self.first:
            self._setup_layer(input_)
            self.first = False
            
        self.input_ = input_
        
        for operation in self.operations: 
            
            input_ = operation.forward(input_, inference)
        
        self.output = input_
        return self.output
    
    def backward(self, output_grad : np.ndarray) -> np.ndarray :
        assert(self.output.shape == output_grad.shape)

        for operation in reversed(self.operations):
            output_grad = operation.backward(output_grad)
        
        input_grad = output_grad
        self._param_grads()
        return input_grad
    
    def _param_grads(self):
        self.param_grads = []
        for operation in self.operations:
            if issubclass(operation.__class__, ParamOperation_final):
                self.param_grads.append(operation.param_grad)
            
    def _params(self):
        self.params = []
        for operation in self.operations:
            if issubclass(operation.__class__, ParamOperation_final):
                self.params.append(operation.param)

        

In [372]:
class Dense_final(Layer_final):
    def __init__(self, neurons : int, activation : Operation_final = Sigmoid_final(), weight_init = "glorot", dropout : float = 1.) -> None:
        super().__init__(neurons, dropout)
        self.activation = activation
        self.weight_init = weight_init


    
    def _setup_layer(self, input_:np.ndarray) -> None:
        if self.seed:
            np.random.seed(self.seed)

        if self.weight_init == "glorot":
            scale = 2 / (input_.shape[1] + self.neurons)
        else:
            scale = 1.0
        
        self.params = []

        self.params.append(scale * np.random.randn(input_.shape[1],self.neurons))
        self.params.append(np.random.randn(1,self.neurons))
        self.operations = [weightMultiply_final(self.params[0]),
                           BiasAdd_final(self.params[1]),
                           self.activation]
        if self.dropout < 1.:
            self.operations.append(Dropout(self.dropout))

        
        return None 

In [373]:
# neural network class
class NeuralNetwork_final(object):
   def __init__(self,layers : List[Layer_final],
                loss : Loss, seed : float = 1):
      self.layers = layers
      self.loss = loss
      self.seed = seed
      for layer in self.layers:
         setattr(layer, "seed", self.seed)
    
   def forward(self, x_batch : np.ndarray, inference : bool = False) -> np.ndarray:
      x_out = x_batch

      for layer in self.layers:
         x_out = layer.forward(x_out,inference)
      
      return x_out

   def backward(self, loss_grad : np.ndarray) -> None:
      grad = loss_grad
      for layer in reversed(self.layers):
         grad = layer.backward(grad)
   
   def train_batch(self, X_batch : np.ndarray, y_batch : np.ndarray) -> float:
      predictions = self.forward(X_batch)
      loss = self.loss.forward(predictions, y_batch)
      loss_grad = self.loss.backward()
      self.backward(loss_grad)
      return loss
   
   def params(self):
      for layer in self.layers:
         yield from layer.params # get the parameters from the generator instead of writing a loop and getting the params for the batch

   def param_grads(self):
      for layer in self.layers:
         yield from layer.param_grads
         


In [374]:
class Trainer_final(object):
    def __init__(self, net : NeuralNetwork_final, optim : Optimizer2) -> None:
        self.net = net
        self.optim = optim
        self.best_loss = np.inf # initial value for best loss
        setattr(self.optim, "net",self.net)

    
    def generate_batches(self,X:np.ndarray, y:np.ndarray, batch_size : int = 32) -> Tuple[np.ndarray] :
        assert X.shape[0] == y.shape[0] # shape check
        N = X.shape[0]
        for i in range(0,N,batch_size):
            X_batch, y_batch = X[i:i+batch_size], y[i:i+batch_size]
            yield X_batch, y_batch
    
    def fit(self, X_train : np.ndarray, y_train : np.ndarray, 
            X_test : np.ndarray, y_test : np.ndarray, batch_size : int = 32,
            epochs : int = 100, eval_every : int = 100, seed : int = 42, restart : bool  = True) -> None:
        np.random.seed(seed)
        setattr(self.optim, "max_epochs", epochs)
        self.optim._setup_decay()
        if restart:
            for layer in self.net.layers:
                layer.first = True
            
            self.best_loss = np.inf
        
        for i in range(epochs):
            if (i+1)%eval_every == 0:
                last_model = deepcopy(self.net)
            
            X_train, y_train = permute_data(X_train, y_train)
            batch_generator = self.generate_batches(X_train, y_train)
            for ii, (X_batch, y_batch) in enumerate(batch_generator):
                self.net.train_batch(X_batch, y_batch)
                self.optim.step()
            if self.optim.final_lr:
                self.optim._decay_lr()
            
            if (i+1)%eval_every == 0:
                test_preds = self.net.forward(X_test, inference = True)
                loss = self.net.loss.forward(test_preds, y_test)

                if loss < self.best_loss:
                    print(f"Validation loss after {i+1} epochs is {loss:.3f}")
                    self.best_loss = loss
                else:
                    self.net = last_model
                    print(f"""Loss increased after epoch {i+1}, final loss was {self.best_loss:.3f}, using the model from epoch {i+1-eval_every}""")
                    setattr(self.optim, "net", self.net)
                    break

In [375]:
model_dropout = NeuralNetwork_final(
    layers=[
        Dense_final(neurons=89, activation=Tanh_final(), weight_init = "glorot",dropout = 0.8 ),
        Dense_final(neurons=10, activation=Linear_final(), weight_init = "glorot"),
    ],
    loss=SoftmaxCrossEntropy(),
    seed=42,
)

In [346]:
trainer = Trainer_final(model_dropout, optimizer2)
trainer.fit(X_train, y_train, X_test,y_test,batch_size=64,epochs=50, eval_every=10,seed=42)

Validation loss after 10 epochs is 0.198
Validation loss after 20 epochs is 0.179
Validation loss after 30 epochs is 0.175
Validation loss after 40 epochs is 0.172
Loss increased after epoch 50, final loss was 0.172, using the model from epoch 40


In [352]:
calc_accuracy_model(model_dropout, X_test)

The model validation accuracy is: 96.02%


let us try a bigger model -> dropout helps train bigger models

In [376]:
model_dropout_big = NeuralNetwork_final(
    layers=[
        Dense_final(neurons=178, activation=Tanh_final(), weight_init = "glorot",dropout = 0.8 ),
        Dense_final(neurons=46, activation=Tanh_final(), weight_init = "glorot",dropout = 0.8 ),
        Dense_final(neurons=10, activation=Linear_final(), weight_init = "glorot"),
    ],
    loss=SoftmaxCrossEntropy(),
    seed=42,
)

In [380]:
trainer = Trainer_final(model_dropout_big, optimizer2)
trainer.fit(X_train, y_train, X_test,y_test,batch_size=64,epochs=100, eval_every=10,seed=42)

Validation loss after 10 epochs is 0.204
Validation loss after 20 epochs is 0.198
Validation loss after 30 epochs is 0.171
Validation loss after 40 epochs is 0.168
Validation loss after 50 epochs is 0.165
Validation loss after 60 epochs is 0.162
Validation loss after 70 epochs is 0.155
Validation loss after 80 epochs is 0.153
Validation loss after 90 epochs is 0.149
Loss increased after epoch 100, final loss was 0.149, using the model from epoch 90


In [383]:
calc_accuracy_model(model_dropout_big, X_test)

The model validation accuracy is: 96.49%


we seen an improvement over the smaller models with dropout.  we could try with different dropout rates!