# L1 and L2 Regularization

In [32]:
import numpy as np
import nnfs
import matplotlib.pyplot as plt
from nnfs.datasets import spiral_data

# Dense Layer

In [33]:
# creating Dense Layer class  with backpropogation
class Dense:
    """Fully connected layer with optional L1/L2 penalties on its parameters.

    Weights are sampled from a standard normal distribution scaled by 0.01
    and biases start at zero. Each ``*_regularizer_*`` argument is the
    strength of the matching penalty; penalties whose strength is not
    greater than 0 contribute nothing to the gradients.
    """

    def __init__(
        self,
        n_inputs,
        n_neurons,
        weight_regularizer_l1=0,
        weight_regularizer_l2=0,
        bias_regularizer_l1=0,
        bias_regularizer_l2=0,
    ):
        super().__init__()
        # Trainable parameters.
        self.weights = 0.01 * np.random.randn(n_inputs, n_neurons)
        self.biases = np.zeros((1, n_neurons))
        # Regularization strengths, grouped by penalty type.
        self.weight_regularizer_l1 = weight_regularizer_l1
        self.bias_regularizer_l1 = bias_regularizer_l1
        self.weight_regularizer_l2 = weight_regularizer_l2
        self.bias_regularizer_l2 = bias_regularizer_l2

    @staticmethod
    def _l1_sign(params):
        """Return an array like ``params`` holding -1 for negatives, +1 elsewhere."""
        signs = np.ones_like(params)
        signs[params < 0] = -1
        return signs

    def forward(self, inputs):
        """Store ``inputs`` and compute ``self.outputs = inputs . weights + biases``."""
        # The inputs are kept because backward() needs them for the weight gradient.
        self.inputs = inputs
        self.outputs = np.dot(inputs, self.weights) + self.biases

    def backward(self, dvalues):
        """Compute ``dweights``, ``dbiases`` and ``dinputs`` from the upstream gradient.

        ``dvalues`` is the gradient of the loss with respect to this layer's
        outputs. Regularization gradients are added on top of the data
        gradients for every penalty whose strength is greater than 0.
        """
        # Data-loss gradients with respect to the parameters.
        self.dweights = np.dot(self.inputs.T, dvalues)
        self.dbiases = np.sum(dvalues, axis=0, keepdims=True)

        # Weight penalties: d|w|/dw = sign(w) for L1, d(w^2)/dw = 2w for L2.
        if self.weight_regularizer_l1 > 0:
            self.dweights += self.weight_regularizer_l1 * self._l1_sign(self.weights)
        if self.weight_regularizer_l2 > 0:
            self.dweights += 2 * self.weight_regularizer_l2 * self.weights

        # Bias penalties, same derivatives as above.
        if self.bias_regularizer_l1 > 0:
            self.dbiases += self.bias_regularizer_l1 * self._l1_sign(self.biases)
        if self.bias_regularizer_l2 > 0:
            self.dbiases += 2 * self.bias_regularizer_l2 * self.biases

        # Gradient passed on to whatever produced this layer's inputs.
        self.dinputs = np.dot(dvalues, self.weights.T)

# ReLU Layer

In [34]:
# creating relu class with backpropogation
class ReLU:
    """Rectified linear activation: keeps positive inputs and zeroes the rest."""

    def forward(self, inputs):
        """Store ``inputs`` and set ``self.output`` to ``max(0, inputs)`` element-wise."""
        # Kept so backward() knows which positions were switched off.
        self.inputs = inputs
        self.output = np.maximum(0, inputs)

    def backward(self, dvalues):
        """Set ``self.dinputs`` to ``dvalues`` with zeros where the input was <= 0."""
        switched_off = self.inputs <= 0
        # Work on a copy so the caller's gradient array is left untouched.
        gradient = dvalues.copy()
        gradient[switched_off] = 0
        self.dinputs = gradient

# Softmax Layer

In [35]:
# creating softmax class
class Softmax:
    """Softmax activation applied independently to each row of the input."""

    def forward(self, inputs):
        """Set ``self.output`` to the row-wise softmax of ``inputs``."""
        # Subtracting the row maximum keeps np.exp from overflowing; the
        # shift cancels out in the normalization below.
        shifted = inputs - np.max(inputs, axis=1, keepdims=True)
        exponentials = np.exp(shifted)
        self.output = exponentials / np.sum(exponentials, axis=1, keepdims=True)

# Common Loss Function

In [36]:
# Common loss class
class Loss:
    """Base class for losses.

    Subclasses provide ``forward(output, y)`` returning per-sample losses;
    this class adds the mean data loss and the regularization penalty of a
    layer.
    """

    def regularization_loss(self, layer):
        """Return the summed L1/L2 penalty for ``layer``'s weights and biases.

        Only penalties whose strength on ``layer`` is greater than 0 are
        included; with none active the result is 0.
        """
        # (strength, parameters, element-wise magnitude) for each penalty,
        # in the order: weights L1, weights L2, biases L1, biases L2.
        penalty_terms = (
            (layer.weight_regularizer_l1, layer.weights, np.abs),
            (layer.weight_regularizer_l2, layer.weights, np.square),
            (layer.bias_regularizer_l1, layer.biases, np.abs),
            (layer.bias_regularizer_l2, layer.biases, np.square),
        )
        penalty = 0
        for strength, params, magnitude in penalty_terms:
            if strength > 0:
                penalty += strength * np.sum(magnitude(params))
        return penalty

    def calculate(self, output, y):
        """Return the mean of the per-sample losses from ``self.forward``.

        This is the data loss only; the regularization penalty is obtained
        separately via ``regularization_loss``.
        """
        per_sample = self.forward(output, y)
        return np.mean(per_sample)

# Categorical Cross Entropy Loss

In [37]:
class CrossEntropyLoss(Loss):
    """Categorical cross-entropy loss for sparse or one-hot encoded labels."""

    def forward(self, y_pred, y_true):
        """Return the negative log-likelihood of the true class for each sample.

        Args:
            y_pred: predicted class probabilities, one row per sample.
            y_true: 1-D array of class indices or 2-D one-hot array.

        Raises:
            ValueError: if ``y_true`` is neither 1-D nor 2-D.
        """
        samples = len(y_pred)

        # Clip so log(0) never occurs; clipping both ends keeps the mean
        # from being dragged towards either bound.
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)

        label_rank = len(y_true.shape)
        if label_rank == 1:
            # Sparse labels: pick the predicted probability of each target class.
            correct_confidences = y_pred_clipped[range(samples), y_true]
        elif label_rank == 2:
            # One-hot labels: the mask keeps only the target-class probability.
            correct_confidences = np.sum(y_pred_clipped * y_true, axis=1)
        else:
            # Previously this fell through and failed later with an
            # UnboundLocalError on correct_confidences.
            raise ValueError(
                f"y_true must be 1-D (class indices) or 2-D (one-hot), got shape {y_true.shape}"
            )
        return -np.log(correct_confidences)

    def backward(self, dvalues, y_true):
        """Store the gradient of the loss w.r.t. the predictions in ``self.dinputs``.

        Args:
            dvalues: predicted class probabilities, one row per sample.
            y_true: 1-D array of class indices or 2-D one-hot array.
        """
        samples = len(dvalues)
        # The first sample tells us how many classes there are.
        labels = len(dvalues[0])
        # Sparse labels are expanded to one-hot rows.
        if len(y_true.shape) == 1:
            y_true = np.eye(labels)[y_true]
        # Clip with the same bounds as forward(): a probability of exactly 0
        # would otherwise yield inf (target class) or NaN from -0/0
        # (non-target classes) and poison every downstream gradient.
        safe_predictions = np.clip(dvalues, 1e-7, 1 - 1e-7)
        # Gradient of -log(p), averaged over the batch.
        self.dinputs = -y_true / safe_predictions / samples

# Cross Entropy with Softmax

In [38]:
# classifier - Softmax Classifier
# Loss Function - Categorical Cross Entropy Loss Function
class Activation_Softmax_Loss_CategoricalCrossEntropy:
    """Softmax activation fused with categorical cross-entropy loss.

    Combining the two lets the backward pass use the simplified gradient
    ``(predictions - one_hot_targets) / samples``.
    """

    def __init__(self):
        # Output activation and the loss evaluated on its probabilities.
        self.activation = Softmax()
        self.loss = CrossEntropyLoss()

    def forward(self, inputs, y_true):
        """Run the activation on ``inputs``, keep its output, return the loss value."""
        self.activation.forward(inputs)
        # Exposed so callers can read the predicted probabilities.
        self.output = self.activation.output
        return self.loss.calculate(self.output, y_true)

    def backward(self, d_values, y_true):
        """Store the combined softmax/cross-entropy gradient in ``self.dinputs``.

        ``d_values`` holds the predicted probabilities; ``y_true`` may be
        class indices (1-D) or one-hot rows (2-D).
        """
        batch_size = len(d_values)
        # One-hot rows are reduced to class indices.
        if len(y_true.shape) == 2:
            y_true = np.argmax(y_true, axis=1)
        # Copy first so the caller's array is not modified.
        gradient = d_values.copy()
        # Subtract 1 at each sample's target class: predictions - one_hot.
        gradient[range(batch_size), y_true] -= 1
        # Average over the batch.
        self.dinputs = gradient / batch_size

# Gradient Descent Optimizer

In [39]:
# Gradient Descent Optimizer
class GradientDescent:
    """Gradient descent optimizer with optional learning-rate decay and momentum."""

    def __init__(self, learning_rate=1, decay=0., momentum=0.):
        # Base rate, and the (possibly decayed) rate used for the current step.
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.momentum = momentum
        # Number of completed update rounds; drives the decay schedule.
        self.iteration = 0

    def pre_update_params(self):
        """Refresh ``current_learning_rate``; call once before updating any layer."""
        if not self.decay:
            return
        self.current_learning_rate = self.learning_rate * (1. / (1. + self.decay * self.iteration))

    def update_parameters(self, layer):
        """Apply one update to ``layer.weights``/``layer.biases`` in place.

        Reads ``layer.dweights`` and ``layer.dbiases``. With momentum enabled
        the previous update is stored on the layer as ``weight_momentums`` and
        ``bias_momentums``.
        """
        step_size = self.current_learning_rate

        if not self.momentum:
            # Plain gradient descent step.
            weight_step = -step_size * layer.dweights
            bias_step = -step_size * layer.dbiases
        else:
            # First momentum update for this layer: start from zero history.
            if not hasattr(layer, 'weight_momentums'):
                layer.weight_momentums = np.zeros_like(layer.weights)
                layer.bias_momentums = np.zeros_like(layer.biases)

            # Retain a fraction of the previous step and add the new gradient step.
            weight_step = self.momentum * layer.weight_momentums - step_size * layer.dweights
            bias_step = self.momentum * layer.bias_momentums - step_size * layer.dbiases
            layer.weight_momentums = weight_step
            layer.bias_momentums = bias_step

        layer.weights += weight_step
        layer.biases += bias_step

    def post_update_params(self):
        """Advance the iteration counter; call once after all layers are updated."""
        self.iteration += 1

# Adagrad Optimizer

In [40]:
class Adagrad():
    """Adagrad optimizer: per-parameter step sizes from accumulated squared gradients."""

    def __init__(self,learning_rate=1.,decay=0.,epsilon=1e-7):
        super().__init__()
        # Base rate, and the (possibly decayed) rate used for the current step.
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        # Added to the denominator so the division never hits zero.
        self.epsilon = epsilon
        # Number of completed update rounds; drives the decay schedule.
        self.iteration = 0

    def pre_update_params(self):
        """Refresh ``current_learning_rate``; call once before updating any layer."""
        if self.decay:
            self.current_learning_rate = self.learning_rate * (1. / ( 1 + self.decay * self.iteration))

    def update_parameters(self,layer):
        """Apply one Adagrad update to ``layer.weights``/``layer.biases`` in place.

        Reads ``layer.dweights`` and ``layer.dbiases``; keeps its running sums
        on the layer as ``weight_cache`` and ``biases_cache``.
        """
        # A layer without weight_cache is new to this optimizer: start both
        # caches from zero. Otherwise only create what is missing. The other
        # optimizers in this file store their bias state as `bias_cache`, so a
        # layer they touched has weight_cache but no `biases_cache`, which
        # used to raise AttributeError below.
        fresh_layer = not hasattr(layer, 'weight_cache')
        for cache_name, params in (
            ('weight_cache', layer.weights),
            ('biases_cache', layer.biases),
        ):
            if fresh_layer or not hasattr(layer, cache_name):
                setattr(layer, cache_name, np.zeros_like(params))

        # Accumulate squared gradients.
        layer.weight_cache += layer.dweights**2
        layer.biases_cache += layer.dbiases**2

        # Gradient step scaled by the square root of the accumulated squares.
        layer.weights += -self.current_learning_rate * layer.dweights / (np.sqrt(layer.weight_cache) + self.epsilon)
        layer.biases += -self.current_learning_rate * layer.dbiases / (np.sqrt(layer.biases_cache) + self.epsilon)

    def post_update_params(self):
        """Advance the iteration counter; call once after all layers are updated."""
        self.iteration += 1

# RmsProp Optimizer

In [41]:
class RmsProp:
    """RMSProp optimizer: step sizes scaled by a moving average of squared gradients."""

    def __init__(self,learning_rate=0.001,decay=0.,epsilon=1e-7,rho=0.9):
        super().__init__()
        # Base rate, and the (possibly decayed) rate used for the current step.
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        # Fraction of the previous cache kept on each update.
        self.rho = rho
        # Added to the denominator so the division never hits zero.
        self.epsilon = epsilon
        # Number of completed update rounds; drives the decay schedule.
        self.iteration = 0

    def pre_update_params(self):
        """Refresh ``current_learning_rate``; call once before updating any layer."""
        if self.decay:
            self.current_learning_rate = self.learning_rate * (1. / (1. + self.decay * self.iteration))

    def update_parameters(self,layer):
        """Apply one RMSProp update to ``layer.weights``/``layer.biases`` in place.

        Reads ``layer.dweights`` and ``layer.dbiases``; keeps its moving
        averages on the layer as ``weight_cache`` and ``bias_cache``.
        """
        # A layer without weight_cache is new to this optimizer: start both
        # caches from zero. Otherwise only create what is missing. Checking
        # weight_cache alone used to raise AttributeError for a layer that
        # has weight_cache but no bias_cache (e.g. one previously updated by
        # the Adagrad class in this file, which stores `biases_cache`).
        fresh_layer = not hasattr(layer, 'weight_cache')
        for cache_name, params in (
            ('weight_cache', layer.weights),
            ('bias_cache', layer.biases),
        ):
            if fresh_layer or not hasattr(layer, cache_name):
                setattr(layer, cache_name, np.zeros_like(params))

        # Exponential moving average of the squared gradients.
        layer.weight_cache = self.rho * layer.weight_cache + (1-self.rho) * layer.dweights**2
        layer.bias_cache = self.rho * layer.bias_cache + (1-self.rho) * layer.dbiases**2

        # Gradient step scaled by the square root of the moving average.
        layer.weights += -self.current_learning_rate * layer.dweights / (np.sqrt(layer.weight_cache) + self.epsilon)
        layer.biases += -self.current_learning_rate * layer.dbiases / (np.sqrt(layer.bias_cache) + self.epsilon)

    def post_update_params(self):
        """Advance the iteration counter; call once after all layers are updated."""
        self.iteration += 1


# Adam Optimizer

In [42]:
# Adam optimizer
class Adam:
    """Adam optimizer: bias-corrected momentum combined with an RMSProp-style cache."""

    def __init__(self, learning_rate=0.001, decay=0., epsilon=1e-7, beta_1=0.9, beta_2=0.999):
        # Base rate, and the (possibly decayed) rate used for the current step.
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        # Number of completed update rounds; drives decay and bias correction.
        self.iterations = 0
        # Added to the denominator so the division never hits zero.
        self.epsilon = epsilon
        # Retention factors for the momentum and the squared-gradient cache.
        self.beta_1 = beta_1
        self.beta_2 = beta_2

    def pre_update_params(self):
        """Refresh ``current_learning_rate``; call once before updating any layer."""
        if self.decay:
            self.current_learning_rate = self.learning_rate * (1. / (1. + self.decay * self.iterations))

    def update_parameters(self, layer):
        """Apply one Adam update to ``layer.weights``/``layer.biases`` in place.

        Reads ``layer.dweights`` and ``layer.dbiases``; keeps its state on the
        layer as ``weight_momentums``, ``weight_cache``, ``bias_momentums``
        and ``bias_cache``.
        """
        # A layer without weight_cache is new to this optimizer: start all
        # state from zero. Otherwise only create what is missing. Checking
        # weight_cache alone used to raise AttributeError for a layer that
        # already carries caches but no momentum arrays (e.g. one previously
        # updated by the RmsProp or Adagrad classes in this file).
        fresh_layer = not hasattr(layer, 'weight_cache')
        for state_name, params in (
            ('weight_momentums', layer.weights),
            ('weight_cache', layer.weights),
            ('bias_momentums', layer.biases),
            ('bias_cache', layer.biases),
        ):
            if fresh_layer or not hasattr(layer, state_name):
                setattr(layer, state_name, np.zeros_like(params))

        # Moving average of the gradients.
        layer.weight_momentums = self.beta_1 * layer.weight_momentums + (1 - self.beta_1) * layer.dweights
        layer.bias_momentums = self.beta_1 * layer.bias_momentums + (1 - self.beta_1) * layer.dbiases

        # Bias correction: self.iterations is 0 on the first pass, but the
        # correction exponent has to start at 1, hence the +1.
        step = self.iterations + 1
        weight_momentums_corrected = layer.weight_momentums / (1 - self.beta_1 ** step)
        bias_momentums_corrected = layer.bias_momentums / (1 - self.beta_1 ** step)

        # Moving average of the squared gradients.
        layer.weight_cache = self.beta_2 * layer.weight_cache + (1 - self.beta_2) * layer.dweights**2
        layer.bias_cache = self.beta_2 * layer.bias_cache + (1 - self.beta_2) * layer.dbiases**2

        # Same bias correction for the cache.
        weight_cache_corrected = layer.weight_cache / (1 - self.beta_2 ** step)
        bias_cache_corrected = layer.bias_cache / (1 - self.beta_2 ** step)

        # Corrected momentum scaled by the square root of the corrected cache.
        layer.weights += -self.current_learning_rate * weight_momentums_corrected / (np.sqrt(weight_cache_corrected) + self.epsilon)
        layer.biases += -self.current_learning_rate * bias_momentums_corrected / (np.sqrt(bias_cache_corrected) + self.epsilon)

    def post_update_params(self):
        """Advance the iteration counter; call once after all layers are updated."""
        self.iterations += 1


# Training Layer

In [43]:
# Training set: three-class spiral data generated with samples=1000.
# For a smaller training set use instead:
#   X, y = spiral_data(samples=100, classes=3)
X, y = spiral_data(samples=1000, classes=3)

In [44]:
# --- Model -----------------------------------------------------------------
# First dense layer: 2 input features -> 64 neurons, with L2 penalties on
# weights and biases. Unregularized alternative:
#   dense_1 = Dense(2, 64)
dense_1 = Dense(2, 64, weight_regularizer_l2=5e-4, bias_regularizer_l2=5e-4)
# Activation applied to the first dense layer's output.
relu = ReLU()
# Second dense layer: 64 features from the previous layer -> 3 output values.
dense_2 = Dense(64, 3)
# Softmax classifier combined with categorical cross-entropy loss.
loss_activation = Activation_Softmax_Loss_CategoricalCrossEntropy()

# --- Optimizer -------------------------------------------------------------
# Alternatives tried in this notebook (1e-3 means 1 * 10^-3):
#   optimizer = GradientDescent(decay=1e-3, momentum=0.9)
#   optimizer = RmsProp(learning_rate=0.02, decay=1e-5, rho=0.999)
optimizer = Adam(learning_rate=0.02, decay=5e-7)

# --- Training loop ---------------------------------------------------------
for epoch in range(10001):
    # Forward pass: dense -> ReLU -> dense -> softmax + loss.
    dense_1.forward(X)
    relu.forward(dense_1.outputs)
    dense_2.forward(relu.output)
    # Stores the class probabilities on loss_activation.output and returns
    # the mean data loss.
    data_loss = loss_activation.forward(dense_2.outputs, y)

    # Penalty contributed by each dense layer's regularizers.
    regularization_loss = sum(
        loss_activation.loss.regularization_loss(dense_layer)
        for dense_layer in (dense_1, dense_2)
    )

    # Overall loss = data loss + regularization penalty.
    loss = data_loss + regularization_loss

    # Accuracy: predicted class is the index of the highest probability.
    predictions = np.argmax(loss_activation.output, axis=1)

    # One-hot labels are reduced to class indices before comparing.
    if len(y.shape) == 2:
        y = np.argmax(y, axis=1)
    accuracy = np.mean(predictions == y)

    # Report progress every 100 epochs.
    if epoch % 100 == 0:
        print(f'epoch: {epoch}, acc: {accuracy:.3f}, loss: {loss:.3f}, data_loss: {data_loss:.3f}, Learning Rate: {optimizer.current_learning_rate}')

    # Backward pass, in reverse order of the forward pass.
    loss_activation.backward(loss_activation.output, y)
    dense_2.backward(loss_activation.dinputs)
    relu.backward(dense_2.dinputs)
    dense_1.backward(relu.dinputs)

    # Parameter update for both dense layers.
    optimizer.pre_update_params()
    for dense_layer in (dense_1, dense_2):
        optimizer.update_parameters(dense_layer)
    optimizer.post_update_params()

epoch: 0, acc: 0.356, loss: 1.099, data_loss: 1.099, Learning Rate: 0.02
epoch: 100, acc: 0.556, loss: 0.952, data_loss: 0.932, Learning Rate: 0.019999010049002574
epoch: 200, acc: 0.723, loss: 0.779, data_loss: 0.717, Learning Rate: 0.019998010197985302
epoch: 300, acc: 0.741, loss: 0.705, data_loss: 0.627, Learning Rate: 0.019997010446938183
epoch: 400, acc: 0.761, loss: 0.661, data_loss: 0.577, Learning Rate: 0.01999601079584623
epoch: 500, acc: 0.782, loss: 0.629, data_loss: 0.541, Learning Rate: 0.01999501124469445
epoch: 600, acc: 0.786, loss: 0.607, data_loss: 0.517, Learning Rate: 0.01999401179346786
epoch: 700, acc: 0.788, loss: 0.588, data_loss: 0.498, Learning Rate: 0.01999301244215147
epoch: 800, acc: 0.793, loss: 0.572, data_loss: 0.483, Learning Rate: 0.0199920131907303
epoch: 900, acc: 0.804, loss: 0.557, data_loss: 0.469, Learning Rate: 0.019991014039189386
epoch: 1000, acc: 0.828, loss: 0.542, data_loss: 0.454, Learning Rate: 0.019990014987513734
epoch: 1100, acc: 0.83

Accuracy comparison: <br>
Adam = 0.907 <br>
RmsProp = 0.893 <br>
Adagrad = 0.827 <br>
Gradient Descent = 0.757 <br>
Gradient Descent with momentum = 0.940 <br>

In [47]:
# Separate spiral data set (samples=100) used only for validation below.
X_test, y_test = spiral_data(samples=100, classes=3)

In [48]:

# Validation: run the trained network on the held-out data (forward pass only).
dense_1.forward(X_test)
# ReLU on the first dense layer's output.
relu.forward(dense_1.outputs)
# Second dense layer on the ReLU output.
dense_2.forward(relu.output)
# Softmax + cross-entropy: stores the class probabilities on
# loss_activation.output and returns the mean data loss.
loss = loss_activation.forward(dense_2.outputs, y_test)

# Predicted class is the index of the highest probability in each row.
predictions = np.argmax(loss_activation.output, axis=1)

# Reduce one-hot labels to class indices before comparing. This must look at
# y_test: the earlier version checked and converted `y` (the training labels),
# which left y_test untouched.
if len(y_test.shape) == 2:
    y_test = np.argmax(y_test, axis=1)
accuracy = np.mean(predictions == y_test)

print(f"Validation Accuracy: {accuracy:.3f}, Loss:{loss:.3f}")

Validation Accuracy: 0.873, Loss:0.485
