In [1]:
class Optimizer_SGD:
    """Plain gradient-descent optimizer with a fixed learning rate.

    Args:
        learning_rate: Factor each gradient is scaled by before it is
            subtracted from the parameters. Defaults to 1.0.
    """

    def __init__(self, learning_rate=1.0):
        self.learning_rate = learning_rate

    def update_params(self, layer):
        """Move ``layer.weights`` and ``layer.biases`` against their gradients.

        Reads ``layer.dweights`` / ``layer.dbiases`` and updates the
        parameters with ``+=``.
        """
        # Negative step: parameters move opposite to the gradient.
        step = -self.learning_rate
        layer.weights += step * layer.dweights
        layer.biases += step * layer.dbiases

In [2]:
# Stochastic gradient descent (SGD)

In [3]:
import numpy as np 
import nnfs 
from nnfs.datasets import spiral_data 
 
nnfs.init() 
 
 
# Dense layer 
class Layer_Dense:
    """Fully connected layer computing ``inputs . weights + biases``.

    Args:
        n_inputs: Number of rows of the weight matrix.
        n_neurons: Number of columns of the weight matrix and of the
            single-row bias array.
    """

    def __init__(self, n_inputs, n_neurons):
        # Small random weights (scaled by 0.01) and zero biases.
        weight_shape = (n_inputs, n_neurons)
        self.weights = 0.01 * np.random.randn(*weight_shape)
        self.biases = np.zeros((1, n_neurons))

    def forward(self, inputs):
        """Store ``inputs`` and write the layer result to ``self.output``."""
        # The inputs are kept because backward() needs them for dweights.
        self.inputs = inputs
        weighted_sum = np.dot(inputs, self.weights)
        self.output = weighted_sum + self.biases

    def backward(self, dvalues):
        """Compute gradients from ``dvalues``, the gradient of the output.

        Sets ``self.dweights``, ``self.dbiases`` (summed over axis 0,
        dimensions kept) and ``self.dinputs``.
        """
        inputs_t = self.inputs.T
        weights_t = self.weights.T
        # Gradients with respect to the parameters.
        self.dweights = np.dot(inputs_t, dvalues)
        self.dbiases = np.sum(dvalues, axis=0, keepdims=True)
        # Gradient with respect to the layer inputs.
        self.dinputs = np.dot(dvalues, weights_t)


In [4]:
# ReLU activation 
class Activation_ReLU:
    """Rectified linear activation: element-wise ``max(0, x)``."""

    def forward(self, inputs):
        """Store ``inputs`` and write the rectified values to ``self.output``."""
        # The inputs are kept because backward() masks on their sign.
        self.inputs = inputs
        self.output = np.maximum(0, inputs)

    def backward(self, dvalues):
        """Write the gradient of the inputs to ``self.dinputs``.

        ``dvalues`` is copied first, so the caller's array is not modified.
        """
        # Positions whose forward input was zero or negative get no gradient.
        inactive = self.inputs <= 0
        grad = dvalues.copy()
        grad[inactive] = 0
        self.dinputs = grad


In [5]:
# Softmax activation 
class Activation_Softmax:
    """Row-wise softmax activation."""

    def forward(self, inputs):
        """Store ``inputs`` and write row-normalised probabilities to ``self.output``.

        Args:
            inputs: 2-D array; softmax is taken along axis 1.
        """
        # Remember input values
        self.inputs = inputs

        # Subtract the row maximum before exponentiating so the largest
        # exponent is 0; this keeps np.exp from overflowing.
        shifted = inputs - np.max(inputs, axis=1, keepdims=True)
        exp_values = np.exp(shifted)

        # Normalize each row so it sums to 1
        self.output = exp_values / np.sum(exp_values, axis=1, keepdims=True)

    def backward(self, dvalues):
        """Write the gradient of the inputs to ``self.dinputs``.

        For one sample with softmax output ``s`` and upstream gradient
        ``d`` the Jacobian is ``diag(s) - s s^T``, so the product
        ``J . d`` equals ``s * (d - sum(s * d))``. That closed form is
        evaluated for all rows at once instead of building one Jacobian
        per sample.

        The result takes its dtype from the floating-point product with
        ``self.output`` rather than from ``dvalues``, so an integer
        ``dvalues`` array no longer truncates the gradient.

        Args:
            dvalues: Gradient with respect to ``self.output``; same shape
                as ``self.output``.
        """
        dvalues = np.asarray(dvalues)

        # Per-row scalar s . d, kept as a column so it broadcasts
        weighted = np.sum(self.output * dvalues, axis=1, keepdims=True)

        self.dinputs = self.output * (dvalues - weighted)

In [6]:
# Common loss class 
class Loss:
    """Base class for losses; subclasses provide ``forward(output, y)``."""

    def calculate(self, output, y):
        """Return the mean of the per-sample losses from ``self.forward``.

        Args:
            output: Model output passed through to ``forward``.
            y: Ground-truth values passed through to ``forward``.
        """
        sample_losses = self.forward(output, y)
        return np.mean(sample_losses)


# Cross-entropy loss
class Loss_CategoricalCrossentropy(Loss):
    """Categorical cross-entropy for sparse or one-hot labels."""

    # Predictions are clipped to [CLIP_EPSILON, 1 - CLIP_EPSILON] before
    # taking a log or dividing, so an exact 0 cannot produce inf / nan.
    CLIP_EPSILON = 1e-7

    def forward(self, y_pred, y_true):
        """Return the negative log-likelihood of the true class per sample.

        Args:
            y_pred: 2-D array of predicted probabilities, one row per sample.
            y_true: 1-D array of class indices, or 2-D array of one-hot
                rows. Plain lists are accepted.

        Raises:
            ValueError: If ``y_true`` is neither 1-D nor 2-D.
        """
        y_true = np.asarray(y_true)

        # Number of samples in a batch
        samples = len(y_pred)

        # Clip both sides so the mean is not dragged towards either bound
        eps = self.CLIP_EPSILON
        y_pred_clipped = np.clip(y_pred, eps, 1 - eps)

        if y_true.ndim == 1:
            # Sparse labels: pick the probability at each target index
            correct_confidences = y_pred_clipped[range(samples), y_true]
        elif y_true.ndim == 2:
            # One-hot labels: mask out everything but the target column
            correct_confidences = np.sum(y_pred_clipped * y_true, axis=1)
        else:
            raise ValueError(
                'y_true must be 1-D (class indices) or 2-D (one-hot), '
                f'got {y_true.ndim} dimensions'
            )

        return -np.log(correct_confidences)

    def backward(self, dvalues, y_true):
        """Write the gradient of the loss to ``self.dinputs``.

        The gradient is ``-y_true / dvalues`` divided by the number of
        samples. ``dvalues`` is clipped the same way as in ``forward``
        so that a predicted probability of exactly 0 does not yield
        ``inf`` or ``nan``.

        Args:
            dvalues: 2-D array of predicted probabilities.
            y_true: 1-D class indices or 2-D one-hot rows.
        """
        y_true = np.asarray(y_true)

        # Number of samples
        samples = len(dvalues)
        # Number of labels, counted on the first sample
        labels = len(dvalues[0])

        # If labels are sparse, turn them into one-hot vectors
        if y_true.ndim == 1:
            y_true = np.eye(labels)[y_true]

        eps = self.CLIP_EPSILON
        dvalues_clipped = np.clip(dvalues, eps, 1 - eps)

        # Gradient, then normalisation by the batch size
        self.dinputs = -y_true / dvalues_clipped
        self.dinputs = self.dinputs / samples
# Softmax classifier - combined Softmax activation 
# and cross-entropy loss for faster backward step 
class Activation_Softmax_Loss_CategoricalCrossentropy():
    """Softmax activation and categorical cross-entropy as one unit.

    Handling the two together lets ``backward`` use the short combined
    gradient (prediction minus one-hot target) instead of chaining the
    two separate backward passes.
    """

    def __init__(self):
        # The two wrapped components
        self.activation = Activation_Softmax()
        self.loss = Loss_CategoricalCrossentropy()

    def forward(self, inputs, y_true):
        """Run the activation on ``inputs`` and return the loss value.

        The activation output is also exposed as ``self.output``.
        """
        softmax = self.activation
        softmax.forward(inputs)
        self.output = softmax.output
        return self.loss.calculate(self.output, y_true)

    def backward(self, dvalues, y_true):
        """Write the combined gradient to ``self.dinputs``.

        Args:
            dvalues: The predictions; copied, so the caller's array is
                not modified.
            y_true: Class indices (1-D) or one-hot rows (2-D).
        """
        batch_size = len(dvalues)

        # One-hot rows are reduced to class indices
        is_one_hot = len(y_true.shape) == 2
        targets = np.argmax(y_true, axis=1) if is_one_hot else y_true

        # Subtract 1 at each sample's target class, then average over the batch
        grad = dvalues.copy()
        grad[range(batch_size), targets] -= 1
        self.dinputs = grad / batch_size

In [7]:
# Experiment: train a 2-64-3 network on the spiral dataset with plain SGD
# (fixed learning rate, the Optimizer_SGD default). Every epoch runs the
# whole of X through the network, so one epoch is one parameter update.

# Create dataset
# NOTE(review): presumably 100 points per class with 2 features each and
# integer class labels in y -- confirm against nnfs.datasets.spiral_data.
X, y = spiral_data(samples=100, classes=3)

# Create Dense layer with 2 input features and 64 output values
dense1 = Layer_Dense(2, 64)

# Create ReLU activation (to be used with Dense layer):
activation1 = Activation_ReLU()

# Create second Dense layer with 64 input features (as we take output
# of previous layer here) and 3 output values (one per class)
dense2 = Layer_Dense(64, 3)

# Create Softmax classifier's combined loss and activation
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

# Create optimizer
optimizer = Optimizer_SGD()

# Train in loop
for epoch in range(10001):

    # Forward pass of the training data through the first dense layer
    dense1.forward(X)

    # Forward pass through the activation function,
    # which takes the output of the first dense layer
    activation1.forward(dense1.output)

    # Forward pass through the second dense layer,
    # which takes the activation output as its input
    dense2.forward(activation1.output)

    # Forward pass through the combined activation/loss object; it takes
    # the output of the second dense layer and returns the loss
    loss = loss_activation.forward(dense2.output, y)

    # Accuracy: the predicted class is the arg-max of each output row of
    # loss_activation, compared against the targets
    predictions = np.argmax(loss_activation.output, axis=1)
    # One-hot targets are collapsed to class indices before comparing
    if len(y.shape) == 2:
        y = np.argmax(y, axis=1)
    accuracy = np.mean(predictions==y)

    # Report every 100th epoch (epoch 0 included)
    if not epoch % 100:
        print(f'epoch: {epoch}, ' +
              f'acc: {accuracy:.3f}, ' +
              f'loss: {loss:.3f}')

    # Backward pass: each step consumes the dinputs of the step after it
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)

    # Update weights and biases
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)

epoch: 0, acc: 0.360, loss: 1.099
epoch: 100, acc: 0.400, loss: 1.087
epoch: 200, acc: 0.417, loss: 1.077
epoch: 300, acc: 0.420, loss: 1.076
epoch: 400, acc: 0.400, loss: 1.074
epoch: 500, acc: 0.400, loss: 1.071
epoch: 600, acc: 0.417, loss: 1.067
epoch: 700, acc: 0.437, loss: 1.062
epoch: 800, acc: 0.430, loss: 1.055
epoch: 900, acc: 0.390, loss: 1.064
epoch: 1000, acc: 0.400, loss: 1.062
epoch: 1100, acc: 0.443, loss: 1.061
epoch: 1200, acc: 0.403, loss: 1.061
epoch: 1300, acc: 0.387, loss: 1.052
epoch: 1400, acc: 0.387, loss: 1.106
epoch: 1500, acc: 0.430, loss: 1.043
epoch: 1600, acc: 0.410, loss: 1.063
epoch: 1700, acc: 0.397, loss: 1.043
epoch: 1800, acc: 0.450, loss: 1.038
epoch: 1900, acc: 0.483, loss: 1.025
epoch: 2000, acc: 0.403, loss: 1.037
epoch: 2100, acc: 0.457, loss: 1.022
epoch: 2200, acc: 0.493, loss: 1.020
epoch: 2300, acc: 0.443, loss: 1.002
epoch: 2400, acc: 0.480, loss: 0.994
epoch: 2500, acc: 0.490, loss: 1.009
epoch: 2600, acc: 0.480, loss: 0.991
epoch: 2700, 

In [8]:
# SGD optimizer 
class Optimizer_SGD:
    """Gradient-descent optimizer with optional learning-rate decay.

    Args:
        learning_rate: Initial learning rate. Defaults to 1.
        decay: Decay rate; a falsy value (the default 0.) disables decay.
    """

    def __init__(self, learning_rate=1., decay=0.):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0

    def pre_update_params(self):
        """Refresh ``current_learning_rate``; call once before any update."""
        if not self.decay:
            return
        # The rate shrinks as 1 / (1 + decay * iterations)
        shrink = 1. / (1. + self.decay * self.iterations)
        self.current_learning_rate = self.learning_rate * shrink

    def update_params(self, layer):
        """Step the layer's weights and biases against their gradients."""
        step = -self.current_learning_rate
        layer.weights += step * layer.dweights
        layer.biases += step * layer.dbiases

    def post_update_params(self):
        """Advance the iteration counter; call once after all updates."""
        self.iterations += 1

In [11]:
# Experiment: same 2-64-3 network and full-batch loop, now using SGD with
# learning-rate decay (decay=1e-3). The current learning rate is printed
# alongside accuracy and loss.

# Create dataset
# NOTE(review): presumably 100 points per class with 2 features each and
# integer class labels in y -- confirm against nnfs.datasets.spiral_data.
X, y = spiral_data(samples=100, classes=3)

# Create Dense layer with 2 input features and 64 output values
dense1 = Layer_Dense(2, 64)

# Create ReLU activation (to be used with Dense layer):
activation1 = Activation_ReLU()

# Create second Dense layer with 64 input features (as we take output
# of previous layer here) and 3 output values (one per class)
dense2 = Layer_Dense(64, 3)

# Create Softmax classifier's combined loss and activation
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

# Create optimizer
optimizer = Optimizer_SGD(decay=1e-3)

# Train in loop
for epoch in range(10001):

    # Forward pass of the training data through the first dense layer
    dense1.forward(X)

    # Forward pass through the activation function,
    # which takes the output of the first dense layer
    activation1.forward(dense1.output)

    # Forward pass through the second dense layer,
    # which takes the activation output as its input
    dense2.forward(activation1.output)

    # Forward pass through the combined activation/loss object; it takes
    # the output of the second dense layer and returns the loss
    loss = loss_activation.forward(dense2.output, y)

    # Accuracy: the predicted class is the arg-max of each output row of
    # loss_activation, compared against the targets
    predictions = np.argmax(loss_activation.output, axis=1)
    # One-hot targets are collapsed to class indices before comparing
    if len(y.shape) == 2:
        y = np.argmax(y, axis=1)
    accuracy = np.mean(predictions==y)

    # Report every 1000th epoch (epoch 0 included)
    if not epoch % 1000:
        print(f'epoch: {epoch}, ' +
              f'acc: {accuracy:.3f}, ' +
              f'loss: {loss:.3f} ' +
              f'lr: {optimizer.current_learning_rate}')

    # Backward pass: each step consumes the dinputs of the step after it
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)

    # Update weights and biases. pre_update_params refreshes the decayed
    # learning rate once per epoch; post_update_params bumps the
    # iteration counter after both layers have been updated.
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()

epoch: 0, acc: 0.350, loss: 1.099 lr: 1.0
epoch: 1000, acc: 0.457, loss: 1.048 lr: 0.5002501250625312
epoch: 2000, acc: 0.507, loss: 1.035 lr: 0.33344448149383127
epoch: 3000, acc: 0.507, loss: 1.014 lr: 0.25006251562890724
epoch: 4000, acc: 0.500, loss: 0.985 lr: 0.2000400080016003
epoch: 5000, acc: 0.550, loss: 0.951 lr: 0.16669444907484582
epoch: 6000, acc: 0.573, loss: 0.918 lr: 0.1428775539362766
epoch: 7000, acc: 0.607, loss: 0.880 lr: 0.12501562695336915
epoch: 8000, acc: 0.660, loss: 0.842 lr: 0.11112345816201799
epoch: 9000, acc: 0.660, loss: 0.810 lr: 0.1000100010001
epoch: 10000, acc: 0.670, loss: 0.785 lr: 0.09091735612328393


In [12]:
# SGD optimizer 
class Optimizer_SGD:
    """Gradient-descent optimizer with optional decay and momentum.

    Args:
        learning_rate: Initial learning rate. Defaults to 1.
        decay: Decay rate; a falsy value disables decay.
        momentum: Fraction of the previous update carried into the next
            one; a falsy value selects the plain update.
    """

    def __init__(self, learning_rate=1., decay=0., momentum=0.):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.momentum = momentum

    def pre_update_params(self):
        """Refresh ``current_learning_rate``; call once before any update."""
        if not self.decay:
            return
        # The rate shrinks as 1 / (1 + decay * iterations)
        shrink = 1. / (1. + self.decay * self.iterations)
        self.current_learning_rate = self.learning_rate * shrink

    def update_params(self, layer):
        """Apply one plain or momentum update to the layer's parameters.

        With momentum enabled, the previous updates are stored on the
        layer as ``weight_momentums`` / ``bias_momentums``; both are
        created as zero arrays the first time the layer is seen.
        """
        if not self.momentum:
            # Plain update: just the scaled negative gradient
            weight_updates = -self.current_learning_rate * layer.dweights
            bias_updates = -self.current_learning_rate * layer.dbiases
        else:
            # The bias array is created together with the weight array,
            # so checking for one of them is enough.
            if not hasattr(layer, 'weight_momentums'):
                layer.weight_momentums = np.zeros_like(layer.weights)
                layer.bias_momentums = np.zeros_like(layer.biases)

            # Previous update scaled by the retain factor, minus the
            # current gradient step
            weight_updates = self.momentum * layer.weight_momentums - self.current_learning_rate * layer.dweights
            bias_updates = self.momentum * layer.bias_momentums - self.current_learning_rate * layer.dbiases

            # Remember this update for the next call
            layer.weight_momentums = weight_updates
            layer.bias_momentums = bias_updates

        layer.weights += weight_updates
        layer.biases += bias_updates

    def post_update_params(self):
        """Advance the iteration counter; call once after all updates."""
        self.iterations += 1

In [15]:
# Experiment: same 2-64-3 network and full-batch loop, now using SGD with
# learning-rate decay (decay=1e-3) and momentum (0.9).

# Create dataset
# NOTE(review): presumably 100 points per class with 2 features each and
# integer class labels in y -- confirm against nnfs.datasets.spiral_data.
X, y = spiral_data(samples=100, classes=3)

# Create Dense layer with 2 input features and 64 output values
dense1 = Layer_Dense(2, 64)

# Create ReLU activation (to be used with Dense layer):
activation1 = Activation_ReLU()

# Create second Dense layer with 64 input features (as we take output
# of previous layer here) and 3 output values (one per class)
dense2 = Layer_Dense(64, 3)

# Create Softmax classifier's combined loss and activation
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

# Create optimizer
optimizer = Optimizer_SGD(decay=1e-3, momentum=0.9)

# Train in loop
for epoch in range(10001):

    # Forward pass of the training data through the first dense layer
    dense1.forward(X)

    # Forward pass through the activation function,
    # which takes the output of the first dense layer
    activation1.forward(dense1.output)

    # Forward pass through the second dense layer,
    # which takes the activation output as its input
    dense2.forward(activation1.output)

    # Forward pass through the combined activation/loss object; it takes
    # the output of the second dense layer and returns the loss
    loss = loss_activation.forward(dense2.output, y)

    # Accuracy: the predicted class is the arg-max of each output row of
    # loss_activation, compared against the targets
    predictions = np.argmax(loss_activation.output, axis=1)
    # One-hot targets are collapsed to class indices before comparing
    if len(y.shape) == 2:
        y = np.argmax(y, axis=1)
    accuracy = np.mean(predictions==y)

    # Report every 1000th epoch (epoch 0 included)
    if not epoch % 1000:
        print(f'epoch: {epoch}, ' +
              f'acc: {accuracy:.3f}, ' +
              f'loss: {loss:.3f} ' +
              f'lr: {optimizer.current_learning_rate}')

    # Backward pass: each step consumes the dinputs of the step after it
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)

    # Update weights and biases. pre_update_params refreshes the decayed
    # learning rate once per epoch; post_update_params bumps the
    # iteration counter after both layers have been updated.
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()

epoch: 0, acc: 0.413, loss: 1.099 lr: 1.0
epoch: 1000, acc: 0.850, loss: 0.329 lr: 0.5002501250625312
epoch: 2000, acc: 0.900, loss: 0.239 lr: 0.33344448149383127
epoch: 3000, acc: 0.917, loss: 0.214 lr: 0.25006251562890724
epoch: 4000, acc: 0.917, loss: 0.196 lr: 0.2000400080016003
epoch: 5000, acc: 0.923, loss: 0.188 lr: 0.16669444907484582
epoch: 6000, acc: 0.930, loss: 0.183 lr: 0.1428775539362766
epoch: 7000, acc: 0.930, loss: 0.179 lr: 0.12501562695336915
epoch: 8000, acc: 0.930, loss: 0.175 lr: 0.11112345816201799
epoch: 9000, acc: 0.930, loss: 0.173 lr: 0.1000100010001
epoch: 10000, acc: 0.930, loss: 0.171 lr: 0.09091735612328393


In [16]:
# Adagrad optimizer 
class Optimizer_Adagrad:
    """Adagrad optimizer: per-parameter steps scaled by accumulated gradients.

    Args:
        learning_rate: Initial learning rate. Defaults to 1.
        decay: Decay rate; a falsy value disables decay.
        epsilon: Added to the square-rooted cache so the division never
            hits zero.
    """

    def __init__(self, learning_rate=1., decay=0., epsilon=1e-7):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon

    def pre_update_params(self):
        """Refresh ``current_learning_rate``; call once before any update."""
        if not self.decay:
            return
        # The rate shrinks as 1 / (1 + decay * iterations)
        shrink = 1. / (1. + self.decay * self.iterations)
        self.current_learning_rate = self.learning_rate * shrink

    def update_params(self, layer):
        """Accumulate squared gradients on the layer and step its parameters.

        The running sums live on the layer as ``weight_cache`` /
        ``bias_cache`` and are created as zero arrays on first use.
        """
        if not hasattr(layer, 'weight_cache'):
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)

        # The cache only ever grows: add the squared current gradients
        layer.weight_cache += layer.dweights**2
        layer.bias_cache += layer.dbiases**2

        # SGD step divided element-wise by sqrt(cache) + epsilon
        weight_scale = np.sqrt(layer.weight_cache) + self.epsilon
        bias_scale = np.sqrt(layer.bias_cache) + self.epsilon
        layer.weights += -self.current_learning_rate * layer.dweights / weight_scale
        layer.biases += -self.current_learning_rate * layer.dbiases / bias_scale

    def post_update_params(self):
        """Advance the iteration counter; call once after all updates."""
        self.iterations += 1

In [19]:
# Experiment: same 2-64-3 network and full-batch loop, now optimised with
# Adagrad (decay=1e-4).

# Create dataset
# NOTE(review): presumably 100 points per class with 2 features each and
# integer class labels in y -- confirm against nnfs.datasets.spiral_data.
X, y = spiral_data(samples=100, classes=3)

# Create Dense layer with 2 input features and 64 output values
dense1 = Layer_Dense(2, 64)

# Create ReLU activation (to be used with Dense layer):
activation1 = Activation_ReLU()

# Create second Dense layer with 64 input features (as we take output
# of previous layer here) and 3 output values (one per class)
dense2 = Layer_Dense(64, 3)

# Create Softmax classifier's combined loss and activation
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

# Create optimizer
# Alternative configuration, currently disabled:
#optimizer = Optimizer_SGD(decay=8e-8, momentum=0.9)
optimizer = Optimizer_Adagrad(decay=1e-4)
# Train in loop
for epoch in range(10001):

    # Forward pass of the training data through the first dense layer
    dense1.forward(X)

    # Forward pass through the activation function,
    # which takes the output of the first dense layer
    activation1.forward(dense1.output)

    # Forward pass through the second dense layer,
    # which takes the activation output as its input
    dense2.forward(activation1.output)

    # Forward pass through the combined activation/loss object; it takes
    # the output of the second dense layer and returns the loss
    loss = loss_activation.forward(dense2.output, y)

    # Accuracy: the predicted class is the arg-max of each output row of
    # loss_activation, compared against the targets
    predictions = np.argmax(loss_activation.output, axis=1)
    # One-hot targets are collapsed to class indices before comparing
    if len(y.shape) == 2:
        y = np.argmax(y, axis=1)
    accuracy = np.mean(predictions==y)

    # Report every 1000th epoch (epoch 0 included)
    if not epoch % 1000:
        print(f'epoch: {epoch}, ' +
              f'acc: {accuracy:.3f}, ' +
              f'loss: {loss:.3f}, ' +
              f'lr: {optimizer.current_learning_rate}')

    # Backward pass: each step consumes the dinputs of the step after it
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)

    # Update weights and biases. pre_update_params refreshes the decayed
    # learning rate once per epoch; post_update_params bumps the
    # iteration counter after both layers have been updated.
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()

epoch: 0, acc: 0.360, loss: 1.099, lr: 1.0
epoch: 1000, acc: 0.740, loss: 0.581, lr: 0.9091735612328392
epoch: 2000, acc: 0.797, loss: 0.440, lr: 0.8334027835652972
epoch: 3000, acc: 0.837, loss: 0.384, lr: 0.7692899453804138
epoch: 4000, acc: 0.840, loss: 0.353, lr: 0.7143367383384527
epoch: 5000, acc: 0.860, loss: 0.330, lr: 0.6667111140742716
epoch: 6000, acc: 0.853, loss: 0.309, lr: 0.6250390649415589
epoch: 7000, acc: 0.897, loss: 0.280, lr: 0.5882698982293076
epoch: 8000, acc: 0.913, loss: 0.264, lr: 0.5555864214678593
epoch: 9000, acc: 0.910, loss: 0.252, lr: 0.5263434917627243
epoch: 10000, acc: 0.910, loss: 0.243, lr: 0.5000250012500626


In [20]:
# RMSprop optimizer 
class Optimizer_RMSprop:
    """RMSprop optimizer: steps scaled by a moving average of squared gradients.

    Args:
        learning_rate: Initial learning rate. Defaults to 0.001.
        decay: Decay rate; a falsy value disables decay.
        epsilon: Added to the square-rooted cache so the division never
            hits zero.
        rho: Weight given to the previous cache; the squared current
            gradient gets ``1 - rho``.
    """

    def __init__(self, learning_rate=0.001, decay=0., epsilon=1e-7, rho=0.9):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon
        self.rho = rho

    def pre_update_params(self):
        """Refresh ``current_learning_rate``; call once before any update."""
        if not self.decay:
            return
        # The rate shrinks as 1 / (1 + decay * iterations)
        shrink = 1. / (1. + self.decay * self.iterations)
        self.current_learning_rate = self.learning_rate * shrink

    def update_params(self, layer):
        """Refresh the layer's gradient caches and step its parameters.

        The caches live on the layer as ``weight_cache`` / ``bias_cache``
        and are created as zero arrays on first use.
        """
        if not hasattr(layer, 'weight_cache'):
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)

        # Blend the old cache with the squared current gradients
        layer.weight_cache = (
            self.rho * layer.weight_cache
            + (1 - self.rho) * layer.dweights**2
        )
        layer.bias_cache = (
            self.rho * layer.bias_cache
            + (1 - self.rho) * layer.dbiases**2
        )

        # SGD step divided element-wise by sqrt(cache) + epsilon
        weight_scale = np.sqrt(layer.weight_cache) + self.epsilon
        bias_scale = np.sqrt(layer.bias_cache) + self.epsilon
        layer.weights += -self.current_learning_rate * layer.dweights / weight_scale
        layer.biases += -self.current_learning_rate * layer.dbiases / bias_scale

    def post_update_params(self):
        """Advance the iteration counter; call once after all updates."""
        self.iterations += 1

In [21]:
# Experiment: same 2-64-3 network and full-batch loop, now optimised with
# RMSprop (learning_rate=0.02, decay=1e-5, rho=0.999).

# Create dataset
# NOTE(review): presumably 100 points per class with 2 features each and
# integer class labels in y -- confirm against nnfs.datasets.spiral_data.
X, y = spiral_data(samples=100, classes=3)

# Create Dense layer with 2 input features and 64 output values
dense1 = Layer_Dense(2, 64)

# Create ReLU activation (to be used with Dense layer):
activation1 = Activation_ReLU()

# Create second Dense layer with 64 input features (as we take output
# of previous layer here) and 3 output values (one per class)
dense2 = Layer_Dense(64, 3)

# Create Softmax classifier's combined loss and activation
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

# Create optimizer
# Alternative configurations, currently disabled:
#optimizer = Optimizer_SGD(decay=8e-8, momentum=0.9)
# optimizer = Optimizer_Adagrad(decay=1e-4)
# optimizer = Optimizer_RMSprop(decay=1e-4)
optimizer = Optimizer_RMSprop(learning_rate=0.02, decay=1e-5, rho=0.999)
# Train in loop
for epoch in range(10001):

    # Forward pass of the training data through the first dense layer
    dense1.forward(X)

    # Forward pass through the activation function,
    # which takes the output of the first dense layer
    activation1.forward(dense1.output)

    # Forward pass through the second dense layer,
    # which takes the activation output as its input
    dense2.forward(activation1.output)

    # Forward pass through the combined activation/loss object; it takes
    # the output of the second dense layer and returns the loss
    loss = loss_activation.forward(dense2.output, y)

    # Accuracy: the predicted class is the arg-max of each output row of
    # loss_activation, compared against the targets
    predictions = np.argmax(loss_activation.output, axis=1)
    # One-hot targets are collapsed to class indices before comparing
    if len(y.shape) == 2:
        y = np.argmax(y, axis=1)
    accuracy = np.mean(predictions==y)

    # Report every 1000th epoch (epoch 0 included)
    if not epoch % 1000:
        print(f'epoch: {epoch}, ' +
              f'acc: {accuracy:.3f}, ' +
              f'loss: {loss:.3f}, ' +
              f'lr: {optimizer.current_learning_rate}')

    # Backward pass: each step consumes the dinputs of the step after it
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)

    # Update weights and biases. pre_update_params refreshes the decayed
    # learning rate once per epoch; post_update_params bumps the
    # iteration counter after both layers have been updated.
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()

epoch: 0, acc: 0.383, loss: 1.099, lr: 0.02
epoch: 1000, acc: 0.680, loss: 0.639, lr: 0.019802176259170884
epoch: 2000, acc: 0.813, loss: 0.483, lr: 0.019608035372895814
epoch: 3000, acc: 0.767, loss: 0.445, lr: 0.01941766424916747
epoch: 4000, acc: 0.780, loss: 0.428, lr: 0.019230954143789846
epoch: 5000, acc: 0.807, loss: 0.407, lr: 0.01904780045524243
epoch: 6000, acc: 0.810, loss: 0.406, lr: 0.018868102529269144
epoch: 7000, acc: 0.833, loss: 0.349, lr: 0.018691763474424996
epoch: 8000, acc: 0.850, loss: 0.322, lr: 0.01851868998787026
epoch: 9000, acc: 0.870, loss: 0.309, lr: 0.018348792190754044
epoch: 10000, acc: 0.867, loss: 0.306, lr: 0.018181983472577025


In [24]:
class Optimizer_Adam:
    """Adam optimizer: momentum plus a per-parameter squared-gradient cache.

    Args:
        learning_rate: Initial learning rate. Defaults to 0.001.
        decay: Decay rate; a falsy value disables decay.
        epsilon: Added to the square-rooted cache so the division never
            hits zero.
        beta_1: Weight given to the previous momentum.
        beta_2: Weight given to the previous cache.
    """

    def __init__(self, learning_rate=0.001, decay=0., epsilon=1e-7, beta_1=0.9, beta_2=0.999):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon
        self.beta_1 = beta_1
        self.beta_2 = beta_2

    def pre_update_params(self):
        """Refresh ``current_learning_rate``; call once before any update."""
        if not self.decay:
            return
        # The rate shrinks as 1 / (1 + decay * iterations)
        shrink = 1. / (1. + self.decay * self.iterations)
        self.current_learning_rate = self.learning_rate * shrink

    def update_params(self, layer):
        """Refresh the layer's momentums and caches, then step its parameters.

        State lives on the layer as ``weight_momentums``, ``weight_cache``,
        ``bias_momentums`` and ``bias_cache``. All four are created as
        zero arrays when the layer has no ``weight_cache`` yet.
        """
        if not hasattr(layer, 'weight_cache'):
            layer.weight_momentums = np.zeros_like(layer.weights)
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_momentums = np.zeros_like(layer.biases)
            layer.bias_cache = np.zeros_like(layer.biases)

        # self.iterations is 0 on the first pass, while the bias
        # correction needs a step count that starts at 1.
        step = self.iterations + 1

        # Momentum: moving average of the gradients
        layer.weight_momentums = self.beta_1 * layer.weight_momentums + (1 - self.beta_1) * layer.dweights
        layer.bias_momentums = self.beta_1 * layer.bias_momentums + (1 - self.beta_1) * layer.dbiases

        # Cache: moving average of the squared gradients
        layer.weight_cache = self.beta_2 * layer.weight_cache + (1 - self.beta_2) * layer.dweights**2
        layer.bias_cache = self.beta_2 * layer.bias_cache + (1 - self.beta_2) * layer.dbiases**2

        # Bias-corrected momentum and cache
        weight_momentums_corrected = layer.weight_momentums / (1 - self.beta_1 ** step)
        bias_momentums_corrected = layer.bias_momentums / (1 - self.beta_1 ** step)
        weight_cache_corrected = layer.weight_cache / (1 - self.beta_2 ** step)
        bias_cache_corrected = layer.bias_cache / (1 - self.beta_2 ** step)

        # Corrected momentum step divided by sqrt(corrected cache) + epsilon
        weight_scale = np.sqrt(weight_cache_corrected) + self.epsilon
        bias_scale = np.sqrt(bias_cache_corrected) + self.epsilon
        layer.weights += -self.current_learning_rate * weight_momentums_corrected / weight_scale
        layer.biases += -self.current_learning_rate * bias_momentums_corrected / bias_scale

    def post_update_params(self):
        """Advance the iteration counter; call once after all updates."""
        self.iterations += 1

In [25]:
# Experiment: same 2-64-3 network and full-batch loop, now optimised with
# Adam (learning_rate=0.05, decay=5e-7).

# Create dataset
# NOTE(review): presumably 100 points per class with 2 features each and
# integer class labels in y -- confirm against nnfs.datasets.spiral_data.
X, y = spiral_data(samples=100, classes=3)

# Create Dense layer with 2 input features and 64 output values
dense1 = Layer_Dense(2, 64)

# Create ReLU activation (to be used with Dense layer):
activation1 = Activation_ReLU()

# Create second Dense layer with 64 input features (as we take output
# of previous layer here) and 3 output values (one per class)
dense2 = Layer_Dense(64, 3)

# Create Softmax classifier's combined loss and activation
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

# Create optimizer
# Alternative configurations, currently disabled:
#optimizer = Optimizer_SGD(decay=8e-8, momentum=0.9)
# optimizer = Optimizer_Adagrad(decay=1e-4)
# optimizer = Optimizer_RMSprop(decay=1e-4)
# optimizer = Optimizer_RMSprop(learning_rate=0.02, decay=1e-5, rho=0.999)
optimizer = Optimizer_Adam(learning_rate=0.05, decay=5e-7)
# Train in loop
for epoch in range(10001):

    # Forward pass of the training data through the first dense layer
    dense1.forward(X)

    # Forward pass through the activation function,
    # which takes the output of the first dense layer
    activation1.forward(dense1.output)

    # Forward pass through the second dense layer,
    # which takes the activation output as its input
    dense2.forward(activation1.output)

    # Forward pass through the combined activation/loss object; it takes
    # the output of the second dense layer and returns the loss
    loss = loss_activation.forward(dense2.output, y)

    # Accuracy: the predicted class is the arg-max of each output row of
    # loss_activation, compared against the targets
    predictions = np.argmax(loss_activation.output, axis=1)
    # One-hot targets are collapsed to class indices before comparing
    if len(y.shape) == 2:
        y = np.argmax(y, axis=1)
    accuracy = np.mean(predictions==y)

    # Report every 1000th epoch (epoch 0 included)
    if not epoch % 1000:
        print(f'epoch: {epoch}, ' +
              f'acc: {accuracy:.3f}, ' +
              f'loss: {loss:.3f}, ' +
              f'lr: {optimizer.current_learning_rate}')

    # Backward pass: each step consumes the dinputs of the step after it
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)

    # Update weights and biases. pre_update_params refreshes the decayed
    # learning rate once per epoch; post_update_params bumps the
    # iteration counter after both layers have been updated.
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()

epoch: 0, acc: 0.327, loss: 1.099, lr: 0.05
epoch: 1000, acc: 0.927, loss: 0.179, lr: 0.049975037468784345
epoch: 2000, acc: 0.950, loss: 0.148, lr: 0.04995007490013731
epoch: 3000, acc: 0.957, loss: 0.134, lr: 0.049925137256683606
epoch: 4000, acc: 0.957, loss: 0.130, lr: 0.049900224501110035
epoch: 5000, acc: 0.967, loss: 0.120, lr: 0.04987533659617785
epoch: 6000, acc: 0.970, loss: 0.119, lr: 0.04985047350472258
epoch: 7000, acc: 0.967, loss: 0.114, lr: 0.04982563518965381
epoch: 8000, acc: 0.967, loss: 0.104, lr: 0.04980082161395499
epoch: 9000, acc: 0.963, loss: 0.102, lr: 0.04977603274068329
epoch: 10000, acc: 0.977, loss: 0.095, lr: 0.04975126853296942
