<style>
    .info-card {
        max-width: 650px;
        margin: 25px auto;
        padding: 25px 30px;
        border: 1px solid #e0e0e0;
        border-radius: 12px;
        box-shadow: 0 4px 12px rgba(0, 0, 0, 0.05);
        font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
        background-color: #fdfdfd;
        color: #333;
    }
    .info-card .title {
        color: #1a237e; /* Dark Indigo */
        font-size: 24px;
        font-weight: 600;
        margin-top: 0;
        margin-bottom: 15px;
        text-align: center;
        border-bottom: 2px solid #e8eaf6; /* Light Indigo */
        padding-bottom: 10px;
    }
    .info-card .details-grid {
        display: grid;
        grid-template-columns: max-content 1fr;
        gap: 12px 20px;
        margin-top: 20px;
        font-size: 16px;
    }
    .info-card .label {
        font-weight: 600;
        color: #555;
        text-align: right;
    }
    .info-card .value {
        font-weight: 400;
        color: #222;
    }
</style>

<div class="info-card">
    <h2 class="title">Unit 4 Exercise</h2>
    <div class="details-grid">
        <div class="label">Name:</div>
        <div class="value">Ethan Jed V. Carbonell</div>
        <div class="label">Date:</div>
        <div class="value">October 17, 2025</div>
        <div class="label">Year & Section:</div>
        <div class="value">BSCS 3A AI</div>
        <div></div>
    </div>
</div>

## Library imports
### Seed NumPy's random number generator with 0 so optimizer runs can be compared fairly

In [318]:
import numpy as np
import nnfs
from nnfs.datasets import spiral_data


# Initialise the nnfs helper package before anything else uses NumPy.
# NOTE(review): nnfs.init() presumably configures NumPy defaults (seed / dtype) —
# confirm against the nnfs documentation.
nnfs.init()
# Seed NumPy's global RNG explicitly; Layer_Dense draws its initial weights from
# np.random, so this makes the weight initialisation repeatable.
np.random.seed(0)

## Classes
### Hidden Layers

In [319]:
# Hidden Layers
# Dense
class Layer_Dense:
    """Fully connected layer computing ``np.dot(inputs, weights) + biases``.

    Attributes:
        weights: array of shape (n_inputs, n_neurons), created in ``__init__``.
        biases: array of shape (1, n_neurons), created in ``__init__``.
        inputs, output: cached by ``forward``.
        dweights, dbiases, dinputs: gradients written by ``backward``.
    """

    def __init__(self, n_inputs, n_neurons):
        """Create small random weights and zero biases.

        Args:
            n_inputs: number of input features per sample.
            n_neurons: number of output values produced per sample.
        """
        weight_shape = (n_inputs, n_neurons)
        # Standard-normal draws from NumPy's global RNG, scaled down by 0.01
        self.weights = 0.01 * np.random.randn(*weight_shape)
        self.biases = np.zeros((1, n_neurons))

    def forward(self, inputs):
        """Compute the layer output and cache ``inputs`` for ``backward``.

        Args:
            inputs: batch of samples; the last axis must match ``n_inputs``.
        """
        # The inputs are needed later to compute the weight gradient
        self.inputs = inputs
        weighted_sum = np.dot(inputs, self.weights)
        self.output = weighted_sum + self.biases

    def backward(self, dvalues):
        """Backpropagate ``dvalues`` (gradient w.r.t. this layer's output).

        Sets ``dweights`` and ``dbiases`` (gradients on the parameters) and
        ``dinputs`` (gradient to hand to the previous layer).
        """
        upstream = dvalues
        # Gradient w.r.t. the layer inputs
        self.dinputs = np.dot(upstream, self.weights.T)
        # Gradients w.r.t. the parameters
        self.dweights = np.dot(self.inputs.T, upstream)
        self.dbiases = np.sum(upstream, axis=0, keepdims=True)

### ReLU

In [320]:
# ReLU
class Activation_ReLU:
    """Rectified linear unit activation: element-wise ``max(0, x)``."""

    def forward(self, inputs):
        """Apply ReLU to ``inputs`` and cache them for ``backward``."""
        # Keep the raw inputs: backward needs to know which ones were <= 0
        self.inputs = inputs
        self.output = np.maximum(0, inputs)

    def backward(self, dvalues):
        """Pass ``dvalues`` through where the forward input was positive.

        The result is stored in ``dinputs``; ``dvalues`` itself is left
        untouched because the gradient is built on a copy.
        """
        gradient = dvalues.copy()
        # The gradient is zero wherever the forward input was non-positive
        blocked = self.inputs <= 0
        gradient[blocked] = 0
        self.dinputs = gradient

### Softmax with Categorical Cross Entropy

In [321]:
class Activation_Softmax_Loss_CategoricalCrossEntropy():
    """Softmax activation fused with categorical cross-entropy loss.

    ``forward`` turns raw layer outputs into per-sample probabilities and
    returns the mean cross-entropy loss; ``backward`` computes the combined
    gradient of both steps.

    Attributes:
        inputs: the raw values passed to the last ``forward`` call.
        output: softmax probabilities from the last ``forward`` call.
        dinputs: gradient produced by the last ``backward`` call.
    """

    def __init__(self):
        # Nothing to set up: softmax and loss are computed inline, so no
        # separate activation or loss objects are created.
        pass

    @staticmethod
    def _validated_labels(y_true):
        """Return ``y_true`` as an array, rejecting unsupported label ranks.

        Raises:
            ValueError: if the labels are neither 1-D (class indices) nor
                2-D (one-hot rows).
        """
        labels = np.asarray(y_true)
        if labels.ndim not in (1, 2):
            raise ValueError(
                "y_true must be 1-D class indices or 2-D one-hot labels, "
                f"got {labels.ndim}-D"
            )
        return labels

    def forward(self, inputs, y_true):
        """Compute softmax probabilities and the mean cross-entropy loss.

        Args:
            inputs: 2-D array of raw scores, one row per sample.
            y_true: 1-D class indices or 2-D one-hot labels (array-like).

        Returns:
            The mean negative log-likelihood over all samples.

        Raises:
            ValueError: if ``y_true`` is neither 1-D nor 2-D.
        """
        # Fail early with a clear message instead of an unbound-variable error
        y_true = self._validated_labels(y_true)

        # Remember inputs for backward pass
        self.inputs = inputs

        # Subtract the row maximum before exponentiating so exp() cannot overflow
        exp_values = np.exp(inputs - np.max(inputs, axis=1, keepdims=True))
        # Normalize them for each sample
        probabilities = exp_values / np.sum(exp_values, axis=1, keepdims=True)
        self.output = probabilities

        # Clip so that log() below never receives exactly 0 (or exactly 1)
        y_pred_clipped = np.clip(self.output, 1e-7, 1 - 1e-7)

        if y_true.ndim == 1:
            # Class indices: pick the predicted probability of the true class
            correct_confidences = y_pred_clipped[range(len(self.output)), y_true]
        else:
            # One-hot rows: mask out everything except the true class
            correct_confidences = np.sum(y_pred_clipped * y_true, axis=1)

        # Calculate and return the mean loss
        negative_log_likelihoods = -np.log(correct_confidences)
        return np.mean(negative_log_likelihoods)

    def backward(self, dvalues, y_true):
        """Compute the combined softmax + cross-entropy gradient.

        Args:
            dvalues: the softmax probabilities (the training loop passes
                ``self.output``); not modified.
            y_true: 1-D class indices or 2-D one-hot labels (array-like).

        Raises:
            ValueError: if ``y_true`` is neither 1-D nor 2-D.
        """
        # Number of samples
        samples = len(dvalues)

        y_true = self._validated_labels(y_true)
        # If labels are one-hot encoded, turn them into discrete values
        if y_true.ndim == 2:
            y_true = np.argmax(y_true, axis=1)

        # Copy so we can safely modify
        self.dinputs = dvalues.copy()
        # Simplified gradient: probabilities minus 1 at the true class
        self.dinputs[range(samples), y_true] -= 1
        # Normalize gradient by the batch size
        self.dinputs = self.dinputs / samples

### Optimizers

In [322]:
# SGD Optimizer (with learning rate decay and momentum)
class Optimizer_SGD:
    """Stochastic gradient descent with optional learning-rate decay and momentum.

    Per training step, call ``pre_update_params`` once, ``update_params`` for
    each layer, then ``post_update_params`` once.
    """

    def __init__(self, learning_rate=1., decay=0., momentum=0.):
        """Store the optimizer settings.

        Args:
            learning_rate: initial step size.
            decay: learning-rate decay rate; 0 disables decay.
            momentum: fraction of the previous update to retain; 0 disables it.
        """
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.momentum = momentum
        # Number of completed update steps; drives the decay schedule
        self.iterations = 0

    def pre_update_params(self):
        """Refresh ``current_learning_rate`` from the decay schedule, if any."""
        if not self.decay:
            return
        decay_factor = 1. / (1. + self.decay * self.iterations)
        self.current_learning_rate = self.learning_rate * decay_factor

    def update_params(self, layer):
        """Update ``layer.weights`` and ``layer.biases`` in place.

        Reads ``layer.dweights`` / ``layer.dbiases``. With momentum enabled,
        the last update is kept on the layer as ``weight_momentums`` and
        ``bias_momentums`` (created as zeros on first use).
        """
        step_size = self.current_learning_rate

        if self.momentum:
            # First call for this layer: start from zero momentum
            if not hasattr(layer, 'weight_momentums'):
                layer.weight_momentums = np.zeros_like(layer.weights)
                layer.bias_momentums = np.zeros_like(layer.biases)

            # Retained share of the previous update minus the current gradient step
            weight_step = \
                self.momentum * layer.weight_momentums - \
                step_size * layer.dweights
            layer.weight_momentums = weight_step

            bias_step = \
                self.momentum * layer.bias_momentums - \
                step_size * layer.dbiases
            layer.bias_momentums = bias_step
        else:
            # Plain gradient step
            weight_step = -step_size * layer.dweights
            bias_step = -step_size * layer.dbiases

        # Apply whichever step was built above
        layer.weights += weight_step
        layer.biases += bias_step

    def post_update_params(self):
        """Count one completed update step."""
        self.iterations += 1


# AdaGrad optimizer
class Optimizer_Adagrad:
    """AdaGrad optimizer: per-parameter steps scaled by accumulated squared gradients.

    Per training step, call ``pre_update_params`` once, ``update_params`` for
    each layer, then ``post_update_params`` once.
    """

    def __init__(self, learning_rate=1., decay=0., epsilon=1e-7):
        """Store the optimizer settings.

        Args:
            learning_rate: initial step size.
            decay: learning-rate decay rate; 0 disables decay.
            epsilon: added to the square-rooted cache in the denominator.
        """
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.epsilon = epsilon
        # Number of completed update steps; drives the decay schedule
        self.iterations = 0

    def pre_update_params(self):
        """Refresh ``current_learning_rate`` from the decay schedule, if any."""
        if not self.decay:
            return
        decay_factor = 1. / (1. + self.decay * self.iterations)
        self.current_learning_rate = self.learning_rate * decay_factor

    def update_params(self, layer):
        """Update ``layer.weights`` and ``layer.biases`` in place.

        Reads ``layer.dweights`` / ``layer.dbiases`` and keeps running sums of
        their squares on the layer as ``weight_cache`` / ``bias_cache``
        (created as zeros on first use).
        """
        # First call for this layer: start with empty caches
        if not hasattr(layer, 'weight_cache'):
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)

        # Accumulate the squared gradients
        layer.weight_cache += layer.dweights**2
        layer.bias_cache += layer.dbiases**2

        step_size = self.current_learning_rate
        weight_scale = np.sqrt(layer.weight_cache) + self.epsilon
        bias_scale = np.sqrt(layer.bias_cache) + self.epsilon

        # Gradient step divided by the square-rooted cache
        layer.weights += -step_size * layer.dweights / weight_scale
        layer.biases += -step_size * layer.dbiases / bias_scale

    def post_update_params(self):
        """Count one completed update step."""
        self.iterations += 1

#### Data Loading

In [323]:
# Create the dataset with nnfs' spiral generator (samples=100, classes=3).
# NOTE(review): X is presumably the feature array (2 features per sample, matching
# the first dense layer) and y the integer class labels — confirm against nnfs.
X, y = spiral_data(samples = 100, classes = 3)

#### Neural Network Initialization

In [324]:
# First dense layer: 2 input features -> 64 output values.
# NOTE: Layer_Dense draws its initial weights from NumPy's global RNG, so the
# order in which dense1 and dense2 are created affects the values each receives.
dense1 = Layer_Dense(2, 64)

# ReLU activation applied to the output of the dense layer above
activation1 = Activation_ReLU()

# 2nd dense layer with 64 input and 3 output values (for 3 classes)
dense2 = Layer_Dense(64, 3)

# Combined softmax activation + categorical cross-entropy loss for the output layer
loss_activation = Activation_Softmax_Loss_CategoricalCrossEntropy()

### Optimizer Selection & Training Loop

In [325]:
# Optimizer selection: leave exactly one print/optimizer pair uncommented.
#
# NOTE: this cell trains the layers created in the network-initialization cell
# in place. Re-run that cell first when switching optimizers; otherwise training
# continues from the already-updated weights instead of a fresh initialization.

# Stochastic Gradient Descent (SGD)
# print("Running with: Vanilla SGD")
# optimizer = Optimizer_SGD(learning_rate=1.0)

# SGD with Learning Rate Decay
# print("Running with: SGD w LR Decay")
# optimizer = Optimizer_SGD(learning_rate=1.0, decay=1e-3)

# SGD with Momentum
print("Running with: SGD with Momentum")
optimizer = Optimizer_SGD(learning_rate=0.2, decay=1e-4, momentum=0.9)

# Adaptive Gradient (AdaGrad)
# print("Running with: AdaGrad")
# optimizer = Optimizer_Adagrad(learning_rate=1.5, decay=0)

epochs = 1001 # Set number of epochs

for epoch in range(epochs):

    # --- Forward pass ---
    # Perform a forward pass of our training data through the first layer
    dense1.forward(X)
    # Pass the output of the dense layer through the activation function
    activation1.forward(dense1.output)
    # Pass on to the 2nd layer
    dense2.forward(activation1.output)
    # Softmax activation for the 2nd layer + loss.
    # Computed exactly once per epoch: it also stores the probabilities in
    # loss_activation.output, which the progress report and backward pass reuse.
    loss = loss_activation.forward(dense2.output, y)

    # --- Print progress every 100 epochs ---
    if not epoch % 100:
        # Get predictions from the activation output
        predictions = np.argmax(loss_activation.output, axis=1)
        accuracy = np.mean(predictions == y)
        print(f'epoch: {epoch}, ' +
              f'acc: {accuracy:.3f}, ' +
              f'loss: {loss:.3f}, ' +
              f'lr: {optimizer.current_learning_rate:.4f}')

    # --- Backward pass, from the loss back to the first layer ---
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)

    # --- Parameter update ---
    # Update learning rate (if decay is used)
    optimizer.pre_update_params()
    # Update the weights and biases of each layer
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    # Increment iteration count
    optimizer.post_update_params()

Running with: SGD with Momentum
epoch: 0, acc: 0.360, loss: 1.099, lr: 0.2000
epoch: 100, acc: 0.407, loss: 1.079, lr: 0.1980
epoch: 200, acc: 0.410, loss: 1.076, lr: 0.1961
epoch: 300, acc: 0.403, loss: 1.072, lr: 0.1942
epoch: 400, acc: 0.423, loss: 1.063, lr: 0.1923
epoch: 500, acc: 0.447, loss: 1.039, lr: 0.1905
epoch: 600, acc: 0.537, loss: 0.997, lr: 0.1887
epoch: 700, acc: 0.610, loss: 0.933, lr: 0.1869
epoch: 800, acc: 0.623, loss: 0.870, lr: 0.1852
epoch: 900, acc: 0.657, loss: 0.818, lr: 0.1835
epoch: 1000, acc: 0.713, loss: 0.785, lr: 0.1818
