**What is Regularization?**  
Regularization discourages overly large weights and improves generalization.

Total Regularization Cost:
- $ L_\text{reg} = \lambda_1 \sum |w| + \lambda_2 \sum w^2 + \dots $  
Where:
- $ \lambda_1 $ = L1 regularization coefficient
- $ \lambda_2 $ = L2 regularization coefficient
- $ \dots $ = the same L1 and L2 penalties applied to the biases, each with its own coefficient


In [None]:
# Regularization loss
class Loss:
    """Base class for losses: data-loss averaging plus L1/L2 penalties."""

    def calculate(self, output, y):
        """Return the mean data loss for a batch.

        The per-sample losses come from the subclass's
        ``forward(output, y)``; this method only averages them.

        Args:
            output: Model predictions, passed through to ``forward``.
            y: Ground-truth targets, passed through to ``forward``.

        Returns:
            The ``np.mean`` of the per-sample losses.
        """
        sample_losses = self.forward(output, y)
        return np.mean(sample_losses)

    def regularization_loss(self, layer):
        """Return the combined L1/L2 penalty for one layer.

        Args:
            layer: Object exposing ``weights``, ``biases`` and the four
                coefficients ``weight_regularizer_l1``,
                ``weight_regularizer_l2``, ``bias_regularizer_l1`` and
                ``bias_regularizer_l2``.

        Returns:
            Sum of every penalty whose coefficient is greater than zero;
            ``0`` when none is enabled.
        """
        regularization_loss = 0

        # L1 terms: coefficient * sum of absolute values.
        if layer.weight_regularizer_l1 > 0:
            regularization_loss += layer.weight_regularizer_l1 * np.sum(np.abs(layer.weights))
        # L2 terms: coefficient * sum of squares.
        if layer.weight_regularizer_l2 > 0:
            regularization_loss += layer.weight_regularizer_l2 * np.sum(layer.weights * layer.weights)
        if layer.bias_regularizer_l1 > 0:
            regularization_loss += layer.bias_regularizer_l1 * np.sum(np.abs(layer.biases))
        if layer.bias_regularizer_l2 > 0:
            regularization_loss += layer.bias_regularizer_l2 * np.sum(layer.biases * layer.biases)

        return regularization_loss


**Categorical Cross-Entropy (CCE):**  
Loss for multi-class classification:

For sample $i$: $ L_i = -\log p(y_i) $

$ p(y_i) $ = predicted probability for the correct class.

If labels are one-hot:
$ p(y_i) = \sum_j (y_\text{true}[j] \cdot y_\text{pred}[j]) $


In [None]:
# Categorical Cross-Entropy
class Loss_CategoricalCrossentropy(Loss):
    """Categorical cross-entropy loss for multi-class classification."""

    def forward(self, y_pred, y_true):
        """Return the per-sample loss ``-log(p_correct)``.

        Args:
            y_pred: Predicted class probabilities, one row per sample.
            y_true: Either a 1-D sequence of class indices or a 2-D
                one-hot array with one row per sample.

        Returns:
            1-D array of per-sample losses.

        Raises:
            ValueError: If ``y_true`` is neither 1-D nor 2-D.
        """
        y_true = np.asarray(y_true)
        samples = len(y_pred)
        # Clip on both sides so log(0) cannot occur.
        y_pred_clipped = np.clip(y_pred, 1e-7, 1-1e-7)

        if y_true.ndim == 1:
            # Sparse labels: pick the probability at each sample's class index.
            correct_confidences = y_pred_clipped[range(samples), y_true]
        elif y_true.ndim == 2:
            # One-hot labels: the mask keeps only the correct-class probability.
            correct_confidences = np.sum(y_pred_clipped * y_true, axis=1)
        else:
            raise ValueError(
                f'y_true must be 1-D class indices or 2-D one-hot labels, '
                f'got {y_true.ndim} dimension(s)'
            )

        return -np.log(correct_confidences)

    def backward(self, dvalues, y_true):
        """Store the gradient of the loss w.r.t. the predictions.

        Args:
            dvalues: Predicted probabilities, one row per sample.
            y_true: 1-D class indices or 2-D one-hot labels.

        The result is written to ``self.dinputs`` and is already divided
        by the number of samples.
        """
        y_true = np.asarray(y_true)
        samples = len(dvalues)
        labels = len(dvalues[0])
        if y_true.ndim == 1:
            # Expand class indices to one-hot rows.
            y_true = np.eye(labels)[y_true]

        # Floor the denominator at the same epsilon forward() uses: a
        # probability of exactly 0 would otherwise give 0/0 = nan for
        # non-target classes and -inf for the target class.
        safe_dvalues = np.maximum(dvalues, 1e-7)

        self.dinputs = -y_true / safe_dvalues
        self.dinputs = self.dinputs / samples


**Binary Cross-Entropy (BCE):**  
Loss for binary classification:

$ L_i = -[y_i \log p_i + (1-y_i) \log (1-p_i)] $


In [None]:
# Binary Cross-Entropy
class Loss_BinaryCrossentropy(Loss):
    """Binary cross-entropy loss for one or more independent binary outputs."""

    def forward(self, y_pred, y_true):
        """Return the per-sample loss, averaged over the last axis."""
        # Keep probabilities away from 0 and 1 so both logs stay finite.
        probs = np.clip(y_pred, 1e-7, 1-1e-7)
        positive_term = y_true * np.log(probs)
        negative_term = (1-y_true) * np.log(1-probs)
        return np.mean(-(positive_term + negative_term), axis=-1)

    def backward(self, dvalues, y_true):
        """Store the gradient w.r.t. the predictions in ``self.dinputs``."""
        n_samples = len(dvalues)
        n_outputs = len(dvalues[0])
        # Same clipping as forward(), so neither denominator can be zero.
        probs = np.clip(dvalues, 1e-7, 1-1e-7)

        gradient = -(y_true / probs - (1-y_true) / (1-probs))
        # Normalize by the output count first, then by the sample count.
        self.dinputs = gradient / n_outputs / n_samples


**Training Loop Concept:**  
- Forward Pass:
    - $ X \to \text{dense1.forward()} \to \text{activation1.forward()} $
    - $ \to \dots \to \text{loss.forward()} $
- Backward Pass:
    - Gradients move backward using the chain rule:
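    $ \frac{\partial L}{\partial \theta_k} = \frac{\partial L}{\partial a_n} \cdot \frac{\partial a_n}{\partial a_{n-1}} \cdot \dots \cdot \frac{\partial a_k}{\partial \theta_k} $, where $ a_k $ is the output of layer $ k $ and $ \theta_k $ are its parameters.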
- Optimization:
    - Parameters updated using the chosen optimization technique


In [None]:
# Final Training Loop
# Train for 10001 epochs (0..10000); each epoch runs one forward pass, one
# backward pass and one optimizer step on X / y.
for epoch in range(10001):
    # Forward pass: dense1 -> activation1 -> dense2 -> activation2.
    dense1.forward(X)
    activation1.forward(dense1.output)

    dense2.forward(activation1.output)
    activation2.forward(dense2.output)

    # Data loss from the predictions, plus the penalty of both dense layers.
    data_loss = loss_function.calculate(activation2.output, y)
    regularization_loss = loss_function.regularization_loss(dense1) + loss_function.regularization_loss(dense2)

    loss = data_loss + regularization_loss
    # Threshold the outputs at 0.5 to get 0/1 predictions.
    predictions = (activation2.output > 0.5) * 1
    # NOTE(review): assumes y has the same shape as activation2.output; a
    # mismatch would broadcast in the comparison -- confirm.
    accuracy = np.mean(predictions == y)

    # Report progress every 100th epoch (metrics are pre-update).
    if epoch % 100 == 0:
        print(f'epoch: {epoch}, acc: {accuracy:.3f}, loss: {loss:.3f} (data_loss: {data_loss:.3f}, reg_loss: {regularization_loss:.3f}), lr: {optimizer.current_learning_rate}')

    # Backward pass: each step consumes the dinputs of the step after it,
    # in reverse order of the forward pass.
    loss_function.backward(activation2.output, y)
    activation2.backward(loss_function.dinputs)
    dense2.backward(activation2.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)

    # Optimizer step for both dense layers, bracketed by the optimizer's
    # pre/post hooks (defined outside this section).
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()


**Validation:**  
Evaluate the trained model on a separate test set:

- Forward Pass:
    $ X_\text{test} \to \dots \to \text{activation2.output} $
- Compare predicted labels vs actual labels
- Output the final accuracy and loss


In [None]:
# Final Validation
# Forward pass only, on the held-out data: no backward pass and no
# optimizer step happen here.
dense1.forward(X_test)
activation1.forward(dense1.output)
dense2.forward(activation1.output)
activation2.forward(dense2.output)

# Data loss only: the regularization penalty is not added here.
loss = loss_function.calculate(activation2.output, y_test)
# Threshold the outputs at 0.5 to get 0/1 predictions.
predictions = (activation2.output > 0.5) * 1
# NOTE(review): assumes y_test has the same shape as activation2.output --
# confirm.
accuracy = np.mean(predictions == y_test)

print(f'Validation, acc: {accuracy:.3f}, loss: {loss:.3f}')
