## Import libraries

In [183]:
import numpy as np
from tqdm import trange

# Layers

A general layer class

In [184]:
class Layer:
    """Common base class for the layers defined in this notebook.

    Sub-classes override ``forward`` and ``backward``; ``step`` is a no-op
    unless a sub-class overrides it.
    """

    def __init__(self):
        # Both attributes start out as None; the base class never reassigns them.
        self.input_, self.output = None, None

    def forward(self, input_: np.ndarray):
        """Not implemented in the base class; always raises NotImplementedError."""
        raise NotImplementedError('This method should be implemented')

    def backward(self, upstream_grad: np.ndarray):
        """Not implemented in the base class; always raises NotImplementedError."""
        raise NotImplementedError('This method should be implemented')

    def step(self, lr):
        """Do nothing and return None; ``lr`` is ignored by the base class."""
        return None

### Linear layer 

`Fully connected` layer

To calculate forward phase:

### $Y = XW + B$

Where `Y` is `output`, `X` is `dataset` (input), `W` is `weight matrix` and `B` is `bias matrix`

- To update the weights we use the backpropagation algorithm and gradient descent:

    $ \frac{\partial L}{\partial W} = \frac{\partial Y}{\partial W} . \frac{\partial L}{\partial Y}$

    $\frac{\partial Y}{\partial W} = X^T$

    => $\frac{\partial L}{\partial W} = X^T . \frac{\partial L}{\partial Y}$

- To calculate upstream gradient for the previous layer (downstream gradient):

    $ \frac{\partial L}{\partial X} = \frac{\partial L}{\partial Y} . \frac{\partial Y}{\partial X}$

    $\frac{\partial Y}{\partial X} = W^T$

    => $\frac{\partial L}{\partial X} = \frac{\partial L}{\partial Y} . W^T$

- For updating bias we can calculate:

    $ \frac{\partial L}{\partial B} = \sum_{i=1}^{m}\frac{\partial L}{\partial \mathbf{y}_i}$

    the gradient $\frac{\partial L}{\partial \mathbf{y}_i}$ represents the loss derivative with respect to the output for the i-th sample

Where `L` is Loss function and $\frac{\partial L}{\partial Y}$ is upstream gradient

In this notebook we will use the Adam optimization algorithm to update the weights and biases.

Adam optimization is an extension to stochastic gradient descent that has recently seen broader adoption for deep learning applications in computer vision and natural language processing. The algorithm leverages the power of adaptive learning rates to find individual learning rates for each parameter.

The Adam update rules are as follows:

1. Compute the moving averages of the gradients and the squared gradients:
    $$
    m_t = \beta_1 m_{t-1} + (1 - \beta_1) g_t
    $$
    $$
    v_t = \beta_2 v_{t-1} + (1 - \beta_2) g_t^2
    $$

2. Correct the bias in the first and second moment estimates:
    $$
    \hat{m}_t = \frac{m_t}{1 - \beta_1^t}
    $$
    $$
    \hat{v}_t = \frac{v_t}{1 - \beta_2^t}
    $$

3. Update the parameters:
    $$
    \omega_t = \omega_{t-1} - \alpha \frac{\hat{m}_t}{\sqrt{\hat{v}_t} + \epsilon}
    $$

Where:
- $\beta_1$ and $\beta_2$ are the decay rates for the moment estimates
- $\epsilon$ is a small constant to prevent division by zero.
- $\alpha$ is the learning rate.
- $g_t$ is the gradient at time step $t$.

In [185]:
class Linear(Layer):
    """Fully connected layer computing ``Y = XW + B``, updated with Adam.

    Args:
        features_in: Number of rows of the weight matrix (input width).
        features_out: Number of columns of the weight matrix (output width).
        beta1: Decay rate of the first-moment (gradient mean) estimate.
        beta2: Decay rate of the second-moment (squared gradient) estimate.
        epsilon: Constant added to the denominator of the Adam update.
    """

    def __init__(self, features_in, features_out, beta1=0.9, beta2=0.999, epsilon=1e-8):
        # Ensure ``input_`` and ``output`` exist before the first forward pass.
        super().__init__()

        # Weights: standard normal scaled by sqrt(2 / features_in).
        # Biases: standard normal scaled by 0.1.
        self.w = np.random.randn(features_in, features_out) * np.sqrt(2.0 / features_in)
        self.b = np.random.randn(1, features_out) * 0.1

        # Gradients are filled in by ``backward``.
        self.gradw = None
        self.gradb = None

        # Adam state: first (m) and second (v) moment estimates per parameter.
        self.mw = np.zeros_like(self.w)
        self.vw = np.zeros_like(self.w)
        self.mb = np.zeros_like(self.b)
        self.vb = np.zeros_like(self.b)
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.t = 0  # number of ``step`` calls so far, used for bias correction

    def forward(self, input_: np.ndarray):
        """Cache ``input_`` and return ``input_ @ w + b``."""
        self.input_ = input_
        self.output = (input_ @ self.w) + self.b
        return self.output

    def backward(self, upstream_grad: np.ndarray):
        """Store the parameter gradients and return the gradient w.r.t. the input.

        Args:
            upstream_grad: Gradient of the loss with respect to this layer's output.

        Returns:
            ``upstream_grad @ w.T``, the gradient with respect to the cached input.
        """
        self.gradw = self.input_.T @ upstream_grad
        # The bias is broadcast over rows, so its gradient is the sum over axis 0.
        self.gradb = np.sum(upstream_grad, axis=0, keepdims=True)

        return upstream_grad @ self.w.T

    def _adam_update(self, param, grad, m, v, lr):
        """Apply one Adam update to ``param`` in place; return the new ``(m, v)``."""
        m = self.beta1 * m + (1 - self.beta1) * grad
        v = self.beta2 * v + (1 - self.beta2) * (grad ** 2)

        # Bias-corrected moment estimates.
        m_hat = m / (1 - self.beta1 ** self.t)
        v_hat = v / (1 - self.beta2 ** self.t)

        param -= lr * m_hat / (np.sqrt(v_hat) + self.epsilon)
        return m, v

    def step(self, lr=0.1):
        """Run one Adam optimization step on the weights and the bias.

        Uses the gradients stored by the most recent ``backward`` call.

        Args:
            lr: Learning rate.
        """
        self.t += 1
        self.mw, self.vw = self._adam_update(self.w, self.gradw, self.mw, self.vw, lr)
        self.mb, self.vb = self._adam_update(self.b, self.gradb, self.mb, self.vb, lr)

    def __str__(self):
        return 'I am linear!'

### Sigmoid layer

Sigmoid function is defined as:
$$
\sigma(z) = \frac{1}{1 + e^{-z}}
$$
which is forward phase.

and for the backward phase we need to get the first derivative of sigmoid function which is:

$$
\frac{d\sigma(z)}{dz} = \sigma(z)\,(1 - \sigma(z))
$$

In [186]:
class Sigmoid(Layer):
    """Element-wise logistic sigmoid activation."""

    def forward(self, input_: np.ndarray):
        """Return ``1 / (1 + exp(-x))`` for ``input_`` clipped to [-500, 500].

        The clipping bounds the argument of ``np.exp`` so that very large
        magnitude inputs do not overflow in double precision.
        """
        clipped_input = np.clip(input_, -500, 500)
        # Use the clipped values; the raw input is what overflows ``np.exp``.
        self.output = 1 / (1 + np.exp(-clipped_input))
        return self.output

    def backward(self, upstream_grad: np.ndarray):
        """Scale ``upstream_grad`` by the sigmoid derivative ``s * (1 - s)``.

        ``s`` is the output cached by the last ``forward`` call.
        """
        downstream_grad = upstream_grad * (self.output * (1 - self.output))
        return downstream_grad

    def __str__(self):
        return 'I am sigmoid!'

### RELU layer

Relu function is defined as:

$$
ReLU(z) = max(0, z)
$$
which is forward phase.

and for the backward phase we take the first derivative of Relu:

$$
\frac{d}{dz} \text{ReLU}(z) = \begin{cases} 
0 & \text{if } z \leq 0, \\
1 & \text{if } z > 0.
\end{cases}
$$

In [187]:
class ReLU(Layer):
    """Element-wise rectified linear unit, ``max(0, x)``."""

    def forward(self, input_: np.ndarray):
        """Cache the input and return it with negative entries replaced by 0."""
        self.input_ = input_
        rectified = np.maximum(0, input_)
        self.output = rectified
        return rectified

    def backward(self, upstream_grad: np.ndarray):
        """Pass the gradient through only where the cached input was positive."""
        positive_mask = self.input_ > 0
        # Cast the boolean mask to the gradient's dtype before multiplying.
        return upstream_grad * positive_mask.astype(upstream_grad.dtype)

    def __str__(self):
        return 'I am ReLU'

### Softmax Layer

Softmax function for the forward phase is defined as:

$$\text{Softmax}(\mathbf{z})_i = \frac{e^{z_i}}{\sum_{j=1}^C e^{z_j}}$$


and for the backward phase we have to calculate the Jacobian matrix:

since
$\frac{\partial S_i}{\partial z_j} = -S_iS_j$ for $i \neq j$
and
$\frac{\partial S_i}{\partial z_i} = S_i(1-S_i)$

$$
\frac{\partial S}{\partial z} = \mathbf{J} =
\begin{bmatrix}
S_1(1 - S_1) & -S_1 S_2 & -S_1 S_3 & \cdots & -S_1 S_C \\
-S_2 S_1 & S_2(1 - S_2) & -S_2 S_3 & \cdots & -S_2 S_C \\
-S_3 S_1 & -S_3 S_2 & S_3(1 - S_3) & \cdots & -S_3 S_C \\
\vdots & \vdots & \vdots & \ddots & \vdots \\
-S_C S_1 & -S_C S_2 & -S_C S_3 & \cdots & S_C(1 - S_C)
\end{bmatrix}
$$


In [188]:
class Softmax(Layer):
    """Row-wise softmax over axis 1 of a 2-D input."""

    def forward(self, input_: np.ndarray):
        """Return the softmax of each row of ``input_`` and cache it.

        The row maximum is subtracted before exponentiating to prevent
        overflow; this does not change the result.
        """
        fixed_values = input_ - np.max(input_, axis=1, keepdims=True)
        scores = np.exp(fixed_values)
        sum_scores = np.sum(scores, axis=1, keepdims=True)

        self.output = scores / sum_scores
        return self.output

    def backward(self, upstream_grad: np.ndarray):
        """Multiply each row of ``upstream_grad`` by that row's softmax Jacobian.

        With ``J = diag(S) - S S^T`` the product ``J @ g`` simplifies to
        ``S * (g - sum(g * S))``, so the Jacobian is never materialised and
        all rows are processed in one vectorised expression.

        Args:
            upstream_grad: Gradient of the loss with respect to the softmax
                output, with the same shape as the cached output.

        Returns:
            Gradient of the loss with respect to the softmax input.
        """
        # Per-row dot product between the upstream gradient and the softmax output.
        weighted_sum = np.sum(upstream_grad * self.output, axis=1, keepdims=True)
        return self.output * (upstream_grad - weighted_sum)

# Loss classes

### Mean Squared Error

The mean squared error (MSE) function measures the average of the squares of the errors—that is, the average squared difference between the estimated values and the actual value.

The `MSE` class implements this function in both the forward and backward phases.

For the forward phase, the formula is:
$$
\text{MSE} = \frac{1}{n} \sum_{i=1}^{n} (y_{pred_i} - y_{true_i})^2
$$

For the backward phase, the gradient of the loss with respect to the predictions is:
$$
\frac{\partial \text{MSE}}{\partial y_{pred}} = \frac{2}{n} (y_{pred} - y_{true})
$$

In [189]:
class MSE(Layer):
    """Mean squared error loss, averaged over every element."""

    def forward(self, y_true: np.ndarray, y_pred: np.ndarray):
        """Cache both arrays and return ``mean((y_pred - y_true) ** 2)``.

        Args:
            y_true: Target values.
            y_pred: Predicted values.

        Returns:
            The mean of the squared differences over all elements.
        """
        self.y_pred = y_pred
        self.y_true = y_true
        self.output = np.mean((y_pred - y_true) ** 2)
        return self.output

    def backward(self):
        """Return the gradient of the loss with respect to ``y_pred``.

        ``forward`` averages over all elements of the difference, so the
        gradient is divided by that same element count rather than by the
        number of rows only.
        """
        diff = self.y_pred - self.y_true
        upstream_grad = (2 / diff.size) * diff
        return upstream_grad

    def __str__(self):
        return 'I am MSE'

### Cross entropy

The Cross Entropy loss function is commonly used in classification problems. It measures the performance of a classification model whose output is a probability value between 0 and 1. The loss increases as the predicted probability diverges from the actual label.

For the forward phase, the formula is:
$$
L = -\frac{1}{N} \sum_{i=1}^{N} \sum_{c=1}^{C} y_{i,c} \log(\hat{y}_{i,c})
$$
where:
- $N$ is the number of samples
- $C$ is the number of classes
- $y_{i,c}$ is the true label (one-hot encoded) for sample $i$ and class $c$
- $\hat{y}_{i,c}$ is the predicted probability for sample $i$ and class $c$

For the backward phase, the gradient of the loss with respect to the predictions is:
$$
\frac{\partial L}{\partial \hat{y}} = -\frac{1}{N} \frac{y}{\hat{y}}
$$

In [None]:
class CrossEntropy(Layer):
    """Cross-entropy loss between target rows and predicted probabilities."""

    def forward(self, y_true: np.ndarray, y_pred: np.ndarray):
        """Cache the inputs and return the mean per-row cross-entropy.

        Predictions are clipped to [1e-12, 1.0] before the logarithm so that
        a zero probability does not produce an undefined value.
        """
        self.y_pred, self.y_true = y_pred, y_true

        # Keep the clipped predictions: ``backward`` divides by them.
        self.clip_y_pred = np.clip(y_pred, 1e-12, 1.0)

        per_sample = np.sum(self.y_true * np.log(self.clip_y_pred), axis=1)
        self.loss = -np.mean(per_sample)
        return self.loss

    def backward(self):
        """Return the gradient of the loss with respect to the predictions."""
        n_samples = self.y_true.shape[0]
        ratio = -self.y_true / self.clip_y_pred
        return ratio / n_samples

# Multi Layer Perceptron

In [None]:
class MLP:
    """
    A simple Multi-Layer Perceptron (MLP) class for training neural networks.

    This class chains a list of layers, a loss object and a learning rate.
    It supports forward and backward propagation, as well as parameter updates
    through each layer's ``step`` method.

    Attributes:
        layers (list[Layer]): A list of Layer objects defining the structure of the MLP.
        loss_method (Layer): A loss function layer used to compute the error and gradients during training.
        lr (float): The learning rate passed to every layer's ``step``.
        outputs (np.ndarray): The output of the most recent forward pass.

    Methods:
        forward(input_: np.ndarray) -> np.ndarray:
            Performs a forward pass through the network using the given input data.

        backward() -> None:
            Computes the gradients via backpropagation starting from the loss function.

        update_weigths() -> None:
            Calls ``step`` on every layer with the configured learning rate.

        train(input_: np.ndarray, y: np.ndarray, epoches=100, batch_size=1) -> list:
            Runs ``epoches`` training iterations, each on one random mini-batch.
            Returns the list of per-iteration training losses.

        predict(input_: np.ndarray) -> np.ndarray:
            Performs a forward pass for prediction with the given input.
    """

    def __init__(self, layers: list[Layer], loss_method: Layer, lr=0.01):
        """
        Initializes the MLP with the given layers, loss function, and learning rate.

        Args:
            layers (list[Layer]): A list of Layer objects defining the MLP architecture.
            loss_method (Layer): A loss function layer used to compute errors and gradients.
            lr (float, optional): The learning rate used for weight updates. Default is 0.01.
        """
        self.layers = layers
        self.loss_method = loss_method
        self.lr = lr
        self.outputs = None

    def forward(self, input_: np.ndarray):
        """
        Performs a forward pass through the MLP network.

        Args:
            input_ (np.ndarray): The input data to the network.

        Returns:
            np.ndarray: The output of the last layer.
        """
        self.outputs = input_
        for layer in self.layers:
            self.outputs = layer.forward(self.outputs)

        return self.outputs

    def backward(self):
        """
        Performs backpropagation to compute gradients for all layers.

        Starts from the loss object's ``backward()`` and feeds each layer's
        returned gradient to the layer before it. The loss object's
        ``forward`` must have been called first, as ``train`` does.
        """
        upstream_grad = self.loss_method.backward()
        for layer in reversed(self.layers):
            upstream_grad = layer.backward(upstream_grad)

    def update_weigths(self):
        """
        Calls ``step(self.lr)`` on every layer, in order.
        """
        for layer in self.layers:
            layer.step(self.lr)

    def train(self, input_: np.ndarray, y: np.ndarray, epoches=100, batch_size=1):
        """
        Trains the MLP on the provided data.

        Each of the ``epoches`` iterations draws one random mini-batch of
        ``batch_size`` rows (sampled with replacement), runs a forward pass,
        records the loss, backpropagates and updates the weights. An "epoch"
        here is therefore a single mini-batch step, not a full pass over the data.

        Args:
            input_ (np.ndarray): The input training data; samples are indexed along axis 0.
            y (np.ndarray): The target labels, indexed along axis 0 like ``input_``.
            epoches (int, optional): The number of training iterations. Default is 100.
            batch_size (int, optional): The number of rows drawn per iteration. Default is 1.

        Returns:
            list: The training loss of each iteration, in order.
        """
        # Only the sample count is needed, so inputs with any number of
        # trailing dimensions are accepted.
        n_samples = input_.shape[0]

        # Apply one shared permutation to inputs and targets.
        random_shuffle = np.random.permutation(n_samples)
        shuffled_data_x = input_[random_shuffle]
        shuffled_data_y = y[random_shuffle]

        train_losses = []

        for _ in trange(epoches):
            random_indices = np.random.choice(n_samples, size=batch_size)
            batch_data_x = shuffled_data_x[random_indices]
            batch_data_y = shuffled_data_y[random_indices]

            predicts = self.forward(batch_data_x)

            train_loss = self.loss_method.forward(batch_data_y, predicts)
            train_losses.append(train_loss)

            self.backward()
            self.update_weigths()

        return train_losses

    def predict(self, input_: np.ndarray):
        """
        Makes predictions on new input data by running a forward pass.

        Args:
            input_ (np.ndarray): The input data for prediction.

        Returns:
            np.ndarray: The predicted output for the given input.
        """
        return self.forward(input_)