In [183]:
import numpy as np
from tqdm import trange

# Layers

In [184]:
class Layer:
    """Abstract base for every building block of the network.

    Concrete layers override ``forward`` and ``backward``. ``step`` does
    nothing here, so layers without trainable parameters can inherit it.
    """

    def __init__(self):
        # Caches that subclasses may fill in during the forward pass.
        self.input_ = self.output = None

    def forward(self, input_: np.ndarray):
        """Compute the layer output for ``input_``; must be overridden."""
        raise NotImplementedError('This method should be implemented')

    def backward(self, upstream_grad: np.ndarray):
        """Propagate ``upstream_grad`` back through the layer; must be overridden."""
        raise NotImplementedError('This method should be implemented')

    def step(self, lr):
        """Apply a parameter update with learning rate ``lr``; no-op by default."""
        return None

### Linear layer 

In [185]:
class Linear(Layer):
    """Fully connected layer ``output = input_ @ w + b`` with a built-in Adam update.

    Attributes:
        w: weight matrix of shape ``(features_in, features_out)``.
        b: bias row vector of shape ``(1, features_out)``.
        gradw, gradb: gradients of the loss w.r.t. ``w`` and ``b``; ``None``
            until ``backward`` has been called.
        mw, vw, mb, vb: Adam first/second moment estimates for ``w`` and ``b``.
        t: number of Adam steps taken so far (used for bias correction).
    """

    def __init__(self, features_in, features_out, beta1=0.9, beta2=0.999, epsilon=1e-8):
        """Create the layer.

        Args:
            features_in: size of the last axis of the expected input.
            features_out: number of output units.
            beta1: Adam decay rate for the first-moment estimate.
            beta2: Adam decay rate for the second-moment estimate.
            epsilon: small constant added to the Adam denominator.
        """
        # Initialise the base-class caches so `input_` / `output` exist (as
        # None) before the first forward pass.
        super().__init__()

        # Weights are drawn from N(0, 1) and scaled by sqrt(2 / features_in).
        self.w = np.random.randn(features_in, features_out) * np.sqrt(2.0 / features_in)
        self.b = np.random.randn(1, features_out) * 0.1

        self.gradw = None
        self.gradb = None

        # Adam state: running moments start at zero, step counter at 0.
        self.mw = np.zeros_like(self.w)
        self.vw = np.zeros_like(self.w)
        self.mb = np.zeros_like(self.b)
        self.vb = np.zeros_like(self.b)
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.t = 0

    def forward(self, input_: np.ndarray):
        """Cache ``input_`` and return ``input_ @ w + b``."""
        self.input_ = input_
        self.output = (input_ @ self.w) + self.b
        return self.output

    def backward(self, upstream_grad: np.ndarray):
        """Store parameter gradients and return the gradient w.r.t. the input.

        Args:
            upstream_grad: gradient of the loss w.r.t. this layer's output.

        Returns:
            ``upstream_grad @ w.T``, the gradient w.r.t. the cached input.
        """
        self.gradw = self.input_.T @ upstream_grad
        # The bias is broadcast over axis 0 in forward, so its gradient sums over it.
        self.gradb = np.sum(upstream_grad, axis=0, keepdims=True)

        return upstream_grad @ self.w.T

    def step(self, lr=0.1):
        """Apply one Adam update to ``w`` and ``b`` using the stored gradients.

        Args:
            lr: learning rate.

        Raises:
            RuntimeError: if no gradients are available because ``backward``
                has not been called yet.
        """
        # Check before touching any state, so a premature call cannot advance
        # `t` and skew later bias corrections.
        if self.gradw is None or self.gradb is None:
            raise RuntimeError('Linear.step() called before backward(): no gradients to apply')

        # Adam optimization
        self.t += 1
        self.mw, self.vw = self._adam_update(self.w, self.gradw, self.mw, self.vw, lr)
        self.mb, self.vb = self._adam_update(self.b, self.gradb, self.mb, self.vb, lr)

    def _adam_update(self, param, grad, m, v, lr):
        """Update ``param`` in place with one Adam step; return the new ``(m, v)``."""
        m = self.beta1 * m + (1 - self.beta1) * grad
        v = self.beta2 * v + (1 - self.beta2) * (grad ** 2)

        # Bias-corrected moment estimates.
        m_hat = m / (1 - self.beta1 ** self.t)
        v_hat = v / (1 - self.beta2 ** self.t)

        # In-place so the array referenced by self.w / self.b is the one modified.
        param -= lr * m_hat / (np.sqrt(v_hat) + self.epsilon)
        return m, v

    def __str__(self):
        return 'I am linear!'

### Sigmoid layer

In [186]:
class Sigmoid(Layer):
    """Element-wise logistic activation ``1 / (1 + exp(-x))``."""

    def forward(self, input_: np.ndarray):
        """Return the sigmoid of ``input_`` and cache it for ``backward``."""
        # Clip before exponentiating so np.exp cannot overflow on large
        # negative inputs. The +/-500 bound is safe for float64;
        # NOTE(review): float32 inputs would need a tighter bound.
        clipped_input = np.clip(input_, -500, 500)
        self.output = 1 / (1 + np.exp(-clipped_input))
        return self.output

    def backward(self, upstream_grad: np.ndarray):
        """Return ``upstream_grad * s * (1 - s)`` where ``s`` is the cached output."""
        downstream_grad = upstream_grad * (self.output * (1 - self.output))
        return downstream_grad

    def __str__(self):
        return 'I am sigmoid!'

### ReLU layer

In [187]:
class ReLU(Layer):
    """Rectified linear unit: keeps positive entries and zeroes the rest."""

    def forward(self, input_: np.ndarray):
        """Cache ``input_`` and return its element-wise maximum with 0."""
        self.input_ = input_
        activated = np.maximum(0, input_)
        self.output = activated
        return activated

    def backward(self, upstream_grad: np.ndarray):
        """Pass ``upstream_grad`` through only where the cached input was > 0."""
        # Boolean gate cast to the gradient's dtype so the product keeps that dtype.
        gate = self.input_ > 0
        return upstream_grad * gate.astype(upstream_grad.dtype)

    def __str__(self):
        return 'I am ReLU'

### Softmax Layer

For $i \neq j$: $\frac{\partial S_i}{\partial z_j} = -S_iS_j$

For $i = j$: $\frac{\partial S_i}{\partial z_i} = S_i(1-S_i)$

$$
\frac{\partial S}{\partial z} = \mathbf{J} =
\begin{bmatrix}
S_1(1 - S_1) & -S_1 S_2 & -S_1 S_3 & \cdots & -S_1 S_C \\
-S_2 S_1 & S_2(1 - S_2) & -S_2 S_3 & \cdots & -S_2 S_C \\
-S_3 S_1 & -S_3 S_2 & S_3(1 - S_3) & \cdots & -S_3 S_C \\
\vdots & \vdots & \vdots & \ddots & \vdots \\
-S_C S_1 & -S_C S_2 & -S_C S_3 & \cdots & S_C(1 - S_C)
\end{bmatrix}
$$


In [188]:
class Softmax(Layer):
    """Row-wise softmax over axis 1 of a 2-D input."""

    def forward(self, input_: np.ndarray):
        """Return softmax probabilities for each row of ``input_``.

        The row maximum is subtracted before exponentiating; this leaves the
        result unchanged mathematically but prevents overflow in ``np.exp``.
        """
        shifted = input_ - np.max(input_, axis=1, keepdims=True)
        scores = np.exp(shifted)
        self.output = scores / np.sum(scores, axis=1, keepdims=True)
        return self.output

    def backward(self, upstream_grad: np.ndarray):
        """Return the gradient w.r.t. the softmax input.

        For one row with output ``s`` and upstream gradient ``g`` the Jacobian
        is ``J = diag(s) - s s^T``, so ``J @ g = s * (g - sum(g * s))``.
        Evaluating that identity for all rows at once avoids building a
        (classes x classes) matrix per sample in a Python loop, and the result
        takes a floating dtype instead of being cast to ``upstream_grad``'s
        dtype.
        """
        probs = self.output
        # Per-row dot product <g, s>, kept as a column for broadcasting.
        weighted_sum = np.sum(upstream_grad * probs, axis=1, keepdims=True)
        return probs * (upstream_grad - weighted_sum)

# Loss classes

### Mean Squared Error

In [189]:
class MSE(Layer):
    """Mean squared error, averaged over every element of the prediction."""

    def forward(self, y_true: np.ndarray, y_pred: np.ndarray):
        """Cache both arrays and return ``mean((y_pred - y_true) ** 2)``."""
        self.y_pred = y_pred
        self.y_true = y_true
        self.output = np.mean((y_pred - y_true) ** 2)
        return self.output

    def backward(self):
        """Return the gradient of the loss w.r.t. ``y_pred``.

        ``forward`` averages over all elements, so the derivative is
        ``2 * (y_pred - y_true) / number_of_elements``. Dividing by the batch
        size alone would make the gradient too large by the number of output
        columns whenever there is more than one.
        """
        diff = self.y_pred - self.y_true
        # diff.size is the element count np.mean divided by in forward.
        upstream_grad = (2 / diff.size) * diff
        return upstream_grad

    def __str__(self):
        return 'I am MSE'

### Cross entropy

In [190]:
class CrossEntropy(Layer):
    """Cross-entropy loss: sum over axis 1, then mean over the remaining axis.

    NOTE(review): presumably ``y_true`` holds one-hot (or probability) rows and
    ``y_pred`` holds per-class probabilities of the same shape; confirm with callers.
    """

    def forward(self, y_true: np.ndarray, y_pred: np.ndarray):
        """Cache the inputs and return ``-mean(sum(y_true * log(y_pred), axis=1))``."""
        self.y_true = y_true
        self.y_pred = y_pred

        # Keep predictions inside [1e-12, 1.0] so log() never receives zero.
        self.clip_y_pred = np.clip(y_pred, 1e-12, 1.0)

        weighted_log = self.y_true * np.log(self.clip_y_pred)
        per_sample = np.sum(weighted_log, axis=1)
        self.loss = -np.mean(per_sample)
        return self.loss

    def backward(self):
        """Return ``-y_true / clipped_y_pred / batch_size`` for the cached inputs."""
        batch_size = self.y_true.shape[0]
        return -self.y_true / self.clip_y_pred / batch_size

# Multi Layer Perceptron

In [191]:
class MLP:
    """Sequential stack of layers trained with a loss object.

    Attributes:
        layers: layers applied in order by ``forward``.
        loss_method: object exposing ``forward(y_true, y_pred)`` and ``backward()``.
        lr: learning rate passed to every layer's ``step``.
        outputs: result of the most recent forward pass (``None`` before any).
    """

    def __init__(self, layers: list[Layer], loss_method: Layer, lr=0.01):
        self.layers = layers
        self.loss_method = loss_method
        self.lr = lr
        self.outputs = None

    def forward(self, input_: np.ndarray):
        """Feed ``input_`` through every layer in order and return the result."""
        self.outputs = input_
        for layer in self.layers:
            self.outputs = layer.forward(self.outputs)

        return self.outputs

    def backward(self):
        """Back-propagate from the loss through the layers in reverse order.

        Requires ``loss_method.forward`` to have been called, since the
        gradient chain starts from ``loss_method.backward()``.
        """
        upstream_grad = self.loss_method.backward()
        for layer in self.layers[::-1]:
            upstream_grad = layer.backward(upstream_grad)

    def update_weights(self):
        """Call ``step(self.lr)`` on every layer."""
        for layer in self.layers:
            layer.step(self.lr)

    # Original (misspelled) name, kept so existing callers keep working.
    update_weigths = update_weights

    def train(self, input_: np.ndarray, y: np.ndarray, epoches=100, batch_size=1):
        """Run ``epoches`` mini-batch updates and return the loss of each one.

        Each iteration draws ``batch_size`` row indices at random (with
        replacement, the ``np.random.choice`` default), so an "epoch" here is
        one gradient step, not a full pass over the data.

        Args:
            input_: training inputs; rows are indexed along axis 0.
            y: targets with the same number of rows as ``input_``.
            epoches: number of update steps to perform.
            batch_size: number of rows sampled per step.

        Returns:
            List with the training loss recorded at every step.

        Raises:
            ValueError: if ``input_`` and ``y`` have different numbers of rows.
        """
        # Only the row count is needed, so inputs of any rank are accepted.
        rows = input_.shape[0]
        if len(y) != rows:
            raise ValueError(
                f'input_ and y must have the same number of rows, got {rows} and {len(y)}'
            )

        # The permutation is composed with the sampled indices instead of
        # materialising shuffled copies of the whole dataset. The random calls
        # happen in the same order as before, so seeded runs pick the same rows.
        shuffled_order = np.random.permutation(rows)

        train_losses = []

        for _ in trange(epoches):
            batch_indices = shuffled_order[np.random.choice(rows, size=batch_size)]
            batch_data_x = input_[batch_indices]
            batch_data_y = y[batch_indices]

            predicts = self.forward(batch_data_x)

            train_loss = self.loss_method.forward(batch_data_y, predicts)
            train_losses.append(train_loss)

            self.backward()
            self.update_weights()

        return train_losses

    def predict(self, input_: np.ndarray):
        """Return the network output for ``input_`` (a plain forward pass)."""
        return self.forward(input_)