# **Feedforward Neural Network**

In [1]:
import numpy as np
import matplotlib.pyplot as plt

def sigmoid(X, j):
    """Logistic sigmoid activation.

    Args:
        X: Pre-activation values (scalar or NumPy array).
        j: ``0`` selects the function value ``1 / (1 + exp(-X))``; any other
            value selects its derivative ``s * (1 - s)``.

    Returns:
        The activation or its derivative, evaluated element-wise on ``X``.
    """
    value = 1 / (1 + np.exp(-X))
    # The derivative is expressed through the function value itself.
    return value if j == 0 else value * (1 - value)

def tanh(X, j):
    """Hyperbolic tangent activation.

    Args:
        X: Pre-activation values (scalar or NumPy array).
        j: ``0`` selects the function value ``tanh(X)``; any other value
            selects its derivative ``1 - tanh(X) ** 2``.

    Returns:
        The activation or its derivative, evaluated element-wise on ``X``.
    """
    # np.tanh saturates cleanly at +/-1, whereas forming
    # (e^x - e^-x) / (e^x + e^-x) by hand overflows to inf / inf = nan
    # once |X| is large.
    t = np.tanh(X)
    if j == 0:
        return t
    return 1 - t ** 2

def relu(X, j):
    """Rectified linear unit activation.

    Args:
        X: Pre-activation values (scalar or NumPy array).
        j: ``0`` selects the function value ``max(0, X)``; any other value
            selects the derivative (``1.0`` where ``X > 0``, else ``0.0``).

    Returns:
        The activation or its derivative, evaluated element-wise on ``X``.
    """
    if j != 0:
        # The derivative at exactly zero is taken to be 0.
        return np.where(X > 0, 1.0, 0.0)
    return np.maximum(0.0, X)

def leaky_relu(X, j, slope=0.1):
    """Leaky rectified linear unit activation.

    Args:
        X: Pre-activation values (NumPy array).
        j: ``0`` selects the function value; any other value selects the
            derivative.
        slope: Multiplier applied to non-positive inputs. Defaults to
            ``0.1``, the value this function previously hard-coded.

    Returns:
        For ``j == 0``: ``X`` where ``X > 0`` and ``slope * X`` elsewhere.
        Otherwise: ``1.0`` where ``X > 0`` and ``slope`` elsewhere.
    """
    positive = X > 0
    if j == 0:
        # Selecting by sign (instead of max(slope * X, X)) keeps the result
        # correct for any slope, including slopes greater than 1.
        return np.where(positive, X, slope * X)
    return np.where(positive, 1.0, slope)

def silu(X, j):
    """SiLU (sigmoid-weighted linear unit) activation.

    Args:
        X: Pre-activation values (scalar or NumPy array).
        j: ``0`` selects the function value ``X / (1 + exp(-X))``; any other
            value selects the derivative ``s * (1 + X * (1 - s))`` where
            ``s = 1 / (1 + exp(-X))``.

    Returns:
        The activation or its derivative, evaluated element-wise on ``X``.
    """
    denominator = 1 + np.exp(-X)
    if j == 0:
        return X / denominator
    gate = 1 / denominator
    return gate * (1 + X * (1 - gate))

def linear(X, j):
    """Identity activation.

    Args:
        X: Pre-activation values.
        j: ``0`` selects the function value; any other value selects the
            derivative.

    Returns:
        ``X`` itself (unchanged, not copied) when ``j == 0``; otherwise the
        scalar ``1``, which broadcasts against any gradient it multiplies.
    """
    return X if j == 0 else 1

def mean_squared_error(Y1, Y2, j):
    """Halved mean squared error between two arrays, or its gradient.

    Args:
        Y1: Array whose first axis length is used as the normaliser.
        Y2: Array subtracted from ``Y1``.
        j: ``0`` selects the loss value; any other value selects the
            gradient with respect to ``Y1``.

    Returns:
        For ``j == 0``: ``sum((Y1 - Y2) ** 2) / (2 * Y1.shape[0])``.
        Otherwise: ``(Y1 - Y2) / Y1.shape[0]``.
    """
    residual = Y1 - Y2
    rows = Y1.shape[0]
    if j == 0:
        return np.sum(residual ** 2) / (2 * rows)
    return residual / rows

class FeedForwardNeuralNetwork:
    """Fully connected feed-forward network trained on mini-batches.

    Every layer computes ``activation(layer_input @ W + b)``. Weights are
    drawn from a zero-mean normal distribution with standard deviation
    ``sqrt(2 / fan_in)`` and biases start at zero. ``compile_model`` must be
    called before ``train``; ``predict`` only needs the constructor.
    """

    def __init__(self, input_shape, layers, layers_activations):
        """Create the weights, biases and activation list for every layer.

        Args:
            input_shape: Number of input features (fan-in of the first layer).
            layers: Sequence with the number of units of each layer.
            layers_activations: Sequence of activation names, one per layer.
                Recognised names are ``"silu"``, ``"relu"``, ``"leaky_relu"``,
                ``"sigmoid"``, ``"tanh"`` and ``"linear"``. Any other entry
                prints a message and falls back to ``silu``.

        Raises:
            ValueError: If ``layers`` is empty.
        """
        # Built inside the constructor so the names are resolved when a
        # network is created, not when the class body is executed.
        known_activations = {
            "silu": silu,
            "relu": relu,
            "leaky_relu": leaky_relu,
            "sigmoid": sigmoid,
            "tanh": tanh,
            "linear": linear,
        }

        self.layer_count = len(layers)
        if self.layer_count == 0:
            raise ValueError("layers must contain at least one layer size")

        self.W = []
        self.b = []
        self.activations = []

        fan_in = input_shape
        for i in range(self.layer_count):
            fan_out = layers[i]
            self.W.append(np.random.normal(0, np.sqrt(2 / fan_in), size=(fan_in, fan_out)))
            self.b.append(np.zeros((1, fan_out)))

            name = layers_activations[i]
            activation = known_activations.get(name) if isinstance(name, str) else None
            if activation is None:
                # Best-effort fallback: keep going with silu but tell the user.
                activation = silu
                print("Error in activation function assignment in layer", i + 1)
            self.activations.append(activation)

            fan_in = fan_out

    def initialize_nadam(self):
        """Reset the optimizer's moment estimates to zero.

        ``m_t`` and ``v_t`` are flat lists laid out as
        ``[W0, b0, W1, b1, ...]``: index ``2 * i`` holds the entry for the
        weights of layer ``i`` and ``2 * i + 1`` the entry for its bias.
        """
        self.m_t = []
        self.v_t = []

        for weights, bias in zip(self.W, self.b):
            self.m_t.extend((np.zeros(weights.shape), np.zeros(bias.shape)))
            self.v_t.extend((np.zeros(weights.shape), np.zeros(bias.shape)))

    def update_nadam(self, dW, db):
        """Fold the latest gradients into the running moment estimates.

        Args:
            dW: Weight gradients ordered from the LAST layer to the first
                (``dW[0]`` belongs to the output layer).
            db: Bias gradients in the same reversed order.
        """
        for i in range(self.layer_count):
            # Gradients arrive output-layer first, so mirror the index.
            layer_grads = (dW[self.layer_count - 1 - i], db[self.layer_count - 1 - i])
            for j, grad in enumerate(layer_grads):
                slot = 2 * i + j
                self.m_t[slot] = self.beta1 * self.m_t[slot] + (1 - self.beta1) * grad
                self.v_t[slot] = self.beta2 * self.v_t[slot] + (1 - self.beta2) * (grad ** 2)

    def nadam(self, i, j):
        """Return the optimizer step direction for one parameter tensor.

        Args:
            i: Layer index.
            j: ``0`` for the layer's weights, ``1`` for its bias.

        Returns:
            ``m_hat / sqrt(v_hat + epsilon)`` where ``m_hat`` and ``v_hat``
            are the moment estimates divided by ``1 - beta ** t``. Note that
            ``epsilon`` is added inside the square root.
        """
        slot = 2 * i + j
        m_hat = self.m_t[slot] / (1 - self.beta1 ** self.t)
        v_hat = self.v_t[slot] / (1 - self.beta2 ** self.t)
        # NOTE(review): no Nesterov look-ahead term is applied here; the raw
        # gradient is mixed in separately by train() through ``alpha``.
        return m_hat / np.sqrt(v_hat + self.epsilon)

    def compile_model(self, optimizer, loss, learning_rate, alpha, beta1, beta2, epsilon):
        """Select the optimizer and loss and store the hyper-parameters.

        Args:
            optimizer: Optimizer name; only ``"nadam"`` is supported.
            loss: Loss name; only ``"mean_squared"`` is supported.
            learning_rate: Multiplier applied to every parameter update.
            alpha: Weight of the raw gradient added to the optimizer step.
            beta1: Decay rate of the first-moment estimate.
            beta2: Decay rate of the second-moment estimate.
            epsilon: Constant added to the second moment before the root.

        Raises:
            ValueError: If ``optimizer`` or ``loss`` is not supported. Without
                this check the model would be left half-configured and
                ``train`` would fail later with an ``AttributeError``.
        """
        if optimizer != "nadam":
            raise ValueError("Unsupported optimizer: %r (expected 'nadam')" % (optimizer,))
        if loss != "mean_squared":
            raise ValueError("Unsupported loss: %r (expected 'mean_squared')" % (loss,))

        self.learning_rate = learning_rate
        self.alpha = alpha

        self.optimizer = self.nadam
        self.update_optimizer = self.update_nadam
        self.initialize_optimizer = self.initialize_nadam
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon

        self.loss = mean_squared_error

    def _forward(self, X):
        """Run a forward pass and return all pre-activations and activations.

        Returns:
            A pair ``(a, h)`` of lists with one entry per layer: ``a[i]`` is
            the pre-activation of layer ``i`` and ``h[i]`` its output.
        """
        a = []
        h = []
        layer_input = X
        for weights, bias, activation in zip(self.W, self.b, self.activations):
            a.append(layer_input @ weights + bias)
            h.append(activation(a[-1], 0))
            layer_input = h[-1]
        return a, h

    def train(self, X_train, y_train, epochs, batch_size):
        """Fit the network with shuffled mini-batches.

        The step counter ``self.t`` and the per-batch loss history
        ``self.error`` are reset at the start of every call, as are the
        optimizer's moment estimates.

        Args:
            X_train: Training inputs, one row per sample.
            y_train: Training targets, indexed by the same rows as
                ``X_train``.
                NOTE(review): presumably 2-D so it lines up with the network
                output of shape (batch, units) — confirm with callers.
            epochs: Number of passes over the shuffled data.
            batch_size: Rows per mini-batch. When the final batch of an epoch
                is short it is topped up with samples from the start of the
                same permutation.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1 (which would
                otherwise never advance through the data).
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        self.t = 0
        self.error = []
        self.initialize_optimizer()

        sample_count = X_train.shape[0]
        last = self.layer_count - 1

        for _ in range(epochs):
            permu = np.random.permutation(sample_count)

            for pos in range(0, sample_count, batch_size):
                self.t += 1

                stop = pos + batch_size
                if stop <= sample_count:
                    batch_idx = permu[pos:stop]
                else:
                    # Wrap around to the beginning of this epoch's shuffle.
                    batch_idx = np.concatenate((permu[pos:], permu[:stop - sample_count]), axis=0)
                X_mini_batch = X_train[batch_idx]
                y_mini_batch = y_train[batch_idx]

                a, h = self._forward(X_mini_batch)
                self.error.append(self.loss(h[last], y_mini_batch, 0))

                # Take the output gradient from the compiled loss so it is
                # normalised by the rows actually in this batch; the wrapped
                # batch can hold fewer than batch_size rows when batch_size
                # exceeds twice the number of samples.
                dh = self.loss(h[last], y_mini_batch, 1)

                # Gradients are collected from the output layer backwards,
                # which is the order update_optimizer expects.
                dW = []
                db = []
                for i in range(last, -1, -1):
                    da = dh * self.activations[i](a[i], 1)
                    layer_input = h[i - 1] if i > 0 else X_mini_batch
                    dW.append(layer_input.T @ da)
                    db.append(np.sum(da, axis=0, keepdims=True))
                    if i > 0:
                        dh = da @ self.W[i].T

                self.update_optimizer(dW, db)

                for i in range(self.layer_count):
                    self.W[i] -= self.learning_rate * (self.optimizer(i, 0) + self.alpha * dW[last - i])
                    self.b[i] -= self.learning_rate * (self.optimizer(i, 1) + self.alpha * db[last - i])

    def predict(self, X_new_data):
        """Return the output of the final layer for ``X_new_data``."""
        _, h = self._forward(X_new_data)
        return h[-1]