In [None]:
#@title Mount Gdrive to Access training data from Gdrive
from google.colab import drive

# Mount Google Drive at /content/drive so the training data stored there can be read.
# The path has to be delimited with plain ASCII quotes; typographic quotes are a syntax error.
drive.mount('/content/drive')

# Import Libraries

In [None]:
import cProfile as cp
import ctypes
import os
import time as tm
import warnings
import gzip
from typing import *

import dill
import numpy as np
from matplotlib import collections as mc, pyplot as plt

# Placeholder prepended to the per-layer bias/weight lists so that they can be indexed by
# layer number (those lists are built for layers 1..n only). It is stored as an attribute on
# the numpy module itself, so every cell reaches it as `np.NONE`.
np.NONE = [np.array([None])]
# Silence every warning category for the whole process.
warnings.filterwarnings('ignore')

# ANSI escape sequences used to style the training log ('\33' is the octal escape for ESC).
# Text attributes; CEND resets all styling.
CEND = '\33[0m'
CBOLD = '\33[1m'
CITALIC = '\33[3m'
CURL = '\33[4m'
CBLINK = '\33[5m'
CBLINK2 = '\33[6m'
CSELECTED = '\33[7m'

# Foreground colours (codes 30-37).
CBLACK = '\33[30m'
CRED = '\33[31m'
CGREEN = '\33[32m'
CYELLOW = '\33[33m'
CBLUE = '\33[34m'
CVIOLET = '\33[35m'
CBEIGE = '\33[36m'
CWHITE = '\33[37m'

# Background colours (codes 40-47).
CBLACKBG = '\33[40m'
CREDBG = '\33[41m'
CGREENBG = '\33[42m'
CYELLOWBG = '\33[43m'
CBLUEBG = '\33[44m'
CVIOLETBG = '\33[45m'
CBEIGEBG = '\33[46m'
CWHITEBG = '\33[47m'

# Bright foreground colours (codes 90-97).
CGREY = '\33[90m'
CRED2 = '\33[91m'
CGREEN2 = '\33[92m'
CYELLOW2 = '\33[93m'
CBLUE2 = '\33[94m'
CVIOLET2 = '\33[95m'
CBEIGE2 = '\33[96m'
CWHITE2 = '\33[97m'

# Bright background colours (codes 100-107).
CGREYBG = '\33[100m'
CREDBG2 = '\33[101m'
CGREENBG2 = '\33[102m'
CYELLOWBG2 = '\33[103m'
CBLUEBG2 = '\33[104m'
CVIOLETBG2 = '\33[105m'
CBEIGEBG2 = '\33[106m'
CWHITEBG2 = '\33[107m'


# Topologies

In [None]:
class Initializer:
    """Holds the callable that creates the starting biases and weights of a network.

    The wrapped callable is invoked as ``initialize(shape, layers)`` and returns
    ``(biases, weights)``: two lists with one entry per layer, where entry 0 is the
    ``np.NONE`` placeholder and entry ``i`` (``i >= 1``) is a float32 array of shape
    ``(shape[i], 1)`` for biases and ``(shape[i], shape[i - 1])`` for weights.
    The static factories below return ready-made instances.
    """

    # for custom initializer
    def __init__(self, initializer, *args, **kwargs):
        self.args = args
        self.kwargs = kwargs
        # stored on the instance, so it takes precedence over the placeholder method below
        self.initialize = initializer

    def initialize(self, shape, layers):
        """Placeholder; every instance replaces it with the callable given to ``__init__``."""
        return None

    @staticmethod
    def _assemble(shape, layers, draw):
        """Build ``(biases, weights)`` by calling ``draw(i, dims)`` per layer.

        All bias arrays are drawn first and all weight arrays afterwards, so the order in
        which random numbers are consumed is fixed.
        """
        bias_list = []
        for i in range(1, layers):
            bias_list.append(draw(i, (shape[i], 1)))
        weight_list = []
        for i in range(1, layers):
            weight_list.append(draw(i, (shape[i], shape[i - 1])))

        return np.NONE + bias_list, np.NONE + weight_list

    @staticmethod
    def uniform(start=-1, stop=1):
        """Initializer drawing every value uniformly from ``[start, stop)``."""

        def initializer(shape, layers):
            def draw(i, dims):
                return np.random.uniform(start, stop, dims).astype(dtype=np.float32)

            return Initializer._assemble(shape, layers, draw)

        return Initializer(initializer)

    @staticmethod
    def normal(scale=1):
        """Initializer drawing standard-normal values multiplied by ``scale``."""

        def initializer(shape, layers):
            def draw(i, dims):
                return (np.random.default_rng().standard_normal(dims, dtype=np.float32)) * scale

            return Initializer._assemble(shape, layers, draw)

        return Initializer(initializer)

    @staticmethod
    def xavier(he=1):
        """Initializer drawing standard-normal values scaled by ``sqrt(he / fan_in)``.

        ``fan_in`` is the size of the previous layer, ``shape[i - 1]``.
        """

        def initializer(shape, layers):
            def draw(i, dims):
                return np.random.default_rng().standard_normal(dims, dtype=np.float32) * (he / shape[i - 1]) ** 0.5

            return Initializer._assemble(shape, layers, draw)

        return Initializer(initializer)

    @staticmethod
    def normalized_xavier(he=6):
        """Initializer drawing standard-normal values scaled by ``sqrt(he / (fan_in + fan_out))``.

        ``fan_in`` is ``shape[i - 1]`` and ``fan_out`` is ``shape[i]``.
        """

        def initializer(shape, layers):
            def draw(i, dims):
                return np.random.default_rng().standard_normal(dims, dtype=np.float32) * \
                       (he / (shape[i - 1] + shape[i])) ** 0.5

            return Initializer._assemble(shape, layers, draw)

        return Initializer(initializer)


class ActivationFunction:
    """Pairs an activation with its derivative.

    ``activations`` is the tuple ``(activation, activated_derivative)``.
    ``activation(x)`` maps pre-activations to outputs. ``activated_derivative(a)`` receives
    the *output* of the activation (not ``x``) and returns the derivative at that point.
    The static factories below return ready-made instances.
    """

    # for custom activation_function
    def __init__(self, activation, activated_derivative, *args, **kwargs):
        self.activations = activation, activated_derivative
        self.args = args
        self.kwargs = kwargs

    # class-level default; every instance overrides it in __init__
    activations = None

    @staticmethod
    def sigmoid(smooth=1, offset=0):
        """Logistic function ``1 / (1 + e^(-smooth * (x + offset)))``."""
        ONE = np.float32(1)
        E = np.float32(np.e)
        SMOOTH = np.float32(smooth)
        OFFSET = np.float32(offset)

        def activation(x):
            return ONE / (ONE + E ** (-SMOOTH * (x + OFFSET)))

        def activated_derivative(activated_x):
            return SMOOTH * (activated_x * (ONE - activated_x))

        return ActivationFunction(activation, activated_derivative)

    @staticmethod
    def tanh(alpha=1):
        """Returns ``arctan(alpha * x)`` with the matching derivative ``alpha * cos(a)^2``.

        NOTE(review): despite its name this factory computes the inverse tangent, not the
        hyperbolic tangent. Activation and derivative are consistent with each other, so the
        behaviour is kept as is; confirm whether ``np.tanh`` was intended.
        """
        ALPHA = np.float32(alpha)

        def activation(x):
            return np.arctan(ALPHA * x)

        def activated_derivative(activated_x):
            return ALPHA * np.square(np.cos(activated_x))

        return ActivationFunction(activation, activated_derivative)

    @staticmethod
    def relu():
        """Rectified linear unit: ``x`` for positive ``x``, 0 otherwise."""
        ONE = np.float32(1)

        def activation(x):
            return x * (x > 0)

        def activated_derivative(activated_x):
            # the output is non-zero exactly where the input was positive
            return ONE * (activated_x != 0)

        return ActivationFunction(activation, activated_derivative)

    @staticmethod
    def prelu(leak=0.1):
        """Leaky ReLU: ``x`` for positive ``x``, ``leak * x`` otherwise.

        The derivative is recovered from the sign of the output, which matches the sign of
        the input as long as ``leak`` is not negative.
        """
        ONE = np.float32(1)
        LEAK = np.float32(leak)

        def activation(x):
            return np.where(x > 0, x, LEAK * x)

        def activated_derivative(activated_x):
            # slope 1 on the positive side, LEAK everywhere else (negative outputs included;
            # testing for == 0 would hand slope 1 to every negative input)
            return np.where(activated_x > 0, ONE, LEAK)

        return ActivationFunction(activation, activated_derivative)

    @staticmethod
    def elu(alpha=1):
        """Exponential linear unit: ``x`` for positive ``x``, ``alpha * (e^x - 1)`` otherwise."""
        ONE = np.float32(1)
        E = np.e
        ALPHA = np.float32(alpha)

        def activation(x):
            return np.where(x > 0, x, ALPHA * (E ** x - 1))

        def activated_derivative(activated_x):
            # for x <= 0 the output is a = alpha * (e^x - 1), hence d/dx = alpha * e^x = a + alpha;
            # positive outputs come from the identity branch and have slope 1
            return np.where(activated_x > 0, ONE, activated_x + ALPHA)

        return ActivationFunction(activation, activated_derivative)

    @staticmethod
    def softmax():
        """Softmax over axis 1; the derivative is returned as a per-sample Jacobian.

        Written for 3-D input (the einsum subscripts use three axes). For output ``a`` of
        shape ``(l, i, 1)`` the Jacobian has shape ``(l, i, i)`` with ``a_i * (1 - a_i)`` on
        the diagonal and ``-a_i * a_k`` elsewhere.
        """
        E = np.float32(np.e)

        def activation(x):
            # subtracting the per-sample maximum keeps the exponentials from overflowing
            numerator = E ** (x - x.max(axis=1)[:, None])

            return numerator / np.einsum('lij->lj', numerator)[:, None]

        def activated_derivative(activated_x):
            jacobian = -np.einsum('lij,lkj->lik', activated_x, activated_x)
            diag_i = np.diag_indices(jacobian.shape[1])
            jacobian[:, diag_i[1], diag_i[0]] = np.einsum('lij,lij->li', activated_x, 1 - activated_x)

            return jacobian

        return ActivationFunction(activation, activated_derivative)

    @staticmethod
    def softplus():
        """Softplus ``log(1 + e^x)``; its derivative in terms of the output ``a`` is ``1 - e^(-a)``."""
        E = np.float32(np.e)
        ONE = np.float32(1)

        def activation(x):
            return np.log(ONE + E ** x)

        def activated_derivative(activated_x):
            return ONE - E ** -activated_x

        return ActivationFunction(activation, activated_derivative)


class LossFunction:
    """Holds the callable that scores a batch of network outputs against its targets.

    The wrapped callable is invoked as ``loss_function(output, target)`` and returns
    ``(loss, loss_derivative)``: a scalar summed over the whole batch and an array with the
    shape of ``output``. The einsum subscripts used below expect 3-D arrays.
    """

    # for custom loss_function
    def __init__(self, loss_function, *args, **kwargs):
        # stored on the instance, so it takes precedence over the placeholder method below
        self.loss_function = loss_function
        self.args = args
        self.kwargs = kwargs

    def loss_function(self, output, target):
        """Placeholder; every instance replaces it with the callable given to ``__init__``."""
        pass

    @staticmethod
    def mean_square():
        """Sum of squared errors; the returned derivative is ``output - target``.

        The constant factor 2 of the exact derivative is not applied.
        """

        def loss_function(output, target):
            loss = output - target

            return np.einsum('lij,lij->', loss, loss), loss

        return LossFunction(loss_function)

    # NOTE(review): the original author marked this loss as "doesn't work". The numerical
    # failure (NaN from log(0) and 0 / 0) is guarded below; end-to-end training with it has
    # not been validated here.
    @staticmethod
    def cross_entropy():
        """Cross entropy ``-sum(target * log(output))`` with derivative ``-target / output``.

        Outputs are clipped to a small positive floor first, so an output of exactly 0 can
        no longer turn the loss or its derivative into NaN or infinity.
        """
        FLOOR = 1e-12

        def loss_function(output, target):
            safe_output = np.clip(output, FLOOR, None)

            return -np.einsum('lij,lij->', np.log(safe_output), target), -target / safe_output

        return LossFunction(loss_function)


class WBOptimizer:
    """Holds the callable that turns raw gradients into weight/bias update steps.

    The wrapped callable is invoked as ``optimize(layer)`` once per layer during
    back-propagation. It reads the gradients stored in ``this.delta_biases[layer]`` and
    ``this.delta_weights[layer]`` and overwrites them with the step that the network then
    subtracts from its parameters. Every factory takes the network (``this``) as first
    argument and keeps its running state as attributes on that network, so two stateful
    optimizers created for the same network share (and overwrite) those attributes.
    """

    # for custom optimizer
    def __init__(self, optimizer, *args, **kwargs):
        # stored on the instance, so it takes precedence over the placeholder method below
        self.optimize = optimizer
        self.args = args
        self.kwargs = kwargs

    def optimize(self, layer):
        """Placeholder; every instance replaces it with the callable given to ``__init__``."""
        pass

    @staticmethod
    def gradient_decent(this: 'ArtificialNeuralNetwork', learning_rate=0.01):
        """Plain gradient descent: the step is ``learning_rate * gradient``."""
        LEARNING_RATE = np.float32(learning_rate)

        def optimizer(layer):
            this.delta_biases[layer] *= LEARNING_RATE
            this.delta_weights[layer] *= LEARNING_RATE

        return WBOptimizer(optimizer)

    @staticmethod
    def momentum(this: 'ArtificialNeuralNetwork', learning_rate=0.001, alpha=None):
        """Momentum: ``step = alpha * previous_step + learning_rate * gradient``.

        ``alpha`` defaults to ``learning_rate`` when omitted.
        """
        if alpha is None: alpha = learning_rate
        LEARNING_RATE = np.float32(learning_rate)
        ALPHA = np.float32(alpha)
        this.pdb, this.pdw = this.delta_initializer(1)  # pdb -> prev_delta_biases, pdw -> prev_delta_weights

        def optimizer(layer):
            this.delta_biases[layer] = this.pdb[layer] = ALPHA * this.pdb[layer] + \
                                                         LEARNING_RATE * this.delta_biases[layer]
            this.delta_weights[layer] = this.pdw[layer] = ALPHA * this.pdw[layer] + \
                                                          LEARNING_RATE * this.delta_weights[layer]

        return WBOptimizer(optimizer)

    @staticmethod
    def decay(this: 'ArtificialNeuralNetwork', learning_rate=0.01, alpha=None):
        """Gradient descent with a rate of ``learning_rate / (1 + decay_count / alpha)``.

        ``alpha`` defaults to ``1 / learning_rate``. ``this.decay_count`` grows by
        ``1 / this.batches`` on every call of the optimizer, i.e. once per layer per batch.
        """
        if alpha is None: alpha = 1 / learning_rate
        LEARNING_RATE = np.float32(learning_rate)
        ALPHA = np.float32(alpha)
        this.decay_count = 0

        def optimizer(layer):
            k = LEARNING_RATE / (1 + this.decay_count / ALPHA)
            this.delta_biases[layer] *= k
            this.delta_weights[layer] *= k

            this.decay_count += 1 / this.batches

        return WBOptimizer(optimizer)

    @staticmethod
    def nesterov(this: 'ArtificialNeuralNetwork', learning_rate=0.001, alpha=None):
        """Momentum update that additionally writes look-ahead weights to ``this.theta``.

        ``alpha`` defaults to ``learning_rate`` when omitted.
        NOTE(review): whether the look-ahead value survives until it is read depends on when
        the network resets ``theta``; verify against the network's update step.
        """
        if alpha is None: alpha = learning_rate
        LEARNING_RATE = np.float32(learning_rate)
        ALPHA = np.float32(alpha)
        this.pdb, this.pdw = this.delta_initializer(1)  # pdb -> prev_delta_biases, pdw -> prev_delta_weights

        def optimizer(layer):
            this.theta[layer] = this.weights[layer] - ALPHA * this.pdw[layer]
            this.delta_biases[layer] = this.pdb[layer] = ALPHA * this.pdb[layer] + \
                                                         LEARNING_RATE * this.delta_biases[layer]
            this.delta_weights[layer] = this.pdw[layer] = ALPHA * this.pdw[layer] + \
                                                          LEARNING_RATE * this.delta_weights[layer]

        return WBOptimizer(optimizer)

    @staticmethod
    def adagrad(this: 'ArtificialNeuralNetwork', learning_rate=0.01, epsilon=np.e ** -8):
        """AdaGrad: ``step = learning_rate * gradient / sqrt(sum_of_squared_gradients + epsilon)``.

        Note that the default ``epsilon`` is ``e ** -8`` (about 3.4e-4), not ``1e-8``.
        """
        LEARNING_RATE = np.float32(learning_rate)
        EPSILON = np.float32(epsilon)
        this.initialize = True
        this.gsq_b, this.gsq_w = this.delta_initializer(1)  # gsq_b -> grad_square_biases, gsq_w -> grad_square_weights

        def optimizer(layer):
            # re-allocate the accumulators on first use, when the real batch size is known
            if this.initialize:
                this.gsq_b, this.gsq_w = this.delta_initializer()
                this.initialize = False

            this.gsq_b[layer] += np.einsum('lij,lij->lij', this.delta_biases[layer], this.delta_biases[layer])
            this.gsq_w[layer] += np.einsum('ij,ij->ij', this.delta_weights[layer], this.delta_weights[layer])

            this.delta_biases[layer] *= LEARNING_RATE / np.sqrt(this.gsq_b[layer] + EPSILON)
            this.delta_weights[layer] *= LEARNING_RATE / np.sqrt(this.gsq_w[layer] + EPSILON)

        return WBOptimizer(optimizer)

    @staticmethod
    def rmsprop(this: 'ArtificialNeuralNetwork', learning_rate=0.001, beta=0.95, epsilon=np.e ** -8):
        """RMSProp: like AdaGrad, but with an exponential moving average of squared gradients.

        ``average = beta * average + (1 - beta) * gradient ** 2``.
        """
        LEARNING_RATE = np.float32(learning_rate)
        EPSILON = np.float32(epsilon)
        BETA = np.float32(beta)
        BETA_BAR = np.float32(1 - beta)
        this.initialize = True
        this.gsq_b, this.gsq_w = this.delta_initializer(1)  # gsq_b -> grad_square_biases, gsq_w -> grad_square_weights

        def optimizer(layer):
            # re-allocate the accumulators on first use, when the real batch size is known
            if this.initialize:
                this.gsq_b, this.gsq_w = this.delta_initializer()
                this.initialize = False

            this.gsq_b[layer] = BETA * this.gsq_b[layer] + \
                                BETA_BAR * np.einsum('lij,lij->lij', this.delta_biases[layer], this.delta_biases[layer])
            this.gsq_w[layer] = BETA * this.gsq_w[layer] + \
                                BETA_BAR * np.einsum('ij,ij->ij', this.delta_weights[layer], this.delta_weights[layer])

            this.delta_biases[layer] *= LEARNING_RATE / np.sqrt(this.gsq_b[layer] + EPSILON)
            this.delta_weights[layer] *= LEARNING_RATE / np.sqrt(this.gsq_w[layer] + EPSILON)

        return WBOptimizer(optimizer)

    @staticmethod
    def adadelta(this: 'ArtificialNeuralNetwork', learning_rate=0.1, alpha=0.95, epsilon=np.e ** -8):
        """AdaDelta: scales the gradient by ``sqrt((avg_step_sq + eps) / (avg_grad_sq + eps))``.

        Both averages are exponential moving averages with factor ``alpha``; the result is
        additionally multiplied by ``learning_rate``.
        """
        LEARNING_RATE = np.float32(learning_rate)
        ALPHA = np.float32(alpha)
        ALPHA_BAR = np.float32(1 - alpha)
        EPSILON = np.float32(epsilon)
        this.initialize = True
        this.gsq_b, this.gsq_w = this.delta_initializer(1)  # gsq_b -> grad_square_biases, gsq_w -> grad_square_weights
        # dsq_b -> delta_square_biases, dsq_w -> delta_square_weights
        this.dsq_b, this.dsq_w = this.delta_initializer(1)

        def optimizer(layer):
            # re-allocate the accumulators on first use, when the real batch size is known
            if this.initialize:
                this.gsq_b, this.gsq_w = this.delta_initializer()
                this.dsq_b, this.dsq_w = this.delta_initializer()
                this.initialize = False

            this.gsq_b[layer] = ALPHA * this.gsq_b[layer] + \
                                ALPHA_BAR * np.einsum('lij,lij->lij', this.delta_biases[layer],
                                                      this.delta_biases[layer])
            this.gsq_w[layer] = ALPHA * this.gsq_w[layer] + \
                                ALPHA_BAR * np.einsum('ij,ij->ij', this.delta_weights[layer], this.delta_weights[layer])

            this.delta_biases[layer] *= LEARNING_RATE * \
                                        np.sqrt((this.dsq_b[layer] + EPSILON) / (this.gsq_b[layer] + EPSILON))
            this.delta_weights[layer] *= LEARNING_RATE * \
                                         np.sqrt((this.dsq_w[layer] + EPSILON) / (this.gsq_w[layer] + EPSILON))

            # the step average is updated after the step itself has been computed
            this.dsq_b[layer] = ALPHA * this.dsq_b[layer] + \
                                ALPHA_BAR * np.einsum('lij,lij->lij', this.delta_biases[layer],
                                                      this.delta_biases[layer])
            this.dsq_w[layer] = ALPHA * this.dsq_w[layer] + \
                                ALPHA_BAR * np.einsum('ij,ij->ij', this.delta_weights[layer], this.delta_weights[layer])

        return WBOptimizer(optimizer)

    @staticmethod
    def adam(this: 'ArtificialNeuralNetwork', learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=np.e ** -8):
        """Adam: moving averages of the gradient and of its square, both bias-corrected.

        The bias-correction exponent is ``this.epoch + 1`` (the epoch number, not the number
        of update steps), and ``epsilon`` is added inside the square root.
        """
        LEARNING_RATE = np.float32(learning_rate)
        BETA1 = np.float32(beta1)
        BETA1_BAR = np.float32(1 - beta1)
        BETA2 = np.float32(beta2)
        BETA2_BAR = np.float32(1 - beta2)
        EPSILON = np.float32(epsilon)
        this.initialize = True
        this.gsq_b, this.gsq_w = this.delta_initializer(1)  # gsq_b -> grad_square_biases, gsq_w -> grad_square_weights
        this.gb, this.gw = this.delta_initializer(1)  # gb -> grad_biases, gw -> grad_weights

        def optimizer(layer):
            # re-allocate the accumulators on first use, when the real batch size is known
            if this.initialize:
                this.gsq_b, this.gsq_w = this.delta_initializer()
                this.gb, this.gw = this.delta_initializer()
                this.initialize = False

            this.gb[layer] = BETA1 * this.gb[layer] + BETA1_BAR * this.delta_biases[layer]
            this.gw[layer] = BETA1 * this.gw[layer] + BETA1_BAR * this.delta_weights[layer]
            this.gsq_b[layer] = BETA2 * this.gsq_b[layer] + \
                                BETA2_BAR * np.einsum('lij,lij->lij', this.delta_biases[layer],
                                                      this.delta_biases[layer])
            this.gsq_w[layer] = BETA2 * this.gsq_w[layer] + \
                                BETA2_BAR * np.einsum('ij,ij->ij', this.delta_weights[layer], this.delta_weights[layer])

            div_1 = (1 - BETA1 ** (this.epoch + 1))
            div_2 = (1 - BETA2 ** (this.epoch + 1))
            gb_sq = this.gsq_b[layer] / div_2
            gw_sq = this.gsq_w[layer] / div_2

            this.delta_biases[layer] = LEARNING_RATE * this.gb[layer] / div_1 / np.sqrt(gb_sq + EPSILON)
            this.delta_weights[layer] = LEARNING_RATE * this.gw[layer] / div_1 / np.sqrt(gw_sq + EPSILON)

        return WBOptimizer(optimizer)

    # NOTE(review): the original author marked this optimizer as "doesn't work". One concrete
    # failure is fixed below: the running maximum is 0 wherever the gradient has been 0 so far,
    # which made the step 0 / 0 = NaN. Training quality has not been re-validated here.
    @staticmethod
    def adamax(this: 'ArtificialNeuralNetwork', learning_rate=0.0001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        """AdaMax: Adam variant that normalises by a decayed running maximum of ``|gradient|``.

        ``step = learning_rate * bias_corrected_average / (running_max + epsilon)``.
        ``epsilon`` keeps the division finite where the running maximum is still 0. The
        bias-correction exponent is ``this.epoch + 1``.
        """
        LEARNING_RATE = np.float32(learning_rate)
        BETA1 = np.float32(beta1)
        BETA1_BAR = np.float32(1 - beta1)
        BETA2 = np.float32(beta2)
        EPSILON = np.float32(epsilon)
        this.decay_count = 0  # not read by this optimizer; the reset is kept as it was
        this.initialize = True
        this.nb, this.nw = this.delta_initializer(1)  # nb -> running max for biases, nw -> running max for weights
        this.gb, this.gw = this.delta_initializer(1)  # gb -> grad_biases, gw -> grad_weights

        def optimizer(layer):
            # re-allocate the accumulators on first use, when the real batch size is known
            if this.initialize:
                this.nb, this.nw = this.delta_initializer()
                this.gb, this.gw = this.delta_initializer()
                this.initialize = False
            this.gb[layer] = BETA1 * this.gb[layer] + BETA1_BAR * this.delta_biases[layer]
            this.gw[layer] = BETA1 * this.gw[layer] + BETA1_BAR * this.delta_weights[layer]

            this.nb[layer] = np.maximum(BETA2 * this.nb[layer], np.absolute(this.delta_biases[layer]))
            this.nw[layer] = np.maximum(BETA2 * this.nw[layer], np.absolute(this.delta_weights[layer]))

            div = (1 - BETA1 ** (this.epoch + 1))
            vb = this.gb[layer] / div
            vw = this.gw[layer] / div

            this.delta_biases[layer] = LEARNING_RATE * vb / (this.nb[layer] + EPSILON)
            this.delta_weights[layer] = LEARNING_RATE * vw / (this.nw[layer] + EPSILON)

        return WBOptimizer(optimizer)


# database class for training / testing NN
class DataBase:
    """In-memory set of (input, output) samples that serves fixed-size shuffled batches.

    Both sets are converted to float32 arrays and shuffled together on construction.
    Only one batch generator may be active at a time; ``block`` is the guard flag.

    Raises:
        ValueError: if ``input_set`` and ``output_set`` differ in length.
    """

    def __init__(self, input_set, output_set):
        # class params declaration
        self.input_set = np.array(input_set, dtype=np.float32)
        self.output_set = np.array(output_set, dtype=np.float32)

        # prevent conflicting sizes of input_set and output_set
        size = len(self.input_set)
        if size != len(self.output_set):
            raise ValueError("Both input_set and output_set should be of same size")

        # class vars initialization
        self.size = size
        self.pointer = 0  # index of the first sample of the next batch
        self.block = False  # True while a batch generator is active
        self.batch_size = None  # batch size of the active generator

        # shuffle database
        self.randomize()

    # scale data values within -1 to +1
    def normalize(self):
        """Divide each set in place by its largest absolute value.

        A set whose largest absolute value is 0 is left untouched instead of being divided
        by zero.

        Returns:
            ``(input_scale, output_scale)``: the largest absolute values that were found.
        """
        input_scale = np.max(np.absolute(self.input_set))
        output_scale = np.max(np.absolute(self.output_set))
        if input_scale != 0: self.input_set /= input_scale
        if output_scale != 0: self.output_set /= output_scale

        return input_scale, output_scale

    # randomly shuffle order of data_sets
    def randomize(self):
        """Apply one common random permutation to the input and output sets."""
        indices = list(range(self.size))
        np.random.shuffle(indices)
        self.input_set = self.input_set[indices]
        self.output_set = self.output_set[indices]

    # create generator object which yields a set of sequential data with fixed predetermined size everytime its called
    def batch_generator(self, batch_size):
        """Return a generator yielding ``[inputs, outputs]`` batches of exactly ``batch_size`` samples.

        The last batch of a pass is padded with samples from the start of the set. When
        that last batch is produced the database is reshuffled and unlocked. Sending the
        string ``'end'`` into the generator stops it early with the same clean-up.

        Raises:
            PermissionError: if a previous generator is still active.
            ValueError: if ``batch_size`` is smaller than 1 or the database is empty.
        """
        if self.block:
            raise PermissionError(
                "Access Denied: DataBase currently in use, 'end' previous generator before creating a new one")
        # validate before taking the lock, so a rejected call does not leave the database blocked
        if batch_size < 1:
            raise ValueError("batch_size should be at least 1")
        if self.size == 0:
            raise ValueError("Cannot create batches from an empty DataBase")
        self.block = True
        self.batch_size = batch_size

        def generator():
            while 1:
                i = self.pointer + batch_size
                if i >= self.size:
                    # final batch of this pass: build it, then reset state before handing it out
                    i = self.size
                    r_val = self.__batch(i)
                    self.__return()

                    yield r_val
                    return
                signal = yield self.__batch(i)
                if signal == 'end': return self.__return()
                self.pointer += batch_size

        return generator()

    # returns fixed size of dataset from pointer sequentially
    def __batch(self, i):
        """Return ``[inputs, outputs]`` for samples ``pointer .. i``, padded to ``batch_size``."""
        r_val = [self.input_set[self.pointer:i], self.output_set[self.pointer:i]]
        filled = i - self.pointer
        if filled != self.batch_size:
            # short final batch: top up with samples from the start of the set, cycling through
            # the set again when the shortfall is larger than the set itself
            wrap = np.arange(self.batch_size - filled) % self.size
            r_val[0] = np.concatenate((r_val[0], self.input_set[wrap]))
            r_val[1] = np.concatenate((r_val[1], self.output_set[wrap]))

        return r_val

    # reinitialize class vars after end of generator
    def __return(self):
        """Rewind the pointer, reshuffle the data and release the generator lock."""
        self.pointer = 0
        self.randomize()
        self.block = False
        self.batch_size = None

# Neural Networks

In [None]:
# ANN class
class ArtificialNeuralNetwork:
    """Fully connected feed-forward network trained with mini-batch back-propagation.

    Per-layer state (biases, weights, outputs, deltas) is kept in lists indexed by layer
    number. Index 0 is the input layer and holds only a placeholder for biases and weights.
    Layer outputs are arrays of shape ``(batch, neurons, 1)``.

    Typical use: construct the network, call ``trainer()`` to bind a ``DataBase`` and the
    hyper-parameters, call ``train()``, then ``process()`` for inference.

    Args:
        shape: number of neurons per layer, input layer first.
        initializer: weight/bias initializer; defaults to ``Initializer.xavier(2)``.
        activation_function: hidden-layer activation; defaults to ``ActivationFunction.elu()``.
        output_activation_function: output-layer activation; defaults to
            ``ActivationFunction.softmax()``.
    """

    def __init__(self, shape: Tuple[int, ...],
                 initializer: Initializer = None,
                 activation_function: ActivationFunction = None,
                 output_activation_function: ActivationFunction = None):
        # default params
        if initializer is None: initializer = Initializer.xavier(2)
        if activation_function is None: activation_function = ActivationFunction.elu()
        if output_activation_function is None: output_activation_function = ActivationFunction.softmax()

        # class params declaration
        self.shape = tuple(shape)
        self.initializer = initializer
        self.activation, self.activated_derivative = activation_function.activations
        self.output_activation, self.output_activated_derivative = output_activation_function.activations

        # declaration of weights and biases and its relatives
        self.layers = len(self.shape)
        self.biases, self.weights = self.initializer.initialize(self.shape, self.layers)
        self.biases_ones = np.NONE + [np.ones_like(bias, dtype=np.float32) for bias in self.biases[1:]]

        # derivation wrt
        # shallow list copy: theta[i] and weights[i] are the same array until one entry is rebound
        self.theta = self.weights.copy()

        # class vars initialization
        self.delta_weights, self.delta_biases = None, None
        self.train_database = None
        self.epochs: int = 1  # total epochs for current training
        self.epoch: int = 0  # current epoch
        self.batch_size: int = 32  # fancy format allowed from params, ex: -1
        self.bs: int = 0  # actual batch_size
        self.batches: int = 0  # total batches for current epoch
        self.batch: int = 0  # current batch
        self.loss_function: LossFunction = LossFunction.mean_square()
        self.optimizer: WBOptimizer = WBOptimizer.adam(self)
        self.outputs = None
        self.target = None
        self.loss = None
        self.loss_derivative = None
        self.costs: List[List[float]] = []  # accumulation of all costs

    # recursive pass
    def __forward_pass(self, layer: int = 1):
        """Fire ``layer`` and every layer after it; the last layer uses the output activation."""
        if layer == self.layers - 1:
            self.__fire(layer, self.output_activation)
        else:
            self.__fire(layer, self.activation)
            self.__forward_pass(layer + 1)

    # returns output, for online processing
    def process(self, inputs, layer: int = 1):
        """Run a forward pass and return the output of the last layer.

        Args:
            inputs: values fed in as the output of layer ``layer - 1``; converted to float32.
                The einsum in ``__fire`` expects three axes — presumably
                ``(batch, neurons, 1)``; confirm against callers.
            layer: first layer to compute (1 = start from the input layer).
        """
        # trainer() normally allocates self.outputs; create the per-layer slots on demand so
        # that a freshly constructed network can be evaluated without configuring training
        if self.outputs is None: self.outputs = [None] * self.layers
        self.outputs[layer - 1] = np.array(inputs, dtype=np.float32)
        self.__forward_pass(layer)

        return self.outputs[-1]

    # neuron fire(activation) at a layer
    def __fire(self, layer: int, activation):
        """Store ``activation(weights @ previous_output + biases)`` as the output of ``layer``."""
        self.outputs[layer] = \
            activation(np.einsum('lkj,ik->lij', self.outputs[layer - 1], self.weights[layer]) + self.biases[layer])

    # neuron wire(updates to biases and weights) at a layer
    def __wire(self, layer: int):
        """Subtract the optimizer's steps from the biases and weights of ``layer``."""
        # optimization to sum on column(next line only), 5% time reduction
        # NOTE(review): the trailing [0] selects index 0 along the first axis of the product.
        # delta_biases is allocated with the batch as first axis, so only the first sample of
        # the batch reaches the biases — confirm that this is intended.
        self.biases[layer] -= (self.delta_biases[layer] * self.biases_ones[layer])[0]
        self.weights[layer] -= self.delta_weights[layer]
        self.theta = self.weights.copy()

    # recursive propagation
    def __back_propagation(self, activated_derivative, layer: int = -1):
        """Compute gradients for ``layer`` (negative index), update it, then recurse downwards.

        Stops once ``layer`` reaches ``-self.layers``, i.e. the input layer.
        """
        if layer <= -self.layers: return
        # bias gradient: loss derivative times the activation derivative summed over its last axis
        np.einsum('lij,lim->lij', self.loss_derivative[layer], activated_derivative(self.outputs[layer]),
                  out=self.delta_biases[layer])
        # weight gradient: previous layer's output times the bias gradient, summed over the batch
        np.einsum('lkj,lij->ik', self.outputs[layer - 1], self.delta_biases[layer], out=self.delta_weights[layer])
        # NOTE(review): the value handed to the previous layer is built from loss_derivative[layer],
        # not from delta_biases[layer], so it does not include this layer's activation derivative —
        # confirm that this is intended.
        np.einsum('lij,ik->lkj', self.loss_derivative[layer], self.theta[layer], out=self.loss_derivative[layer - 1])
        self.optimizer.optimize(layer)
        self.__wire(layer)
        self.__back_propagation(self.activated_derivative, layer - 1)

    # declaring training params
    def trainer(self, train_database: DataBase = None,
                loss_function: LossFunction = None,
                optimizer: WBOptimizer = None,
                epochs: int = None,
                batch_size: int = None):
        """Set the training parameters and pre-allocate the training buffers.

        Every argument left as ``None`` keeps the value from the previous call (or the
        constructor default).

        Args:
            train_database: samples to train on.
            loss_function: loss to minimise.
            optimizer: update rule.
            epochs: number of passes over the database per ``train()`` call.
            batch_size: samples per batch. A negative value is resolved against the
                database size as ``size - batch_size - 1``, so ``-1`` means the whole set.
                NOTE(review): with that formula ``-2`` gives ``size + 1``; confirm the
                intended meaning of values below ``-1``.

        Raises:
            ValueError: if a negative batch size is in effect and no database is set.
        """
        # if new param sent, update existing class var, else use old param
        if train_database is not None: self.train_database = train_database
        if loss_function is not None: self.loss_function = loss_function
        if optimizer is not None: self.optimizer = optimizer
        if epochs is not None: self.epochs = epochs
        if batch_size is not None: self.batch_size = batch_size

        if self.batch_size < 0:
            if self.train_database is None:
                raise ValueError("A negative batch_size needs a train_database to be resolved")
            # use the stored value: the batch_size argument is None when this call did not pass it
            self.bs = self.train_database.size - self.batch_size - 1
        else:
            self.bs = self.batch_size

        # pre memory allocation for faster training
        self.outputs = [np.zeros((self.bs, self.shape[layer], 1), dtype=np.float32) for layer in range(self.layers)]
        # separate buffers: back-propagation writes into these in place
        self.loss_derivative = [np.zeros_like(output) for output in self.outputs]
        self.target = self.outputs[-1].copy()
        self.delta_biases, self.delta_weights = self.delta_initializer()

    # pre memory allocation and initializer of delta values for wire and optimizer
    def delta_initializer(self, bs=None):
        """Return zero-filled ``(delta_biases, delta_weights)`` lists indexed by layer.

        Args:
            bs: batch dimension of the bias deltas; defaults to the actual batch size.

        Returns:
            ``delta_biases[i]`` has shape ``(bs, shape[i], 1)`` and ``delta_weights[i]`` has
            shape ``(shape[i], shape[i - 1])`` for ``i >= 1``; index 0 is the placeholder.
        """
        if bs is None: bs = self.bs
        delta_biases = np.NONE + [(np.zeros((bs, self.shape[i], 1), dtype=np.float32)) for i in range(1, self.layers)]
        delta_weights = np.NONE + [np.zeros((self.shape[i], self.shape[i - 1]), dtype=np.float32)
                                   for i in range(1, self.layers)]

        return delta_biases, delta_weights

    # start training after declaring trainer
    def train(self, profile=False):
        """Train for ``self.epochs`` epochs and append the per-epoch costs to ``self.costs``.

        Prints one status line per epoch (overwritten in place) and a timing summary.

        Args:
            profile: when true, run the same training under ``cProfile``.

        Raises:
            RuntimeError: if ``trainer()`` has not provided a database and a batch size.
        """
        if self.train_database is None or self.bs < 1:
            raise RuntimeError("trainer() must be called with a train_database before train()")

        # if profiling requested run training with cProfile
        if not profile:
            costs = [0]
            tot_time = 0
            self.batches = int(np.ceil(self.train_database.size / self.bs))
            for self.epoch in range(self.epochs):
                batch_generator = self.train_database.batch_generator(self.bs)
                cost = 0
                time = tm.time()
                for self.batch in range(self.batches):
                    self.outputs[0], self.target = next(batch_generator)
                    self.__forward_pass()
                    self.loss, self.loss_derivative[-1] = \
                        self.loss_function.loss_function(self.outputs[-1], self.target)
                    self.__back_propagation(self.output_activated_derivative)
                    cost += self.loss
                time = tm.time() - time
                cost /= self.train_database.size
                costs.append(cost)
                tot_time += time
                print(end='\r')
                print(CBOLD + CBLUE + CURL + f'epoch:{self.epoch}' + CEND,
                      CYELLOW + f'cost:{cost}', f'time:{time}' + CEND,
                      CBOLD + f'eta:{tot_time / (self.epoch + 1) * (self.epochs - self.epoch - 1)}',
                      CEND, end='')
            print()
            # max(..., 1) keeps the summary printable when epochs is 0
            print(CBOLD + CRED2 + f'tot_time:{tot_time}', f'avg_time:{tot_time / max(self.epochs, 1)}' + CEND)
            self.costs.append(costs[1:])
        else:
            cp.runctx("self.train()", globals=globals(), locals=locals())


# NN stats plotting
class PlotNeuralNetwork:
    """Plotting helpers for a trained network."""

    @staticmethod
    def plot_cost_graph(nn):
        """Show the cost history stored in ``nn.costs`` as one continuous curve.

        ``nn.costs`` holds one list of costs per training run. Each run becomes a polyline
        whose x values continue where the previous run stopped, and consecutive runs are
        joined by a two-point connector segment.
        """
        segments = []
        offset = 0  # x position at which the current run starts
        for run_number, run in enumerate(nn.costs):
            if run_number:
                # connector from the last point of the previous run to the first point of this one
                previous_end = segments[-1][-1]
                segments.append([previous_end, (offset, run[0])])
            segments.append([(offset + step, value) for step, value in enumerate(run)])
            offset += len(run)

        collection = mc.LineCollection(segments, colors=['red', 'red', 'green', 'green'], linewidths=1)
        axes = plt.subplot()
        axes.add_collection(collection)

        axes.autoscale()
        axes.margins(0.1)
        plt.show()


# save NN as dill pickle file(removes database from NN before saving)
class SaveNeuralNetwork:
    """Serialises a network to ``<cwd>/models/`` with dill."""

    @staticmethod
    def save(this, fname=None):
        """Write ``this`` to ``<cwd>/models/<fname>c<cost>.nns`` and return the path without extension.

        ``<cost>`` is the last recorded cost times 100, rounded to two decimals. The
        training database is detached while the object is dumped and re-attached afterwards,
        also when dumping fails.

        Args:
            this: the network to save; must have at least one recorded cost.
            fname: base file name, with or without a trailing ``.nns``; defaults to ``'nn'``.

        Returns:
            The path that was written, without the ``.nns`` extension.
        """
        if fname is None: fname = 'nn'
        # drop a trailing extension so that the cost suffix lands in front of it
        if fname.endswith('.nns'): fname = fname[:-len('.nns')]
        cost = str(round(this.costs[-1][-1] * 100, 2))
        fname += 'c' + cost
        fpath = os.getcwd() + '/models/'
        spath = fpath + fname
        os.makedirs(fpath, exist_ok=True)

        train_database = this.train_database
        this.train_database = None
        try:
            with open(spath + '.nns', 'wb') as file:
                dill.dump(this, file)
        finally:
            this.train_database = train_database

        print(spath)

        return spath


# load NN as python dill object
class LoadNeuralNetwork:
    """Restores an object that was serialised with dill."""

    @staticmethod
    def load(fname, fpath=None):
        """Load and return the object stored in a dill file.

        SECURITY: unpickling executes code embedded in the file; only load files you trust.

        Args:
            fname: file name inside ``<cwd>/models/``; used only when ``fpath`` is ``None``.
            fpath: explicit path to the file; takes precedence over ``fname``.

        Raises:
            ValueError: if both ``fname`` and ``fpath`` are ``None``.
        """
        if fpath is None:
            if fname is None:
                raise ValueError("Either fname or fpath should be given")
            fpath = os.getcwd() + '/models/' + fname

        with open(fpath, 'rb') as file:
            return dill.load(file)


# Load from *.gz files

In [None]:
# Placeholder paths: gzip.open() below receives these strings verbatim (no wildcard
# expansion takes place), so replace them with the real archive paths before running.
input_set_file = '*.gz'
output_set_file = '*.gz'
def training_inputs():
    """Read the gzip-compressed image file named by ``input_set_file``.

    The file starts with four big-endian 4-byte integers (magic number, image count, row
    count, column count) followed by one unsigned byte per pixel (values 0 to 255).

    Returns:
        A uint8 array of shape ``(image_count, row_count * column_count, 1)``.
    """
    with gzip.open(input_set_file, 'r') as stream:
        # header: magic number (read to advance the stream, otherwise unused), then the dimensions
        header_fields = [int.from_bytes(stream.read(4), 'big') for _ in range(4)]
        _magic, image_count, row_count, column_count = header_fields
        # everything after the header is raw pixel data
        pixels = np.frombuffer(stream.read(), dtype=np.uint8)

        return pixels.reshape((image_count, row_count * column_count, 1))


def training_outputs():
    """Read the gzip-compressed label file named by ``output_set_file``.

    The file starts with two big-endian 4-byte integers (magic number, label count)
    followed by one unsigned byte per label.

    Returns:
        A one-dimensional uint8 array holding every byte after the header.
    """
    with gzip.open(output_set_file, 'r') as stream:
        # both header fields are read only to advance the stream past them
        _magic, _label_count = (int.from_bytes(stream.read(4), 'big') for _ in range(2))
        # rest is the label data, each label is stored as unsigned byte
        raw_labels = stream.read()

        return np.frombuffer(raw_labels, dtype=np.uint8)


# Read both archives once: input_set is a uint8 array of shape (images, pixels, 1),
# output_set is a flat uint8 vector of labels.
input_set, output_set = training_inputs(), training_outputs()

# one-hot encoding
# one row per label, one column per class (0 .. largest label present), trailing axis of size 1
one_hot_set = np.zeros((output_set.size, output_set.max() + 1, 1))
# for sample i, set the entry in column output_set[i] to 1
one_hot_set[np.arange(0, output_set.size), output_set] = 1
output_set = one_hot_set

# uncomment to save at your path
# np.savez_compressed('*.nndb', input_set, output_set)


# Load from .nndb.npy

In [None]:
# Placeholder path to an archive written with np.savez_compressed (see the commented-out
# save call in the previous cell); replace it with the real file before running.
# NOTE(review): np.savez_compressed appends '.npz' to the name it is given, so a file saved
# from that call would end in '.nndb.npz' rather than '.nndb.npy' — confirm the intended path.
fpath = '*.nndb.npy'
nn_loader = np.load(fpath)
# 'arr_0' and 'arr_1' are the keys numpy assigns to arrays that were saved positionally
input_set, output_set = nn_loader['arr_0'], nn_loader['arr_1']

# Main

In [None]:
# Placeholders: set fname (a file inside ./models/) or fpath (an explicit path) before
# running. With both left as None, load() has no file to open and raises.
fname = None
fpath = None
nn = LoadNeuralNetwork.load(fname=fname,
                            fpath=fpath)

In [None]:
# Hyper params
# `?` is a fill-in marker, not valid Python: replace it with the layer sizes,
# input layer first, before running this cell.
shape = ?
# None selects the defaults chosen in ArtificialNeuralNetwork.__init__:
# Initializer.xavier(2), ActivationFunction.elu() and ActivationFunction.softmax().
initializer = None
activation_function = None
output_activation_function = None

# Initialize NN
nn = ArtificialNeuralNetwork(shape=shape,
                                   initializer=initializer,
                                   activation_function=activation_function,
                                   output_activation_function=output_activation_function)


In [None]:
# Hyper params
# `?` is a fill-in marker, not valid Python: replace both with the sample arrays
# (for example the ones produced by one of the loading cells above).
input_set = ?
output_set = ?
# DataBase converts both sets to float32 and shuffles them on construction
train_database = DataBase(input_set, output_set)
# divides each set in place by its largest absolute value; the returned scales are discarded here
train_database.normalize()
# None keeps the value the network already holds (see ArtificialNeuralNetwork.trainer)
epochs = None
batch_size = None
loss_function = None
optimizer = None

# Initialize trainer
nn.trainer(train_database=train_database,
           epochs=epochs,
           batch_size=batch_size,
           loss_function=loss_function,
           optimizer=optimizer)

In [None]:
# Run the training loop configured through nn.trainer(); prints cost and timing per epoch.
nn.train()

In [None]:
# Plot the cost history collected in nn.costs (one list of costs per train() call).
PlotNeuralNetwork.plot_cost_graph(nn)

In [None]:
# Serialise the network with dill into ./models/ (the file name gets a cost suffix); the path is printed.
SaveNeuralNetwork.save(nn)