# **Neural Network with Numpy**
*(Author: Axel ROCHEL)*

# **Neural Network**


## **Importations**

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

## Exceptions

In [None]:
class WrongInput(Exception):
    """Raised when the array (or array shape) fed to a layer is not acceptable."""

class NotImplementedError(Exception):
    """Raised when a feature that is not implemented yet is requested.

    NOTE: inside this module the name shadows the built-in
    ``NotImplementedError``; this class derives directly from ``Exception``.
    """

class AlreadyBuiltError(Exception):
    """Raised when the build function is called a second time."""

## **Activation functions**

In [None]:
class Activation:
    """Common parent of every activation function class."""

    def __init__(self):
        """Nothing to initialise: the base class holds no attribute."""


class Linear(Activation):
    """
    Class defining the identity activation function : x -> x
    """
    def __init__(self):
        pass

    def __call__(self, x):
        """
        Apply the function defined by the class to the input.

        Parameters:
            - x (np.array): batch of pre-activations.

        Returns:
            - x itself, unchanged.
        """
        return x

    def derivative(self, x):
        """
        Apply the derivative of the function defined by the class to the input.

        Parameters:
            - x (np.array, shape=(batch size, n)): batch of pre-activations.

        Returns:
            - jacobian (np.array, shape=(batch size, n, n)): one identity matrix per sample.
        """
        batch_size, width = x.shape[0], x.shape[1]
        # The Jacobian of the identity does not depend on the values of x, so it is
        # built from the shape only: a NaN/inf in x must not leak into the gradient.
        return np.broadcast_to(np.eye(width), (batch_size, width, width)).copy()


class Relu(Activation):
    """
    Rectified linear unit : x -> max(x, 0)
    """
    def __init__(self):
        pass

    def __call__(self, x):
        """
        Evaluate the activation element-wise on the input.
        """
        return np.maximum(x, 0)

    def derivative(self, x):
        """
        Evaluate the derivative of the activation on the input.

        Returns:
            - a batch of diagonal matrices whose diagonal is 1 where x > 0 and 0 elsewhere.
        """
        positive = (x > 0).astype(float)
        identity = np.eye(x.shape[1])
        # Broadcasting (..., n, 1) * (n, n) puts each sample's mask on a diagonal.
        return positive[..., np.newaxis] * identity


class Softmax(Activation):
    """
    Softmax activation function, applied along axis 1.
    """
    def __init__(self):
        pass

    def __call__(self, x):
        """
        Evaluate the softmax of each row of the input.

        Notes:
            - The row maximum is subtracted before exponentiating: the result is
              unchanged but exp never receives a large positive argument.
            - keepdims=True keeps the reduced axis so the reductions broadcast
              against the 2D input.
        """
        shifted = x - np.max(x, axis=1, keepdims=True)
        exponentials = np.exp(shifted)
        return exponentials / np.sum(exponentials, axis=1, keepdims=True)

    def derivative(self, x):
        """
        Evaluate the Jacobian of the softmax for each row of the input.

        Returns:
            - batch of matrices M with M[i][k] = s[i] * ((i == k) - s[k]), where s = self(x) for the row.
        """
        s = self(x)
        column = s[..., np.newaxis]
        outer = column * s[:, np.newaxis, :]      # s[i] * s[k]
        diagonal = np.eye(s.shape[1]) * column    # s[i] on the diagonal
        return diagonal - outer

## Layers

In [None]:
class Layer:
    """Common parent of every layer class."""

    def __init__(self):
        """Nothing to initialise: the base class holds no attribute."""


class Dense(Layer):
    """
    Layer for fully connected networks.

    Attributes:
        - nb_neurons (int): Number of neurons of the layer (stored as ``self.nb``).
        - activation (Activation): Activation function applied after linear combination (default=Linear).
        - name (string): Name given to the layer (default='Dense').
    """
    def __init__(self, nb_neurons, activation=Linear(), name='Dense'):
        self.type_name = 'Dense'
        self.nb = nb_neurons
        self.activation = activation
        self.name = name
        self.input_dim = 1
        self.z = None    # pre-activation cached by the last forward call
        self.out = None  # activation cached by the last forward call

    def backward(self, dloss_dact, previous_act, weights):
        """
        Compute a gradient step on the layer given its weights, the activation on the previous layer and the loss derivative w.r.t current activation.

        Must be called after ``forward``: it reads the pre-activation cached in ``self.z``.

        Parameters:
            - dloss_dact (np.array): loss derivative w.r.t the activation of this layer, shape (b_s, nb_neurons).
            - previous_act (np.array): output of the previous layer, shape (b_s, dim_in).
            - weights (np.array): weights of the layer, shape (dim_in, nb_neurons).

        Returns:
            - dloss_db (np.array): per-sample loss derivative w.r.t biases of the layer.
            - dloss_dw (np.array): per-sample loss derivative w.r.t weights of the layer.
            - dloss_dact (np.array): loss derivative w.r.t the activation of the previous layer.
        """
        ## Compute gradient ##
        # dloss/dw_jk(L) = dloss/da_j(L) * da_j(L)/dz_j(L) * dz_j/dw_jk(L)
        # and dz_j/dw_jk(L) = a_k(L-1), so dloss/dw(L) = act(L-1) * (dloss/da(L) * da(L)/dz(L))
        dact_dz = self.activation.derivative(self.z) # shape = (b_s, nb_neurons, nb_neurons)

        # Only the trailing singleton axis is dropped: an unrestricted squeeze would also
        # remove the batch axis when b_s == 1 (or the neuron axis when nb_neurons == 1)
        # and break the indexing below.
        dloss_db = np.squeeze(dact_dz @ dloss_dact[:, :, np.newaxis], axis=-1)  # (b_s, nb_neurons, nb_neurons) @ (b_s, nb_neurons, 1) -> (b_s, nb_neurons)
        dloss_dw = previous_act[:, :, np.newaxis] @ dloss_db[:, np.newaxis, :]  # (b_s, dim_in, 1) @ (b_s, 1, nb_neurons) -> (b_s, dim_in, nb_neurons)

        ## Precomputation for previous layer ##
        # dloss/da_k(L-1) = SUM over j of (dz_j(L)/da_k(L-1) * da_j(L)/dz_j(L) * dloss/da_j(L))
        # i.e. dloss/da(L-1) = dloss/da(L) * da(L)/dz(L) * dz(L)/da(L-1)
        dloss_dact = dloss_db @ weights.T # (b_s, nb_neurons) @ (dim_in, nb_neurons).T -> (b_s, dim_in)

        return dloss_db, dloss_dw, dloss_dact

    def compute_biases_shape(self):
        """
        Compute the shape of the biases array of the layer.
        """
        return (self.nb,)

    def compute_output_shape(self, input_shape=None):
        """
        Compute the shape of the output of the layer.

        Parameters:
            - input_shape: shape of the features which will feed the layer (unused here, default=None).
        """
        return (self.nb,)

    def compute_weights_shape(self, input_shape):
        """
        Compute the shape of the weights array induced by the previous layer.

        Parameters:
            - input_shape: shape of the features which will feed the layer.

        Returns:
            - (dim_in, nb_neurons), where dim_in is the flattened size of one input sample.
        """
        if len(input_shape) != 1:
            # Multi-dimensional samples are flattened by forward().
            return (np.prod(input_shape), self.nb)
        return (*input_shape, self.nb) # (dim_in, nb_neurons) so that a batch can be right-multiplied

    def forward(self, x, weights, biases):
        """
        Compute the activation of the layer given an input and some weights.

        The pre-activation and the activation are cached in ``self.z`` and ``self.out``.

        Parameters:
            - x (np.array): batch of input vectors (inputs that are not 2D are flattened per sample).
            - weights (np.array): weights used for the forward step.
            - biases (np.array): biases used for the forward step.

        Returns:
            - out (np.array): the computed output.
        """
        if len(x.shape) != 2:
            self.z = np.reshape(x, (x.shape[0], -1)) @ weights + biases
        else:
            self.z = x @ weights + biases
        self.out = self.activation(self.z)
        return self.out

    def init_weights(self, input_shape):
        """
        Compute random initial weights for the layer.

        Parameters:
            - input_shape: shape of the features which will feed the layer.

        Returns:
            - init_weights (np.array): Array of the initialized weights.
            - init_biases (np.array): Array of the initialized biases.
        """
        weights_shape = self.compute_weights_shape(input_shape)
        biases_shape = self.compute_biases_shape()

        std = 1 / np.sqrt(2 * np.prod(input_shape)) # standard deviation, shrinking with the input size

        init_weights = np.random.randn(*weights_shape) * std
        init_biases = np.random.randn(*biases_shape) * std

        return init_weights, init_biases


class Conv2D(Layer):
    """
    Layer for convolution applied to 2-dimensional inputs. (Padding not implemented yet)

    The convolution is computed as a single matrix product: the kernel weights are
    scattered into a matrix W of shape (nb * h_out * w_out, c_in * h_in * w_in) using
    the index tables prepared by ``build``.

    Attributes:
        - nb_features (int): number of features computed by the layer (stored as ``self.nb``).
        - kernel_size (int or tuple): shape of the kernel window with the following format: (height, width), height = width if int given.
        - activation (Activation): Activation function applied after linear combination. (default=Linear).
        - stride (int or tuple): stride applied during the convolution (default=(1, 1)).
        - name (string): Name given to the layer (default='Conv2D').
    """
    def __init__(self, nb_features, kernel_size, activation=Linear(), stride=(1,1), name='Conv2D'):
        self.type_name = 'Conv2D'
        self.nb = nb_features
        self.activation = activation
        self.kernel_size = kernel_size
        if isinstance(kernel_size, int):
            self.kernel_size = (kernel_size, kernel_size)
        self.stride = stride
        if isinstance(stride, int):
            self.stride = (stride, stride)
        self.name = name
        self.input_dim = 3
        self.indices = None      # row indices into W, set by build()
        self.indices_bis = None  # column indices into W, set by build()
        # State written by forward() and read by backward(); declared here so that
        # backward() never hits a missing attribute.
        self.W = None
        self.z = None
        self.out = None

    def backward(self, dloss_dact, previous_act, weights):
        """
        Compute a gradient step on the layer given its weights, the activation on the previous layer and the loss derivative w.r.t current activation.

        Must be called after ``forward``: it reads the pre-activation cached in ``self.z``.

        Parameters:
            - dloss_dact (np.array): loss derivative w.r.t the (flattened) activation of this layer.
            - previous_act (np.array): output of the previous layer (flattened per sample if not 2D).
            - weights (np.array): weights of the layer, shape (nb, c_in, k0, k1).

        Returns:
            - dloss_db (np.array): per-sample loss derivative w.r.t biases of the layer, shape (b_s, nb).
            - dloss_dw (np.array): per-sample loss derivative w.r.t weights of the layer, shape (b_s, nb, c_in, k0, k1).
            - dloss_dact (np.array): loss derivative w.r.t the (flattened) activation of the previous layer.
        """
        ## 1st layer case ##
        if len(previous_act.shape) != 2:
            previous_act = np.reshape(previous_act, (previous_act.shape[0], -1))

        ## Parameters ##
        c_in, h_in, w_in = self.input_shape
        nb, _, k0, k1 = self.weight_shape
        _, h_out, w_out = self.output_shape

        ## Weights computation ##
        if self.W is None:
            # No matrix cached by forward(): rebuild it from the kernel weights.
            W = np.zeros((w_out*h_out*nb, w_in*h_in*c_in))
            W[self.indices, self.indices_bis] += np.reshape(weights, (-1,))[:, np.newaxis]
        else:
            # Consume the matrix cached by forward(); it is cleared after use.
            W = self.W
            self.W = None

        ## Compute gradient ##
        # dloss/dw_jk(L) = dloss/da_j(L) * da_j(L)/dz_j(L) * dz_j/dw_jk(L)
        # and dz_j/dw_jk(L) = a_k(L-1), so dloss/dw(L) = act(L-1) * (dloss/da(L) * da(L)/dz(L))
        dact_dz = self.activation.derivative(self.z) # shape = (b_s, dim_out, dim_out)

        # Only the trailing singleton axis is dropped: an unrestricted squeeze would also
        # remove the batch axis when b_s == 1 and break the indexing below.
        dloss_dz = np.squeeze(dact_dz @ dloss_dact[:, :, np.newaxis], axis=-1)  # (b_s, dim_out)
        dloss_dw = previous_act[:, :, np.newaxis] @ dloss_dz[:, np.newaxis, :]  # (b_s, dim_in, 1) @ (b_s, 1, dim_out) -> (b_s, dim_in, dim_out)

        ## Precomputation for previous layer ##
        # dloss/da_k(L-1) = SUM over j of (dz_j(L)/da_k(L-1) * da_j(L)/dz_j(L) * dloss/da_j(L))
        # i.e. dloss/da(L-1) = dloss/da(L) * da(L)/dz(L) * dz(L)/da(L-1)
        dloss_dact = dloss_dz @ W # (b_s, dim_out) @ (dim_out, dim_in) -> (b_s, dim_in)

        ## Back to the shapes of the real parameters ##
        bs = self.z.shape[0]

        # One bias per feature map: sum over every output position of that map.
        dloss_db = np.sum(np.reshape(dloss_dz, (bs, self.nb, -1)), axis=2)
        # One kernel weight is shared by every output position: gather the dense
        # gradient entries it maps to and sum them.
        dloss_dweights = np.sum(dloss_dw[:, self.indices_bis, self.indices], axis=2)
        # The flat order is the row-major order of weight_shape = (nb, c_in, k0, k1),
        # so the gradient must be reshaped with k0 before k1 (matters for non-square kernels).
        dloss_dweights = np.reshape(dloss_dweights, (bs, nb, c_in, k0, k1))

        return dloss_db, dloss_dweights, dloss_dact

    def build(self, input_shape):
        """
        Set the parameters essential for forward and backward step (to compute convolution with matrix multiplication).

        Sets ``self.input_shape``, ``self.output_shape``, ``self.indices`` and ``self.indices_bis``.

        Parameters:
            - input_shape: (h_in, w_in) or (c_in, h_in, w_in).

        Raises:
            - WrongInput: input_shape has neither 2 nor 3 dimensions.
        """
        if len(input_shape) == 2:
            input_shape = (1, *input_shape)  # single-channel input
        if len(input_shape) != 3:
            raise WrongInput('the dimension of the input is incorrect. Given : {} ; Expected : {} or {}.'.format(len(input_shape), 2, 3))

        c_in, h_in, w_in = input_shape
        # Same values as weight_shape[2:], read from kernel_size so that build() does not
        # require compute_weights_shape() to have been called first.
        k0, k1 = self.kernel_size
        h_out = (h_in - k0) // self.stride[0] + 1
        w_out = (w_in - k1) // self.stride[1] + 1
        self.input_shape = input_shape
        self.output_shape = (self.nb, h_out, w_out)

        # Flat input index of the top-left corner of the window of each output position.
        offset = np.array([
            j * self.stride[1] + i * w_in * self.stride[0]
            for i in range(h_out)
            for j in range(w_out)
        ])

        # indices[n]: flat output indices of feature map n (rows of W).
        indices = np.zeros((self.nb, w_out * h_out))
        # indices_bis[r]: for flat kernel weight r, the flat input index it multiplies
        # at each output position (columns of W).
        indices_bis = np.zeros((self.nb * c_in * k0 * k1, w_out * h_out))
        for n_filter in range(self.nb):
            indices[n_filter] = np.arange(0, w_out * h_out) + n_filter*w_out*h_out
            for k_filter in range(c_in):
                for i_filter in range(k0):
                    for j_filter in range(k1):
                        # Row-major flat index of weights[n_filter, k_filter, i_filter, j_filter].
                        flat_weight = j_filter + k1*i_filter + k0*k1*k_filter + k0*k1*c_in*n_filter
                        indices_bis[flat_weight] = j_filter + i_filter * w_in + k_filter * h_in * w_in + offset

        # Every weight of filter n addresses the rows of feature map n.
        self.indices = np.repeat(indices, c_in*k0*k1, axis=0).astype(int)
        self.indices_bis = indices_bis.astype(int)

    def compute_biases_shape(self):
        """
        Compute the shape of the biases array of the layer.
        """
        return (self.nb,)

    def compute_output_shape(self, input_shape):
        """
        Compute the shape of the output of the layer and build the layer.

        Parameters:
            - input_shape: shape of the features which will feed the layer.

        Returns:
            - (nb, h_out, w_out).
        """
        self.build(input_shape)

        return self.output_shape

    def compute_weights_shape(self, input_shape):
        """
        Compute the shape of the weights array induced by the previous layer.

        The result is also stored in ``self.weight_shape``.

        Parameters:
            - input_shape: shape of the features which will feed the layer.

        Raises:
            - WrongInput: input_shape has neither 2 nor 3 dimensions.
        """
        if len(input_shape) not in (2, 3):
            raise WrongInput('the dimension of the input is incorrect. Given : {} ; Expected : {} or {}.'.format(len(input_shape), 2, 3))
        if len(input_shape) == 2:
            c_in = 1
        else:
            c_in = input_shape[0]
        self.weight_shape = (self.nb, c_in, self.kernel_size[0], self.kernel_size[1])
        return self.weight_shape

    def forward(self, x, weights, biases):
        """
        Compute the activation of the layer given an input and some weights.

        The dense matrix, the pre-activation and the activation are cached in
        ``self.W``, ``self.z`` and ``self.out`` for the backward step.

        Parameters:
            - x (np.array): batch of inputs, (b_s, h_in, w_in) or (b_s, c_in, h_in, w_in).
            - weights (np.array): weights used for the forward step.
            - biases (np.array): biases used for the forward step.

        Returns:
            - out (np.array): the computed output, shape (b_s, nb, h_out, w_out).

        Raises:
            - ValueError: weights or biases do not have the expected shape.
        """
        if weights.shape != self.weight_shape:
            raise ValueError("Bad weight shape. Given : {} ; Expected : {}.".format(weights.shape, self.weight_shape))
        if biases.shape != (self.nb,):
            raise ValueError("Bad biases shape. Given : {} ; Expected : {}.".format(biases.shape, (self.nb,)))
        if len(x.shape) == 3:
            x = x[:, np.newaxis,...]  # add the channel axis

        ## Parameters ##
        b_s, c_in, h_in, w_in = x.shape[0], *self.input_shape
        nb, c_in, k0, k1 = self.weight_shape
        _, h_out, w_out = self.output_shape

        ## Weights computation ##
        # Scatter each kernel weight to every (output position, input position) it links.
        W = np.zeros((w_out*h_out*nb, w_in*h_in*c_in))
        W[self.indices, self.indices_bis] += np.reshape(weights, (-1,))[:, None]
        self.W = W # for backward

        # One bias per feature map, repeated for every output position of that map.
        B = np.repeat(biases, h_out * w_out)

        ## Output computation ##
        reshaped_x = np.reshape(x, (x.shape[0], -1)) # (b_s, c_in * h_in * w_in)
        self.z = reshaped_x @ W.T + B
        self.out = self.activation(self.z)

        return np.reshape(self.out, (b_s, nb, h_out, w_out))

    def init_weights(self, input_shape):
        """
        Compute random initial weights for the layer.

        Parameters:
            - input_shape: shape of the features which will feed the layer.

        Returns:
            - init_weights (np.array): Array of the initialized weights.
            - init_biases (np.array): Array of the initialized biases.
        """
        weights_shape = self.compute_weights_shape(input_shape)
        biases_shape = self.compute_biases_shape()

        if len(input_shape) == 2:
            c_in = 1
        else:
            c_in = input_shape[0]

        std = 1 / np.sqrt(2 * c_in * self.kernel_size[0] * self.kernel_size[1]) # standard deviation, shrinking with the window size

        init_weights = np.random.randn(*weights_shape) * std
        init_biases = np.random.randn(*biases_shape) * std

        return init_weights, init_biases

## **Loss functions**

In [None]:
class Loss:
    """Common parent of every loss function class."""

    def __init__(self):
        """Nothing to initialise: the base class holds no attribute."""


class MSE(Loss):
    """
    Mean Squared Error loss.
    """
    def __init__(self):
        pass

    def __call__(self, pred, target):
        """
        Evaluate the loss of each sample of the batch.

        Parameters:
            - pred (np.array): output of a neural network.
            - target (np.array): ideal output of the neural network.

        Returns:
            - losses (np.array with shape=(pred.shape[0],)): mean over axis 1 of the squared error of each sample.
        """
        residual = pred - target
        return np.mean(np.square(residual), axis=1)

    def compute_accuracy(self, pred, targets):
        """
        Measure how often the prediction and the target agree on the arg-max class.

        Parameters:
            - pred (np.array): output of a neural network.
            - targets (np.array): ideal output of the neural network.

        Returns:
            - accuracy (float): number of samples whose arg-max matches the target's, divided by the number of samples.
        """
        predicted_class = np.argmax(pred, axis=1)
        expected_class = np.argmax(targets, axis=1)
        nb_correct = np.sum(predicted_class == expected_class)
        return nb_correct / pred.shape[0]

    def derivative(self, pred, target):
        """
        Evaluate the derivative of the loss w.r.t the prediction.

        Parameters:
            - pred (np.array): output of a neural network.
            - target (np.array): ideal output of the neural network.

        Returns:
            - dloss_dpred (np.array with the same shape as pred): dloss_dpred[i][j] is the derivative of the loss of the ith sample w.r.t its jth output neuron.
        """
        residual = pred - target
        nb_outputs = pred.shape[1]
        return 2 * residual / nb_outputs
    

class CategoricalCrossentropy(Loss):
    """
    Cross-entropy loss applied to one-hot targets.

    Attributes:
        - epsilon (float): lower bound applied to the predictions before log/division (default=1e-7).
    """
    def __init__(self, epsilon=1e-7):
        self.epsilon = epsilon

    def __call__(self, pred, target):
        """
        Evaluate the loss of each sample of the batch.

        Parameters:
            - pred (np.array): output of a neural network.
            - target (np.array): ideal output of the neural network.

        Returns:
            - losses (np.array with shape=(pred.shape[0],)): - SUM over j of target[j] * log(pred[j]) for each sample.
        """
        # Predictions are floored at epsilon so that log never receives 0.
        safe_pred = np.clip(pred, self.epsilon, None)
        weighted_log = target * np.log(safe_pred)
        return - np.sum(weighted_log, axis=1)

    def compute_accuracy(self, pred, targets):
        """
        Measure how often the prediction and the target agree on the arg-max class.

        Parameters:
            - pred (np.array): output of a neural network.
            - targets (np.array): ideal output of the neural network.

        Returns:
            - accuracy (float): number of samples whose arg-max matches the target's, divided by the number of samples.
        """
        predicted_class = np.argmax(pred, axis=1)
        expected_class = np.argmax(targets, axis=1)
        nb_correct = np.sum(predicted_class == expected_class)
        return nb_correct / pred.shape[0]

    def derivative(self, pred, target):
        """
        Evaluate the derivative of the loss w.r.t the prediction.

        Parameters:
            - pred (np.array): output of a neural network.
            - target (np.array): ideal output of the neural network.

        Returns:
            - dloss_dpred (np.array with the same shape as pred): dloss_dpred[i][j] is the derivative of the loss of the ith sample w.r.t its jth output neuron.
        """
        # Predictions are floored at epsilon so that the division never hits 0.
        safe_pred = np.clip(pred, self.epsilon, None)
        return - target / safe_pred

## **Optimizers**

In [None]:
class Optimizer:
    """Common parent of every optimizer class."""

    def __init__(self):
        """Nothing to initialise: the base class holds no attribute."""


class SGD(Optimizer):
    """
    Stochastic gradient descent, with optional (Nesterov) momentum.

    Attributes:
        - learning_rate (float): Factor reducing the gradient step (default=1e-2).
        - momentum (float): hyperparameter >= 0 that accelerates gradient descent in the relevant direction and dampens oscillations (default=0).
        - nesterov (boolean): whether Nesterov momentum is applied or not.
    """
    def __init__(self, learning_rate=0.01, momentum=0.0, nesterov=False):
        assert (momentum >= 0.0), 'momentum should be positive or null'
        self.lr = learning_rate
        self.momentum = momentum
        self.nesterov = nesterov
        self.delta_w = None # last step applied to the weights, one entry per layer
        self.delta_b = None # last step applied to the biases, one entry per layer

    def update_weights(self, weights, biases, dloss_dw, dloss_db):
        """
        Perform a gradient step by computing the new values of parameters.

        Parameters:
            - weights (list of arrays): Weights of each layer before the gradient step.
            - biases (list of arrays): Biases of each layer before the gradient step.
            - dloss_dw (list of arrays): Per-sample derivatives of the loss w.r.t the weights of each layer (batch on axis 0).
            - dloss_db (list of arrays): Per-sample derivatives of the loss w.r.t the biases of each layer (batch on axis 0).

        Returns:
            - (new_weight, new_biases): parameter values after the gradient step.
        """
        nb_layers = len(weights)
        if self.delta_w is None:
            # First call: no previous step yet.
            self.delta_w = [0] * nb_layers
            self.delta_b = [0] * nb_layers

        new_weight, new_biases = [], []
        delta_w, delta_b = [], []
        for i in range(nb_layers):
            # Plain gradient step, averaged over the batch.
            step_w = self.lr * np.mean(dloss_dw[i], axis=0)
            step_b = self.lr * np.mean(dloss_db[i], axis=0)

            velocity_w = self.momentum * self.delta_w[i] - step_w
            velocity_b = self.momentum * self.delta_b[i] - step_b
            delta_w.append(velocity_w)
            delta_b.append(velocity_b)

            if self.nesterov:
                # Look ahead along the updated velocity before applying the gradient step.
                new_weight.append(weights[i] + self.momentum * velocity_w - step_w)
                new_biases.append(biases[i] + self.momentum * velocity_b - step_b)
            else:
                new_weight.append(weights[i] + velocity_w)
                new_biases.append(biases[i] + velocity_b)

        self.delta_w, self.delta_b = delta_w, delta_b
        return new_weight, new_biases


class Adam(Optimizer):
    """
    Adaptive Moment estimation optimizer.

    Attributes:
        - learning_rate (float): Factor reducing the gradient step (default=1e-3).
        - beta_1 (float): exponential decay rate for the 1st moment estimates (default=0.9).
        - beta_2 (float): exponential decay rate for the 2nd moment estimates (default=0.999).
        - epsilon (float): small constant preventing numerical instability (default=1e-7).

    """
    def __init__(self, learning_rate=1e-3, beta_1=0.9, beta_2=0.999, epsilon=1e-7):
        self.lr = learning_rate
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.eps = epsilon
        # Running 1st (ms_*) and 2nd (vs_*) moment estimates, one entry per layer.
        self.ms_w = None
        self.ms_b = None
        self.vs_w = None
        self.vs_b = None
        self.nb_it = 1 # index of the next update, used for bias correction

    def update_weights(self, weights, biases, dloss_dw, dloss_db):
        """
        Perform a gradient step by computing the new values of parameters.

        Parameters:
            - weights (list of arrays): Weights of each layer before the gradient step.
            - biases (list of arrays): Biases of each layer before the gradient step.
            - dloss_dw (list of arrays): Per-sample derivatives of the loss w.r.t the weights of each layer (batch on axis 0).
            - dloss_db (list of arrays): Per-sample derivatives of the loss w.r.t the biases of each layer (batch on axis 0).

        Returns:
            - (new_weight, new_biases): parameter values after the gradient step.
        """
        nb_layers = len(weights)
        if self.ms_w is None:
            # First call: every moment estimate starts at zero.
            self.ms_w = [0] * nb_layers
            self.ms_b = [0] * nb_layers
            self.vs_w = [0] * nb_layers
            self.vs_b = [0] * nb_layers

        # Bias-correction terms are the same for every layer of this update.
        corrected_lr = self.lr / (1 - self.beta_1 ** self.nb_it)
        correction_2 = 1 - self.beta_2 ** self.nb_it

        new_weight, new_biases = [], []
        ms_w, ms_b, vs_w, vs_b = [], [], [], []
        for i in range(nb_layers):
            grad_w = np.mean(dloss_dw[i], axis=0)
            grad_b = np.mean(dloss_db[i], axis=0)

            m_w = self.beta_1 * self.ms_w[i] + (1 - self.beta_1) * grad_w
            m_b = self.beta_1 * self.ms_b[i] + (1 - self.beta_1) * grad_b
            v_w = self.beta_2 * self.vs_w[i] + (1 - self.beta_2) * np.square(grad_w)
            v_b = self.beta_2 * self.vs_b[i] + (1 - self.beta_2) * np.square(grad_b)

            ms_w.append(m_w)
            ms_b.append(m_b)
            vs_w.append(v_w)
            vs_b.append(v_b)

            new_weight.append(weights[i] - (corrected_lr * m_w / (self.eps + np.sqrt(v_w / correction_2))))
            new_biases.append(biases[i] - (corrected_lr * m_b / (self.eps + np.sqrt(v_b / correction_2))))

        self.ms_w, self.ms_b, self.vs_w, self.vs_b = ms_w, ms_b, vs_w, vs_b
        self.nb_it += 1
        return new_weight, new_biases


class RMSProp(Optimizer):
    """
    Class of the Root Mean Squared Propagation.

    Attributes:
        - learning_rate (float): Factor reducing the gradient step (default=1e-3).
        - momentum (float): hyperparameter >= 0 that accelerates gradient descent in the relevant direction and dampens oscillations (default=0).
        - rho (float >= 0): Discounting factor for the history/coming gradient (default=0.9).
        - epsilon (float): small constant preventing numerical instability (default=1e-7).
    """
    def __init__(self, learning_rate=1e-3, rho=0.9, momentum=0.0, epsilon=1e-7):
        assert (momentum >= 0.0), 'momentum should be positive or null'
        self.lr = learning_rate
        self.rho = rho
        self.momentum = momentum
        self.eps = epsilon
        self.sqr_w = None # running average of the squared weight gradients, one entry per layer
        self.sqr_b = None # running average of the squared bias gradients, one entry per layer
        self.mom_w = None # last step applied to the weights (momentum buffer), one entry per layer
        self.mom_b = None # last step applied to the biases (momentum buffer), one entry per layer

    def update_weights(self, weights, biases, dloss_dw, dloss_db):
        """
        Perform a gradient step by computing the new values of parameters.

        With ``momentum == 0`` the step is ``lr * grad / (sqrt(avg) + eps)``. With
        ``momentum > 0`` the step is ``momentum * previous_step + lr * grad / (sqrt(avg) + eps)``.

        Parameters:
            - weights (list of arrays): Weights of each layer before the gradient step.
            - biases (list of arrays): Biases of each layer before the gradient step.
            - dloss_dw (list of arrays): Per-sample derivatives of the loss w.r.t the weights of each layer (batch on axis 0).
            - dloss_db (list of arrays): Per-sample derivatives of the loss w.r.t the biases of each layer (batch on axis 0).

        Returns:
            - (new_weight, new_biases): parameter values after the gradient step.
        """
        nb_layers = len(weights)
        if self.sqr_w is None:
            self.sqr_w = [0 for i in range(nb_layers)]
            self.sqr_b = [0 for i in range(nb_layers)]
        if self.mom_w is None:
            self.mom_w = [0 for i in range(nb_layers)]
            self.mom_b = [0 for i in range(nb_layers)]

        new_weight = []
        new_biases = []
        sqr_w = []
        sqr_b = []
        mom_w = []
        mom_b = []
        for i in range(nb_layers):
            grad_w = np.mean(dloss_dw[i], axis=0)
            grad_b = np.mean(dloss_db[i], axis=0)

            sqr_w.append(self.rho * self.sqr_w[i] + (1 - self.rho) * np.square(grad_w))
            sqr_b.append(self.rho * self.sqr_b[i] + (1 - self.rho) * np.square(grad_b))

            step_w = self.lr * grad_w / (np.sqrt(sqr_w[i]) + self.eps)
            step_b = self.lr * grad_b / (np.sqrt(sqr_b[i]) + self.eps)

            # The momentum hyperparameter was stored but never applied; the branch keeps
            # the momentum == 0 path numerically identical to the plain RMSProp step.
            if self.momentum > 0:
                step_w = self.momentum * self.mom_w[i] + step_w
                step_b = self.momentum * self.mom_b[i] + step_b
            mom_w.append(step_w)
            mom_b.append(step_b)

            new_weight.append(weights[i] - step_w)
            new_biases.append(biases[i] - step_b)

        self.sqr_w, self.sqr_b = sqr_w, sqr_b
        self.mom_w, self.mom_b = mom_w, mom_b
        return new_weight, new_biases

## **Models**

In [None]:
class Model():
    """
    Sequential neural network: an ordered stack of layers trained by backpropagation.

    Layers are appended with add(), parameters are created by build(), and the model is then
    trained with fit() and used through forward() or by calling the model directly.
    """
    def __init__(self):
        self.layers = []     # layers, in forward order
        self.weights = []    # one weights array per layer (filled by build)
        self.biases = []     # one biases array per layer (filled by build)
        self.built = False   # set to True once build() has run
    
    def __call__(self, x):
        """Shortcut for forward(x)."""
        return self.forward(x)
    
    @staticmethod
    def _progress_bar(current, total):
        """Return the text of the 20-slot progress bar for batch `current` out of `total`."""
        filled = 20 * current // total
        # The '>' head always occupies one slot, so the bar keeps the same width even when filled == 0.
        return "\r{}/{} [".format(current, total) + (filled - 1) * "=" + ">" + (20 - max(filled, 1)) * "." + "]"

    def add(self, layer):
        """
        Add a new layer to the graph model.

        Parameters:
            - layer (Layer): the layer added.
        """
        self.layers.append(layer)
    
    def build(self, input_dim, optimizer, loss):
        """
        Initialize weights and trainings parameters.

        Parameters:
            - input_dim (tuple): shape of each sample which will feed the model.
            - optimizer (Optimizer): Optimizer which will be used during training.
            - loss (Loss): function which will be decreased during training.
        
        Raises:
            - AlreadyBuiltError: Model already built.
        """
        if self.built:
            raise AlreadyBuiltError("Model already built.")
        self.input_dim = input_dim
        # Chain the shapes: the output shape of a layer is the input shape of the next one.
        for layer in self.layers:
            init_weights, init_biases = layer.init_weights(input_dim)
            self.weights.append(init_weights)
            self.biases.append(init_biases)
            input_dim = layer.compute_output_shape(input_dim)
        self.loss = loss
        self.opt = optimizer
        # Per-layer gradients, filled in during backpropagation.
        self.dloss_dw = [None] * len(self.layers)
        self.dloss_db = [None] * len(self.layers)
        self.built = True

    def evaluate(self, data, target):
        """
        Compute loss and accuracy of the model on given data.

        Parameters:
            - data (np.array): array containing inputs whose shape are equals to self.input_dim.
            - target (np.array): array corresponding to the ideal output of the neural network given the input data.
        
        Returns:
            - loss, accuracy.
        """
        # A single forward pass feeds both metrics.
        pred = self(data)
        return np.mean(self.loss(pred, target)), self.loss.compute_accuracy(pred, target)
    
    def fit(self, data, targets, epochs=1, validation_split=None, batch_size=32, shuffle=True, verbose='training'):
        """
        Train the model.

        Parameters:
            - data (np.array): Input of the neural network.
            - targets: Output that should be returned by the model according to the input data.
            - epochs (int): number of epochs. (default=1)
            - validation_split (float or None): ratio between the number of validation data and training data. None is equivalent to 0.0 (default=None)
            - batch_size: number of data evaluated between each gradient step. (default=32)
            - shuffle (boolean): if True, data are shuffle at each epoch. (default=True)
            - verbose: Define the information displayed in the terminal (default: 'training' ; use 'debug' to see all intermediate results).

        Return:
            - history (dict): recorded loss and accuracy of the training.
        """
        # None and 0 both mean "no validation set"; a null test size is not passed to train_test_split.
        use_validation = validation_split is not None and validation_split != 0
        if use_validation:
            data_train, data_valid, targets_train, targets_valid = train_test_split(data, targets, test_size=validation_split, random_state=31, stratify=targets)
            hist = {'acc':[], 'loss':[], 'val_acc':[], 'val_loss':[]}
        else:
            data_train, targets_train = data, targets
            hist = {'acc':[], 'loss':[]}

        nb_samples = len(data_train)
        batch_total = (nb_samples - 1) // batch_size + 1

        for epoch in range(epochs):
            print('Epoch {} / {}'.format(epoch+1, epochs))

            batch_index = np.arange(0, nb_samples)
            if shuffle:
                np.random.shuffle(batch_index)
            current_batch_nb = 0
            # The slice is truncated at the end of the array, so the last batch is passed even if it is incomplete.
            for current_index in range(0, nb_samples, batch_size):
                indexes = batch_index[current_index:current_index+batch_size]

                data_batch = data_train[indexes]
                targets_batch = targets_train[indexes]
            
                pred_batch = self(data_batch)
                loss = np.mean(self.loss(pred_batch, targets_batch))
                current_batch_nb += 1

                print(self._progress_bar(current_batch_nb, batch_total), end="")
                print(" loss: ", round(loss, 3), end="")

                ## Backpropagation ##
                dloss_dact = self.loss.derivative(pred_batch, targets_batch) #  shape = (bs, nb_neurons)
                for i in range(len(self.layers) - 1, -1, -1):
                    # The first layer is fed by the batch itself, the others by the previous layer's output.
                    previous_act = data_batch if i == 0 else self.layers[i-1].out
                    self.dloss_db[i], self.dloss_dw[i], dloss_dact = self.layers[i].backward(dloss_dact, previous_act, self.weights[i])
                self.weights, self.biases = self.opt.update_weights(self.weights, self.biases, self.dloss_dw, self.dloss_db)

                ## Debug
                if verbose == 'debug':
                    print('\ndloss_dw\n', self.dloss_dw)
                    print('\ndloss_db\n', self.dloss_db)

            ## Evaluate ##
            print(self._progress_bar(current_batch_nb, batch_total), end="")

            loss, acc = self.evaluate(data_train, targets_train)
            hist['loss'].append(loss)
            hist['acc'].append(acc)
            if use_validation:
                val_loss, val_acc = self.evaluate(data_valid, targets_valid)
                print(" loss: {} ; acc: {} ; val_loss: {} ; val_acc: {}".format(loss, acc, val_loss, val_acc))
                hist['val_loss'].append(val_loss)
                hist['val_acc'].append(val_acc)
            else:
                print(" loss: {} ; acc: {}".format(loss, acc))

        return hist

    def forward(self, x):
        """
        Apply a forward step on the given input.

        Parameters:
            - x (np.array): array containing inputs which shape are equals to self.input_dim.
        
        Return:
            - out (np.array): The output of the forward pass.
        
        Raises:
            - WrongInput: The dimension of the inputs aren't equal to the input dimension of the model.
        """
        # The first axis is the batch; every remaining axis must match the shape given to build().
        if x.shape != (x.shape[0], *self.input_dim):
            raise WrongInput("The dimension of the inputs aren't equal to the input dimension of the model. (Given: {}, Expected:{})".format(x[0].shape, self.input_dim))
        
        out = x
        for layer, layer_weights, layer_biases in zip(self.layers, self.weights, self.biases):
            out = layer.forward(out, layer_weights, layer_biases)
        return out
    
    def summary(self):
        """
        Prints a summary of the network.

        Once the model is built, one row per layer shows its type, input shape, output shape and
        number of parameters. Before that, only the layer types are listed.
        """
        if self.built:
            print("layer | input shape | output shape | nb_param |")
            input_dim = self.input_dim
            for layer, layer_weights, layer_biases in zip(self.layers, self.weights, self.biases):
                print('-' * 47)
                print(layer.type_name, end=(6-len(layer.type_name))*" " + "| ")
                
                nb_space_in = 12 - len(str(input_dim))
                print(input_dim, end=nb_space_in*" " + "| ")
                
                output_dim = layer.compute_output_shape(input_dim)
                nb_space_out = 13 - len(str(output_dim))
                print(output_dim, end=nb_space_out*" " + "| ")
                
                nb_param = int(np.prod(layer_weights.shape) + np.prod(layer_biases.shape))
                # Pad on the number of digits: unlike a log10-based width, this also works for 0 parameters.
                print(nb_param, end=(9-len(str(nb_param))) * " " + "|\n")
                input_dim = output_dim
        
        else:
            print('MODEL UNBUILT')
            print("layer    |")
            for layer in self.layers:
                print('-' * 10)
                nb_char = len(layer.type_name)
                print(layer.type_name, end=(9-nb_char)*" " + "|\n")
        print("\n")

# **Tests**

## Unit Tests

### Fully connected

Init & forward tests

In [None]:
# Manual check of a two-layer Dense network: build, forward pass with hand-set
# parameters, then one training epoch in debug mode.

# Batch of 5 samples with 3 features each.
inputs = np.array([[1, -1, 1],
                  [2, 1, 4],
                  [-1, 2, 1],
                  [2, 2, 1],
                  [4, 1, -4]])

# One-hot rows over 4 classes, one row per sample.
target = np.array([[0, 0, 1, 0],
                   [0, 1, 0, 0],
                   [1, 0, 0, 0],
                   [1, 0, 0, 0],
                   [0, 0, 1, 0]])

print('#### BUILDING TESTS ####')
model = Model()
model.add(Dense(2, activation=Relu()))
model.add(Dense(4, activation=Softmax()))

# summary() is called before and after build() to show both of its display modes.
model.summary()
model.build(inputs[0].shape, SGD(), MSE())
model.summary()

print('weights :\n', model.weights)
print('biases :\n', model.biases)



print('\n\n#### FORWARD TESTS ####')
# Overwrite the initial parameters with fixed values so the printed results are reproducible.
model.weights = [np.array([[0, 1],
                          [2, 1],
                          [-1, 0.5]]),
                np.array([[1, -1, 2, 1],
                          [0, -2, -1, 0.5]])]

model.biases = [np.array([1, 0]), np.array([0.5, -1, 0, 2])]
print('weights :\n', model.weights)
print('biases :\n', model.biases)

output = model(inputs)


# Show the values each layer stored during the forward pass above.
for i in range(len(model.layers)):
    print('\nLayer', i+1)
    print('z:', model.layers[i].z)
    print('activation:', model.layers[i].out)

print('\noutput :\n', output)
print('\n loss:', model.loss(output, target))
print('\n eval:', model.evaluate(inputs, target))


print("\n\n\n### BACKWARD TESTS ###\n")

# verbose='debug' makes fit() print the gradients computed for every batch.
model.fit(inputs, target, epochs=1, verbose='debug')

print(model.biases)

#### BUILDING TESTS ####
MODEL UNBUILT
layer    |
----------
Dense    |
----------
Dense    |


layer | input shape | output shape | nb_param |
-----------------------------------------------
Dense | (3,)        | (2,)         | 8        |
-----------------------------------------------
Dense | (2,)        | (4,)         | 12       |


weights :
 [array([[-0.24594327, -0.70327267],
       [ 0.25816348, -0.3770528 ],
       [ 0.19506191, -0.16303806]]), array([[ 0.1791997 ,  1.53668955, -0.32253881, -0.69005981],
       [-0.70885682, -0.39474291, -0.49277723,  0.42043044]])]
biases :
 [array([ 0.59425682, -0.01436838]), array([0.00626433, 0.63498125, 0.19963605, 0.29689209])]


#### FORWARD TESTS ####
weights :
 [array([[ 0. ,  1. ],
       [ 2. ,  1. ],
       [-1. ,  0.5]]), array([[ 1. , -1. ,  2. ,  1. ],
       [ 0. , -2. , -1. ,  0.5]])]
biases :
 [array([1, 0]), array([ 0.5, -1. ,  0. ,  2. ])]

Layer 1
z: [[-2.   0.5]
 [-1.   5. ]
 [ 4.   1.5]
 [ 4.   4.5]
 [ 7.   3. ]]
activati

Test training

In [None]:
# Training check: fit the small two-layer Dense network for 1000 epochs from fixed parameters.

# Named `inputs` (not `input`) so that the built-in input() is not shadowed in the notebook.
# Batch of 5 samples with 3 features each.
inputs = np.array([[1, -1, 1],
                   [2, 1, 4],
                   [-1, 2, 1],
                   [2, 2, 1],
                   [4, 1, -4]])

# One-hot rows over 4 classes, one row per sample.
target = np.array([[0, 0, 1, 0],
                   [0, 1, 0, 0],
                   [1, 0, 0, 0],
                   [1, 0, 0, 0],
                   [0, 0, 1, 0]])

model = Model()
model.add(Dense(2, activation=Relu()))
model.add(Dense(4, activation=Softmax()))

model.build(inputs[0].shape, SGD(), MSE())


# Overwrite the initial parameters with fixed values so every run starts from the same point.
model.weights = [np.array([[0, 1],
                          [2, 1],
                          [-1, 0.5]]),
                np.array([[1, -1, 2, 1],
                          [0, -2, -1, 0.5]])]

model.biases = [np.array([1, 0]), np.array([0.5, -1, 0, 2])]

hist = model.fit(inputs, target, epochs=1000)

Epoch 1 / 1000
Epoch 2 / 1000
Epoch 3 / 1000
Epoch 4 / 1000
Epoch 5 / 1000
Epoch 6 / 1000
Epoch 7 / 1000
Epoch 8 / 1000
Epoch 9 / 1000
Epoch 10 / 1000
Epoch 11 / 1000
Epoch 12 / 1000
Epoch 13 / 1000
Epoch 14 / 1000
Epoch 15 / 1000
Epoch 16 / 1000
Epoch 17 / 1000
Epoch 18 / 1000
Epoch 19 / 1000
Epoch 20 / 1000
Epoch 21 / 1000
Epoch 22 / 1000
Epoch 23 / 1000
Epoch 24 / 1000
Epoch 25 / 1000
Epoch 26 / 1000
Epoch 27 / 1000
Epoch 28 / 1000
Epoch 29 / 1000
Epoch 30 / 1000
Epoch 31 / 1000
Epoch 32 / 1000
Epoch 33 / 1000
Epoch 34 / 1000
Epoch 35 / 1000
Epoch 36 / 1000
Epoch 37 / 1000
Epoch 38 / 1000
Epoch 39 / 1000
Epoch 40 / 1000
Epoch 41 / 1000
Epoch 42 / 1000
Epoch 43 / 1000
Epoch 44 / 1000
Epoch 45 / 1000
Epoch 46 / 1000
Epoch 47 / 1000
Epoch 48 / 1000
Epoch 49 / 1000
Epoch 50 / 1000
Epoch 51 / 1000
Epoch 52 / 1000
Epoch 53 / 1000
Epoch 54 / 1000
Epoch 55 / 1000
Epoch 56 / 1000
Epoch 57 / 1000
Epoch 58 / 1000
Epoch 59 / 1000
Epoch 60 / 1000
Epoch 61 / 1000
Epoch 62 / 1000
Epoch 63 / 1000
E

Crossentropy

In [None]:
# Check of the CategoricalCrossentropy loss: forward value, evaluation and derivative
# on the small two-layer Dense network with fixed parameters.

# Named `inputs` (not `input`) so that the built-in input() is not shadowed in the notebook.
# Batch of 5 samples with 3 features each.
inputs = np.array([[1, -1, 1],
                   [2, 1, 4],
                   [-1, 2, 1],
                   [2, 2, 1],
                   [4, 1, -4]])

# One-hot rows over 4 classes, one row per sample.
target = np.array([[0, 0, 1, 0],
                   [0, 1, 0, 0],
                   [1, 0, 0, 0],
                   [1, 0, 0, 0],
                   [0, 0, 1, 0]])

print('#### BUILDING TESTS ####')
model = Model()
model.add(Dense(2, activation=Relu()))
model.add(Dense(4, activation=Softmax()))

model.build(inputs[0].shape, SGD(), CategoricalCrossentropy())

print('\n\n#### FORWARD TESTS ####')
# Overwrite the initial parameters with fixed values so the printed results are reproducible.
model.weights = [np.array([[0, 1],
                          [2, 1],
                          [-1, 0.5]]),
                np.array([[1, -1, 2, 1],
                          [0, -2, -1, 0.5]])]

model.biases = [np.array([1, 0]), np.array([0.5, -1, 0, 2])]

output = model(inputs)
print('\noutput :\n', output)
print('\n loss:', model.loss(output, target))
print('\n eval:', model.evaluate(inputs, target))


print("\n\n\n### BACKWARD TESTS ###\n")

# Derivative of the loss with regard to the network output.
dloss_dact = model.loss.derivative(output, target)
print(dloss_dact)

#### BUILDING TESTS ####


#### FORWARD TESTS ####

output :
 [[1.38800845e-01 1.13934671e-02 5.10619771e-02 7.98743711e-01]
 [1.79848847e-02 1.82188565e-07 7.35001131e-05 9.81941433e-01]
 [5.59384366e-02 2.08463152e-07 4.13332246e-01 5.30729109e-01]
 [2.27847725e-02 2.10473189e-10 8.38204936e-03 9.68833178e-01]
 [1.84498479e-02 8.48518721e-12 6.10975051e-01 3.70575101e-01]]

 loss: [ 2.97471515 15.51822361  2.88350354  3.78166284  0.49269915]

 eval: (5.130160858770255, 0.2)



### BACKWARD TESTS ###

[[ 0.00000000e+00  0.00000000e+00 -1.95840439e+01  0.00000000e+00]
 [ 0.00000000e+00 -5.48881868e+06  0.00000000e+00  0.00000000e+00]
 [-1.78767957e+01  0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [-4.38889615e+01  0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00 -1.63672804e+00  0.00000000e+00]]


### Convolution

In [None]:
# Manual check of a Conv2D -> Conv2D -> Dense network: build, forward pass with
# hand-set parameters, then one training epoch in debug mode.

# Batch of two 4x4 single-channel arrays.
inputs = np.array([[[1, 0, 0, 1],
                   [1, 1, 1, 0],
                   [1, 1, 0, 0],
                   [1, 1, 0, 1]],
                  [[1, 0, 0, 0],
                  [0, 0, 1, 1],
                  [0, 0, 1, 0],
                  [0, 1, 0, 1]]])

# One-hot rows over 4 classes, one row per sample.
target = np.array([[0, 0, 1, 0],
                   [0, 1, 0, 0]])

print('#### BUILDING TESTS ####')
model = Model()
model.add(Conv2D(3, (2, 2), activation=Relu()))
model.add(Conv2D(4, (2, 2), activation=Relu()))
model.add(Dense(4, activation=Softmax()))

# summary() is called before and after build() to show both of its display modes.
model.summary()
model.build(inputs[0].shape, SGD(), MSE())
model.summary()

print('weights :\n', model.weights)
print('biases :\n', model.biases)



print('\n\n#### FORWARD TESTS ####')
# Fixed parameters so the printed results are reproducible. The three arrays have
# shapes (3, 1, 2, 2), (4, 3, 2, 2) and (16, 4), one per layer.
model.weights = [np.array([[[[1, 1], [1, 0]]],
                          [[[0, 1], [1, 2]]],
                          [[[2, 0], [1, 1]]]]),
                 np.array([[[[1, 1], [1, 0]],
                            [[-1, 0], [0, 0]],
                            [[1, 1], [2, -2]]],
                           [[[0, 1], [1, 0]],
                            [[2, 0], [1, 1]],
                            [[-1, 1], [2, 2]]],
                           [[[1, 1], [1, 1]],
                            [[1, 1], [0, 1]],
                            [[1, 2], [-2, 2]]],
                           [[[1, 1], [1, 2]],
                            [[0, 0], [0, 1]],
                            [[1, 1], [2, 2]]]]),
                 np.array([[1, 1, 1, -1],
                          [1, 1, 1, -1],
                          [2, 1, 1, -1],
                          [1, -2, 1, -1],
                          [1, 1, 1, -1],
                          [-1, -1, 1, -1],
                          [2, 2, 1, -1],
                          [1, 2, 1, -1],
                          [2, 2, 2, -1],
                          [1, 2, 1, -1],
                          [1, 1, 1, 1],
                          [1, 0, 1, 2],
                          [1, 1, 1, 0],
                          [1, 1, 1, 1],
                          [1, 0, 1, 1],
                          [1, 1, 1, 1]])]


model.biases = [np.array([1, 0, 0]), np.array([1, -1, 0, 1]), np.array([1, 1, 1, 0])]
print('\nweights :\n', model.weights)
print('biases :\n', model.biases)

output = model(inputs)

bs = inputs.shape[0]
for i in range(len(model.layers)):
    print('\nLayer', i+1)
    if model.layers[i].type_name == 'Conv2D':
        # Conv2D values are reshaped to (batch size, *output_shape) before being displayed.
        print('z:', np.reshape(model.layers[i].z, (bs, *model.layers[i].output_shape)))
        print('activation:', np.reshape(model.layers[i].out, (bs, *model.layers[i].output_shape)))
    else:
        print('z:', model.layers[i].z)
        print('activation:', model.layers[i].out)

print('\noutput :\n', output)
print('\n loss:', model.loss(output, target))
print('\n eval:', model.evaluate(inputs, target))


print("\n\n\n### BACKWARD TESTS ###\n")

# verbose='debug' makes fit() print the gradients computed for every batch.
model.fit(inputs, target, epochs=1, verbose='debug')

print(model.biases)

#### BUILDING TESTS ####
MODEL UNBUILT
layer    |
----------
Conv2D   |
----------
Conv2D   |
----------
Dense    |


layer | input shape | output shape | nb_param |
-----------------------------------------------
Conv2D| (4, 4)      | (3, 3, 3)    | 15       |
-----------------------------------------------
Conv2D| (3, 3, 3)   | (4, 2, 2)    | 52       |
-----------------------------------------------
Dense | (4, 2, 2)   | (4,)         | 68       |


weights :
 [array([[[[-0.11254056,  0.05279407],
         [-0.15371227, -0.5420581 ]]],


       [[[-0.32647085, -0.17883377],
         [-0.22291275,  0.9972099 ]]],


       [[[ 0.68085006, -0.09189219],
         [ 0.12763752,  0.18045348]]]]), array([[[[ 0.03778367,  0.46696359],
         [-0.13264557,  0.17687973]],

        [[ 0.54934818,  0.03572331],
         [-0.13299696,  0.15365858]],

        [[ 0.12703576, -0.02233472],
         [ 0.26935876, -0.26143466]]],


       [[[-0.12379367,  0.25293387],
         [-0.25859583,  0.02211

Test stride

In [None]:
# Same manual check as the convolution test, but the first Conv2D layer uses a (2, 2) stride.

# Batch of two 4x4 single-channel arrays.
inputs = np.array([[[1, 0, 0, 1],
                   [1, 1, 1, 0],
                   [1, 1, 0, 0],
                   [1, 1, 0, 1]],
                  [[1, 0, 0, 0],
                  [0, 0, 1, 1],
                  [0, 0, 1, 0],
                  [0, 1, 0, 1]]])

# One-hot rows over 4 classes, one row per sample.
target = np.array([[0, 0, 1, 0],
                   [0, 1, 0, 0]])

print('#### BUILDING TESTS ####')
model = Model()
model.add(Conv2D(3, (2, 2), activation=Relu(), stride=(2, 2)))
model.add(Conv2D(4, (2, 2), activation=Relu()))
model.add(Dense(4, activation=Softmax()))

# summary() is called before and after build() to show both of its display modes.
model.summary()
model.build(inputs[0].shape, SGD(), MSE())
model.summary()

print('weights :\n', model.weights)
print('biases :\n', model.biases)



print('\n\n#### FORWARD TESTS ####')
# Fixed parameters so the printed results are reproducible. The three arrays have
# shapes (3, 1, 2, 2), (4, 3, 2, 2) and (4, 4), one per layer.
model.weights = [np.array([[[[1, 1], [1, 0]]],
                          [[[0, 1], [1, 2]]],
                          [[[2, 0], [1, 1]]]]),
                 np.array([[[[1, 1], [1, 0]],
                            [[-1, 0], [0, 0]],
                            [[1, 1], [2, -2]]],
                           [[[0, 1], [1, 0]],
                            [[2, 0], [1, 1]],
                            [[-1, 1], [2, 2]]],
                           [[[1, 1], [1, 1]],
                            [[1, 1], [0, 1]],
                            [[1, 2], [-2, 2]]],
                           [[[1, 1], [1, 2]],
                            [[0, 0], [0, 1]],
                            [[1, 1], [2, 2]]]]),
                 np.array([[1, 1, 1, -1],
                          [1, 1, 1, -1],
                          [2, 1, 1, -1],
                          [1, -2, 1, -1]])]


model.biases = [np.array([1, 0, 0]), np.array([1, -1, 0, 1]), np.array([1, 1, 1, 0])]
print('\nweights :\n', model.weights)
print('biases :\n', model.biases)

output = model(inputs)

bs = inputs.shape[0]
for i in range(len(model.layers)):
    print('\nLayer', i+1)
    if model.layers[i].type_name == 'Conv2D':
        # Conv2D values are reshaped to (batch size, *output_shape) before being displayed.
        print('z:', np.reshape(model.layers[i].z, (bs, *model.layers[i].output_shape)))
        print('activation:', np.reshape(model.layers[i].out, (bs, *model.layers[i].output_shape)))
    else:
        print('z:', model.layers[i].z)
        print('activation:', model.layers[i].out)

print('\noutput :\n', output)
print('\n loss:', model.loss(output, target))
print('\n eval:', model.evaluate(inputs, target))


print("\n\n\n### BACKWARD TESTS ###\n")

# verbose='debug' makes fit() print the gradients computed for every batch.
model.fit(inputs, target, epochs=1, verbose='debug')

print(model.biases)

#### BUILDING TESTS ####
MODEL UNBUILT
layer    |
----------
Conv2D   |
----------
Conv2D   |
----------
Dense    |


layer | input shape | output shape | nb_param |
-----------------------------------------------
Conv2D| (4, 4)      | (3, 2, 2)    | 15       |
-----------------------------------------------
Conv2D| (3, 2, 2)   | (4, 1, 1)    | 52       |
-----------------------------------------------
Dense | (4, 1, 1)   | (4,)         | 20       |


weights :
 [array([[[[ 0.2184747 ,  0.43265017],
         [ 0.02172403,  0.40529544]]],


       [[[-0.23991677,  0.17781006],
         [-0.54910038, -0.03093817]]],


       [[[ 0.32930873, -0.10264767],
         [-0.04072907,  0.32201755]]]]), array([[[[-0.26565349,  0.28105277],
         [-0.40999466, -0.00688628]],

        [[ 0.05170115, -0.06348626],
         [ 0.07487241,  0.13129386]],

        [[ 0.03173987,  0.00078636],
         [-0.05959256, -0.1662526 ]]],


       [[[ 0.1190331 , -0.14731489],
         [ 0.12584461, -0.09916

## **Tests MNIST**

### Fully connected

#### Models Creation

NumPy model creation

In [None]:
from keras.datasets import mnist as db
from keras.utils import to_categorical

# Seed NumPy's global random generator before the model is built.
np.random.seed(0)

(x, y), (x_test_ori, y_test_ori) = db.load_data()

# Three Dense layers: 64 and 32 ReLU units, then a 10-unit softmax output.
model = Model()
model.add(Dense(64, activation=Relu()))
model.add(Dense(32, activation=Relu()))
model.add(Dense(10, activation=Softmax()))

# Flatten every image to one row and divide the pixel values by 255.
x_flat = x.reshape(x.shape[0], -1) / 255
# NOTE(review): learning_rate=1e3 is unusually large, and the comparison cell rebuilds the model with 1e2 - confirm it is intended.
model.build(x_flat[0].shape, SGD(learning_rate=1e3), MSE())

# One-hot encode the labels.
target = to_categorical(y)

print(x_flat.shape, target.shape)

(60000, 784) (60000, 10)


Torch model creation

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
# Seed NumPy's global random generator.
np.random.seed(0)

# Torch tensors for the first 6 samples and their one-hot targets.
torch_x, torch_y = map(torch.tensor, (x_flat[0:6], target[0:6]))

class Torch_Model(nn.Module):
    """
    PyTorch reference network (784 -> 64 -> 32 -> 10, ReLU then softmax) with a small training loop.
    """
    def __init__(self):
        super(Torch_Model, self).__init__()

        # Fully connected layers
        self.fc1 = nn.Linear(784, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 10)

    def forward(self, x):
        """
        Apply the network to a batch.

        Parameters:
            - x (torch.Tensor): inputs whose last axis has 784 values.

        Return:
            - softmax output, normalised over the last axis.
        """
        x = x.float() # Prevent RuntimeError: Expected object of scalar type Long but got scalar type Float for argument #3 'mat1' in call to _th_addmm_
        
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # The axis is explicit: calling softmax without `dim` is deprecated and relies on an implicit choice.
        x = F.softmax(self.fc3(x), dim=-1)

        return x

    def fit(self, data, targets, epochs, opt, loss_func, batch_size, validation_split=None, debug=False, print_debug="No print specified"):
        """
        Train the network.

        Parameters:
            - data (torch.Tensor): inputs of the network.
            - targets (torch.Tensor): expected outputs.
            - epochs (int): number of epochs.
            - opt: torch optimizer already bound to the parameters of this model.
            - loss_func: callable returning the loss from (prediction, target).
            - batch_size (int): number of samples per gradient step.
            - validation_split (float or None): fraction of the data held out for validation; None disables validation. (default=None)
            - debug (boolean): if True, print the gradient of the last layer's bias after each step. (default=False)
            - print_debug: unused, kept for compatibility with existing calls.
        """
        train_dl, valid_dl = self.get_train_valid_data(data, targets, batch_size, validation_split)

        for epoch in range(epochs):
            self.train() # Tell the model that training begins (important for some specific operations as Dropout)

            for xb, yb in train_dl:
                pred = self.forward(xb)
                loss = loss_func(pred.float(), yb.float())

                # Gradient descent
                loss.backward()
                opt.step()

                if debug:
                    print("\rEpoch : {} -> ".format(epoch), end="")
                    print(self.fc3.bias.grad)

                opt.zero_grad() # Reset gradient, otherwise future gradients will be added to the current one.

            # Without a validation split there is nothing to evaluate.
            if validation_split is not None:
                self.eval() #Tell the model that evaluation begins
                with torch.no_grad():
                    # Same float casts as for the training loss, so both losses are computed in the same dtype.
                    valid_loss = sum(loss_func(self(xb).float(), yb.float()) for xb, yb in valid_dl)
                print("\rEpoch : {} -> ".format(epoch), end="")
                print("  valid loss : {}".format(valid_loss / len(valid_dl)))

    def get_train_valid_data(self, data, targets, batch_size, validation_split):
        """
        Split the data and wrap both parts in DataLoaders.

        Return:
            - (train_dl, valid_dl): loaders for the training and validation parts; the validation loader is empty when validation_split is None.
        """
        if validation_split is not None:
            data_train, data_valid, labels_train, labels_valid = train_test_split(data, targets, test_size=validation_split, random_state=31, stratify=targets)
        else:
            # Empty tensors with the per-sample shapes stand in for the missing validation set.
            data_train, data_valid, labels_train, labels_valid = data, torch.empty(0, *data[0].shape), targets, torch.empty(0, *targets[0].shape)


        train_ds = TensorDataset(data_train, labels_train)
        train_dl = DataLoader(train_ds, batch_size=batch_size)

        valid_ds = TensorDataset(data_valid, labels_valid)
        valid_dl = DataLoader(valid_ds, batch_size=2 * batch_size) # double batch for better performance

        return train_dl, valid_dl

model_torch = Torch_Model()

# Copy the NumPy model's parameters into the torch layers so both networks start identical.
W = [model_torch.fc1, model_torch.fc2, model_torch.fc3]
for i in range(len(model.layers)):
    # The weights are transposed before being handed to the torch layer.
    W[i].weight.data = torch.Tensor(model.weights[i].T)
    W[i].bias.data = torch.Tensor(model.biases[i])

#### Weights and gradients comparison

In [None]:
# Train the torch model and the NumPy model on the same 6 samples and print the
# last layer's bias gradients and biases of both for comparison.

# When True, both models are rebuilt so they share the same initial parameters.
reset=True

np.random.seed(0)
if reset:
    model = Model()
    model.add(Dense(64, activation=Relu()))
    model.add(Dense(32, activation=Relu()))
    model.add(Dense(10, activation=Softmax()))
    model.build(x_flat[0].shape, SGD(learning_rate=1e2), MSE())

    # Copy the NumPy model's parameters (weights transposed) into the torch layers.
    model_torch = Torch_Model()
    W = [model_torch.fc1, model_torch.fc2, model_torch.fc3]
    for i in range(len(model.layers)):
        W[i].weight.data = torch.Tensor(model.weights[i].T)
        W[i].bias.data = torch.Tensor(model.biases[i])

# '\033[4m' ... '\033[0m' are terminal escape codes wrapped around the title.
print('\033[4m' + 'Modèle Torch:' + '\033[0m')
model_torch.fit(torch_x, torch_y, opt=optim.SGD(params=model_torch.parameters(), lr=1e2), loss_func=nn.MSELoss(), batch_size=3, epochs=2, debug=True)


print('\n\n\n')

print('\033[4m' + 'Modèle perso:' + '\033[0m')
# shuffle=False keeps the samples in their original order.
model.fit(x_flat[0:6], target[0:6], batch_size=3, epochs=2, shuffle=False)
# Batch-averaged gradient of the last layer's biases, to compare with the torch values printed above.
print(np.mean(model.dloss_db[2], axis=0))


print('\n\n\n')

print('\033[31mTorch biases after training:\n', model_torch.fc3.bias)
print('\033[32m\nModel biases after training:\n', model.biases[2])

[4mModèle Torch:[0m
Epoch : 0 -> tensor([-0.0048,  0.0015,  0.0028,  0.0017, -0.0048, -0.0046,  0.0021,  0.0018,
         0.0024,  0.0018])
Epoch : 0 -> tensor([ 0.0084, -0.0039, -0.0018, -0.0008,  0.0055, -0.0006, -0.0008, -0.0009,
        -0.0009, -0.0042])
Epoch : 1 -> tensor([-7.7176e-06, -5.5411e-03, -2.8895e-03, -5.3693e-03, -3.0289e-04,
        -7.7922e-03, -8.5926e-04, -1.1771e-03, -1.4431e-03,  2.5382e-02])
Epoch : 1 -> tensor([-3.6020e-09, -2.2135e-03, -4.8729e-08, -1.0583e-03, -2.9424e-10,
         3.2718e-03, -7.2836e-11, -3.7218e-10, -3.3013e-10, -5.0353e-38])




[4mModèle perso:[0m
Epoch 1 / 2
Epoch 2 / 2
[-3.60209892e-09 -2.21350255e-03 -4.87301983e-08 -1.05829509e-03
 -2.94243680e-10  3.27185104e-03 -7.28379628e-11 -3.72188766e-10
 -3.30131368e-10 -5.03562933e-38]




[31mTorch biases after training:
 Parameter containing:
tensor([-0.3453,  0.9126,  0.3509,  0.4812, -0.0335,  0.8738, -0.0346, -0.0409,
         0.0550, -2.2935], requires_grad=True)
[32m
Model 



#### Test with different optimizers: SGD, RMSprop, Adam *(loss=MSE, layers=Dense)*

SGD with momentum

In [None]:
from keras.datasets import mnist as db
from keras.utils import to_categorical

# Train the torch model and the NumPy model on MNIST with SGD + Nesterov momentum.

# Load and preprocess the data first, so that the x_flat used by build() below
# comes from this cell rather than from whatever an earlier cell left behind.
(x, y), (x_test_ori, y_test_ori) = db.load_data()
x_flat = x.reshape(x.shape[0], -1) / 255
target = to_categorical(y)

# When True, both models are rebuilt so they share the same initial parameters.
reset=True

np.random.seed(0)
if reset:
    model = Model()
    model.add(Dense(64, activation=Relu()))
    model.add(Dense(32, activation=Relu()))
    model.add(Dense(10, activation=Softmax()))
    model.build(x_flat[0].shape, SGD(learning_rate=1e-2, momentum=0.9, nesterov=True), MSE())

    # Copy the NumPy model's parameters (weights transposed) into the torch layers.
    model_torch = Torch_Model()
    W = [model_torch.fc1, model_torch.fc2, model_torch.fc3]
    for i in range(len(model.layers)):
        W[i].weight.data = torch.Tensor(model.weights[i].T)
        W[i].bias.data = torch.Tensor(model.biases[i])


torch_x, torch_y = map(torch.tensor, (x_flat, target))
opt = optim.SGD(model_torch.parameters(), lr=1e-2, momentum=0.9, nesterov=True)
loss_func = nn.MSELoss()

# Positional arguments: epochs=10, optimizer, loss function, batch_size=32.
model_torch.fit(torch_x, torch_y, 10, opt, loss_func, 32, validation_split=0.2)

print('\n\n')
model.fit(x_flat, target, batch_size=32, epochs=10, validation_split=0.2)



Epoch : 0 ->   valid loss : 0.07985232025384903
Epoch : 1 ->   valid loss : 0.03507859259843826
Epoch : 2 ->   valid loss : 0.020565800368785858
Epoch : 3 ->   valid loss : 0.016691530123353004
Epoch : 4 ->   valid loss : 0.015024054795503616
Epoch : 5 ->   valid loss : 0.01399229932576418
Epoch : 6 ->   valid loss : 0.01325113233178854
Epoch : 7 ->   valid loss : 0.012665868736803532
Epoch : 8 ->   valid loss : 0.012179220095276833
Epoch : 9 ->   valid loss : 0.011748121120035648



Epoch 1 / 10
Epoch 2 / 10
Epoch 3 / 10
Epoch 4 / 10
Epoch 5 / 10
Epoch 6 / 10
Epoch 7 / 10
Epoch 8 / 10
Epoch 9 / 10
Epoch 10 / 10


{'acc': [0.5409166666666667,
  0.7824375,
  0.8809583333333333,
  0.9004791666666667,
  0.9082708333333334,
  0.91375,
  0.9196458333333334,
  0.9214791666666666,
  0.92575,
  0.9279375],
 'loss': [0.08031369906980175,
  0.0354616990223877,
  0.020063161534635752,
  0.016209217711997743,
  0.014485622332380294,
  0.013498531060108314,
  0.012692934791403137,
  0.012252619332588874,
  0.011659250852908367,
  0.01131702564326929],
 'val_acc': [0.5396666666666666,
  0.7778333333333334,
  0.8773333333333333,
  0.8956666666666667,
  0.9024166666666666,
  0.90875,
  0.9160833333333334,
  0.9173333333333333,
  0.9216666666666666,
  0.9233333333333333],
 'val_loss': [0.08035272188383143,
  0.03564199464924612,
  0.020338931871585704,
  0.016573054437732486,
  0.014901559102399427,
  0.013885067756809198,
  0.013151025052418212,
  0.012763931909420425,
  0.01213135887058454,
  0.011882402780658519]}

RMSprop

In [None]:
from keras.datasets import mnist as db
from keras.utils import to_categorical

# Compare the NumPy RMSProp optimizer with torch.optim.RMSprop on the same
# dense network, both starting from identical parameters.
reset = True

# Load and flatten the data *before* building the models: `model.build` needs
# the per-sample shape, and loading first keeps the cell runnable without
# relying on an `x_flat` left behind by a previously executed cell.
(x, y), (x_test_ori, y_test_ori) = db.load_data()
x_flat = x.reshape(x.shape[0], -1) / 255
target = to_categorical(y)

np.random.seed(0)
if reset:
    # NumPy model under test.
    model = Model()
    model.add(Dense(64, activation=Relu()))
    model.add(Dense(32, activation=Relu()))
    model.add(Dense(10, activation=Softmax()))
    model.build(x_flat[0].shape, RMSProp(learning_rate=1e-2, rho=0.9), MSE())

    # Torch reference model, overwritten with the NumPy model's parameters.
    # nn.Linear stores its weight as (out_features, in_features), hence the
    # transpose of the NumPy weight matrix.
    model_torch = Torch_Model()
    W = [model_torch.fc1, model_torch.fc2, model_torch.fc3]
    for i, torch_layer in enumerate(W):
        torch_layer.weight.data = torch.Tensor(model.weights[i].T)
        torch_layer.bias.data = torch.Tensor(model.biases[i])


# Torch side: `alpha` is torch's smoothing constant, set to the same value as
# `rho` above.
torch_x, torch_y = map(torch.tensor, (x_flat, target))
opt = optim.RMSprop(model_torch.parameters(), lr=1e-2, alpha=0.9, eps=1e-7)
loss_func = nn.MSELoss()

model_torch.fit(torch_x, torch_y, 10, opt, loss_func, 32, validation_split=0.2)


# Visual separator between the Torch log and the NumPy log.
print('\n\n')
model.fit(x_flat, target, batch_size=32, epochs=10, validation_split=0.2)



Epoch : 1 ->   valid loss : 0.009461630135774612
Epoch : 2 ->   valid loss : 0.009961865842342377
Epoch : 3 ->   valid loss : 0.009508982300758362
Epoch : 4 ->   valid loss : 0.009421045891940594
Epoch : 5 ->   valid loss : 0.01046204287558794
Epoch : 6 ->   valid loss : 0.01166702713817358
Epoch : 7 ->   valid loss : 0.011157970875501633
Epoch : 8 ->   valid loss : 0.010938932187855244
Epoch : 9 ->   valid loss : 0.00858311727643013



Epoch 1 / 10
Epoch 2 / 10
Epoch 3 / 10
Epoch 4 / 10
Epoch 5 / 10
Epoch 6 / 10
Epoch 7 / 10
Epoch 8 / 10
Epoch 9 / 10
Epoch 10 / 10


{'acc': [0.9427291666666666,
  0.9501666666666667,
  0.9520208333333333,
  0.9536041666666667,
  0.95425,
  0.9595625,
  0.95675,
  0.9582708333333333,
  0.9590416666666667,
  0.9610625],
 'loss': [0.009284994106108627,
  0.00816418799773018,
  0.00787672052824159,
  0.008328451219446057,
  0.008362729142673764,
  0.0074572630032309194,
  0.007970491480494786,
  0.007803162636766614,
  0.007675581280830335,
  0.007430626391343687],
 'val_acc': [0.9396666666666667,
  0.9439166666666666,
  0.9493333333333334,
  0.94475,
  0.9486666666666667,
  0.95225,
  0.9484166666666667,
  0.95,
  0.9495833333333333,
  0.9523333333333334],
 'val_loss': [0.00999487455804005,
  0.009039026327388278,
  0.0084595947178186,
  0.009806585210980816,
  0.009298338742890589,
  0.00884804422455337,
  0.009380050374081511,
  0.009233081982079751,
  0.009447714894157253,
  0.008990808663455498]}

Adam

In [None]:
from keras.datasets import mnist as db
from keras.utils import to_categorical

# Compare the NumPy Adam optimizer with torch.optim.Adam on the same dense
# network, both starting from identical parameters.
reset = True

# Load and flatten the data *before* building the models: `model.build` needs
# the per-sample shape, and loading first keeps the cell runnable without
# relying on an `x_flat` left behind by a previously executed cell.
(x, y), (x_test_ori, y_test_ori) = db.load_data()
x_flat = x.reshape(x.shape[0], -1) / 255
target = to_categorical(y)

np.random.seed(0)
if reset:
    # NumPy model under test.
    model = Model()
    model.add(Dense(64, activation=Relu()))
    model.add(Dense(32, activation=Relu()))
    model.add(Dense(10, activation=Softmax()))
    model.build(x_flat[0].shape, Adam(), MSE())

    # Torch reference model, overwritten with the NumPy model's parameters.
    # nn.Linear stores its weight as (out_features, in_features), hence the
    # transpose of the NumPy weight matrix.
    model_torch = Torch_Model()
    W = [model_torch.fc1, model_torch.fc2, model_torch.fc3]
    for i, torch_layer in enumerate(W):
        torch_layer.weight.data = torch.Tensor(model.weights[i].T)
        torch_layer.bias.data = torch.Tensor(model.biases[i])


# Torch side: Adam with an explicit eps (torch's own default is 1e-8).
torch_x, torch_y = map(torch.tensor, (x_flat, target))
opt = optim.Adam(model_torch.parameters(), eps=1e-7)
loss_func = nn.MSELoss()

model_torch.fit(torch_x, torch_y, 10, opt, loss_func, 32, validation_split=0.2)

# Visual separator between the Torch log and the NumPy log.
print('\n\n')
model.fit(x_flat, target, batch_size=32, epochs=10, validation_split=0.2)



Epoch : 0 ->   valid loss : 0.010441718623042107
Epoch : 1 ->   valid loss : 0.00850758422166109
Epoch : 2 ->   valid loss : 0.006298528052866459
Epoch : 3 ->   valid loss : 0.00586102157831192
Epoch : 4 ->   valid loss : 0.00538274459540844
Epoch : 5 ->   valid loss : 0.004819015972316265
Epoch : 6 ->   valid loss : 0.004964479710906744
Epoch : 7 ->   valid loss : 0.00511875981464982
Epoch : 8 ->   valid loss : 0.005127946846187115
Epoch : 9 ->   valid loss : 0.004757270682603121



Epoch 1 / 10
Epoch 2 / 10
Epoch 3 / 10
Epoch 4 / 10
Epoch 5 / 10
Epoch 6 / 10
Epoch 7 / 10
Epoch 8 / 10
Epoch 9 / 10
Epoch 10 / 10


{'acc': [0.9387083333333334,
  0.96125,
  0.9713541666666666,
  0.9756041666666667,
  0.9715,
  0.9753125,
  0.9797083333333333,
  0.9850625,
  0.9868541666666667,
  0.9878958333333333],
 'loss': [0.009649704995136668,
  0.006248801552844896,
  0.004658047847108141,
  0.0039605144820486155,
  0.004583744783797805,
  0.0039312892851325115,
  0.003331136459504414,
  0.0024773987884121724,
  0.0021874198933776617,
  0.002064684934489073],
 'val_acc': [0.9354166666666667,
  0.9561666666666667,
  0.96125,
  0.9665,
  0.9606666666666667,
  0.96275,
  0.9665,
  0.9699166666666666,
  0.9730833333333333,
  0.9714166666666667],
 'val_loss': [0.010162384294815809,
  0.006908236830858858,
  0.005861256997749946,
  0.005245693705862712,
  0.006096147702271335,
  0.005846090735484642,
  0.005421346135729039,
  0.004754468531897233,
  0.004318116315585321,
  0.004495118669840775]}

#### Test with Crossentropy loss

In [None]:
from keras.datasets import mnist as db
from keras.utils import to_categorical

# Train the NumPy dense network with Adam and a categorical cross-entropy loss.
reset = True

# Load and flatten the data *before* building the model: `model.build` needs
# the per-sample shape, and loading first keeps the cell runnable without
# relying on an `x_flat` left behind by a previously executed cell.
(x, y), (x_test_ori, y_test_ori) = db.load_data()
x_flat = x.reshape(x.shape[0], -1) / 255
target = to_categorical(y)

np.random.seed(0)
if reset:
    model = Model()
    model.add(Dense(64, activation=Relu()))
    model.add(Dense(32, activation=Relu()))
    model.add(Dense(10, activation=Softmax()))
    model.build(x_flat[0].shape, Adam(), CategoricalCrossentropy())

    # Torch copy of the freshly initialised parameters. It is not trained in
    # this cell; it is kept so `model_torch` stays in sync with `model`.
    model_torch = Torch_Model()
    W = [model_torch.fc1, model_torch.fc2, model_torch.fc3]
    for i, torch_layer in enumerate(W):
        torch_layer.weight.data = torch.Tensor(model.weights[i].T)
        torch_layer.bias.data = torch.Tensor(model.biases[i])


model.fit(x_flat, target, batch_size=32, epochs=10, validation_split=0.2)

Epoch 1 / 10
Epoch 2 / 10
Epoch 3 / 10
Epoch 4 / 10
Epoch 5 / 10
Epoch 6 / 10
Epoch 7 / 10
Epoch 8 / 10
Epoch 9 / 10
Epoch 10 / 10


{'acc': [0.9469583333333333,
  0.9630833333333333,
  0.9710416666666667,
  0.9782083333333333,
  0.9785833333333334,
  0.9853541666666666,
  0.9871041666666667,
  0.9900416666666667,
  0.991625,
  0.9896875],
 'loss': [0.18599681664680726,
  0.1275125155110217,
  0.09665136023961003,
  0.07032215290004781,
  0.0661972863158912,
  0.04770082863233563,
  0.041585438796825386,
  0.03273594335532735,
  0.027800180620587412,
  0.031399127772178466],
 'val_acc': [0.9428333333333333,
  0.9575,
  0.9623333333333334,
  0.9685833333333334,
  0.968,
  0.9698333333333333,
  0.9700833333333333,
  0.97225,
  0.97225,
  0.9700833333333333],
 'val_loss': [0.19534211113577477,
  0.14693358748812727,
  0.12694034218090486,
  0.10902708831202618,
  0.10904985999588356,
  0.10290455023792977,
  0.10452949597340595,
  0.09948765722796336,
  0.09524671833367082,
  0.1062253472052106]}

### Convolution

#### Models Creation

Creating the convolutional model

In [None]:
from keras.datasets import mnist as db
from keras.utils import to_categorical

# Seed NumPy's global RNG so the cell is repeatable.
np.random.seed(0)

# Only the training split (x, y) is used in this cell.
(x, y), (x_test_ori, y_test_ori) = db.load_data()

# Two convolutions (the first with stride 2) followed by two dense layers,
# ending in a 10-unit softmax output.
model = Model()
model.add(Conv2D(32, (5, 5), stride=(2, 2), activation=Relu()))
model.add(Conv2D(64, (3, 3), activation=Relu()))
model.add(Dense(64, activation=Relu()))
model.add(Dense(10, activation=Softmax()))

# Build from the shape of a single sample, with SGD (learning rate 1e-3) and
# an MSE loss.
model.build(x[0].shape, SGD(learning_rate=1e-3), MSE())

# Categorical encoding of the labels.
target = to_categorical(y)

# Print the per-layer table (input shape, output shape, parameter count).
model.summary()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
layer | input shape | output shape | nb_param |
-----------------------------------------------
Conv2D| (28, 28)    | (32, 12, 12) | 832      |
-----------------------------------------------
Conv2D| (32, 12, 12)| (64, 10, 10) | 18496    |
-----------------------------------------------
Dense | (64, 10, 10)| (64,)        | 409664   |
-----------------------------------------------
Dense | (64,)       | (10,)        | 650      |




Creating the Torch convolutional model

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
# Reset NumPy's global RNG.
np.random.seed(0)

# Insert a singleton axis at position 1 of the image array, then turn the
# first six samples (divided by 255) and their targets into tensors.
x_reshaped = x[:, np.newaxis]
torch_x = torch.tensor(x_reshaped[0:6] / 255)
torch_y = torch.tensor(target[0:6])

class Torch_Model(nn.Module):
    """
    PyTorch reference network used to cross-check the NumPy implementation.

    Architecture:
        Conv2d(1 -> 16, 5x5, stride 2) -> ReLU -> Conv2d(16 -> 32, 3x3) -> ReLU
        -> flatten -> Linear(3200 -> 64) -> ReLU -> Linear(64 -> 10) -> softmax.
    """
    def __init__(self):
        super().__init__()

        self.conv1 = nn.Conv2d(1, 16, (5, 5), stride=(2, 2))
        self.conv2 = nn.Conv2d(16, 32, (3, 3))

        # 32 channels of 10 x 10: a 28 x 28 input becomes 12 x 12 after conv1
        # and 10 x 10 after conv2.
        self.fc1 = nn.Linear(32 * 10 * 10, 64)
        self.fc2 = nn.Linear(64, 10)

    def forward(self, x, debug=False):
        """
        Run the network on a batch.

        Args:
            x: batch of inputs whose first axis is the batch axis.
            debug: when True, return the raw output of the last linear layer
                instead of the softmax probabilities (handy to inspect gradients).

        Returns:
            Tensor with one row of 10 values per sample.
        """
        batch_size = x.shape[0]

        # Cast to float32 so the input dtype matches the layer parameters.
        x = x.float()

        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))

        # Flatten everything but the batch axis before the dense layers.
        x = x.view(batch_size, -1)

        x = F.relu(self.fc1(x))
        logits = self.fc2(x)
        if debug:
            return logits

        # Normalise over the class axis. The explicit `dim` avoids the
        # deprecated implicit-dimension behaviour of F.softmax.
        return F.softmax(logits, dim=1)

    def fit(self, data, targets, epochs, opt, loss_func, batch_size, validation_split=None, debug=False):
        """
        Train the network and print one loss value per epoch.

        Args:
            data: input tensor, first axis is the sample axis.
            targets: target tensor aligned with `data`.
            epochs: number of passes over the training data.
            opt: optimizer built on this module's parameters.
            loss_func: callable `(prediction, target) -> scalar tensor`.
            batch_size: training batch size (evaluation uses twice that).
            validation_split: fraction of the data held out for validation;
                when None, the loss printed is the one on the training data.
            debug: when True, print the gradient of the last bias after each step.
        """
        train_dl, valid_dl = self.get_train_valid_data(data, targets, batch_size, validation_split)

        # Evaluate on the validation data when there is some, else on the training data.
        if validation_split is not None:
            eval_dl, label = valid_dl, "valid loss"
        else:
            eval_dl, label = train_dl, "loss"

        for epoch in range(epochs):
            self.train()  # Training mode (matters for layers such as Dropout)

            for xb, yb in train_dl:
                pred = self(xb)
                loss = loss_func(pred.float(), yb.float())

                # Gradient descent
                loss.backward()
                opt.step()

                if debug:
                    print("\rEpoch : {} -> ".format(epoch), end="")
                    print(self.fc2.bias.grad)

                # Reset the gradients, otherwise the next backward pass would
                # accumulate on top of the current ones.
                opt.zero_grad()

            self.eval()  # Evaluation mode
            with torch.no_grad():
                # Same float32 casts as in the training loop.
                total_loss = sum(loss_func(self(xb).float(), yb.float()) for xb, yb in eval_dl)
            print("\rEpoch : {} -> ".format(epoch), end="")
            print("  {} : {}".format(label, total_loss / len(eval_dl)))

    def get_train_valid_data(self, data, targets, batch_size, validation_split):
        """
        Build the training and validation data loaders.

        Args:
            data: input tensor, first axis is the sample axis.
            targets: target tensor aligned with `data`.
            batch_size: batch size of the training loader.
            validation_split: fraction held out for validation, or None to
                train on everything (the validation loader is then empty).

        Returns:
            Tuple `(train_dl, valid_dl)` of DataLoader objects.
        """
        if validation_split is not None:
            # Fixed random_state keeps the split identical from run to run.
            data_train, data_valid, labels_train, labels_valid = train_test_split(
                data, targets, test_size=validation_split, random_state=0, stratify=targets)
        else:
            data_train, labels_train = data, targets
            # Zero-length placeholders with the right trailing shapes.
            data_valid = torch.empty(0, *data[0].shape)
            labels_valid = torch.empty(0, *targets[0].shape)

        train_dl = DataLoader(TensorDataset(data_train, labels_train), batch_size=batch_size)

        # Double batch size: evaluation runs under no_grad, so it needs less memory.
        valid_dl = DataLoader(TensorDataset(data_valid, labels_valid), batch_size=2 * batch_size)

        return train_dl, valid_dl

# Fresh Torch model whose parameters are overwritten, layer by layer, with
# those of the NumPy model.
model_torch = Torch_Model()

W = [model_torch.conv1, model_torch.conv2, model_torch.fc1, model_torch.fc2]
for i, np_layer in enumerate(model.layers):
    # Conv2D kernels are copied as they are; every other layer's weight
    # matrix is transposed first.
    np_weights = model.weights[i]
    if np_layer.type_name != 'Conv2D':
        np_weights = np_weights.T
    W[i].weight.data = torch.Tensor(np_weights)
    W[i].bias.data = torch.Tensor(model.biases[i])

#### Weights and gradients comparison

Step by step

In [None]:
# Single-step comparison of the NumPy and Torch convolutional models:
# same parameters, same 6-sample batch, one SGD step on each side.
np.random.seed(0)
reset = True
if reset:
    # NumPy model (learning rate 1e2, matching the Torch optimizer below).
    model = Model()
    model.add(Conv2D(16, (5, 5), stride=(2, 2), activation=Relu()))
    model.add(Conv2D(32, (3, 3), activation=Relu()))
    model.add(Dense(64, activation=Relu()))
    model.add(Dense(10, activation=Softmax()))
    model.build(x[0].shape, SGD(learning_rate=1e2), MSE())

    # Torch model overwritten with the NumPy parameters: conv kernels are
    # copied as they are, the other weight matrices are transposed.
    model_torch = Torch_Model()
    W = [model_torch.conv1, model_torch.conv2, model_torch.fc1, model_torch.fc2]
    for i in range(len(model.layers)):
        if model.layers[i].type_name == 'Conv2D':
            W[i].weight.data = torch.Tensor(model.weights[i])
        else:
            W[i].weight.data = torch.Tensor(model.weights[i].T)
        W[i].bias.data = torch.Tensor(model.biases[i])


### Step by step
# Torch: forward pass, MSE loss, backward pass, then one SGD step without momentum.
torch_x, torch_y = map(torch.tensor, (np.expand_dims(x[0:6], 1)/255, target[0:6]))
pred_torch = model_torch.forward(torch_x, debug=False)
loss_func = nn.MSELoss()
loss_torch = loss_func(pred_torch.float(), torch_y.float())
loss_torch.backward()
opt = optim.SGD(model_torch.parameters(), lr=1e2, momentum=0.0)
opt.step()

# NumPy: predictions and loss are taken BEFORE `fit`, i.e. with the initial
# parameters; `fit` then runs one epoch on the single 6-sample batch.
output = model(x[0:6]/255)
loss = np.mean(model.loss(output, target[0:6]))
model.fit(x[0:6] / 255, target[0:6], batch_size=6, epochs=1)

# Side-by-side printout: predictions, losses, gradients, updated biases.
print('pred:\n', output)
print('\npred_torch:\n', pred_torch)

print('\n\nloss:', loss)
print('loss_torch', loss_torch)

# Bias gradient at index 1 on the NumPy side versus conv2 on the Torch side.
# NOTE(review): axis 0 of `model.dloss_db[1]` is presumably the batch axis -- confirm.
print('\n\ngradient:\n', np.mean(model.dloss_db[1], axis=0))
print('\ngradient torch:\n', model_torch.conv2.bias.grad)

# Biases of the first layer after the update (ANSI colours: green / red).
print('\n\n\033[32mModel biases after backward:\n', model.biases[0])
print('\033[31mTorch biases after step:\n', model_torch.conv1.bias)



Epoch 1 / 1
pred:
 [[0.08703774 0.10198284 0.09616278 0.10318064 0.12153796 0.09577594
  0.08182547 0.10356675 0.10220511 0.10672475]
 [0.08502192 0.10510033 0.09715579 0.1067434  0.1198817  0.09339803
  0.0814874  0.10181985 0.10051626 0.10887531]
 [0.08400857 0.10165665 0.10484243 0.10448844 0.11376131 0.09752665
  0.08441966 0.10130325 0.10115998 0.10683306]
 [0.08588052 0.10522487 0.10197052 0.10365024 0.11582657 0.09680488
  0.08416125 0.10200282 0.10148975 0.10298858]
 [0.08658191 0.1041941  0.10140924 0.10360541 0.11274054 0.09720112
  0.08754627 0.09973386 0.09861468 0.10837288]
 [0.08847483 0.10329268 0.10175962 0.10156762 0.11620692 0.10000738
  0.08438389 0.09827838 0.09914488 0.10688381]]

pred_torch:
 tensor([[0.0870, 0.1020, 0.0962, 0.1032, 0.1215, 0.0958, 0.0818, 0.1036, 0.1022,
         0.1067],
        [0.0850, 0.1051, 0.0972, 0.1067, 0.1199, 0.0934, 0.0815, 0.1018, 0.1005,
         0.1089],
        [0.0840, 0.1017, 0.1048, 0.1045, 0.1138, 0.0975, 0.0844, 0.1013, 0.101

Fit

In [None]:
# Multi-epoch comparison of the NumPy and Torch convolutional models trained
# with Adam on the first 1000 samples, starting from identical parameters.
np.random.seed(0)
reset = True
if reset:
    model = Model()
    model.add(Conv2D(16, (5, 5), stride=(2, 2), activation=Relu()))
    model.add(Conv2D(32, (3, 3), activation=Relu()))
    model.add(Dense(64, activation=Relu()))
    model.add(Dense(10, activation=Softmax()))
    #model.build(x[0].shape, SGD(learning_rate=1e1), MSE())
    model.build(x[0].shape, Adam(learning_rate=1e-3), MSE())

    # Torch model overwritten with the NumPy parameters: conv kernels are
    # copied as they are, the other weight matrices are transposed.
    model_torch = Torch_Model()
    W = [model_torch.conv1, model_torch.conv2, model_torch.fc1, model_torch.fc2]
    for i, np_layer in enumerate(model.layers):
        np_weights = model.weights[i]
        if np_layer.type_name != 'Conv2D':
            np_weights = np_weights.T
        W[i].weight.data = torch.Tensor(np_weights)
        W[i].bias.data = torch.Tensor(model.biases[i])

### Fit
# The values printed are a slice of the second convolution's *weights*, so the
# labels say "weights" (ANSI colours: red for Torch, green for NumPy).
print('Before training:')
print('\033[31mTorch weights before step:\n', model_torch.conv2.weight.detach().numpy()[0:2,0])
print('\033[32mModel weights before backward:\n', model.weights[1][0:2,0])

torch_x, torch_y = map(torch.tensor, (np.expand_dims(x[0:1000], 1)/255, target[0:1000]))
# NOTE(review): the dense Adam comparison cell passes eps=1e-7 to optim.Adam,
# this one uses torch's default eps -- confirm which matches the NumPy Adam.
opt = optim.Adam(model_torch.parameters())
loss_func = nn.MSELoss()
model_torch.fit(torch_x, torch_y, 3, opt, loss_func, 32)

# shuffle=False so the NumPy model sees the batches in the same order as the
# Torch DataLoader, which does not shuffle.
model.fit(x[0:1000]/255, target[0:1000], batch_size=32, epochs=3, shuffle=False)

print('\nAfter training:')
print('\033[31mTorch weights after step:\n', model_torch.conv2.weight.detach().numpy()[0:2,0])
print('\033[32mModel weights after backward:\n', model.weights[1][0:2, 0])

Before training:
[31mTorch biases before step:
 [[[ 0.02185107  0.00837107  0.08956656]
  [ 0.10132777  0.05477161  0.03430791]
  [-0.12342567  0.00729038 -0.00766663]]

 [[ 0.11851443 -0.00262781  0.01149459]
  [-0.1049796  -0.04295937  0.01158226]
  [ 0.0209043   0.03635039  0.0005084 ]]]
[32mModel biases before backward:
 [[[ 0.02185107  0.00837107  0.08956656]
  [ 0.10132777  0.05477161  0.03430791]
  [-0.12342567  0.00729038 -0.00766663]]

 [[ 0.11851444 -0.00262781  0.01149459]
  [-0.1049796  -0.04295937  0.01158226]
  [ 0.0209043   0.03635039  0.0005084 ]]]




Epoch : 0 ->   loss : 0.03038010746240616
Epoch : 1 ->   loss : 0.018230507150292397
Epoch : 2 ->   loss : 0.012022891081869602
Epoch 1 / 3
Epoch 2 / 3
Epoch 3 / 3

After training:
[31mTorch biases after step:
 [[[ 0.05097174  0.04354589  0.12376972]
  [ 0.13105668  0.09378874  0.07115868]
  [-0.11383     0.02368421  0.01637953]]

 [[ 0.14092134  0.01167447  0.0285505 ]
  [-0.10264634 -0.06674211 -0.01634342]
  [ 0.0286122   0.03320579  0.00622831]]]
[32mModel biases after backward:
 [[[ 0.0518132   0.04463733  0.12420274]
  [ 0.13239241  0.09464379  0.07077545]
  [-0.11193715  0.02539659  0.01643408]]

 [[ 0.14208556  0.01237022  0.02756373]
  [-0.10344194 -0.06687652 -0.01619571]
  [ 0.02630244  0.03266892  0.00650498]]]


#### Final test

In [None]:
from keras.datasets import mnist as db
from keras.utils import to_categorical

# Final run: train the NumPy convolutional network on the whole training set
# with Adam and a categorical cross-entropy loss.
reset = True

# Load the data *before* building the model: `model.build` needs the shape of
# one sample, and loading first keeps the cell runnable without relying on an
# `x` left behind by a previously executed cell.
(x, y), (x_test_ori, y_test_ori) = db.load_data()
target = to_categorical(y)

np.random.seed(0)
if reset:
    model = Model()
    model.add(Conv2D(16, (5, 5), stride=(2, 2), activation=Relu()))
    model.add(Conv2D(32, (3, 3), activation=Relu()))
    model.add(Dense(64, activation=Relu()))
    model.add(Dense(10, activation=Softmax()))
    model.build(x[0].shape, Adam(), CategoricalCrossentropy())

# Separate seed for the training run itself.
np.random.seed(293)
model.fit(x/ 255, target, batch_size=32, epochs=4, validation_split=0.2)

Epoch 1 / 4
Epoch 2 / 4
Epoch 3 / 4
Epoch 4 / 4


{'acc': [0.98075, 0.9875, 0.9917916666666666, 0.993875],
 'loss': [0.06094920705960944,
  0.040677762868495594,
  0.026613825190110654,
  0.01936644721573445],
 'val_acc': [0.9780833333333333,
  0.9828333333333333,
  0.9864166666666667,
  0.9863333333333333],
 'val_loss': [0.0704177378765918,
  0.06050314471876044,
  0.04686432838715824,
  0.04767690732682733]}