## Team: AdAnSo

Members:
1. Adarsh Anand (2003101)
2. Aniket Chaudhri (2003104)
3. Somesh Agrawal (2003326)

In [19]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris, load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer, OneHotEncoder
from tqdm import trange


In [3]:
class MultiplicationLayer:
    """
    Inputs : X in R^(1xd) , W in R^(dxK)
    This layer takes X & W as input and perform these 2 tasks: 
    1. Forward Pass : Matrix multiplication,  Z = XW 
    2. Backward Pass : dZ/dX , dZ/dW 
    """

    def __init__(self, X, W):
        self.X = X
        self.W = W

    def __str__(self,):
        return " An instance of Muliplication Layer."

    def forward(self):
        self.Z = np.dot(self.X, self.W)

    def backward(self):
        self.dZ_dW = (self.X).T  # dZ/dW
        self.dZ_daZ_prev = self.W  # dZ/dX


In [4]:
class BiasAdditionLayer : 
    """
    Inputs : Z in R^(1xK), B in R^(1xK)
    This layer takes output Z of forward pass of Multiplication Layer as input and perform these 2 operations : 
    1. Forward Pass :  Z = Z + B
    2. Backward Pass : dZ/dB
    """
    def __init__(self, Z : np.ndarray , bias : np.ndarray ):
        self.B = bias
        self.Z = Z
    
    def __str__(self,):
        return "An instance of Bias Addition Layer."
    
    def forward(self,):
        self.Z = self.Z + self.B
    
    def backward(self,):
        self.dZ_dB = np.identity( self.B.shape[1] )

In [5]:
class MeanSquaredLossLayer:
    """
    This layer implements Mean Square Loss Layer.
    Inputs : Y in R^(1xK) , Y_hat in R^(1xK)  where K --> dimesion of output layer 
    This layer takes prediction Y_hat and true Y as input and perform these 2 opearations : 
    1. Forward Pass : L = (1/n) * || Y_hat - Y||**2 
    2. Backward Pass : dL/dY_hat = (2/n)*(Y_hat - Y).T   Note :Here instead of dL/dY_hat , I used dL/daZ symbol which denote 
                                                             derivative of loss w.r.t. output of previous activation layer
    """

    def __init__(self, Y: np.ndarray, Y_hat: np.ndarray):
        self.Y = Y
        self.aZ = Y_hat

    def __str__(self,):
        return "An instance of Mean Squared Loss Layer"

    def forward(self, ):
        self.L = np.mean((self.aZ - self.Y)**2)

    def backward(self,):
        self.dL_daZ = (2/len(self.Y))*(self.aZ - self.Y).T


In [6]:
class SoftMaxActivation : 
    """
    This layer implements SoftMax Activation Function.
    Input : a numpy array Z in R^(1XK)  
    1. Forward Pass : Apply Softmax Activation function, aZ = softmax(Z).T
    2. Backward Pass : daZ/dZ  = diag(aZ) - sZ*transpose(aZ)  --> here diag(aZ) is diagonal matrix with 
                                                                   i-th diagnoal entry replaced by sZ_i value
    """
    def __init__(self, Z):
        self.Z = Z 
        
    def __str__(self,):
        return "An instance of Softmax Activation Layer"
        
    def forward(self,):
        self.aZ = self.softmax(self.Z)
    
    def backward(self,):
        self.daZ_dZ = np.diag( self.aZ.reshape(-1) ) - (self.aZ.T)@( (self.aZ))  # Shape = (K,K) where K = len( sZ )
    
    @staticmethod
    def softmax(Z : np.ndarray):
        max_Z = np.max( Z, axis=1 ,keepdims=True )
        return (np.exp(Z - max_Z ))/np.sum( np.exp(Z - max_Z), axis=1 , keepdims=True)

In [7]:
class SigmoidActivation:
    """
    This layer implements Sigmoid Activation Function. 
    Input : a numpy array Z of shape Kx1 
    1. Forward Pass : aZ = sigmoid( Z )  
    2. Backward Pass : daZ/dZ = diagonal matrix with entries aZ_i*(1-aZ_i) --> sigZ_i means i-th component of sigZ
    """

    def __init__(self, Z):
        self.Z = Z

    def __str__(self,):
        return "An instance of Sigmoid Activation Layer"

    def forward(self,):
        self.aZ = self.sigmoid(self.Z)  # sigmoid calculation

    def backward(self,):
        diag_entries = np.multiply(self.aZ, 1-self.aZ).reshape(-1)
        self.daZ_dZ = np.diag(diag_entries)

    @staticmethod
    def sigmoid(Z: np.ndarray):
        return 1./(1 + np.exp(-Z))


In [8]:
class CrossEntropyLossLayer:
    """
    This layer implements Cross Entropy Loss Layer. 
    Inputs : Y in R^(1xK) , Y_pred in R^(1xK)  where K --> dimesion of output layer 
    This layer takes prediction Y_pred and true Y as input and perform these 2 opearations : 
    1. Forward Pass : L = -1 * dot product of Y & log(Y_pred)    
    2. Backward Pass : dL/dY_pred in R^(Kx1)
    """

    def __init__(self, Y, Y_pred):
        self.Y = Y
        self.aZ = Y_pred
        self.epsilon = 1e-40

    def __str__(self, ):
        return "An instance of Cross Entropy Loss Layer"

    def forward(self, ):
        self.L = - np.sum(self.Y * np.log(self.aZ+self.epsilon))

    def backward(self, ):
        self.dL_daZ = -1*(self.Y/(self.aZ + self.epsilon)
                          ).T  # Element wise division


In [9]:
class LinearActivation : 
    """
    Implementation of linear activation function.
    Input : Z in R^(1xn)
    Ouput : linear(Z) = Z 
    """
    def __init__(self, Z):
        self.Z = Z 
        
    def __str__(self,):
        return "An instance of Linear Activation."
    
    def forward(self, ):
        self.aZ = self.Z 
    
    def backward(self,):
        self.daZ_dZ = np.identity( self.Z.shape[1] )

In [10]:
class tanhActivation:
    """
    Implementation of tanh activation function
    Input : a numpy array Z in R^(1xK)
    1. Forward Pass : aZ = tanh(Z)
    2. Backward Pass : daZ/dZ = np.diag(1 - aZ**2)   --> R^(KxK)
    """

    def __init__(self, Z):
        self.Z = Z

    def __str__(self,):
        return "An instance of tanhActivation class."

    def forward(self,):
        self.aZ = np.tanh(self.Z)

    def backward(self,):
        self.daZ_dZ = np.diag(1 - self.aZ.reshape(-1)**2)


In [11]:
class ReLUActivation : 
    """
    Implementation of relu activatino function
    Input : a numpy array Z in R^(1xK)
    1. Forward Pass aZ = max(Z,0)
    2. Backward Pass : daZ_dZ = diag_matrix( 1 if aZ_i>0 else 0 )
    """
    def __init__(self, Z): 
        self.Z = Z 
        self.Leak = 0.01
    
    def __str__(self,):
        return "An instance of ReLU activation"
    
    def forward(self,):
        self.aZ = np.maximum(self.Z,0)
    
    def backward(self,):
        self.daZ_dZ = np.diag( [1. if x>=0 else self.Leak for x in self.aZ.reshape(-1)])

In [26]:
def load_data(dataset_name='boston',
              normalize_X=False,
              normalize_y=False,
              one_hot_encode_y=False,
              test_size=0.2):
    if dataset_name == 'boston':
        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data_boston = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        result_boston = raw_df.values[1::2, 2]
        data = {'data': data_boston, 'target': result_boston}
        

        # data = load_boston()
    elif dataset_name == 'iris':
        data = load_iris()
    elif dataset_name == 'mnist':
        data = load_digits()
        data['data'] = 1*(data['data'] >= 8)

    X = data['data']
    y = data['target'].reshape(-1, 1)

    if normalize_X == True:
        normalizer = Normalizer()
        X = normalizer.fit_transform(X)

    if normalize_y == True:
        normalizer = Normalizer()
        y = normalizer.fit_transform(y)

    if one_hot_encode_y == True:
        encoder = OneHotEncoder()
        y = encoder.fit_transform(y).toarray()
        # y = np.eye(3)[y.reshape(-1)]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size)
    return X_train, y_train, X_test, y_test


In [12]:
class Layer:
    """
    Input - activation : Activation Layer Name ,n_inp : dimension of input ,  n_out :  Number of output neurons 
    """

    def __init__(self, n_inp, n_out, activation_name="linear", seed=42):

        np.random.seed(seed)  # for reproducability of code

        self.n_inp = n_inp
        self.n_out = n_out

        # random initialization of input X  and output Z
        self.X = np.random.random((1, n_inp))   # assigned during SGD
        self.Z = np.random.random((1, n_out))

        # Initialize W & B with some scaling to avoid over-flow
        self.W = np.random.random((n_inp, n_out)) * \
            np.sqrt(2 / (n_inp + n_out))
        self.B = np.random.random((1, n_out))*np.sqrt(2 / (1 + n_out))

        # define multiplication layer, bias addition layer , and activation layer
        self.multiply_layer = MultiplicationLayer(self.X, self.W)
        self.bias_add_layer = BiasAdditionLayer(self.B, self.B)

        if activation_name == 'linear':
            self.activation_layer = LinearActivation(self.Z)
        elif activation_name == 'sigmoid':
            self.activation_layer = SigmoidActivation(self.Z)
        elif activation_name == 'softmax':
            self.activation_layer = SoftMaxActivation(self.Z)
        elif activation_name == 'tanh':
            self.activation_layer = tanhActivation(self.Z)
        elif activation_name == 'relu':
            self.activation_layer = ReLUActivation(self.Z)

    def forward(self,):
        self.multiply_layer.X = self.X
        self.multiply_layer.forward()

        self.bias_add_layer.Z = self.multiply_layer.Z
        self.bias_add_layer.forward()

        self.activation_layer.Z = self.bias_add_layer.Z
        self.activation_layer.forward()

        self.Z = self.activation_layer.aZ  # output of given layer

    def backward(self,):
        self.activation_layer.backward()
        self.bias_add_layer.backward()
        self.multiply_layer.backward()


In [13]:
class NeuralNetwork(Layer):
    """
    Input  - layers : list of layer objects , loss_name : Name of loss layer
    """

    # [ "mean_squared", "cross_entropy"]
    def __init__(self, layers, loss_name="mean_squared", learning_rate=0.01, seed=42):
        np.random.seed(seed)

        self.layers = layers
        self.n_layers = len(layers)  # number of layers in neural network
        self.learning_rate = learning_rate

        self.inp_shape = self.layers[0].X.shape
        self.out_shape = self.layers[-1].Z.shape

        # random initialization of input X  and output Z
        self.X = np.random.random(self.inp_shape)   # assigned during SGD
        self.Y = np.random.random(self.out_shape)  # output of neural network

        # define loss layer
        if loss_name == "mean_squared":
            self.loss_layer = MeanSquaredLossLayer(self.Y, self.Y)
        if loss_name == "cross_entropy":
            self.loss_layer = CrossEntropyLossLayer(self.Y, self.Y)

    def forward(self,):
        self.layers[0].X = self.X
        self.loss_layer.Y = self.Y

        self.layers[0].forward()
        for i in range(1, self.n_layers):
            self.layers[i].X = self.layers[i-1].Z
            self.layers[i].forward()

        self.loss_layer.aZ = self.layers[-1].Z
        self.loss_layer.forward()

    def backward(self,):

        self.loss_layer.Z = self.Y
        self.loss_layer.backward()
        self.grad_nn = self.loss_layer.dL_daZ
        for i in range(self.n_layers-1, -1, -1):
            self.layers[i].backward()

            dL_dZ = np.dot(
                self.layers[i].activation_layer.daZ_dZ, self.grad_nn)
            dL_dW = np.dot(self.layers[i].multiply_layer.dZ_dW, dL_dZ.T)
            dL_dB = np.dot(self.layers[i].bias_add_layer.dZ_dB, dL_dZ).T

            # Update W & B
            self.layers[i].W -= self.learning_rate*dL_dW
            self.layers[i].B -= self.learning_rate*dL_dB

            # Update outer_grad
            self.grad_nn = np.dot(
                self.layers[i].multiply_layer.dZ_daZ_prev, dL_dZ)

            del dL_dZ, dL_dW, dL_dB


In [14]:
def createLayers(inp_shape, layers_sizes, layers_activations):
    layers = []
    n_layers = len(layers_sizes)
    layer_0 = Layer(inp_shape, layers_sizes[0], layers_activations[0])
    layers.append(layer_0)
    inp_shape_next = layers_sizes[0]
    for i in range(1, n_layers):
        layer_i = Layer(inp_shape_next, layers_sizes[i], layers_activations[i])
        layers.append(layer_i)
        inp_shape_next = layers_sizes[i]

    out_shape = inp_shape_next
    return inp_shape, out_shape, layers


In [15]:
def SGD_NeuralNetwork(X_train,
                      y_train,
                      X_test,
                      y_test,
                      nn,
                      inp_shape=1,   # dimension of input
                      out_shape=1,   # dimension of output
                      n_iterations=1000,
                      task="regression"  # [ "regression", "classification"]
                      ):
    iterations = trange(n_iterations, desc="Training ...", ncols=100)

    for iteration, _ in enumerate(iterations):
        randomIndx = np.random.randint(len(X_train))
        X_sample = X_train[randomIndx, :].reshape(1, inp_shape)
        Y_sample = y_train[randomIndx, :].reshape(1, out_shape)

        nn.X = X_sample
        nn.Y = Y_sample

        nn.forward()  # Forward Pass
        nn.backward()  # Backward Pass

    # Lets run ONLY forward pass for train and test data and check accuracy/error

    if task == "regression":
        nn.X = X_train
        nn.Y = y_train
        nn.forward()
        train_error = nn.loss_layer.L
        nn.X = X_test
        nn.Y = y_test

        nn.forward()

        test_error = nn.loss_layer.L

        if isinstance(nn.loss_layer, MeanSquaredLossLayer):
            print("Mean Squared Loss Error (Train Data)  : %0.5f" % train_error)
            print("Mean Squared Loss Error (Test Data)  : %0.5f" % test_error)

    if task == "classification":
        nn.X = X_train
        nn.Y = y_train
        nn.forward()
        y_true = np.argmax(y_train, axis=1)
        y_pred = np.argmax(nn.loss_layer.aZ, axis=1)
        acc = 1*(y_true == y_pred)
        print("Classification Accuracy (Training Data ): {0}/{1} = {2} %".format(
            sum(acc), len(acc), sum(acc)*100/len(acc)))

        nn.X = X_test
        nn.Y = y_test
        nn.forward()
        y_true = np.argmax(y_test, axis=1)
        y_pred = np.argmax(nn.loss_layer.aZ, axis=1)
        acc = 1*(y_true == y_pred)
        print("Classification Accuracy (Testing Data ): {0}/{1} = {2} %".format(
            sum(acc), len(acc), sum(acc)*100/len(acc)))


## Boston Dataset

In [21]:
X_train, y_train, X_test, y_test = load_data('boston', normalize_X=True, normalize_y=False, test_size=0.2)


#### One output neural with Linear activation and least mean squared error

In [22]:
inp_shape = X_train.shape[1]
layers_sizes = [1]
layers_activations = ['linear']

inp_shape, out_shape, layers = createLayers(inp_shape, layers_sizes, layers_activations)
loss_nn = 'mean_squared'

nn = NeuralNetwork(layers, loss_nn, learning_rate=0.1)

SGD_NeuralNetwork(X_train,y_train,X_test,y_test,nn,inp_shape, out_shape,n_iterations=11111,task="regression")

Training ...: 100%|█████████████████████████████████████████| 11111/11111 [00:01<00:00, 7642.55it/s]


Mean Squared Loss Error (Train Data)  : 48.00306
Mean Squared Loss Error (Test Data)  : 75.11091


#### 2 layers. Layer 1 with 13 output neurons with sigmoid activation. Layer 2 with one output neuron and linear activation. use mean squared loss

In [23]:
inp_shape = X_train.shape[1]
layers_sizes = [13,1]
layers_activations = ['sigmoid','linear']

inp_shape, out_shape, layers = createLayers(inp_shape, layers_sizes, layers_activations)
loss_nn = 'mean_squared'

nn = NeuralNetwork(layers, loss_nn, learning_rate=0.01)

SGD_NeuralNetwork(X_train,y_train,X_test,y_test,nn,inp_shape, out_shape,n_iterations=1000,task="regression")

Training ...: 100%|███████████████████████████████████████████| 1000/1000 [00:00<00:00, 7959.41it/s]

Mean Squared Loss Error (Train Data)  : 58.02224
Mean Squared Loss Error (Test Data)  : 86.06957





#### Three layers. Layer 1 with 13 output neurons with sigmoid activation. Layer 2 with 13 output neurons and sigmoid activation. Layer 3 with one output neuron and linear activation. use mean squared loss

In [24]:
inp_shape = X_train.shape[1]
layers_sizes = [13,13,1]
layers_activations = ['sigmoid','sigmoid','linear']

inp_shape, out_shape, layers = createLayers(inp_shape, layers_sizes, layers_activations)
loss_nn = 'mean_squared'

nn = NeuralNetwork(layers, loss_nn, learning_rate=0.001)

SGD_NeuralNetwork(X_train,y_train,X_test,y_test,nn,inp_shape, out_shape,n_iterations=1000,task="regression")

Training ...: 100%|███████████████████████████████████████████| 1000/1000 [00:00<00:00, 6571.26it/s]

Mean Squared Loss Error (Train Data)  : 79.62934
Mean Squared Loss Error (Test Data)  : 107.37404





## MNIST Dataset

In [36]:
X_train, y_train, X_test, y_test = load_data('mnist', one_hot_encode_y=True, test_size=0.3)

#### Two layers. Layer 1 with 89 output neurons with tanh activation. Layer 2 with ten output neuron and sigmoid activation. use mean squared loss

In [28]:
inp_shape = X_train.shape[1]
layers_sizes = [89,10]
layers_activations = ['tanh','sigmoid']

inp_shape, out_shape, layers = createLayers(inp_shape, layers_sizes, layers_activations)
loss_nn = 'mean_squared'

nn = NeuralNetwork(layers, loss_nn, learning_rate=0.1)

SGD_NeuralNetwork(X_train,y_train,X_test,y_test,nn,inp_shape, out_shape,n_iterations=10000,task="classification")

Training ...: 100%|█████████████████████████████████████████| 10000/10000 [00:01<00:00, 7283.20it/s]

Classification Accuracy (Training Data ): 1194/1257 = 94.98806682577566 %
Classification Accuracy (Testing Data ): 490/540 = 90.74074074074075 %





#### Two layers. Layer 1 with 89 output neurons with tanh activation. Layer 2 with ten output neuron and linear activation. use softmax with cross entropy loss.

In [37]:
inp_shape = X_train.shape[1]
layers_sizes = [89,10]
layers_activations = ['tanh','softmax']

inp_shape, out_shape, layers = createLayers(inp_shape, layers_sizes, layers_activations)
loss_nn = 'cross_entropy'

nn = NeuralNetwork(layers, loss_nn, learning_rate=0.01)

SGD_NeuralNetwork(X_train,y_train,X_test,y_test,nn,inp_shape, out_shape,n_iterations=10000,task="classification")

Training ...: 100%|█████████████████████████████████████████| 10000/10000 [00:01<00:00, 7491.59it/s]


Classification Accuracy (Training Data ): 1191/1257 = 94.74940334128878 %
Classification Accuracy (Testing Data ): 497/540 = 92.03703703703704 %


## CNN Model

In [30]:
# Assuming we are given single channel input and initial filter to be a 3x3 matrix:

def convolutional_layer(zero_pad_input, l_filter):

    l = len(inp)  # length of input matrix
    m = len(l_filter)  # length of filter
    c = len(zero_pad_input)  # size of zero-padded matrix
    s = (c - m) + 1  # to be used for loop for filtering
    out = np.zeros((l, l))  # output after convolution

    # filtering-
    for i in range(s):
        for j in range(s):
            temp = np.zeros((m, m))
            row, col = np.indices((m, m))
            temp = np.multiply(zero_pad_input[row+i, col+j], l_filter)

            out[i][j] = np.sum(temp)

    return out

# -----------------------------------------------------------------------------------


# Forward pass implementation-
def Forward_pass(inp, l_filter):
    l = len(inp)
    # Zero-padding of input layer-
    zero_pad_input = np.zeros((l+2, l+2))
    zero_pad_input[1:l+1, 1:l+1] = inp

    f_out = convolutional_layer(zero_pad_input, l_filter)
    return f_out


# -----------------------------------------------------------------------------------

# Function to Rotate
# the matrix by 180 degree
def rotateMatrix(mat):
    N = len(mat)
    rot_mat = np.zeros((N, N))
    k = N - 1
    t1 = 0
    while (k >= 0 and t1 < 3):
        j = N - 1
        t2 = 0
        while (j >= 0 and t2 < N):
            rot_mat[t1][t2] = mat[k][j]
            j = j - 1
            t2 = t2 + 1
        k = k - 1
        t1 = t1 + 1

    return rot_mat
# -----------------------------------------------------------------------------------


# Backward pass implementation-

def Backward_pass(inp, output, l_filter):
    l = len(inp)

# --------------------------------Backward Pass---------------------------------------
    # Zero-padding of input layer-
    zero_pad_input = np.zeros((l+2, l+2))
    zero_pad_input[1:l+1, 1:l+1] = inp

    grad_filter = convolutional_layer(zero_pad_input, output)
    # we can use gradient of filter coefficient matrix to update the filter matrix:
    # -- l_filter - l_filter - alpha*grad_filter ,where alpha is learning rate

    # for gradient of loss w.r.t input, we need to rotate the filter by 180° and apply convolution.
    rotated_filter = rotateMatrix(l_filter)
    zero_pad_output = np.zeros((l+2, l+2))
    zero_pad_output[1:l+1, 1:l+1] = output
    grad_X = convolutional_layer(zero_pad_output, rotated_filter)

    return grad_filter, grad_X

# -----------------------------------------------------------------------------------


# flatten operation:

def flatten(inp_mat):
    flatten_vector = []

    for i in range(len(inp_mat)):  # number of rows
        for j in range(len(inp_mat[0])):  # number of columns
            flatten_vector.append(inp_mat[i][j])

    flatten_vector = np.array(flatten_vector)
    return flatten_vector

# -----------------------------------------------------------------------------------


In [31]:
class ConvolutionalLayer:
    """
    Implementation of Convolutional Layer consist of Convolution  followed by flattening  and Activation operation
    """

    def __init__(self,
                 # inp_shape = (input_channels, input_height, input_width )
                 inp_shape,
                 activation='tanh',
                 # filter_shape = (filter_height, filter_width)
                 filter_shape=(1, 1),
                 lr=0.01,
                 Co=1,
                 seed=42):                                                # number of output channels

        np.random.seed(seed)
        # Check if filter is valid or NOT
        assert (inp_shape[1] >= filter_shape[0] and inp_shape[2] >= filter_shape[1]), \
            "Error : Input {} incompatible with filter {}".format(
                inp.shape, filter_shape)

        self.inp = np.random.rand(*inp_shape)
        self.inp_shape = inp_shape
        # number of channels in input here denoted as inp
        self.Ci = self.inp.shape[0]
        # number of output channels
        self.Co = Co
        self.filters_shape = (self.Co, self.Ci,  *filter_shape)
        self.out_shape = (
            self.Co, self.inp.shape[1] - filter_shape[0] + 1, self.inp.shape[2] - filter_shape[1] + 1)
        self.flatten_shape = self.out_shape[0] * \
            self.out_shape[1]*self.out_shape[2]
        self.lr = lr

        # Randomly initialize filters, biases, output, flatten output
        self.filters = np.random.rand(*self.filters_shape)
        self.biases = np.random.rand(*self.out_shape)
        self.out = np.random.rand(*self.out_shape)
        self.flatten_out = np.random.rand(1, self.flatten_shape)

        # Define activation function
        if activation == 'tanh':
            self.activation_layer = tanhActivation(self.out)

    def forward(self, ):
        self.out = np.copy(self.biases)  # add bias to output
        for i in range(self.Co):
            for j in range(self.Ci):
                self.out[i] += self.convolve(self.inp[j], self.filters[i, j])

        self.flatten()
        self.activation_layer.Z = self.flatten_out
        self.activation_layer.forward()

    def backward(self, grad_nn):

        self.activation_layer.backward()
        loss_gradient = np.dot(self.activation_layer.daZ_dZ, grad_nn)
        # reshape to (Co, H_out, W_out)
        loss_gradient = np.reshape(loss_gradient, self.out_shape)

        # dL/dKij for each filter  Kij    1<=i<=Ci , 1<=j<=Co
        self.filters_gradient = np.zeros(self.filters_shape)
        self.input_gradient = np.zeros(self.inp_shape)  # dL/dXj
        self.biases_gradient = loss_gradient  # dL/dBi  = dL/dYi
        padded_loss_gradient = np.pad(loss_gradient, ((
            0, 0), (self.filters_shape[2]-1, self.filters_shape[2]-1), (self.filters_shape[3]-1, self.filters_shape[3]-1)))

        for i in range(self.Co):
            for j in range(self.Ci):
                self.filters_gradient[i, j] = self.convolve(
                    self.inp[j], loss_gradient[i])  # dL/dKij = convolution( Xj, dL/dYi)
                rot180_Kij = np.rot90(
                    np.rot90(self.filters[i, j], axes=(0, 1)), axes=(0, 1))
                # dL/dXj = convolution ( padded dL/dYi , Kij rotated by 180 anit-clockwise )
                self.input_gradient[j] += self.convolve(
                    padded_loss_gradient[i], rot180_Kij)

        # update filters and biases
        self.filters -= self.lr*self.filters_gradient
        self.biases -= self.lr*self.biases_gradient

    # flattening output to 1 Dimension so it can be fed int neural network

    def flatten(self, ):
        self.flatten_out = self.out.reshape(1, -1)

    # convolutional operation with stride=1
    def convolve(self, x, y):
        x_conv_y = np.zeros(
            (x.shape[0] - y.shape[0] + 1, x.shape[1] - y.shape[1] + 1))
        for i in range(x.shape[0]-y.shape[0] + 1):
            for j in range(x.shape[1] - y.shape[1] + 1):
                tmp = x[i:i+y.shape[0], j:j+y.shape[1]]
                tmp = np.multiply(tmp, y)
                x_conv_y[i, j] = np.sum(tmp)
        return x_conv_y


In [32]:
class CNN : 
    """
    Implementation of Convolutional Neural Network
    """
    def __init__(self, 
                convolutional_layer,                   # convolutional layer 
                nn,                                    # feed forward neural network
                seed = 42): 

        self.nn = nn 
        self.convolutional_layer = convolutional_layer 
        self.X = _ # assigned during SGD 
        self.Y = _ # assigned during SGD 
    
    def forward(self,):
        # forward pass of convolutional layer 
        self.convolutional_layer.inp = self.X 
        self.convolutional_layer.forward()

        # forward pass of neural network 
        self.nn.X = self.convolutional_layer.activation_layer.aZ
        self.nn.Y = self.Y 
        self.nn.forward()  
    
    def backward(self,): 
        # backward pass of neural network 
        self.nn.backward() 

        # backward pass of convolutional network 
        self.convolutional_layer.backward( self.nn.grad_nn )  

In [38]:
def SGD_CNN(X_train,
            y_train,
            X_test,
            y_test,
            cnn,
            inp_shape,
            out_shape,
            n_iterations=1000,
            task="classification"):

    iterations = trange(n_iterations, desc="Training ...", ncols=100)

    for iteration, _ in enumerate(iterations):
        randomIndx = np.random.randint(len(X_train))
        X_sample = X_train[randomIndx, :].reshape(inp_shape)
        Y_sample = y_train[randomIndx, :].reshape(out_shape)

        cnn.X = X_sample
        cnn.Y = Y_sample

        cnn.forward()  # Forward Pass
        cnn.backward()  # Backward Pass

    # Lets run ONLY forward pass for train and test data and check accuracy/error

    if task == "classification":
        X_train = X_train.reshape(-1, 8, 8)
        y_true = np.argmax(y_train, axis=1)
        acc = 0
        for i in range(len(X_train)):
            cnn.X = X_train[i][np.newaxis, :, :]
            cnn.Y = y_train[i]
            cnn.forward()
            y_pred_i = np.argmax(cnn.nn.loss_layer.aZ, axis=1)
            if (y_pred_i == y_true[i]):
                acc += 1
        
        print("Classification Accuracy (Training Data ):" + str(acc) + "/" + str(len(y_true)) + " = " + str(acc*100/len(y_true)) + " %" )

        X_test = X_test.reshape(-1, 8, 8)
        y_true = np.argmax(y_test, axis=1)
        acc = 0
        for i in range(len(X_test)):
            cnn.X = X_test[i][np.newaxis, :, :]
            cnn.Y = y_test[i]
            cnn.forward()
            y_pred_i = np.argmax(cnn.nn.loss_layer.aZ, axis=1)
            if (y_pred_i == y_true[i]):
                acc += 1
        
        print("Classification Accuracy (Testing Data ):" + str(acc) + "/" + str(len(y_true)) + " = " + str(acc*100/len(y_true)) + " %" )


In [34]:
X_train, y_train, X_test, y_test = load_data('mnist', one_hot_encode_y=True)


In [35]:
conv_inp_shape = (1,8,8)   # sklearn digit dataset has images of shape 1 x 8 x 8
Co = 16  # 16 channel output 
conv_filter_shape = (3,3)
conv_activation = 'tanh'
convolutional_layer = ConvolutionalLayer(conv_inp_shape, 
                                        filter_shape = conv_filter_shape, 
                                        Co = Co,
                                        activation = conv_activation,
                                        lr = 0.01)
nn_inp_shape = convolutional_layer.flatten_shape 
layers_sizes = [10]
layers_activations = ['softmax']

nn_inp_shape, nn_out_shape, layers = createLayers(nn_inp_shape, layers_sizes, layers_activations)
loss_nn = 'cross_entropy'

nn = NeuralNetwork(layers, loss_nn, learning_rate=0.01)

cnn = CNN( convolutional_layer, nn)
out_shape =  (1, layers_sizes[-1])  # one_hot encoded ouptut 

SGD_CNN(X_train,y_train,X_test,y_test, cnn,conv_inp_shape, out_shape,n_iterations=5000)

Training ...: 100%|█████████████████████████████████████████████| 5000/5000 [03:53<00:00, 21.45it/s]


Classification Accuracy (Training Data ): 1242/1437 = 86.43006263048017 %
Classification Accuracy (Testing Data ): 304/360 = 84.44444444444444 %
