In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import copy

In [2]:
class MyNeuralNetwork():
    """
    My implementation of a Neural Network Classifier.
    """

    acti_fns = ['relu', 'sigmoid', 'linear', 'tanh', 'softmax']
    weight_inits = ['zero', 'random', 'normal']

    def __init__(self, n_layers, layer_sizes, activation, learning_rate, weight_init, batch_size, num_epochs):
        """
        Initializing a new MyNeuralNetwork object

        Parameters
        ----------
        n_layers : int value specifying the number of layers

        layer_sizes : integer array of size n_layers specifying the number of nodes in each layer

        activation : string specifying the activation function to be used
                     possible inputs: relu, sigmoid, linear, tanh

        learning_rate : float value specifying the learning rate to be used

        weight_init : string specifying the weight initialization function to be used
                      possible inputs: zero, random, normal

        batch_size : int value specifying the batch size to be used

        num_epochs : int value specifying the number of epochs to be used
        """ 
        self.n_layers = n_layers
        self.layer_sizes = layer_sizes
        self.activation = activation
        self.learning_rate = learning_rate
        self.weight_init = weight_init
        self.batch_size = batch_size
        self.num_epochs = num_epochs
        self.weights = None
        self.biases = None
        
        
        activation_fn_mapping = { 'relu': self.relu, 'sigmoid': self.sigmoid, 'linear': self.linear, 'tanh': self.tanh, 'softmax': self.softmax}
        derivative_fn_mapping = { 'relu': self.relu_grad, 'sigmoid': self.sigmoid_grad, 'linear': self.linear_grad, 'tanh': self.tanh_grad, 'softmax': self.softmax_grad}
        weight_init_mapping = {'zero': self.zero_init, 'random': self.random_init, 'normal': self.normal_init}
        
        self.activation_fn = activation_fn_mapping[activation]
        self.ac_derivation_fn = derivative_fn_mapping[activation]
        self.weight_init_fn = weight_init_mapping[weight_init]
        

        if activation not in self.acti_fns:
            raise Exception('Incorrect Activation Function')

        if weight_init not in self.weight_inits:
            raise Exception('Incorrect Weight Initialization Function')
        pass

    def relu(self, X):
        """
        Calculating the ReLU activation for a particular layer

        Parameters
        ----------
        X : 1-dimentional numpy array 

        Returns
        -------
        x_calc : 1-dimensional numpy array after calculating the necessary function over X
        """
        return X * (X>=0)


    def relu_grad(self, X):
        """
        Calculating the gradient of ReLU activation for a particular layer

        Parameters
        ----------
        X : 1-dimentional numpy array 

        Returns
        -------
        x_calc : 1-dimensional numpy array after calculating the necessary function over X
        """
        return 1 * (X>=0)


    def sigmoid(self, X):
        """
        Calculating the Sigmoid activation for a particular layer

        Parameters
        ----------
        X : 1-dimentional numpy array 

        Returns
        -------
        x_calc : 1-dimensional numpy array after calculating the necessary function over X
        """
        return 1/(1 + np.exp(-X)) 


    def sigmoid_grad(self, X):
        """
        Calculating the gradient of Sigmoid activation for a particular layer

        Parameters
        ----------
        X : 1-dimentional numpy array 

        Returns
        -------
        x_calc : 1-dimensional numpy array after calculating the necessary function over X
        """
        return self.sigmoid(X)*(1-self.sigmoid(X))
#         return X*(1-X)

    def linear(self, X):
        """
        Calculating the Linear activation for a particular layer

        Parameters
        ----------
        X : 1-dimentional numpy array 

        Returns
        -------
        x_calc : 1-dimensional numpy array after calculating the necessary function over X
        """
        return None

    def linear_grad(self, X):
        """
        Calculating the gradient of Linear activation for a particular layer

        Parameters
        ----------
        X : 1-dimentional numpy array 

        Returns
        -------
        x_calc : 1-dimensional numpy array after calculating the necessary function over X
        """
        return None

    def tanh(self, X):
        """
        Calculating the Tanh activation for a particular layer

        Parameters
        ----------
        X : 1-dimentional numpy array 

        Returns
        -------
        x_calc : 1-dimensional numpy array after calculating the necessary function over X
        """
        return 2/(1+(np.exp(-X)*np.exp(-X)))-1
                
    def tanh_grad(self, X):
        """
        Calculating the gradient of Tanh activation for a particular layer

        Parameters
        ----------
        X : 1-dimentional numpy array 

        Returns
        -------
        x_calc : 1-dimensional numpy array after calculating the necessary function over X
        """
        return 1 - self.tanh(X)*self.tanh(X)

#     def softmax(self, X):
#         """
#         Calculating the softmax activation for a particular layer

#         Parameters
#         ----------
#         X : 1-dimentional numpy array 

#         Returns
#         -------
#         x_calc : 1-dimensional numpy array after calculating the necessary function over X
#         """
#         summ = np.sum(np.exp(X))
#         print(X)
#         print(summ)
#         print(np.exp(X)/summ)
# #         print("yoyoyo")
#         return np.exp(X)/summ
    def softmax(self, X):
        exps = np.exp(X - np.max(X, axis=1, keepdims=True))
        return exps/np.sum(exps, axis=1, keepdims=True)
    
    
                      
    def softmax_grad(self, X):
        """
        Calculating the gradient of Softmax activation for a particular layer

        Parameters
        ----------
        X : 1-dimentional numpy array 

        Returns
        -------
        x_calc : 1-dimensional numpy array after calculating the necessary function over X
        """
        return None

    def zero_init(self, shape):
        """
        Calculating the initial weights after Zero Activation for a particular layer

        Parameters
        ----------
        shape : tuple specifying the shape of the layer for which weights have to be generated 

        Returns
        -------
        weight : 2-dimensional numpy array which contains the initial weights for the requested layer
        """

        return np.zeros(shape)

    def random_init(self, shape):
        """
        Calculating the initial weights after Random Activation for a particular layer

        Parameters
        ----------
        shape : tuple specifying the shape of the layer for which weights have to be generated 

        Returns
        -------
        weight : 2-dimensional numpy array which contains the initial weights for the requested layer
        """
        return 0.01 * np.random.rand(shape[0], shape[1])

    def normal_init(self, shape):
        """
        Calculating the initial weights after Normal(0,1) Activation for a particular layer

        Parameters
        ----------
        shape : tuple specifying the shape of the layer for which weights have to be generated 

        Returns
        -------
        weight : 2-dimensional numpy array which contains the initial weights for the requested layer
        """
        return  np.random.randn(shape[0], shape[1])

    def fit(self, X, y):
        """
        Fitting (training) the linear model.

        Parameters
        ----------
        X : 2-dimensional numpy array of shape (n_samples, n_features) which acts as training data.

        y : 1-dimensional numpy array of shape (n_samples,) which acts as training labels.
        
        Returns
        -------
        self : an instance of self
        """

        # fit function has to return an instance of itself or else it won't work with test.py
        
        num_labels = len(np.unique(y))
        new_y = []
        for c in y:
            a = []
            for i in range(num_labels):
                a.append(0)
            a[c] = 1
            a = np.array(a)
            new_y.append(a)
        new_y = np.array(new_y)
        y = new_y
        
        n_rows, n_cols = X.shape
        self.weights = [0]
        self.biases = [0]
        

        for i in range(0, self.n_layers -1):
            self.weights.append(self.weight_init_fn((self.layer_sizes[i], self.layer_sizes[i+1])))
            self.biases.append(np.zeros((1, self.layer_sizes[i+1])))
                                                           
        for i in tqdm(range(self.num_epochs)):    
#             batch_indices = np.random.choice(n_rows, size = self.batch_size, replace = False)
#             X_batch = X[batch_indices, :]
#             y_batch = y[batch_indices, :]
            A = self.forwardPhase(X)
            print("Loss:", self.cross_entropy_loss(A, y))
            self.backwardPhase(X, y , A)
            
            
        return self

    def forwardPhase(self, X):
        
        input = X
        
        A = [X]
        
        for i in range(self.n_layers-1):
                
            z = np.matmul(input, self.weights[i+1]) + self.biases[i+1] 
            if i != self.n_layers-2:
                a = self.activation_fn(z)
            else:
                a = self.softmax(z)
            A.append(a)
            input = a
        
        return A
        
    
    def backwardPhase(self, X, y, A):
        da = (A[-1] - y)/len(X)
        prev_weights = copy.deepcopy(self.weights[-1])
        self.weights[-1] -= self.learning_rate * np.matmul(A[-2].T, da)
        self.biases[-1] -= self.learning_rate * np.sum(da, axis = 0, keepdims = True)
        for i in range(self.n_layers-2, 0, -1):
            dz = np.matmul(da, prev_weights.T)
            da = dz * self.ac_derivation_fn(A[i])
            prev_weights = copy.deepcopy(self.weights[i])
            self.weights[i] -= self.learning_rate * np.matmul(A[i-1].T, da)
            self.biases[i] -= self.learning_rate * np.sum(da, axis = 0)

            
    def predict_proba(self, X):
        """
        Predicting probabilities using the trained linear model.

        Parameters
        ----------
        X : 2-dimensional numpy array of shape (n_samples, n_features) which acts as testing data.

        Returns
        -------
        y : 2-dimensional numpy array of shape (n_samples, n_classes) which contains the 
            class wise prediction probabilities.
        """

        # return the numpy array y which contains the predicted values
        return None

    def predict(self, X):
        """
        Predicting values using the trained linear model.

        Parameters
        ----------
        X : 2-dimensional numpy array of shape (n_samples, n_features) which acts as testing data.

        Returns
        -------
        y : 1-dimensional numpy array of shape (n_samples,) which contains the predicted values.
        """

        # return the numpy array y which contains the predicted values
        
        A = self.forwardPhase(X)
        preds = np.argmax(A[-1], axis = 1)
#         return A[-1]
        return preds

    def score(self, X, y):
        """
        Predicting values using the trained linear model.

        Parameters
        ----------
        X : 2-dimensional numpy array of shape (n_samples, n_features) which acts as testing data.

        y : 1-dimensional numpy array of shape (n_samples,) which acts as testing labels.

        Returns
        -------
        acc : float value specifying the accuracy of the model on the provided testing set
        """
        
        preds = self.predict(X)
        temp = (preds == y)
        

        # return the numpy array y which contains the predicted values
        return temp.sum()/len(X)
    
    def cross_entropy_loss(self, A, y):
        n = len(y)
        logp = - np.log(A[-1][np.arange(n), y.argmax(axis=1)])
        loss = np.sum(logp)/n
        return loss



In [3]:
train_df = pd.read_csv('mnist_train.csv')

In [4]:
test_df = pd.read_csv('mnist_test.csv')

In [5]:
test_df.head()

Unnamed: 0,label,1x1,1x2,1x3,1x4,1x5,1x6,1x7,1x8,1x9,...,28x19,28x20,28x21,28x22,28x23,28x24,28x25,28x26,28x27,28x28
0,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
dataset = train_df.to_numpy()

In [7]:
testset = test_df.to_numpy()

In [8]:
testset.shape

(10000, 785)

In [9]:
from sklearn.preprocessing import StandardScaler
X_train = dataset[:, 1:]
X_test = testset[:, 1:]
# X_train = X_train/255
# X_test = X_test/255
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

In [10]:
y_train = dataset[:, 0]
y_test = testset[:, 0]

In [11]:
X_test.shape

(10000, 784)

In [12]:
nn = MyNeuralNetwork(5, [784, 256, 128, 64, 10], 'sigmoid', 0.1, 'normal', len(X_train), 10)

In [13]:
nn.fit(X_train, y_train)

  return 1/(1 + np.exp(-X))


Loss: 8.062849526050323


 10%|█         | 1/10 [00:03<00:28,  3.20s/it]

Loss: 5.970112209151061


 20%|██        | 2/10 [00:06<00:26,  3.28s/it]

Loss: 4.6336321992306795


 30%|███       | 3/10 [00:10<00:24,  3.54s/it]

Loss: 3.709464106922473


 40%|████      | 4/10 [00:14<00:21,  3.65s/it]

Loss: 3.0432118358854816


 50%|█████     | 5/10 [00:18<00:18,  3.70s/it]

Loss: 2.5330138002736042


 60%|██████    | 6/10 [00:22<00:15,  3.88s/it]

Loss: 2.1758962992401782


 70%|███████   | 7/10 [00:26<00:11,  3.92s/it]

Loss: 1.9379368996238273


 80%|████████  | 8/10 [00:30<00:07,  3.93s/it]

Loss: 1.772702667524957


 90%|█████████ | 9/10 [00:35<00:04,  4.02s/it]

Loss: 1.6493737569981248


100%|██████████| 10/10 [00:38<00:00,  3.89s/it]


<__main__.MyNeuralNetwork at 0x7fb317fe1d00>

In [14]:
preds = nn.predict(X_test)

  return 1/(1 + np.exp(-X))


In [15]:
preds.shape

(10000,)

In [16]:
np.unique(preds)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [17]:
nn.score(X_test, y_test)

  return 1/(1 + np.exp(-X))


0.5339

In [18]:
nn.score(X_train, y_train)

  return 1/(1 + np.exp(-X))


0.52485

In [19]:
preds[:50]

array([7, 2, 1, 0, 4, 1, 4, 1, 5, 7, 0, 0, 4, 5, 1, 5, 4, 7, 2, 4, 7, 6,
       6, 5, 4, 0, 5, 4, 0, 1, 5, 1, 5, 0, 3, 2, 7, 1, 0, 1, 1, 7, 4, 1,
       3, 3, 3, 6, 4, 4])