In [1]:
import torch
import random

In [2]:
class NeuralNetwork:
    def __init__(self, depth, layers_units=[], activation=[], learning_rate=.01):
        self.depth = depth
        self.layers_units = layers_units
        self.activation = activation
        self.learning_rate = learning_rate
        self.loss = []
    
    # initialize weights
    def initialize_weights(self):
        # set random seed
        torch.manual_seed(0)

        self.params = {}
        c = 1e-4
        input = self.X.shape[1]
        for i in range(self.depth):
            """
                input layer is not counted in depth
                for each layer initialize W and b and save values in params
                N1 -> node-1
                w1 -> weight for x1 in the input

                      N1-N2-N3 
                W = | w1 w1 w1|
                    | w2 w2 w2|
                    | w3 w3 w3|

                hidden layer input = output of previous layers
                
            """
            # initialize weights
            self.params[f'W{i+1}'] = torch.randn(input, self.layers_units[i],
                                                    dtype=torch.float32)
            # initialize bias
            self.params[f'b{i+1}'] = torch.zeros(self.layers_units[i], )

            # update input for next layer to be output of curren tlayer
            input = self.layers_units[i]

    # activation unit
    def activate(self, z, actv):
        # sigmoid
        if actv == 'sig':
            return 1 / ( 1 + torch.exp(-z))
        
        # relu 
        if actv == 'relu':
            z[z < 0] = 0
            return z

        # softmax
        if actv == 'softmax':
            # numerical stability by subtracting constant for z
            z = z - torch.max(z, dim =1, keepdim=True)[0]
            z = torch.exp(z) / (torch.sum(torch.exp(z), dim=1, keepdim=True))
            return z

    # activation grad
    def actv_grad(self, G, actv):
        # gradient w.r.t sigmoid function
        if actv == 'sig':
            return self.activate(G, actv) * (1 - self.activate(G, actv))
        
        # gradient with respect to relu
        if actv == 'relu':
            G[G < 0] = 0
            G[G > 0] = 1
            return G

    # feed forward
    def feed_forward(self, x):
        # dictionary to cache forward pass results
        layer_output = {}
        
        # set x to be A0
        layer_output['A0'] = x

        # feed x forward in the netword
        for l in range(self.depth):
            # grab layer params W and b
            W = self.params[f'W{l+1}']
            b = self.params[f'b{l+1}']

            # grab layer activation
            actv = self.activation[l]

            # calculate Layer output
            Z = torch.mm(x, W) + b
            A = self.activate(Z, actv)

            # layer output is the input to next layer
            x = A

            # cache layer results
            layer_output[f'Z{l+1}'] = Z
            layer_output[f'A{l+1}'] = A

        return layer_output

    # back propagate
    def backprop(self, x, Y, layer_output, reg=0.0):
        # last layer output is the score
        scores = layer_output[f'A{self.depth}']

        N = x.shape[0]
        # compute softmax loss
        idx0 = torch.arange(N).reshape(-1, 1)
        loss = torch.sum(-torch.log(scores[[idx0, Y]]))/N

        # modify convert y from index of correct class to matrix 
        """
            Y -> 1D tensor with values equal to index of correct class
        """
        Y_ = torch.zeros(N, self.c)
        Y_[idx0, Y] = 1

        # dictionary to cache grads
        grads = {}

        # gradient w.r.t classification
        dCE = None

        # backprop Error through network layers
        for i in range(self.depth, 0, -1):
            # grab layer activation function
            layer_actv = self.activation[i-1]

            # if output layer 
            if i == self.depth:
                dCE = (scores - Y_)/N
            
            else:
                # grab next layer weight matrix
                W = self.params[f'W{i+1}']
                # grab layer input 
                Z = layer_output[f'Z{i}']
                # backpropagate gradient from next layer to current layer
                dCE = torch.mm(dCE, W.t()) * self.actv_grad(Z, layer_actv) 

            # calculate gradient of current layer parameters
            A = layer_output[f'A{i-1}']
            grads[f'dW{i}'] = torch.mm(A.T, dCE)
            grads[f'db{i}'] = torch.sum(dCE)
            
        return scores, grads, loss

    # update weights 
    def update_weights(self, grads):
        for i in range(self.depth):
            self.params[f'W{i+1}'] = self.params[f'W{i+1}'] - grads[f'dW{i+1}'] * self.learning_rate
            self.params[f'b{i+1}'] = self.params[f'b{i+1}'] - grads[f'db{i+1}'] * self.learning_rate

    # train network
    def train(self, X, Y, c, batch, num_iter):
        self.c = c
        self.X = X
        # initialize weights
        self.initialize_weights()

        # train
        for i in range(num_iter):
            for b in range(0, X.shape[0], batch):
                # grab batch of the data
                x = X[b:b+batch, :]
                y = Y[b:b+batch, :]

                # feed forward
                layer_output = self.feed_forward(x)

                # backprop
                scores, grads, loss = self.backprop(x, y, layer_output)
                self.loss.append({
                    f'iteration:{i}-batch:{b//batch}': {'loss': loss}
                })

                # update weights
                self.update_weights(grads)
            if i % 10 == 0:
                print(f"Epoch {i}", f"Loss {loss}")

    # make predictions
    def predict(self, X):
        layer_output = self.feed_forward(X)
        pred = layer_output[f'A{self.depth}']
        y_ = torch.max(pred, axis=1, keepdim=True)[1]
        return y_

In [3]:
X = torch.tensor([
        [ 25.4410,  -7.1635,  -4.9337,   1.2671],
        [  1.0136,  -4.0353,   9.0226,   8.0993],
        [ -6.8838,   1.3724,  10.3774,   0.9255],
        [ -3.7518,  -0.9082,  20.6391, -18.1638],
        [ -2.7188,   2.8113, -10.3986,   7.7653]
    ])

Y = torch.tensor([[0],
                  [1],
                  [2],
                  [2],
                  [1]])

In [4]:
obj = NeuralNetwork(depth=2, layers_units=[5, 3], activation=['relu', 'softmax'])

In [5]:
obj.train(X, Y, c=3, batch=4, num_iter=100)

Epoch 0 Loss 1.0751113891601562
Epoch 10 Loss 0.013420832343399525
Epoch 20 Loss 0.0077223158441483974
Epoch 30 Loss 0.0053812554106116295
Epoch 40 Loss 0.0041109067387878895
Epoch 50 Loss 0.003315814072266221
Epoch 60 Loss 0.0027725351974368095
Epoch 70 Loss 0.002378369215875864
Epoch 80 Loss 0.00207980046980083
Epoch 90 Loss 0.0018460494466125965


In [6]:
obj.predict(X)

tensor([[0],
        [1],
        [2],
        [2],
        [1]])