# Introduction

In this notebook, we use **only numpy** to implement **Feed Forward Neural Network** as a **Computational Graph** by employing the idea of forward and backward API. Therafter we will use our implementation on datasets provided in **sklearn**.

In [1]:
import numpy as np
import copy
from util import spiral_data_gen
from sklearn.metrics import classification_report
from sklearn.datasets import *

In [2]:
class Net:
    def __init__(self):
        self.gates = []

    def add(self, gate):
        self.gates.append(gate)

    def forward(self, inputs):
        for g in self.gates:
            inputs = g.forward(inputs)
        return inputs

    def backward(self, dL):
        for g in reversed(self.gates):
            dL = g.backward(dL)

    def update(self):
        for g in self.gates:
            g.update()

In [3]:
class Gate:
    def __init__(self, shape, learning_rate=.001, xavier_init=False):
        fout, fin = shape
        self.learning_rate = learning_rate
        self.W = np.random.randn(fout, fin)
        self.b = np.ones((fout, 1))
        if xavier_init:
            self.W /= np.sqrt(fin)
        else:
            self.W *= .001
            self.b *= .001
        self.X = None
        self.S = None  # S=W.dot(x)+b
        self.Z = None  # activation
        self.dLdW, self.dLdb, self.dLdX = None, None, None
        self.dZdS = None
        self.dSdW = None
        self.dSdX = None

        self.decay_rate = .99
        self.cache_w = np.zeros(self.W.shape)
        self.cache_b = np.zeros(self.b.shape)

    def update(self):

        # Gradient Descent
        # self.W += -self.learning_rate * self.dLdW
        # self.b += -self.learning_rate * self.dLdb

        # AdaGrad update
        # self.cache_w += self.dLdW ** 2
        # self.cache_b += self.dLdb ** 2
        # self.W += -self.learning_rate * self.dLdW / (np.sqrt(self.cache_w) + 1e-7)
        # self.b += -self.learning_rate * self.dLdb / (np.sqrt(self.cache_b) + 1e-7)

        # RMSProb
        self.cache_w = self.decay_rate * self.cache_w + (1 - self.decay_rate) * self.dLdW ** 2
        self.cache_b = self.decay_rate * self.cache_b + (1 - self.decay_rate) * self.dLdb ** 2
        self.W += -self.learning_rate * self.dLdW / (np.sqrt(self.cache_w) + 1e-7)
        self.b += -self.learning_rate * self.dLdb / (np.sqrt(self.cache_b) + 1e-7)

        # ADAM
        # m =  beta1 * m + (1-beta1) * dx # Momentum
        # v = beta2 * v + (1-beta2) * (dx**2) # RMSPROB like
        # m /= 1-beta1**t
        # v /= 1 - beta2 ** t
        # self.W += -self.learning_rate * m / (np.sqrt(v) + 1e-7) # RMSPROB like


class SigmoidGate(Gate):
    def __init__(self, shape, learning_rate, xavier_init=False):
        super().__init__(shape, learning_rate, xavier_init)

    def sigmoid(self, x):
        return 1.0 / (1.0 + np.exp(-x))

    def dsigmoid(self, x):
        return (1.0 - self.sigmoid(x)) * self.sigmoid(x)

    def forward(self, x):
        self.X = x
        self.S = self.W.dot(self.X) + self.b
        self.Z = self.sigmoid(self.S)
        # compute local gradients
        self.dZdS = self.dsigmoid(self.S)
        self.dSdW = self.X
        self.dSdX = self.W
        return self.Z

    def backward(self, dLdZ):
        assert self.Z.shape == dLdZ.shape
        # dL/dS= dZ/dS * dL/dZ
        dLdS = self.dZdS * dLdZ
        # dL/dW= dS/dW * dL/dS
        self.dLdW = dLdS.dot(self.dSdW.T)
        # dL/dX= dS/dX * dL/dS
        self.dLdX = self.dSdX.T.dot(dLdS)
        self.dLdb = np.sum(dLdS, axis=1, keepdims=True)
        return copy.deepcopy(dLdS)


class SoftmaxGate(Gate):
    def __init__(self, shape, learning_rate, xavier_init=False):
        super().__init__(shape, learning_rate, xavier_init)

    def softmax(self, x, axis=0):
        x -= np.max(x, axis=axis, keepdims=True)
        exp_scores = np.exp(x)
        return exp_scores / np.sum(exp_scores, axis=axis, keepdims=True)

    def forward(self, x):
        self.X = x
        self.S = self.W.dot(self.X) + self.b
        self.dSdW = self.X
        self.dSdX = self.W
        return self.softmax(self.S)

    def backward(self, dLdS):
        assert self.S.shape == dLdS.shape
        # Propagate dLdZ into dW,db, dS
        # dLdW= dS/dW * dL/dS
        self.dLdW = dLdS.dot(self.dSdW.T)
        assert self.dLdW.shape == self.W.shape

        # dLdX= dS/dX * dL/dS
        dLdX = self.dSdX.T.dot(dLdS)
        assert dLdX.shape == self.X.shape

        self.dLdb = np.sum(dLdS, axis=1, keepdims=True)
        return copy.deepcopy(dLdX)


class ReluGate(Gate):
    def __init__(self, shape, learning_rate, xavier_init=False):
        super().__init__(shape, learning_rate, xavier_init)

        if xavier_init:
            self.W /= np.sqrt(fin / 2)  # He et al. 2015
        else:
            self.W *= .001
            self.b *= .001

    def relu(self, X):
        return np.maximum(0, X)

    def forward(self, x):
        self.S = self.W.dot(x) + self.b
        self.dSdW = x
        self.dSdX = self.W
        self.Z = self.relu(self.S)
        return self.Z

    def backward(self, dLdZ):
        try:
            assert self.Z.shape == dLdZ.shape
        except:
            print(self.Z.shape)
            print(dLdZ.shape)
            exit(1)
        dLdZ[self.S <= 0] = 0
        dZdS = dLdZ
        # Propagate dLdZ into dW,db, dS
        # dLdW= dS/dW * dL/dS
        self.dLdW = dZdS.dot(self.dSdW.T)
        # dLdX= dS/dX * dL/dS
        dLdX = self.dSdX.T.dot(dZdS)
        self.dLdb = np.sum(dZdS, axis=1, keepdims=True)
        return copy.deepcopy(dLdX)

In [4]:

for X, y in [
    spiral_data_gen(False),
    (load_wine()['data'], load_wine()['target']),
    (load_breast_cancer()['data'], load_breast_cancer()['target']),
    (load_iris()['data'], load_iris()['target']),
    (load_digits()['data'], load_digits()['target']),
]:  # ,
    X -= np.mean(X, axis=0)  # zero-centerring.
    X = X.T
    D, N = X.shape
    K = len(np.unique(y))

    print(X.shape)

    hidden_size = 100
    model = Net()  # TODO weight decay impleement.
    model.add(ReluGate(shape=(hidden_size, D), learning_rate=.001))
    model.add(SoftmaxGate(shape=(K, hidden_size), learning_rate=.001))
    num_epoch = 10_000
    mode = num_epoch // 10

    for epoch in range(num_epoch):
        # forward
        f = model.forward(X)
        if epoch % mode == 0:
            loss = (-np.log(f[y, range(N)] + .01)).mean()  # compute the loss
            print('{0}.th epoch Loss:{1}'.format(epoch, loss))
            if loss < .001:
                break
        # backward
        dLdf = f
        dLdf[y, range(N)] -= 1
        dLdf /= N
        model.backward(dLdf)
        model.update()

    y_head = np.argmax(model.forward(X), axis=0)
    print(classification_report(y, y_head))

(2, 300)
0.th epoch Loss:1.0690534873792346
1000.th epoch Loss:0.3154230348059171
2000.th epoch Loss:0.17371204566429593
3000.th epoch Loss:0.09685781695093648
4000.th epoch Loss:0.054512089125164184
5000.th epoch Loss:0.03409565771282187
6000.th epoch Loss:0.024718949602783133
7000.th epoch Loss:0.019324454305583886
8000.th epoch Loss:0.015667583673292265
9000.th epoch Loss:0.013101627620210166
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       100
           1       1.00      0.99      0.99       100
           2       1.00      0.99      0.99       100

    accuracy                           0.99       300
   macro avg       0.99      0.99      0.99       300
weighted avg       0.99      0.99      0.99       300

(13, 178)
0.th epoch Loss:1.069055140709139
1000.th epoch Loss:-0.009648465911185972
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        59
           1       1.00   

# Acknowledgement

Thank your Andrej Karpathy for CS231n Winter 2016.

https://www.youtube.com/watch?v=NfnWJUyUJYU&list=PLkt2uSq6rBVctENoVBg1TpCC7OQi31AlC