In [2]:
import numpy as np
import keras
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)



(60000, 28, 28)
(60000,)
(10000, 28, 28)
(10000,)


In [63]:
def calculate_z(a, w, b):

    """caculate z from z = a*w + b"""

    return np.dot(w, a) + b

def sigmoid(x):
    
    """return value of sigmoid function of x"""

    return 1/(1+np.exp(-x))

def cost_function_derivative(a, y):
    
    """return value of the cost function derivative of a and y"""

    return 2*(a-y)

def sigmoid_derivative(x):
    
    """reuturn value of sigmoid derivative of x"""

    return (1/(1+np.exp(-x))**2)*np.exp(-x)

def back(l, n, g, e = None):

    """compute backpropagation and return gradient of the network"""

    if (l == 1):
        return g
    else:
        z = calculate_z(n[l-1]["activations"], n[l]["weights"], n[l]["biases"])
        da_by_dz = sigmoid_derivative(z)

        if (l == len(n)):
            dc_by_da = cost_function_derivative(n[l]["activations"], e)
        else:
            dc_by_da = np.sum(g[l+1]["biases"]*n[l+1]["weights"], axis = 0).reshape(-1,1)
            # dc_by_da = np.dot(n[l+1]["weights"], g[l+1["biases"]])

        g[l]["weights"] = n[l-1]["activations"].reshape(-1)*dc_by_da*da_by_dz
        #g[l]["weights"] = np.dot()
        g[l]["biases"] = dc_by_da*da_by_dz
        g = back(l-1,n,g)
        return g

def add_gradient(g1, g2):

    """return added gradient"""

    w = "weights"
    b = "biases"
    l = len(g1)
    for i in range(2, l+1, 1):
        g1[i][w] += g2[i][w]
        g1[i][b] += g2[i][b]

    return g1

def average_gradient(g, batch_size):

    """averaging the gradient with batch size and return averaged gradient"""

    w = "weights"
    b = "biases"
    l = len(g)
    for i in range(2, l+1, 1):
        g[i][w]/=batch_size
        g[i][b]/=batch_size

    return g

def graident_descent(n, g, lr):

    """compute graident descent"""

    w = "weights"
    b = "biases"
    l = len(n)
    for i in range(2, l+1, 1):
        n[i][w] -= lr*g[i][w]
        n[i][b] -= lr*g[i][b]
    return n

digits = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
batch_size = 1
learning_rate = 0.01
epochs = 15

network = {1: {"activations" : np.empty(1)}, 
            2: {"activations" : np.random.rand(16).reshape(16, 1), "weights" : np.random.uniform(-0.5, 0.5, size=784*16).reshape(16, 784), "biases" : np.random.uniform(-0.5, 0.5, size=16).reshape(16, 1)}, 
            3: {"activations" : np.random.rand(16).reshape(16, 1), "weights" : np.random.uniform(-0.5, 0.5, size=16*16).reshape(16, 16), "biases" : np.random.uniform(-0.5, 0.5, size=16).reshape(16, 1)},
            4 : {"activations" : np.random.rand(10).reshape(10, 1), "weights" : np.random.uniform(-0.5, 0.5, size=16*10).reshape(10, 16), "biases" : np.random.uniform(-0.5, 0.5, size=10).reshape(10, 1)}}

layers = len(network)

for i in range(epochs):

    #train

    current = 0
    for j in range(0, 60000, batch_size):

        gradient = {2: {"weights" : np.zeros((16, 784)), "biases" : np.zeros((16, 1))}, 
                3: {"weights" : np.zeros((16, 16)), "biases" : np.zeros((16, 1))},
                4 : {"weights" : np.zeros((10, 16)), "biases" : np.zeros((10, 1))}}
        
        for k in range(batch_size):

            network[1]["activations"] = x_train[current].reshape(784, 1)/255
            z2 = calculate_z(network[1]["activations"], network[2]["weights"], network[2]["biases"])
            network[2]["activations"] = sigmoid(z2)
            z3 = calculate_z(network[2]["activations"], network[3]["weights"], network[3]["biases"])
            network[3]["activations"] = sigmoid(z3)
            z4 = calculate_z(network[3]["activations"], network[4]["weights"], network[4]["biases"])
            network[4]["activations"] = sigmoid(z4)
            
            expect_num = y_train[current]
            expect_output = np.zeros(10)
            expect_output[digits.index(expect_num)] = 1
            expect_output = expect_output.reshape((-1,1))

            output = network[4]["activations"]
        

            gradient = add_gradient(gradient, back(layers, network, gradient, expect_output))


        gradient = average_gradient(gradient, batch_size)
    
        network = graident_descent(network, gradient, learning_rate)
        
        current += 1
        

    #test
        
    current = 0
    correct = 0
    for l in range(10000):
        
        network[1]["activations"] = x_test[current].reshape(784, 1)/255
        z2 = calculate_z(network[1]["activations"], network[2]["weights"], network[2]["biases"])
        network[2]["activations"] = sigmoid(z2)
        z3 = calculate_z(network[2]["activations"], network[3]["weights"], network[3]["biases"])
        network[3]["activations"] = sigmoid(z3)
        z4 = calculate_z(network[3]["activations"], network[4]["weights"], network[4]["biases"])
        network[4]["activations"] = sigmoid(z4)
        
        expect_num = y_test[current]

        output = network[4]["activations"]
        
        if(np.argmax(output) == expect_num):
            correct += 1
        current += 1
        
    print("accuracy", i+1, (correct/current)*100)


accuracy 1 83.41
accuracy 2 90.2
accuracy 3 91.46
accuracy 4 92.25
accuracy 5 92.81
accuracy 6 93.11
accuracy 7 93.30000000000001
accuracy 8 93.5
accuracy 9 93.62
accuracy 10 93.75
accuracy 11 93.92
accuracy 12 94.02000000000001
accuracy 13 94.08
accuracy 14 94.19999999999999
accuracy 15 94.31


In [25]:
#back propagation

        #layer 4
        
        # da4 = sigmoid_derivative(z4)
        # dc4 = cost_function_derivative(network[4]["activations"], expect_output)
        
        # gradient[4]["weights"] = network[3]["activations"].reshape(-1)*dc4*da4
        # gradient[4]["biases"] = dc4*da4 
        
        # #layer 3
        
        # da3 = sigmoid_derivative(z3)
        # dc3 = np.sum(dc4*da4*network[4]["weights"], axis=0).reshape(16, 1)
        # gradient[3]["weights"] = network[2]["activations"].reshape(-1)*dc3*da3
        # gradient[3]["biases"] = dc3*da3
        
        # #layer 2
        
        # da2 = sigmoid_derivative(z2)
        # dc2 = np.sum(dc3*da3*network[3]["weights"], axis = 0).reshape(16, 1)
        # gradient[2]["weights"] = network[1]["activations"].reshape(-1)*dc2*da2
        # gradient[2]["biases"] = dc2*da2

In [38]:
a = np.arange(3).reshape((3,1))
b = np.arange(3).reshape((3,1))
c = np.arange(3).reshape((3,1))
d = np.arange(9).reshape((9,1))
print(c.reshape(-1)*d)
print(np.dot(d, c.T))
# print(np.dot(a.T, b))

[[ 0  0  0]
 [ 0  1  2]
 [ 0  2  4]
 [ 0  3  6]
 [ 0  4  8]
 [ 0  5 10]
 [ 0  6 12]
 [ 0  7 14]
 [ 0  8 16]]
[[ 0  0  0]
 [ 0  1  2]
 [ 0  2  4]
 [ 0  3  6]
 [ 0  4  8]
 [ 0  5 10]
 [ 0  6 12]
 [ 0  7 14]
 [ 0  8 16]]


In [52]:
a = np.arange(3)
b = np.arange((3)).reshape((3,1))
print(a.reshape((1,3)))
print(b)
print(np.dot(b, a.reshape(1,3)))

[[0 1 2]]
[[0]
 [1]
 [2]]
[[0 0 0]
 [0 1 2]
 [0 2 4]]
