In [None]:
'''Jume Notes:  https://www.youtube.com/watch?v=vcZub77WvFA
Basic neural network, understanding the math behind what is happening.

'''

In [15]:
"""
Simplistic implementation of the two-layer neural network.
Training method is stochastic (online) gradient descent with momentum.
As an example it computes XOR for given input.
Some details:
- tanh activation for hidden layer
- sigmoid activation for output layer
- cross-entropy loss
Less than 100 lines of active code.
"""

import numpy as np
import time

n_hidden = 10
n_in = 10
n_out = 10
n_samples = 300

learning_rate = 0.01
momentum = 0.9

np.random.seed(0)

def sigmoid(x):
    return 1.0/(1.0 + np.exp(-x))

def tanh_prime(x):
    return  1 - np.tanh(x)**2

def train(x, t, V, W, bv, bw):

    # forward
    A = np.dot(x, V) + bv
    Z = np.tanh(A)

    B = np.dot(Z, W) + bw
    Y = sigmoid(B)

    # backward
    Ew = Y - t
    Ev = tanh_prime(A) * np.dot(W, Ew)

    dW = np.outer(Z, Ew)
    dV = np.outer(x, Ev)

    loss = -np.mean ( t * np.log(Y) + (1 - t) * np.log(1 - Y) )

    # Note that we use error for each layer as a gradient
    # for biases

    return  loss, (dV, dW, Ev, Ew)

def predict(x, V, W, bv, bw):
    A = np.dot(x, V) + bv
    B = np.dot(np.tanh(A), W) + bw
    return (sigmoid(B) > 0.5).astype(int)

# Setup initial parameters
# Note that initialization is cruxial for first-order methods!

V = np.random.normal(scale=0.1, size=(n_in, n_hidden))
W = np.random.normal(scale=0.1, size=(n_hidden, n_out))

bv = np.zeros(n_hidden)
bw = np.zeros(n_out)

params = [V,W,bv,bw]

# Generate some data

X = np.random.binomial(1, 0.5, (n_samples, n_in))
T = X ^ 1

# Train
for epoch in range(100):
    err = []
    upd = [0]*len(params)

    t0 = time.clock()
    for i in range(X.shape[0]):
        loss, grad = train(X[i], T[i], *params)

        for j in range(len(params)):
            params[j] -= upd[j]

        for j in range(len(params)):
            upd[j] = learning_rate * grad[j] + momentum * upd[j]

        err.append( loss )

    print("Epoch: %d, Loss: %.8f, Time: %.4fs" % (
                epoch, np.mean( err ), time.clock()-t0 ))

# Try to predict something

x = np.random.binomial(1, 0.5, n_in)
print("XOR prediction:")
print(x)
print(predict(x, *params))

Epoch: 0, Loss: 0.45465070, Time: 0.0183s
Epoch: 1, Loss: 0.13697961, Time: 0.0185s
Epoch: 2, Loss: 0.06206941, Time: 0.0217s
Epoch: 3, Loss: 0.04092746, Time: 0.0192s
Epoch: 4, Loss: 0.03159958, Time: 0.0180s
Epoch: 5, Loss: 0.02592744, Time: 0.0176s
Epoch: 6, Loss: 0.02199575, Time: 0.0178s
Epoch: 7, Loss: 0.01907812, Time: 0.0178s
Epoch: 8, Loss: 0.01682099, Time: 0.0174s
Epoch: 9, Loss: 0.01502363, Time: 0.0174s
Epoch: 10, Loss: 0.01356039, Time: 0.0172s
Epoch: 11, Loss: 0.01234775, Time: 0.0176s
Epoch: 12, Loss: 0.01132776, Time: 0.0197s
Epoch: 13, Loss: 0.01045887, Time: 0.0205s
Epoch: 14, Loss: 0.00971052, Time: 0.0227s
Epoch: 15, Loss: 0.00905971, Time: 0.0178s
Epoch: 16, Loss: 0.00848887, Time: 0.0178s
Epoch: 17, Loss: 0.00798436, Time: 0.0175s
Epoch: 18, Loss: 0.00753542, Time: 0.0177s
Epoch: 19, Loss: 0.00713347, Time: 0.0175s
Epoch: 20, Loss: 0.00677160, Time: 0.0176s
Epoch: 21, Loss: 0.00644415, Time: 0.0174s
Epoch: 22, Loss: 0.00614650, Time: 0.0177s
Epoch: 23, Loss: 0.00

In [2]:
'''https://www.youtube.com/watch?v=vcZub77WvFA'''

import numpy as np
import time

# variables
n_hidden = 10
n_in = 10
# output
n_out = 10
# sample data
n_sample = 300

In [3]:
#hyperparameters
learning_rate = 0.01
momentum = 0.9

#non deterministic seeding
np.random.seed(0)

In [12]:
def sigmoid(x):
    return 1.0/(1.0 + np.exp(-x))

def tanh_prime(x):
    return 1-np.tanh(x)**2

def train(x, t, V, W, bv, bw):
    
    # forward -- matrix multiply + biases
    A = np.dot(x, V) + bv
    Z = np.tanh(A)
    
    B = np.dot(Z, W)
    Y = sigmoid(B)
    
    # backward
    Ew = Y - t
    Ev = tanh_prime(A) * np.dot(W, Ew)
    
    # predict our loss 
    dW = np.outer(Z, Ew)
    dV = np.outer(x, Ev)
    
    # cross entropy
    loss = -np.mean(t * np.log(Y) + (1-t) * np.log(1-Y))
    
    return loss, (dV, dW, Ev, Ew)

def predict(x, V, W, bv, bw):
    A = np.dot(x, V) + bv
    B = np.dot(np.tanh(A), W) + bw
    
    return (sigmoid(B) > 0.5).astype(int)

# create layers
V = np.random.normal(scale=0.1, size = (n_in, n_hidden))
W = np.random.normal(scale=0.1, size = (n_hidden, n_out))

bv = np.zeros(n_hidden)
bw = np.zeros(n_out)

params = [V, W, bv, bw]

#generate our data 
X = np.random.binomial(1, 0.5, (n_sample, n_in))
T = X^1
# training time
for epoch in range(100):
    err = []
    upd = [0] * len(params)
    t0 = time.clock()
    
    # for each data point, update our weights
    for i in range(X.shape[0]):
        loss, grad = train(X[i], T[i], * params)
        
        # update loss
        for j in range(len(params)):
            params[j] -= upd[j]
            
        for j in range(len(params)):
            upd[j] = learning_rate * grad[j] + momentum + upd[j]
        
        err.append(loss)
        
    print('Epoch: %d, Loss: %8f, Time: %.4fs' %(
        epoch, np.mean(err), time.clock() - t0))

# try to predict something
x = np.random.binomial(1, 0.5, n_in)
print('XOR prediction')
print(x)
print(predict(x, * params))
            
    



Epoch: 0, Loss:      nan, Time: 0.0241s
Epoch: 1, Loss:      nan, Time: 0.0217s
Epoch: 2, Loss:      nan, Time: 0.0199s
Epoch: 3, Loss:      nan, Time: 0.0224s
Epoch: 4, Loss:      nan, Time: 0.0218s
Epoch: 5, Loss:      nan, Time: 0.0198s
Epoch: 6, Loss:      nan, Time: 0.0197s
Epoch: 7, Loss:      nan, Time: 0.0199s
Epoch: 8, Loss:      nan, Time: 0.0204s
Epoch: 9, Loss:      nan, Time: 0.0196s
Epoch: 10, Loss:      nan, Time: 0.0225s
Epoch: 11, Loss:      nan, Time: 0.0237s
Epoch: 12, Loss:      nan, Time: 0.0243s
Epoch: 13, Loss:      nan, Time: 0.0203s
Epoch: 14, Loss:      nan, Time: 0.0191s
Epoch: 15, Loss:      nan, Time: 0.0196s
Epoch: 16, Loss:      nan, Time: 0.0194s
Epoch: 17, Loss:      nan, Time: 0.0201s
Epoch: 18, Loss:      nan, Time: 0.0199s
Epoch: 19, Loss:      nan, Time: 0.0190s
Epoch: 20, Loss:      nan, Time: 0.0198s
Epoch: 21, Loss:      nan, Time: 0.0203s
Epoch: 22, Loss:      nan, Time: 0.0224s
Epoch: 23, Loss:      nan, Time: 0.0213s
Epoch: 24, Loss:      nan,