In [1]:
import numpy as np

# Possible outputs
X = np.matrix('0 0; 1 0; 0 1; 1 1')
y = np.array([0, 1, 1, 0])

In [2]:
def relu(z):
    """ReLU activation function"""
    return np.maximum(z, 0, z)

def relu_prime(z):
    """First derivative of the ReLU activation function"""
    return 1*(z>0)

def sigmoid(z):
    """Sigmoid activation function"""
    return 1 / (1 + np.exp(-z))

def sigmoid_prime(z):
    """First derivative of sigmoid activation function"""
    return np.multiply(sigmoid(z), 1-sigmoid(z))

def cost(a, y):
    """Calculate MSE"""
    return ((a - y) ** 2).mean()

def cost_grad(a, y):
    """First derivate of MSE function"""
    return a - y

def weighted_sum(W, a, b):
    """Compute the weighted average z for all neurons in new layer"""
    return W.dot(a) + b

def forward_prop(x, W, b): 
    """Calculate z and a for every neuron using current weights and biases"""
    a = [None] * len(layer_sizes)
    z = [None] * len(layer_sizes)
    
    a[0] = x.T
    
    for l in range(1, len(a)):
        z[l] = weighted_sum(W[l], a[l-1], b[l])
        a[l] = sigmoid(z[l])
        
    return (a, z)

def back_prop(a, z, W, y):
    """Calculate error delta for every neuron"""
    delta = [None] * len(layer_sizes)
    end_node = len(a)-1
    
    delta[end_node] = np.multiply(cost_grad(a[end_node], y), sigmoid_prime(z[end_node]))
    
    for l in reversed(range(1, end_node)):
        delta[l] = np.multiply(W[l+1].T.dot(delta[l+1]), sigmoid_prime(z[l]))
    
    return delta

def calc_gradient(W, b, a, delta, eta):
    """Update W and b using gradient descent steps based"""
    W_grad = [None] * len(W)
    b_grad = [None] * len(b)
    
    for l in range(1, len(W)):
        W_grad[l] = a[l-1].dot(delta[l].T)
        b_grad[l] = delta[l]
    
    return (W_grad, b_grad)

def backpropagation_iter(X, y, W, b, eta):
    """One iteration of the backpropagation algorithm, i.e., forward- and backward propagate and compute gradient"""
    y_pred = [None] * len(y)
    
    for i in range(n):
        # First we propagate forward through the network to obtain activation levels and z.
        a, z = forward_prop(X[i, :], W, b)
        y_pred[i] = np.max(a[-1])

        # Back propagate to obtain delta's.
        delta = back_prop(a, z, W, y[i])

        # This allows us to compute the gradient for this instance. Add this to all.
        W_grad, b_grad = calc_gradient(W, b, a, delta, eta)

        if i == 0:
            W_grad_sum = W_grad
            b_grad_sum = b_grad
        else:
            for l in range(1, len(W_grad)):
                W_grad_sum[l] += W_grad[l]
                b_grad_sum[l] += b_grad[l]

    # Update weights and bias
    for l in range(1, len(W)):
        W[l] = W[l] - (eta/n) * W_grad_sum[l]
        b[l] = b[l] - (eta/n) * b_grad_sum[l]
    
    # Show MSE
    MSE = cost(y_pred, y)
    
    return (W, b, y_pred, MSE)

In [5]:
# Initialise layer sizes of all layers in the neural network
layer_sizes = [X.shape[1], 2, 1]

# Initialise weights and activation and weight vectors as None.
W = [None] * len(layer_sizes)
b = [None] * len(layer_sizes)

# Initialise weights randomly
for l in range(1, len(layer_sizes)):
    W[l] = np.random.random((layer_sizes[l], layer_sizes[l-1]))
    b[l] = np.random.random((layer_sizes[l], 1))
    
# Set number of iterations for backpropagation to work, size, and learning rate
n_iter = 10000
n = X.shape[0]
eta = 0.1

In [6]:
for iter in range(n_iter+1):
    W, b, y_pred, MSE = backpropagation_iter(X, y, W, b, eta)
    
    # Only print every 10 iterations
    if iter % 100 == 0:
        print('Iteration {0}: {1}'.format(iter, MSE))

Iteration 0: 0.29154168076363696
Iteration 100: 0.25644418613616293
Iteration 200: 0.2508820478709627
Iteration 300: 0.25015239926399613
Iteration 400: 0.2500547498759536
Iteration 500: 0.25004375724705
Iteration 600: 0.25004585350541564
Iteration 700: 0.250097618880977
Iteration 800: 0.2500996087852953
Iteration 900: 0.25010257503305944
Iteration 1000: 0.25010605759556803
Iteration 1100: 0.2501098155399221
Iteration 1200: 0.25011372106651364
Iteration 1300: 0.25011770649182263
Iteration 1400: 0.2501217363741961
Iteration 1500: 0.2501257926427838
Iteration 1600: 0.2501298666234644
Iteration 1700: 0.2501339547542252
Iteration 1800: 0.2501380562812584
Iteration 1900: 0.25014217202060846
Iteration 2000: 0.25014630369406526
Iteration 2100: 0.2501504535752841
Iteration 2200: 0.25015462430421614
Iteration 2300: 0.25015881879360063
Iteration 2400: 0.25016304018660374
Iteration 2500: 0.25016729184370046
Iteration 2600: 0.2501715773471257
Iteration 2700: 0.25017590051672234
Iteration 2800: 0.25