## 1 - Packages

- [numpy](https://numpy.org)

In [131]:
import numpy as np

# Seed NumPy's global RNG so that later np.random calls (the row shuffle and
# the randn-based parameter initialization) produce the same values on every run.
np.random.seed(961)

## 2 - Load the dataset

Load the [dataset](https://archive.ics.uci.edu/ml/datasets/banknote+authentication#) from the `data.csv` file and split it into train (60%) and test (40%) sets.

In [148]:
data = np.genfromtxt('data.csv', delimiter=',')
np.random.shuffle(data)  # shuffles the rows in place before splitting
m = data.shape[0]

# Row index that separates the first 60% of rows (train) from the rest (test).
n_train = int(m * 0.6)
train_rows, test_rows = data[:n_train], data[n_train:]

# Columns 0-3 hold the features and column 4 onward holds the labels.
# Transpose so that every example becomes a column.
X_train, Y_train = train_rows[:, 0:4].T, train_rows[:, 4:].T
X_test, Y_test = test_rows[:, 0:4].T, test_rows[:, 4:].T

## 3 - Activation functions

- [sigmoid](https://en.wikipedia.org/wiki/Sigmoid_function)
- [relu](https://en.wikipedia.org/wiki/Rectifier_(neural_networks))

In [133]:
def sigmoid(Z):
    """
    Apply the logistic function 1 / (1 + exp(-Z)) element-wise.

    Z: the input for the activation function

    Returns:
    A: the output of the activation function
    cache: Z itself, kept for the backward pass
    """

    denominator = 1.0 + np.exp(-Z)
    A = 1.0 / denominator

    # The cache is simply the pre-activation input, returned unchanged.
    return A, Z

In [134]:
def sigmoid_backward(dA, cache):
    """
    Backward step through the sigmoid activation.

    dA: gradients with respect to the activation output
    cache: the Z value that was fed to the sigmoid in the forward pass

    Returns:
    dZ: gradients with respect to Z, i.e. dA * sigmoid(Z) * (1 - sigmoid(Z))
    """

    # Recompute the activation from the cached input.
    activation = 1.0 / (1.0 + np.exp(-cache))
    complement = 1 - activation

    return dA * activation * complement

In [135]:
def relu(Z):
    """
    Apply the rectified linear unit max(Z, 0) element-wise.

    Z: the input for the activation function

    Returns:
    A: the output of the activation function
    cache: Z itself, kept for the backward pass
    """

    # np.maximum is used instead of Z * (Z > 0): the product form turns -inf
    # into NaN (-inf * 0) and produces -0.0 for negative inputs.
    A = np.maximum(Z, 0)
    cache = Z

    return A, cache

In [136]:
def relu_backward(dA, cache):
    """
    Backward step through the ReLU activation.

    dA: gradients with respect to the activation output
    cache: the Z value that was fed to the ReLU in the forward pass

    Returns:
    dZ: gradients with respect to Z (dA where Z > 0, zeroed elsewhere)
    """

    # Boolean mask of the units that were active in the forward pass.
    active = cache > 0

    return dA * active

## 4 - Initialization

Initialize the weight matrices and bias vectors.

In [137]:
def initialize_parameters(n_x, n_h, n_y):
    """
    Draw the initial parameters of the two-layer network from a standard normal.

    n_x: number of units in the input layer
    n_h: number of units in the hidden layer
    n_y: number of units in the output layer

    Returns:
    parameters: dictionary containing the parameters
                W1, W2: weights (scaled by 0.01)
                b1, b2: biases (unscaled)
    """

    weight_scale = 0.01

    # Dict entries are evaluated top to bottom, so the random draws happen in
    # the order W1, b1, W2, b2.
    return {
        "W1": np.random.randn(n_h, n_x) * weight_scale,
        "b1": np.random.randn(n_h, 1),
        "W2": np.random.randn(n_y, n_h) * weight_scale,
        "b2": np.random.randn(n_y, 1),
    }

## 5 - Forward propagation

In [138]:
def linear_forward(A, W, b):
    """
    Compute the linear part of a layer: Z = W . A + b.

    A: activations from the previous layer
    W: weights matrix
    b: biases vector

    Returns:
    Z: the input for the next activation function
    cache: tuple (A, W, b); stored to compute the backward propagation step
    """

    weighted_input = np.dot(W, A)
    Z = weighted_input + b

    return Z, (A, W, b)

In [139]:
def linear_activation_forward(A_prev, W, b, activation):
    """
    Run one layer forward: the linear step followed by the chosen activation.

    A_prev: activation from the previous layer
    W: weights matrix
    b: biases vector
    activation: activation function to use, either "sigmoid" or "relu"

    Returns:
    A: output of the activation function
    cache: tuple (linear_cache, activation_cache) stored to compute the backward propagation step

    Raises:
    ValueError: if activation is neither "sigmoid" nor "relu"
    """

    # Reject unknown names up front; otherwise A would never be assigned and
    # the caller would get a confusing UnboundLocalError.
    if activation not in ("sigmoid", "relu"):
        raise ValueError('activation must be "sigmoid" or "relu", got {!r}'.format(activation))

    Z, linear_cache = linear_forward(A_prev, W, b)

    if activation == "sigmoid":
        A, activation_cache = sigmoid(Z)
    else:
        A, activation_cache = relu(Z)

    cache = (linear_cache, activation_cache)

    return A, cache

## 6 - Cost function

In [140]:
def compute_cost(Yhat, Y, eps=1e-15):
    """
    Compute the mean binary cross-entropy between predictions and labels.

    Yhat: probabilities vector
    Y: labels vector; the number of examples is read from Y.shape[1]
    eps: probabilities are clipped to [eps, 1 - eps] before taking logs

    Returns:
    cost: cross-entropy cost
    """

    m = Y.shape[1]

    # A saturated prediction (exactly 0 or 1) would give log(0) = -inf, and the
    # 0 * -inf products inside the dot product would turn the cost into NaN.
    Yhat = np.clip(Yhat, eps, 1.0 - eps)

    cost = -(1.0 / m) * (np.dot(Y, np.log(Yhat).T) + np.dot(1 - Y, np.log(1 - Yhat).T))
    cost = np.squeeze(cost)

    return cost

## 7 - Backward propagation

In [141]:
def linear_backward(dZ, cache):
    """
    Backward step through the linear part of a layer.

    dZ: gradients with respect to the layer's linear output Z
    cache: tuple (A_prev, W, b) saved by the forward pass

    Returns:
    dA_prev: gradients with respect to the previous layer's activations
    dW: gradients of weights
    db: gradients of biases
    """

    # b is part of the cache but is not needed to compute any gradient.
    A_prev, W, _ = cache
    inv_m = 1.0 / A_prev.shape[1]

    dA_prev = np.dot(W.T, dZ)
    dW = inv_m * np.dot(dZ, A_prev.T)
    db = inv_m * np.sum(dZ, axis=1, keepdims=True)

    return dA_prev, dW, db

In [142]:
def linear_activation_backward(dA, cache, activation):
    """
    Run one layer backward: through the activation, then through the linear step.

    dA: gradients of activations
    cache: tuple (linear_cache, activation_cache)
    activation: activation function to use, either "sigmoid" or "relu"

    Returns:
    dA_prev: gradients of activations
    dW: gradients of weights
    db: gradients of biases

    Raises:
    ValueError: if activation is neither "sigmoid" nor "relu"
    """

    # Reject unknown names up front; otherwise dZ would never be assigned and
    # the caller would get a confusing UnboundLocalError.
    if activation not in ("sigmoid", "relu"):
        raise ValueError('activation must be "sigmoid" or "relu", got {!r}'.format(activation))

    linear_cache, activation_cache = cache

    if activation == "sigmoid":
        dZ = sigmoid_backward(dA, activation_cache)
    else:
        dZ = relu_backward(dA, activation_cache)

    dA_prev, dW, db = linear_backward(dZ, linear_cache)

    return dA_prev, dW, db

## 8 - Update parameters

In [143]:
def update_parameters(parameters, grads, learning_rate):
    """
    Apply one gradient descent step to every weight and bias.

    parameters: dictionary containing the parameters (W1, b1, W2, b2, ...)
    grads: dictionary containing the gradients (dW1, db1, dW2, db2, ...)
    learning_rate: the learning rate

    Returns:
    parameters: the same dictionary, with its entries replaced by the updated values
    """

    # Each layer contributes one "W" and one "b" entry.
    n_layers = len(parameters) // 2

    for layer in range(1, n_layers + 1):
        for prefix in ("W", "b"):
            key = prefix + str(layer)
            parameters[key] = parameters[key] - learning_rate * grads["d" + key]

    return parameters

## 9 - The model

Everything comes together here.

In [144]:
def model(X, Y, layers_dims, learning_rate = 0.0075, num_iterations = 3000):
    """
    Train a two-layer network (linear -> relu -> linear -> sigmoid) with batch
    gradient descent, printing the cost every 100 iterations.

    X: training examples
    Y: training labels
    layers_dims: layers dimensions, unpacked as (n_x, n_h, n_y)
    learning_rate: the learning rate
    num_iterations: number of iterations for gradient descent

    Returns:
    parameters: the parameters for the final iteration
    """

    (n_x, n_h, n_y) = layers_dims

    parameters = initialize_parameters(n_x, n_h, n_y)

    for i in range(num_iterations):
        W1, b1 = parameters["W1"], parameters["b1"]
        W2, b2 = parameters["W2"], parameters["b2"]

        # Forward pass through both layers.
        A1, cache1 = linear_activation_forward(X, W1, b1, "relu")
        A2, cache2 = linear_activation_forward(A1, W2, b2, "sigmoid")

        cost = compute_cost(A2, Y)

        # Output-layer gradient. For a sigmoid output with cross-entropy loss,
        # dA2 * sigmoid'(Z2) simplifies algebraically to A2 - Y. Using the
        # simplified form avoids dividing by A2 or (1 - A2), which yields
        # inf/NaN gradients once the sigmoid saturates to exactly 0 or 1.
        linear_cache2, _ = cache2
        dZ2 = A2 - Y

        dA1, dW2, db2 = linear_backward(dZ2, linear_cache2)
        # The gradient with respect to the input X is not needed.
        _, dW1, db1 = linear_activation_backward(dA1, cache1, "relu")

        grads = {"dW1": dW1,
                 "db1": db1,
                 "dW2": dW2,
                 "db2": db2}

        parameters = update_parameters(parameters, grads, learning_rate)

        if (i + 1) % 100 == 0:
            print("Cost after iteration {}: {}".format(i + 1, np.squeeze(cost)))

    return parameters

In [145]:
# Train on the training split with 4 input units, 8 hidden units and 1 output
# unit, using model()'s default learning rate and iteration count.
parameters = model(X_train, Y_train, layers_dims = (4, 8, 1))

Cost after iteration 100: 0.714546164559489
Cost after iteration 200: 0.6295192563324075
Cost after iteration 300: 0.4817191360592121
Cost after iteration 400: 0.3654151694209473
Cost after iteration 500: 0.30879554582777397
Cost after iteration 600: 0.2746478320170577
Cost after iteration 700: 0.24942371507419836
Cost after iteration 800: 0.22895475144693245
Cost after iteration 900: 0.21167534592644754
Cost after iteration 1000: 0.19680165642267408
Cost after iteration 1100: 0.18389434516197503
Cost after iteration 1200: 0.1725856156668431
Cost after iteration 1300: 0.1625935396793638
Cost after iteration 1400: 0.15367573372549234
Cost after iteration 1500: 0.14570517078620554
Cost after iteration 1600: 0.13854384604144165
Cost after iteration 1700: 0.13207505249750243
Cost after iteration 1800: 0.12620555556178253
Cost after iteration 1900: 0.12087263753740299
Cost after iteration 2000: 0.11602343644213224
Cost after iteration 2100: 0.11158771362264092
Cost after iteration 2200: 0.1

## 10 - Prediction

Use `X_test` and `Y_test` to make predictions

In [146]:
def predict(X, Y, parameters):
    """
    Run the trained network on X and score its thresholded outputs against Y.

    X: test examples
    Y: test labels
    parameters: parameters learned by gradient descent (W1, b1, W2, b2)

    Returns:
    percent: the percentage of examples classified correctly
    """

    W1, b1, W2, b2 = (parameters[key] for key in ("W1", "b1", "W2", "b2"))

    # Forward pass only; the caches are not needed for inference.
    hidden, _ = linear_activation_forward(X, W1, b1, "relu")
    probabilities, _ = linear_activation_forward(hidden, W2, b2, "sigmoid")

    n_examples = Y.shape[1]

    # An output above 0.5 is treated as the positive class.
    is_correct = (Y == (probabilities > 0.5))

    return np.sum(is_correct) / n_examples * 100

In [147]:
# Print the accuracy on the test split as a percentage with two decimal places.
print("{0:.2f}%".format(predict(X_test, Y_test, parameters)))

98.91%
