# Coding a neural network from scratch

In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
def sigmoid(Z):
    """Apply the logistic function element-wise.

    Returns a tuple (A, activ_cache) where A = 1 / (1 + exp(-Z)) and
    activ_cache is the input Z itself, handed back so the caller can store it.
    """
    denominator = 1 + np.exp(-Z)
    return 1 / denominator, Z

def relu(Z):
    """Apply the rectified linear unit element-wise.

    Returns a tuple (A, activ_cache) where A = max(0, Z) and activ_cache is
    the input Z itself, handed back so the caller can store it.
    """
    rectified = np.maximum(0, Z)
    return rectified, Z

In [3]:
def initialize(layers_dims):
    """Create the initial weights and biases of every layer.

    layers_dims lists the layer sizes, input first: for example
    [12288, 5, 5, 5, 1] means 12288 input features followed by layers of
    5, 5, 5 and 1 units.

    Returns a dict holding, for each layer i >= 1:
    'W<i>' -- array of shape (layers_dims[i], layers_dims[i-1]) drawn from a
              standard normal distribution (no rescaling is applied)
    'b<i>' -- array of zeros of shape (layers_dims[i], 1)
    """
    params = {}

    # Pair each layer size with the size of the layer feeding it; layers are
    # numbered from 1 because index 0 is the input.
    size_pairs = zip(layers_dims, layers_dims[1:])
    for layer, (n_inputs, n_units) in enumerate(size_pairs, start=1):
        params['W' + str(layer)] = np.random.randn(n_units, n_inputs)
        params['b' + str(layer)] = np.zeros((n_units, 1))

    return params

In [4]:
def linear_activation_forward(A_prev, W, b, activation):
    """Run one layer forward: the linear step followed by its activation.

    Arguments:
    A_prev -- activations of the previous layer (or the input data);
              A_prev.shape[0] must equal W.shape[1]
    W -- weight matrix of this layer
    b -- bias of this layer, added to np.dot(W, A_prev)
    activation -- name of the activation function, as a string:
                  "relu" or "sigmoid"

    Returns:
    A -- output of the activation function applied to Z = W . A_prev + b
    cache -- tuple (linear_cache, activ_cache) where linear_cache is
             (A_prev, W, b) and activ_cache is Z

    Raises:
    ValueError -- if activation is neither "relu" nor "sigmoid"
    """
    linear_cache = (A_prev, W, b)
    assert W.shape[1] == A_prev.shape[0]
    Z = np.dot(W, A_prev) + b

    if activation == "relu":
        A, activ_cache = relu(Z)
    elif activation == "sigmoid":
        A, activ_cache = sigmoid(Z)
    else:
        # The placeholder is named, so the value must be passed by keyword;
        # passing it positionally made format() itself fail with a KeyError.
        raise ValueError('{activation} function is not known'.format(activation=activation))

    cache = (linear_cache, activ_cache)

    return A, cache

In [5]:
def L_model(X, params, activations, final_activ):
    """Forward pass through the whole network.

    Arguments:
    X -- input data, fed to layer 1
    params -- dict holding 'W<l>' and 'b<l>' for every layer l = 1..L
    activations -- activation name used for layers 1..L-1
    final_activ -- activation name used for the last layer L

    Returns:
    Yhat -- output of the last layer
    caches -- list of L "cache" tuples, one per layer, each containing
              1) lin_cache: A_prev (activations of layer l-1), matrix W and
                 vector b of layer l
              2) activ_cache: just Z for layer l
    """
    def run_layer(inputs, index, activ_name):
        # One layer: look up its W and b, then apply linear step + activation.
        suffix = str(index)
        return linear_activation_forward(inputs, params['W' + suffix], params['b' + suffix], activ_name)

    last = len(params) // 2  # params stores two entries (W and b) per layer
    caches = []
    current = X

    # First L-1 layers
    for hidden in range(1, last):
        current, layer_cache = run_layer(current, hidden, activations)
        caches.append(layer_cache)

    # L-th (output) layer
    Yhat, layer_cache = run_layer(current, last, final_activ)
    caches.append(layer_cache)

    return Yhat, caches

In [6]:
def compute_cost(Yhat, Y, eps=1e-15):
    """Compute the average binary cross-entropy between predictions and labels.

    Arguments:
    Yhat -- predicted probabilities, same shape as Y
    Y -- true labels (0 or 1); Y.shape[1] is taken as the number of examples.
         The final shape check implies a single row, i.e. shape (1, m).
    eps -- predictions are clipped to [eps, 1 - eps] before taking logs

    Returns:
    cost -- the cost as a 0-dimensional numpy array
    """
    m = Y.shape[1]

    # A saturated output (exactly 0 or 1) would give log(0) = -inf, and the
    # matching 0 * -inf term turns the whole cost into nan. Clipping keeps
    # every log finite and leaves non-saturated predictions untouched.
    Yhat = np.clip(Yhat, eps, 1 - eps)

    # average of losses, computed through vector multiplication. Returns an array.
    cost = (- 1. / m) * (np.dot(Y, np.log(Yhat).T) + np.dot((1 - Y), np.log(1 - Yhat).T))

    # np.squeeze removes the remaining levels of brackets
    cost = np.squeeze(cost)
    assert cost.shape == ()

    return cost

In [7]:
def linear_backward(dZ, lin_cache):
    """Back-propagate through the linear step of one layer.

    Arguments:
    dZ -- gradient of the cost with respect to this layer's Z
    lin_cache -- tuple (A_prev, W, b) stored during the forward pass

    Returns:
    dA_prev -- W.T . dZ
    dW -- (1 / m) * dZ . A_prev.T, with m = A_prev.shape[1]
    db -- (1 / m) * sum of dZ along axis 1, keeping the column shape
    """
    prev_activations, weights, _bias = lin_cache
    inv_m = 1 / prev_activations.shape[1]

    grad_weights = inv_m * np.dot(dZ, prev_activations.T)
    grad_bias = inv_m * np.sum(dZ, axis=1, keepdims=True)
    grad_prev = np.dot(weights.T, dZ)

    return grad_prev, grad_weights, grad_bias

In [8]:
def relu_backward(dA, Z):
    """Back-propagate a gradient through a ReLU activation.

    Returns a copy of dA in which every entry whose pre-activation Z is
    <= 0 has been set to 0; dA itself is left unmodified.
    """
    assert len(Z) == len(dA)
    inactive = Z <= 0  # positions where the ReLU output was clamped to 0
    dZ = np.array(dA, copy=True)
    dZ[inactive] = 0
    return dZ

def sigmoid_backward(dA, Z):
    """Back-propagate a gradient through a sigmoid activation.

    Multiplies dA element-wise by the sigmoid derivative s * (1 - s), where
    s = sigmoid(Z).
    """
    activation, _unused_cache = sigmoid(Z)
    local_slope = activation * (1 - activation)
    return dA * local_slope

In [9]:
def lin_activation_backward(dA, cache, activation):
    """Back-propagate through one layer: activation first, then the linear step.

    Arguments:
    dA -- gradient of the cost with respect to this layer's output
    cache -- tuple (lin_cache, activ_cache) stored during the forward pass;
             activ_cache is this layer's Z
    activation -- name of the layer's activation function, as a string:
                  "relu" or "sigmoid"

    Returns:
    dA_prev, dW, db -- the gradients returned by linear_backward for this layer

    Raises:
    ValueError -- if activation is neither "relu" nor "sigmoid"
    """
    lin_cache, activ_cache = cache
    Z = activ_cache

    if activation == "relu":
        dZ = relu_backward(dA, Z)
    elif activation == "sigmoid":
        dZ = sigmoid_backward(dA, Z)
    else:
        # The placeholder is named, so the value must be passed by keyword;
        # passing it positionally made format() itself fail with a KeyError.
        raise ValueError('{activation} function is not known'.format(activation=activation))

    dA_prev, dW, db = linear_backward(dZ, lin_cache)

    return dA_prev, dW, db

In [10]:
def L_model_backward(Yhat, Y, caches, activations, final_activ):
    """Backward pass through the whole network.

    Arguments:
    Yhat -- predictions produced by the forward pass
    Y -- true labels, same shape as Yhat
    caches -- list of per-layer caches produced by the forward pass
              (caches[l] belongs to layer l + 1)
    activations -- activation name used for layers 1..L-1
    final_activ -- activation name used for the last layer L

    Returns:
    grads -- dict with 'dW<l>' and 'db<l>' for every layer l = 1..L, and
             'dA<l>' for l = 0..L-1 (the gradient handed back to layer l)
    """
    grads = {}
    L = len(caches)

    # Keep predictions strictly inside (0, 1): a saturated output (exactly 0
    # or 1) would produce 0/0 or x/0 below and spread nan/inf into every
    # gradient. Non-saturated predictions are left untouched.
    eps = 1e-15
    Yhat = np.clip(Yhat, eps, 1 - eps)

    # grad of cost function relative to Yhat
    dYhat = - (np.divide(Y, Yhat) - np.divide(1 - Y, 1 - Yhat))

    # grads for the last layer L (its cache is stored at index L-1)
    current_cache = caches[L-1]
    grads["dA" + str(L-1)], grads["dW" + str(L)], grads["db" + str(L)] = lin_activation_backward(dYhat, current_cache, final_activ)

    # grads for layers L-1 down to 1 (cache indices L-2 down to 0)
    for l in reversed(range(L-1)):
        current_cache = caches[l]
        grads["dA" + str(l)], grads["dW" + str(l+1)], grads["db" + str(l+1)] = lin_activation_backward(grads["dA" + str(l+1)], current_cache, activations)

    return grads

In [11]:
def update_params(params, grads, learning_rate):
    """Apply one gradient-descent step to every layer.

    For each layer l, 'W<l>' and 'b<l>' in params are replaced by their value
    minus learning_rate times grads['dW<l>'] and grads['db<l>'] respectively.
    The params dict is updated in place and also returned.
    """
    n_layers = len(params) // 2  # two entries (W and b) per layer

    for layer in range(1, n_layers + 1):
        for kind in ('W', 'b'):
            key = kind + str(layer)
            # Rebind the entry to a new array; the old array is not modified.
            params[key] = params[key] - learning_rate * grads['d' + key]

    return params

In [12]:
def L_layer_model(X, Y, layer_dims, learning_rate, activations, final_activ, num_iterations, print_cost=True, show_plot=False):
    """Train an L-layer network with batch gradient descent.

    Arguments:
    X -- training inputs; X.shape[0] must equal layer_dims[0]
    Y -- training labels
    layer_dims -- list of layer sizes, layer_dims[0] being the number of features
    learning_rate -- step size of each parameter update
    activations -- activation name used for layers 1..L-1
    final_activ -- activation name used for the last layer
    num_iterations -- number of gradient-descent iterations to run
    print_cost -- if True, print the cost every 1000 iterations
    show_plot -- if True, plot the costs recorded every 1000 iterations

    Returns:
    params -- dict of trained parameters ('W<l>' and 'b<l>' for every layer)
    """
    assert (layer_dims[0] == X.shape[0])

    params = initialize(layer_dims)  # reminder: layer_dims[0] = number of features
    costs = []

    for it in range(1, num_iterations + 1):
        Yhat, caches = L_model(X, params, activations, final_activ)  # forward pass

        # Record the cost every 1000 iterations whether or not it is printed,
        # so that show_plot has data even when print_cost is False. The cost
        # is only needed here, so it is not computed on other iterations.
        if it % 1000 == 0:
            cost = compute_cost(Yhat, Y)
            costs.append(cost)
            if print_cost:
                print("Cost after {} iterations : {}".format(it, cost))

        grads = L_model_backward(Yhat, Y, caches, activations, final_activ)
        params = update_params(params, grads, learning_rate)

    # optional visualization
    if show_plot:
        plt.plot(np.squeeze(costs))
        plt.ylabel('cost')
        plt.xlabel('iterations (per 1000s)')
        plt.title("Learning rate =" + str(learning_rate))
        plt.show()

    return params

### Creating a dataset

We generate random binary pairs (X) and assign to Y the result of XOR-ing the two bits of each pair, as follows:

In [13]:
def new_xor_dataset(nbr_of_examples):
    """Draw a random XOR dataset.

    Returns a tuple (X, Y):
    X -- integer array of shape (2, nbr_of_examples) with random 0/1 entries
    Y -- integer array of shape (1, nbr_of_examples); each entry is 1 when
         exactly one of the two bits in the matching column of X is 1,
         and 0 otherwise
    """
    X = np.random.randint(0, 2, size=(2, nbr_of_examples))

    # Slicing (rather than indexing) keeps the row axis, so the result is
    # already shaped (1, nbr_of_examples).
    first_bits, second_bits = X[0:1], X[1:2]
    Y = first_bits ^ second_bits

    return X, Y

Example:

In [14]:
# Draw a small sample of 5 examples to inspect the generated inputs and labels.
new_xor_dataset(5)

(array([[1, 1, 1, 1, 0],
        [0, 0, 0, 0, 0]]), array([[1, 1, 1, 1, 0]]))

### Running the model

#### Hyperparameters

In [15]:
num_iterations = 5000  # number of gradient-descent iterations per training run
layer_dims = [2, 2, 1]  # layer sizes: 2 input features, then layers of 2 and 1 units
learning_rate = 2  # step size of each parameter update
activations="relu"  # activation of every layer except the last
final_activ="sigmoid"  # activation of the last layer

First, let's run the training several times to see how reliable it is:

In [16]:
# Train five times, each run on a freshly drawn dataset and freshly
# initialized weights, to see how much the logged cost varies between runs.
for i in range(5):
    print(">>>>>>>", i + 1, "<<<<<<<")  # run banner, numbered from 1
    X_train, Y_train = new_xor_dataset(5000)
    trained_model = L_layer_model(X_train, Y_train, layer_dims, learning_rate, activations, final_activ, num_iterations)
    print()  # blank line between runs


>>>>>>> 1 <<<<<<<
Cost after 1000 iterations : 0.471853989867292
Cost after 2000 iterations : 0.4718399471857046
Cost after 3000 iterations : 0.4718467082174015
Cost after 4000 iterations : 0.47183814231023535
Cost after 5000 iterations : 0.4718355379513478

>>>>>>> 2 <<<<<<<
Cost after 1000 iterations : 0.6931468605599116
Cost after 2000 iterations : 0.6931468605599116
Cost after 3000 iterations : 0.6931468605599116
Cost after 4000 iterations : 0.6931468605599116
Cost after 5000 iterations : 0.6931468605599116

>>>>>>> 3 <<<<<<<
Cost after 1000 iterations : 0.4840477562801589
Cost after 2000 iterations : 0.4839784840008317
Cost after 3000 iterations : 0.4840100357392776
Cost after 4000 iterations : 0.48397719131174677
Cost after 5000 iterations : 0.48397400633170246

>>>>>>> 4 <<<<<<<
Cost after 1000 iterations : 0.4742658686520668
Cost after 2000 iterations : 0.47423700257038404
Cost after 3000 iterations : 0.47423245884078297
Cost after 4000 iterations : 0.47425438802547126
Cost aft

Training seems pretty unreliable: the final cost varies a lot from one run to the next. This may have to do with the initialization of our weights; we'll have to look into this.

In the meantime, let's assess the performance of our model in terms of accuracy.

#### Getting parameters

In [17]:
# Train once more on 5000 fresh examples and keep the returned parameters.
X_train, Y_train = new_xor_dataset(5000)
trained_model = L_layer_model(X_train, Y_train, layer_dims, learning_rate, activations, final_activ, num_iterations)


Cost after 1000 iterations : 0.0007405144717997071
Cost after 2000 iterations : 0.00035292978326913004
Cost after 3000 iterations : 0.00023039029075513874
Cost after 4000 iterations : 0.00017055699625922162
Cost after 5000 iterations : 0.00013526008651966755


#### 'predict' function and appraisal of performance with these weights

In [18]:
def test_pred(X, Y, params, hidden_activ=None, output_activ=None):
    """Predict labels for X with a trained model and print the accuracy.

    Arguments:
    X -- data set of examples you would like to label, one example per column
    Y -- true labels for X; row 0 is compared against the predictions
    params -- parameters of the trained model
    hidden_activ -- activation name of every layer except the last; when None,
                    the module-level `activations` hyperparameter is used
    output_activ -- activation name of the last layer; when None, the
                    module-level `final_activ` hyperparameter is used

    Returns:
    Y -- the labels that were passed in, unchanged
    p -- 0/1 predictions for X, as a float array of shape (1, m)
    """
    # Fall back to the notebook-level hyperparameters so existing three-argument
    # calls keep working, while letting callers state the activations explicitly.
    if hidden_activ is None:
        hidden_activ = activations
    if output_activ is None:
        output_activ = final_activ

    m = X.shape[1]

    # Forward propagation
    probas, _ = L_model(X, params, hidden_activ, output_activ)

    # convert probas to 0/1 predictions: 1 where the first output row is
    # strictly above 0.5, else 0
    p = (probas[:1] > 0.5).astype(float)

    print("Accuracy: " + str(np.sum((p[0] == Y[0])) / m))

    return Y, p

In [19]:
# Evaluate the trained parameters on 250 freshly generated examples.
X_test, Y_test = new_xor_dataset(250)
test_pred(X_test, Y_test, trained_model)

Accuracy: 1.0


(array([[0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0,
         0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0,
         1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1,
         0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0,
         0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
         1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0,
         1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1,
         1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0,
         0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0,
         0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0,
         0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1,
         0, 0, 0, 0, 1, 0, 1, 0]]),
 array([[0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 0., 0., 0.,
         0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0

### Next phases

- Make the dataset more complex: test with a dataset where the label is 1 in [0:20] and [80:100]