# Coding a neural network from scratch

In [18]:
import numpy as np

In [19]:
test = np.zeros((5,5))
test2 = np.array(([3,3],[3,3],[3,3]))

In [23]:
def sigmoid(Z):
    A = 1 / (1 + np.exp(-Z))
    activ_cache = Z
    return A, activ_cache

def relu(Z):
    A = np.maximum(0,Z)
    activ_cache = Z
    return A, activ_cache

In [24]:
def initialize(layers_dims): #ex argument : [12288, 5,5,5,1]
    
    L = len(layers_dims)
    
    params = {}
    
    for i in range(1, L):
        params['W' + str(i)] = np.random.randn(layers_dims[i], layers_dims[i-1]) #  / np.sqrt(layer_dims[l-1]) #*0.01
        params['b' + str(i)] = np.zeros((layers_dims[i]))
    
    return params

In [25]:
np.random.seed(1)
initialize([2,4,1])

{'W1': array([[ 1.62434536, -0.61175641],
        [-0.52817175, -1.07296862],
        [ 0.86540763, -2.3015387 ],
        [ 1.74481176, -0.7612069 ]]),
 'b1': array([0., 0., 0., 0.]),
 'W2': array([[ 0.3190391 , -0.24937038,  1.46210794, -2.06014071]]),
 'b2': array([0.])}

In [12]:
def linear_forward(A,W,b):
    linear_cache = A,W,b
    Z = np.dot(W,A) + b
    return Z, linear_cache

In [13]:
def linear_activation_forward(A_prev, W, b, activation): #type(activation) == string
    
    Z, linear_cache = linear_forward(A_prev, W, b)    
    
    if activation == "relu":
        A, activ_cache = relu(Z)
    elif activation == "sigmoid":
        A, activ_cache = sigmoid(Z)
    else:
        raise Exception(f'{activation} function is not known'.format(activation))

    cache = (linear_cache, activ_cache)
    
    return A, cache

In [111]:
def L_model(X, params):
    
    L = len(params // 2)
    caches = []
    
    A = X
    
    # L-1 premières couches
    for l in range(1, L):
        A_prev = A
        A, cache = linear_activation_forward(A_prev, params['W' + str(l)], params['b' + str(l)], activation='relu')
        caches.append(cache)
    
    # L-ième couche
    A, cache = linear_activation_forward(A, params['W' + str(L)], params['b' + str(L)], activation = 'sigmoid')
    caches.append(cache)
    
    Yhat = A
    
    return Yhat, caches
    # caches est une liste qui contient L tupples "cache", chacun contenant
    # 1) lin_cache (avec A_prev, matrice W de la couche, et b vecteur des biais de la couche)
    # 2) activ_cache (contenant juste Z de la couche)

In [112]:
#def loss(Y,Yhat):
#    loss = - (Y * np.log(Yhat)   +   (1 - Y) * np.log(1 - Yhat))
#    return loss

In [117]:
#def cost(X,Y, ):
#    total_loss = 0
#    
#    total += loss(Y)
#    
#    mycost = 1 / (nb exemples) * (somme des losses)
#    

In [121]:
def compute_cost(Yhat,Y):
    
    m = Y.shape[1]
    
    cost = (- 1 / m) * (np.dot(Y,np.log(Yhat).T) + np.dot((1 - Y),np.log(1 - Yhat).T))
    #moyenne des losses sous forme vectorielle. Renvoie un array.
    
    cost = np.squeeze(cost) #retire un 'niveau' de crochets
    
    return cost

In [2]:
def linear_backward(dZ, lin_cache):
        
    A_prev, W, b = lin_cache
    
    m = A_prev.shape[1]

    dW = 1 / m * np.dot(dZ, A_prev.T)
    db = 1 / m * np.sum(dZ, axis=1, keepdims=True)
    dA_prev = np.dot(W.T, dZ)
    
    return dA_prev, dW, db

In [9]:
def relu_backward(dA, Z):

    dA_copy = np.array(dA, copy=True)
    dZ = 0 if Z <= 0 else dA_copy  

    #other solutions:
    #dA_copy[Z <= 0] = 0
    #dZ = dA_copy * (Z >= 0) * 1

    return dZ

In [10]:
def sigmoid_backward(dA, Z):
    s, _ = sigmoid(Z)
    dZ = dA * (s * (1 -s))
    return dZ

In [26]:
def lin_activation_backward(dA, cache, activation):
    
    lin_cache, activ_cache = cache
    Z = activ_cache
    
    if activation == "relu":
        dZ = relu_backward(dA, Z)

    if activation == "sigmoid":
        dZ = sigmoid_backward(dA, Z)

    dA_prev, dW, db = linear_backward(dZ, lin_cache)
    
    return dA_prev, dW, db

In [35]:
def L_model_backward(Yhat, Y, caches):
    
    grads = {}
    L = len(caches)
    m = Y.shape[1]
    print("Y.shape", Y.shape)
    print("Yhat.shape", Yhat.shape)
    #Y = Y.reshape(AL.shape)
    
    
    # dérivée de la cost function par rapport à Yhat
    dYhat = - (np.divide(Y, Yhat) - np.divide(1-Y, (1-Yhat))) 
    
    # descente de gradient pour les paramêtres de la dernière couche
    current_cache = caches[L-1]
    grads["dA" + str(L-1)], grads["dW" + str(L)], grads["db" + str(L)] = lin_activation_backward(dYhat, current_cache, "sigmoid")
    
    # descente de gradient pour le reste des paramêtres des couches intermédiaires
    for l in reversed(range(L-1)):
        current_cache = caches[l]
        grads["dA" + str(l)], grads["dW" + str(l+1)], grads["db" + str(l+1)] = lin_activation_backward(grads["dA" + str(l+1)], current_cache, "relu")
        
    return grads

In [38]:
def update_params(params, grads, learning_rate=0.003):
    
    L = len(params) // 2
    
    for l in range(1, L +1):
        params['W' + str(l)] = params['W' + str(l)] - learning_rate * grads['dW' + str(l)]
        params['b' + str(l)] = params['b' + str(l)] - learning_rate * grads['db' + str(l)]     
    
    return params

In [58]:
def L_layer_model(X, Y, layer_dims, num_iterations=3000, learning_rate=0.003, print_cost=True):
    
    assert(layer_dims[0] == X.shape[0])

    np.random.seed(1)
    
    params = initialize(layer_dims) #le premier élément de layer_dims est le nombre de features
    costs = []
    
    for it in range(1,num_iterations + 1):
        
        Yhat, caches = L_model_backward(X, params) #forward pass
        
        cost = compute_cost(Yhat, Y)
        
        if print_cost and it % 100 == 0:
            print(cost)
            costs.append(cost)
        #ongoing reflexion : on modifie notre neurone après avoir évalué son coût (/perf), c'est moyen, non ?
        grads = L_model_backward(Yhat, Y, caches)
        
        update_params(params, grads, learning_rate=1e-3)
        
    return params