In [33]:
import numpy as np
import h5py
import matplotlib.pyplot as plt
from testCases_v4a import *
from dnn_utils_v2 import sigmoid, sigmoid_backward, relu, relu_backward

%matplotlib inline
plt.rcParams['figure.figsize'] = (9.0, 7.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

%load_ext autoreload
%autoreload 2

np.random.seed(1)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [34]:
def initialize_shallow_NN(n_x,n_h,n_y):
    '''
        This function I used to initialize a shallow Nerual Network with just:
        input layer & 1  hidden layer & output layer as n_x,n_h,n_y 
        and n_x,n_h,n_y  for number of nodes or features in each layer
        also we use it to initialize our parameters using these params
        
        function return
            This function will return a python dictionary that has a key of param and its value
    '''
    '''
    Argument:
    n_x -- size of the input layer
    n_h -- size of the hidden layer
    n_y -- size of the output layer
    
    Returns:
    parameters -- python dictionary containing your parameters:
                    W1 -- weight matrix of shape (n_h, n_x)
                    b1 -- bias vector of shape (n_h, 1)
                    W2 -- weight matrix of shape (n_y, n_h)
                    b2 -- bias vector of shape (n_y, 1)
    '''
    W1 = np.random.randn(n_h,n_x) * 0.01 # multiple each value of .01
    b1 = np.zeros((n_h,1))
    W2 = np.random.randn(n_y,n_h) * 0.01
    b2 = np.zeros((n_y,1))
    
    parameters = {
        'W1' : W1,
        'b1' : b1,
        'W2' : W2,
        'b2' : b2
    }
    return parameters

In [35]:
parameters = initialize_shallow_NN(3,2,1)
print("W1 = " + str(parameters["W1"].shape))
print("b1 = " + str(parameters["b1"].shape))
print("W2 = " + str(parameters["W2"].shape))
print("b2 = " + str(parameters["b2"].shape))

W1 = (2, 3)
b1 = (2, 1)
W2 = (1, 2)
b2 = (1, 1)


In [36]:
def initialize_deep_NN(layer_dims):
    '''
        This another function I used to initialize a deep Nerual Network with:
        layer_dims, its a python list which contain size of each layer 
        and include the input layer so we need to start our loop from layer 1 which first hidden layer
        that help you create a multiple NN layers depends on your size of each layer given to layer_dims
    '''
    
    '''
    Arguments:
    layer_dims -- python array (list) containing the dimensions of each layer in our network
    
    Returns:
    parameters -- python dictionary containing your parameters "W1", "b1", ..., "WL", "bL":
                    Wl -- weight matrix of shape (layer_dims[l], layer_dims[l-1])
    '''
    
    L = len(layer_dims) # first we get length of the layers of out NN 
    parameters = {}
    for l in range(1,L):
        '''
             start from layer 1 because actually we do not get weight for input layer 
             we use it to get weights of first hidden layer 
             and then use size of first hidden layer to get scond hidden layer weights and so on to output layer
             general role for that 
             "Weight of any Layer L = (size of its nodes, size of previous layer nodes)"
             "bias of any Layer L = (size of its nodes, 1)"
        '''
        parameters['W' + str(l)] = np.random.randn(layer_dims[l],layer_dims[l-1]) * .01
        parameters['b' + str(l)] = np.zeros((layer_dims[l],1))
        
    
    return parameters
        
    

In [37]:
# you can give it as you need multiple layers but then you should loop to print all w and b
parameters = initialize_deep_NN([5,4,3])
print("W1 = " + str(parameters["W1"]))
print("b1 = " + str(parameters["b1"]))
print("W2 = " + str(parameters["W2"]))
print("b2 = " + str(parameters["b2"]))

W1 = [[ 0.00319039 -0.0024937   0.01462108 -0.02060141 -0.00322417]
 [-0.00384054  0.01133769 -0.01099891 -0.00172428 -0.00877858]
 [ 0.00042214  0.00582815 -0.01100619  0.01144724  0.00901591]
 [ 0.00502494  0.00900856 -0.00683728 -0.0012289  -0.00935769]]
b1 = [[0.]
 [0.]
 [0.]
 [0.]]
W2 = [[-0.00267888  0.00530355 -0.00691661 -0.00396754]
 [-0.00687173 -0.00845206 -0.00671246 -0.00012665]
 [-0.0111731   0.00234416  0.01659802  0.00742044]]
b2 = [[0.]
 [0.]
 [0.]]


In [38]:
def linear_forward(A,W,b):
    '''
        This function used to calculate Z which is need:
        A_previous we send as A, and W of the layer and b of the layer
        then all of these values saved and returned
    '''
    """
    Implement the linear part of a layer's forward propagation.

    Arguments:
    A -- activations from previous layer (or input data): (size of previous layer, number of examples)
    W -- weights matrix: numpy array of shape (size of current layer, size of previous layer)
    b -- bias vector, numpy array of shape (size of the current layer, 1)

    Returns:
    Z -- the input of the activation function, also called pre-activation parameter 
    cache -- a python tuple containing "A", "W" and "b" ; stored for computing the backward pass efficiently
    """
    
    Z     =     np.dot(W,A) +b
    cache = (A,W,b)
#     print(Z.shape)
#     print(A.shape)
#     print(W.shape)
#     print(b.shape)
    return Z, cache

In [39]:
A = np.random.randn(3,2)
W = np.random.randn(1,3)
b = np.random.randn(1,1)
Z, linear_cache = linear_forward(A, W, b)
print("Z = " + str(Z))
print("linear_cache = " + str(linear_cache))

Z = [[-0.98253931  3.92578674]]
linear_cache = (array([[-0.19183555, -0.88762896],
       [-0.74715829,  1.6924546 ],
       [ 0.05080775, -0.63699565]]), array([[0.19091548, 2.10025514, 0.12015895]]), array([[0.61720311]]))


In [40]:
def linear_activation_forward(A_previous,W,b, activation_fun_name):
    '''
        This function used to return the activation value A for specific layer and also save these values
        all of saved values in cache we use it for backward
        also the function depend on what is activation function you need to use for this layer
        because we have multiple activation function we can use like Relu, tanch and sigmoid
        but actually most use of relu and sigmoid for only output layer
    '''
    """
    Implement the forward propagation for the LINEAR->ACTIVATION layer

    Arguments:
    A_prev -- activations from previous layer (or input data): (size of previous layer, number of examples)
    W -- weights matrix: numpy array of shape (size of current layer, size of previous layer)
    b -- bias vector, numpy array of shape (size of the current layer, 1)
    activation -- the activation to be used in this layer, stored as a text string: "sigmoid" or "relu"

    Returns:
    A -- the output of the activation function, also called the post-activation value 
    cache -- a python tuple containing "linear_cache" and "activation_cache";
             stored for computing the backward pass efficiently
    """
    if activation_fun_name == "sigmoid":
        Z, linear_cache = linear_forward(A_previous, W, b) # function we create above
        A, activation_cache = sigmoid(Z) # return activation cache which save value of Z and Activation of the layer
        print(A.shape)
    elif activation_fun_name == "relu":
        Z, linear_cache = linear_forward(A_previous, W, b)
        A, activation_cache = relu(Z)
        print(A.shape)
        print(activation_cache.shape)
    
    '''
        now the variable cache contain the the Z in activation_cache value
        and contain A,W,b for this layer, A here is A_prev that used with the w,b paramter to get
        activation of this layer
    '''
    cache = (linear_cache, activation_cache)
    
    return A, cache # activation of the layer we in

In [41]:

A_prev = np.random.randn(3,2)
W = np.random.randn(1,3)
b = np.random.randn(1,1)
A, linear_activation_cache = linear_activation_forward(A_prev, W, b, "sigmoid")
print("With sigmoid: A = " + str(A))

A, linear_activation_cache = linear_activation_forward(A_prev, W, b, "relu")
print("With ReLU: A = " + str(A))

(1, 2)
With sigmoid: A = [[0.50338007 0.60631959]]
(1, 2)
(1, 2)
With ReLU: A = [[0.01352047 0.4318678 ]]


In [42]:
def Linear_forward_model_shallow_or_deep_NN(X, parameters):
    '''
        here is our forward process we will make just we need X which the input layer
        and parameters for each hidden layer and output layer and we get length of NN layer via len(parameters)//2
        because parameters has all w,b of all layers and for that layers number is len(parameters)//2
        at same time we need to save all layers paramters we use so now we will use:
        caches to store each cache for each layer and cache it self contain 2 values
        1- linear_cache which has three values A_previous,W,b
        2- activation_cache which has the Z of the  layer we used linear_cache variables to calculate it
    '''
    
    """
    Implement forward propagation for the [LINEAR->RELU]*(L-1)->LINEAR->SIGMOID computation
    
    Arguments:
    X -- data, numpy array of shape (input size, number of examples)
    parameters -- output of initialize_parameters_deep()
    
    Returns:
    AL -- last post-activation value
    caches -- list of caches containing:
                every cache of linear_activation_forward() (there are L-1 of them, indexed from 0 to L-1)
    """
        
    caches = []
    L      = len(parameters) // 2
    A = X # its means input layer we call it as A of 0
    for l in range(1,L):
        '''
            actually this loop will go all only hidden layer because when l is == L will terminated
            then we will calculate output layer after loop using sigmoid function
        '''
        A_prev = A
        A,cache = linear_activation_forward(A_prev, parameters['W' + str(l)],
                                            parameters['b' + str(l)] , "relu")
        caches.append(cache)
        
    # AL means output layer
    AL,cache = linear_activation_forward(A, parameters['W' + str(L)],
                                            parameters['b' + str(L)] , "sigmoid")
    caches.append(cache)
    '''
        now we have all layers actavation variable for each layer in caches
        variable: A_prev,W,b,Z for each layer  
        and last layer activation AL
    '''
    return AL, caches 

In [43]:
X = np.random.randn(4,2)
W1 = np.random.randn(3,4)
b1 = np.random.randn(3,1)
W2 = np.random.randn(1,3)
b2 = np.random.randn(1,1)
parameters = {"W1": W1,
              "b1": b1,
              "W2": W2,
              "b2": b2}

AL, caches = Linear_forward_model_shallow_or_deep_NN(X, parameters)
print("AL = " + str(AL))
print("Length of caches list = " + str(len(caches)))

(3, 2)
(3, 2)
(1, 2)
AL = [[0.38830879 0.58540103]]
Length of caches list = 2


In [44]:
def forward_cost_function(AL, Y):
    '''
        At the end of forward process:
        we calculate the cost function for last layer then we process to backward 
    '''
    
    m = Y.shape[1] # get number of training examples
    cost = (-1/m) * np.sum((np.dot(Y, np.log(AL).T) + np.dot((1-Y), np.log(1-AL).T)))
    return cost

In [101]:
Y = np.asarray([[1, 1, 0]])
AL = np.array([[.8,.9,0.4]])

print("cost = " + str(forward_cost_function(AL, Y)))

cost = 0.2797765635793422


In [102]:
def linear_backward(dZ, cache):
    """
    Implement the linear portion of backward propagation for a single layer (layer l)

    Arguments:
    dZ -- Gradient of the cost with respect to the linear output (of current layer l)
    cache -- tuple of values (A_prev, W, b) coming from the forward propagation in the current layer

    Returns:
    dA_prev -- Gradient of the cost with respect to the activation (of the previous layer l-1), same shape as A_prev
    dW -- Gradient of the cost with respect to W (current layer l), same shape as W
    db -- Gradient of the cost with respect to b (current layer l), same shape as b
    """
    
    A_prev, W, b = cache
    m = A_prev.shape[1]
    dW = (1/m) * np.dot(dZ, A_prev.T)
    db = (1/m) * np.sum(dZ, axis=1,keepdims=True)
    dA_prev = np.dot(W.T,dZ)

    return dA_prev, dW, db

In [103]:
dZ = np.random.randn(3,4)
A = np.random.randn(5,4)
W = np.random.randn(3,5)
b = np.random.randn(3,1)
linear_cache = (A, W, b)
dA_prev, dW, db = linear_backward(dZ, linear_cache)
print ("dA_prev = "+ str(dA_prev))
print ("dW = " + str(dW))
print ("db = " + str(db))

dA_prev = [[ 0.77669034 -1.88132907 -0.65140192 -0.55878017]
 [ 1.33952232  0.40764844 -0.75963566  0.10429644]
 [-2.82534175 -1.52885908  1.30192054 -1.12570307]
 [-0.64062238  0.11938664 -0.25193898 -1.92267442]
 [-0.28589563  0.29024029  0.01835481 -0.46303988]]
dW = [[-0.20686586 -0.42776967  0.10987313  0.00932069 -0.24162395]
 [-0.20670748  0.28502228  0.51533545  0.11976457 -0.49756057]
 [ 0.74681502  0.25294116  0.08213842  0.1887332  -0.57425581]]
db = [[-0.2410353 ]
 [-0.46228445]
 [-0.58997865]]


In [104]:
def linear_activation_backward(dA, cache, activation_fun_name):
    """
    Implement the backward propagation for the LINEAR->ACTIVATION layer.

    Arguments:
    dA -- post-activation gradient for current layer l 
    cache -- tuple of values (linear_cache, activation_cache) we store for computing backward propagation efficiently
    activation -- the activation to be used in this layer, stored as a text string: "sigmoid" or "relu"

    Returns:
    dA_prev -- Gradient of the cost with respect to the activation (of the previous layer l-1), same shape as A_prev
    dW -- Gradient of the cost with respect to W (current layer l), same shape as W
    db -- Gradient of the cost with respect to b (current layer l), same shape as b
    """
    linear_cache, activation_cache = cache
    if activation_fun_name == "sigmoid":
        dZ = sigmoid_backward(dA, activation_cache)
        dA_prev, dW, db = linear_backward(dZ, linear_cache)
    elif activation_fun_name == "relu":
        dZ = relu_backward(dA,activation_cache)
        dA_prev, dW, db = linear_backward(dZ, linear_cache)
        
    return dA_prev, dW, db

In [105]:
dAL = np.random.randn(1,2)
A = np.random.randn(3,2)
W = np.random.randn(1,3)
b = np.random.randn(1,1)
Z = np.random.randn(1,2)
linear_cache = (A, W, b)
activation_cache = Z
linear_activation_cache = (linear_cache, activation_cache)
dA_prev, dW, db = linear_activation_backward(dAL, linear_activation_cache,"sigmoid")
print ("sigmoid:")
print ("dA_prev = "+ str(dA_prev))
print ("dW = " + str(dW))
print ("db = " + str(db) + "\n")
print("=====================================")
dA_prev, dW, db = linear_activation_backward(dAL, linear_activation_cache, "relu")
print ("relu:")
print ("dA_prev = "+ str(dA_prev))
print ("dW = " + str(dW))
print ("db = " + str(db))

sigmoid:
dA_prev = [[0.20534043 0.70022212]
 [0.04860888 0.16575893]
 [0.02522553 0.08602045]]
dW = [[-0.15884885 -0.24187055  0.01191697]]
db = [[-0.17993903]]

relu:
dA_prev = [[1.58386634 2.80174356]
 [0.37493815 0.66323812]
 [0.19457382 0.34418684]]
dW = [[-0.62537927 -0.96360787  0.1650769 ]]
db = [[-0.87143885]]


In [107]:
def Linear_backward_model_shallow_or_deep_NN(AL, Y, caches):
    """
    Implement the backward propagation for the [LINEAR->RELU] * (L-1) -> LINEAR -> SIGMOID group
    
    Arguments:
    AL -- probability vector, output of the forward propagation (L_model_forward())
    Y -- true "label" vector (containing 0 if non-cat, 1 if cat)
    caches -- list of caches containing:
                every cache of linear_activation_forward() with "relu" (it's caches[l], for l in range(L-1) i.e l = 0...L-2)
                the cache of linear_activation_forward() with "sigmoid" (it's caches[L-1])
    
    Returns:
    grads -- A dictionary with the gradients
             grads["dA" + str(l)] = ... 
             grads["dW" + str(l)] = ...
             grads["db" + str(l)] = ... 
    """
    L = len(caches)
    Y = Y.reshape(AL.shape)
    m = AL.shape[1]
    grades = {}
    dAL = -(np.divide(Y,AL)) - (np.divide((1-Y), (1-AL)))
    dA_prev, dW, db = linear_activation_backward(dAL, caches[L-1], "sigmoid")
    grades['dA' + str(L-1)], grades['dW' + str(L)], grades['db' + str(L)]  = dA_prev, dW, db
    for l in reversed(range(L-1)):
        cuurent_cache = caches[l]
        dA_prev, dW, db = linear_activation_backward(grades['dA' + str(l+1)],cuurent_cache, "sigmoid")
        grades['dA' + str(l)], grades['dW' + str(l+1)], grades['db' + str(l+1)]  = dA_prev, dW, db
    
    return grades

In [109]:
AL = np.random.randn(1, 2)
Y = np.array([[1, 0]])

A1 = np.random.randn(4,2)
W1 = np.random.randn(3,4)
b1 = np.random.randn(3,1)
Z1 = np.random.randn(3,2)
linear_cache_activation_1 = ((A1, W1, b1), Z1)

A2 = np.random.randn(3,2)
W2 = np.random.randn(1,3)
b2 = np.random.randn(1,1)
Z2 = np.random.randn(1,2)
linear_cache_activation_2 = ((A2, W2, b2), Z2)

caches = (linear_cache_activation_1, linear_cache_activation_2)

grads = Linear_backward_model_shallow_or_deep_NN(AL, Y, caches)
print_grads(grads)

dW1 = [[ 1.58545954e-03 -1.40949465e-02 -1.29782508e-02 -8.80901444e-03]
 [ 2.49625221e-03 -1.62157979e-02 -1.49536643e-02 -1.10810514e-02]
 [-6.34542015e-05  7.11808671e-04  6.54856160e-04  4.21471372e-04]]
db1 = [[ 2.14132099e-03]
 [ 3.11373435e-03]
 [-9.20699198e-05]]
dA1 = [[ 0.08273794 -0.05459794]
 [ 0.08234898 -0.05434127]
 [-0.00497779  0.0032848 ]]


In [110]:
# GRADED FUNCTION: update_parameters

def update_parameters(parameters, grads, learning_rate):
    """
    Update parameters using gradient descent
    
    Arguments:
    parameters -- python dictionary containing your parameters 
    grads -- python dictionary containing your gradients, output of L_model_backward
    
    Returns:
    parameters -- python dictionary containing your updated parameters 
                  parameters["W" + str(l)] = ... 
                  parameters["b" + str(l)] = ...
    """
    
    L = len(parameters) // 2 # number of layers in the neural network

    # Update rule for each parameter. Use a for loop.
    ### START CODE HERE ### (≈ 3 lines of code)
    for l in range(L):
        parameters["W" + str(l+1)] = parameters["W"+str(l+1)] - learning_rate * grads["dW" + str(l + 1)]
        parameters["b" + str(l+1)] = parameters["b"+str(l+1)] - learning_rate * grads["db" + str(l + 1)]
    ### END CODE HERE ###
    return parameters

In [111]:
W1 = np.random.randn(3,4)
b1 = np.random.randn(3,1)
W2 = np.random.randn(1,3)
b2 = np.random.randn(1,1)
parameters = {"W1": W1,
              "b1": b1,
              "W2": W2,
              "b2": b2}
np.random.seed(3)
dW1 = np.random.randn(3,4)
db1 = np.random.randn(3,1)
dW2 = np.random.randn(1,3)
db2 = np.random.randn(1,1)
grads = {"dW1": dW1,
         "db1": db1,
         "dW2": dW2,
         "db2": db2}
    
parameters = update_parameters(parameters, grads, 0.1)

print ("W1 = "+ str(parameters["W1"]))
print ("b1 = "+ str(parameters["b1"]))
print ("W2 = "+ str(parameters["W2"]))
print ("b2 = "+ str(parameters["b2"]))

W1 = [[-1.17487374 -1.2395125   2.49633054  2.10614156]
 [-1.36395506  0.48569364  0.63571123  0.8140373 ]
 [ 0.14477725 -0.87915014 -0.05103393 -0.57958738]]
b1 = [[ 0.04624131]
 [-0.43932861]
 [-0.13667899]]
W2 = [[ 1.05902022  1.28509419 -1.02645544]]
b2 = [[-0.55816685]]
