In [160]:
import numpy as np

In [163]:
def initialize_parameters(layer_dims, seed=1):
    """
    Implements initialization of network layers

    Receives:
    layer_dims -- python array containing the dimensions of each layer in the network (counting input layer)
    seed -- value handed to np.random.seed() before the weights are drawn; defaults to 1
            (the value this function always used), pass None to leave NumPy's global
            random state untouched

    Returns:
    parameters -- python dictionary containing the parameters W1, b1, ..., WL, bL
                    Wl -- weight matrix, numpy array with shape (layer_dims[l], layer_dims[l-1])
                    bl -- bias vector, numpy array with shape (layer_dims[l], 1)
    """

    # np.random.seed() resets the *global* generator, which affects every later
    # np.random call in the session, so callers can opt out with seed=None.
    if seed is not None:
        np.random.seed(seed)

    parameters = {}

    # Index 0 of layer_dims is the input layer, which owns no parameters.
    for l in range(1, len(layer_dims)):
        fan_in = layer_dims[l-1]
        # Standard-normal draws scaled by sqrt(1 / fan_in).
        parameters["W" + str(l)] = np.random.randn(layer_dims[l], fan_in) * np.sqrt(1/fan_in)
        parameters["b" + str(l)] = np.zeros((layer_dims[l], 1))

    return parameters

In [164]:
# Demo: build the parameters of a 5-3-1 network and print every layer's W and b.
layer_dims = [5, 3, 1]
parameters = initialize_parameters(layer_dims)
L = len(layer_dims)
# Entry 0 of layer_dims is the input layer and has no parameters, so start at 1.
for l in range(1, L):
    print("W" + str(l) + ":\n" + str(parameters["W"+str(l)]))
    print("b" + str(l) + ":\n" + str(parameters["b"+str(l)]))

W1:
[[ 0.72642933 -0.27358579 -0.23620559 -0.47984616  0.38702206]
 [-1.0292794   0.78030354 -0.34042208  0.14267862 -0.11152182]
 [ 0.65387455 -0.92132293 -0.14418936 -0.17175433  0.50703711]]
b1:
[[0.]
 [0.]
 [0.]]
W2:
[[-0.63502252 -0.09955147 -0.50683179]]
b2:
[[0.]]


In [165]:
def sigmoid(Z, use_cache=True):
    """
    Element-wise logistic function 1 / (1 + e^(-Z))

    Receives:
    Z -- output of the linear function, numpy array with shape (size of current layer, number of examples)
    use_cache -- if true, Z is handed back alongside the activations

    Returns:
    A -- sigmoid(Z), numpy array with the same shape as Z
    cache -- Z itself (only returned when use_cache is true), useful during backpropagation
    """

    exp_neg = np.exp(-Z)
    activations = 1/(1+exp_neg)

    # Without caching the caller only gets the activations.
    if not use_cache:
        return activations

    return activations, Z

In [166]:
# Demo: apply sigmoid to a random 3x2 matrix; use_cache=False returns only A.
# NOTE(review): no seed is set in this cell, so the printed values depend on the
# global NumPy random state left behind by previously executed cells.
Z = np.random.randn(3,2)
A = sigmoid(Z, use_cache=False)
print("A:\n" + str(A))

A:
[[0.51055187 0.64171493]
 [0.2496239  0.75854586]
 [0.71127629 0.62304533]]


In [167]:
def relu(Z, use_cache=True):
    """
    Element-wise rectified linear unit max(0, Z)

    Receives:
    Z -- output of the linear function, numpy array with shape (size of current layer, number of examples)
    use_cache -- if true, Z is handed back alongside the activations

    Returns:
    A -- ReLU(Z), numpy array with the same shape as Z
    cache -- Z itself (only returned when use_cache is true), useful during backpropagation
    """

    rectified = np.maximum(0, Z)

    return (rectified, Z) if use_cache else rectified

In [168]:
# Demo: apply ReLU to a random 3x2 matrix; negative entries print as 0.
Z = np.random.randn(3,2)
A = relu(Z, use_cache=False)
print("A:\n" + str(A))

A:
[[0.90085595 0.        ]
 [0.         0.        ]
 [0.         0.53035547]]


In [169]:
def linear_forward(A_prev, W, b, use_cache=True):
    """
    Computes the affine step Z = W . A_prev + b of one layer

    Receives:
    A_prev -- activations from previous layer (or input layer), numpy array with shape (size of previous layer, number of examples)
    W -- weight matrix, numpy array with shape (size of current layer, size of previous layer)
    b -- bias vector, numpy array with shape (size of current layer, 1)
    use_cache -- if true, the inputs are handed back alongside Z

    Returns:
    Z -- output of the linear function, numpy array with shape (size of current layer, number of examples)
    cache -- tuple (A_prev, W, b), only returned when use_cache is true, useful during backpropagation
    """

    weighted = np.dot(W, A_prev)
    # b has a single column and is broadcast across the example axis.
    pre_activation = weighted + b

    if not use_cache:
        return pre_activation

    return pre_activation, (A_prev, W, b)

In [171]:
# Demo: one layer with 3 inputs and 1 unit, evaluated on 2 random examples.
A_prev = np.random.randn(3,2)
W = np.random.randn(1,3)
b = np.random.randn(1,1)

# use_cache=False returns only Z, which has shape (1, 2) here.
Z = linear_forward(A_prev, W, b, use_cache=False)
print("Z:\n" + str(Z))

Z:
[[0.66160388 1.2287564 ]]


In [16]:
def linear_activation_forward(A_prev, W, b, activation, use_cache=True):
    """
    Implements the activation part of forward propagation (linear step followed by the activation)

    Receives:
    A_prev -- activations from previous layer (or input layer), numpy array with shape (size of previous layer, number of examples)
    W -- weight matrix, numpy array with shape (size of current layer, size of previous layer)
    b -- bias vector, numpy array with shape (size of current layer, 1)
    activation -- the activation to be used in this layer, can be "sigmoid" or "relu"
    use_cache -- if true, caches linear_cache and activation_cache

    Returns:
    A -- output of the activation function, numpy array with shape (size of current layer, number of examples)
    cache -- tuple (linear_cache, activation_cache), only returned when use_cache is true, useful during backpropagation

    Raises:
    ValueError -- if activation is neither "sigmoid" nor "relu"
    """

    # Reject unknown activations before doing any work; previously such a value
    # fell through both branches and crashed on the unassigned result variable.
    if activation not in ("sigmoid", "relu"):
        raise ValueError('activation must be "sigmoid" or "relu", got ' + repr(activation))

    activation_fn = sigmoid if activation == "sigmoid" else relu

    if use_cache:
        Z, linear_cache = linear_forward(A_prev, W, b, use_cache=True)
        A, activation_cache = activation_fn(Z, use_cache=True)
        return A, (linear_cache, activation_cache)

    Z = linear_forward(A_prev, W, b, use_cache=False)
    return activation_fn(Z, use_cache=False)

In [229]:
# Demo: run the same random layer (3 inputs, 1 unit, 2 examples) through both
# supported activations so the outputs can be compared side by side.
A_prev = np.random.randn(3,2)
W = np.random.randn(1,3)
b = np.random.randn(1,1)

A = linear_activation_forward(A_prev, W, b, activation="sigmoid", use_cache=False)
print("sigmoid:")
print("A:\n" + str(A))

# Same inputs, ReLU instead of sigmoid.
A = linear_activation_forward(A_prev, W, b, activation="relu", use_cache=False)
print("\nReLU:")
print("A:\n" + str(A))

sigmoid:
A:
[[0.74432175 0.75958251]]

ReLU:
A:
[[1.0685536 1.150392 ]]


In [23]:
def forward_propagation(X, parameters, use_cache=True):
    """
    Runs the full forward pass: ReLU on every layer but the last, sigmoid on the last

    Receives:
    X -- data, numpy array with shape (size of input layer, number of examples)
    parameters -- python dictionary containing parameters of the model (W1, b1, ..., WL, bL)
    use_cache -- if true, collects the cache of every linear_activation_forward() call

    Returns:
    AL -- last output of the activation function
    caches -- python array with one cache per layer in forward order (only returned when use_cache is true), useful during backpropagation
    """

    # Each layer contributes one W and one b entry to the dictionary.
    num_layers = len(parameters) // 2
    keep = bool(use_cache)

    # Layer schedule: hidden layers use ReLU, the output layer uses sigmoid.
    schedule = [(idx, "relu") for idx in range(1, num_layers)]
    schedule.append((num_layers, "sigmoid"))

    caches = []
    activations = X
    for idx, kind in schedule:
        result = linear_activation_forward(activations, parameters["W" + str(idx)], parameters["b" + str(idx)], kind, use_cache=keep)
        if keep:
            activations, layer_cache = result
            caches.append(layer_cache)
        else:
            activations = result

    return (activations, caches) if keep else activations

In [299]:
# Demo: 2-layer network (4 inputs -> 3 units -> 1 output) evaluated on 3 random examples.
X = np.random.randn(4,3)
W1 = np.random.randn(3,4)
b1 = np.random.randn(3,1)
W2 = np.random.randn(1,3)
b2 = np.random.randn(1,1)
parameters = {"W1": W1,
              "b1": b1,
              "W2": W2,
              "b2": b2}
# use_cache=False returns only the final activations AL.
AL = forward_propagation(X, parameters, use_cache=False)
print("AL:\n" + str(AL))

AL:
[[0.12926182 0.32471255 0.32471255]]


In [328]:
def compute_cost(AL, Y):
    """
    Implements the cost function

    Receives:
    AL -- probability vector corresponding to label predictions, numpy array with shape (output size, number of examples)
    Y -- true label vector, numpy array with shape (output size, number of examples)

    Returns:
    cost -- cross-entropy cost, scalar
    """

    m = AL.shape[1]

    probs = np.asarray(AL)
    if not np.issubdtype(probs.dtype, np.floating):
        probs = probs.astype(np.float64)

    # Keep the probabilities strictly inside (0, 1): an entry of exactly 0 or 1
    # would otherwise produce 0 * log(0) = NaN, even for a perfect prediction.
    # The bounds are the smallest positive normal number and the largest float
    # below 1, so ordinary probabilities are left untouched.
    float_type = probs.dtype.type
    lower = np.finfo(probs.dtype).tiny
    upper = np.nextafter(float_type(1), float_type(0))
    probs = np.clip(probs, lower, upper)

    cost = -1/m * np.sum(Y*np.log(probs) + (1-Y)*np.log(1-probs))
    cost = np.squeeze(cost)

    return cost

In [368]:
# Demo: cross-entropy cost of three fixed predictions against their labels.
AL = np.array([[0.9, 0.3, 0.6]])
Y = np.array([[1, 0, 1]])

cost = compute_cost(AL, Y)
print("cost:\n" + str(cost))

cost:
0.3242870277875165


In [308]:
def sigmoid_backward(dA, cache):
    """
    Backpropagates through a sigmoid unit: dZ = dA * s * (1 - s) with s = sigmoid(Z)

    Receives:
    dA -- post-activation gradient, numpy array with shape (size of current layer, number of examples)
    cache -- Z, stored from linear_activation_forward() (of current layer l)

    Returns:
    dZ -- gradient of the cost with respect to Z, numpy array with shape (size of current layer, number of examples)
    """

    # Recompute the activation from the cached pre-activation values.
    squashed = sigmoid(cache, use_cache=False)

    scaled = dA * squashed
    return scaled * (1-squashed)

In [309]:
def relu_backward(dA, cache):
    """
    Implements backward propagation for a ReLU unit

    Receives:
    dA -- post-activation gradient, numpy array with shape (size of current layer, number of examples)
    cache -- Z, stored from linear_activation_forward() (of current layer l); it is only read, never modified

    Returns:
    dZ -- gradient of the cost with respect to Z, numpy array with shape (size of current layer, number of examples)
    """

    Z = np.asarray(cache)

    # The ReLU derivative is 1 where Z > 0 and 0 elsewhere. Build it as a new
    # array instead of overwriting Z, so the caller's cache keeps the original
    # pre-activation values and can be reused.
    mask = (Z > 0).astype(Z.dtype)

    return dA * mask

In [310]:
def linear_backward(dZ, cache):
    """
    Backpropagates through the affine step Z = W . A_prev + b of one layer

    Receives:
    dZ -- gradient of the cost with respect to Z, numpy array with shape (size of current layer, number of examples)
    cache -- tuple of values (A_prev, W, b), stored from linear_forward() (of current layer l)

    Returns:
    dA_prev -- gradient of the cost with respect to the activation (of previous layer l-1), numpy array with same shape as A_prev
    dW -- gradient of the cost with respect to W (of current layer l), numpy array with same shape as W
    db -- gradient of the cost with respect to b (of current layer l), numpy array with same shape as b
    """

    # b is part of the cache layout but is not needed for any gradient.
    A_prev, W, b = cache

    # Parameter gradients are averaged over the examples (columns of A_prev).
    scale = 1/A_prev.shape[1]

    grad_W = scale * np.dot(dZ, A_prev.T)
    grad_b = scale * np.sum(dZ, axis=1, keepdims=True)
    grad_A_prev = np.dot(W.T, dZ)

    return grad_A_prev, grad_W, grad_b

In [369]:
# Demo: gradients of one layer (3 inputs, 1 unit) for 2 random examples.
dZ = np.random.randn(1,2)
A_prev = np.random.randn(3,2)
W = np.random.randn(1,3)
b = np.random.randn(1,1)
# Same (A_prev, W, b) layout that linear_forward() returns as its cache.
linear_cache = (A_prev, W, b)

dA_prev, dW, db = linear_backward(dZ, linear_cache)
print ("dA_prev:\n"+ str(dA_prev))
print ("dW:\n" + str(dW))
print ("db:\n" + str(db))

dA_prev:
[[ 0.3549596   0.04786418]
 [ 1.63613911  0.22062357]
 [-1.07529876 -0.1449976 ]]
dW:
[[ 0.71624339 -0.76133087  0.28548822]]
db:
[[-1.02856814]]


In [370]:
def linear_activation_backward(dA, cache, activation):
    """
    Implements the activation part of backward propagation (activation gradient followed by the linear gradient)

    Receives:
    dA -- post-activation gradient, numpy array with shape (size of current layer, number of examples)
    cache -- tuple of values (linear_cache, activation_cache), stored from linear_activation_forward() (of current layer l)
    activation -- the activation to be used in this layer, can be "sigmoid" or "relu"

    Returns:
    dA_prev -- gradient of the cost with respect to the activation (of previous layer l-1), numpy array with same shape as A_prev
    dW -- gradient of the cost with respect to W (of current layer l), numpy array with same shape as W
    db -- gradient of the cost with respect to b (of current layer l), numpy array with same shape as b

    Raises:
    ValueError -- if activation is neither "sigmoid" nor "relu"
    """

    # Reject unknown activations explicitly; previously such a value skipped
    # both branches and crashed on the unassigned return variables.
    if activation not in ("sigmoid", "relu"):
        raise ValueError('activation must be "sigmoid" or "relu", got ' + repr(activation))

    linear_cache, activation_cache = cache

    if activation == "sigmoid":
        dZ = sigmoid_backward(dA, activation_cache)
    else:
        dZ = relu_backward(dA, activation_cache)

    # The linear step is identical for both activations.
    return linear_backward(dZ, linear_cache)

In [378]:
# Demo: backpropagate one random layer (3 inputs, 1 unit, 2 examples) through
# both supported activations.
dAL = np.random.randn(1,2)
A = np.random.randn(3,2)
W = np.random.randn(1,3)
b = np.random.randn(1,1)
Z = np.random.randn(1,2)
# Hand-built cache in the ((A_prev, W, b), Z) layout of linear_activation_forward().
linear_cache = (A, W, b)
activation_cache = Z
linear_activation_cache = (linear_cache, activation_cache)

dA_prev, dW, db = linear_activation_backward(dAL, linear_activation_cache, activation="sigmoid")
print ("sigmoid:")
print ("dA_prev:\n"+ str(dA_prev))
print ("dW:\n" + str(dW))
print ("db:\n" + str(db))

# The very same cache object is passed again, this time for the ReLU path.
dA_prev, dW, db = linear_activation_backward(dAL, linear_activation_cache, activation="relu")
print ("\nReLU:")
print ("dA_prev:\n"+ str(dA_prev))
print ("dW:\n" + str(dW))
print ("db:\n" + str(db))

sigmoid:
dA_prev:
[[-0.46313465 -0.02826771]
 [-0.16832144 -0.0102736 ]
 [ 0.04235701  0.00258529]]
dW:
[[ 0.2262499   0.06811006 -0.07440005]]
db:
[[0.13884292]]

ReLU:
dA_prev:
[[-2.09419105 -0.13165465]
 [-0.76111181 -0.0478485 ]
 [ 0.19152889  0.01204077]]
dW:
[[ 1.02305044  0.30927502 -0.33557597]]
db:
[[0.62889995]]


In [379]:
def backward_propagation(AL, Y, caches):
    """
    Implements backward propagation

    Receives:
    AL -- probability vector corresponding to label predictions, numpy array with shape (output size, number of examples)
    Y -- true label vector, numpy array with shape (output size, number of examples)
    caches -- python array containing every cache of linear_activation_forward(), in forward (layer 1 ... layer L) order

    Returns:
    grads -- python dictionary containing gradients:
                    grads["dA" + str(l-1)], grads["dW" + str(l)], grads["db" + str(l)] for l = L, ..., 1

    Raises:
    IndexError -- if caches is empty
    """

    grads = {}
    L = len(caches)

    if L == 0:
        raise IndexError("caches must contain at least one layer cache")

    # Derivative of the cross-entropy cost with respect to AL.
    dA = -(np.divide(Y, AL) - np.divide(1-Y, 1-AL))

    # Walk the layers from the output back to the input: the last layer is the
    # sigmoid unit, every earlier layer is a ReLU unit.
    for l in range(L, 0, -1):
        activation = "sigmoid" if l == L else "relu"
        dA, dW, db = linear_activation_backward(dA, caches[l-1], activation=activation)
        grads["dA" + str(l-1)] = dA
        grads["dW" + str(l)] = dW
        grads["db" + str(l)] = db

    return grads

In [382]:
# Demo: backward pass through a hand-built 2-layer cache (4 -> 3 -> 1, 2 examples).
# NOTE(review): AL is drawn from a standard normal, so it is not restricted to
# (0, 1) like real sigmoid outputs; this cell only exercises the code path.
AL = np.random.randn(1, 2)
# Y is 1-D here (shape (2,)) and relies on broadcasting against AL's (1, 2).
Y = np.array([1, 0])

# Each cache uses the ((A_prev, W, b), Z) layout of linear_activation_forward().
A1 = np.random.randn(4,2)
W1 = np.random.randn(3,4)
b1 = np.random.randn(3,1)
Z1 = np.random.randn(3,2)
linear_activation_cache_1 = ((A1, W1, b1), Z1)
A2 = np.random.randn(3,2)
W2 = np.random.randn(1,3)
b2 = np.random.randn(1,1)
Z2 = np.random.randn(1,2)
linear_activation_cache_2 = ((A2, W2, b2), Z2)

caches = (linear_activation_cache_1, linear_activation_cache_2)
grads = backward_propagation(AL, Y, caches)

print ("dW1:\n"+ str(grads["dW1"]))
print ("db1:\n"+ str(grads["db1"]))
print ("dA1:\n"+ str(grads["dA1"]))   

dW1:
[[0.00652007 0.00127316 0.00356815 0.00023798]
 [0.         0.         0.         0.        ]
 [0.02595563 0.00506831 0.01420436 0.00094735]]
db1:
[[-0.00374853]
 [ 0.        ]
 [-0.01492246]]
dA1:
[[-0.00749706  0.00497036]
 [-0.0015931   0.00105619]
 [-0.02984492  0.01978641]]


In [383]:
def update_parameters(parameters, grads, learning_rate):
    """
    Applies one gradient-descent step to every W and b

    Receives:
    parameters -- python dictionary containing parameters (W1, b1, ..., WL, bL); its entries are replaced in place
    grads -- python dictionary containing gradients (dW1, db1, ..., dWL, dbL)
    learning_rate -- learning rate of the update rule, scalar

    Returns:
    parameters -- the same dictionary, now holding the updated parameters
    """

    # Each layer contributes one W and one b entry to the dictionary.
    layer_count = len(parameters) // 2

    for idx in range(1, layer_count + 1):
        suffix = str(idx)
        w_key = "W" + suffix
        b_key = "b" + suffix

        # New arrays are bound to the keys; the previous arrays are not modified.
        parameters[w_key] = parameters[w_key] - learning_rate * grads["dW" + suffix]
        parameters[b_key] = parameters[b_key] - learning_rate * grads["db" + suffix]

    return parameters

In [384]:
# Demo: one gradient-descent step on random parameters of a 4 -> 3 -> 1 network.
W1 = np.random.randn(3,4)
b1 = np.random.randn(3,1)
W2 = np.random.randn(1,3)
b2 = np.random.randn(1,1)
parameters = {"W1": W1,
              "b1": b1,
              "W2": W2,
              "b2": b2}
# Random stand-ins for the gradients, each with the same shape as its parameter.
dW1 = np.random.randn(3,4)
db1 = np.random.randn(3,1)
dW2 = np.random.randn(1,3)
db2 = np.random.randn(1,1)
grads = {"dW1": dW1,
         "db1": db1,
         "dW2": dW2,
         "db2": db2}
# Learning rate 0.01.
parameters = update_parameters(parameters, grads, 0.01)

print ("W1:\n"+ str(parameters["W1"]))
print ("b1:\n"+ str(parameters["b1"]))
print ("W2:\n"+ str(parameters["W2"]))
print ("b2:\n"+ str(parameters["b2"]))

W1:
[[-1.2280518  -0.03836167 -0.25914991 -3.41483169]
 [-0.80030676  0.61356837  2.58664907  1.77434171]
 [-1.10588837  0.60891102 -0.93046137 -0.73290954]]
b1:
[[1.4227802 ]
 [0.54949888]
 [0.56013303]]
W2:
[[ 1.15082314 -0.7500405   0.9910529 ]]
b2:
[[0.32704674]]


In [385]:
def predict(X, Y, parameters):
    """
    Implements predicting for testing

    Receives:
    X -- data, numpy array with shape (size of input layer, number of examples)
    Y -- true label vector, numpy array with shape (output size, number of examples)
    parameters -- python dictionary containing parameters

    Returns:
    p -- predictions for X, numpy array of 0s and 1s with shape (1, number of examples)

    Side effects:
    prints the fraction of entries of p that equal Y as "Accuracy: ..."
    """

    m = X.shape[1]

    probabilities = forward_propagation(X, parameters, use_cache=False)

    # Threshold the first output row at 0.5 in a single vectorized step:
    # entries strictly above 0.5 become 1, everything else stays 0.
    p = np.zeros((1, m))
    p[0, probabilities[0, :] > 0.5] = 1

    print("Accuracy: " + str(np.sum(p == Y)/m))

    return p

In [386]:
# Demo: predictions of a random 4 -> 3 -> 1 network on 3 random examples.
X = np.random.randn(4,3)
# Y is 1-D here (shape (3,)).
Y = np.array([1, 0, 1])
W1 = np.random.randn(3,4)
b1 = np.random.randn(3,1)
W2 = np.random.randn(1,3)
b2 = np.random.randn(1,1)
parameters = {"W1": W1,
              "b1": b1,
              "W2": W2,
              "b2": b2}
# predict() prints the accuracy itself and returns the 0/1 predictions.
p = predict(X, Y, parameters)

Accuracy: 0.6666666666666666
