In [1]:
import numpy as np

In [2]:
# Initialize weights
def initialize_weights(layer_dimensions):
    np.random.seed(42) # For reproducibility
    parameters = {}
    L = len(layer_dimensions) - 1 # Number of layers excluding input layer
    for l in range(1, L + 1):
        # The initialization for weights
        parameters[f"W{l}"] = np.random.randn(
            layer_dimensions[l], layer_dimensions[l - 1]
            ) * np.sqrt(2 / layer_dimensions[l - 1])

        # Initializing biases to zeros
        parameters[f"b{l}"] = np.zeros((layer_dimensions[l], 1))
    return parameters

In [3]:
# Activation functions
elu_alpha = 1
def elu(x):
    return np.where(x > 0, x, elu_alpha * (np.exp(x) - 1))

def softmax(x):
    exp_x = np.exp(x)
    return exp_x / np.sum(exp_x, axis = 0, keepdims = True)


In [4]:
# cross-entropy Cost function

def cost(y_hat, y):
    assert(y.shape[0] == y_hat.shape[0])
    assert(y.shape[1] == y_hat.shape[1])
    assert(y.shape[1] == 1)

    m = y.shape[1]
    return -np.sum(y * np.log(y_hat)) / m

# Test
y_temp = np.array([[1], [0], [0]])
y_hat_temp = np.array([[0.1], [0.1], [0.8]])
assert(cost(y_hat_temp, y_temp) > 1)

In [5]:
# Forward propagation functions

def linear_and_activation_forward(prev_A, W, b, activation_function):
    # Assert part (for debugging)
    assert(prev_A.shape[1] == 1)
    assert(W.shape[1] == prev_A.shape[0])
    assert(b.shape[1] == 1)
    assert(W.shape[0] == b.shape[0])

    # Linear part forward
    Z = np.matmul(W, prev_A) + b

    # Activation part forward
    if activation_function == 'elu':
        A = elu(Z)
    elif activation_function == 'softmax':
        A = softmax(Z)
    else:
        assert(false)
    
    cache = ((prev_A, W, b), Z)
    return A, cache

def forward_propagation(X, parameters):
    caches = []
    A = X
    L = len(parameters) // 2
    for l in range(1, L):
        A_prev = A
        W = parameters[f"W{l}"]
        b = parameters[f"b{l}"]
        A, cache = linear_and_activation_forward(A_prev, W, b, 'elu')
        caches.append(cache)
    
    W = parameters[f"W{L}"]
    b = parameters[f"b{L}"]
    A, cache = linear_and_activation_forward(A, W, b, 'softmax')
    caches.append(cache)

    return A, caches
    

In [6]:
# Backward propagation functions

def linear_and_activation_backward(dA, cache, activation_function):
    # Extract useful caches
    linear_cache, activation_cache = cache
    A_prev, W, b = linear_cache

    # Activation part backward
    if activation_function == 'elu':
        dZ = np.array(dA, copy=True)
        dZ[activation_cache <= 0] = dA[activation_cache <= 0] * (elu_alpha * np.exp(activation_cache[activation_cache <= 0]))
    elif activation_function == 'softmax':
        # TODO
    else:
        assert(false)

    print(activation_cache)
    print(dZ)
    
    # Linear part backward
    dW = np.mulmat(dZ, A_prev.T) / m
    db = np.sum(dZ, axis = 1, keepdims = True) / m
    dA_prev = np.mulmat(W.T, dZ)

    return dA_prev, dW, db

def backward_propagation(Y, Y_hat, caches):
    grads = {}
    L = len(caches)
    m = Y_hat.shape[1]

    assert(Y.shape == Y_hat.shape)

    dA = Y - Y_hat
    print("Shape: ", dA.shape)
    
    dA, dW, db = linear_and_activation_backward(dA, caches[L - 1], 'softmax')
    grads[f"dA{L - 1}"] = dA
    grads[f"dW{L}"] = dW
    grads[f"db{L}"] = db
    
    for l in np.arange(L - 2, -1, -1):
        dA, dW, db = linear_and_activation_backward(dA, caches[l], 'relu')
        grads[f"dA{l}"] = dA
        grads[f"dW{l + 1}"] = dW
        grads[f"db{l + 1}"] = db

    return grads

In [7]:
# Update parameters

def update_parameters(parameters, grads, learning_rate):
    L = len(parameters) // 2
    for l in range(0, L):
        parameters[f"W{l}"] = parameters[f"W{l}"] - learning_rate * grads[f"dW{l}"]
        parameters[f"b{l}"] = parameters[f"b{l}"] - learning_rate * grads[f"db{l}"]

In [9]:
# Part (i)

hl_count = 5
n_x = 5
n_h = 10 
n_y = 3

layer_dimensions = [n_x] + [n_h for _ in range(0, hl_count)] + [n_y]
X = np.array([[1], [2], [3], [4], [5]])

parameters = initialize_weights(layer_dimensions)

output, caches = forward_propagation(X, parameters)
print("Output of feed forward is: \n", output)

Output of feed forward is: 
 [[0.01158557]
 [0.40535973]
 [0.5830547 ]]


In [9]:
# Part (ii)

# Initialize
hl_count = 30
n_x = 5
n_h = 10 
n_y = 3

layer_dimensions = [n_x] + [n_h for _ in range(0, hl_count)] + [n_y]
X = np.array([[1], [2], [3], [4], [5]])
Y = np.array([[1], [0], [0]])

# Run forward
parameters = initialize_weights(layer_dimensions)
y_hat, caches = forward_propagation(X, parameters)

# Run backward
grads = backward_propagation(Y, y_hat, caches)

Shape:  (3, 1)


ValueError: operands could not be broadcast together with shapes (10,1) (3,1) 