In [18]:
import numpy as np

### Initialization
Make weight matrices and biases for each layer

Create a dictionary object where the `key` is the layer you're in and the `value` is the weight matrix or bias for the fully connected layer

Randomly initialize the weights (but *scale* them to suit the activation function) and initialize the biases to 0

In [77]:
# Layer sizes of the example network, input layer first: W{i} is built with
# shape (layer_dims[i], layer_dims[i-1]).
layer_dims = [3, 2, 4, 2]

# Module-level parameter stores ; filled in place by initialize_weights() and
# initialize_biases() below.
weights = dict()
biases = dict()

def initialize_weights(dims): # He initialization
    """Fill the module-level `weights` dict with He-initialized weight matrices.

    Args:
    dims - sequence of layer sizes, input layer first (e.g. [3, 2, 4, 2])

    Side effects:
    sets weights['W{i}'] to an array of shape (dims[i], dims[i-1]) for i = 1 .. len(dims) - 1
    """
    for i in range(1, len(dims)):
        fan_in = dims[i-1] # number of inputs feeding each unit of layer i
        # He initialization scales standard-normal draws by sqrt(2 / fan_in) so the
        # variance of ReLU activations is preserved from layer to layer.
        # (The previous code divided by dims[i], i.e. the fan-out.)
        weights[f'W{i}'] = np.random.randn(dims[i], fan_in) * np.sqrt(2 / fan_in)

def initialize_biases(dims):
    """Fill the module-level `biases` dict with one zero column vector per layer.

    Args:
    dims - sequence of layer sizes ; entry 0 is skipped, so no bias is made for it

    Side effects:
    sets biases['B{i}'] to zeros of shape (dims[i], 1) for i = 1 .. len(dims) - 1
    """
    for layer in range(1, len(dims)):
        biases[f'B{layer}'] = np.zeros((dims[layer], 1))
        

In [198]:
# Xavier initialization is poor for relu

def initialize_params(dims): # He initialization ; for ReLu and Leaky ReLu activations
    """Build He-initialized weights and zero biases for a fully connected network.

    Args:
    dims - sequence of layer sizes, input layer first (e.g. [3, 2, 4, 2])

    Returns:
    params - dict holding, for i = 1 .. len(dims) - 1:
                'W{i}' - weight matrix of shape (dims[i], dims[i-1])
                'B{i}' - bias vector of shape (dims[i], 1), all zeros

    Raises:
    ValueError - if dims has fewer than two entries (there is no layer to build)
    """
    if len(dims) < 2:
        raise ValueError("dims must contain at least an input size and one layer size")

    params = {}
    for i in range(1, len(dims)):
        fan_in, fan_out = dims[i-1], dims[i]

        # He initialization scales standard-normal draws by sqrt(2 / fan_in).
        # (The previous code divided by dims[i], i.e. the fan-out.)
        params[f'W{i}'] = np.random.randn(fan_out, fan_in) * np.sqrt(2 / fan_in)
        params[f'B{i}'] = np.zeros((fan_out, 1))

        # Check every layer, not just the last one visited by the loop.
        assert params[f'W{i}'].shape == (fan_out, fan_in)
        assert params[f'B{i}'].shape == (fan_out, 1)

    return params

### Define activation functions

Using ReLu for this assignment. 

Several others are included which I would like to fill in later for my own understanding/research

In [174]:
"""
Args:
Z - pre activation parameter ; output of the linear layer

Returns:
A - post activation parameter ; of the same shape as Z
cache - returns Z ; used during backpropagation
"""

def sigmoid(Z):
    """Logistic sigmoid, applied element-wise.

    Args:
    Z - pre-activation values

    Returns:
    A - 1 / (1 + e^(-Z)) ; same shape as Z
    cache - Z itself, kept for the backward pass
    """
    denominator = 1 + np.exp(-Z)
    A = 1 / denominator
    return A, Z

def sigmoid_backward(dA, cache):
    """Backward pass through a sigmoid activation.

    Args:
    dA - gradient of the cost with respect to the sigmoid output
    cache - the pre-activation Z stored by sigmoid()

    Returns:
    dZ - gradient of the cost with respect to Z ; same shape as Z
    """
    Z = cache
    sig = 1 / (1 + np.exp(-Z))
    # d/dz sigmoid(z) = e^(-z) / (1 + e^(-z))^2 = sigmoid(z) * (1 - sigmoid(z))
    local_grad = sig * (1 - sig)
    dZ = dA * local_grad

    assert dZ.shape == Z.shape

    return dZ

def relu(Z):
    """Rectified linear unit, applied element-wise.

    Args:
    Z - pre-activation values

    Returns:
    A - max(0, Z) ; same shape as Z
    cache - Z itself, kept for the backward pass
    """
    activated = np.maximum(0, Z)
    return activated, Z

def relu_backward(dA, cache):
    """Backward pass through a ReLU activation.

    Args:
    dA - gradient of the cost with respect to the ReLU output
    cache - the pre-activation Z stored by relu()

    Returns:
    dZ - a copy of dA with entries zeroed wherever Z <= 0 ; same shape as Z
    """
    pre_activation = cache
    grad = np.array(dA) # np.array copies by default, so dA itself is left untouched
    inactive = pre_activation <= 0 # ReLU is flat (zero slope) on this region
    grad[inactive] = 0

    assert grad.shape == pre_activation.shape

    return grad

def tanh(Z):
    """Hyperbolic tangent, applied element-wise.

    Args:
    Z - pre-activation values

    Returns:
    A - tanh(Z) ; same shape as Z
    cache - Z itself, kept for the backward pass
    """
    # np.tanh stays finite for large |Z| ; the hand-written
    # (e^Z - e^-Z) / (e^Z + e^-Z) overflows to inf / inf = nan there.
    A = np.tanh(Z)
    cache = Z # was never assigned before, so the return raised NameError
    return A, cache

def tanh_backward(dA, cache):
    """Backward pass through a tanh activation.

    Args:
    dA - gradient of the cost with respect to the tanh output
    cache - the pre-activation Z stored by tanh()

    Returns:
    dZ - gradient of the cost with respect to Z ; same shape as Z
    """
    Z = cache
    # np.tanh stays finite for large |Z| ; the hand-written exponential form
    # evaluates to inf / inf = nan there and poisons the gradients.
    t = np.tanh(Z)
    dZ = dA * (1 - t ** 2) # d/dz tanh(z) = 1 - tanh(z)^2

    assert (dZ.shape == Z.shape) # check if dZ has correct shape

    return dZ

def softplus(Z):
    """Softplus activation log(1 + e^Z), applied element-wise.

    Args:
    Z - pre-activation values

    Returns:
    A - softplus(Z) ; same shape as Z
    cache - Z itself, kept for the backward pass
    """
    # logaddexp(0, Z) = log(e^0 + e^Z) = log(1 + e^Z), evaluated without
    # overflowing for large Z (np.exp(Z) alone becomes inf).
    A = np.logaddexp(0, Z)
    cache = Z
    return A, cache

def softplus_backward(dA, cache):
    """Backward pass through a softplus activation.

    Args:
    dA - gradient of the cost with respect to the softplus output
    cache - the pre-activation Z stored by softplus()

    Returns:
    dZ - gradient of the cost with respect to Z ; same shape as Z
    """
    Z = cache
    # d/dz log(1 + e^z) = 1 / (1 + e^(-z)), i.e. the sigmoid.
    # Written as exp(-logaddexp(0, -z)) so that large |z| cannot overflow.
    slope = np.exp(-np.logaddexp(0, -Z))
    dZ = dA * slope

    assert (dZ.shape == Z.shape) # check if dZ has correct shape

    return dZ

def softmax(Z, axis=0): # really only used in the output layer
    """Numerically stable softmax.

    Args:
    Z - pre-activation values ; the linear layers in this notebook produce
        arrays of shape (units in layer, number of examples)
    axis - axis that holds the classes and is normalised to sum to 1.
           Defaults to 0 so that each example (column) gets its own
           distribution. The previous hard-coded axis=1 normalised across
           examples instead ; pass axis=1 to get that behaviour back.

    Returns:
    A - softmax(Z) along `axis` ; same shape as Z
    cache - Z itself, kept for the backward pass
    """
    # Subtracting the max leaves the result unchanged but keeps exp() from overflowing.
    shifted = Z - np.max(Z, axis=axis, keepdims=True)
    exps = np.exp(shifted) # computed once instead of twice
    A = exps / exps.sum(axis=axis, keepdims=True)
    cache = Z
    return A, cache

def softmax_backeard(dA, cache, axis=0):
    """Backward pass through a softmax activation.

    The misspelled name is kept so existing references keep working ;
    `softmax_backward` is provided below as the correctly spelled alias.

    Args:
    dA - gradient of the cost with respect to the softmax output
    cache - the pre-activation Z stored by softmax()
    axis - the class axis the forward softmax normalised over ; must match
           the axis used in the forward pass

    Returns:
    dZ - gradient of the cost with respect to Z ; same shape as Z
    """
    Z = cache

    # Recompute the softmax output from Z (stable form, max subtracted).
    exps = np.exp(Z - np.max(Z, axis=axis, keepdims=True))
    A = exps / exps.sum(axis=axis, keepdims=True)

    # Softmax Jacobian: dA_j/dZ_i = A_j * (delta_ij - A_i), hence
    # dZ_i = A_i * (dA_i - sum_j dA_j * A_j).
    dZ = A * (dA - np.sum(dA * A, axis=axis, keepdims=True))

    assert (dZ.shape == Z.shape) # check if dZ has correct shape

    return dZ

softmax_backward = softmax_backeard # correctly spelled alias


### Define a feed forward method

In [144]:
def linear_forward (A, W, b): # linear transformation
    """
    Affine part of a fully connected layer: Z = W . A + b

    Args:
    A - activations from previous layer: numpy array of shape (size of previous layer, number of examples)
    W - weights matrix: numpy array of shape (size of current layer, size of previous layer)
    b - bias vector: numpy array of shape (size of current layer, 1) ; broadcast across the examples

    Returns:
    Z - input of the activation function (pre-activation parameter)
    cache - tuple: (A, W, b) ; used for computing backwards pass
    """

    pre_activation = W.dot(A) + b

    expected_shape = (W.shape[0], A.shape[1]) # (units in this layer, number of examples)
    assert pre_activation.shape == expected_shape

    return pre_activation, (A, W, b)

In [167]:
def linear_activation_forward(A_prev, weights, biases, activation): # linear transformation with activation function
    """
    Forward pass through one fully connected layer followed by its activation.

    Args:
    A_prev - activations from previous layer: numpy array of shape (size of previous layer, number of examples)
    weights - weights matrix: numpy array of shape (size of current layer, size of previous layer)
    biases - bias vector: numpy array of shape (size of current layer, 1)
    activation - the activation function to be used: string ; one of "tanh", "sigmoid", "relu", "softplus"

    Returns:
    A_new - output of activation function ; same shape as the pre-activation Z
    cache - tuple: (linear cache, activation cache) ; used for computing backwards pass

    Raises:
    ValueError - if `activation` is not one of the supported names
    """

    Z, linear_cache = linear_forward(A_prev, weights, biases)

    if activation == "tanh":
        A_new, activation_cache = tanh(Z)

    elif activation == "sigmoid":
        A_new, activation_cache = sigmoid(Z)

    elif activation == "relu":
        A_new, activation_cache = relu(Z)

    elif activation == "softplus":
        # This branch used to be `pass`, which left A_new / activation_cache unbound.
        A_new, activation_cache = softplus(Z)

    else:
        # Fail with a clear message instead of an UnboundLocalError further down.
        raise ValueError(f"unsupported activation: {activation!r}")

    cache = (linear_cache, activation_cache)

    return A_new, cache

In [150]:
def model_forward(X, parameters, hidden_activation="relu", output_activation="softplus"): # feed forward
    """
    Run the full forward pass: every hidden layer, then the output layer.

    Args:
    X - data to be modeled: numpy array of shape (input size, number of examples)
    parameters - python dictionary containing parameters "W1", "B1", ..., "WL", "BL":
                    Wl -- weight matrix of shape (layer_dims[l], layer_dims[l-1])
                    Bl -- bias vector of shape (layer_dims[l], 1)
    hidden_activation - activation name used for layers 1 .. L-1 (default "relu", as before)
    output_activation - activation name used for layer L (default "softplus", as before)
                        note: for classification tasks, the final layer usually uses sigmoid or softmax activation.

    Returns:
    A_L - activations from final layer: shape (size of last layer, number of examples)
    caches - a list with one cache per layer (index l-1 for layer l), each holding the
             (linear cache, activation cache) pair from linear_activation_forward
    """

    caches = []
    A = X
    L = len(parameters) // 2 # number of layers in the model (one W and one B per layer)

    for i in range(1, L): # hidden layers
        A, cache = linear_activation_forward(A, parameters[f'W{i}'], parameters[f'B{i}'], hidden_activation)
        caches.append(cache)

    A_L, cache = linear_activation_forward(A, parameters[f'W{L}'], parameters[f'B{L}'], output_activation) # output layer
    caches.append(cache)

    # The output has one row per unit of the last layer, not necessarily one row.
    assert A_L.shape == (parameters[f'W{L}'].shape[0], X.shape[1])

    return A_L, caches # the whole list ; backprop needs every layer's cache, not only the last

### Define some loss functions

Will be using MSE in this assignment

Several others are included which I would like to fill in later for my own understanding/research

In [199]:
"""
Args:
AL -- probability vector corresponding to your label predictions, shape (1, number of examples)
Y -- true "label" vector (for example: containing 0 if non-cat, 1 if cat), shape (1, number of examples)

Returns:
cost -- cross-entropy cost
"""

def binary_cross_entropy(AL, Y): # log loss ; classification
    """Binary cross-entropy (log loss) averaged over the examples.

    Args:
    AL - predicted probabilities, shape (1, number of examples)
    Y - true 0/1 labels, shape (1, number of examples)

    Returns:
    cost - scalar (0-dimensional array) cross-entropy cost
    """
    m = Y.shape[1]

    # Keep predictions strictly inside (0, 1) so log() never sees 0.
    eps = np.finfo(float).eps
    AL = np.clip(AL, eps, 1 - eps)

    cost = (-1/m) * (np.dot(Y, np.log(AL).T) + np.dot((1-Y), np.log(1-AL).T))
    cost = np.squeeze(cost)      # To make sure your cost's shape is what we expect (e.g. this turns [[17]] into 17).
    assert(cost.shape == ())

    return cost

def categorical_cross_entropy(AL, Y): # log loss ; classification
    """Categorical cross-entropy averaged over the examples.

    Args:
    AL - predicted class probabilities, shape (number of classes, number of examples)
    Y - one-hot true labels ; same shape as AL

    Returns:
    cost - scalar cross-entropy cost: -(1/m) * sum(Y * log(AL))
    """
    m = Y.shape[1]

    # Bound predictions away from 0 so log() stays finite.
    AL = np.clip(AL, np.finfo(float).eps, 1.0)

    cost = (-1/m) * np.sum(np.multiply(Y, np.log(AL)))

    return cost

def neg_log_likelihood(): # for simple classifications
    """Placeholder for a negative log-likelihood loss ; not implemented yet, returns None."""
    return None

def MSE(AL, Y): # mean squared error ; regression
    """Sum of squared errors divided by the number of examples.

    Args:
    AL - predictions ; Y.shape[1] is used as the number of examples
    Y - targets ; same shape as AL

    Returns:
    cost - (1 / m) * sum((AL - Y) ** 2) with m = Y.shape[1]
    """
    m = Y.shape[1]
    # Flattening makes the residuals 1-D, so the dot product is the sum of
    # squares and nothing needs to be transposed.
    residuals = (AL - Y).ravel()
    cost = (1 / m) * np.dot(residuals, residuals)
    return cost

def MAE(): # mean absolute error ; regression
    """Placeholder for a mean absolute error loss ; not implemented yet, returns None."""
    return None

def KL_divergence(): # Kullback-Leibler Divergence
    """Placeholder for a Kullback-Leibler divergence loss ; not implemented yet, returns None."""
    return None

def Huber(): # Huber loss ; regression
    """Placeholder for a Huber loss ; not implemented yet, returns None."""
    return None


### Define a backprop method

In [163]:
# can the autodiff assignment be used here?

def linear_backward(dZ, cache):
    """
    Backward pass through the affine step Z = W . A_prev + b of one layer.

    Args:
    dZ - Gradient of the cost with respect to the linear output (of the current layer)
    cache - tuple ; (A_prev, W, b) coming from the forward propagation in the current layer

    Returns:
    dA_prev - Gradient of the cost with respect to the activation (of the previous layer) ; same shape as A_prev
    dW - Gradient of the cost with respect to W (of the current layer) ; same shape as W
    db - Gradient of the cost with respect to b (of the current layer) ; same shape as b
    """

    A_prev, W, b = cache
    n_examples = A_prev.shape[1] # columns of A_prev are the individual examples
    scale = 1 / n_examples

    # b is added to every example, so its gradient sums dZ over the example axis.
    db = scale * np.sum(dZ, axis=1, keepdims=True)
    # Each weight multiplies one input activation: pair dZ with A_prev, averaged over the examples.
    dW = scale * np.dot(dZ, A_prev.T)
    # Send the gradient back through the weights to the previous layer's activations.
    dA_prev = np.dot(W.T, dZ)

    assert dA_prev.shape == A_prev.shape
    assert dW.shape == W.shape
    assert db.shape == b.shape

    return dA_prev, dW, db

In [168]:
def linear_activation_backward(dA, cache, activation):
    """
    Backward pass through one layer: first the activation, then the linear step.

    activation function takes input Z and returns A and an activation cache (Z) ;
    activation_backward function takes input dA and the activation cache (Z) and returns dZ.

    Args:
    dA - post-activation gradient for current layer
    cache - tuple ; (linear_cache, activation_cache) we store for computing backward propagation efficiently
    activation - activation function used in this layer ; one of "relu", "tanh", "sigmoid", "softplus"

    Returns:
    dA_prev - Gradient of the cost with respect to the activation (of the previous layer) ; same shape as A_prev
    dW - Gradient of the cost with respect to W (of the current layer) ; same shape as W
    db - Gradient of the cost with respect to b (of the current layer) ; same shape as b

    Raises:
    ValueError - if `activation` is not one of the supported names
    NotImplementedError - if the selected *_backward function returned None (still a stub)
    """

    linear_cache, activation_cache = cache

    if activation == "relu":
        dZ = relu_backward(dA, activation_cache)

    elif activation == "tanh":
        dZ = tanh_backward(dA, activation_cache)

    elif activation == "sigmoid":
        dZ = sigmoid_backward(dA, activation_cache)

    elif activation == "softplus":
        dZ = softplus_backward(dA, activation_cache)

    else:
        # Fail with a clear message instead of an UnboundLocalError on dZ.
        raise ValueError(f"unsupported activation: {activation!r}")

    if dZ is None:
        # A placeholder *_backward (body is only `pass`) returns None.
        raise NotImplementedError(f"backward pass for {activation!r} is not implemented")

    dA_prev, dW, db = linear_backward(dZ, linear_cache)

    return dA_prev, dW, db

### Check This

This block was the hardest to understand – `go through and make sure everything is correct.`

In [190]:
def model_backward(AL, Y, caches, loss="mse", hidden_activation="relu", output_activation="softplus"):
    """
    Run the full backward pass, from the loss down to the first layer.

    Args:
    AL - output of the forward propagation (model_forward())
    Y - true "label" / target vector ; reshaped to AL's shape
    caches - list with one cache per layer, in forward order:
                caches[l] for l = 0 .. L-2 belong to the hidden layers,
                caches[L-1] belongs to the output layer
    loss - which loss gradient starts backprop:
                "mse" (default ; the loss this notebook says it uses) or "binary_cross_entropy"
    hidden_activation - activation name of the hidden layers (default "relu")
    output_activation - activation name of the output layer ; defaults to "softplus"
                        to match the default of the forward pass

    Returns:
    grads - dictionary ; gradients with respect to activations, weights, biases...
                         grads["dA" + str(l)] = gradient w.r.t. the activations of layer l (l = 0 is the input)
                         grads["dW" + str(l)] = gradient w.r.t. the weights of layer l
                         grads["db" + str(l)] = gradient w.r.t. the biases of layer l

    Raises:
    ValueError - if `loss` is not one of the supported names
    """

    grads = {}
    L = len(caches) # the number of layers
    Y = Y.reshape(AL.shape) # after this line, Y is the same shape as AL

    # Gradient of the loss with respect to the network output. The 1/m factor is
    # left out here because linear_backward applies it when forming dW and db.
    if loss == "mse":
        dAL = 2 * (AL - Y)
    elif loss == "binary_cross_entropy":
        # Derivative of -(Y log AL + (1 - Y) log(1 - AL)) ; note the leading minus sign.
        dAL = -(np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))
    else:
        raise ValueError(f"unsupported loss: {loss!r}")

    dA = dAL
    for i in reversed(range(L)): # Loop from l=L-1 (output layer) to l=0
        # The output layer must be undone with the activation used in the forward pass.
        activation = output_activation if i == L - 1 else hidden_activation
        dA, dW, db = linear_activation_backward(dA, caches[i], activation)

        grads["dA" + str(i)] = dA
        grads["dW" + str(i+1)] = dW
        grads["db" + str(i+1)] = db

    return grads

### Update Parameters
which gradient descent update rule to use? 

which learning rate to use?

In [197]:

def update_params(params, grads, learning_rate):
    """
    One step of plain gradient descent: new_params = params - grads * learning_rate

    Args:
    params - dict ; contains the (previous) parameters "W1", "B1", ..., "WL", "BL"
             (a lower-case "b{l}" bias key is also accepted)
    grads - dict ; contains the gradients calculated from backprop: "dW{l}" and "db{l}"
    learning_rate - step size for gradient descent

    Returns:
    params - dict ; the same dict object, with its entries replaced by the updated parameters
                params["W" + str(l)] = ...
                params["B" + str(l)] = ...
    """

    L = len(params) // 2 # params dict contains entries for both weights and biases

    for l in range(1, L + 1):
        w_key = f"W{l}"
        # initialize_params and model_forward store biases under "B{l}" ; the old
        # code looked up "b{l}" and raised KeyError. Fall back to "b{l}" for dicts
        # that really use the lower-case key.
        b_key = f"B{l}" if f"B{l}" in params else f"b{l}"

        params[w_key] = params[w_key] - grads[f"dW{l}"] * learning_rate
        params[b_key] = params[b_key] - grads[f"db{l}"] * learning_rate

    return params

### Define an optimizer 

see https://realpython.com/gradient-descent-algorithm-python/    (implementation of Basic Gradient Descent)

gradient function = model_backward(): this returns the gradients

start could be params?

learning rate = learning rate

n_iter?



### Build this into a NN class