## Imports

In [1]:
import numpy as np

## Activation functions

In [2]:
    """
    Calculates and returns the depth of the tree
        Parameters:
            tree (<DecisionTreeClassifier> object): tree object to draw (optional)
        Returns:
            (int): depth of the tree
    """

'\nCalculates and returns the depth of the tree\n    Parameters:\n        tree (<DecisionTreeClassifier> object): tree object to draw (optional)\n    Returns:\n        (int): depth of the tree\n'

In [3]:
def relu(Z):
    """
    Applies the ReLU (Rectified Linear Unit) activation element-wise.
        Parameters:
            Z (<numpy.ndarray>): pre-activation values
        Returns:
            A (<numpy.ndarray>): element-wise maximum of 0 and Z
            cache (<numpy.ndarray>): the input Z itself (for backward propagation)
    """
    # The untouched input doubles as the cache needed by the backward pass.
    return (np.maximum(0, Z), Z)


def sigmoid(Z):
    """
    Applies the logistic sigmoid activation, 1 / (1 + e^(-Z)), element-wise.
        Parameters:
            Z (<numpy.ndarray>): pre-activation values
        Returns:
            A (<numpy.ndarray>): Z passed through the sigmoid
            cache (<numpy.ndarray>): the input Z itself (for backward propagation)
    """
    exp_neg = np.exp(-Z)

    # The untouched input doubles as the cache needed by the backward pass.
    return (1 / (1 + exp_neg), Z)

## Weights Initialization

### TODO: add different initialization schemes
Candidates: He, Xavier, random, zeros?

https://datascience-enthusiast.com/DL/Improving-DeepNeural-Networks-Initialization.html

In [4]:
def initialize_params(layer_dims):
    """
    Initializes the weights for the (deep) neural network layers using
    Xavier's (Glorot) uniform initialization; biases start at zero.
        Parameters:
            layer_dims (list): list of layers' number of nodes (including input layer)
        Returns:
            params (dict): dictionary containing weights and bias per layer
                "Wn": <numpy.ndarray> weights for layer n,
                      shape (layer_dims[n], layer_dims[n-1])
                "bn": <numpy.ndarray> bias for layer n, shape (layer_dims[n], 1)
            An empty dict is returned when fewer than two layer sizes are given.
    """
    params = {}

    # Pair every layer size with the next one; layers are numbered from 1
    # because index 0 of layer_dims is the input layer and has no weights.
    layer_pairs = zip(layer_dims, layer_dims[1:])
    for layer, (fan_in, fan_out) in enumerate(layer_pairs, start=1):
        # Xavier uniform draws from U(-limit, limit) so the weights are
        # centred on zero instead of being all positive.
        limit = np.sqrt(6 / (fan_in + fan_out))
        params[f"W{layer}"] = np.random.uniform(-limit, limit,
                                                size=(fan_out, fan_in))
        params[f"b{layer}"] = np.zeros((fan_out, 1))

    return params

## Forward Propagation

In [5]:
def forward_propagate_layer(A_prev, W, b, activate_func):
    """
    Runs a single layer forward: the linear step W @ A_prev + b followed by
    the activation.
        Parameters:
            A_prev (<numpy.ndarray>): this layer's input (last layer's output)
            W (<numpy.ndarray>): weights of this layer
            b (<numpy.ndarray>): bias of this layer
            activate_func (<function>): activation function; called with Z and
                expected to return (A, activation_cache)
        Returns:
            A (<numpy.ndarray>): layer output (post-activation)
            cache (tuple): forward propagation caches for backward
                ((A_prev, W, b), (activation_cache, activation_name))
    """
    pre_activation = W @ A_prev + b
    A, activation_cache = activate_func(pre_activation)

    # The activation is recorded by name rather than by reference.
    activation_part = (activation_cache, activate_func.__name__)
    linear_part = (A_prev, W, b)

    return (A, (linear_part, activation_part))

In [6]:
def forward_propagate(X, params):
    """
    Pushes X through every layer of the model, in order.
        Parameters:
            X (<numpy.ndarray>): model input (fed to the first layer)
            params (dict): dictionary containing weights, bias and activation per layer
                "Wn": <numpy.ndarray> weights for layer n
                "bn": <numpy.ndarray> bias for layer n
                "An": (<function>): activation function for layer n
        Returns:
            A (<numpy.ndarray>): model output (X itself when params is empty)
            caches (list): one cache per layer, in forward order, as returned
                by forward_propagate_layer
    """
    # Each layer contributes exactly three entries (W, b, A) to params.
    depth = len(params) // 3

    caches = []
    A = X
    for idx in range(depth):
        n = idx + 1
        A, layer_cache = forward_propagate_layer(
            A, params[f"W{n}"], params[f"b{n}"], params[f"A{n}"]
        )
        caches.append(layer_cache)

    return (A, caches)

## Cost Computation

In [11]:
def compute_cost(Yh, Y):
    """
    Computes cost using the cross-entropy / log-loss function.
    Predictions are clipped away from exactly 0 and 1 (by machine epsilon)
    so that saturated outputs give a finite cost instead of NaN / inf.
    Parameters:
            Yh (<numpy.ndarray>): predicted output (y_hat), values in [0, 1]
            Y (<numpy.ndarray>): true output (y); Y.shape[1] is used as the
                number of samples (presumably Y and Yh are (1, m) -- confirm
                against callers)
        Returns:
            cost (<numpy.ndarray>): cost value, squeezed (0-d for (1, m) inputs)
    """
    Yh = np.asarray(Yh)
    if not np.issubdtype(Yh.dtype, np.floating):
        Yh = Yh.astype(float)

    # log(0) is -inf and 0 * -inf is NaN, so keep predictions strictly
    # inside (0, 1); values already inside that range are left unchanged.
    eps = np.finfo(Yh.dtype).eps
    Yh = np.clip(Yh, eps, 1 - eps)

    cost = ((Y @ np.log(Yh.T)) + ((1 - Y) @ np.log((1 - Yh).T))) / (-Y.shape[1])
    cost = np.squeeze(cost)

    return cost

## Backward Propagation

In [70]:
def backward_propagate_layer(dA, cache):
    """
    Runs a single layer backward: first through the activation, then through
    the linear step.
    Parameters:
            dA (<numpy.ndarray>): current layer's post-activation gradient
            cache (tuple): forward propagation caches for backward
                ((A_prev, W, b), (activation_cache, activation_name));
                activation_name must be 'relu' or 'sigmoid'
        Returns:
            dA_prev (<numpy.ndarray>): Gradient with respect to this layer's input (A_prev)
            dW (<numpy.ndarray>): Gradient with respect to current layer's weights (W)
            db (<numpy.ndarray>): Gradient with respect to current layer's bias (b)
        Raises:
            KeyError: if activation_name is not a supported activation
    """
    def relu_backward(upstream, z):
        """
        Backward pass of ReLU.
        Parameters:
            upstream (<numpy.ndarray>): post-activation gradient
            z (<numpy.ndarray>): activation input (Z)
        Returns:
            (<numpy.ndarray>): Gradient with respect to activation input (Z)
        """
        # Work on a copy so the caller's gradient array is not modified.
        grad = np.array(upstream, copy=True)
        # No gradient flows where the forward input was non-positive.
        blocked = z <= 0
        grad[blocked] = 0
        return grad

    def sigmoid_backward(upstream, z):
        """
        Backward pass of the sigmoid.
        Parameters:
            upstream (<numpy.ndarray>): post-activation gradient
            z (<numpy.ndarray>): activation input (Z)
        Returns:
            (<numpy.ndarray>): Gradient with respect to activation input (Z)
        """
        activated = sigmoid(z)[0]
        return upstream * activated * (1 - activated)

    (A_prev, W, b), (activation_cache, activation_name) = cache

    # Pick the activation's backward rule from the name stored in the cache.
    backward_by_name = {'relu': relu_backward,
                        'sigmoid': sigmoid_backward}
    dZ = backward_by_name[activation_name](dA, activation_cache)

    # Gradients of the linear step, averaged over the columns of A_prev.
    n_samples = A_prev.shape[1]
    dW = (dZ @ A_prev.T) / n_samples
    db = dZ.sum(axis=1, keepdims=True) / n_samples
    dA_prev = W.T @ dZ

    return (dA_prev, dW, db)

In [71]:
def backward_propagate(Yh, Y, caches):
    """
    Backward propagates Error through all model layers.
    The starting gradient is the derivative of the cross-entropy cost with
    respect to the prediction, (Yh - Y) / (Yh * (1 - Yh)). Predictions are
    clipped away from exactly 0 and 1 (by machine epsilon) so that saturated
    outputs do not cause a division by zero and NaN / inf gradients.
    Parameters:
            Yh (<numpy.ndarray>): predicted output (y_hat), values in [0, 1]
            Y (<numpy.ndarray>): true output (y)
            caches (list): forward propagation caches, one per layer in
                forward order, as consumed by backward_propagate_layer
        Returns:
            grads (dict): dictionary containing parameters' gradients
                "dAn": <numpy.ndarray> gradient of layer n's output (*deprecated)
                "dWn": <numpy.ndarray> gradient of weights for layer n
                "dbn": <numpy.ndarray> gradient of bias for layer n
    """
    grads = {}
    nlayers = len(caches)

    Yh = np.asarray(Yh)
    if not np.issubdtype(Yh.dtype, np.floating):
        Yh = Yh.astype(float)

    # Keep predictions strictly inside (0, 1) so the denominator below is
    # never zero; values already inside that range are left unchanged.
    eps = np.finfo(Yh.dtype).eps
    Yh = np.clip(Yh, eps, 1 - eps)

    grads[f"dA{nlayers}"] = (Yh - Y) / ((1 - Yh) * Yh)

    # Walk the layers from last to first, feeding each layer's input
    # gradient to the layer before it.
    for layer in range(nlayers, 0, -1):
        dA_prev, dW, db = backward_propagate_layer(grads[f"dA{layer}"],
                                                   caches[layer - 1])
        grads[f"dA{layer - 1}"] = dA_prev
        grads[f"dW{layer}"] = dW
        grads[f"db{layer}"] = db

    return grads

## Update Parameters (with Gradient Descent)

In [88]:
def update_params(params, grads, lr):
    """
    Apply Gradient Descent to update parameters using
        computed gradients and learning rate.
    The weight and bias arrays are updated in place and the same dictionary
    is returned.
    Parameters:
        params (dict): dictionary containing weights and bias per layer
                    "Wn": <numpy.ndarray> weights for layer n
                    "bn": <numpy.ndarray> bias for layer n
                    "An": (<function>): activation function (optional, untouched)
        grads (dict): dictionary containing parameters' gradients
                    "dWn": <numpy.ndarray> gradient of weights for layer n
                    "dbn": <numpy.ndarray> gradient of bias for layer n
        lr (float): learning rate
    Returns:
        params (dict): *updated dictionary (same object as the input)
                    "Wn": <numpy.ndarray> weights for layer n
                    "bn": <numpy.ndarray> bias for layer n
                    "An": (<function>): activation function
    """
    # Count layers by their weight entries so the result does not depend on
    # which other per-layer keys (e.g. activations) are present in params.
    nlayers = sum(1 for key in params
                  if isinstance(key, str) and key.startswith("W"))

    for layer in range(1, nlayers + 1):
        params[f"W{layer}"] -= lr * grads[f"dW{layer}"]
        params[f"b{layer}"] -= lr * grads[f"db{layer}"]

    return params