In [171]:
import numpy as np
import warnings
import pandas as pd
from typing import Tuple
import matplotlib.pyplot as plt
# Small positive constant the functions below add inside logarithms, square roots
# and denominators so those operations never see an exact zero.
EPSILON = 1e-8

In [2]:
def initialize_parameters(layer_dims : np.ndarray) -> dict:
    """
    Description:
    Creates the initial weights and biases of a fully connected network.

    Input:
    layer_dims - the size of every layer, input layer first. A numpy array or any
                 plain sequence of ints (list / tuple) is accepted.

    Output:
    parameters - a dictionary holding, for every layer i >= 1:
                 "W{i}" - array of shape (layer_dims[i], layer_dims[i-1]) with standard-normal
                          samples scaled by sqrt(2 / layer_dims[i-1])
                 "b{i}" - array of zeros of shape (layer_dims[i], 1)
    """
    # Lists and tuples have no `.size`; normalise the input so both they and ndarrays work.
    layer_dims = np.asarray(layer_dims)
    parameters = {}

    for i in range(1, layer_dims.size):
        fan_in, fan_out = layer_dims[i - 1], layer_dims[i]
        # Scale by sqrt(2 / fan_in) so the spread of the weights shrinks as the layer input grows.
        parameters[f"W{i}"] = np.random.randn(fan_out, fan_in) * np.sqrt(2 / fan_in)
        parameters[f"b{i}"] = np.zeros((fan_out, 1))

    return parameters

In [3]:
def linear_forward(A : np.ndarray, W : np.ndarray, b : np.ndarray)->Tuple[np.ndarray, dict]:
    """
    Description:
    Linear part of a layer's forward pass: Z = W . A + b.

    Input:
    A - activations feeding the layer
    W - weight matrix of the layer
    b - bias vector of the layer

    Output:
    Z - the linear output of the layer
    cache - dictionary with the three inputs stored under the keys "A", "W" and "b"
    """
    # Keep references to the inputs; the backward pass reads them back by key.
    cache = dict(A=A, W=W, b=b)
    return np.dot(W, A) + b, cache

In [4]:
def Softmax(Z : np.ndarray)->Tuple[np.ndarray, dict]:
    """
    Description:
    Applies the softmax function to every column of Z.

    Input:
    Z - the linear output of the layer. The normalisation runs along axis 0, so for a
        2-D input every column is turned into its own probability vector.

    Output:
    A - the softmax activations, same shape as Z; every column sums to 1
    activation_cache - the untouched input Z, kept for the backward pass
    """
    # Subtracting the column maximum leaves the softmax value unchanged but keeps
    # np.exp from overflowing on large logits.
    shifted = Z - np.max(Z, axis=0, keepdims=True)
    exp_shifted = np.exp(shifted)

    # Normalise each column separately. After the shift the largest entry contributes
    # exp(0) = 1, so the denominator is at least 1 and needs no epsilon guard.
    A = exp_shifted / np.sum(exp_shifted, axis=0, keepdims=True)
    activation_cache = Z

    return A, activation_cache

In [5]:
def ReLu(Z : np.ndarray)->Tuple[np.ndarray, dict]:
    """
    Description:
    Applies the ReLU function element-wise: max(0, z).

    Input:
    Z - the linear output of the layer

    Output:
    A - the activations, with every negative entry of Z replaced by 0
    activation_cache - the input Z itself, kept for the backward pass
    """
    activation_cache = Z
    A = np.maximum(0, Z)
    return A, activation_cache

In [6]:
def linear_activation_forward(A_prev : np.ndarray, W : np.ndarray, b : np.ndarray, activation : str) -> Tuple[np.ndarray , dict]:
    """
    Description:
    Forward pass of one LINEAR -> ACTIVATION layer.

    Input:
    A_prev - activations of the previous layer
    W - weight matrix of the current layer
    b - bias vector of the current layer
    activation - name of the activation to apply: "relu" or "softmax"

    Output:
    A - the activations of the current layer
    cach - dictionary with the linear cache ("A", "W", "b") plus the activation cache under "Z"

    Raises:
    ValueError - when activation is neither "relu" nor "softmax"
    """
    # The linear step runs first, before the activation name is validated.
    Z, linear_part = linear_forward(A_prev, W, b)

    if activation == "relu":
        A, z_cache = ReLu(Z)
    elif activation == "softmax":
        A, z_cache = Softmax(Z)
    else:
        raise ValueError("The Activation code is not recognizable")

    # Merge the linear cache and the activation cache into one flat dictionary.
    return A, {**linear_part, "Z": z_cache}

In [7]:
def L_model_forward(X : np.ndarray, parameters : dict, use_batchnorm : bool) -> Tuple[np.ndarray, dict]:
    """
    Description:
    Forward pass through the whole network: (LINEAR -> RELU) for every hidden layer,
    followed by LINEAR -> SOFTMAX for the output layer.

    Input:
    X - the data, numpy array of shape (input size, number of examples).
        A 1-D array is treated as a single example and reshaped to a column.
    parameters - the W and b parameters of each layer, stored under the keys "W{i}" and "b{i}"
    use_batchnorm - when True, apply_batchnorm is applied to the output of every ReLU layer
                    (never to the softmax output)

    Output:
    AL - the last post-activation value
    caches - dictionary of the per-layer caches, keyed "Layer_{j}" for the hidden layers
             and "Activation Layer" for the output layer, in forward order
    """
    # For the case the number of examples is one
    if X.ndim == 1:
        X = X.reshape(-1, 1)

    caches = {}
    L = len(parameters) // 2  # every layer contributes one W and one b
    A_prev = X

    for j in range(1, L):
        A_prev, cach = linear_activation_forward(A_prev, parameters[f'W{j}'], parameters[f'b{j}'], 'relu')
        if use_batchnorm:
            # The next layer receives (and caches as its input "A") the normalised activations.
            # NOTE(review): the backward functions in this file do not differentiate
            # through the normalisation step.
            A_prev = apply_batchnorm(A_prev)
        caches[f'Layer_{j}'] = cach

    AL, cach = linear_activation_forward(A_prev, parameters[f'W{L}'], parameters[f'b{L}'], 'softmax')
    caches['Activation Layer'] = cach

    return AL, caches

In [101]:
def compute_cost(AL : np.ndarray,Y : np.ndarray)-> float:
    """
    Description:
    Computes the categorical cross-entropy cost, averaged over the examples.

    Input:
    AL - probability vector corresponding to your label predictions, shape (num_of_classes, number of examples)
    Y - the labels (i.e. the ground truth), broadcastable against AL
        (the training loop passes an array of the same shape as AL)

    Output:
    cost - the cross-entropy cost: -(1/m) * sum over classes and examples of Y * log(AL)
    """
    num_examples = np.shape(AL)[1]

    # Pair every label with the prediction for the SAME class and example (element-wise
    # product). A matrix product would also add terms that mix different classes.
    # EPSILON keeps the logarithm finite when a predicted probability is exactly 0.
    cost = -np.sum(Y * np.log(AL + EPSILON))
    return cost / num_examples

In [9]:
def apply_batchnorm(A : np.ndarray)-> np.ndarray:
    """
    Description:
    performs batchnorm on the received activation values of a given layer.

    Input:
    A - the activation values of a given layer, laid out like the rest of this file:
        one row per unit and one column per example

    output:
    NA - the normalized activation values, same shape as A: every unit is shifted to zero
         mean and scaled to (approximately) unit variance across the examples of the batch
    """
    # The examples live on the last axis, so the batch statistics are taken along it;
    # keepdims lets them broadcast back against A. For a 1-D input the last axis is the
    # only axis.
    mu = np.mean(A, axis=-1, keepdims=True)
    var = np.var(A, axis=-1, keepdims=True)

    # EPSILON guards against division by zero for units that are constant over the batch.
    A_normalized = (A - mu) / np.sqrt(var + EPSILON)

    return A_normalized

Backward propagation

In [10]:
def linear_backward(dZ : np.ndarray, cache : dict) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Description:
    Linear part of the backward pass for a single layer.

    Input:
    dZ - the gradient of the cost with respect to the linear output of the current layer (layer l)
    cache - dictionary with the keys "A" (the layer input A_prev), "W" and "b" stored
            during the forward propagation of the current layer

    Output:
    dA_prev -- gradient of the cost with respect to the layer input: W.T . dZ
    dW -- gradient of the cost with respect to W: (1/m) * dZ . A_prev.T
    db -- gradient of the cost with respect to b: (1/m) * row sums of dZ, kept as a column
    """
    # "b" is looked up as well so that an incomplete cache fails here with a KeyError.
    A_prev, W, _ = (cache[key] for key in ('A', 'W', 'b'))

    # m is the number of columns of the layer input
    scale = 1 / A_prev.shape[1]

    grad_input = np.dot(W.T, dZ)
    grad_W = scale * np.dot(dZ, A_prev.T)
    grad_b = scale * np.sum(dZ, axis=1, keepdims=True)
    return grad_input, grad_W, grad_b



In [11]:
def relu_backward(dA : np.ndarray, activation_cache : np.ndarray) -> np.ndarray:
    """
    Description:
    Backward propagation through a ReLU unit.

    Input:
    dA - the post-activation gradient
    activation_cache - contains Z (stored during the forward propagation)

    Output:
    dZ - gradient of the cost with respect to Z: a copy of dA with the entries at
         positions where Z <= 0 set to 0
    """
    # Work on a copy so the caller's dA is left untouched.
    dZ = np.copy(dA)

    # ReLU passes no gradient wherever its input was not strictly positive.
    blocked = activation_cache <= 0
    dZ[blocked] = 0

    return dZ

In [103]:
def softmax_backward(dA : np.ndarray, activation_cache : np.ndarray) -> np.ndarray:
    """
    Description:
    Implements backward propagation for a softmax unit combined with the cross-entropy cost.

    Input:
    dA - the "post-activation gradient". With the cross-entropy cost this is the
         ground-truth label array y itself (that is what L_model_backward passes in)
    activation_cache - contains Z (stored during the forward propagation)

    Output:
    dZ - gradient of the cost with respect to Z, computed as p - y where p = softmax(Z)
         taken column by column
    """
    Z = activation_cache

    # Recompute softmax(Z) per column. Subtracting the column maximum does not change
    # the result but prevents np.exp from overflowing; it also makes the denominator
    # at least 1, so no epsilon guard is needed.
    expZ = np.exp(Z - np.max(Z, axis=0, keepdims=True))
    p = expZ / np.sum(expZ, axis=0, keepdims=True)

    # dZ = p - y (dA = y when using cross-entropy)
    dZ = p - dA

    return dZ

In [13]:
def linear_activation_backward(dA : np.ndarray, cache : dict, activation : str) -> Tuple[np.ndarray,np.ndarray,np.ndarray] :
    """
    Description:
    Backward propagation for one LINEAR -> ACTIVATION layer: dZ is computed first from the
    activation cache, then handed to linear_backward together with the linear cache.

    Some comments:
        The derivative of ReLU is f'(x) = 1 if x > 0, and 0 otherwise.
        The derivative of the softmax function is p_i - y_i, where p_i is the softmax-adjusted
        probability of the class and y_i is the "ground truth" (i.e. 1 for the real class, 0 for all others).

    Input:
    dA - post activation gradient of the current layer
    cache - contains both the linear cache (keys "A", "W", "b") and the activation cache (key "Z")
    activation - name of the layer's activation: "relu" or "softmax"

    Output:
    dA_prev - Gradient of the cost with respect to the activation (of the previous layer l-1), same shape as A_prev
    dW - Gradient of the cost with respect to W (current layer l), same shape as W
    db - Gradient of the cost with respect to b (current layer l), same shape as b

    Raises:
    ValueError - when activation is neither "relu" nor "softmax"
    """
    # Split the flat layer cache back into its linear part and its activation part.
    linear_cache = {key: cache[key] for key in ('A', 'W', 'b')}
    activation_cache = cache['Z']

    if activation == "softmax":
        dZ = softmax_backward(dA, activation_cache)
    elif activation == "relu":
        dZ = relu_backward(dA, activation_cache)
    else:
        raise ValueError("The Activation code is not recognizable")

    return linear_backward(dZ, linear_cache)


In [98]:
def L_model_backward(AL : np.ndarray, Y : np.ndarray, caches : dict) -> dict:
    """
    Description:
    Implement the backward propagation process for the entire network.

    Some comments:
    The backpropagation for the softmax function is done only once, as only the output layer
    uses it; the ReLU backpropagation is done iteratively over all the remaining layers.

    Input:
    AL - the probabilities vector, the output of the forward propagation (L_model_forward).
         It is accepted for interface compatibility; the gradients are derived from Y and caches.
    Y - the true labels vector (the "ground truth" - true classifications)
    caches - dictionary of per-layer caches in forward order (the last entry is the softmax
             layer); each cache holds a) the linear cache and b) the activation cache

    Output:
    grads - a dictionary with the gradients, for every layer l = 1..L:
                grads["dA" + str(l)] = gradient with respect to the INPUT of layer l
                                       (i.e. the activations of layer l-1)
                grads["dW" + str(l)] = gradient with respect to W of layer l
                grads["db" + str(l)] = gradient with respect to b of layer l
    """
    grads = {}

    # Dictionaries keep insertion order, so position k in this list is layer k+1.
    layer_keys = list(caches)
    L = len(layer_keys)  # the number of layers

    # Output layer (softmax -> linear). softmax_backward expects the labels in place of dA.
    grads[f"dA{L}"], grads[f"dW{L}"], grads[f"db{L}"] = linear_activation_backward(
        Y, caches[layer_keys[-1]], "softmax")

    # Hidden layers (ReLU -> linear), from layer L-1 down to layer 1. Each one consumes the
    # input-gradient produced by the layer right after it.
    for layer in range(L - 1, 0, -1):
        dA_prev_temp, dW_temp, db_temp = linear_activation_backward(
            grads[f"dA{layer + 1}"], caches[layer_keys[layer - 1]], "relu")
        grads[f"dA{layer}"] = dA_prev_temp
        grads[f"dW{layer}"] = dW_temp
        grads[f"db{layer}"] = db_temp

    return grads

In [15]:
def update_parameters(parameters : dict, grads : dict, learning_rate : int) -> dict:
    """
    Description:
    One gradient-descent step over all weights and biases.

    Input:
    parameters - a python dictionary containing the DNN architecture's parameters ("W{l}", "b{l}")
    grads - a python dictionary containing the gradients ("dW{l}", "db{l}"), as generated by L_model_backward
    learning_rate - the learning rate used to update the parameters (the "alpha")

    Output:
    parameters - the very same dictionary object that was passed in; its entries are
                 updated in place with value -= learning_rate * gradient
    """
    num_layers = len(parameters) // 2  # one W and one b per layer

    for idx in range(1, num_layers + 1):
        # W first, then b, for each layer; the augmented assignment updates in place.
        for prefix in ("W", "b"):
            parameters[f"{prefix}{idx}"] -= learning_rate * grads[f"d{prefix}{idx}"]

    return parameters

In [96]:
def L_layer_model(X : np.ndarray, Y : np.ndarray, layers_dims : list , learning_rate : int , num_iterations : int, batch_size : int) -> Tuple[dict, np.ndarray]:
    """
    Description:
    Implements a L-layer neural network. All layers but the last have the ReLU activation function,
    and the final layer applies the softmax activation function. The size of the output layer should be
    equal to the number of labels in the data.

    The function uses the earlier functions in the following order:
    initialize -> L_model_forward -> compute_cost -> L_model_backward -> update parameters

    Input:
    X - the input data, a numpy array of shape (height*width , number_of_examples)
    Y - the "real" labels of the data, a vector of shape (num_of_classes, number of examples)
    layers_dims - a list (or numpy array) containing the dimensions of each layer, including the input
    learning_rate - the step size handed to update_parameters
    num_iterations - total number of mini-batch updates to perform; must be a multiple of the
                     number of batches per epoch (number_of_examples // batch_size)
    batch_size - the number of examples in a single training batch. Examples that do not fill
                 a whole final batch are not used.

    Output:
    parameters - the parameters learnt by the system during the training
    (the same parameters that were updated in the update_parameters function).
    costs - list of the values of the cost function (calculated by the compute_cost function).
    One value is saved after each 100 training iterations (e.g. 3000 iterations -> 30 values);
    it is the cost of that iteration's mini-batch, measured before the parameter update.

    Raises:
    ValueError - when batch_size exceeds the number of examples, or when num_iterations is not
                 a multiple of the number of batches per epoch
    """
    num_batches = X.shape[1] // batch_size
    if num_batches == 0:
        raise ValueError("batch_size is larger than the number of examples")
    if num_iterations % num_batches != 0:
        raise ValueError("num_iterations must be a multiple of the number of batches per epoch")
    num_epochs = num_iterations // num_batches

    cost_array = []
    # initialize_parameters reads `.size`, so hand it an array even if a list was given.
    parameters = initialize_parameters(np.asarray(layers_dims))

    iteration = 0  # number of parameter updates performed so far
    for _ in range(num_epochs):
        for batch_idx in range(num_batches):
            # Walk over the examples in consecutive, non-overlapping slices.
            start = batch_idx * batch_size
            stop = start + batch_size
            X_mini_batch = X[:, start:stop]
            Y_mini_batch = Y[:, start:stop]

            AL, caches = L_model_forward(X_mini_batch, parameters, False)
            grads = L_model_backward(AL, Y_mini_batch, caches)
            parameters = update_parameters(parameters, grads, learning_rate)

            iteration += 1
            if iteration % 100 == 0:
                cost_array.append(compute_cost(AL, Y_mini_batch))

    return parameters, cost_array

In [133]:
def Predict(X : np.ndarray, Y : np.ndarray, parameters : dict) -> float:
    """
    Description:
    The function receives an input data and the true labels and calculates the accuracy of the trained neural network on the data.

    Input:
    X - the input data, a numpy array of shape (height*width, number_of_examples)
    Y - the "real" labels of the data, a vector of shape (num_of_classes, number of examples)
    parameters - a python dictionary containing the DNN architecture's parameters

    Output:
    accuracy - the fraction (between 0 and 1) of the samples for which the correct label
    receives the highest confidence score.
    """
    # L_model_forward already ends with a softmax layer, so AL holds normalised scores.
    # Softmax returns an (activations, cache) pair, so its result is not an array that
    # argmax can be applied to directly.
    AL, _ = L_model_forward(X, parameters, False)

    predicted_class = np.argmax(AL, axis=0)  # per example: index of the most probable class
    true_class = np.argmax(Y, axis=0)        # per example: index of the labelled class

    num_correct_pred = np.sum(predicted_class == true_class)
    num_examples = Y.shape[1]

    return float(num_correct_pred / num_examples)


In [179]:
# Synthetic data for exercising the network: 3 input features, 10 output units, 1000 examples.
layer_dims = np.array([3, 20, 7, 5, 10])
# One column per example, one row per input feature; integer entries drawn from {0, 1, 2}.
X = np.random.randint(low=0, high=3, size=(layer_dims[0], 1000))
# NOTE(review): Y holds random integers from {0, 1, 2} in every cell, not one-hot labels.
Y = np.random.randint(low=0, high=3, size=(layer_dims[-1], X.shape[1]))