In [3]:
import numpy as np
import matplotlib.pyplot as plt

In [4]:
def initialize_parameters(layer_dims, activations, initialization):
    """Create the weight matrices and bias vectors for every layer.

    Args:
        layer_dims: Sequence of layer sizes; ``layer_dims[0]`` is the input
            size and ``layer_dims[i]`` the number of units in layer ``i``.
        activations: Activation name per layer; ``activations[i - 1]`` is the
            activation of layer ``i``. Only read when ``initialization`` is
            truthy.
        initialization: Truthy selects variance-scaled initialisation, falsy
            selects small random weights (standard normal times 0.01).

    Returns:
        dict with ``'W<i>'`` of shape ``(layer_dims[i], layer_dims[i - 1])``
        and ``'b<i>'`` zero arrays of shape ``(layer_dims[i], 1)``.
    """
    params = {}
    for i in range(1, len(layer_dims)):
        fan_in = layer_dims[i - 1]
        if initialization:
            # He scaling (2 / fan_in) suits rectifier units, which zero out
            # roughly half of their inputs; Xavier-style scaling (1 / fan_in)
            # is used for linear and saturating (sigmoid / tanh) units.
            if activations[i - 1] in ('relu', 'leakyrelu'):
                scale = np.sqrt(2 / fan_in)
            else:
                scale = np.sqrt(1 / fan_in)
        else:
            scale = 0.01
        params['W' + str(i)] = np.random.randn(layer_dims[i], fan_in) * scale
        params['b' + str(i)] = np.zeros((layer_dims[i], 1))
    return params

In [5]:
def linear_activation(A, W, b):
    """Compute the affine pre-activation ``W . A + b`` of one layer.

    Returns:
        tuple ``(Z, cache)`` where ``cache`` is ``(A, W, b)``, i.e. the exact
        inputs, kept for the backward pass.
    """
    pre_activation = np.dot(W, A) + b
    return pre_activation, (A, W, b)

In [6]:
def sigmoid(Z):
    """Apply the logistic function ``1 / (1 + exp(-Z))`` element-wise.

    Returns:
        tuple ``(A, cache)``; ``cache`` is the untouched input ``Z``.
    """
    squashed = 1 / (1 + np.exp(-Z))
    return squashed, Z

In [7]:
def relu(Z):
    """Apply the rectified linear unit ``max(0, Z)`` element-wise.

    Returns:
        tuple ``(A, cache)``; ``cache`` is the untouched input ``Z``.
    """
    rectified = np.maximum(0, Z)
    return rectified, Z

In [8]:
def tanh(Z):
    """Apply the hyperbolic tangent element-wise.

    Returns:
        tuple ``(A, cache)``; ``cache`` is the untouched input ``Z``.
    """
    return np.tanh(Z), Z

In [9]:
def leakyrelu(Z):
    """Apply the leaky ReLU ``max(0.01 * Z, Z)`` element-wise.

    Returns:
        tuple ``(A, cache)``; ``cache`` is the untouched input ``Z``.
    """
    leaked = 0.01 * Z
    return np.maximum(leaked, Z), Z

In [10]:
def single_layer_forward(A_prev, W, b, activation, keep_prob):
    """Run one layer forward: affine step, activation, optional dropout.

    Args:
        A_prev: Activations feeding this layer.
        W: Weight matrix of this layer.
        b: Bias of this layer.
        activation: One of 'sigmoid', 'relu', 'tanh', 'leakyrelu', 'linear'.
        keep_prob: Probability of keeping a unit; exactly 1 disables dropout.

    Returns:
        tuple ``(A, cache)``. The linear part of the cache is
        ``(A_prev, W, b)`` without dropout and ``(D, A_prev, W, b)`` with
        dropout, where ``D`` is the 0/1 keep mask. For 'linear' the cache is
        that linear cache alone; for every other activation it is
        ``(linear_cache, activation_cache)``.

    Raises:
        ValueError: If ``activation`` is not one of the supported names.
    """
    activation_fns = {
        'sigmoid': sigmoid,
        'relu': relu,
        'tanh': tanh,
        'leakyrelu': leakyrelu,
    }
    is_linear = activation == 'linear'
    if not is_linear and activation not in activation_fns:
        # Previously an unknown name surfaced as an UnboundLocalError on `A`.
        raise ValueError('Unsupported activation: ' + repr(activation))

    Z, linear_cache = linear_activation(A_prev, W, b)
    if is_linear:
        A, activation_cache = Z, None
    else:
        A, activation_cache = activation_fns[activation](Z)

    if keep_prob != 1:
        # Inverted dropout: zero out units, then rescale the survivors so the
        # expected activation is unchanged.
        D = np.random.rand(A.shape[0], A.shape[1])
        D = (D < keep_prob).astype(int)
        A = D * A
        A /= keep_prob
        cached_A_prev, cached_W, cached_b = linear_cache
        linear_cache = (D, cached_A_prev, cached_W, cached_b)

    if is_linear:
        # The linear layer has no activation cache; keep the flat layout the
        # backward pass expects.
        return A, linear_cache
    return A, (linear_cache, activation_cache)

In [11]:
def n_layer_forward(X, layer_dims, parameters, activations, keep_prob):
    """Propagate ``X`` through every layer and collect the per-layer caches.

    Dropout uses ``keep_prob[i - 1]`` for layer ``i`` on all layers except the
    last one, which is always run with a keep probability of 1.

    Returns:
        tuple ``(A, caches)``: the output of the final layer and the list of
        caches in layer order.
    """
    output_layer = len(layer_dims) - 1
    caches = []
    current = X
    for layer in range(1, len(layer_dims)):
        suffix = str(layer)
        current, layer_cache = single_layer_forward(
            current,
            parameters['W' + suffix],
            parameters['b' + suffix],
            activations[layer - 1],
            keep_prob[layer - 1] if layer < output_layer else 1,
        )
        caches.append(layer_cache)
    return current, caches

In [12]:
def compute_cost(AL, Y, activation):
    """Compute the scalar training cost for the network output.

    Args:
        AL: Output of the final layer.
        Y: Targets; ``Y.shape[1]`` is used as the number of examples ``m``.
        activation: Name of the output activation. 'linear' selects half mean
            squared error; anything else selects binary cross-entropy.

    Returns:
        The cost after ``np.squeeze``.
    """
    m = Y.shape[1]
    if activation == 'linear':
        cost = (1 / (2 * m)) * np.sum((AL - Y) ** 2)
    else:
        AL = np.asarray(AL)
        # Keep predictions strictly inside (0, 1): a saturated output of
        # exactly 0 or 1 would otherwise give log(0) and turn the cost into
        # NaN/inf (0 * -inf) even for a perfect prediction.
        float_dtype = AL.dtype if np.issubdtype(AL.dtype, np.floating) else np.float64
        eps = np.finfo(float_dtype).eps
        probs = np.clip(AL, eps, 1 - eps)
        cost = (-1 / m) * np.sum(
            np.multiply(Y, np.log(probs)) + np.multiply(1 - Y, np.log(1 - probs))
        )
    return np.squeeze(cost)

In [36]:
def linear_backward(dZ, cache, activation, keep_prob):
    """Backpropagate through the affine step of one layer.

    Args:
        dZ: Gradient of the cost with respect to the layer's pre-activation.
        cache: ``(A_prev, W, b)`` when ``keep_prob`` is 1, otherwise
            ``(D, A_prev, W, b)`` with the dropout mask in front.
        activation: Not used by this function.
        keep_prob: Only used to decide which cache layout to unpack.

    Returns:
        tuple ``(dA_prev, dW, db)``; ``dW`` and ``db`` are averaged over
        ``A_prev.shape[1]`` columns.
    """
    if keep_prob == 1:
        A_prev, W, _bias = cache
    else:
        _mask, A_prev, W, _bias = cache
    inv_m = 1 / A_prev.shape[1]
    grad_W = inv_m * np.dot(dZ, A_prev.T)
    grad_b = inv_m * np.sum(dZ, axis=1, keepdims=True)
    grad_A_prev = np.dot(W.T, dZ)
    return grad_A_prev, grad_W, grad_b

In [14]:
def linear_derivative(dA):
    """Backward step of the identity activation: the gradient passes through.

    Returns:
        The very same object that was passed in as ``dA``.
    """
    return dA

In [15]:
def sigmoid_derivative(dA, cache):
    """Backward step of the sigmoid activation.

    Args:
        dA: Gradient with respect to the activation output.
        cache: The pre-activation ``Z`` stored during the forward pass.

    Returns:
        ``dA * s * (1 - s)`` with ``s = sigmoid(Z)``.
    """
    sig = 1 / (1 + np.exp(-cache))
    local_slope = np.multiply(sig, 1 - sig)
    return dA * local_slope

In [16]:
def relu_derivative(dA, cache):
    """Backward step of the ReLU activation.

    Args:
        dA: Gradient with respect to the activation output.
        cache: The pre-activation ``Z`` stored during the forward pass.

    Returns:
        A copy of ``dA`` with entries set to 0 wherever ``Z <= 0``.
    """
    inactive = cache <= 0
    grad = np.array(dA, copy=True)  # copy so the caller's dA stays intact
    grad[inactive] = 0
    return grad

In [17]:
def leakyrelu_derivative(dA, cache):
    """Backward step of the leaky ReLU activation (slope 0.01 for Z <= 0).

    Args:
        dA: Gradient with respect to the activation output.
        cache: The pre-activation ``Z`` stored during the forward pass.

    Returns:
        A new array equal to ``dA`` where ``Z > 0`` and ``0.01 * dA``
        elsewhere. ``dA`` itself is not modified.
    """
    Z = np.asarray(cache)
    upstream = np.asarray(dA)
    # Chain rule: the local slope (0.01) must scale the upstream gradient,
    # not replace it with the constant 0.01.
    return np.where(Z <= 0, 0.01 * upstream, upstream)

In [18]:
def tanh_derivative(dA, cache):
    """Backward step of the tanh activation.

    Args:
        dA: Gradient with respect to the activation output.
        cache: The pre-activation ``Z`` stored during the forward pass.

    Returns:
        ``dA * (1 - tanh(Z) ** 2)``.
    """
    Z = cache
    A = np.tanh(Z)
    # Chain rule: multiply the local derivative by the upstream gradient.
    # Returning only 1 - A**2 would discard everything learned downstream.
    dZ = dA * (1 - np.power(A, 2))
    return dZ

In [19]:
def single_layer_backward(dA, caches, activation, keep_prob):
    """Backpropagate through one layer: dropout, activation, affine step.

    Args:
        dA: Gradient with respect to this layer's (post-dropout) output.
        caches: The cache produced by the forward pass for this layer. For
            'linear' it is the linear cache alone; otherwise it is
            ``(linear_cache, activation_cache)``. When ``keep_prob`` is not 1
            the linear cache is ``(D, A_prev, W, b)`` with the dropout mask
            ``D`` first.
        activation: One of 'sigmoid', 'relu', 'tanh', 'leakyrelu', 'linear'.
        keep_prob: Keep probability used in the forward pass; exactly 1 means
            no dropout was applied.

    Returns:
        tuple ``(dA_prev, dW, db)``.

    Raises:
        ValueError: If ``activation`` is not one of the supported names.
    """
    derivative_fns = {
        'sigmoid': sigmoid_derivative,
        'tanh': tanh_derivative,
        'relu': relu_derivative,
        'leakyrelu': leakyrelu_derivative,
    }
    is_linear = activation == 'linear'
    if not is_linear and activation not in derivative_fns:
        # Previously an unknown name left dA_prev unbound and surfaced as an
        # UnboundLocalError at the return statement.
        raise ValueError('Unsupported activation: ' + repr(activation))

    if is_linear:
        linear_cache, activation_cache = caches, None
    else:
        linear_cache, activation_cache = caches

    if keep_prob != 1:
        # Undo inverted dropout: reuse the forward mask and the same scaling.
        D, _A_prev, _W, _b = linear_cache
        dA = dA * D
        dA /= keep_prob

    if is_linear:
        dZ = linear_derivative(dA)
    else:
        dZ = derivative_fns[activation](dA, activation_cache)

    dA_prev, dW, db = linear_backward(dZ, linear_cache, activation, keep_prob)
    return dA_prev, dW, db

In [54]:
def n_layer_backward(X, Y, A, layer_dims, caches, activations, keep_prob):
    """Backpropagate from the network output down to the first layer.

    The starting gradient is ``A - Y`` when the last activation is 'linear'
    and ``-(Y / A - (1 - Y) / (1 - A))`` otherwise. The last layer is always
    processed with a keep probability of 1; layer ``i`` below it uses
    ``keep_prob[i - 1]``. ``X`` is accepted but not used.

    Returns:
        dict ``grads`` with keys ``'dA<i>'``, ``'dW<i>'`` and ``'db<i>'``.
    """
    top = len(layer_dims) - 1
    if activations[-1] == 'linear':
        output_grad = A - Y
    else:
        output_grad = - (np.divide(Y, A) - np.divide(1 - Y, 1 - A))
    grads = {'dA' + str(top): output_grad}

    # `step` counts how many layers have been processed; caches and
    # activations are read from their tail end in lockstep with it.
    for step, layer in enumerate(range(top, 0, -1)):
        dA_prev, dW, db = single_layer_backward(
            grads['dA' + str(layer)],
            caches[len(caches) - 1 - step],
            activations[len(activations) - 1 - step],
            keep_prob[layer - 1] if layer < top else 1,
        )
        grads['dA' + str(layer - 1)] = dA_prev
        grads['dW' + str(layer)] = dW
        grads['db' + str(layer)] = db
    return grads

In [21]:
def gradient_descent(parameters, layer_dims, grads, learning_rate):
    """Take one gradient-descent step on every ``W<i>`` and ``b<i>``.

    The ``parameters`` dict is updated in place (each entry is rebound to a
    new value) and also returned.
    """
    for layer in range(1, len(layer_dims)):
        for prefix in ('W', 'b'):
            key = prefix + str(layer)
            parameters[key] = parameters[key] - learning_rate * grads['d' + key]
    return parameters

In [22]:
def plot(costs, learning_rate):
    """Draw the recorded cost values as a line chart and show the figure."""
    curve = np.squeeze(costs)
    heading = "Learning rate =" + str(learning_rate)
    plt.plot(curve)
    plt.ylabel('Cost')
    plt.xlabel('Iterations (Per Five)')
    plt.title(heading)
    plt.show()

In [23]:
def predict(A):
    """Threshold the first row of ``A`` at 0.5, in place.

    Entries of ``A[0]`` below 0.5 become 0, all others become 1. Only row 0 is
    touched. The same array object is returned.
    """
    n_columns = A.shape[1]
    first_row = A[0]
    # Vectorised form of "0 if value < 0.5 else 1", written back into A.
    first_row[:n_columns] = np.where(first_row[:n_columns] < 0.5, 0, 1)
    return A

In [24]:
def F1SA(A, Y):
    """Compute the F1 score and accuracy of binary predictions.

    Args:
        A: Predictions; row 0 is compared column by column with ``Y``.
            Only entries equal to 0 or 1 are counted.
        Y: Ground-truth labels; ``Y.shape[1]`` columns of row 0 are evaluated.

    Returns:
        tuple ``(F1Score, accuracy)`` as fractions in [0, 1]. A metric whose
        denominator is zero (e.g. no predicted positives) is reported as 0.0
        instead of raising ZeroDivisionError.
    """
    n = Y.shape[1]
    pred = np.asarray(A)[0][:n]
    truth = np.asarray(Y)[0][:n]

    agree = pred == truth
    is_pos = pred == 1
    is_neg = pred == 0
    TP = int(np.count_nonzero(agree & is_pos))
    FP = int(np.count_nonzero(~agree & is_pos))
    FN = int(np.count_nonzero(~agree & is_neg))
    TN = int(np.count_nonzero(agree & is_neg))

    # Guard every ratio: a model that predicts a single class (common early
    # in training) would otherwise crash the evaluation.
    prec = TP / (TP + FP) if (TP + FP) else 0.0
    recall = TP / (TP + FN) if (TP + FN) else 0.0
    F1Score = (2 * prec * recall) / (prec + recall) if (prec + recall) else 0.0
    total = TP + FN + TN + FP
    accuracy = (TP + TN) / total if total else 0.0
    return F1Score, accuracy

In [25]:
def test_set_prediction(X, Y, layer_dims, parameters, activations):
    """Run the trained network on a held-out set and report its quality.

    The forward pass is run without dropout (keep probability 1). With a
    'linear' output the raw network output is returned. Otherwise metrics are
    printed and nothing is returned: accuracy for a multi-unit output layer,
    F1 score and accuracy for a single-unit output layer.
    """
    output = X
    for layer in range(1, len(layer_dims)):
        suffix = str(layer)
        output, _ = single_layer_forward(
            output,
            parameters['W' + suffix],
            parameters['b' + suffix],
            activations[layer - 1],
            1,
        )

    if activations[-1] == 'linear':
        return output

    if layer_dims[-1] > 1:
        predicted = multi_class(output)
        expected = multi_class(Y)
        Accuracy = predict_multiclass(predicted, expected)
        print('The accuracy of the model on Test Set is', Accuracy, '%')
    else:
        predicted = predict(output)
        F1Score, Accuracy = F1SA(predicted, Y)
        print('F1Score and Accuracy of the model on the Test Set is respectively', F1Score * 100, '% and', Accuracy * 100, '%')

In [26]:
def multi_class(AL):
    """Collapse each column of ``AL`` to the row index of its largest entry.

    Returns:
        A float array of shape ``(1, AL.shape[1])`` holding those indices.
    """
    winners = np.zeros((1, AL.shape[1]))
    # argmax along axis 0 picks, per column, the row with the largest value.
    winners[0, :] = np.argmax(AL, axis=0)
    return winners

In [27]:
def predict_multiclass(A, Y):
    """Return the percentage of columns where ``A[0]`` equals ``Y[0]``.

    The number of columns compared is ``Y.shape[1]``.
    """
    total = Y.shape[1]
    matches = sum(1 for col in range(total) if A[0][col] == Y[0][col])
    return (matches / total) * 100

In [40]:
def neural_network(X, Y, iterations, layer_dims, activations, learning_rate, keep_prob, initialization):
    """Train the network with full-batch gradient descent and report results.

    Each iteration runs a forward pass, a backward pass and one parameter
    update. The cost is printed and recorded every fifth iteration and on the
    last one, then plotted. For a non-linear output layer the training-set
    metrics are printed, computed from the output of the final forward pass.

    Returns:
        The trained ``parameters`` dict.
    """
    parameters = initialize_parameters(layer_dims, activations, initialization)
    cost_history = []
    final_step = iterations - 1
    for step in range(iterations):
        A, caches = n_layer_forward(X, layer_dims, parameters, activations, keep_prob)
        grads = n_layer_backward(X, Y, A, layer_dims, caches, activations, keep_prob)
        parameters = gradient_descent(parameters, layer_dims, grads, learning_rate)
        if step % 5 == 0 or step == final_step:
            cost = compute_cost(A, Y, activations[-1])
            print('Cost at iteration', step, 'is', cost)
            cost_history.append(cost)
    plot(cost_history, learning_rate)

    if activations[-1] != 'linear':
        if layer_dims[-1] > 1:
            # Multi-unit output: compare arg-max classes.
            A = multi_class(A)
            Y = multi_class(Y)
            print(A[0], Y[0])
            Accuracy = predict_multiclass(A, Y)
            print('The accuracy of the model on Training Set is', Accuracy, '%')
        else:
            # Single-unit output: threshold and score as binary labels.
            A = predict(A)
            F1Score, Accuracy = F1SA(A, Y)
            print('F1Score and Accuracy of the model on the Training Set is respectively', F1Score * 100, '% and', Accuracy * 100, '%')
    return parameters