In [None]:
import numpy as np
import matplotlib.pyplot as plt
import scipy
import math
import sklearn.datasets
import h5py
from PIL import Image
from scipy import ndimage
import copy


from nn_building_blocks import *

%matplotlib inline

# Binary Classification with Tanh in a Single Hidden Layer NN

In [None]:
# Build the two-class "flower" toy set; class j occupies rows N*j .. N*(j+1)-1.
np.random.seed(1)
m = 400                                # number of examples
N = m // 2                             # number of points per class
D = 2                                  # dimensionality
X = np.zeros((m, D))                   # one example per row until the transpose below
Y = np.zeros((m, 1), dtype='uint8')    # labels vector (0 for red, 1 for blue)
a = 4                                  # maximum radius of the flower

for j in (0, 1):
    ix = range(N * j, N * (j + 1))
    # Draw order is part of the result: angle noise first, then radius noise.
    t = np.linspace(j * 3.12, (j + 1) * 3.12, N) + 0.2 * np.random.randn(N)  # theta
    r = a * np.sin(4 * t) + 0.2 * np.random.randn(N)                         # radius
    X[ix] = np.column_stack((r * np.sin(t), r * np.cos(t)))
    Y[ix] = j

# Switch to the (features, examples) / (1, examples) layout.
X, Y = X.T, Y.T

In [None]:
# Layer sizes handed to the trainer: input size taken from X's first axis, then 4 and 1
# (the decision-boundary plot title below refers to 4 as the hidden layer size).
binary_classification_layers_dims = (X.shape[0], 4, 1)

In [None]:
def customized_binary_1L_model(X, Y, layers_dims, optimizer="gd", learning_rate = 0.0007, beta = 0.9,
          beta1 = 0.9, beta2 = 0.999,  epsilon = 1e-8, num_epochs = 5000, print_cost = False, decay=None, decay_rate=1):
    """Train a network with "tanh" hidden activation on the whole of X each epoch.

    All numerical work is delegated to helpers from ``nn_building_blocks``
    (``initialize_parameters``, ``forward_propagation``, ``compute_cost_log_loss``,
    ``backward_propagation`` and the ``update_parameters*`` family).

    Args:
        X: Input data, passed unchanged to the forward/backward helpers
            (presumably shaped (features, examples) -- TODO confirm).
        Y: Labels, passed unchanged to the cost/backward helpers.
        layers_dims: Layer sizes forwarded to ``initialize_parameters``.
        optimizer: "momentum" or "adam"; any other value selects the plain
            ``update_parameters`` step.
        learning_rate: Initial learning rate.
        beta: Momentum coefficient (used only by "momentum").
        beta1, beta2, epsilon: Adam hyper-parameters (used only by "adam").
        num_epochs: Number of update steps.
        print_cost: When true, print the cost every 1000 epochs.
        decay: Optional callable ``decay(learning_rate0, epoch, decay_rate)``
            returning the learning rate to use for the following epoch.
        decay_rate: Third argument handed to ``decay``.

    Returns:
        ``(parameters, grads)``: the parameters after the last update and the
        gradients of the last epoch. ``grads`` is ``None`` when ``num_epochs``
        is zero or negative.

    Side effects:
        Reseeds NumPy's global RNG with 3 and draws the cost curve with pyplot.
    """
    np.random.seed(3)

    costs = []                       # cost sampled every 100 epochs, for the plot
    t = 0                            # Adam step counter
    learning_rate0 = learning_rate   # base rate the decay schedule starts from
    grads = None                     # stays None if the loop below never runs

    parameters = initialize_parameters(layers_dims)

    # Only momentum and Adam carry optimizer state between steps.
    if optimizer == "momentum":
        v = initialize_velocity(parameters)
    elif optimizer == "adam":
        v, s = initialize_adam(parameters)

    for i in range(num_epochs):
        al, caches = forward_propagation(X, parameters, "tanh")
        cost_avg = compute_cost_log_loss(al, Y)
        grads = backward_propagation(X, Y, caches, "tanh")

        if optimizer == "momentum":
            parameters, v = update_parameters_with_momentum(parameters, grads, v, beta, learning_rate)
        elif optimizer == "adam":
            t = t + 1 # Adam counter
            parameters, v, s, _, _ = update_parameters_with_adam(parameters, grads, v, s,
                                                           t, learning_rate, beta1, beta2,  epsilon)
        else:
            parameters = update_parameters(parameters, grads, learning_rate)

        # The schedule is evaluated after the step, so it takes effect next epoch.
        if decay:
            learning_rate = decay(learning_rate0, i, decay_rate)

        if print_cost and i % 1000 == 0:
            print ("Cost after epoch %i: %f" %(i, cost_avg))
            if decay:
                print("learning rate after epoch %i: %f"%(i, learning_rate))
        # Sample the curve regardless of print_cost; the plot below is always drawn.
        if i % 100 == 0:
            costs.append(cost_avg)

    # plot the cost
    plt.plot(costs)
    plt.ylabel('cost')
    plt.xlabel('epochs (per 100)')
    plt.title("model Learning rate = " + str(learning_rate))
    plt.show()

    # Parameters for prediction and grads for gradient checking
    return parameters, grads

In [None]:
def predict1(parameters, X):
    """Return hard 0/1 predictions for X from a network with "tanh" hidden activation.

    Runs ``forward_propagation`` and marks every output strictly greater than
    0.5 as 1 and everything else as 0 (integer dtype).
    """
    output_activations, _ = forward_propagation(X, parameters, "tanh")
    is_positive = output_activations > 0.5
    return is_positive.astype(int)

In [None]:
def plot_decision_boundary(model, X, y, h=0.01):
    """Draw a classifier's decision regions together with the data points.

    Args:
        model: Callable that receives an array with one grid point per row
            (two columns: x1, x2) and returns one prediction per point, in any
            shape that can be reshaped to the grid.
        X: Data whose row 0 is plotted on the horizontal axis and row 1 on the
            vertical axis.
        y: Labels used to colour the points; flattened before plotting, so a
            (1, m) row vector and an (m,) vector are both accepted.
        h: Spacing between neighbouring grid points. Defaults to 0.01, the
            value that used to be hard-coded.

    Raises:
        ValueError: If ``h`` is not strictly positive.
    """
    if h <= 0:
        raise ValueError("h must be a positive grid step, got %r" % (h,))

    # Set min and max values and give it some padding
    x_min, x_max = X[0, :].min() - 1, X[0, :].max() + 1
    y_min, y_max = X[1, :].min() - 1, X[1, :].max() + 1
    # Generate a grid of points with distance h between them
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    # Predict the function value for the whole grid
    Z = np.asarray(model(np.c_[xx.ravel(), yy.ravel()])).reshape(xx.shape)
    # Plot the contour and training examples
    plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral)
    plt.ylabel('x2')
    plt.xlabel('x1')
    # A 1-D colour vector is accepted by every Matplotlib release, whereas a
    # (1, m) array is rejected by some of them.
    plt.scatter(X[0, :], X[1, :], c=np.ravel(y), cmap=plt.cm.Spectral)

In [None]:
def load_extra_datasets():
    """Generate five 200-sample toy datasets.

    Returns, in this order: noisy circles, noisy moons, blobs (6 centers,
    ``random_state=5``), Gaussian quantiles (2 classes) and a pair of uniform
    random ``(200, 2)`` arrays with no structure. The first four are whatever
    the corresponding ``sklearn.datasets.make_*`` generator returns.
    """
    n_samples = 200
    generators = sklearn.datasets

    circles = generators.make_circles(n_samples=n_samples, factor=.5, noise=.3)
    moons = generators.make_moons(n_samples=n_samples, noise=.2)
    blob_set = generators.make_blobs(n_samples=n_samples, random_state=5, n_features=2, centers=6)
    quantiles = generators.make_gaussian_quantiles(
        mean=None,
        cov=0.5,
        n_samples=n_samples,
        n_features=2,
        n_classes=2,
        shuffle=True,
        random_state=None,
    )
    # Two independent uniform draws, evaluated left to right.
    unstructured = (np.random.rand(n_samples, 2), np.random.rand(n_samples, 2))

    return circles, moons, blob_set, quantiles, unstructured

In [None]:
# Train on the planar data with `model` (not defined in this notebook; presumably it
# comes from the nn_building_blocks star import): tanh hidden layer, plain gradient
# descent, learning rate 1.2, 10000 epochs.
binary_parameters, gradients = model(X, Y, binary_classification_layers_dims, hidden_activation="tanh", optimizer = "gd", learning_rate = 1.2, num_epochs=10000, print_cost=True, check_gradient=True, seed=1)

# Predict
predictions = predict1(binary_parameters, X)

In [None]:
# Training-set accuracy of the planar-data classifier.
predictions = predict1(binary_parameters, X)
# Fraction of labels matched, as a percentage. This replaces the dot-product formula,
# which relied on float() of a (1, 1) array -- deprecated since NumPy 1.25.
accuracy = np.mean(predictions == Y) * 100
print ('Accuracy: %d' % accuracy + '%')

In [None]:
# Show the learned decision regions. plot_decision_boundary feeds the classifier one
# grid point per row, while predict1 receives its input transposed, hence `x.T`.
plot_decision_boundary(lambda x: predict1(binary_parameters, x.T), X, Y)
# Read the hidden size from the configured layer sizes so the title cannot drift from
# the network that was actually trained.
plt.title("Decision Boundary for hidden layer size " + str(binary_classification_layers_dims[1]))
plt.show()

In [None]:
# # Datasets
# '''
# Rerun the cells above after selecting a dataset and running this cell
# '''

# noisy_circles, noisy_moons, blobs, gaussian_quantiles, no_structure = load_extra_datasets()

# datasets = {"noisy_circles": noisy_circles,
#             "noisy_moons": noisy_moons,
#             "blobs": blobs,
#             "gaussian_quantiles": gaussian_quantiles}

# ### START CODE HERE ### (choose your dataset)
# dataset = "noisy_circles"
# ### END CODE HERE ###

# X, Y = datasets[dataset]
# X, Y = X.T, Y.reshape(1, Y.shape[0])

# # make blobs binary
# if dataset == "blobs":
#     Y = Y%2

# # Visualize the data
# plt.scatter(X[0, :], X[1, :], c=Y, s=40, cmap=plt.cm.Spectral);
# plt.show()

# Binary Image Classification (Cats vs Not Cats)

In [None]:
# Load the cat / non-cat image sets. Each file is opened read-only and closed as soon
# as its arrays have been copied into memory (np.array(...[:]) materialises them), so
# no HDF5 handle stays open for the rest of the session.
with h5py.File('../datasets/train_catvnoncat.h5', 'r') as train_dataset:
    train_x_orig = np.array(train_dataset["train_set_x"][:])
    train_y = np.array(train_dataset["train_set_y"][:])
train_y = train_y.reshape(1, -1)  # labels as a single row

with h5py.File('../datasets/test_catvnoncat.h5', 'r') as test_dataset:
    test_x_orig = np.array(test_dataset["test_set_x"][:])
    test_y = np.array(test_dataset["test_set_y"][:])
    classes = np.array(test_dataset["list_classes"][:])
test_y = test_y.reshape(1, -1)

# Flatten every image into one row ...
train_x_flatten = train_x_orig.reshape(train_x_orig.shape[0], -1)
test_x_flatten = test_x_orig.reshape(test_x_orig.shape[0], -1)

# ... scale by 1/255 (presumably 8-bit pixel values -- TODO confirm) ...
train_x = train_x_flatten / 255
test_x = test_x_flatten / 255

# ... and transpose so each column is one example.
xtr = train_x.T
xtt = test_x.T

In [None]:
# Example of a picture
# Show one training image together with its label and class name.
index = 10
plt.imshow(train_x_orig[index])
plt.show()
# The class names are decoded from bytes before printing.
print ("y = " + str(train_y[0,index]) + ". It's a " + classes[train_y[0,index]].decode("utf-8") +  " picture.")

In [None]:
# Sanity check: print the shapes of the transposed inputs and the row-vector labels.
print ("train_x's shape: " + str(xtr.shape))
print ("test_x's shape: " + str(xtt.shape))
print ("train_y's shape: " + str(train_y.shape))
print ("test_y's shape: " + str(test_y.shape))

In [None]:
# Layer sizes for the image classifier: input size from xtr's first axis, then 20, 7, 5, 1.
image_classification_layers_dims = (xtr.shape[0], 20, 7, 5, 1)

In [None]:
def customized_image_classification_model(X, Y, layers_dims, optimizer="gd", learning_rate = 0.0007, beta = 0.9,
          beta1 = 0.9, beta2 = 0.999,  epsilon = 1e-8, num_epochs = 5000, print_cost = False, decay=None, decay_rate=1):
    """Train a network on the whole of X each epoch, starting from Xavier initialisation.

    Uses helpers from ``nn_building_blocks``; ``forward_propagation`` and
    ``backward_propagation`` are called with their default activations.

    Args:
        X: Input data, passed unchanged to the forward/backward helpers
            (presumably shaped (features, examples) -- TODO confirm).
        Y: Labels, passed unchanged to the cost/backward helpers.
        layers_dims: Layer sizes forwarded to ``initialize_parameters_xavier``.
        optimizer: "momentum" or "adam"; any other value selects the plain
            ``update_parameters`` step.
        learning_rate: Initial learning rate.
        beta: Momentum coefficient (used only by "momentum").
        beta1, beta2, epsilon: Adam hyper-parameters (used only by "adam").
        num_epochs: Number of update steps.
        print_cost: When true, print the cost every 1000 epochs.
        decay: Optional callable ``decay(learning_rate0, epoch, decay_rate)``
            returning the learning rate to use for the following epoch.
        decay_rate: Third argument handed to ``decay``.

    Returns:
        ``(parameters, grads)``: the parameters after the last update and the
        gradients of the last epoch. ``grads`` is ``None`` when ``num_epochs``
        is zero or negative.

    Side effects:
        Reseeds NumPy's global RNG with 1 and draws the cost curve with pyplot.
    """
    np.random.seed(1)

    costs = []                       # cost sampled every 100 epochs, for the plot
    t = 0                            # Adam step counter
    learning_rate0 = learning_rate   # base rate the decay schedule starts from
    grads = None                     # stays None if the loop below never runs

    parameters = initialize_parameters_xavier(layers_dims)

    # Only momentum and Adam carry optimizer state between steps.
    if optimizer == "momentum":
        v = initialize_velocity(parameters)
    elif optimizer == "adam":
        v, s = initialize_adam(parameters)

    for i in range(num_epochs):
        al, caches = forward_propagation(X, parameters)
        cost_avg = compute_cost_log_loss(al, Y)
        grads = backward_propagation(X, Y, caches)

        if optimizer == "momentum":
            parameters, v = update_parameters_with_momentum(parameters, grads, v, beta, learning_rate)
        elif optimizer == "adam":
            t = t + 1 # Adam counter
            parameters, v, s, _, _ = update_parameters_with_adam(parameters, grads, v, s,
                                                           t, learning_rate, beta1, beta2,  epsilon)
        else:
            parameters = update_parameters(parameters, grads, learning_rate)

        # The schedule is evaluated after the step, so it takes effect next epoch.
        if decay:
            learning_rate = decay(learning_rate0, i, decay_rate)

        if print_cost and i % 1000 == 0:
            print ("Cost after epoch %i: %f" %(i, cost_avg))
            if decay:
                print("learning rate after epoch %i: %f"%(i, learning_rate))
        # Sample the curve regardless of print_cost; the plot below is always drawn.
        if i % 100 == 0:
            costs.append(cost_avg)

    # plot the cost
    plt.plot(costs)
    plt.ylabel('cost')
    plt.xlabel('epochs (per 100)')
    plt.title("model Learning rate = " + str(learning_rate))
    plt.show()

    # Parameters for prediction and grads for gradient checking
    return parameters, grads

In [None]:
# Train the image classifier with `model` (not defined in this notebook; presumably from
# nn_building_blocks): Xavier initialisation, learning rate 0.0075, 2500 epochs.
image_classification_parameters, gradients = model(xtr, train_y, image_classification_layers_dims, learning_rate=0.0075, num_epochs = 2500, print_cost=True, init="xavier", seed=1, check_gradient=False)

In [None]:
# Predictions on the training images. Note the argument order (X, Y, parameters) of the
# imported `predict`, unlike predict1(parameters, X) defined in this notebook.
predictions_train = predict(xtr, train_y, image_classification_parameters)

In [None]:
# Predictions on the held-out test images, using the same trained parameters.
predictions_test = predict(xtt, test_y, image_classification_parameters)

In [None]:
# my_image = "goat-2775034_960_720-3704390797.jpg"
# my_label_y = [1] 


# num_px = 64
# fname = my_image
# image = np.array(Image.open(fname).resize((num_px, num_px)))
# plt.imshow(image)
# plt.show()
# image = image / 255.
# image = image.reshape((1, num_px * num_px * 3)).T

# my_predicted_image = predict(image, my_label_y, image_classification_parameters, print_accuracy=False)


# print ("y = " + str(np.squeeze(my_predicted_image)) + ", your L-layer model predicts a \"" + classes[int(np.squeeze(my_predicted_image)),].decode("utf-8") +  "\" picture.")

# Testing Different Parameter Initializations

In [None]:
# Two separately seeded draws of the circles toy set: 300 training points (seed 1)
# and 100 test points (seed 2).
np.random.seed(1)
train_x1, train_y1 = sklearn.datasets.make_circles(n_samples=300, noise=0.05)
np.random.seed(2)
test_x1, test_y1 = sklearn.datasets.make_circles(n_samples=100, noise=0.05)

# Transpose the inputs (presumably to one example per column -- TODO confirm).
train_x1 = train_x1.T
test_x1 = test_x1.T

# Labels become a single row.
train_y1 = train_y1.reshape(1, -1)
test_y1 = test_y1.reshape(1, -1)

In [None]:
# Layer sizes for the initialisation experiment: input size from train_x1, then 10, 5, 1.
init_param_layers_dims = [train_x1.shape[0], 10, 5, 1]

In [None]:
def customized_initialize_parameters_model(X, Y, layers_dims, optimizer="gd", learning_rate = 0.0007, beta = 0.9,
          beta1 = 0.9, beta2 = 0.999,  epsilon = 1e-8, num_epochs = 5000, print_cost = False, decay=None, decay_rate=1 ,init="random"):
    """Train a network on the whole of X each epoch with a selectable initialisation.

    Uses helpers from ``nn_building_blocks``; ``forward_propagation`` and
    ``backward_propagation`` are called with their default activations.

    Args:
        X: Input data, passed unchanged to the forward/backward helpers
            (presumably shaped (features, examples) -- TODO confirm).
        Y: Labels, passed unchanged to the cost/backward helpers.
        layers_dims: Sequence of layer sizes, input layer first.
        optimizer: "momentum" or "adam"; any other value selects the plain
            ``update_parameters`` step.
        learning_rate: Initial learning rate.
        beta: Momentum coefficient (used only by "momentum").
        beta1, beta2, epsilon: Adam hyper-parameters (used only by "adam").
        num_epochs: Number of update steps.
        print_cost: When true, print the cost every 1000 epochs.
        decay: Optional callable ``decay(learning_rate0, epoch, decay_rate)``
            returning the learning rate to use for the following epoch.
        decay_rate: Third argument handed to ``decay``.
        init: Initialisation scheme:
            "zeros"  - every weight and bias is zero;
            "random" - weights are standard-normal draws scaled by 10 (RNG
                       reseeded with 3), biases are zero;
            "he"     - ``initialize_parameters_he(layers_dims)``;
            "xavier" - ``initialize_parameters_xavier(layers_dims)``.

    Returns:
        ``(parameters, grads)``: the parameters after the last update and the
        gradients of the last epoch. ``grads`` is ``None`` when ``num_epochs``
        is zero or negative.

    Raises:
        ValueError: If ``init`` is not one of the schemes listed above.

    Side effects:
        Reseeds NumPy's global RNG with 1 (and again with 3 for "random") and
        draws the cost curve with pyplot.
    """
    np.random.seed(1)

    costs = []                       # cost sampled every 100 epochs, for the plot
    t = 0                            # Adam step counter
    learning_rate0 = learning_rate   # base rate the decay schedule starts from
    grads = None                     # stays None if the loop below never runs

    num_layers = len(layers_dims)
    if init == "zeros":
        parameters = {}
        for l in range(1, num_layers):
            parameters["W"+str(l)] = np.zeros((layers_dims[l], layers_dims[l-1]))
            parameters["b"+str(l)] = np.zeros((layers_dims[l],1))
    elif init == "random":
        np.random.seed(3)
        parameters = {}
        for l in range(1, num_layers):
            # The factor 10 produces large initial weights.
            parameters["W"+str(l)] = np.random.randn(layers_dims[l], layers_dims[l-1]) *10
            parameters["b"+str(l)] = np.zeros((layers_dims[l],1))
    elif init == "he":
        parameters = initialize_parameters_he(layers_dims)
    elif init == "xavier":
        parameters = initialize_parameters_xavier(layers_dims)
    else:
        # Previously an unknown scheme left `parameters` undefined.
        raise ValueError(
            "init must be one of 'zeros', 'random', 'he' or 'xavier', got %r" % (init,)
        )

    # Only momentum and Adam carry optimizer state between steps.
    if optimizer == "momentum":
        v = initialize_velocity(parameters)
    elif optimizer == "adam":
        v, s = initialize_adam(parameters)

    for i in range(num_epochs):
        al, caches = forward_propagation(X, parameters)
        cost_avg = compute_cost_log_loss(al, Y)
        grads = backward_propagation(X, Y, caches)

        if optimizer == "momentum":
            parameters, v = update_parameters_with_momentum(parameters, grads, v, beta, learning_rate)
        elif optimizer == "adam":
            t = t + 1 # Adam counter
            parameters, v, s, _, _ = update_parameters_with_adam(parameters, grads, v, s,
                                                           t, learning_rate, beta1, beta2,  epsilon)
        else:
            parameters = update_parameters(parameters, grads, learning_rate)

        # The schedule is evaluated after the step, so it takes effect next epoch.
        if decay:
            learning_rate = decay(learning_rate0, i, decay_rate)

        if print_cost and i % 1000 == 0:
            print ("Cost after epoch %i: %f" %(i, cost_avg))
            if decay:
                print("learning rate after epoch %i: %f"%(i, learning_rate))
        # Sample the curve regardless of print_cost; the plot below is always drawn.
        if i % 100 == 0:
            costs.append(cost_avg)

    # plot the cost
    plt.plot(costs)
    plt.ylabel('cost')
    plt.xlabel('epochs (per 100)')
    plt.title("model Learning rate = " + str(learning_rate))
    plt.show()

    # Parameters for prediction and grads for gradient checking
    return parameters, grads

In [None]:
# Train on the circles data with Xavier initialisation, then report predictions on both
# splits via the imported `predict`.
# NOTE(review): `gradientss` (double "s") differs from the `gradients` name used by the
# other training cells -- looks like a typo; nothing visible in this notebook reads it.
xavier_parameters, gradientss = model(train_x1, train_y1, init_param_layers_dims, init = "xavier", learning_rate=0.01, num_epochs=15000, print_cost=True, check_gradient=True)
print ("On the train set:")
predictions_train = predict(train_x1, train_y1, xavier_parameters)
print ("On the test set:")
predictions_test = predict(test_x1, test_y1, xavier_parameters)

# Testing Regularization 

In [None]:
# Load the regularisation dataset from a MATLAB file and transpose every array.
# NOTE(review): only `import scipy` is visible in this notebook; `scipy.io` being
# reachable as an attribute depends on another import having loaded it (or on SciPy's
# lazy submodule loading) -- consider an explicit `import scipy.io`.
data = scipy.io.loadmat("../datasets/data.mat")
train_x2 = data["X"].T
train_y2 = data["y"].T
# The "Xval"/"yval" entries are used as the test split below.
test_x2 = data["Xval"].T
test_y2 = data["yval"].T

In [None]:
# Layer sizes for the regularisation experiments: input size from train_x2, then 20, 3, 1.
regularization_model_layers_dim = [train_x2.shape[0], 20, 3, 1]

In [None]:
def customized_reg_model(X, Y, layers_dims, 
                        optimizer="gd", learning_rate = 0.0007, num_epochs = 5000,
                        momentum_beta = 0.9, adam_beta1 = 0.9, adam_beta2 = 0.999,  epsilon = 1e-8,
                        init="random", hidden_activation="relu", output_activation="sigmoid",
                        lambd=0, keep_probs=1,
                        decay=None, decay_rate=1, 
                        print_cost = False, seed=None):
    """Train a network on the whole of X each epoch with optional L2 penalty or dropout.

    L2 regularisation (``lambd > 0``) and dropout (``keep_probs < 1``) are
    mutually exclusive. All numerical work is delegated to helpers from
    ``nn_building_blocks``.

    Args:
        X: Input data, passed unchanged to the forward/backward helpers
            (presumably shaped (features, examples) -- TODO confirm).
        Y: Labels, passed unchanged to the cost/backward helpers.
        layers_dims: Layer sizes forwarded to the initialiser.
        optimizer: "momentum" or "adam"; any other value selects the plain
            ``update_parameters`` step.
        learning_rate: Initial learning rate.
        num_epochs: Number of update steps.
        momentum_beta: Momentum coefficient (used only by "momentum").
        adam_beta1, adam_beta2, epsilon: Adam hyper-parameters.
        init: "random", "he" or "xavier"; selects the matching
            ``initialize_parameters*`` helper.
        hidden_activation: Forwarded to the forward/backward helpers.
        output_activation: Forwarded to the forward helpers.
        lambd: Regularisation strength; 0 disables it. Must be >= 0.
        keep_probs: Value forwarded to the dropout helpers; 1 disables
            dropout. Must lie in (0, 1].
        decay: Optional callable ``decay(learning_rate0, epoch, decay_rate)``
            returning the learning rate to use for the following epoch.
        decay_rate: Third argument handed to ``decay``.
        print_cost: When true, print the cost every 1000 epochs.
        seed: Forwarded to the initialiser.

    Returns:
        ``(parameters, grads)``: the parameters after the last update and the
        gradients of the last epoch. ``grads`` is ``None`` when ``num_epochs``
        is zero or negative.

    Raises:
        AssertionError: If both ``lambd`` and ``keep_probs`` request
            regularisation.
        ValueError: If ``lambd`` is negative, ``keep_probs`` is outside
            (0, 1], or ``init`` is unknown. These cases previously failed
            later on an undefined local variable.
    """
    assert (lambd == 0 or keep_probs == 1), "use either lambd or keep_probs, not both"
    if lambd < 0:
        raise ValueError("lambd must be >= 0, got %r" % (lambd,))
    if not 0 < keep_probs <= 1:
        raise ValueError("keep_probs must be in (0, 1], got %r" % (keep_probs,))

    costs = []                       # cost sampled every 100 epochs, for the plot
    t = 0                            # Adam step counter
    learning_rate0 = learning_rate   # base rate the decay schedule starts from
    grads = None                     # stays None if the loop below never runs

    if init == "random":
        parameters = initialize_parameters(layers_dims, seed)
    elif init == "he":
        parameters = initialize_parameters_he(layers_dims, seed)
    elif init == "xavier":
        parameters = initialize_parameters_xavier(layers_dims, seed)
    else:
        raise ValueError("init must be one of 'random', 'he' or 'xavier', got %r" % (init,))

    # Only momentum and Adam carry optimizer state between steps.
    if optimizer == "momentum":
        v = initialize_velocity(parameters)
    elif optimizer == "adam":
        v, s = initialize_adam(parameters)

    use_dropout = keep_probs < 1

    for i in range(num_epochs):
        if use_dropout:
            al, caches = forward_propagation_with_dropout(X, parameters, hidden_activation, output_activation, keep_probs)
        else:
            al, caches = forward_propagation(X, parameters, hidden_activation, output_activation)

        if lambd == 0:
            cost_avg = compute_cost_log_loss(al, Y)
        else:
            cost_avg = compute_cost_with_regularization(al, Y, parameters, lambd)

        if use_dropout:
            grads = backward_propagation_with_dropout(X, Y, caches, keep_probs, al, hidden_activation)
        else:
            grads = backward_propagation(X, Y, caches, hidden_activation, lambd)

        if optimizer == "momentum":
            parameters, v = update_parameters_with_momentum(parameters, grads, v, momentum_beta, learning_rate)
        elif optimizer == "adam":
            t = t + 1 # Adam counter
            parameters, v, s, _, _ = update_parameters_with_adam(parameters, grads, v, s,
                                                           t, learning_rate, adam_beta1, adam_beta2,  epsilon)
        else:
            parameters = update_parameters(parameters, grads, learning_rate)

        # The schedule is evaluated after the step, so it takes effect next epoch.
        if decay:
            learning_rate = decay(learning_rate0, i, decay_rate)

        if print_cost and i % 1000 == 0:
            print ("Cost after epoch %i: %f" %(i, cost_avg))
            if decay:
                print("learning rate after epoch %i: %f"%(i, learning_rate))
        # Sample the curve regardless of print_cost; the plot below is always drawn.
        if i % 100 == 0:
            costs.append(cost_avg)

    # plot the cost
    plt.plot(costs)
    plt.ylabel('cost')
    plt.xlabel('epochs (per 100)')
    plt.title("model Learning rate = " + str(learning_rate))
    plt.show()

    # Parameters for prediction and grads for gradient checking
    return parameters, grads

#### Unregularized

In [None]:
# Baseline run: no lambd and no keep_probs are passed to `model`.
unreg_parameters, gradients = model(train_x2, train_y2, regularization_model_layers_dim, learning_rate=0.3, num_epochs=30000, print_cost=True, init="xavier", seed=3, check_gradient=True)
print ("On the training set:")
predictions_train = predict(train_x2, train_y2, unreg_parameters)
print ("On the test set:")
predictions_test = predict(test_x2, test_y2, unreg_parameters)

#### Regularized

In [None]:
# Same settings as the unregularised run, plus lambd=0.7.
reg_parameters, gradients = model(train_x2, train_y2, regularization_model_layers_dim, lambd=0.7, learning_rate=0.3, num_epochs=30000, print_cost=True, init="xavier", seed=3, check_gradient=True)
print ("On the training set:")
predictions_train = predict(train_x2, train_y2, reg_parameters)
print ("On the test set:")
predictions_test = predict(test_x2, test_y2, reg_parameters)

#### Dropout

In [None]:
# Same settings as the unregularised run, plus keep_probs=0.86.
dropout_parameters, gradients = model(train_x2, train_y2, regularization_model_layers_dim, keep_probs=0.86, learning_rate=0.3, 
                                      num_epochs=30000, print_cost=True, init="xavier", seed=3, check_gradient=True)
print ("On the training set:")
predictions_train = predict(train_x2, train_y2, dropout_parameters)
print ("On the test set:")
predictions_test = predict(test_x2, test_y2, dropout_parameters)

# Optimization Testing

In [None]:
# Seeded draw of 300 noisy "moons" points for the optimiser experiments.
np.random.seed(3)
train_X3, train_Y3 = sklearn.datasets.make_moons(n_samples=300, noise=.2) #300 #0.2 
# Visualize the data
plt.scatter(train_X3[:, 0], train_X3[:, 1], c=train_Y3, s=40, cmap=plt.cm.Spectral);
plt.show()
# After plotting, transpose the inputs and turn the labels into a single row.
train_X3 = train_X3.T
train_Y3 = train_Y3.reshape((1, train_Y3.shape[0]))

In [None]:
# Layer sizes for the optimiser experiment: input size from train_X3, then 5, 2, 1.
optimizing_layers_dims = [train_X3.shape[0], 5, 2, 1]

In [None]:
# Train with the momentum optimiser, "he" initialisation and the `schedule_lr_decay`
# callback (not defined in this notebook; presumably from nn_building_blocks).
optimized_parameters, gradients = model(train_X3, train_Y3, optimizing_layers_dims, optimizer = "momentum", learning_rate = 0.1, num_epochs=5000, 
                              decay=schedule_lr_decay, print_cost=True, init="he", seed=3, check_gradient=True)

# Predict
predictions = predict(train_X3, train_Y3, optimized_parameters)

# Gradient Checking

#### Gradient Checking: Binary Classification with Tanh in a Single Hidden Layer NN

In [None]:
# Recompute the gradients at the trained parameters, using the same tanh hidden
# activation the binary model was trained with.
final_al, final_caches = forward_propagation(X, binary_parameters, hidden_activation="tanh", output_activation="sigmoid")

final_grads = backward_propagation(X, Y, final_caches, hidden_activation="tanh", lambd=0)

# gradient_checking is an imported helper; presumably it compares final_grads with a
# numerical estimate and returns their difference -- TODO confirm.
difference = gradient_checking(binary_parameters, final_grads, X, Y, print_msg=True, hidden_activation="tanh", output_activation="sigmoid")

print(f"Gradient check difference: {difference}")

#### Gradient Checking: Image Classification Network

In [None]:
# Gradient check on the first training example only (xtr[:, :1] / train_y[:, :1]).
# NOTE(review): assumes the image model was trained with relu hidden activation (the
# training call does not pass hidden_activation) -- confirm against model's default.
final_al, final_caches = forward_propagation(xtr[:,:1], image_classification_parameters, hidden_activation="relu", output_activation="sigmoid")

final_grads = backward_propagation(xtr[:,:1], train_y[:,:1], final_caches, hidden_activation="relu", lambd=0)

difference = gradient_checking(image_classification_parameters, final_grads, xtr[:,:1], train_y[:,:1], print_msg=True, hidden_activation="relu", output_activation="sigmoid")

print(f"Gradient check difference: {difference}")

#### Gradient Checking: Different Parameter Initializations

In [None]:
# Gradient check for the Xavier-initialised circles model, on the full training split.
final_al, final_caches = forward_propagation(train_x1, xavier_parameters, hidden_activation="relu", output_activation="sigmoid")

final_grads = backward_propagation(train_x1, train_y1, final_caches, hidden_activation="relu", lambd=0)

difference = gradient_checking(xavier_parameters, final_grads, train_x1, train_y1, print_msg=True, hidden_activation="relu", output_activation="sigmoid")

print(f"Gradient check difference: {difference}")

#### Gradient Checking: Unregularized Network

In [None]:
# Gradient check for the unregularised model, on the full training split.
final_al, final_caches = forward_propagation(train_x2, unreg_parameters, hidden_activation="relu", output_activation="sigmoid")

final_grads = backward_propagation(train_x2, train_y2, final_caches, hidden_activation="relu", lambd=0)

difference = gradient_checking(unreg_parameters, final_grads, train_x2, train_y2, print_msg=True, hidden_activation="relu", output_activation="sigmoid")

print(f"Gradient check difference: {difference}")

#### Gradient Checking: Regularized Network

In [None]:
# Gradient check at the parameters of the L2-regularised model.
final_al, final_caches = forward_propagation(train_x2, reg_parameters, hidden_activation="relu", output_activation="sigmoid")

# NOTE(review): lambd=0 here although these parameters were trained with lambd=0.7;
# gradient_checking is called without a lambd, so this presumably checks the
# unpenalised cost on both sides -- confirm.
final_grads = backward_propagation(train_x2, train_y2, final_caches, hidden_activation="relu", lambd=0)

difference = gradient_checking(reg_parameters, final_grads, train_x2, train_y2, print_msg=True, hidden_activation="relu", output_activation="sigmoid")

print(f"Gradient check difference: {difference}")

#### Gradient Checking: Dropout Network

In [None]:
# Gradient check at the parameters of the dropout-trained model. The plain
# (non-dropout) forward and backward helpers are used here.
final_al, final_caches = forward_propagation(train_x2, dropout_parameters, hidden_activation="relu", output_activation="sigmoid")

final_grads = backward_propagation(train_x2, train_y2, final_caches, hidden_activation="relu", lambd=0)

difference = gradient_checking(dropout_parameters, final_grads, train_x2, train_y2, print_msg=True, hidden_activation="relu", output_activation="sigmoid")

print(f"Gradient check difference: {difference}")

#### Gradient Checking: Different Optimization

In [None]:
# Gradient check for the momentum-trained moons model, on the full training set.
final_al, final_caches = forward_propagation(train_X3, optimized_parameters, hidden_activation="relu", output_activation="sigmoid")

final_grads = backward_propagation(train_X3, train_Y3, final_caches, hidden_activation="relu", lambd=0)

difference = gradient_checking(optimized_parameters, final_grads, train_X3, train_Y3, print_msg=True, hidden_activation="relu", output_activation="sigmoid")

print(f"Gradient check difference: {difference}")