# Cat vs. non-cat
#### Coding a neural network from scratch
*Spoiler alert: serious case of overfitting!*

In [4]:
import numpy as np
import h5py
import matplotlib.pyplot as plt

In [5]:
def load_data():
    """Load the cat / non-cat training and test sets from their HDF5 files.

    Reads 'train_catvnoncat.h5' and 'test_catvnoncat.h5' from the current
    working directory.

    Returns:
    train_set_x_orig -- array copied from the 'train_set_x' dataset
    train_set_y_orig -- 'train_set_y' reshaped to (1, number of training labels)
    test_set_x_orig -- array copied from the 'test_set_x' dataset
    test_set_y_orig -- 'test_set_y' reshaped to (1, number of test labels)
    classes -- array copied from 'list_classes' in the test file
    """
    # The context managers close each file as soon as its contents have been
    # copied into in-memory numpy arrays (the original left both handles open).
    with h5py.File('train_catvnoncat.h5', 'r') as train_dataset:
        train_set_x_orig = np.array(train_dataset['train_set_x'][:])
        train_set_y_orig = np.array(train_dataset['train_set_y'][:])

    with h5py.File('test_catvnoncat.h5', 'r') as test_dataset:
        test_set_x_orig = np.array(test_dataset['test_set_x'][:])
        test_set_y_orig = np.array(test_dataset['test_set_y'][:])
        classes = np.array(test_dataset['list_classes'][:])

    # Labels become row vectors so that each column matches one example.
    train_set_y_orig = train_set_y_orig.reshape((1, train_set_y_orig.shape[0]))
    test_set_y_orig = test_set_y_orig.reshape((1, test_set_y_orig.shape[0]))

    return train_set_x_orig, train_set_y_orig, test_set_x_orig, test_set_y_orig, classes

In [6]:
def sigmoid(Z):
    """Element-wise logistic function.

    Returns a pair (A, activ_cache): A is 1 / (1 + exp(-Z)) and activ_cache is
    Z itself, kept so the backward pass can reuse it.
    """
    exp_neg_Z = np.exp(-Z)
    return 1 / (1 + exp_neg_Z), Z

def relu(Z):
    """Element-wise rectified linear unit.

    Returns a pair (A, activ_cache): A is max(0, Z) and activ_cache is Z
    itself, kept so the backward pass can reuse it.
    """
    return np.maximum(0, Z), Z

In [7]:
def initialize(layers_dims):
    """Create the initial weights and biases for every layer.

    Arguments:
    layers_dims -- list of layer sizes, e.g. [12288, 5, 5, 5, 1]; the first
                   entry is the number of input features

    Returns:
    params -- dict with 'W1', 'b1', ..., one pair per layer after the input:
              Wi has shape (layers_dims[i], layers_dims[i-1]) and is drawn
              from a standard normal divided by sqrt(layers_dims[i-1]);
              bi is a zero column of shape (layers_dims[i], 1)
    """
    L = len(layers_dims)
    params = {}

    for i in range(1, L):
        fan_in = layers_dims[i-1]
        # Scale by the size of the *argument's* previous layer; the original
        # read a notebook-level `layer_dims` here instead of `layers_dims`.
        params['W' + str(i)] = np.random.randn(layers_dims[i], fan_in) / np.sqrt(fan_in)
        params['b' + str(i)] = np.zeros((layers_dims[i], 1))

    return params

In [8]:
def linear_activation_forward(A_prev, W, b, activation):
    """Forward step of one layer: Z = W . A_prev + b, then an activation.

    Arguments:
    A_prev -- input to the layer; its first dimension must equal W.shape[1]
    W -- weight matrix of the layer
    b -- bias added to np.dot(W, A_prev)
    activation -- name of the activation, as a string: "relu" or "sigmoid"

    Returns:
    A -- output of the activation
    cache -- tuple (linear_cache, activ_cache): linear_cache is
             (A_prev, W, b) and activ_cache is the cache returned by the
             activation function

    Raises:
    ValueError -- if `activation` is neither "relu" nor "sigmoid"
    """
    linear_cache = (A_prev, W, b)
    assert W.shape[1] == A_prev.shape[0]
    Z = np.dot(W, A_prev) + b

    if activation == "relu":
        A, activ_cache = relu(Z)
    elif activation == "sigmoid":
        A, activ_cache = sigmoid(Z)
    else:
        # The named field needs a keyword argument; passing it positionally
        # (as before) raised KeyError instead of this message.
        raise ValueError('{activation} function is not known'.format(activation=activation))

    cache = (linear_cache, activ_cache)

    return A, cache

In [9]:
def L_model(X, params, activations, final_activ):
    """Forward pass through the whole network.

    Arguments:
    X -- input fed to the first layer
    params -- dict holding 'W1', 'b1', ..., 'WL', 'bL'
    activations -- activation name used for layers 1 .. L-1
    final_activ -- activation name used for layer L

    Returns:
    Yhat -- output of the last layer
    caches -- list of L "cache" tuples, one per layer, each containing
              1) lin_cache: A_prev (activations of layer l-1), matrix W and
                 vector b of layer l
              2) activ_cache: just Z for layer l
    """
    n_layers = len(params) // 2
    caches = []
    current = X

    # First L-1 layers
    for idx in range(1, n_layers):
        weights = params['W' + str(idx)]
        bias = params['b' + str(idx)]
        current, layer_cache = linear_activation_forward(current, weights, bias, activations)
        caches.append(layer_cache)

    # L-th (output) layer
    weights = params['W' + str(n_layers)]
    bias = params['b' + str(n_layers)]
    Yhat, layer_cache = linear_activation_forward(current, weights, bias, final_activ)
    caches.append(layer_cache)

    return Yhat, caches

In [10]:
def compute_cost(Yhat,Y):
    """Binary cross-entropy cost, averaged over the examples.

    Arguments:
    Yhat -- predicted probabilities, same shape as Y
    Y -- true labels, shape (1, m)

    Returns:
    cost -- 0-dimensional array holding
            -1/m * sum(Y * log(Yhat) + (1 - Y) * log(1 - Yhat))
    """
    m = Y.shape[1]

    # Keep predictions strictly inside (0, 1). A saturated output (exactly 0
    # or 1) would otherwise give 0 * log(0) = nan and poison the cost.
    eps = 1e-15
    Yhat = np.clip(np.asarray(Yhat, dtype=np.float64), eps, 1 - eps)

    # average of losses, computed through vector multiplication. Returns an array.
    cost = (- 1. / m) * (np.dot(Y, np.log(Yhat).T) + np.dot((1 - Y), np.log(1 - Yhat).T))

    # np.squeeze removes the length-1 dimensions, leaving a 0-d array
    cost = np.squeeze(cost)
    assert cost.shape == ()

    return cost

In [11]:
def linear_backward(dZ, lin_cache):
    """Backward step through the linear part of one layer.

    Arguments:
    dZ -- gradient with respect to the layer's pre-activation Z
    lin_cache -- tuple (A_prev, W, b) stored during the forward pass

    Returns:
    dA_prev -- W.T . dZ
    dW -- (1/m) * dZ . A_prev.T, with m = A_prev.shape[1]
    db -- (1/m) * row-wise sum of dZ, kept as a column
    """
    prev_activations, weights, _ = lin_cache
    n_examples = prev_activations.shape[1]
    scale = 1 / n_examples

    grad_prev = np.dot(weights.T, dZ)
    grad_W = scale * np.dot(dZ, prev_activations.T)
    grad_b = scale * np.sum(dZ, axis=1, keepdims=True)

    return grad_prev, grad_W, grad_b

In [12]:
def relu_backward(dA, Z):
    """Gradient through a ReLU: pass dA where Z > 0, zero elsewhere.

    Works on a copy, so the caller's dA is left untouched.
    """
    assert len(Z) == len(dA)
    inactive = Z <= 0
    dZ = np.array(dA, copy=True)
    dZ[inactive] = 0
    return dZ

def sigmoid_backward(dA, Z):
    """Gradient through a sigmoid: dA multiplied by s * (1 - s), s = sigmoid(Z)."""
    sig = sigmoid(Z)[0]
    local_slope = sig * (1 - sig)
    return dA * local_slope

In [13]:
def lin_activation_backward(dA, cache, activation):
    """Backward step of one layer: through the activation, then the linear part.

    Arguments:
    dA -- gradient with respect to the layer's output activation
    cache -- tuple (lin_cache, activ_cache) stored by the forward pass;
             activ_cache is the layer's Z
    activation -- name of the activation, as a string: "relu" or "sigmoid"

    Returns:
    dA_prev, dW, db -- the three gradients produced by linear_backward

    Raises:
    ValueError -- if `activation` is neither "relu" nor "sigmoid"
    """
    lin_cache, activ_cache = cache
    Z = activ_cache

    if activation == "relu":
        dZ = relu_backward(dA, Z)
    elif activation == "sigmoid":
        dZ = sigmoid_backward(dA, Z)
    else:
        # The named field needs a keyword argument; passing it positionally
        # (as before) raised KeyError instead of this message.
        raise ValueError('{activation} function is not known'.format(activation=activation))

    dA_prev, dW, db = linear_backward(dZ, lin_cache)

    return dA_prev, dW, db

In [14]:
def L_model_backward(Yhat, Y, caches, activations, final_activ):
    """Backward pass through the whole network for the cross-entropy cost.

    Arguments:
    Yhat -- output of the forward pass
    Y -- true labels
    caches -- list of per-layer caches produced by the forward pass
    activations -- activation name used for layers 1 .. L-1
    final_activ -- activation name used for layer L

    Returns:
    grads -- dict with, for every layer l in 1 .. L, 'dW<l>' and 'db<l>',
             plus 'dA<l-1>', the gradient handed to the layer below
    """
    grads = {}
    L = len(caches)

    # Keep predictions strictly inside (0, 1): a saturated output (exactly 0
    # or 1) would otherwise divide by zero below and yield nan/inf gradients.
    eps = 1e-15
    Yhat = np.clip(np.asarray(Yhat, dtype=np.float64), eps, 1 - eps)

    # grad of cost function relative to Yhat
    dYhat = - (np.divide(Y, Yhat) - np.divide(1 - Y, 1 - Yhat))

    # output layer L: uses the last cache and the final activation
    current_cache = caches[L-1]
    grads["dA" + str(L-1)], grads["dW" + str(L)], grads["db" + str(L)] = lin_activation_backward(dYhat, current_cache, final_activ)

    # layers L-1 down to 1 (cache indices L-2 down to 0)
    for l in reversed(range(L-1)):
        current_cache = caches[l]
        grads["dA" + str(l)], grads["dW" + str(l+1)], grads["db" + str(l+1)] = lin_activation_backward(grads["dA" + str(l+1)], current_cache, activations)

    return grads

In [15]:
def update_params(params, grads, learning_rate):
    """Apply one gradient-descent step to every 'W<l>' and 'b<l>' entry.

    Each entry of `params` is rebound to `old - learning_rate * gradient`;
    the same dict object is returned.
    """
    n_layers = len(params) // 2

    for idx in range(1, n_layers + 1):
        for kind in ('W', 'b'):
            key = kind + str(idx)
            step = learning_rate * grads['d' + key]
            params[key] = params[key] - step

    return params

In [16]:
def L_layer_model(X, Y, layer_dims, learning_rate, activations, final_activ, num_iterations, print_cost=True, show_plot=False, log_every=None):
    """Train an L-layer network with plain gradient descent.

    Arguments:
    X -- training inputs; X.shape[0] must equal layer_dims[0]
    Y -- training labels
    layer_dims -- list of layer sizes; layer_dims[0] is the number of features
    learning_rate -- step size of the gradient-descent update
    activations -- activation name used for layers 1 .. L-1
    final_activ -- activation name used for layer L
    num_iterations -- number of gradient-descent iterations
    print_cost -- if True, print the cost each time it is recorded
    show_plot -- if True, plot the recorded costs once training is done
    log_every -- record (and optionally print) the cost every `log_every`
                 iterations; when None, the notebook-level `print_iter`
                 is used, as before

    Returns:
    params -- the trained parameters
    """
    assert (layer_dims[0] == X.shape[0])

    if log_every is None:
        # Backward-compatible fallback on the notebook-level setting.
        log_every = print_iter

    params = initialize(layer_dims) #reminder: layer_dims[0] = number of features
    logged_iters = []
    costs = []

    for it in range(1, num_iterations + 1):
        Yhat, caches = L_model(X, params, activations, final_activ) #forward pass

        # The cost is only needed when it is logged. It is recorded whether or
        # not it is printed, so the plot also works with print_cost=False.
        if it % log_every == 0:
            cost = compute_cost(Yhat, Y)
            logged_iters.append(it)
            costs.append(cost)
            if print_cost:
                print("Cost after {} iterations : {}".format(it, cost))

        grads = L_model_backward(Yhat, Y, caches, activations, final_activ)
        params = update_params(params, grads, learning_rate)

    # optional visualization
    if show_plot:
        plt.plot(logged_iters, costs)
        plt.ylabel('cost')
        plt.xlabel('iterations')
        plt.title("Learning rate =" + str(learning_rate))
        plt.show()

    return params

### Loading the dataset

We load the cat / non-cat images, flatten each image into a single column of the input matrix X, and divide the pixel values by 255, as follows:

In [17]:
train_x_orig, train_y, test_x_orig, test_y, classes = load_data()

# Training set: keep the first axis (one entry per example), collapse all the
# others, then transpose so that every example becomes one column.
train_x_flatten = train_x_orig.reshape(len(train_x_orig), -1).T
train_x = train_x_flatten / 255

# Test set: same flattening and same division by 255.
test_x_flatten = test_x_orig.reshape(len(test_x_orig), -1).T
test_x = test_x_flatten / 255

### Running the model

#### Hyperparameters

In [21]:
# Network architecture: the first entry must equal the number of input
# features (checked by the assert at the top of L_layer_model).
layer_dims = [12288, 7, 7, 5, 1]
activations="relu"      # layers 1 .. L-1
final_activ="sigmoid"   # output layer

# Optimisation settings
learning_rate = 0.01
num_iterations = 2000

# The cost is logged every `print_iter` iterations
print_iter = 250

First, let's run the training several times to see how reliable it is:

In [22]:
# Train three times with identical hyperparameters. Each call starts from a
# new random initialization, so the cost curves differ between runs;
# `trained_model` ends up holding the parameters of the last run.
for i in range(3):
    print(">>>>", i, "<<<<<")
    trained_model = L_layer_model(train_x, train_y, layer_dims, learning_rate, activations, final_activ, num_iterations)

>>>> 0 <<<<<
Cost after 250 iterations : 0.5333951485928804
Cost after 500 iterations : 0.5259052663649282
Cost after 750 iterations : 0.33809760967591485
Cost after 1000 iterations : 0.048566322244869224
Cost after 1250 iterations : 0.014633329122023076
Cost after 1500 iterations : 0.007437215293787577
Cost after 1750 iterations : 0.004637068923818518
Cost after 2000 iterations : 0.0032421468967496936
>>>> 1 <<<<<
Cost after 250 iterations : 0.5770656981665404
Cost after 500 iterations : 0.3497382374613005
Cost after 750 iterations : 0.14538478442748323
Cost after 1000 iterations : 0.5820250492445604
Cost after 1250 iterations : 0.018684252057648804
Cost after 1500 iterations : 0.009803262347692998
Cost after 1750 iterations : 0.006225326292736658
Cost after 2000 iterations : 0.004398034918380499
>>>> 2 <<<<<
Cost after 250 iterations : 0.48565874499084993
Cost after 500 iterations : 0.44270085549303284
Cost after 750 iterations : 0.1549034988166938
Cost after 1000 iterations : 0.4051

Now, let's assess the performance of our model with regard to accuracy.

#### Getting parameters

In [24]:
# Single training run whose parameters are evaluated below.
trained_model = L_layer_model(train_x, train_y, layer_dims, learning_rate, activations, final_activ, num_iterations)


Cost after 250 iterations : 0.6274862791579834
Cost after 500 iterations : 0.5058889361837662
Cost after 750 iterations : 0.36188655778637524
Cost after 1000 iterations : 0.4660055438743584
Cost after 1250 iterations : 0.18841296348449973
Cost after 1500 iterations : 1.0143950021901962
Cost after 1750 iterations : 0.10627131913299245
Cost after 2000 iterations : 0.09511278122987296


#### 'predict' function and appraisal of performance with these weights

In [26]:
def test_pred(X, Y, params, hidden_activ=None, output_activ=None):
    """
    Predict 0/1 labels for X with a trained network and print the accuracy.

    Arguments:
    X -- data set of examples you would like to label, one example per column
    Y -- true labels; its first row is compared with the predictions
    params -- parameters of the trained model
    hidden_activ -- activation name for layers 1 .. L-1; when None, the
                    notebook-level `activations` is used, as before
    output_activ -- activation name for layer L; when None, the
                    notebook-level `final_activ` is used, as before

    Returns:
    p -- predictions for the given dataset X, shape (1, number of examples)
    """
    # Backward-compatible fallback on the notebook-level settings.
    if hidden_activ is None:
        hidden_activ = activations
    if output_activ is None:
        output_activ = final_activ

    m = X.shape[1]

    # Forward propagation
    probas, _ = L_model(X, params, hidden_activ, output_activ)

    # convert probas to 0/1 predictions: threshold the first output row at 0.5
    p = (probas[:1] > 0.5).astype(float)

    print("Accuracy: "  + str(np.sum((p[0] == Y[0]))/m))

    return p

In [29]:
# Accuracy of the first trained model on the test set; the returned
# predictions are shown as the cell output.
test_pred(test_x, test_y, trained_model)

Accuracy: 0.64


array([[1., 0., 0., 0., 1., 0., 0., 1., 1., 1., 0., 1., 1., 1., 0., 1.,
        0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 1.,
        1., 1., 1., 0., 0., 1., 0., 0., 1., 0., 1., 0., 1., 0., 0., 1.,
        1., 0.]])

This result is not very good (let's remember that a random choice would yield an accuracy of 0.5), but maybe another initialization would be more satisfactory: in the last case (the one used for the prediction), the cost was ~0.1 while on other simulations, we reached half this cost. Let's train again, and this time a bit longer.

In [30]:
# Retrain from a new random initialization, with 500 more iterations than before.
trained_model2 = L_layer_model(train_x, train_y, layer_dims, learning_rate, activations, final_activ, num_iterations + 500)


Cost after 250 iterations : 0.45596622827909267
Cost after 500 iterations : 0.29825104766968036
Cost after 750 iterations : 0.1253567329518423
Cost after 1000 iterations : 0.058499407623742644
Cost after 1250 iterations : 0.021790063614392455
Cost after 1500 iterations : 0.011305273855980694
Cost after 1750 iterations : 0.00682594116976536
Cost after 2000 iterations : 0.004649173935583757
Cost after 2250 iterations : 0.003414508125884274
Cost after 2500 iterations : 0.00264696797262856


This looks better! So what's our accuracy with these parameters?

In [31]:
# Accuracy of the second trained model on the test set.
test_pred(test_x, test_y, trained_model2)

Accuracy: 0.68


array([[1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0., 1., 1., 0., 0., 1.,
        0., 1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 1.,
        1., 1., 1., 0., 0., 1., 0., 0., 0., 0., 1., 0., 1., 0., 0., 1.,
        1., 0.]])

Huh! This exemplifies overfitting!

In [33]:
# Same model evaluated on the data it was trained on, for comparison with
# the test accuracy above.
test_pred(train_x, train_y, trained_model2)

Accuracy: 1.0


array([[0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0.,
        0., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0., 1., 0., 1., 0., 0.,
        0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 1.,
        0., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0., 1., 1., 1., 0., 0.,
        0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 1., 1., 0., 0., 0., 1., 0., 0., 0., 1., 1., 1., 0.,
        0., 1., 0., 0., 0., 0., 1., 0., 1., 0., 1., 1., 1., 1., 1., 1.,
        0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0.,
        1., 1., 0., 0., 0., 1., 1., 1., 1., 1., 0., 0., 0., 0., 1., 0.,
        1., 1., 1., 0., 1., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0.,
        0., 0., 1., 0., 1., 0., 1., 0., 0., 1., 1., 1., 0., 0., 1., 1.,
        0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0.,
        1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        0.]])