# Objectives
The purpose of this project is to build a dog identifier from image data. The model is an L-layer deep neural network, built by following the steps below:
1. Compress and convert raw image data (check)
2. Initialise parameters (check)
3. Forward propagation (check)
4. Compute cost function (+ regularisation) (check)
5. Backpropagation (+ regularisation) (check)
6. Update parameters (check)
7. Gradient checking (check)
8. Prediction accuracy / precision / recall (check) + F1 Score (check)
9. Display data and provide manual labelling (1 = Yes, it's a dog; 0 = No, it's not a dog)
10. Plot iteration vs. cost (training) & cost (cv)
11. Plot lambda vs. cost (training) & cost (cv); the skier shape
12. Plot m vs. cost (training) & cost (cv) to tell Bias vs. Variance
13. Plot iteration vs. cost (training) for different learning rates

In [204]:
import numpy as np
import scipy
import matplotlib.image as mpimg

from PIL import Image
import matplotlib.pyplot as plt
from pylab import *
from scipy import ndimage
import os

In [205]:
def initialise_param(layers_size):
    """Create the initial weights and biases for every layer of the network.

    Args:
        layers_size: Sequence of layer widths, input layer first.

    Returns:
        dict mapping 'W<l>' to a (layers_size[l], layers_size[l-1]) matrix of
        normal samples scaled by sqrt(2 / layers_size[l-1]), and 'b<l>' to a
        (layers_size[l], 1) column of zeros, for l = 1 .. len(layers_size)-1.
    """
    parameters = {}

    for layer in range(1, len(layers_size)):
        fan_in = layers_size[layer - 1]
        fan_out = layers_size[layer]
        suffix = str(layer)

        # Scale the random weights by sqrt(2 / fan_in).
        scale = np.sqrt(2 / fan_in)
        parameters['W' + suffix] = np.random.randn(fan_out, fan_in) * scale
        parameters['b' + suffix] = np.zeros((fan_out, 1))

    return parameters

In [206]:
def sigmoid(Z):
    """Return the element-wise logistic function 1 / (1 + exp(-Z))."""
    denominator = 1.0 + np.exp(-Z)
    return 1.0 / denominator

In [207]:
def relu(Z):
    """Return the element-wise rectified linear unit max(0, Z).

    The previous scalar ``if Z<0`` test raised ValueError for any array with
    more than one element, so the "relu" activation could not be used on a
    real layer. ``np.maximum`` handles scalars and arrays alike.

    Args:
        Z: Scalar or NumPy array of pre-activations.

    Returns:
        Value(s) of the same shape as Z with negative entries replaced by 0.
    """
    return np.maximum(0, Z)

In [208]:
def single_forward_prop(A_prev,W,b,activation):
    """Run one layer of forward propagation: Z = W . A_prev + b, A = g(Z).

    Args:
        A_prev: Activations fed into this layer.
        W: Weight matrix, multiplied on the left of A_prev.
        b: Bias added to the linear output (broadcast by NumPy).
        activation: One of "tanh", "sigmoid" or "relu".

    Returns:
        (A, cache) where A is the activated output and cache is
        ((A_prev, W, b), Z), the values the backward pass needs.

    Raises:
        ValueError: If `activation` is not a supported name. Previously this
            surfaced as an UnboundLocalError on `A`.
    """
    Z = np.dot(W, A_prev) + b

    if activation == "tanh":
        A = np.tanh(Z)  # Provided by numpy
    elif activation == "sigmoid":
        A = sigmoid(Z)  # Defined in this file
    elif activation == "relu":
        A = relu(Z)  # Defined in this file
    else:
        raise ValueError("Unsupported activation: " + repr(activation))

    linear_cache = (A_prev, W, b)
    activation_cache = Z
    cache = (linear_cache, activation_cache)

    return A, cache

In [464]:
def compute_cost(AL,Y):
    """Compute the unregularised binary cross-entropy cost.

    cost = -(1/m) * sum(Y*log(AL) + (1-Y)*log(1-AL)), with m = Y.shape[1].

    Predictions are clipped into [1e-15, 1 - 1e-15] before taking logs. Without
    this, a saturated sigmoid output (exactly 0.0 or 1.0) produces
    ``0 * log(0)`` = NaN, which poisons the cost even for a correct
    prediction. Values strictly inside that interval are unaffected.

    Args:
        AL: Predicted probabilities; expected to have a single row so the
            summed cost squeezes to a scalar.
        Y: Ground-truth labels (0 or 1), same shape as AL.

    Returns:
        The cost as a 0-dimensional NumPy array.
    """
    m = Y.shape[1]

    eps = 1e-15
    AL = np.clip(np.asarray(AL, dtype=float), eps, 1 - eps)

    cost = -(1/m)*np.sum(Y*np.log(AL)+(1-Y)*np.log(1-AL),axis=1,keepdims=True)

    # Turn the (1, 1) result into a scalar-shaped array (e.g. [[17]] -> 17).
    cost = np.squeeze(cost)
    assert(cost.shape == ())

    return cost

In [382]:
def single_backward_prop(dA,cache,activation,lambd):
    """Run one layer of backward propagation with L2 regularisation.

    Args:
        dA: Gradient of the cost with respect to this layer's activation.
            m is taken from dA.shape[1].
        cache: ((A_prev, W, b), Z) as produced by single_forward_prop.
        activation: One of "tanh", "sigmoid" or "relu".
        lambd: L2 regularisation strength; adds (lambd/m)*W to dW.

    Returns:
        (dA_prev, dW, db): gradients with respect to the previous layer's
        activation, this layer's weights, and this layer's bias.

    Raises:
        ValueError: If `activation` is not a supported name.
    """
    linear_cache, Z = cache
    A_prev, W, _ = linear_cache  # the bias value is not needed for gradients
    m = dA.shape[1]

    if activation == "tanh":
        dZ = dA*(1-np.power(np.tanh(Z),2))
    elif activation == "sigmoid":
        s = sigmoid(Z)  # evaluate once instead of twice
        dZ = dA*s*(1-s)
    elif activation == "relu":
        # Element-wise mask; the former scalar `if Z<=0` raised on arrays.
        dZ = np.where(Z > 0, dA, 0)
    else:
        raise ValueError("Unsupported activation: " + repr(activation))

    dW = (1/m)*np.dot(dZ,A_prev.T)+(lambd/m)*W
    db = (1/m)*np.sum(dZ,axis=1,keepdims=True)
    dA_prev = np.dot(W.T,dZ)

    return dA_prev,dW,db

In [383]:
def update_param(parameters,grads,learning_rate):
    """Apply one gradient-descent step to every weight and bias.

    Each 'W<l>' / 'b<l>' entry of `parameters` is rebound to
    value - learning_rate * grads['dW<l>' / 'db<l>'] for
    l = 1 .. len(parameters)//2. The dictionary is updated in place and also
    returned; the arrays it previously held are not modified.
    """
    layer_count = len(parameters) // 2

    for layer in range(1, layer_count + 1):
        for prefix in ('W', 'b'):
            key = prefix + str(layer)
            step = learning_rate * grads['d' + key]
            parameters[key] = parameters[key] - step

    return parameters

In [416]:
def dog_identifier(X,Y,layers_size,iters,learning_rate,lambd):
    """Train the L-layer network with batch gradient descent.

    Hidden layers use "tanh" and the output layer uses "sigmoid". The cost is
    the cross-entropy from compute_cost plus the L2 term
    (lambd / (2*m)) * sum of squared weights over all layers.

    Args:
        X: Input data, one example per column.
        Y: Labels; m is taken from Y.shape[1].
        layers_size: Layer widths including the input layer.
        iters: Number of gradient-descent iterations.
        learning_rate: Step size passed to update_param.
        lambd: L2 regularisation strength.

    Returns:
        (parameters, grads). `grads` holds the gradients computed in the final
        iteration, i.e. evaluated at the parameters *before* the last update.
        It is an empty dict when iters is 0 (previously an UnboundLocalError).

    Raises:
        ValueError: If layers_size describes fewer than two layers.
    """
    # L counts the input layer, so there are L-1 trainable layers.
    L = len(layers_size)
    if L < 2:
        raise ValueError("layers_size needs an input layer and at least one more layer")

    # 1. Initialise parameters based on layers_size
    parameters = initialise_param(layers_size)
    m = Y.shape[1]
    grads = {}  # defined up front so iters == 0 still returns cleanly

    for i in range(iters):
        # 2. Forward pass: "tanh" for layers 1..L-2, "sigmoid" for layer L-1.
        #    caches[l-1] stores the cache of layer l.
        A = X
        caches = []
        squared_weights = 0

        for l in range(1, L):
            W = parameters['W'+str(l)]
            b = parameters['b'+str(l)]
            activation = "sigmoid" if l == L-1 else "tanh"
            A, cache = single_forward_prop(A, W, b, activation)
            caches.append(cache)
            squared_weights = squared_weights + np.sum(np.square(W))

        AL = A

        # 3. Regularised cost
        regularisation = (lambd/(2*m))*squared_weights
        cost = compute_cost(AL, Y)
        cost = cost + regularisation

        if (i+1)%50==0:
            print("Iteration #"+str(i+1)+" Cost is "+str(cost))

        # 4. Backward pass, output layer first. In grads, 'dA<k>' always sits
        #    one layer before the matching 'dW<k+1>' / 'db<k+1>'.
        grads = {}
        dA = -(np.divide(Y,AL)-np.divide(1-Y,1-AL))

        for l in reversed(range(1, L)):
            activation = "sigmoid" if l == L-1 else "tanh"
            dA, dW, db = single_backward_prop(dA, caches[l-1], activation, lambd)
            grads['dA'+str(l-1)] = dA
            grads['dW'+str(l)] = dW
            grads['db'+str(l)] = db

        # 5. Update parameters with gradient descent
        parameters = update_param(parameters, grads, learning_rate)

    return parameters, grads

In [484]:
def predict(X, Y, parameters):
    """Classify X with the trained network and print quality metrics.

    Runs the forward pass ("tanh" hidden layers, "sigmoid" output), thresholds
    the first output row at 0.5, and prints accuracy, precision, recall and F1
    score against Y.

    Precision, recall and F1 are reported as 0.0 when their denominator is
    zero (no predicted positives, no actual positives, or both precision and
    recall zero) instead of dividing by zero and yielding NaN.

    Args:
        X: Input data, one example per column.
        Y: Ground-truth labels, compared element-wise against the (1, m)
            prediction row.
        parameters: dict of 'W<l>' / 'b<l>' arrays as returned by training.

    Returns:
        p: float array of shape (1, m) holding 1.0 where the output
        probability exceeds 0.5 and 0.0 elsewhere.
    """
    L = len(parameters)//2  # This L does NOT include the input layer
    A = X

    for l in range(1, L+1):
        activation = "sigmoid" if l == L else "tanh"
        A, _ = single_forward_prop(A, parameters['W'+str(l)], parameters['b'+str(l)], activation)

    AL = A
    m = AL.shape[1]

    # Vectorised threshold on the first output row.
    p = (AL[0, :] > 0.5).astype(float).reshape(1, m)

    true_positives = np.sum(p*Y==1)
    predicted_positives = np.sum(p==1)
    actual_positives = np.sum(Y==1)

    precision = true_positives/predicted_positives if predicted_positives else 0.0
    recall = true_positives/actual_positives if actual_positives else 0.0
    if precision+recall:
        f1 = (2*precision*recall)/(precision+recall)
    else:
        f1 = 0.0

    print('Accuracy is '+str(np.sum(p==Y)/m))
    print('Precision is '+str(precision))
    print('Recall is '+str(recall))
    print('F1 Score is '+str(f1))

    return p

In [415]:
# Create data set X and Y, can be for training, CV or test
def create_data_set(arr_dog,arr_nondog,num_px,pathpos,pathneg):
    """Load images from disk into a feature matrix X and a label row Y.

    Every image is converted to RGB, resized to num_px x num_px, flattened and
    scaled to [0, 1]. Dog images fill the first len(arr_dog) columns with
    label 1; non-dog images follow with label 0.

    Args:
        arr_dog: File names of positive (dog) images.
        arr_nondog: File names of negative (non-dog) images.
        num_px: Side length, in pixels, each image is resized to.
        pathpos: Prefix prepended to each name in arr_dog (string
            concatenation, so include the trailing separator).
        pathneg: Prefix prepended to each name in arr_nondog.

    Returns:
        (X, Y) with X of shape (num_px*num_px*3, m) and Y of shape (1, m),
        where m = len(arr_dog) + len(arr_nondog).
    """
    mdog = len(arr_dog)
    mnondog = len(arr_nondog)
    X = np.zeros((num_px*num_px*3, mdog+mnondog))
    Y = np.zeros((1, mdog+mnondog))

    for i, im in enumerate(arr_dog):
        X[:, i] = _image_to_column(pathpos+im, num_px)
    Y[0, :mdog] = 1  # the remaining columns keep their initial label 0

    for i, im in enumerate(arr_nondog):
        X[:, mdog+i] = _image_to_column(pathneg+im, num_px)

    return X, Y


def _image_to_column(fname, num_px):
    """Return the image at `fname` as a flat RGB vector scaled to [0, 1]."""
    # The context manager closes the file handle once the pixels are read.
    # convert('RGB') keeps three channels for grayscale / RGBA / palette
    # images, which would otherwise not fit a num_px*num_px*3 column.
    with Image.open(fname) as img:
        pixels = np.array(img.convert('RGB').resize((num_px, num_px)))
    return pixels.ravel()/255

In [461]:
def _gradient_check_cost(theta, parameters, X, Y, lambd):
    """Forward-propagate with the flattened parameters `theta`; return the cost."""
    params = vector_to_dictionary(theta, parameters)
    L = len(parameters)//2  # number of trainable layers

    A = X
    squared_weights = 0
    for l in range(1, L+1):
        W = params['W'+str(l)]
        b = params['b'+str(l)]
        # "tanh" for every layer except the last, which uses "sigmoid"
        activation = "sigmoid" if l == L else "tanh"
        A, _ = single_forward_prop(A, W, b, activation)
        if lambd:
            squared_weights = squared_weights + np.sum(np.square(W))

    cost = compute_cost(A, Y)
    if lambd:
        cost = cost + (lambd/(2*Y.shape[1]))*squared_weights
    return cost


def gradient_checking(parameters,grads,training_X,training_Y, epsilon, lambd=0):
    """Compare back-propagated gradients with a two-sided numerical estimate.

    For every scalar parameter theta_i the cost is evaluated at
    theta_i + epsilon and theta_i - epsilon, giving
    gradapprox_i = (J_plus - J_minus) / (2 * epsilon). This needs two full
    forward passes per parameter.

    Args:
        parameters: dict of 'W<l>' / 'b<l>' arrays at which to check.
        grads: dict holding 'dW<l>' / 'db<l>' gradients to verify.
        training_X: Input data, one example per column.
        training_Y: Labels for training_X.
        epsilon: Perturbation size.
        lambd: L2 regularisation strength used when `grads` were computed.
            With the default 0 the plain cross-entropy cost is used (the
            original behaviour). Pass the training value so the numerical
            gradient includes the same (lambd/m)*W term as `grads`.

    Returns:
        ||grad - gradapprox|| / (||grad|| + ||gradapprox||), or 0.0 when both
        norms are zero.
    """
    # Private float copy so perturbations never touch the caller's arrays.
    theta = np.array(dictionary_to_vector(parameters), dtype=float)
    num_param = theta.shape[0]

    gradient = gradients_to_vector(grads)

    cost_plus = np.zeros((num_param,1))
    cost_minus = np.zeros((num_param,1))

    for i in range(num_param):
        original_value = theta[i, 0]

        theta[i, 0] = original_value + epsilon
        cost_plus[i] = _gradient_check_cost(theta, parameters, training_X, training_Y, lambd)

        theta[i, 0] = original_value - epsilon
        cost_minus[i] = _gradient_check_cost(theta, parameters, training_X, training_Y, lambd)

        # Restore the entry instead of copying the whole vector each time.
        theta[i, 0] = original_value

    gradientapprox = (cost_plus-cost_minus)/(2*epsilon)

    numerator = np.linalg.norm(gradient-gradientapprox)
    denominator = np.linalg.norm(gradient)+np.linalg.norm(gradientapprox)
    if denominator == 0:
        return 0.0
    difference = numerator/denominator

    return difference

In [446]:
def dictionary_to_vector(parameters):
    """Flatten the parameter dictionary into a single (n, 1) column vector.

    Entries are taken in the fixed order W1, b1, W2, b2, ... regardless of the
    dictionary's own key order; each array is flattened row by row.

    Args:
        parameters: dict with 'W<l>' and 'b<l>' arrays for
            l = 1 .. len(parameters)//2.

    Returns:
        Array of shape (n, 1) with all parameter values; shape (0, 1) when
        `parameters` is empty.
    """
    L = len(parameters)//2

    columns = []
    for l in range(1, L+1):
        columns.append(np.reshape(parameters['W'+str(l)], (-1,1)))
        columns.append(np.reshape(parameters['b'+str(l)], (-1,1)))

    if not columns:
        return np.zeros((0,1))

    # One concatenation instead of re-copying the growing vector per key.
    return np.concatenate(columns, axis=0)

In [445]:
def gradients_to_vector(grads):
    """Flatten the weight and bias gradients into a single (n, 1) column vector.

    Entries are taken in the order dW1, db1, dW2, db2, ... so the result lines
    up with dictionary_to_vector's parameter ordering. 'dA<l>' entries are
    ignored. The number of layers is the number of 'dW<l>' keys, so the
    function also works when `grads` carries no 'dA<l>' entries.

    Args:
        grads: dict containing 'dW<l>' and 'db<l>' arrays for l = 1 .. L.

    Returns:
        Array of shape (n, 1) with all gradient values; shape (0, 1) when
        there are no 'dW<l>' entries.
    """
    L = sum(1 for key in grads if key.startswith('dW'))

    columns = []
    for l in range(1, L+1):
        columns.append(np.reshape(grads['dW'+str(l)], (-1,1)))
        columns.append(np.reshape(grads['db'+str(l)], (-1,1)))

    if not columns:
        return np.zeros((0,1))

    # One concatenation instead of re-copying the growing vector per key.
    return np.concatenate(columns, axis=0)

In [457]:
def vector_to_dictionary(theta,parameters):
    """Rebuild a parameter dictionary from the flat column vector `theta`.

    `parameters` serves only as a shape template: consecutive slices of
    `theta` are reshaped to the shapes of W1, b1, W2, b2, ... in that order
    and stored under the same keys.
    """
    rebuilt = {}
    offset = 0
    layer_count = len(parameters)//2

    for layer in range(1, layer_count + 1):
        weight_key = 'W' + str(layer)
        bias_key = 'b' + str(layer)
        templates = ((weight_key, parameters[weight_key]),
                     (bias_key, parameters[bias_key]))

        for key, template in templates:
            length = template.shape[1]*template.shape[0]
            rebuilt[key] = theta[offset:offset + length].reshape(template.shape)
            offset = offset + length

    return rebuilt

In [517]:
# Training cell: build the training set from the image folders, train the
# network, and report the metrics on the training data itself.
# The names defined here (num_px, parameters, grads, training_X, training_Y)
# are reused by the cells below.
pathpos='images/Dog/'
pathneg='images/Non-Dog/'
# Every directory entry is treated as an image file name.
arr_dog = os.listdir(pathpos)
arr_nondog=os.listdir(pathneg)
# Images are resized to num_px x num_px before being flattened.
num_px=32
X,Y=create_data_set(arr_dog,arr_nondog,num_px,pathpos,pathneg)
training_X=X
training_Y=Y
# Input layer of num_px*num_px*3 features, hidden layers of 20, 10 and 5
# units, and a single output unit.
layers_size=[num_px*num_px*3,20,10,5,1]
learning_rate=0.003
iters=800
# L2 regularisation strength used in the cost and in the weight gradients.
lambd=1
parameters, grads=dog_identifier(training_X,training_Y,layers_size,iters,learning_rate,lambd)
# Number of training examples.
print(training_X.shape[1])
# predict() prints accuracy / precision / recall / F1 and returns the 0/1 row.
p=predict(training_X, training_Y, parameters)
print(p)
# Number of entries in grads (a dA, dW and db entry per trained layer).
print(len(grads))

Iteration #50 Cost is 0.995350387157117
Iteration #100 Cost is 0.9674521892796787
Iteration #150 Cost is 0.9345632345605823
Iteration #200 Cost is 0.9049390534879251
Iteration #250 Cost is 0.8771895618067906
Iteration #300 Cost is 0.8497841333348384
Iteration #350 Cost is 0.8222831560931603
Iteration #400 Cost is 0.7945889897167555
Iteration #450 Cost is 0.7666778105182295
Iteration #500 Cost is 0.7878661842052821
Iteration #550 Cost is 0.7685038301047675
Iteration #600 Cost is 0.7498928810931559
Iteration #650 Cost is 0.7315665832673495
Iteration #700 Cost is 0.7133306026410593
Iteration #750 Cost is 0.6950160332832697
Iteration #800 Cost is 0.6765791193292212
107
Accuracy is 0.897196261682243
Precision is 0.8472222222222222
Recall is 1.0
F1 Score is 0.9172932330827067
[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
  1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
  1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 

In [518]:
# Test cell: build a held-out set from the test folders and evaluate the
# parameters learned by the training cell. Relies on `num_px` and
# `parameters` defined there.
pathpos='images/Test-Dog/'
pathneg='images/Test-Non-Dog/'
arr_dog = os.listdir(pathpos)
arr_nondog=os.listdir(pathneg)
X,Y=create_data_set(arr_dog,arr_nondog,num_px,pathpos,pathneg)
test_X=X
test_Y=Y
# predict() prints the metrics and returns the 0/1 prediction row.
p=predict(test_X, test_Y, parameters)
print(p)
# Number of test examples.
print(test_X.shape[1])

Accuracy is 0.5614035087719298
Precision is 0.4594594594594595
Recall is 0.7727272727272727
F1 Score is 0.576271186440678
[[1. 1. 1. 0. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 0. 1. 0. 0. 1. 1. 1. 1. 0. 1.
  0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 0. 1. 1. 1. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0.
  1. 1. 1. 1. 1. 1. 0. 1. 1.]]
57


In [505]:
# Gradient check on the training data: compares the back-propagated `grads`
# with a two-sided numerical estimate using a perturbation of `epsilon`.
# NOTE(review): `grads` returned by dog_identifier were computed before the
# final parameter update and include the (lambd/m)*W term, while this call
# evaluates the unregularised compute_cost at the updated `parameters`, so
# some mismatch is expected here.
epsilon = 1e-7
difference = gradient_checking(parameters,grads,training_X,training_Y, epsilon)

In [506]:
# Relative difference between the analytic and numerical gradients.
print(difference)

0.00040681595968665977
