# This is an example of implementing a Deep Learning Neural Network from scratch using python.

<h1> References </h1>
<p> Coursera Deep Learning Specialization </p>

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy.misc
from scipy.ndimage import imread
import glob
import sys
# you shouldn't need to make any more imports


In [11]:
class NeuralNetwork(object):
    """
    Abstraction of neural network.
    Stores parameters, activations, cached values. 
    Provides necessary functions for training and prediction. 
    """
    def __init__(self, layer_dimensions, drop_prob=0.0, reg_lambda=0.0):
        """
        Initializes the weights and biases for each layer
        :param layer_dimensions: (list) number of nodes in each layer
        :param drop_prob: drop probability for dropout layers. Only required in part 2 of the assignment
        :param reg_lambda: regularization parameter. Only required in part 2 of the assignment
        """
        np.random.seed(1)
        
        self.parameters = {}
        self.num_layers = len(layer_dimensions)-1
        self.parameters["batch_index"]=0
        self.drop_prob = drop_prob
        self.reg_lambda = reg_lambda
        self.cache_dropout=[]
        
        # init parameters
        for l in range(1,len(layer_dimensions)):
            self.parameters['W' + str(l)] = np.random.randn(layer_dimensions[l],layer_dimensions[l-1])*0.01
            self.parameters['b' + str(l)] = np.zeros((layer_dimensions[l],1))

        
    def affineForward(self, A, W, b):
        """
        Forward pass for the affine layer.
        :param A: input matrix, shape (L, S), where L is the number of hidden units in the previous layer and S is
        the number of samples
        :returns: the affine product WA + b, along with the cache required for the backward pass
        """
        Z = np.dot(W,A)+b
        if np.isnan(W[0][0]):
            print("w = ",W)
            print("A=",A)
        cache = (A,W,b,Z)
        return Z,cache
        

    def activationForward(self, A, activation):
        """
        Common interface to access all activation functions.
        :param A: input to the activation function
        :param prob: activation funciton to apply to A. Just "relu" for this assignment.
        :returns: activation(A)
        """ 
        # A is basically Z = np.dot(W,A)+b
        if activation=="relu":
            return self.relu(A)
        
        elif activation=="softmax":
            temp = self.softmax(A)
            if temp.size==0:
                print(A)
            else:
                return temp


    def relu(self, X):
        return np.maximum(0,X)
    
    def softmax(self, X):
        e_x = np.exp(X - np.max(X))
        return e_x / e_x.sum(axis=0)
            
    def dropout(self, A, prob):
        """
        :param A: Activation
        :param prob: drop prob
        :returns: tuple (A, M) 
            WHERE
            A is matrix after applying dropout
            M is dropout mask, used in the backward pass
        """
        M = np.random.rand(A.shape[0],A.shape[1])
        M = (M> prob)*1.0
        M /= (1-prob)
        A*=M
        return A, M

    def forwardPropagation(self, X):
        """
        Runs an input X through the neural network to compute activations
        for all layers. Returns the output computed at the last layer along
        with the cache required for backpropagation.
        :returns: (tuple) AL, cache
            WHERE 
            AL is activation of last layer
            cache is cached values for each layer that
                     are needed in further steps
        """
        caches = []
        A = X
        L = len(self.parameters) // 2                 # number of layers in the neural network
        for l in range(1,L):
            A_prev = A
            Z, cache = self.affineForward(A_prev, self.parameters['W' + str(l)],self.parameters['b' + str(l)])
            A = self.activationForward(Z, activation="relu")
            caches.append(cache)
            if self.drop_prob > 0:
                A,M=self.dropout(A,self.drop_prob)
                self.cache_dropout.append(M)
            
            
        ZL, cache_final_layer = self.affineForward(A, self.parameters['W' + str(L)],self.parameters['b' + str(L)])
        AL = self.activationForward(ZL, activation="softmax")

        caches.append(cache_final_layer)
        return AL, caches
    
    def costFunction(self, AL, y):
        """
        :param AL: Activation of last layer, shape (num_classes, S)
        :param y: labels, shape (S)
        :param alpha: regularization parameter
        :returns cost, dAL: A scalar denoting cost and the gradient of cost
        """        
        m = y.shape[0]
        AL_softmax = AL
        correct_label_prob = AL_softmax[y,range(m)]
        cost = - np.sum(np.log(correct_label_prob))/m
        cost = np.squeeze(cost)
        
        if(self.reg_lambda>0):
            cost = cost + (self.reg_lambda/(2*m))*np.sum([np.sum(np.square(self.parameters["W"+str(i)])) for i in range(1,self.num_layers+1)])
        
        dAL = self.softmax_derivative(AL,y)
        return cost,dAL
    
    def softmax_derivative(self,AL,Y):
        """
        Implement the backward propagation for output Softmax unit
        Arguments:
        AL -- output unit, of  shape (10,m) where 10 is number of output units in this case and m is number of examples
        Y -- output vector
        returns:
        dZ -- gradient of softmax with respect to Z
        """
        return AL - one_hot(Y)

    def affineBackward(self, dA_prev, cache):
        """
        Backward pass for the affine layer.
        :param dA_prev: gradient from the next layer.
        :param cache: cache returned in affineForward
        :returns dA: gradient on the input to this layer
                 dW: gradient on the weights
                 db: gradient on the bias
        """
        #dA_prev is dz from layer l(input)

        A_prev,W,b,Z = cache
        m = A_prev.shape[1]
        dW = np.dot(dA_prev,A_prev.T)/m
        db = np.sum(dA_prev,axis=1, keepdims=True)/m
        dA = np.dot(W.T,dA_prev)

        return dA, dW, db

    def activationBackward(self, dA, cache, activation="relu"):
        """
        Interface to call backward on activation functions.
        In this case, it's just relu. 
        """
        
        A_prev,W,b,Z = cache
        
        if activation=="relu":
            return self.relu_derivative(dA,Z)
        
    def relu_derivative(self, dx, cached_x):
        """
        Implement the backward propagation for a single RELU unit.

        Arguments:
        dA -- post-activation gradient, of any shape
        cache -- 'Z' where we store for computing backward propagation efficiently

        Returns:
        dZ -- Gradient of the cost with respect to Z
        """
        # When z <= 0, you should set dz to 0 as well. 
        dx[cached_x < 0] = 0
        return dx      
    
    def dropout_backward(self, dA, cache):
        #Here cache is M basically
        dA= dA * cache
        return dA

    def backPropagation(self, dAL, Y, cache):
        """
        Run backpropagation to compute gradients on all paramters in the model
        :param dAL: gradient on the last layer of the network. Returned by the cost function.
        :param Y: labels
        :param cache: cached values during forwardprop
        :returns gradients: dW and db for each weight/bias
        """
        
        gradients = {}
        L = len(cache)
        m = dAL.shape[1]

        # Lth layer (SOFTMAX -> LINEAR) gradients. Inputs: "dAL, Y, caches". Outputs: "grads["dAL"], grads["dWL"], grads["dbL"]
        # dAL parameter in the function is actualy dZL = AL - one_hot(Y)
        current_cache = cache[L-1]
        gradients["dA" + str(L)], gradients["dW" + str(L)], gradients["db" + str(L)] = self.affineBackward(dAL,current_cache)        
        
        for l in reversed(range(L-1)):
            # lth layer: (RELU -> LINEAR) gradients.
            # Inputs: "grads["dA" + str(l + 2)], caches". Outputs: "grads["dA" + str(l + 1)] , grads["dW" + str(l + 1)] , grads["db" + str(l + 1)] 
            current_cache = cache[l]
            dA_prev_non_linear = self.activationBackward(gradients["dA" + str(l + 2)],current_cache,"relu")
            dA_prev, dW_temp, db_temp = self.affineBackward(dA_prev_non_linear,current_cache)
            gradients["dA" + str(l + 1)] = dA_prev
            gradients["dW" + str(l + 1)] = dW_temp
            gradients["db" + str(l + 1)] = db_temp
            if self.reg_lambda > 0:
                gradients["dW" + str(l+1)] = gradients["dW" + str(l+1)] + np.multiply(self.reg_lambda,self.parameters["W" + str(l+1)]) / m
        
            if self.drop_prob > 0:
                gradients["dA" + str(l + 1)] = self.dropout_backward(gradients["dA" + str(l + 1)],self.cache_dropout[l])
                
           
        if self.reg_lambda > 0:
            gradients["dW" + str(L)] = gradients["dW" + str(L)] + np.multiply(self.reg_lambda,self.parameters["W" + str(L)]) / m
        return gradients


    def updateParameters(self, gradients, alpha):
        """
        :param gradients: gradients for each weight/bias
        :param alpha: step size for gradient descent 
        """
        L = len(self.parameters) // 2 # number of layers in the neural network
        # Update rule for each parameter. Use a for loop.
        ### START CODE HERE ### (≈ 3 lines of code)
        for l in range(L):
            self.parameters["W" + str(l+1)] = self.parameters["W" + str(l+1)] - alpha*gradients["dW" + str(l+1)]
            self.parameters["b" + str(l+1)] = self.parameters["b" + str(l+1)] - alpha*gradients["db" + str(l+1)]
        
    def train(self, X, y, iters=1000, alpha=0.0001, batch_size=100, print_every=100):
        """
        :param X: input samples, each column is a sample
        :param y: labels for input samples, y.shape[0] must equal X.shape[1]
        :param iters: number of training iterations
        :param alpha: step size for gradient descent
        :param batch_size: number of samples in a minibatch
        :param print_every: no. of iterations to print debug info after
        """
        batch_iters = (int)(X.shape[1]/batch_size)
        for i in range(0, iters):
            self.parameters["batch_index"] = 0
            for j in range(batch_iters):
                # get minibatch
                X_batch,Y_batch=self.get_batch( X, y, batch_size)
                # forward prop
                AL, all_layer_cache = self.forwardPropagation(X_batch)

                # compute loss
                cost,dAL = self.costFunction(AL, Y_batch)

                # compute gradients
                gradients=self.backPropagation(dAL, Y_batch, all_layer_cache)
                
                # update weights and biases based on gradient
                self.updateParameters(gradients, alpha)
            if i % print_every == 0:
                # print cost, train and validation set accuracies
                print("Cost: ", cost)
        
    def predict(self, X):
        """
        Make predictions for each sample
        """
        y_pred, cache = self.forwardPropagation(X)
        y_pred = self.softmax(y_pred)
        predicted_labels=[]
        for i in range(y_pred.shape[1]):
            label_predicted=max(y_pred[:,i])
            predicted_labels.append(y_pred[:,i].tolist().index(label_predicted))
    
        return predicted_labels

    def get_batch(self, X, y, batch_size):
        """
        Return minibatch of samples and labels
        
        :param X, y: samples and corresponding labels
        :parma batch_size: minibatch size
        :returns: (tuple) X_batch, y_batch
        """
        current_index=self.parameters["batch_index"]
        self.parameters["batch_index"]=self.parameters["batch_index"]+batch_size
        X_batch,y_batch = X[:,current_index:current_index+batch_size], y[current_index:current_index+batch_size]

        return X_batch, y_batch

In [3]:
# Helper functions, DO NOT modify this

def get_img_array(path):
    """
    Given path of image, returns it's numpy array
    """
    return scipy.misc.imread(path)

def get_files(folder):
    """
    Given path to folder, returns list of files in it
    """
    filenames = [file for file in glob.glob(folder+'*/*')]
    filenames.sort()
    return filenames

def get_label(filepath, label2id):
    """
    Files are assumed to be labeled as: /path/to/file/999_frog.png
    Returns label for a filepath
    """
    tokens = filepath.split('/')
    label = tokens[-1].split('_')[1][:-4]
    if label in label2id:
        return label2id[label]
    else:
        sys.exit("Invalid label: " + label)

In [1]:
# Functions to load data, DO NOT change these

def get_labels(folder, label2id):
    """
    Returns vector of labels extracted from filenames of all files in folder
    :param folder: path to data folder
    :param label2id: mapping of text labels to numeric ids. (Eg: automobile -> 0)
    """
    files = get_files(folder)
    y = []
    for f in files:
        y.append(get_label(f,label2id))
    return np.array(y)

def one_hot(y, num_classes=10):
    """
    Converts each label index in y to vector with one_hot encoding
    One-hot encoding converts categorical labels to binary values
    """
    y_one_hot = np.zeros((num_classes, y.shape[0]))
    y_one_hot[y, range(y.shape[0])] = 1
    return y_one_hot

def get_label_mapping(label_file):
    """
    Returns mappings of label to index and index to label
    The input file has list of labels, each on a separate line.
    """
    with open(label_file, 'r') as f:
        id2label = f.readlines()
        id2label = [l.strip() for l in id2label]
    label2id = {}
    count = 0
    for label in id2label:
        label2id[label] = count
        count += 1
    return id2label, label2id

def get_images(folder):
    """
    returns numpy array of all samples in folder
    each column is a sample resized to 30x30 and flattened
    """
    files = get_files(folder)
    print(files,"files")
    images = []
    count = 0
    
    for f in files:
        count += 1
        if count % 10000 == 0:
            print("Loaded {}/{}".format(count,len(files)))
        img_arr = get_img_array(f)
        img_arr = img_arr.flatten() / 255.0
        images.append(img_arr)
    X = np.column_stack(images)

    return X

def get_train_data(data_root_path):
    """
    Return X and y
    """
    train_data_path = data_root_path + 'train'
    id2label, label2id = get_label_mapping(data_root_path+'labels.txt')
    print(label2id)
    X = get_images(train_data_path)
    y = get_labels(train_data_path, label2id)
    return X, y

def save_predictions(filename, y):
    """
    Dumps y into .npy file
    """
    np.save(filename, y)

In [5]:
# Load the data
data_root_path = 'cifar10-hw1/'
X_train, y_train = get_train_data(data_root_path) # this may take a few minutes
X_test = get_images(data_root_path + 'test')
print('Data loading done')

{'airplane': 0, 'automobile': 1, 'bird': 2, 'cat': 3, 'deer': 4, 'dog': 5, 'frog': 6, 'horse': 7, 'ship': 8, 'truck': 9}
Loaded 10000/50000
Loaded 20000/50000
Loaded 30000/50000
Loaded 40000/50000
Loaded 50000/50000
Loaded 10000/10000
Data loading done


In [6]:
X_validate = X_train[:,45001:50000]
Y_validate = y_train[45001:50000]

X_train_temp = X_train[:,:45000]
Y_train_temp = y_train[:45000]

layer_dimensions_testing = [X_train_temp.shape[0],200,100,10]  # including the input and output layers

NN_test = NeuralNetwork(layer_dimensions_testing)

In [7]:
NN_test.train(X_train_temp, Y_train_temp, iters=100, alpha=0.1, batch_size=100, print_every=10)

Cost:  2.00354165621
Cost:  1.49419885342
Cost:  1.26331600184
Cost:  1.09878281674
Cost:  0.935991552424
Cost:  0.955828276332
Cost:  0.777220562799
Cost:  0.793491477183
Cost:  0.719714036605
Cost:  0.556888695817


In [8]:
y_predicted_validate = NN_test.predict(X_validate)
print(y_predicted_validate[:,1])
len(y_predicted_validate)
y_predicted_validate.shape

[ 0.08864504  0.08836611  0.08838112  0.16858777  0.12080552  0.09075662
  0.08864027  0.08836899  0.08839702  0.08905153]


(10, 4999)

In [12]:
from sklearn.metrics import accuracy_score
accuracy_score(predicted_labels,Y_validate)*100

46.389277855571116

## Part 1

#### Simple fully-connected deep neural network

In [13]:
layer_dimensions = [X_train.shape[0], 200,100, 10]  # including the input and output layers
NN = NeuralNetwork(layer_dimensions)
NN.train(X_train, y_train, iters=100, alpha=0.1, batch_size=100, print_every=100)

Cost:  2.0267290797


In [14]:
y_predicted = NN.predict(X_test)
save_predictions('ans1-tm2761', y_predicted)

In [15]:
# test if your numpy file has been saved correctly
loaded_y = np.load('ans1-tm2761.npy')
print(loaded_y.shape)
loaded_y[:10]

(10000,)


array([3, 8, 0, 3, 5, 1, 8, 4, 8, 1])

## Part 2: Regularizing the neural network
#### Add dropout and L2 regularization

In [16]:
NN2 = NeuralNetwork(layer_dimensions, drop_prob=0.2, reg_lambda=0.01)
NN2.train(X_train, y_train, iters=100, alpha=0.11, batch_size=100, print_every=10)

Cost:  2.02601511453
Cost:  1.48231966678
Cost:  1.32818367956
Cost:  1.19440113688
Cost:  1.11033931286
Cost:  1.03347161284
Cost:  0.873631487726
Cost:  0.825223326575
Cost:  0.799545125109
Cost:  0.957933363379


In [17]:
y_predicted2 = NN2.predict(X_test)
save_predictions('ans2-tm2761',y_predicted)

In [19]:
y_validate2 = NN2.predict(X_validate)
print("Accuracy=",accuracy_score(y_validate2,Y_validate)*100)

Accuracy= 70.9341868374
