# COMS 4995_002 Deep Learning Assignment 1
Due on Monday, Oct 9, 11:59pm

This assignment can be done in groups of at most 3 students. Everyone must submit on Courseworks individually.

Write down the UNIs of your group (if applicable)

Member 1: Name, UNI

Member 2: Name, UNI

Member 3: Name, UNI

In [5]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy.misc
import glob
import sys
# you shouldn't need to make any more imports

In [6]:
class NeuralNetwork(object):
    """
    Abstraction of neural network.
    Stores parameters, activations, cached values. 
    Provides necessary functions for training and prediction. 
    """
    def __init__(self, layer_dimensions, drop_prob=0.0, reg_lambda=0.0):
        """
        Initializes the weights and biases for each layer
        :param layer_dimensions: (list) number of nodes in each layer
        :param drop_prob: drop probability for dropout layers. Only required in part 2 of the assignment
        :param reg_lambda: regularization parameter. Only required in part 2 of the assignment
        """
        np.random.seed(1)
        
        self.parameters = {}
        self.num_layers = len(layer_dimensions)
        self.layer_dimensions = layer_dimensions
        self.drop_prob = drop_prob
        self.reg_lambda = reg_lambda
        
        self.random_permutation = np.array([])
        self.last_start = 0
        
        # initialize the parameter
        # use the procedure on page 3, lecture 3
        #W needs to be defined for layers = 1...num_layers - 1
        for i in range(1,self.num_layers):
            self.parameters[i] = []
            fan_in = layer_dimensions[i-1]
            
            #num_units in this layers
            num_units_current = layer_dimensions[i]
            num_units_previous = layer_dimensions[i-1]
            
            #generate parameters at once
            #take out the last column and use it as bias
            parameters = np.random.normal(size=(num_units_current, num_units_previous + 1), loc = 0.0, scale = 1.0/np.sqrt(fan_in))
            
            #first element is the weight matrix
            weight_matrix = parameters[:,0:-1]
            print weight_matrix.shape
            self.parameters[i].append(weight_matrix)
            
            #second element is the bias column vector
            bias_vector = parameters[:,[-1]]
            self.parameters[i].append(bias_vector)
        

    def affineForward(self, A, W, b):
        """
        Forward pass for the affine layer.
        :param A: input matrix, shape (L, S), where L is the number of hidden units in the previous layer and S is
        the number of samples
        :returns: the affine product WA + b, along with the cache required for the backward pass
        """
        
        #for WA to be defined W : [n_current, L], then the product is of size: [n_current, S]
        #b : [n_current, 1]
        
        #automatically broadcasts b column onto the matrix WA
        Z = np.matmul(W,A) + b
        return Z

    def activationForward(self, A, activation="relu"):
        """
        Common interface to access all activation functions.
        :param A: input to the activation function
        :param prob: activation funciton to apply to A. Just "relu" for this assignment.
        :returns: activation(A)
        """ 
        
        #activation function is always zero
        return self.relu(A)

    def relu(self, X):
        
        #sets all the negative elements of the X to zero
        return np.maximum(0,X)
            
    def dropout(self, A, prob):
        """
        :param A: 
        :param prob: drop prob
        :returns: tuple (A, M) 
            WHERE
            A is matrix after applying dropout
            M is dropout mask, used in the backward pass
        """

        return A, M

    def forwardPropagation(self, X):
        """
        Runs an input X through the neural network to compute activations
        for all layers. Returns the output computed at the last layer along
        with the cache required for backpropagation.
        :returns: (tuple) AL, cache
            WHERE 
            AL is activation of last layer
            cache is cached values for each layer that
                     are needed in further steps
        """
        #AL : [last layer size X num_samples]
        
        #indexed by layer number
        #input layer has index 0
        # indices of layer go from 0,1,...num_layers - 1
        # output layer has index num_layers - 1
        # contains - zl, al, wl
        cache = {}
        
        cache[0] = []
        cache[0].append([])
        cache[0].append(X)
        cache[0].append([])
        
        A_l = X
        
        #goes on for layer 1..self.num_layers - 1
        for layer_index in self.parameters:
            parameters = self.parameters[layer_index]
            weight_matrix = parameters[0]
            bias_vector = parameters[1]
            
            #A and Z for this layer
            Z_l_plus_1 = self.affineForward(A_l, weight_matrix, bias_vector)
            A_l_plus_1 = self.activationForward(Z_l_plus_1)
            
            #in backward propagation, Z_l and A_l are needed, so caching these as of now
            #i + 1 because index starting at zero
            cache[layer_index] = [Z_l_plus_1, A_l_plus_1, weight_matrix]
            
            A_l = A_l_plus_1
            
        #AL = A_l_plus_1
        
        #don't use the standard relu on final output layer and use the softmax instead
        #actually, this changes the activation of the final layer to linear
        
        #AL = Z_l_plus_1
        #OR
        AL = A_l_plus_1
        
        #using softmax as activation to calculate the final output probability distribution
        #the first two line make sure inf,nan limits are not reached
        D = np.max(AL, axis = 0)
        AL = AL - D
        AL = np.exp(AL)
        column_sums = np.sum(AL, axis = 0)
        AL = AL / column_sums
        return AL, cache
    
    def costFunction(self, AL, y):
        """
        :param AL: Activation of last layer, shape (num_classes, S)
        :param y: labels, shape (S)
        :param alpha: regularization parameter
        :returns cost, dAL: A scalar denoting cost and the gradient of cost
        """
        # compute loss
        y_one_hot = one_hot(y)
        cost = self.cross_entropy(y_one_hot, AL)
        cost = cost / y.shape[0]
        
#         if self.reg_lambda > 0:
#             pass
            # add regularization
               
        # gradient of cost
        # dAL : [classes X num_samples]
        num_classes = self.layer_dimensions[-1]
        dAL = np.zeros((num_classes, y.shape[0]))
        for sample_index in range(y.shape[0]):
            label_for_this_sample = y[sample_index]
            AL_for_this_sample = AL[:, sample_index]
            dAL[:, sample_index][label_for_this_sample] = 1 - AL_for_this_sample[label_for_this_sample]
        return cost, dAL

    def affineBackward(self, dA_prev, cache):
        """
        Backward pass for the affine layer.
        :param dA_prev: gradient from the next layer.
        :param cache: cache returned in affineForward
        :returns dA: gradient on the input to this layer
                 dW: gradient on the weights
                 db: gradient on the bias
        """

        return dA, dW, db

    def activationBackward(self, dA, cache, activation="relu"):
        """
        Interface to call backward on activation functions.
        In this case, it's just relu. 
        """

        
#     def relu_derivative(self, dx, cached_x):

#         return dx

    def relu_derivative(self, x):
        dx = np.ones(x.shape)
        dx[x < 0] = 0
        return dx
    def dropout_backward(self, dA, cache):

        return dA

    def backPropagation(self, dAL, Y, cache):
        """
        Run backpropagation to compute gradients on all paramters in the model
        :param dAL: gradient on the last layer of the network. Returned by the cost function.
        :param Y: labels
        :param cache: cached values during forwardprop
        :returns gradients: dW and db for each weight/bias
        """
        
        #gradients[i] <- gradient for weight matrix, gradient for bias vector
        gradients = {}
        num_samples = Y.shape[0]
        
        #initialize the gradients for weight matrix & bias vector
        #layer indices go from 1..num_layers - 1
        for layer_index in range(1,self.num_layers):
            gradients[layer_index] = []
            
            #matrix and vector accumulate gradient change over all samples
            
            #dimensions of W and dW are same
            dimensions_of_W = self.parameters[layer_index][0].shape
            dW = np.zeros(dimensions_of_W)
            
            #dimensions of b and db are same
            dimensions_of_b = self.parameters[layer_index][1].shape
            db = np.zeros(dimensions_of_b)
            
            gradients[layer_index].append(dW)
            gradients[layer_index].append(db)
        
        
#         y_one_hot = one_hot(Y)
        
#         for sample_index in range(Y.shape[0]):
            
#             print "Running back prop for sample #" + str(sample_index + 1)
            
            #calculate gradients due to this sample
            
            #since the activation of last layer is linear, backpropagate it separately
        
        l = self.num_layers - 1
#         dAL_for_this_sample = dAL[:, [sample_index]]
#         dAl = dAL_for_this_sample

        #if no activation before feeding into softmax
        #dZl = dAl

        #if activation relu is applied before feeding into softmax
        
        dAl = dAL
        Zl = cache[l][0]
        dZl = np.multiply(dAl, self.relu_derivative(Zl))

        #cache <- {zl, al, wl}
        Al_minus_1 = cache[l - 1][1]
        dWl = np.matmul(dZl, Al_minus_1.T)
        gradients[l][0] = gradients[l][0] + float(1.0 / num_samples) * dWl

        dbl = dZl
        gradients[l][1] = gradients[l][1] + float(1.0 / num_samples) * dbl

        dAl_plus_1 = dAl
        dZl_plus_1 = dZl

        #cache <- {zl, al, wl}
        for l in range(self.num_layers - 2, 0, -1):


#                 print "Back prog sample #{} Layer l: {}".format(sample_index, l)

            #the only requirement from previous layer is dZl_plus_one

            Wl_plus_1 = cache[l + 1][2]
            dAl = np.matmul(Wl_plus_1.T, dZl_plus_1)
            Zl = cache[l][0]
            dZl = np.multiply(dAl, self.relu_derivative(Zl))
            Al_minus_1 = cache[l - 1][1]


            #gradients formula
            dWl = np.matmul(dZl, Al_minus_1.T)
            dbl = dZl

            #accumulate the change
            gradients[l][0] = gradients[l][0] + float(1.0 / num_samples) * dWl
            gradients[l][1] = gradients[l][1] + float(1.0 / num_samples)* dbl

            dZl_plus_1 = dZl
            dAl_plus_1 = dAl


                
            
#             if self.drop_prob > 0:
#                 pass
                #call dropout_backward
           
            
#         if self.reg_lambda > 0:
#             # add gradients from L2 regularization to each dW
#             pass

#already normalized the gradients
#         for layer_index in gradients:
#             dWl = gradients[layer_index][0]
#             dbl = gradients[layer_index][1]
            
#             dWl = (1.0/Y.shape[0]) * dWl
#             dbl = (1.0/Y.shape[0]) * dbl
            
#             gradients[layer_index][0] = dWl
#             gradients[layer_index][1] = dbl
        
        return gradients


    def updateParameters(self, gradients, alpha):
        """
        :param gradients: gradients for each weight/bias
        :param alpha: step size for gradient descent 
        """
        
        #update the weight matrices, bias vectors of all the layers 1.., self.num_layers - 1
        for layer_index in gradients:
            parameters = self.parameters[layer_index]
            W = parameters[0]
            b = parameters[1]
            
            dW = gradients[layer_index][0]
            db = gradients[layer_index][1]
            
            #descent's update rule
            W = W - alpha * dW
            b = b - alpha * db
            
            #updated parameters
            parameters = [W, b]
            
            self.parameters[layer_index] = parameters
    
    def cross_entropy(self, p1, p2):
        one_minus_p1 = 1 - p1
        log_one_minus_p2 = np.log(1 - p2)
        log_p2 = np.log(p2)
        
        loss = -(np.multiply(p1, log_p2) + np.multiply(one_minus_p1, log_one_minus_p2))
        return np.sum(loss)
        
    def train(self, X, y, iters=1000, alpha=0.0001, batch_size=100, print_every=100):
        """
        :param X: input samples, each column is a sample
        :param y: labels for input samples, y.shape[0] must equal X.shape[1]
        :param iters: number of training iterations
        :param alpha: step size for gradient descent
        :param batch_size: number of samples in a minibatch
        :param print_every: no. of iterations to print debug info after
        """
        
        for i in range(0, iters):
            
#             print "Batch #" + str(i)
            
            # get minibatch
            X_batch, y_batch = self.get_batch(X,y, batch_size)

            
            # forward prop
#             print "Forward Propagation started"
            AL, cache = self.forwardPropagation(X_batch)
#             print "Forward Propagation completed"
            
            # compute loss, conpute cross entropy loss between y_one_hot and AL, both of them are probability distributions
            # compute gradients
            cost, dAL = self.costFunction(AL, y_batch)
#             print "Backward Propagation started"
            gradients = self.backPropagation(dAL, y_batch, cache)
#             print "Backward Propagation completed"
            
            # update weights and biases based on gradient
            self.updateParameters(gradients, alpha)
            
            if i % print_every == 0:
                #calculate train_accuracy
                y_train = self.predict(X_batch)
                train_accuracy = self.accuracy(y_batch, y_train)
                
                # print cost, train and validation set accuracies
                print "Cost: {}, Train accuracy: {}".format(cost, train_accuracy)
                
    def accuracy(self, y1, y2):
        matched_indices = (y1 == y2)
        return float(np.sum(matched_indices)) / y1.shape[0]
    def predict(self, X):
        """
        Make predictions for each sample
        """
        
        AL,_ = self.forwardPropagation(X)
        y_pred = np.argmax(AL, axis = 0).T
        
        return y_pred

    def get_batch(self, X, y, batch_size):
        """
        Return minibatch of samples and labels
        
        :param X, y: samples and corresponding labels
        :parma batch_size: minibatch size
        :returns: (tuple) X_batch, y_batch
        """
        
        #selects consecutive batches of size `batch_size` from X,y
        #the order of samples is decided by a random permutation
        if self.random_permutation.shape[0] == 0:
            
            #generate the random permutation
            self.random_permutation = np.random.permutation(X.shape[1])
            
            
            #get the permuted samples into X_permuted
            self.X_permuted = X[:,self.random_permutation]
            
            #get the permuted labels into Y_permuted
            self.Y_permuted = y[self.random_permutation]
            
            #initialize the start of the batch to zero
            self.last_start = 0
        
        if self.last_start + batch_size <= X.shape[1]:
            X_batch = self.X_permuted[:,self.last_start: self.last_start + batch_size]
            y_batch = self.Y_permuted[self.last_start: self.last_start + batch_size]
            
            #increase the last_start
            self.last_start = (self.last_start + batch_size) % X.shape[1]
        else:
            X_batch_1 = self.X_permuted[:, self.last_start: X.shape[1]]
#             x_1 = X.shape[0] - self.last_start
#             X_batch_2 = self.X_permuted[:, 0: batch_size - x_1]
#             X_batch = np.concatenate((X_batch_1, X_batch_2), axis = 1)
            X_batch = X_batch_1
            y_batch_1 = self.Y_permuted[self.last_start: X.shape[1]]
#             y_batch_2 = self.Y_permuted[:, 0: batch_size - x_1]
#             y_batch = np.concatenate((y_batch_1, y_batch_2))
            y_batch = y_batch_1
        
        
        
        
        return X_batch, y_batch

In [7]:
# Helper functions, DO NOT modify this

def get_img_array(path):
    """
    Given path of image, returns it's numpy array
    """
    return scipy.misc.imread(path)

def get_files(folder):
    """
    Given path to folder, returns list of files in it
    """
    filenames = [file for file in glob.glob(folder+'*/*')]
    filenames.sort()
    return filenames

def get_label(filepath, label2id):
    """
    Files are assumed to be labeled as: /path/to/file/999_frog.png
    Returns label for a filepath
    """
    tokens = filepath.split('/')
    label = tokens[-1].split('_')[1][:-4]
    if label in label2id:
        return label2id[label]
    else:
        sys.exit("Invalid label: " + label)

In [8]:
# Functions to load data, DO NOT change these

def get_labels(folder, label2id):
    """
    Returns vector of labels extracted from filenames of all files in folder
    :param folder: path to data folder
    :param label2id: mapping of text labels to numeric ids. (Eg: automobile -> 0)
    """
    files = get_files(folder)
    y = []
    for f in files:
        y.append(get_label(f,label2id))
    return np.array(y)

def one_hot(y, num_classes=10):
    """
    Converts each label index in y to vector with one_hot encoding
    """
    y_one_hot = np.eye(num_classes)
    y = y.reshape(-1)
    y_one_hot = y_one_hot[:,y]
    return y_one_hot

def get_label_mapping(label_file):
    """
    Returns mappings of label to index and index to label
    The input file has list of labels, each on a separate line.
    """
    with open(label_file, 'r') as f:
        id2label = f.readlines()
        id2label = [l.strip() for l in id2label]
    label2id = {}
    count = 0
    for label in id2label:
        label2id[label] = count
        count += 1
    return id2label, label2id

def get_images(folder):
    """
    returns numpy array of all samples in folder
    each column is a sample resized to 30x30 and flattened
    """
    files = get_files(folder)
    images = []
    count = 0
    
    for f in files:
        count += 1
        if count % 10000 == 0:
            print("Loaded {}/{}".format(count,len(files)))
        img_arr = get_img_array(f)
        img_arr = img_arr.flatten() / 255.0
        images.append(img_arr)
    X = np.column_stack(images)

    return X

def get_train_data(data_root_path):
    """
    Return X and y
    """
    train_data_path = data_root_path + 'train'
    id2label, label2id = get_label_mapping(data_root_path+'labels.txt')
    print(label2id)
    X = get_images(train_data_path)
    y = get_labels(train_data_path, label2id)
    return X, y

def save_predictions(filename, y):
    """
    Dumps y into .npy file
    """
    np.save(filename, y)

In [9]:
# Load the data
data_root_path = '/home/aayush/ColumbiaCourseProjects/Deep Learning/Assignments/1/Myown/cifar10-hw1/'
X_train, y_train = get_train_data(data_root_path) # this may take a few minutes
X_test = get_images(data_root_path + 'test')
print('Data loading done')
print(y_train.shape)

{'horse': 7, 'automobile': 1, 'deer': 4, 'dog': 5, 'frog': 6, 'cat': 3, 'truck': 9, 'ship': 8, 'airplane': 0, 'bird': 2}
Loaded 10000/50000
Loaded 20000/50000
Loaded 30000/50000
Loaded 40000/50000
Loaded 50000/50000
Loaded 10000/10000
Data loading done
(50000,)


## Part 1

#### Simple fully-connected deep neural network

In [10]:
layer_dimensions = [X_train.shape[0], 300, 200, 10]  # including the input and output layers
NN = NeuralNetwork(layer_dimensions)
# NN.train(X_tra4in, y_train, iters=, alpha=, batch_size=, print_every=)
NN.train(X_train, y_train, batch_size = 5)

(300, 3072)
(200, 300)
(10, 200)
Cost: 3.27987382559, Train accuracy: 0.0
Cost: 3.27111864969, Train accuracy: 0.0
Cost: 3.2677147798, Train accuracy: 0.0
Cost: 3.24224628853, Train accuracy: 0.2
Cost: 3.25178956055, Train accuracy: 0.0
Cost: 3.2205431634, Train accuracy: 0.2
Cost: 3.25149473996, Train accuracy: 0.2
Cost: 3.25421685236, Train accuracy: 0.0
Cost: 3.2528384415, Train accuracy: 0.0
Cost: 3.25316239081, Train accuracy: 0.2


In [None]:
# y_predicted = NN.predict(X_test)
# save_predictions('ans1-uni', y_predicted)

In [None]:
# test if your numpy file has been saved correctly
# loaded_y = np.load('ans1-uni.npy')
# print(loaded_y.shape)
# loaded_y[:10]

## Part 2: Regularizing the neural network
#### Add dropout and L2 regularization

In [None]:
# NN2 = NeuralNetwork(layer_dimensions, drop_prob=0, reg_lambda=0)
# NN2.train(X_train, y_train, iters=1000, alpha=0.00001, batch_size=1000, print_every=10)

In [None]:
# y_predicted2 = NN2.predict(X)
# save_predictions(y_predicted, 'ans2-uni')