In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy.misc
import glob
import sys
# you shouldn't need to make any more imports

In [2]:
class NeuralNetwork(object):
    """
    Abstraction of neural network.
    Stores parameters, activations, cached values. 
    Provides necessary functions for training and prediction. 
    """
    def __init__(self, layer_dimensions, drop_prob=0.0, reg_lambda=0.0):
        """
        Initializes the weights and biases for each layer
        :param layer_dimensions: (list) number of nodes in each layer
        :param drop_prob: drop probability for dropout layers. Only required in part 2 of the assignment
        :param reg_lambda: regularization parameter. Only required in part 2 of the assignment
        """
        # initialize paramters
        np.random.seed(1)
        
        self.parameters = {}
        self.num_layers = len(layer_dimensions)
        self.drop_prob = drop_prob
        self.reg_lambda = reg_lambda
        
        self.layer_dimensions = layer_dimensions.copy()
        X_size = self.layer_dimensions[0]
        self.layer_dimensions.pop(0) 
        
        self.sample_mean = 0
        self.sample_stdDev = 0
        
        self.momentum_solver = 0
        
        self.decay_alpha = 0
        
        self.batch_count = 1
        self.predict_mode = 0
        ncells_prev = X_size
        # W and b are initiliazed here (Random initialization - maybe later switch to Xvier init)
        for layer_id,cells in enumerate(self.layer_dimensions):
            self.parameters['W'+str(layer_id+1)] = np.random.randn(cells, ncells_prev) * 0.02
            self.parameters['V'+str(layer_id+1)] = np.zeros((cells,ncells_prev))
            self.parameters['b'+str(layer_id+1)] = np.zeros((cells, 1))
            ncells_prev = cells
        

    def affineForward(self, A, W, b):
        """
        Forward pass for the affine layer.
        :param A: input matrix, shape (L, S), where L is the number of hidden units in the previous layer and S is
        the number of samples
        :returns: the affine product WA + b, along with the cache required for the backward pass
        """
        Z = np.dot(W,A)+b
        return Z,(Z,A,W)
        

    def activationForward(self, A, activation="relu"):
        """
        Common interface to access all activation functions.
        :param A: input to the activation function
        :param prob: activation funciton to apply to A. Just "relu" for this assignment.
        :returns: activation(A)
        """ 
        if activation == 'relu':
            A = self.relu(A)
        else:
            A = self.softmax(A)
            
        return A


    def relu(self, X):
        return np.maximum(0,X)
    
    def softmax(self,X):
        shiftx = X - np.max(X)
        s= np.sum(np.exp(shiftx), axis=0, keepdims=True)
        return np.exp(shiftx) / s

            
    def dropout(self, A, prob):
        """
        :param A: 
        :param prob: drop prob
        :returns: tuple (A, M) 
            WHERE
            A is matrix after applying dropout
            M is dropout mask, used in the backward pass
        """
        #M = np.random.binomial(1, 1-prob, size=A.shape)
        p = prob
        M  = (np.random.rand(*A.shape) < p) / p # first dropout mask. Notice /p!

        if self.predict_mode == 1:
            A = A
        else:
            A *= M

        return A, M

    def forwardPropagation(self, X):
        """
        Runs an input X through the neural network to compute activations
        for all layers. Returns the output computed at the last layer along
        with the cache required for backpropagation.
        :returns: (tuple) AL, cache
            WHERE 
            AL is activation of last layer
            cache is cached values for each layer that
                     are needed in further steps
        """
        cache_all = []
        parameters = self.parameters
        layers = self.layer_dimensions
        AL = X
        for layer_id, cells in enumerate(layers):

            W = parameters['W'+str(layer_id+1)]
            b = parameters['b'+str(layer_id+1)]
            Z, cache = self.affineForward(AL,W,b)
            M = None
            if layer_id != len(layers)-1:
                A = self.activationForward(Z)
                if(self.drop_prob > 0 and layer_id > 0):
                    A,M = self.dropout(A,self.drop_prob)
            else:
                A = self.activationForward(Z,'softmax')
    
            cache_all.append((cache,M))

            AL = A

        cache_all.append((None, AL, None))
            
        return AL, cache_all
    
    def costFunction(self, AL, y):
        """
        :param AL: Activation of last layer, shape (num_classes, S)
        :param y: labels, shape (S)
        :param alpha: regularization parameter
        :returns cost, dAL: A scalar denoting cost and the gradient of cost
        """
        # compute loss
        probs = np.copy(AL.T)
        num_samples = probs.shape[0]
        corect_logprobs = -np.log(probs[range(num_samples),y])
        data_loss = np.sum(corect_logprobs)/num_samples
        
        sum_reg = 0
        if self.reg_lambda > 0:
            # add regularization
            for layer_id in range(len(self.layer_dimensions)):
                W = self.parameters['W'+str(layer_id+1)]
                sum_reg += np.sum(W*W)

        Loss = data_loss+sum_reg*self.reg_lambda*0.5
               
      
        
        # gradient of cost
        dscores = probs
        dscores[range(num_samples),y] -= 1
        dscores /= num_samples
        dAL = dscores.T
        ''' 
        dAL = probs.T
        Y = one_hot(y)
        dAL = np.multiply(Y,dAL)
        for i in range(num_samples):
            dAL[y[i],i] = 1/dAL[y[i],i]
        '''
        return Loss+sum_reg, dAL

    def affineBackward(self, dA_prev, cache):
        """
        Backward pass for the affine layer.
        :param dA_prev: gradient from the next layer.
        :param cache: cache returned in affineForward
        :returns dA: gradient on the input to this layer
                 dW: gradient on the weights
                 db: gradient on the bias
        """
        Z,A_Prev,W = cache
        dZ = self.activationBackward(dA_prev, cache)
        m = dZ.shape[1] # I still don't know why move 1/m here from backpropagate_cost.
        dA = np.dot(W.T, dZ)
        dW = np.dot(dZ, A_Prev.T)
        db = (1/m)*np.sum(dZ, axis=1).reshape(-1,1)
        return dA, dW, db

    def activationBackward(self, dA, cache, activation="relu"):
        """
        Interface to call backward on activation functions.
        In this case, it's just relu. 
        """
        Z,A_Prev,W = cache
        if activation == "relu":
            dA[Z <=0] = 0
            return dA
        

        
    def relu_derivative(self, dx, cached_x):
        return np.multiply(dx, (cached_x >=0).astype(np.float32))
        

    def dropout_backward(self, dA, cache):
        M = cache
        dA = dA*M*(1/self.drop_prob)
        return dA

    def backPropagation(self, dAL, Y, cache):
        """
        Run backpropagation to compute gradients on all paramters in the model
        :param dAL: gradient on the last layer of the network. Returned by the cost function.
        :param Y: labels
        :param cache: cached values during forwardprop
        :returns gradients: dW and db for each weight/bias
        """
        # gardient would be a dictionary storing in the form of gradients[key] = tuple
        # key = layer id
        # tuple = (dZ,dA,db)
        gradients = {}
        
        n = len(self.layer_dimensions)
        
        # Last Layer getting dZ from the softmax function and the computing db
        (Z, A_prev, W),M = cache[n-1]
        tmp_z,AL,tmp_w = cache[n]
        dZL = dAL
        #dZL = np.multiply(dAL, (AL-np.power(AL,2)))
        m = dZL.shape[1]    
        #print(Z.shape, A_prev.shape, W.shape, dAL.shape, dZL.shape)

        dbL = 1/m * np.sum(dZL, axis=1).reshape(-1,1)
        dWL = np.dot(dZL, A_prev.T)
        dA = np.dot(W.T, dZL)

        gradients[n] =  (dAL,dWL,dbL)
        
        dA_prev = dA
        # for the rest of layers upto 1
        for i in np.arange(start=(len(cache)-2),stop = 0,step = -1):


            cache_layer, M = cache[i-1]

            if self.drop_prob > 0 and M is not None:
                #call dropout_backward
                dA_prev  = self.dropout_backward(dA_prev,M)

            dA,dW,db = self.affineBackward(dA_prev,cache_layer)
            
            
            if self.reg_lambda > 0:
            # add gradients from L2 regularization to each dW
                dW += cache_layer[2]*self.reg_lambda
            
            gradients[i] = (dA_prev,dW,db)
            dA_prev = dA

        
        return gradients


    def updateParameters(self, gradients, alpha):
        """
        :param gradients: gradients for each weight/bias
        :param alpha: step size for gradient descent 
        """
        gamma = 0.9
        
        for i in range(len(self.layer_dimensions)):
            dA, dW, db = gradients[i+1]
            
            if(self.momentum_solver == 0):
                self.parameters['W'+str(i+1)] -= dW * alpha
                self.parameters['b'+str(i+1)] -= db * alpha
            else:
                self.parameters['V'+str(i+1)] = gamma * self.parameters['V'+str(i+1)] - dW * alpha
                self.parameters['W'+str(i+1)] += self.parameters['V'+str(i+1)]
                self.parameters['b'+str(i+1)] -= db * alpha

    def train(self, X, y, iters=1000, alpha=0.0001, batch_size=5000, print_every=100):
        """
        :param X: input samples, each column is a sample
        :param y: labels for input samples, y.shape[0] must equal X.shape[1]
        :param iters: number of training iterations
        :param alpha: step size for gradient descent
        :param batch_size: number of samples in a minibatch
        :param print_every: no. of iterations to print debug info after
        """
        
        #normalise the data        
        X = X.T.copy()
        self.sample_mean = np.mean(X, axis = 0)
        self.sample_stdDev = np.std(X, axis = 0)
        X -= self.sample_mean
        X /= self.sample_stdDev        
        X = X.T
            
        X_train, Y_train = X[:,5000:], y[5000:]
        X_dev, Y_dev = X[:,:5000], y[:5000]
        
        
        for i in range(0, iters+1):
            if i ==  5000 and (self.decay_alpha == 1):
                alpha /= 10
            # get minibatch
            X_batch, Y_batch = self.get_batch(X_train,Y_train, batch_size)
           
            # forward prop
            AL, cache = self.forwardPropagation(X_batch)
            
            # compute loss
            cost, dAL = self.costFunction(AL,Y_batch)
            
            # compute gradients
            gradients = self.backPropagation(dAL, Y_batch, cache)


            # update weights and biases based on gradient
            self.updateParameters(gradients,alpha)

            if i % print_every == 0:
          
                # prediction
                Y_predict = np.argmax(AL, axis=0)
                # train accuracy
                train_accuracy = np.sum(np.equal(Y_predict, Y_batch)) / Y_predict.shape[0]
                
                # For dev set:
                self.predict_mode = 1
                AL,cache = self.forwardPropagation(X_dev)
                Y_predict = np.argmax(AL,axis=0)
                self.predict_mode = 0
                dev_accuracy = np.sum(np.equal(Y_predict, Y_dev))/Y_predict.shape[0]
                # print cost, train and validation set accuracies
                print('Iteration: '+str(i)+'| cost = '+str(cost)+'| train accuracy = '+str(train_accuracy*100)+'| dev_accuracy = '+str(dev_accuracy*100))
                                  
                                  
    def predict(self, X):
        """
        Make predictions for each sample
        """
        # Normalize the data:
        X = X.T.copy()
        X -= self.sample_mean
        X /= self.sample_stdDev        
        X = X.T
        
        # call forwardpropagation:
        self.predict_mode = 1
        AL, cache = self.forwardPropagation(X)
        self.predict_mode = 0
        
        return np.argmax(AL,axis=0)
    
    def set_momentum_solver(self, val):
        '''
        param val: turns on/off the momentum solver:
        '''
        self.momentum_solver = val
        

    def set_alpha_decay(self, val):
        '''
        param val: turns on/off the alpha decay:
        '''
        self.decay_alpha = val
        
        
    def get_batch(self, X, Y, batch_size):
        """
        Return minibatch of samples and labels
        
        :param X, y: samples and corresponding labels
        :parma batch_size: minibatch size
        :returns: (tuple) X_batch, y_batch
        """

        n = self.batch_count
        if n*batch_size > X.shape[1]:

            permutation = np.random.permutation(X.shape[1])
            X = X[:, permutation]
            Y = Y[permutation]

            self.batch_count = 2
            n = 1
        else:
            self.batch_count += 1
            
        lb = batch_size*(n-1)
        ub = batch_size*(n)

        X = X[:,lb:ub]
        Y = Y[lb:ub]
        return X,Y

In [3]:
# Helper functions, DO NOT modify this

def get_img_array(path):
    """
    Given path of image, returns it's numpy array
    """
    return scipy.misc.imread(path)

def get_files(folder):
    """
    Given path to folder, returns list of files in it
    """
    filenames = [file for file in glob.glob(folder+'*/*')]
    filenames.sort()
    return filenames

def get_label(filepath, label2id):
    """
    Files are assumed to be labeled as: /path/to/file/999_frog.png
    Returns label for a filepath
    """
    tokens = filepath.split('/')
    label = tokens[-1].split('_')[1][:-4]
    if label in label2id:
        return label2id[label]
    else:
        sys.exit("Invalid label: " + label)

In [4]:
# Functions to load data, DO NOT change these

def get_labels(folder, label2id):
    """
    Returns vector of labels extracted from filenames of all files in folder
    :param folder: path to data folder
    :param label2id: mapping of text labels to numeric ids. (Eg: automobile -> 0)
    """
    files = get_files(folder)
    y = []
    for f in files:
        y.append(get_label(f,label2id))
    return np.array(y)

def one_hot(y, num_classes=10):
    """
    Converts each label index in y to vector with one_hot encoding
    """
    y_one_hot = np.zeros((num_classes, y.shape[0]))
    y_one_hot[y, range(y.shape[0])] = 1
    return y_one_hot

def get_label_mapping(label_file):
    """
    Returns mappings of label to index and index to label
    The input file has list of labels, each on a separate line.
    """
    with open(label_file, 'r') as f:
        id2label = f.readlines()
        id2label = [l.strip() for l in id2label]
    label2id = {}
    count = 0
    for label in id2label:
        label2id[label] = count
        count += 1
    return id2label, label2id

def get_images(folder):
    """
    returns numpy array of all samples in folder
    each column is a sample resized to 30x30 and flattened
    """
    files = get_files(folder)
    images = []
    count = 0
    
    for f in files:
        count += 1
        if count % 10000 == 0:
            print("Loaded {}/{}".format(count,len(files)))
        img_arr = get_img_array(f)
        img_arr = img_arr.flatten() / 255.0
        images.append(img_arr)
    X = np.column_stack(images)

    return X

def get_train_data(data_root_path):
    """
    Return X and y
    """
    train_data_path = data_root_path + 'train'
    id2label, label2id = get_label_mapping(data_root_path+'labels.txt')
    print(label2id)
    X = get_images(train_data_path)
    y = get_labels(train_data_path, label2id)
    return X, y

def save_predictions(filename, y):
    """
    Dumps y into .npy file
    """
    np.save(filename, y)

In [5]:
# Load the data
data_root_path = 'cifar10-hw1/'
X_train, y_train = get_train_data(data_root_path) # this may take a few minutes
X_test = get_images(data_root_path + 'test')
print('Data loading done')

{'airplane': 0, 'bird': 2, 'ship': 8, 'cat': 3, 'horse': 7, 'dog': 5, 'truck': 9, 'frog': 6, 'automobile': 1, 'deer': 4}
Loaded 10000/50000
Loaded 20000/50000
Loaded 30000/50000
Loaded 40000/50000
Loaded 50000/50000
Loaded 10000/10000
Data loading done


## Part 1

#### Simple fully-connected deep neural network

In [6]:
layer_dimensions = [X_train.shape[0],  150, 250, 10]
NN = NeuralNetwork(layer_dimensions)
NN.train(X_train, y_train, iters=15000, alpha=0.01, batch_size=128, print_every=1000)

Iteration: 0| cost = 2.30467222539| train accuracy = 6.25| dev_accuracy = 8.5
Iteration: 1000| cost = 1.78348463271| train accuracy = 37.5| dev_accuracy = 35.4
Iteration: 2000| cost = 1.72424684482| train accuracy = 39.0625| dev_accuracy = 42.08
Iteration: 3000| cost = 1.47029105182| train accuracy = 42.1875| dev_accuracy = 46.44
Iteration: 4000| cost = 1.51453469116| train accuracy = 44.53125| dev_accuracy = 47.82
Iteration: 5000| cost = 1.34210721514| train accuracy = 55.46875| dev_accuracy = 49.02
Iteration: 6000| cost = 1.19566669899| train accuracy = 64.84375| dev_accuracy = 50.12
Iteration: 7000| cost = 1.37244095542| train accuracy = 49.21875| dev_accuracy = 51.1
Iteration: 8000| cost = 1.33402186512| train accuracy = 50.78125| dev_accuracy = 51.44
Iteration: 9000| cost = 1.14778826759| train accuracy = 61.71875| dev_accuracy = 51.6
Iteration: 10000| cost = 1.22164788011| train accuracy = 58.59375| dev_accuracy = 51.98
Iteration: 11000| cost = 1.01579464455| train accuracy = 63.

In [7]:
y_predicted = NN.predict(X_test)
save_predictions('ans1-ck2840', y_predicted)

In [8]:
# test if your numpy file has been saved correctly
loaded_y = np.load('ans1-ck2840.npy')
print(loaded_y.shape)
loaded_y[:10]

(10000,)


array([3, 9, 0, 5, 5, 2, 8, 4, 0, 1])

## Part 2: Regularizing the neural network
#### Add dropout and L2 regularization

In [9]:
layer_dimensions = [X_train.shape[0], 150, 250, 10]
NN2 = NeuralNetwork(layer_dimensions, drop_prob=0.5, reg_lambda=0.01)
NN2.train(X_train, y_train, iters=10000, alpha=0.01, batch_size=256, print_every=1000)

Iteration: 0| cost = 203.364249386| train accuracy = 6.640625| dev_accuracy = 8.7
Iteration: 1000| cost = 174.396629947| train accuracy = 36.328125| dev_accuracy = 39.68
Iteration: 2000| cost = 151.256843929| train accuracy = 48.828125| dev_accuracy = 44.7
Iteration: 3000| cost = 133.834210717| train accuracy = 45.703125| dev_accuracy = 48.1
Iteration: 4000| cost = 120.552827957| train accuracy = 58.984375| dev_accuracy = 49.7
Iteration: 5000| cost = 111.421406246| train accuracy = 56.25| dev_accuracy = 51.32
Iteration: 6000| cost = 105.463870913| train accuracy = 62.890625| dev_accuracy = 52.32
Iteration: 7000| cost = 102.366101653| train accuracy = 58.59375| dev_accuracy = 53.12
Iteration: 8000| cost = 101.163659766| train accuracy = 63.671875| dev_accuracy = 52.82
Iteration: 9000| cost = 101.721363484| train accuracy = 60.546875| dev_accuracy = 53.08
Iteration: 10000| cost = 103.578384138| train accuracy = 56.25| dev_accuracy = 52.76


In [12]:
y_predicted2 = NN2.predict(X_test)
save_predictions('ans2-ck2840', y_predicted2)

## Part 3: Optional effort to boost accuracy:
#### Added alpha decay

In [13]:
layer_dimensions = [X_train.shape[0], 150, 250, 10]
NN3 = NeuralNetwork(layer_dimensions, drop_prob=0.5, reg_lambda=0.01)
NN3.set_alpha_decay(1)
NN3.train(X_train, y_train, iters=7000, alpha=0.1, batch_size=256, print_every=1000)

Iteration: 0| cost = 203.364249386| train accuracy = 6.640625| dev_accuracy = 12.1
Iteration: 1000| cost = 83.9475098978| train accuracy = 47.65625| dev_accuracy = 48.98
Iteration: 2000| cost = 88.7660569702| train accuracy = 53.90625| dev_accuracy = 51.42
Iteration: 3000| cost = 102.600975631| train accuracy = 54.296875| dev_accuracy = 49.52
Iteration: 4000| cost = 113.197249418| train accuracy = 62.890625| dev_accuracy = 51.06
Iteration: 5000| cost = 119.809274982| train accuracy = 48.4375| dev_accuracy = 48.2
Iteration: 6000| cost = 113.684835456| train accuracy = 69.140625| dev_accuracy = 56.2
Iteration: 7000| cost = 112.270240211| train accuracy = 68.75| dev_accuracy = 56.86


In [14]:
y_predicted3 = NN3.predict(X_test)
save_predictions('ans3-ck2840', y_predicted3)