### Imports

In [21]:
import numpy as np
from matplotlib import pyplot as plt
from keras.utils import to_categorical as make_class_categorical
import _pickle as pickle
from tqdm import tqdm
import pdb

### Initialize weights

In [2]:
def initialize_weights(d, m, K, variance=0.01):

    np.random.seed(400)
    
    W1= np.random.normal(0, variance, size=(m, d) )
    b1= np.random.normal(0, variance, size=(m, 1) )

    W2 = np.random.normal(0, variance, size=(K, m))
    b2 = np.random.normal(0, variance, size=(K, 1))

    return W1, b1, W2, b2

### Load Batch

In [4]:
def LoadBatch(filename):
    # borrowed from https://www.cs.toronto.edu/~kriz/cifar.html
    def unpickle(file):
        with open(file, 'rb') as fo:
            dict = pickle.load(fo, encoding='latin1')
        return dict

    dictionary = unpickle(filename)

    # borrowed from https://stackoverflow.com/questions/16977385/extract-the-nth-key-in-a-python-dictionary?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa
    def ix(dic, n):  # don't use dict as  a variable name
        try:
            return list(dic)[n]  # or sorted(dic)[n] if you want the keys to be sorted
        except IndexError:
            print('not enough keys')

    garbage = ix(dictionary, 1)
    y = dictionary[garbage]
    Y = np.transpose(make_class_categorical(y, 10))
    garbage = dictionary['data']
    X = np.transpose(garbage) / 255.0

    return X, Y, y


### ReLU


In [18]:
def ReLU(x):

    return np.maximum(x, 0)

### Softmax

In [6]:
def softmax(X, theta=1.0, axis=None):

    # Softmax over numpy rows and columns, taking care for overflow cases
    # Many thanks to https://nolanbconaway.github.io/blog/2017/softmax-numpy
    # Usage: Softmax over rows-> axis =0, softmax over columns ->axis =1

    """
    Compute the softmax of each element along an axis of X.
    Parameters
    ----------
    X: ND-Array. Probably should be floats.
    theta (optional): float parameter, used as a multiplier
        prior to exponentiation. Default = 1.0
    axis (optional): axis to compute values along. Default is the
        first non-singleton axis.
    Returns an array the same size as X. The result will sum to 1
    along the specified axis.
    """

    # make X at least 2d
    y = np.atleast_2d(X)

    # find axis
    if axis is None:
        axis = next(j[0] for j in enumerate(y.shape) if j[1] > 1)

    # multiply y against the theta parameter,
    y = y * float(theta)

    # subtract the max for numerical stability
    y = y - np.expand_dims(np.max(y, axis=axis), axis)

    # exponentiate y
    y = np.exp(y)

    # take the sum along the specified axis
    ax_sum = np.expand_dims(np.sum(y, axis=axis), axis)

    # finally: divide elementwise
    p = y / ax_sum

    # flatten if X was 1D
    if len(X.shape) == 1: p = p.flatten()

    return p

### Evaluate Classifier

In [28]:
def EvaluateClassifier(X, W1, b1, W2, b2):

    s1= np.dot(W1,X)+ b1
    h=ReLU(s1)
    s=np.dot(W2,h)+b2
    p= softmax(s, axis=1)

    return p, h, s1

### Predict classes

In [8]:
def predictClasses(p):
    
    return np.argmax(p, axis=0)

### Compute Gradients

In [89]:
def ComputeGradients(X, Y, W1, b1, W2, b2, regularization_term):
    
    def IndXPositive(x):
        above_zero_indices = x > 0
        below_zero_indices = x <= 0
        x[above_zero_indices] = 1
        x[below_zero_indices] = 0

        return x

    """
    Computes gradient descent updates on a batch of data

    :param X: Input data
    :param Y: One-hot representation of the true labels of input data X
    :param W1: Weight matrix of the first layer
    :param b1: Bias vector of the first layer
    :param W2: Weight matrix of the second layer
    :param b2: Bias vector of the second layer
    :param regularization_term: Contribution of the regularization in the weight updates

    :return: Weight and bias updates of the first and second layer of our network
    """

    # Evaluate the classifier to the batch
    p, h, s1 = EvaluateClassifier(X=X, W1=W1, b1=b1, W2=W2, b2=b2)
    print(s1.shape)
    
    grad_W1 = np.zeros(W1.shape)
    grad_b1 = np.zeros(b1.shape)

    grad_W2 = np.zeros(W2.shape)
    grad_b2 = np.zeros(b2.shape)
    
    for datum in range(X.shape[1]):
    
        # Back-propagate second layer at first
    
        # Gradient of J w.r.t second bias vector is the g vector:
        g = (Y[:,datum]-p[:,datum]).reshape(Y.shape[0], 1).T
        grad_b2 += g.T
        # Gradient of J w.r.t second weight matrix is the matrix:
        grad_W2 += np.dot(g.T, h[:, datum].reshape(h.shape[0], 1).T) 
    
        # Back-propagate the gradient vector g to the first layer
        g = np.copy( np.dot(g, W2) )
        g = np.copy( np.dot( g, np.diag(1*(s1[:, datum]>0)) ) ) 
        
        grad_b1 += g.T
        grad_W1 += np.dot(g.T , X[:,datum].reshape(X.shape[0], 1).T)
        
    grad_W1 /= X.shape[1]
    grad_b1 /= X.shape[1]
    grad_W2 /= X.shape[1]
    grad_b2 /= X.shape[1]

    # Add regularizers
    grad_W1+= 2 * regularization_term * W1
    grad_W2+= 2 * regularization_term * W2

#     return grad_W1, grad_b1, grad_W2, grad_b2

## RUN EXPERIMENTS

### Get training, validation and test sets

In [11]:
X_training_1, Y_training_1, y_training_1 = LoadBatch('../../cifar-10-batches-py/data_batch_1')
X_training_2, Y_training_2, y_training_2 = LoadBatch('../../cifar-10-batches-py/data_batch_2')
X_test, _ , y_test= LoadBatch('../../cifar-10-batches-py/test_batch')

### Compute Gradients

In [12]:
W1, b1, W2, b2 = initialize_weights(d = X_training_1.shape[0], m=50, K=10)

In [88]:
ComputeGradients(X_training_1[:, 0:20], Y_training_1[:, 0:20], W1, b1, W2, b2, regularization_term=0)

(50, 20)


ValueError: shapes (50,1) and (50,50) not aligned: 1 (dim 1) != 50 (dim 0)