In [430]:
import numpy as np
import pickle
import numpy.matlib
import matplotlib.pyplot as plt
import sys

In [463]:
###############################################################

# Matlab -> Python functions

###############################################################

# Loades an entire batch
def LoadBatch(filename):
	""" Copied from the dataset website """ 
	with open('Datasets/'+filename, 'rb') as fo:
		dict = pickle.load(fo, encoding='bytes') 
	return dict

# Calculate softmax for class class estimation of each image (vector)
def softmax(x):
	""" Standard definition of the softmax function """
	exp_x = np.exp(x)
	return exp_x / np.sum(exp_x, axis=0)

def ComputeGradsNum(W1, W2, b1, b2, X, Y,lambd, h=0.00001):
    
    grad_W2 = np.zeros(shape=W2.shape)
    grad_b2 = np.zeros(shape=b2.shape)
    grad_W1 = np.zeros(shape=W1.shape)
    grad_b1 = np.zeros(shape=b1.shape)   
    c = ComputeCost(X, Y, W1, W2,b1, b2, lambd)
    
    for i in range(b1.shape[0]):
        b1_try = b1.copy()
        b1_try[i,0] = b1_try[i,0]+h
        c2 = ComputeCost(X, Y, W1, W2,b1_try, b2, lambd)
        grad_b1[i,0] = (c2-c)/h
    
    for i in range(W1.shape[0]):
        for j in range(W1.shape[1]):
            W1_try = W1.copy()
            W1_try[i,j] = W1_try[i,j]+h
            c2 = ComputeCost(X, Y, W1_try, W2,b1, b2, lambd)
            grad_W1[i,j] = (c2-c)/h
    
    for i in range(b2.shape[0]):
        b2_try = b2.copy()
        b2_try[i,0] = b2_try[i,0]+h
        c2 = ComputeCost(X, Y, W1,W2,b1,  b2_try, lambd)
        grad_b2[i,0] = (c2-c)/h
    
    for i in range(W2.shape[0]):
        for j in range(W2.shape[1]):
            W2_try = W2.copy()
            W2_try[i,j] = W2_try[i,j]+h
            c2 = ComputeCost(X, Y, W1, W2_try,b1, b2, lambd)
            grad_W2[i,j] = (c2-c)/h
    
    return grad_W1,grad_W2,grad_b1,grad_b2

# Allows for efficiently view the images in a directory or 
# in a *Matlab* array or cell array
def montage(W):
	""" Display the image for each label in W """
	import matplotlib.pyplot as plt
	fig, ax = plt.subplots(2,5)
	for i in range(2):
		for j in range(5):
			im  = W[i*5+j,:].reshape(32,32,3, order='F')
			sim = (im-np.min(im[:]))/(np.max(im[:])-np.min(im[:]))
			sim = sim.transpose(1,0,2)
			ax[i][j].imshow(sim, interpolation='nearest')
			ax[i][j].set_title("y="+str(5*i+j))
			ax[i][j].axis('off')
	plt.show()



In [464]:
# Read data
X_train, Y_train, y_test = ReadData('data_batch_1')
X_val_train, Y_val_train, y_val_test = ReadData('data_batch_2')

#X_batch_1, Y_b1, y_b1 = ReadData('data_batch_1')
#X_batch_2, Y_b2, y_b2 = ReadData('data_batch_2')
#X_batch_3, Y_b3, y_b3 = ReadData('data_batch_3')
#X_batch_4, Y_b4, y_b4 = ReadData('data_batch_4')
#X_batch_5, Y_b5, y_b5 = ReadData('data_batch_5')

#X_train = np.stack((X_batch_1, X_batch_2, X_batch_3, X_batch_4, X_batch_5))
#Y_train = np.stack((Y_b1, Y_b2, Y_b3, Y_b4, Y_b5))
#y_train = np.stack((y_b1, y_b2, y_b3, y_b4, y_b5))

#X_train, X_val_train = X_train[:, 100]
X_test_train, Y_test_train, y_test_test = ReadData('test_batch')

# Gets mean and std of training data
X_mean, X_std = GetMeanAndStd(X_train)

In [None]:
###############################################################

# My functions

###############################################################

# Read pixel data, labels (classes), one-hot rep. of labels (classes)
# Divide pixel data by 255 for correct format
def ReadData(filename):
    data_batch = LoadBatch(filename)
    pixel_data = data_batch[b'data'].T
    labels = data_batch[b'labels']
    one_hot = np.eye(10)[labels].T
    return pixel_data, one_hot, labels 

# Normalize w.r.t. training data mean and standard deviation
# Normalization of input so that the inputs are at a comparable range
def Normalize(train, validation,test=None):
        train=np.float64(train)
        validation=np.float64(validation)
        
        mean_X =train.mean(axis=1)

        std_X=train.std(axis=1)

        train=train-mean_X[:,None]

        train=train/std_X[:,None]
        validation=validation-mean_X[:,None]
        validation=validation/std_X[:,None]
        
        if(test is not None):
            test=np.float64(test)
            test=test-mean_X[:,None]
            test=test/std_X[:,None]
            return train,validation,test;
        
        return train,validation;

# First init of model params W(eights) and b(ias)
# Init done with 0 mean and 1 / sqrt of d and m
# Random seed for selecting the same rndm numbers for each execution
def GetWeightAndBias(X, Y, m=50):
    
    weights = list()
    bias = list()
    d = X.shape[0]
    k = 10

    std_d = 1 / np.sqrt(d)
    std_m = 1 / np.sqrt(m)
    
    # W1 = m (50) x d (3072)
    # W2 = K (10) x m (50)
    np.random.seed(400)
    weights.append(np.random.normal(loc=0.0, scale=std_d, size=(m, d)))
    weights.append(np.random.normal(loc=0.0, scale=std_m, size=(k, m)))
        
    # b1 = m (50) x 1
    # b2 = K (10) x 1
    np.random.seed(400)
    bias.append(np.random.normal(loc=0.0, scale=std_d, size=(m,1)))#np.zeros(shape=(m, 1))
    bias.append(np.random.normal(loc=0.0, scale=std_m, size=(k,1)))#np.zeros(shape=(b_size[0], 1))
    
    # OLD
    # b = np.random.normal(loc=0.0, scale=0.01, size=(b_size[0], 1))
    # W = np.random.normal(loc=0.0, scale=0.01, size=(10, m))

    return weights, bias

# Evaluation of the network function
# Agan, Softmax returns each probability for each class label
def EvaluateClassifier(X, W1, W2,b1, b2):
   
    # Saving this for later
    #activaiton_list = list() 
    # just 'hard coded' 2 layer NN for now
    # s1 = np.dot(W[0], X) + b[0]
    # Out put of nodes, first layer
    # h1 = np.maximum(0, s1)
    # s2 = np.dot(W[1], h1)+ b[1]
    # Out put of nodes, second layer
    
    s1=W1@X+b1
    #relu    
    h1=s1 * (s1 > 0)
    s2=W2@h1+b2

    P = softmax(s2)
    act_vals = h1
    return P, act_vals

# Total cost of a set of images:
# 1. Regularization term, calculate: lambda * sum(W^2 ij)
# 2. Sum it with l_cross + regularization term -> for each x,y in D
# 3. Multiply everything with 1 / length of D
def ComputeCost(X, Y, W1, W2, b1, b2, lambd):

    # Saving for later
    # Calculate P using softmax
    #P, act_vals = EvaluateClassifier(X, W, b)
    
    # Calculate cross-entropy-loss
    #l_cross = -np.sum(np.multiply(Y, np.log(P)))
    
    # Calculate regularization term
    #sigma = lambda x, k: for i in range(k) np.sum(np.square(x))
    
    #not sure about this one
    # reg_term = lambd * np.sum([np.sum(np.square(w)) for w in W])
    
    # Calculate total cost of the set of imgs
    # J = (1 / len(X[1])) * l_cross + reg_term
    
    P, H = EvaluateClassifier(X, W1, W2, b1, b2)
    lcr =- np.sum(np.multiply(Y, np.log(P)))
    Reg_term = lambd*((W1**2).sum()+(W2**2).sum())
    J = lcr/X.shape[1]+Reg_term
    return J


# Accuracy of the network's predictions
# Percentage of examples for which it gets the correct answer
def ComputeAccuracy(X, y, W1, W2, b1, b2):
    P, act_vals = EvaluateClassifier(X,W1,  W2,b1, b2)
    acc = np.mean(y == np.argmax(P, axis=0))
    
    return acc
    
# Compute gradients of the cost function to see the curve of cost decrease 
# Forward pass is already done since we have already calculated P
def ComputeGradients(act_vals, X, Y, P, W1, W2,lambd):

    n_b1 = X.shape[1]
    K = Y.shape[0]
    d = act_vals.shape[0]

    # Backward pass
    G_batch = -(Y - P)
    
    # Backward pass for W + reg term fix reg later
    grad_W2 = np.dot(G_batch, act_vals.T)/ n_b1 + 2 * lambd * W2
        
    # Backward pass for b
    grad_b2=(np.dot(G_batch,np.ones(shape=(n_b1,1)))/n_b1).reshape(K,1)

    # Backward pass second layer
    G_batch = W2.T @ G_batch

    # g_test = G_batch * act_vals.T
    
    # 1 as in use this node 
    G_batch = G_batch*(act_vals>0) #, where=(act_vals >0))
    
    grad_W1 = np.dot(G_batch, X.T) / n_b1 + 2 * lambd * W1
    grad_b1=(np.dot(G_batch,np.ones(shape=(n_b1,1)))/n_b1).reshape(d,1)


    return grad_W1, grad_W2, grad_b1, grad_b2

# Check if my analytical gradients 
# Using centered difference function
# If the differenc is < 1e-6, the analytical gradients are fine
def CompareGradients(act_vals, X,Y, W1, W2, b1,b2, lambd, threshold):
    
    P, act_vals = EvaluateClassifier(X, W1, W2, b1,b2)

    #Calculate gradients
    grad_W1_a, grad_W2_a, grad_b1_a, grad_b2_a = ComputeGradients(act_vals, X, Y,P, W1,W2, lambd)
    grad_W1_n,grad_W2_n, grad_b1_n, grad_b2_n = ComputeGradsNum(W1, W2, b1,b2, X, Y,lambd, h=0.00001)

    # Calculate differences
    w_rel_error_1 = np.sum(np.abs(grad_W1_a - grad_W1_n)) / np.maximum(0.001, np.sum(np.abs(grad_W1_a) + np.abs(grad_W1_n)))
    w_rel_error_2 = np.sum(np.abs(grad_W2_a - grad_W2_n)) / np.maximum(0.001, np.sum(np.abs(grad_W2_a) + np.abs(grad_W2_n)))

    b_rel_error_1 = np.sum(np.abs(grad_b1_a - grad_b1_n)) / np.maximum(0.001, np.sum(np.abs(grad_b1_a) + np.abs(grad_b1_n)))
    b_rel_error_2 = np.sum(np.abs(grad_b2_a - grad_b2_n)) / np.maximum(0.001, np.sum(np.abs(grad_b2_a) + np.abs(grad_b2_n)))

    # Check differences
    if (w_rel_error_1 and w_rel_error_2) and (b_rel_error_2 and b_rel_error_1) < threshold:
        print("Analytical ok")
    else:
        print("Gradient difference too high")

 
def MiniBatchGD2(X, Y, y, GDparams, W1, W2, b1, b2, X_val=None, Y_val=None, y_val=None, lambd= 0 ):
    n = X.shape[1]
    (eta_min,eta_max,step_size,n_batch,cycles)=GDparams
    metrics = {'updates':[-1], 
               'Loss_scores':[ComputeCost(X, Y, W1, W2, b1, b2, lambd)], 
               'acc_scores':[ComputeAccuracy(X, y, W1, W2, b1, b2)]}
    if X_val is not None:
        metrics['Loss_val_scores'] = [ComputeCost(X_val, Y_val, W1, W2,b1, b2, lambd)]
        metrics['acc_val_scores'] = [ComputeAccuracy(X_val, y_val, W1, W2, b1, b2)]
    batches = dict()

    for j in range(n//n_batch):
            j_start = (j)*n_batch ;
            j_end = (j+1)*n_batch;
            inds = range(j_start,j_end);
            y_batch = [y[index] for index in inds]
            X_batch = X[:, inds];
            Y_batch = Y[:, inds];
            batches[j]=(X_batch,Y_batch,y_batch)
    j = 0
    
    for l in range(cycles):
        for t in range(2*l*step_size, 2*(l+1)*step_size):
            
            if t>= 2*l*step_size and t<(2*l+1)*step_size:
                eta = eta_min+(t-2*l*step_size)/step_size*(eta_max-eta_min)
            elif t>=(2*l+1)*step_size and t<2*(l+1)*step_size:
                eta = eta_max-(t-(2*l+1)*step_size)/step_size*(eta_max-eta_min)

            X_batch, Y_batch, y_batch = batches[j]
            P_batch, H_batch = EvaluateClassifier(X_batch, W1, W2,b1, b2)
            grad_W1, grad_W2, grad_b1, grad_b2 = ComputeGradients(H_batch, X_batch, Y_batch, P_batch, W1, W2,lambd)

          #  print(W1)
           # if(math.isnan(W1[0][0])):
           #     print("NU ÄR DET NAN ")
           #     print("J = ", j)
           #     print(W1)
             #   hej()
                # .1 * 1.5
                # .0001 * 1.5
            W1 -= eta*grad_W1
            b1 -= eta*grad_b1
            W2 -= eta*grad_W2
            b2 -= eta*grad_b2
            j += 1
            if j>(n//n_batch-1):
                # set j = 0 will start new epoch
                j = 0
                metrics['updates'].append(t+1)
                metrics['acc_scores'].append(ComputeAccuracy(X, y, W1, W2, b1, b2))
                metrics['Loss_scores'].append(ComputeCost(X, Y, W1, W2, b1, b2,lambd))

                if X_val is not None:
                    metrics['acc_val_scores'].append(ComputeAccuracy(X_val, y_val, W1, W2,b1, b2))
                    metrics['Loss_val_scores'].append(ComputeCost(X_val, Y_val, W1, W2,b1, b2, lambd))
                message = "In update "+str(t+1)+'/'+str(2*cycles*step_size)+" finishes epoch "+ \
                          str(len(metrics['updates'])-1)+": loss="+str(metrics['Loss_scores'][-1])+ \
                          " and accuracy="+str(metrics['acc_scores'][-1])+" (training set) \r"
                sys.stdout.write(message)
            
        
    
    return W1, b1, W2, b2, metrics

In [465]:
# Normalize all data w.r.t. mean and std of training data
X_train_normalized, X_val_train_normalized, X_test_train_normalized = Normalize(X_train, X_val_train, X_test_train, X_mean, X_std)

In [476]:
# Create model params W and b
W, b = GetWeightAndBias(X_train_normalized, Y_train, m=50)
W1 = W[0]
W2 = W[1]
b1 = b[0]
b2 = b[1]

print(W1.shape)
print(W2.shape)
print(b1.shape)
print(b2.shape)

(50, 3072)
(10, 50)
(50, 1)
(10, 1)


In [477]:
# Model evaluation (take softmax)
P, act_vals = EvaluateClassifier(X_train_normalized, W1, W2, b1, b2)

In [478]:
# X = dxn array of images (columns)
# Y = 1xn vector of labels (in one-hot) for X
# J = scalar corresponing to sum of the loss of the network's predictions,
# in X relative to ground truth labels and reg. term on W. 
# Lambda = specifies how much penalty to be added 
J = ComputeCost(X_train_normalized, Y_train, W1, W2, b1, b2, lambd = 0.005)

In [479]:
A = ComputeAccuracy(X_train_normalized, y_test, W1, W2, b1, b2)

In [480]:



lmb = 0.05
grad_W1, grad_W2, grad_b1, grad_b2 = ComputeGradients(act_vals, X_train_normalized, 
                                                      Y_train,P,
                                                      W1, W2, lmb)


In [481]:
threshold = 1e-5

CompareGradients(act_vals, X_train_normalized[0:20, [0]],Y_train[:, [0]], W1[:, 0:20], W2, b1,b2, 0, threshold)

#CompareGradients(X_train[0:1000, [1]], Y_train[:, [1]], W[:, 0:1000], b, 0, threshold)
#CompareGradients(X_train[0:3072, [1]], Y_train[:, [1]], W[:, 0:3072], b, 0, threshold)

Analytical ok


In [482]:
# Hyperparams
#GDparams = {'n_batch': 10, 'eta_min': 1e-5, 'eta_max':1e-1, 'cycles': 2}
lambd = 0

eta_min=1e-5
eta_max=1e-1
step_size=500
n_batch=10
cycles=2
GDparams=(eta_min,eta_max,step_size,n_batch,cycles)

# Train data
#cost, accuracy_list, s_im, W_upd, b_upd = MiniBatchGD(X_train_normalized, Y_train, GDparams, W, b, lambd)
print(X_train_normalized.dtype)
# Validation data
print(b1.shape)
print(b2.shape)
W1_upd, b1_upd, W2_upd, b2_upd, metrics = MiniBatchGD2(X_train_normalized, Y_train, y_test, GDparams, 
                                                       W1, W2, b1, b2,  
                                                       X_val_train_normalized, 
                                                       Y_val_train, y_val_test, lambd)

print(metrics)

float64
(50, 1)
(10, 1)


  return np.exp(x) / np.sum(np.exp(x), axis=0)
  return np.exp(x) / np.sum(np.exp(x), axis=0)
  G_batch = G_batch*(act_vals>0) #, where=(act_vals >0))


{'updates': [-1, 1000, 2000], 'Loss_scores': [2.543752870781795, nan, nan], 'acc_scores': [0.0898, 0.1005, 0.1005], 'Loss_val_scores': [2.533495633039159, nan, nan], 'acc_val_scores': [0.092, 0.0984, 0.0984]}


In [None]:
# Plotting
fig, ax = plt.subplots()  
ax.plot(np.arange(GDparams['n_epochs']), cost, 'b', label='Training loss')  
ax.plot(np.arange(GDparams['n_epochs']), cost_val, 'r', label='Validation loss') 
ax.set_xlabel('Iterations')  
ax.set_ylabel('Cost')  
ax.set_title('Error vs. Training Epoch')
plt.legend()
plt.show()

print("Current lambda:", lambd) 
print("Current n_batch:", GDparams['n_batch'])
print("Current eta:", GDparams['eta'])
print("Current n_epochs:", GDparams['n_epochs'])
print("Accuracy test data: ", Acc)

# Visualization of weight matrix W for each epoch
montage(W_upd)

In [None]:
def initialize_network(data_train, label_names):
    weights = list()
    bias = list()
    weights.append(np.random.normal(0, 1 / np.sqrt(data_train.shape[0]),
                                    (HIDDEN_NODES, data_train.shape[0])))  # Dim: m x d
    weights.append(np.random.normal(0, 1 / np.sqrt(HIDDEN_NODES),
                                    (len(label_names), HIDDEN_NODES)))  # Dim: k x m
    bias.append(np.zeros((HIDDEN_NODES, 1)))  # Dim: m x 1
    bias.append(np.zeros((len(label_names), 1)))  # Dim: k x 1

    return weights, bias


def forward_pass(data_train, weights, bias):
    output = list()  # Output of previous layer list
    s_list = list()  # s values list
    output.append(np.copy(data_train))
    s_list.append(compute_s(data_train, weights[0], bias[0]))
    for i in range(1, len(weights)):
        output.append(compute_h(s_list[-1]))
        s_list.append(compute_s(output[-1], weights[i], bias[i]))

    return output, s_list


def cyclical_update(t, n_s, eta_min, eta_max):
    cycle = int(t / (2 * n_s))  # Number of complete cycles elapsed
    if 2 * cycle * n_s <= t <= (2 * cycle + 1) * n_s:
        return eta_min + (t - 2 * cycle * n_s) / n_s * (eta_max - eta_min)
    if (2 * cycle + 1) * n_s <= t <= 2 * (cycle + 1) * n_s:
        return eta_max - (t - (2 * cycle + 1) * n_s) / n_s * (eta_max - eta_min)


def softmax(s):
    return np.exp(s) / np.sum(np.exp(s), axis=0)


def compute_h(s):
    return np.maximum(0, s)


def l_cross(y, p):
    return -np.log(np.sum(y * p, axis=0))


def compute_s(data, weight, bias):
    s = weight @ data + bias  # Dim: k x n

    return s


def compute_loss(data, labels, weights, bias):
    p = softmax(forward_pass(data, weights, bias)[1][-1])  # Value of s_2 computed in the forward pass
    l_cross_sum = np.sum(l_cross(labels, p))

    return (1 / data.shape[1]) * l_cross_sum


def compute_cost(data, labels, weights, bias, lmb):
    loss = compute_loss(data, labels, weights, bias)
    reg = lmb * np.sum([np.sum(np.square(w)) for w in weights])  # Regularization term L2

    return loss + reg


def compute_accuracy(data, labels, weights, bias):
    p = softmax(forward_pass(data, weights, bias)[1][-1])  # Value of s_2 computed in the forward pass
    prediction = np.argmax(p, axis=0)
    real = np.argmax(labels, axis=0)

    return np.sum(real == prediction) / len(real)


def compute_grads_analytic(data, labels, weights, lmb, p):
    grad_weights = list()
    grad_bias = list()
    # Last layer --> data[0] is the original input
    g = -(labels - p)  # Dim: k x n
    grad_weights.append((g @ data[-1].T) / data[0].shape[1] + 2 * lmb * weights[-1])
    grad_bias.append(np.sum(g, axis=1)[:, np.newaxis] / data[0].shape[1])
    # Remaining layers
    for i in reversed(range(len(data) - 1)):  # Reverse traversal of the lists
        g = weights[i + 1].T @ g  # Multiply by previous weight
        diag = np.copy(data[i + 1])  # Perform a copy of the output of the previous layer
        diag[diag > 0] = 1  # Transform every element > 0 into 1
        # diag[diag < 0] = 0  # Transform every element < 0 into 0
        g = g * diag  # Element multiplication by diagonal of the indicator over data[i]
        grad_weights.append((g @ data[i].T) / data[0].shape[1] + 2 * lmb * weights[i])
        grad_bias.append(np.sum(g, axis=1)[:, np.newaxis] / data[0].shape[1])
    grad_weights.reverse(), grad_bias.reverse()  # Reverse lists to return the same order

    return grad_weights, grad_bias