**Installing Wandb**

In [None]:
!pip install wandb

**Import statements**

In [17]:
import wandb

In [3]:
from keras.datasets import fashion_mnist
import numpy as np
import random
from sklearn.model_selection import train_test_split

In [4]:
class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat', 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']
k = len(class_names)

# **Question 1**

In [26]:
(trainX, trainy), (testX, testy) = fashion_mnist.load_data()
wandb.init(
    project="Assignment 1",
    entity="cs22m006",
    name="Assignment1_sample_images"
)
def plotImagesOfEachClass():
  image_labels = []
  images = []
  for i in range(len(trainX)):
    if len(image_labels) == len(class_names):
      break
    if class_names[trainy[i]] not in image_labels:
      image_labels.append(class_names[trainy[i]])
      images.append(trainX[i])

  wandb.log({"Sample image for each class ": [wandb.Image(img, caption=caption) for img, caption in zip(images, image_labels)]})

plotImagesOfEachClass()

VBox(children=(Label(value='0.001 MB of 0.008 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.068419…

# **Question 2 and 3**

In [None]:
(x_train, y_train), (testX, testy) = fashion_mnist.load_data()
class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat', 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']
k = len(class_names)
x_train = x_train.reshape(x_train.shape[0], x_train.shape[1]*x_train.shape[2])
x_train = x_train/255.0
testX = testX.reshape(testX.shape[0], testX.shape[1]*testX.shape[2])
testX = testX/255.0

In [6]:
def initializeWeightAndBias(layer_dims, init_mode = "random uniform"):
  W = []
  bias = []
  np.random.seed(3)
  if(init_mode == "random uniform"):
    for layer_num in range(len(layer_dims)-1):
      W.append(np.random.uniform(-0.7, 0.7, (layer_dims[layer_num+1], layer_dims[layer_num])))
      bias.append((np.random.uniform(-0.7, 0.7, (layer_dims[layer_num+1],1))))
  elif(init_mode == "random normal"):
    for layer_num in range(len(layer_dims)-1):
      W.append(np.random.randn(layer_dims[layer_num+1], layer_dims[layer_num]))
      bias.append((np.random.randn(layer_dims[layer_num+1],1)))
  elif(init_mode == "xavier"):
    for layer_num in range(len(layer_dims)-1):
      W.append(np.random.randn(layer_dims[layer_num+1],layer_dims[layer_num])*np.sqrt(2/(layer_dims[layer_num+1]+layer_dims[layer_num])))
      bias.append(np.random.randn(layer_dims[layer_num+1],1)*np.sqrt(2/(layer_dims[layer_num+1])))
  return W, bias

In [7]:
def feedForward(W, bias, X, num_hidden_layers, layer_dims, activation_fun = "tanh"):
  preactivation = []
  activation = []
  activation.append(X.T)
  preactivation.append(X.T)
  for i in range(1, num_hidden_layers+1):
    preactivation.append(bias[i-1] + np.matmul(W[i-1], activation[(i-1)]))
    if(activation_fun == "sigmoid"):
      activation.append(sigmoid(preactivation[i]))
    elif(activation_fun == "tanh"):
      activation.append(tanh(preactivation[i]))
    elif(activation_fun == "reLU"):
      activation.append(reLU(preactivation[i]))
  preactivation.append(bias[-1] + np.dot(W[-1], activation[-1]))
  activation.append(softmax(preactivation[-1]))
  return activation[-1], activation, preactivation

In [8]:
def updateParam(W, gradientW, bias, gradientBias, learning_rate):
  for i in range(0, len(W)):
    W[i] = W[i] - learning_rate*gradientW[i]
    bias[i] = bias[i] - learning_rate*gradientBias[i]
  return W, bias

def updateParamMomentum(W, bias, gradientW, gradientBias, previous_updates_W, previous_updates_Bias, learning_rate, momentum):
  for idx in range(len(gradientW)):
    previous_updates_W[idx] = momentum*previous_updates_W[idx] + gradientW[idx]
    previous_updates_Bias[idx] = momentum*previous_updates_Bias[idx] + gradientBias[idx]
  for i in range(0, len(W)):
    W[i] = W[i] - learning_rate*gradientW[i]
    bias[i] = bias[i] - learning_rate*gradientBias[i]
  return W, bias
  

def updateParamRMS(W, gradientW, bias, gradientBias, learning_rate, v_W, v_bias, beta):
  eps = 1e-6
  for idx in range(0, len(W)):
    v_W_t = beta*v_W[idx] + (1-beta)*np.multiply(gradientW[idx], gradientW[idx])
    v_bias_t = beta*v_bias[idx] + (1-beta)*np.multiply(gradientBias[idx], gradientBias[idx])
    W[idx] = W[idx] - learning_rate*gradientW[idx]/(np.sqrt(v_W_t)+eps)
    bias[idx] = bias[idx] - learning_rate*gradientBias[idx]/(np.sqrt(v_bias_t)+eps)
    v_W[idx] = v_W_t
    v_bias[idx] = v_bias_t
  return W, bias, v_W, v_bias

def updateParamAdam(W, bias, gradientW, gradientBias, v_W, v_bias, m_W, m_bias, t, learning_rate, beta1, beta2):

  epsilon = 1e-6

  for i in range(0, len(W)):
    mdW = beta1*m_W[i] + (1-beta1)*gradientW[i]
    mdBias = beta1*m_bias[i] + (1-beta1)*gradientBias[i]
    vdW = beta2*v_W[i] + (1-beta2)*np.square(gradientW[i])
    vdBias = beta2*v_bias[i] + (1-beta2)*np.square(gradientBias[i])
    m_w_hat = mdW/(1.0 - beta1**t)
    v_w_hat = vdW/(1.0 - beta2**t)
    m_bias_hat = mdBias/(1.0 - beta1**t)
    v_bias_hat = vdBias/(1.0 - beta2**t)

    W[i] = W[i] - (learning_rate * m_w_hat)/np.sqrt(v_w_hat + epsilon)
    bias[i] = bias[i] - (learning_rate * m_bias_hat)/np.sqrt(v_bias_hat + epsilon)

    v_W[i] = vdW
    m_W[i] = mdW
    v_bias[i] = vdBias
    m_bias[i] = mdBias

    return W, bias, v_W, v_bias, m_W, m_bias

In [9]:
def sigmoid(X):
  return 1.0/(1.+np.exp(-X))

def sigmoid_derivative(x):
  return sigmoid(x)*(1-sigmoid(x))

def reLU(x):
  return np.maximum(0,x)

def reLU_derivative(x):
  return 1*(x>0) 

def tanh(x):
  return np.tanh(x)

def tanh_derivative(x):
  return (1 - (np.tanh(x)**2))

def softmax(a):
  return np.exp(a)/np.sum(np.exp(a), axis=0)

In [10]:
def calculateAccuracy(batch_size, X, y, W, bias, num_hidden_layers, layer_dims, activation_fun):
  batch_count = batch_size
  count = 0
  for i in range(0, len(X), batch_size):
    if(i+batch_size>len(X)):
      batch_count = len(X)-i
    hL, activation, preactivation = feedForward(W, bias, X[i:i+batch_count], num_hidden_layers, layer_dims, activation_fun)
    for j in range(i, i+batch_count):
      if(np.argmax(hL[:,(j-i)]) == y[j]):
        count+=1
  return (100.0*count)/len(X)

In [11]:
def backward_propogation(y_one_hot, x, y, W, bias, activation, preactivation, num_hidden_layers, batch_size, activation_fun = "tanh"):
  L = num_hidden_layers+1
  gradientPreactivation = []
  gradientPreactivation.append(activation[L]-y_one_hot)
  gradientWeight = []
  gradientBias = []
  for k in range(L, 0, -1):
    gradientWeight.append(np.matmul(gradientPreactivation[-1], activation[k-1].T)/batch_size)
    gradientBias.append(np.sum(gradientPreactivation[-1], axis=1, keepdims=True)/batch_size)
    if k==1:
      break
    if(activation_fun == "sigmoid"):
      gradientPreactivation.append(np.multiply(np.matmul(W[k-1].T, gradientPreactivation[-1]), sigmoid_derivative(preactivation[k-1])))
    elif(activation_fun == "tanh"):
      gradientPreactivation.append(np.multiply(np.matmul(W[k-1].T, gradientPreactivation[-1]), tanh_derivative(preactivation[k-1])))
    if(activation_fun == "reLU"):
      gradientPreactivation.append(np.multiply(np.matmul(W[k-1].T, gradientPreactivation[-1]), reLU_derivative(preactivation[k-1])))
  return gradientWeight[::-1], gradientBias[::-1]

In [12]:
def cross_entropy(y, y_hat, W, weight_decay):
  loss = 0
  for i in range(len(y)):
    loss += -1.0*np.sum(y[i]*np.log(y_hat[i]))
  
  #L2 regularizaation
  acc = 0
  for i in range(len(W)):
    acc += np.sum(W[i]**2)
  loss += weight_decay*acc
  return loss

def mse(y, y_hat, W, weight_decay):
  loss = 1/2* np.sum((y-y_hat)**2)
  #L2 regularizaation
  acc = 0
  for i in range(len(W)):
    acc += np.sum(W[i]**2)
  loss += weight_decay*acc
  return loss

In [14]:
def optimizers(num_hidden_layers, neurons_in_each_layer, epochs, learning_rate, batch_size, init_mode, activation_fun, loss_function = "cross_entropy", optimizer = "sgd", momentum = 0.9, beta = 0.9, beta1 = 0.9, beta2 = 0.999, weight_decay = 0):
  layer_dims = [trainX.shape[1]]
  for i in range(num_hidden_layers):
    layer_dims.append(neurons_in_each_layer)
  layer_dims.append(k)
  W, bias = initializeWeightAndBias(layer_dims, init_mode)
  y_pred = []
  batch_count = batch_size
  y_one_hot = np.zeros((10, num_images))
  for i in range(num_images):
    y_one_hot[trainy[i]][i] = 1
  v_W = [0]*(num_hidden_layers+1)
  v_bias = [0]*(num_hidden_layers+1)
  m_W, m_bias, gradientW, gradientBias, look_ahead_W, look_ahead_bias, previous_updates_W, previous_updates_Bias = v_W.copy(), v_bias.copy(), v_W.copy(), v_bias.copy(), v_W.copy(), v_bias.copy(), v_W.copy(), v_bias.copy()
  t = 1 #for adam

  for iterationNumber in range(epochs):
    loss=0
    batch_count = batch_size
    for i in range(0, num_images, batch_size):
      if(i+batch_size >= num_images):
        batch_count = num_images-i

      if(optimizer == "nag"):
        for idx in range(len(W)):
          look_ahead_W[idx] = W[idx] - momentum * gradientW[idx]
          look_ahead_bias[idx] = bias[idx] - momentum * gradientBias[idx]

        hL, activation, preactivation = feedForward(look_ahead_W, look_ahead_bias, trainX[i:i+batch_count], num_hidden_layers, layer_dims, activation_fun)

        gradientW, gradientBias = backward_propogation(y_one_hot[:,i:i+batch_count], trainX[i:i+batch_count], trainy[i:i+batch_count], look_ahead_W, look_ahead_bias, activation, preactivation, num_hidden_layers, batch_size, activation_fun)
        W, bias = updateParam(W, gradientW, bias, gradientBias, learning_rate)

      elif(optimizer == "nadam"):
        for idx in range(len(W)):
          look_ahead_W[idx] = W[idx] - momentum * gradientW[idx]
          look_ahead_bias[idx] = bias[idx] - momentum * gradientBias[idx]

        hL, activation, preactivation = feedForward(look_ahead_W, look_ahead_bias, trainX[i:i+batch_count], num_hidden_layers, layer_dims, activation_fun)

        gradientW, gradientBias = backward_propogation(y_one_hot[:,i:i+batch_count], trainX[i:i+batch_count], trainy[i:i+batch_count], look_ahead_W, look_ahead_bias, activation, preactivation, num_hidden_layers, batch_size, activation_fun)
        W, bias, v_W, v_bias, m_W, m_bias = updateParamAdam(W, bias, gradientW, gradientBias, v_W, v_bias, m_W, m_bias, t, learning_rate, beta1, beta2)
        t += 1

      else:
        hL, activation, preactivation = feedForward(W, bias, trainX[i:i+batch_count], num_hidden_layers, layer_dims, activation_fun)

        gradientW, gradientBias = backward_propogation(y_one_hot[:,i:i+batch_count], trainX[i:i+batch_count], trainy[i:i+batch_count], W, bias, activation, preactivation, num_hidden_layers, batch_size, activation_fun)
  
        if(optimizer == "sgd"):
          W, bias = updateParam(W, gradientW, bias, gradientBias, learning_rate)

        elif(optimizer == "momentum"):
          W, bias = updateParamMomentum(W, bias, gradientW, gradientBias, previous_updates_W, previous_updates_Bias, learning_rate, momentum)
        
        elif(optimizer == "rmsprop"):
          W, bias, v_W, v_bias = updateParamRMS(W, gradientW, bias, gradientBias, learning_rate, v_W, v_bias, beta)

        elif(optimizer == "adam"):
          W, bias, v_W, v_bias, m_W, m_bias = updateParamAdam(W, bias, gradientW, gradientBias, v_W, v_bias, m_W, m_bias, t, learning_rate, beta1, beta2)
          t += 1

      if(iterationNumber==epochs-1):
        for j in range(i, i+batch_count):
          y_pred.append(np.argmax(hL[:,(j-i)]))
      if(loss_function == "cross_entropy"):
        loss += cross_entropy(y_one_hot[:,i:i+batch_count], hL[:,0:batch_count], W, weight_decay)
      elif(loss_function == "mean_squared_error"):
        loss += mse(y_one_hot[:,i:i+batch_count], hL[:,0:batch_count], W, weight_decay)
    
    valid_acc = calculateAccuracy(batch_size, validationX, validationy, W, bias, num_hidden_layers, layer_dims, activation_fun)
    print("validation accuracy at iteration", (iterationNumber+1), "=", valid_acc)
    print("loss at iteration", (iterationNumber+1), "=", loss/(num_images))
  return y_pred, W, bias
    

In [None]:
num_hidden_layers = 5
batch_size = 32
k = len(class_names)
trainX, validationX, trainy, validationy = train_test_split(x_train, y_train, random_state=104, test_size=0.1, shuffle=True)
num_images = len(trainy)
image_size = trainX.shape[1]
neurons_in_each_layer = 32
pred_label, W, bias=optimizers(num_hidden_layers, neurons_in_each_layer, epochs = 10, learning_rate = 0.1, batch_size = 32, init_mode = "random uniform", activation_fun = "sigmoid", loss_function = "cross_entropy", optimizer = "sgd", momentum = 0.9, beta = 0.9, beta1 = 0.9, beta2 = 0.999, weight_decay = 0)

In [18]:
cnt=0
for i in range(len(pred_label)):
  if(pred_label[i]==trainy[i]):
    cnt+=1
print("Accuracy on train data", 100*cnt/len(pred_label))

Accuracy on train data 81.11296296296297


In [None]:
layer_dims = [trainX.shape[1]]
for i in range(num_hidden_layers):
  layer_dims.append(neurons_in_each_layer)
layer_dims.append(k)

In [21]:
batch_count = batch_size
count = 0
for i in range(0, len(validationX), batch_size):
  if(i+batch_size>len(validationX)):
    batch_count = len(validationX)-i-1
  hL, activation, preactivation = feedForward(W, bias, validationX[i:i+batch_count], num_hidden_layers, layer_dims, activation_fun = "sigmoid")
  for j in range(i, i+batch_count):
    if(np.argmax(hL[:,(j-i)]) == validationy[j]):
      count+=1
print("Accuracy on validation data", (100.0*count)/len(validationX))

Accuracy on validation data 81.66666666666667


In [23]:
batch_count = batch_size
count = 0
for i in range(0, len(testX), batch_size):
  if(i+batch_size>len(testX)):
    batch_count = len(testX)-i-1
  hL, activation, preactivation = feedForward(W, bias, testX[i:i+batch_count], num_hidden_layers, layer_dims, activation_fun = "sigmoid")
  for j in range(i, i+batch_count):
    if(np.argmax(hL[:,(j-i)]) == testy[j]):
      count+=1
print("Accuracy on test data", (100.0*count)/len(testX))

Accuracy on test data 81.2
