**Installing Wandb**

In [None]:
!pip install wandb

**Import statements**

In [10]:
from keras.datasets import fashion_mnist
import wandb
import numpy as np
import random

**Load dataset**

In [None]:
# Load the Fashion-MNIST train and test splits (images and integer labels) via keras.datasets.
(trainX, trainy), (testX, testy) = fashion_mnist.load_data()

In [11]:
# Human-readable name for every label; the label value is the index into this list.
class_names = [
  'T-shirt/top',
  'Trouser',
  'Pullover',
  'Dress',
  'Coat',
  'Sandal',
  'Shirt',
  'Sneaker',
  'Bag',
  'Ankle boot',
]
# Number of classes.
k = len(class_names)

# **Question 1**

In [None]:
# Start the wandb run that the logging calls below report to.
wandb.init(project="Assignment 1")
def plotImagesOfEachClass():
  """Log one sample image per class to wandb.

  Scans the module-level training set (trainX, trainy) in order and keeps the
  first image encountered for each class name, stopping once every entry of
  class_names has an image. The images are logged under the key "examples "
  with the class name as caption, in the order the classes were first seen.
  """
  first_image_by_name = {}
  for image, label in zip(trainX, trainy):
    if len(first_image_by_name) == len(class_names):
      break
    # setdefault keeps the earliest image for a class and ignores later ones.
    first_image_by_name.setdefault(class_names[label], image)

  samples = [wandb.Image(image, caption=name) for name, image in first_image_by_name.items()]
  wandb.log({"examples ": samples})

# Log the per-class sample images to the wandb run initialised at the top of this cell.
plotImagesOfEachClass()

# **Questions 2 and 3**

In [17]:
# Reload the dataset so this cell always starts from the unprocessed images,
# no matter which earlier cells were run.
(trainX, trainy), (testX, testy) = fashion_mnist.load_data()
num_images = len(trainX)
class_names = [
  'T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
  'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot',
]
k = len(class_names)

# Flatten each image into one row vector, then divide by 255
# (presumably mapping 8-bit pixel intensities into [0, 1]).
trainX = trainX.reshape(trainX.shape[0], -1) / 255.0
testX = testX.reshape(testX.shape[0], -1) / 255.0

In [113]:
def initializeWeightAndBias(layer_dims, init_mode = "random uniform"):
  """Create the initial weight matrices and bias vectors of the network.

  The global NumPy random state is re-seeded with 3 on every call, so the
  same arguments always produce the same parameters.

  Args:
    layer_dims: sequence of layer sizes, input layer first, output layer last.
    init_mode: one of
      "random uniform" - samples drawn from U(-0.7, 0.7);
      "random normal"  - standard normal samples;
      "xavier"         - standard normal samples scaled by
                         sqrt(2 / (fan_out + fan_in)) for weights and
                         sqrt(2 / fan_out) for biases.

  Returns:
    (W, bias): two lists with one entry per pair of consecutive layers.
    W[l] has shape (layer_dims[l+1], layer_dims[l]) and bias[l] has shape
    (layer_dims[l+1], 1).

  Raises:
    ValueError: if init_mode is not one of the supported names (previously
      this silently returned two empty lists).
  """
  supported_modes = ("random uniform", "random normal", "xavier")
  if init_mode not in supported_modes:
    raise ValueError("Unknown init_mode %r; expected one of %s" % (init_mode, supported_modes))

  np.random.seed(3)
  W = []
  bias = []
  for layer_num in range(len(layer_dims)-1):
    fan_in = layer_dims[layer_num]
    fan_out = layer_dims[layer_num+1]
    # Weights are always drawn before the bias of the same layer so the
    # random stream (and therefore the result) is stable for a given seed.
    if init_mode == "random uniform":
      W.append(np.random.uniform(-0.7, 0.7, (fan_out, fan_in)))
      bias.append(np.random.uniform(-0.7, 0.7, (fan_out, 1)))
    elif init_mode == "random normal":
      W.append(np.random.randn(fan_out, fan_in))
      bias.append(np.random.randn(fan_out, 1))
    else:  # "xavier"
      W.append(np.random.randn(fan_out, fan_in)*np.sqrt(2/(fan_out+fan_in)))
      bias.append(np.random.randn(fan_out, 1)*np.sqrt(2/(fan_out)))
  return W, bias

In [171]:
def feedForward(W, bias, X, num_hidden_layers, layer_dims, activation_fun = "tanh"):
  """Run a forward pass through the network for a batch of inputs.

  Args:
    W: list of weight matrices, one per layer (hidden layers first, output last).
    bias: list of bias column vectors matching W.
    X: batch of inputs with one sample per row; it is transposed so that
      every column of the intermediate results corresponds to one sample.
    num_hidden_layers: number of hidden layers to evaluate before the output layer.
    layer_dims: not used by this function; kept so existing callers keep working.
    activation_fun: hidden-layer activation, one of "sigmoid", "tanh", "reLU".

  Returns:
    (output, activation, preactivation) where activation[0] and
    preactivation[0] are both X.T, entry i holds the values of layer i, and
    output is activation[-1], the softmax of the last pre-activation.

  Raises:
    ValueError: if a hidden layer has to be evaluated with an unsupported
      activation_fun (previously no activation was stored, leaving the lists
      misaligned and the output wrong).
  """
  activation = [X.T]
  preactivation = [X.T]
  for i in range(1, num_hidden_layers+1):
    layer_input = bias[i-1] + np.matmul(W[i-1], activation[i-1])
    if activation_fun == "sigmoid":
      layer_output = sigmoid(layer_input)
    elif activation_fun == "tanh":
      layer_output = tanh(layer_input)
    elif activation_fun == "reLU":
      layer_output = reLU(layer_input)
    else:
      raise ValueError("Unknown activation_fun %r; expected 'sigmoid', 'tanh' or 'reLU'" % (activation_fun,))
    preactivation.append(layer_input)
    activation.append(layer_output)
  # Output layer: affine map followed by softmax over the class axis.
  preactivation.append(bias[-1] + np.matmul(W[-1], activation[-1]))
  activation.append(softmax(preactivation[-1]))
  return activation[-1], activation, preactivation

In [154]:
def updateParam(W, gradientW, bias, gradientBias, learning_rate):
  """Take one gradient-descent step on every layer.

  Each entry of the W and bias lists is replaced by
  ``value - learning_rate * gradient``; the same two list objects are
  returned after being updated.
  """
  for layer, weights in enumerate(W):
    W[layer] = weights - learning_rate*gradientW[layer]
    bias[layer] = bias[layer] - learning_rate*gradientBias[layer]
  return W, bias

In [124]:
def sigmoid(X):
  """Logistic function 1 / (1 + exp(-X)), evaluated element-wise."""
  denominator = 1. + np.exp(-X)
  return 1.0/denominator

def sigmoid_derivative(x):
  """Derivative of the logistic function at x: sigmoid(x) * (1 - sigmoid(x))."""
  s = sigmoid(x)
  return s*(1-s)

def reLU(x):
  """Rectified linear unit: the element-wise maximum of 0 and x."""
  lower_bound = 0
  return np.maximum(lower_bound, x)

def reLU_derivative(x):
  """Derivative of reLU: 1 where x is strictly positive, 0 elsewhere."""
  is_positive = x > 0
  # Multiplying by 1 turns the boolean mask into integers.
  return 1*is_positive

def tanh(x):
  """Hyperbolic tangent of x, evaluated element-wise with NumPy."""
  result = np.tanh(x)
  return result

def tanh_derivative(x):
  """Derivative of tanh at x: 1 - tanh(x)^2."""
  t = np.tanh(x)
  return 1 - t**2

def softmax(a):
  """Softmax along axis 0 (each column is normalised to sum to 1).

  The per-column maximum is subtracted before exponentiating. This leaves the
  result mathematically unchanged but keeps exp() from overflowing to inf
  (which previously produced inf/inf = NaN for large inputs).

  Args:
    a: array of scores; axis 0 indexes the entries that compete with each other.

  Returns:
    Array of the same shape as a with non-negative entries summing to 1 along axis 0.
  """
  shifted = a - np.max(a, axis=0, keepdims=True)
  exponentials = np.exp(shifted)
  return exponentials/np.sum(exponentials, axis=0)

In [114]:
def backward_propogation(y_one_hot, x, y, W, bias, activation, preactivation, num_hidden_layers, batch_size, activation_fun = "tanh"):
  """Back-propagate the output error and return the per-layer gradients.

  The error at the output layer is taken as ``activation[L] - y_one_hot``
  (the gradient of cross-entropy with respect to the softmax pre-activation)
  and is pushed back through the hidden layers using the derivative of
  activation_fun.

  Args:
    y_one_hot: one-hot targets with one column per sample.
    x, y, bias: not used by this function; kept so existing callers keep working.
    W: list of weight matrices, one per layer.
    activation, preactivation: per-layer values as returned by feedForward.
    num_hidden_layers: number of hidden layers; the output layer is layer
      num_hidden_layers + 1.
    batch_size: divisor applied to the summed gradients.
    activation_fun: hidden-layer activation, one of "sigmoid", "tanh", "reLU".

  Returns:
    (gradientWeight, gradientBias): lists ordered from the first layer to the
    output layer, aligned with W and bias.

  Raises:
    ValueError: if the error has to be propagated through a hidden layer with
      an unsupported activation_fun (previously the stale error term was
      silently reused, giving gradients of the wrong shape).
  """
  L = num_hidden_layers+1
  delta = activation[L]-y_one_hot
  gradientWeight = []
  gradientBias = []
  for layer in range(L, 0, -1):
    gradientWeight.append(np.matmul(delta, activation[layer-1].T)/batch_size)
    gradientBias.append(np.sum(delta, axis=1, keepdims=True)/batch_size)
    if layer == 1:
      # The input layer has no parameters, so there is nothing left to propagate to.
      break
    if activation_fun == "sigmoid":
      local_slope = sigmoid_derivative(preactivation[layer-1])
    elif activation_fun == "tanh":
      local_slope = tanh_derivative(preactivation[layer-1])
    elif activation_fun == "reLU":
      local_slope = reLU_derivative(preactivation[layer-1])
    else:
      raise ValueError("Unknown activation_fun %r; expected 'sigmoid', 'tanh' or 'reLU'" % (activation_fun,))
    # Error of the previous layer: pull delta back through W, then through the activation.
    delta = np.multiply(np.matmul(W[layer-1].T, delta), local_slope)
  # Gradients were collected output-first; flip them to match the order of W and bias.
  return gradientWeight[::-1], gradientBias[::-1]

In [136]:
def cross_entropy(y, y_hat):
  """Total cross-entropy ``-sum(y * log(y_hat))`` over all entries.

  Predicted probabilities are clamped to the smallest positive normal float
  before the logarithm, so an entry that underflowed to 0 contributes a
  finite value instead of turning the whole loss into NaN (0 * log(0)) or inf.

  Args:
    y: target distribution(s), e.g. one-hot columns.
    y_hat: predicted probabilities with the same shape as y.

  Returns:
    The summed (not averaged) cross-entropy as a float.
  """
  targets = np.asarray(y, dtype=float)
  predictions = np.asarray(y_hat, dtype=float)
  safe_predictions = np.maximum(predictions, np.finfo(float).tiny)
  return -1.0*np.sum(targets*np.log(safe_predictions))

def mse(y, y_hat):
  """Half of the summed squared difference between y and y_hat."""
  residual = y - y_hat
  return 0.5*np.sum(residual**2)

In [194]:
def stochastic_gradient_descent(num_hidden_layers, layer_dims, epochs, learning_rate, batch_size, init_mode, activation_fun, loss_function = "cross_entropy", optimizer = "sgd", beta = 0.9):
  """Train the network with mini-batch gradient descent.

  Reads the training data from the module-level names trainX, trainy and
  num_images.

  Args:
    num_hidden_layers: number of hidden layers.
    layer_dims: layer sizes, input first and output last; the last entry is
      used as the number of classes for the one-hot targets.
    epochs: number of passes over the training data.
    learning_rate: step size passed to updateParam.
    batch_size: number of samples per mini-batch (the last batch of an epoch
      may be smaller).
    init_mode: initialisation scheme forwarded to initializeWeightAndBias.
    activation_fun: hidden-layer activation forwarded to feedForward and
      backward_propogation.
    loss_function: "cross_entropy" or "mean_squared_error"; selects the loss
      that is reported.
    optimizer: "sgd" or "momentum".
    beta: momentum coefficient (only used by the "momentum" optimizer).

  Returns:
    (y_pred, W, bias): the class predicted for every training sample during
    the last epoch, followed by the trained weights and biases.

  Raises:
    ValueError: if loss_function or optimizer is not supported (previously an
      unknown optimizer silently skipped every update and an unknown loss
      silently reported 0).
  """
  if loss_function not in ("cross_entropy", "mean_squared_error"):
    raise ValueError("Unknown loss_function %r; expected 'cross_entropy' or 'mean_squared_error'" % (loss_function,))
  if optimizer not in ("sgd", "momentum"):
    raise ValueError("Unknown optimizer %r; expected 'sgd' or 'momentum'" % (optimizer,))

  W, bias = initializeWeightAndBias(layer_dims, init_mode)
  y_pred = []

  # One-hot targets, one column per training sample.
  y_one_hot = np.zeros((layer_dims[-1], num_images))
  y_one_hot[trainy[:num_images], np.arange(num_images)] = 1

  # Running momentum terms; None until the first mini-batch has been processed.
  previous_updates_W = None
  previous_updates_Bias = None

  for iterationNumber in range(epochs):
    loss = 0
    for i in range(0, num_images, batch_size):
      # Recomputed for every batch so a short final batch keeps all of its
      # samples and does not shrink the batches of the following epochs.
      batch_count = min(batch_size, num_images-i)
      batch_inputs = trainX[i:i+batch_count]
      batch_targets = y_one_hot[:, i:i+batch_count]

      hL, activation, preactivation = feedForward(W, bias, batch_inputs, num_hidden_layers, layer_dims, activation_fun)

      # NOTE: backward_propogation is not told which loss is in use, so
      # loss_function only changes the value reported below.
      if loss_function == "cross_entropy":
        loss += cross_entropy(batch_targets, hL)
      else:
        loss += mse(batch_targets, hL)

      # Normalise by the number of samples actually in this batch.
      gradientW, gradientBias = backward_propogation(batch_targets, batch_inputs, trainy[i:i+batch_count], W, bias, activation, preactivation, num_hidden_layers, batch_count, activation_fun)

      if iterationNumber == epochs-1:
        y_pred.extend(np.argmax(hL, axis=0))

      if optimizer == "sgd":
        W, bias = updateParam(W, gradientW, bias, gradientBias, learning_rate)
      else:  # "momentum"
        if previous_updates_W is None:
          previous_updates_W = gradientW
          previous_updates_Bias = gradientBias
        else:
          for idx in range(len(gradientW)):
            previous_updates_W[idx] = beta*previous_updates_W[idx] + gradientW[idx]
            previous_updates_Bias[idx] = beta*previous_updates_Bias[idx] + gradientBias[idx]
        W, bias = updateParam(W, previous_updates_W, bias, previous_updates_Bias, learning_rate)

    print("loss at iteration", (iterationNumber+1), "=", loss/(num_images))
  return y_pred, W, bias
    

In [191]:
# Network / training configuration for this run.
num_hidden_layers = 5
neurons_in_each_layer = 128
batch_size = 32
k = len(class_names)
num_images = len(trainX)
image_size = trainX.shape[1]

# Layer sizes: input width, then one randomly drawn width in [10, 100] per
# hidden layer (seeded so the architecture is repeatable), then the class count.
random.seed(3)
layer_dims = [image_size, *(random.randint(10,100) for _ in range(num_hidden_layers)), k]

pred_label, W, bias = stochastic_gradient_descent(
  num_hidden_layers,
  layer_dims,
  epochs = 10,
  learning_rate = 0.1,
  batch_size = batch_size,
  init_mode = "random uniform",
  activation_fun = "sigmoid",
  loss_function = "cross_entropy",
  optimizer = "momentum",
  beta = 0.8,
)

loss at iteration 1 = 1.2305062080793658
loss at iteration 2 = 0.602596792798347
loss at iteration 3 = 0.5115135463691484
loss at iteration 4 = 0.46887541739006594
loss at iteration 5 = 0.44180074324818813
loss at iteration 6 = 0.42221693113918535
loss at iteration 7 = 0.40690917231035767
loss at iteration 8 = 0.3943298859098584
loss at iteration 9 = 0.38363033232635024
loss at iteration 10 = 0.37430452013329585


In [192]:
# Count the training samples whose last-epoch prediction matches the true label.
cnt = sum(1 for i in range(len(pred_label)) if pred_label[i] == trainy[i])
print("Accuracy on train data", 100*cnt/len(pred_label))

Accuracy on train data 86.49833333333333


In [193]:
# Evaluate the trained network on the test set, one mini-batch at a time.
count = 0
for i in range(0, len(testX), batch_size):
  # The final batch can be shorter than batch_size; take exactly the samples
  # that remain so that no test image is left out of the count.
  batch_count = min(batch_size, len(testX)-i)
  # activation_fun must be the same activation the network was trained with.
  hL, activation, preactivation = feedForward(W, bias, testX[i:i+batch_count], num_hidden_layers, layer_dims, activation_fun = "sigmoid")
  count += int(np.sum(np.argmax(hL, axis=0) == testy[i:i+batch_count]))
print("Accuracy on test data", (100.0*count)/len(testX))

Accuracy on test data 84.73
