### **Import Libraries**

In [85]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split

In [86]:
# !pip install wandb

In [87]:
# import wandb

## **Downloading Data**

In [88]:
from keras.datasets import fashion_mnist

In [89]:
fashion_mnist

<module 'keras.datasets.fashion_mnist' from '/usr/local/lib/python3.8/dist-packages/keras/datasets/fashion_mnist.py'>

In [90]:
(trainX, trainY), (testX, testY) = fashion_mnist.load_data()


## **Data Loading and Inspection**

In [91]:
# fashion_mnist.load_data() in the previous cell already returned separate
# training and test splits, so the arrays are only inspected here instead of
# being loaded a second time.

# Print the dimensions of the dataset
print('Train: X = ', trainX.shape)
print('Test: X = ', testX.shape)


Train: X =  (60000, 28, 28)
Test: X =  (10000, 28, 28)


In [92]:
# Human-readable names for the ten labels; the list position is the label id
# used to index it (e.g. class_labels[trainY[i]]).
class_labels = [
    "T-shirt/top",
    "Trouser",
    "Pullover",
    "Dress",
    "Coat",
    "Sandal",
    "Shirt",
    "Sneaker",
    "Bag",
    "Ankle boot",
]

### **Q1**

---



In [93]:
# j=0 #j corresponds to the spot index in the subplot 
# classSet=set()
# Images=[]
# for i in range(100,150):#i corresponds to the img index 
#   if(j>9):
#     break
#   if(trainY[i] not in classSet):
#     classSet.add(trainY[i])
#     Images.append(wandb.Image(trainX[i], caption=class_labels[trainY[i]]))
#     plt.subplot(2,5,j+1);j+=1
#     plt.imshow(trainX[i], cmap="Greys")
#     plt.axis('off') # off the axis
#     plt.title('{}'.format(class_labels[trainY[i]]))

In [94]:
# wandb.init()

In [95]:
# wandb.log({"Examples for each class": Images})


### **Q2**

---


## **Data Preprocessing**

In [96]:
# Flatten every image into one row vector. Using -1 lets NumPy infer the
# feature count (28*28 = 784 for the shapes printed above) instead of
# hard-coding it.
trainX = trainX.reshape(trainX.shape[0], -1)
testX = testX.reshape(testX.shape[0], -1)

In [97]:
# Feature scaling: divide the pixel values by 255.0 so they lie in [0, 1]
# (assumes 8-bit pixel data).
# NOTE: this cell rebinds trainX/testX, so run it only once per kernel session.
trainX=trainX/255.0
testX=testX/255.0  # was testY/255.0, which replaced the test images with scaled labels

In [98]:
# Hold out 20% of the training data as a validation set (fixed seed so the
# split is the same on every run).
trainX, valX, trainY, valY = train_test_split(
    trainX,
    trainY,
    test_size=0.2,
    random_state=100,
)

## **Functions**

In [99]:
def g(a):        #sigmoid
    """Logistic sigmoid 1 / (1 + exp(-a)), evaluated element-wise.

    Only exp(-|a|) is ever computed, so the exponential cannot overflow for
    large-magnitude inputs (the naive form overflows in exp(-a) when a is
    very negative).

    Parameters
    ----------
    a : array_like
        Pre-activation values.

    Returns
    -------
    numpy.ndarray
        Values in [0, 1] with the same shape as ``a``.
    """
    a = np.asarray(a)
    decay = np.exp(-np.abs(a))  # always in (0, 1]
    # a >= 0: 1 / (1 + e^-a);  a < 0: e^a / (1 + e^a) -- same function, no overflow.
    return np.where(a >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

def o(a):#softmax
    """Softmax along axis 0 (each column is normalised to sum to 1).

    The per-column maximum is subtracted before exponentiating. This leaves
    the result mathematically unchanged but keeps exp() from overflowing,
    which would otherwise produce inf/inf = NaN for large logits.

    Parameters
    ----------
    a : array_like
        Logits; normalisation runs over the first axis.

    Returns
    -------
    numpy.ndarray
        Probabilities with the same shape as ``a``.
    """
    a = np.asarray(a)
    shifted = a - np.max(a, axis=0, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=0, keepdims=True)

def grad_sigmoid(a):
  """Derivative of the sigmoid at ``a``: s * (1 - s) with s = g(a)."""
  s = g(a)
  return s * (1 - s)

def Relu(x):
    """Rectified linear unit: element-wise maximum of 0 and ``x``."""
    floor = 0
    return np.maximum(floor, x)

def tanh(x):
    """Hyperbolic tangent activation, applied element-wise via NumPy."""
    activated = np.tanh(x)
    return activated

def grad_Relu(x):
    """Derivative of ReLU: 1 where ``x`` is strictly positive, 0 elsewhere."""
    is_positive = x > 0
    return 1 * is_positive

def grad_tanh(x):
    """Derivative of tanh at ``x``: 1 - tanh(x)**2."""
    t = np.tanh(x)
    return (1 - (t**2))


### **Q3**



In [100]:
# Number of rows (samples) left in the training set after the validation split.
numSamples = len(trainX)
numSamples

48000

### **Model**

In [110]:
class model():
  """Fully connected classifier: sigmoid hidden layers followed by a softmax output.

  Layer sizes are fixed in ``__init__``: 784 inputs, ``numHiddenLayers`` hidden
  layers of ``numNeurons`` units each, and ``numClasses`` outputs.  Parameters
  are kept as 1-D object arrays holding one matrix per layer so that the
  optimiser updates can be written as a single array expression.

  The class relies on the notebook-level helpers ``g`` (sigmoid), ``o``
  (softmax) and ``grad_sigmoid``.
  """

  def __init__(self):
    self.numLayers=4 # number of weight layers: 3 hidden layers + the output layer
    self.numHiddenLayers=self.numLayers-1
    self.numNeurons=32
    self.numClasses=10
    self.grad_w=[]
    self.grad_b=[]
    self.y_pred=[]
    # Optimiser state: u_* hold momentum / squared-gradient averages,
    # m_* hold the first-moment averages used by Adam and Nadam.
    self.u_w=0
    self.u_b=0
    self.m_w=0
    self.m_b=0
    # Look-ahead parameters used by the NAG helpers.
    self.W_L=[]
    self.b_L=[]
    # Filled in by feed_forward().
    self.activation=None
    self.preActivation=None
    # Learning rate remembered by train() so pre_update_nag() needs no global.
    self.neta=None

  @staticmethod
  def _as_object_array(arrays):
    """Pack per-layer arrays of differing shapes into a 1-D object array.

    ``np.array`` on such a ragged list raises on recent NumPy releases, and
    can mis-broadcast when the leading dimensions happen to match, so the
    container is filled element by element instead.
    """
    packed = np.empty(len(arrays), dtype=object)
    for idx, layer in enumerate(arrays):
      packed[idx] = layer
    return packed

  def initialize(self):
    """Draw fresh uniform(-1, 1) weights, zero the biases and reset optimiser state."""
    weights = [np.random.uniform(-1,1,(784,self.numNeurons))]
    # Hidden layer 2 .. last hidden layer (the first one is created just above).
    for _ in range(2, self.numHiddenLayers+1):
      weights.append(np.random.uniform(-1,1,(self.numNeurons,self.numNeurons)))
    weights.append(np.random.uniform(-1,1,(self.numNeurons,self.numClasses)))
    self.W = self._as_object_array(weights)

    biases = [np.zeros((self.numNeurons,1)) for _ in range(1, self.numLayers)]
    biases.append(np.zeros((self.numClasses,1)))
    self.b = self._as_object_array(biases)

    # Stale moment estimates must not leak into a freshly initialised network.
    self.u_w = self.u_b = 0
    self.m_w = self.m_b = 0

  def back_propagation(self,Y,batch_size):
    """Compute batch-averaged gradients for the most recent ``feed_forward`` call.

    Parameters
    ----------
    Y : array of integer class labels for the batch that was fed forward.
    batch_size : number of samples the gradients are averaged over.

    Results are stored in ``self.grad_w`` (one matrix per layer, laid out as
    outputs x inputs, i.e. the transpose of ``self.W[k]``) and ``self.grad_b``.
    """
    last = self.numLayers-1
    grad_a=[None]*(self.numLayers)
    grad_b=[None]*(self.numLayers)
    grad_h=[None]*(self.numLayers)
    grad_w=[None]*(self.numLayers)
    oneHot_y=self.compute_oneHot_y(Y)

    # Gradient of the cross-entropy loss w.r.t. the softmax pre-activation.
    grad_a[last]=self.y_pred-oneHot_y.T
    h=self.activation
    a=self.preActivation
    W=self.W
    for k in range(last,-1,-1): # walk the layers in reverse
      # Both gradients are averaged over the batch so they share one scale.
      grad_w[k]=np.matmul(grad_a[k], h[k].T)/batch_size
      grad_b[k]=np.sum(grad_a[k],axis=1,keepdims=True)/batch_size

      if(k>0):
        # Push the error back through W[k], then through the sigmoid of layer k-1.
        grad_h[k]=np.matmul(W[k],grad_a[k])
        grad_a[k-1]=grad_h[k]*grad_sigmoid(a[k-1])
    self.grad_b,self.grad_w=grad_b,grad_w

  def feed_forward(self,X):
    """Run a forward pass on ``X`` (one sample per row).

    Stores the per-layer inputs in ``self.activation``, the pre-activations in
    ``self.preActivation`` and the softmax output (classes x samples) in
    ``self.y_pred``.
    """
    last = self.numLayers-1
    a=[None]*(self.numLayers)
    h=[None]*(self.numLayers)
    h[0]=X.T # columns are samples from here on
    for k in range(0, last): # hidden layers (0-based indexing)
      a[k]=self.b[k]+np.matmul(self.W[k].T,h[k])
      h[k+1]=g(a[k])
    a[last]=self.b[last]+np.matmul(self.W[last].T,h[last])
    self.activation,self.preActivation=h,a
    self.y_pred=o(a[last]) # softmax

  def train(self,trainX,trainY,batch_size,epochs,beta,beta_1,beta_2,neta,t):
    """Train with mini-batch Nadam and print the full-set loss after every epoch.

    Parameters
    ----------
    trainX, trainY : training samples (one per row) and their integer labels.
    batch_size : number of samples per mini-batch.
    epochs : number of passes over the training set.
    beta : decay used by the momentum / NAG / RMSProp updates (unused by Nadam).
    beta_1, beta_2 : first- and second-moment decay rates for Adam / Nadam.
    neta : learning rate.
    t : starting time step for the bias correction; incremented once per batch.
    """
    self.initialize()
    self.neta=neta

    for epoch in range(0,epochs):
      for start in range(0, trainX.shape[0],batch_size):
        batch_x=trainX[start:start+batch_size]
        batch_y=trainY[start:start+batch_size]

        self.feed_forward(batch_x)
        # Average over the real batch length so a short final batch is scaled correctly.
        self.back_propagation(batch_y,batch_x.shape[0])

        # back_propagation stores weight gradients as (outputs x inputs);
        # transpose them to the (inputs x outputs) layout of self.W.
        Grad_w=self._as_object_array([layer_grad.T for layer_grad in self.grad_w])
        Grad_b=self._as_object_array(self.grad_b)

        # To try another optimiser, replace the call below with update_sgd,
        # update_mom, update_rmsProp or update_adam.
        self.update_nadam(neta,beta_1,beta_2,Grad_w,Grad_b,t)
        t+=1

      # The loss is computed over the entire training set.
      self.feed_forward(trainX)
      print(self.compute_loss(self.y_pred,self.compute_oneHot_y(trainY),trainX.shape[0]))

  def test(self,valX,valY,beta,neta):
    """Print the accuracy (in percent) on ``valX``/``valY``.

    ``beta`` and ``neta`` are accepted for interface compatibility but unused:
    evaluation only needs a forward pass.
    """
    self.feed_forward(valX)
    print("Accuracy: ")
    print(self.get_accuracy(valY,self.y_pred.T)*100)

  def update_sgd(self,neta,Grad_w,Grad_b):
    """Plain gradient-descent step with learning rate ``neta``."""
    self.W=self.W-neta*Grad_w
    self.b=self.b-neta*Grad_b

  def update_mom(self,neta,beta,Grad_w,Grad_b):
    """Momentum step: move along an exponential moving average of the gradients."""
    self.u_w=beta*self.u_w+(1-beta)*Grad_w
    self.u_b=beta*self.u_b+(1-beta)*Grad_b
    self.W=self.W-neta*self.u_w
    self.b=self.b-neta*self.u_b

  def update_nag(self,neta,beta,Grad_w,Grad_b,W_copy,b_copy):
    """NAG step: restore the saved parameters, then apply a momentum update to them."""
    self.W=W_copy
    self.b=b_copy
    self.u_w=beta*self.u_w+(1-beta)*Grad_w
    self.u_b=beta*self.u_b+(1-beta)*Grad_b
    self.W=self.W-neta*self.u_w
    self.b=self.b-neta*self.u_b

  def init_nag(self):
    """Allocate zero-filled look-ahead parameters with the same layout as W and b."""
    W_L = [np.zeros([784,self.numNeurons])]
    for _ in range(2, self.numHiddenLayers+1):
      W_L.append(np.zeros([self.numNeurons,self.numNeurons]))
    W_L.append(np.zeros([self.numNeurons,self.numClasses]))

    b_L = [np.zeros((self.numNeurons,1)) for _ in range(1, self.numLayers)]
    b_L.append(np.zeros((self.numClasses,1)))

    self.W_L=self._as_object_array(W_L)
    self.b_L=self._as_object_array(b_L)

  def pre_update_nag(self,neta=None):
    """Move the parameters to the NAG look-ahead point.

    ``neta`` defaults to the learning rate recorded by ``train``.  Raises
    ``ValueError`` when no learning rate is available.
    """
    step = self.neta if neta is None else neta
    if step is None:
      raise ValueError("pre_update_nag needs a learning rate: pass neta or call train() first")
    self.W_L=self.W-step*self.u_w
    self.b_L=self.b-step*self.u_b
    self.W=self.W_L
    self.b=self.b_L

  def update_rmsProp(self,neta,beta,Grad_w,Grad_b):
    """RMSProp step: scale each gradient by a running RMS of its past values."""
    self.u_w=beta*self.u_w+(1-beta)*Grad_w*Grad_w
    self.u_b=beta*self.u_b+(1-beta)*Grad_b*Grad_b
    self.W=self.W-(neta/(self.u_w**0.5+10**-8))*Grad_w
    self.b=self.b-(neta/(self.u_b**0.5+10**-8))*Grad_b

  def update_adam(self,neta,beta_1,beta_2,Grad_w,Grad_b,t):
    """Adam step with bias-corrected first and second moments (``t`` starts at 1)."""
    self.m_w=beta_1*self.m_w+(1-beta_1)*Grad_w
    self.m_b=beta_1*self.m_b+(1-beta_1)*Grad_b

    self.u_w=beta_2*self.u_w+(1-beta_2)*Grad_w*Grad_w
    self.u_b=beta_2*self.u_b+(1-beta_2)*Grad_b*Grad_b

    m_w_hat=self.m_w/(1-beta_1**t)
    m_b_hat=self.m_b/(1-beta_1**t)
    # Without this correction the early steps are inflated by ~1/sqrt(1-beta_2).
    u_w_hat=self.u_w/(1-beta_2**t)
    u_b_hat=self.u_b/(1-beta_2**t)

    self.W=self.W-(neta/(u_w_hat**0.5+10**-8))*m_w_hat
    self.b=self.b-(neta/(u_b_hat**0.5+10**-8))*m_b_hat

  def update_nadam(self,neta,beta_1,beta_2,Grad_w,Grad_b,t):
    """Nadam step: Adam with a Nesterov-style look-ahead on the first moment."""
    self.m_w=beta_1*self.m_w+(1-beta_1)*Grad_w
    self.m_b=beta_1*self.m_b+(1-beta_1)*Grad_b

    self.u_w=beta_2*self.u_w+(1-beta_2)*Grad_w*Grad_w
    self.u_b=beta_2*self.u_b+(1-beta_2)*Grad_b*Grad_b

    m_w_hat=self.m_w/(1-beta_1**t)
    m_b_hat=self.m_b/(1-beta_1**t)
    # Bias-correct the second moment as well (see update_adam).
    u_w_hat=self.u_w/(1-beta_2**t)
    u_b_hat=self.u_b/(1-beta_2**t)

    self.W=self.W-(neta/(u_w_hat**0.5+10**-8))*(beta_1*m_w_hat+((1-beta_1)*Grad_w)/(1-beta_1**t))
    self.b=self.b-(neta/(u_b_hat**0.5+10**-8))*(beta_1*m_b_hat+((1-beta_1)*Grad_b)/(1-beta_1**t))

  def compute_oneHot_y(self,Y):
    """Return a (len(Y), numClasses) float array with a 1 in each row's label column."""
    labels=np.asarray(Y,dtype=int).reshape(-1)
    oneHot_y=np.zeros((labels.shape[0],self.numClasses))
    oneHot_y[np.arange(labels.shape[0]),labels]=1
    return oneHot_y

  def compute_loss(self,y_pred,oneHot_y,numImages):
    """Mean cross-entropy between ``y_pred`` (classes x samples) and one-hot targets.

    Probabilities are clipped away from 0 before the log so that an
    underflowed softmax output cannot turn the loss into NaN (0 * -inf).
    """
    safe_pred=np.clip(y_pred,1e-12,None)
    return (-1.0 * np.sum(np.multiply(oneHot_y.T, np.log(safe_pred)))/numImages)

  def get_accuracy(self,Y_true,Y_pred):
    """Fraction of rows of ``Y_pred`` whose arg-max equals the label in ``Y_true``.

    Returns 0.0 for an empty label array.
    """
    total=Y_true.shape[0]
    if total==0:
      return 0.0
    labels=np.ravel(Y_true)
    predicted=np.argmax(np.asarray(Y_pred)[:total],axis=1)
    return float(np.sum(predicted==labels))/total

  

In [None]:
# Seed NumPy so the random weight initialisation, and therefore the whole
# training run, is repeatable.
np.random.seed(100)

obj = model()

# Hyper-parameters
epochs=30
neta=0.01      # learning rate
beta=0.999     # decay for the momentum / RMSProp updates
beta_1=0.9     # first-moment decay (Adam / Nadam)
beta_2=0.999   # second-moment decay (Adam / Nadam)
t = 1 # initialize timestep for Adam optimizer


Samples=numSamples
batch_size=32
obj.train(trainX,trainY,batch_size,epochs,beta,beta_1,beta_2,neta,t)
# test() only runs a forward pass; beta and neta are accepted but not used there.
obj.test(valX,valY,beta,neta)