In [None]:
!pip install wandb



In [None]:
from tensorflow import keras
from keras.datasets import fashion_mnist
from keras.datasets import mnist
import numpy as np
from matplotlib import pyplot as plt
import random
import wandb
import argparse
from datetime import datetime
import copy

class FeedForward:

    def __init__(self):
        """Set the default hyper-parameters, load the data and create empty model state.

        The training images are scaled by 1/255, shuffled, and the first tenth
        of the shuffled set is held out as a validation split. Weight, bias,
        history and momentum containers are created empty; they are filled by
        the dedicated initializer methods.
        """
        # default hyper-parameters; the command-line override
        # (self.update_parameters()) is deliberately not invoked here
        self.parameters = {
            "wandb_project": "DL Assignment 1",
            "wandb_entity": "cs22m019",
            "dataset": "fashion_mnist",
            "epochs": 5,
            "batch_size": 32,
            "loss": "cross_entropy",
            "optimizer": "gd",
            "learning_rate": 0.1,
            "momentum": 0.01,
            "beta": 0.5,
            "beta1": 0.5,
            "beta2": 0.5,
            "epsilon": 0.000001,
            "weight_decay": 0.0,
            "weight_init": "random",
            "num_layers": 3,
            "hidden_size": 128,
            "activation": "sigmoid",
            "output_function": "softmax"
        }
        cfg = self.parameters

        # any dataset name other than "fashion_mnist" falls back to mnist
        source = fashion_mnist if cfg["dataset"] == "fashion_mnist" else mnist
        (train_images, train_labels), (test_images, test_labels) = source.load_data()

        # scale pixel values by 1/255
        train_images = train_images / 255
        self.x_test = test_images / 255
        self.y_test = test_labels
        self.test_n_samples = self.x_test.shape[0]

        # shuffle the training data, then split it: first 10% -> validation,
        # remaining 90% -> training
        full_size = train_images.shape[0]
        order = np.random.permutation(full_size)
        train_images = train_images[order]
        train_labels = train_labels[order]

        holdout = full_size // 10
        self.x_validate = train_images[:holdout]
        self.y_validate = train_labels[:holdout]
        self.x_train = train_images[holdout:]
        self.y_train = train_labels[holdout:]
        self.train_n_samples = self.x_train.shape[0]

        # human-readable label titles, indexed by class id
        self.title = ["T-shirt/top", "Trouser", "PullOver", "Dress",
                      "Coat", "Sandal", "Shirt", "Sneaker", "Bag", "Ankle Boot"]
        self.no_of_label = len(self.title)

        # training configuration copied out of the parameter dictionary
        self.epoch = cfg["epochs"]
        self.batch_size = cfg["batch_size"]
        self.lossFunction = cfg["loss"]
        self.optimizer = cfg["optimizer"]
        self.learningRate = cfg["learning_rate"]
        self.weightInitialization = cfg["weight_init"]
        self.activationFunction = cfg["activation"]
        self.outputFunction = cfg["output_function"]
        self.weight_decay = cfg["weight_decay"]

        # network dimensions: hl hidden layers of nnl neurons each, plus one
        # output layer, giving L layers in total
        self.hl = cfg["num_layers"]
        self.L = cfg["num_layers"] + 1
        self.nnl = cfg["hidden_size"]
        self.k = len(self.title)
        self.d = self.x_train.shape[1] * self.x_train.shape[2]
        self.n = self.train_n_samples

        # per-layer containers, keyed "w<i>" / "b<i>" / "a<i>" / "h<i>"
        for container in ("weights", "bias", "wHistory", "bHistory",
                          "wMomentum", "bMomentum", "prev_wHistory",
                          "prev_bHistory", "pre_activation", "post_activation"):
            setattr(self, container, {})
        
    # updates the default parameters with the parameters given on the command line
    def update_parameters(self):

        parser = argparse.ArgumentParser(description='DL Assignment 1 Parser')

        parser.add_argument('-wp', '--wandb_project',
                            type=str, metavar='', help='wandb project')
        parser.add_argument('-we', '--wandb_entity', type=str,
                            metavar='', help='wandb entity')
        parser.add_argument('-d', '--dataset', type=str,
                            metavar='', help='dataset')
        parser.add_argument('-e', '--epochs', type=int,
                            metavar='', help='epochs')
        parser.add_argument('-b', '--batch_size', type=int,
                            metavar='', help='batch size')
        parser.add_argument('-l', '--loss', type=str, 
                            metavar='', help='loss')
        parser.add_argument('-o', '--optimizer', type=str,
                            metavar='', help='optimizer')
        parser.add_argument('-lr', '--learning_rate',
                            type=float, metavar='', help='learning rate')
        parser.add_argument('-m', '--momentum', type=float,
                            metavar='', help='momentum')
        parser.add_argument('-beta', '--beta', type=float,
                            metavar='', help='beta')
        parser.add_argument('-beta1', '--beta1', type=float,
                            metavar='', help='beta1')
        parser.add_argument('-beta2', '--beta2', type=float,
                            metavar='', help='beta2')
        parser.add_argument('-eps', '--epsilon', type=float,
                            metavar='', help='epsilon')
        parser.add_argument('-w_d', '--weight_decay',
                            type=float, metavar='', help='weight decay')
        parser.add_argument('-w_i', '--weight_init', type=str,
                            metavar='', help='weight init')
        parser.add_argument('-nhl', '--num_layers', type=int,
                            metavar='', help='num layers')
        parser.add_argument('-sz', '--hidden_size', type=int,
                            metavar='', help='hidden size')
        parser.add_argument('-a', '--activation', type=str,
                            metavar='', help='activation')
        parser.add_argument('-of', '--output_function',
                            type=str, metavar='', help='output function')
        args = parser.parse_args()

        if (args.wandb_project != None):
            self.parameters["wandb_project"] = args.wandb_project
        if (args.wandb_entity != None):
            self.parameters["wandb_entity"] = args.wandb_entity
        if (args.dataset != None):
            self.parameters["dataset"] = args.dataset
        if (args.epochs != None):
            self.parameters["epochs"] = args.epochs
        if (args.batch_size != None):
            self.parameters["batch_size"] = args.batch_size
        if (args.loss != None):
            self.parameters["loss"] = args.loss
        if (args.optimizer != None):
            self.parameters["optimizer"] = args.optimizer
        if (args.learning_rate != None):
            self.parameters["learning_rate"] = args.learning_rate
        if (args.momentum != None):
            self.parameters["momentum"] = args.momentum
        if (args.beta != None):
            self.parameters["beta"] = args.beta
        if (args.beta1 != None):
            self.parameters["beta1"] = args.beta1
        if (args.beta2 != None):
            self.parameters["beta2"] = args.beta2
        if (args.epsilon != None):
            self.parameters["epsilon"] = args.epsilon
        if (args.weight_decay != None):
            self.parameters["weight_decay"] = args.weight_decay
        if (args.weight_init != None):
            self.parameters["weight_init"] = args.weight_init
        if (args.num_layers != None):
            self.parameters["num_layers"] = args.num_layers
        if (args.hidden_size != None):
            self.parameters["hidden_size"] = args.hidden_size
        if (args.activation != None):
            self.parameters["activation"] = args.activation

    # function to initialize weights and bias based on type -> random or Xavier initialization
    def weightsAndBiasInitializer(self):
        if self.weightInitialization == "Xavier":

            # first and last matrix 
            self.weights["w1"] = np.random.uniform(-np.sqrt(6 / (self.nnl + self.d)), np.sqrt(6 / (self.nnl + self.d)), (self.nnl, self.d))
            self.weights["w" + str(self.L)] = np.random.uniform(-np.sqrt(6 / (self.k + self.nnl)),np.sqrt(6 / (self.k + self.nnl)), (self.k, self.nnl))

            # Intermediate Matrices
            for i in range(2, self.L):
                self.weights["w" + str(i)] = np.random.uniform(-np.sqrt(6 / (self.nnl + self.nnl)), np.sqrt(6 / (self.nnl + self.nnl)), (self.nnl, self.nnl))

            # Last Vector
            self.bias["b" + str(self.L)] = np.random.uniform(-np.sqrt(6 / (self.k + 1)),np.sqrt(6 / (self.k + 1)), (self.k))

            for i in range(1, self.L):
                self.bias["b" + str(i)] = np.random.uniform(-np.sqrt(6 / (self.nnl + 1)),np.sqrt(6 / (self.nnl + 1)), (self.nnl))

        if self.weightInitialization == "random":
            # initailzation of weights
            '''
                  W1 = (d,nnl)
                  W2,..,W(L - 1) = (nnl,nnl)
                  WL = (k,nnl)
            '''
            w1 = np.random.normal(0, 0.5, size=(self.nnl, self.d))
            self.weights["w1"] = w1
            for i in range(2, self.L):
                self.weights["w" + str(i)] = np.random.normal(0,0.5, size=(self.nnl, self.nnl))
            self.weights["w" + str(self.L)] = np.random.normal(0,0.5, size=(self.k, self.nnl))

            # initialization of bias
            for i in range(1, self.L):
                self.bias["b" + str(i)] = np.random.normal(0,0.5, size=(self.nnl))
            self.bias["b" + str(self.L)] = np.random.normal(0,0.5, size=(self.k))

    # function to initialize momentum for weights and bias
    def momentumInitializer(self):

        # initializing momentum for weights
        w1 = np.zeros((self.nnl, self.d))
        self.wMomentum["w1"] = w1
        for i in range(2, self.L):
            self.wMomentum["w" + str(i)] = np.zeros((self.nnl, self.nnl))
        self.wMomentum["w" + str(self.L)] = np.zeros((self.k, self.nnl))

        # initializing momentum for bais
        for i in range(1, self.L):
            self.bMomentum["b" + str(i)] = np.zeros((self.nnl))
        self.bMomentum["b" + str(self.L)] = np.zeros((self.k))

    # function to initialize history for weights and bias
    def historyInitializer(self):

        # initializing history for weights
        w1 = np.zeros((self.nnl, self.d))
        self.wHistory["w1"] = w1
        for i in range(2, self.L):
            self.wHistory["w" + str(i)] = np.zeros((self.nnl, self.nnl))
        self.wHistory["w" + str(self.L)] = np.zeros((self.k, self.nnl))

        # initializing history for bais
        for i in range(1, self.L):
            self.bHistory["b" + str(i)] = np.zeros((self.nnl))
        self.bHistory["b" + str(self.L)] = np.zeros((self.k))

    # function used to implement different activation functions
    def activation_func(self, vector):
        if self.activationFunction == "sigmoid":
            return 1.0 / (1 + np.exp(-(vector)))
        
        if self.activationFunction == "tanh":
            return np.tanh(vector)
        
        if self.activationFunction == "ReLU":
            return np.maximum(0,vector)

    # function used to implement different output functions
    def output_func(self, vector):
        if self.outputFunction == "softmax":

            vector = vector - vector[np.argmax(vector)]

            return np.exp(vector) / np.sum(np.exp(vector))

    # function generating one-hot vector
    def oneHotVector(self, size, index):
        """Return a zero array of length ``size`` with a 1.0 at position ``index``."""
        encoded = np.zeros(size)
        encoded[index] = 1.0
        return encoded

    # function returning the differentiation of activation function
    def differentiation(self, vector):

        if self.activationFunction == "sigmoid":
            return (1.0 / (1 + np.exp(-(vector)))) * (1 - 1.0 / (1 + np.exp(-(vector))))

        if self.activationFunction == "tanh":
            return 1 - (np.tanh(vector)) ** 2

        if self.activationFunction == "ReLU":
            t = np.maximum(0,vector)
            t[t > 0] = 1
            return t

    # regularization
    def regularize(self):
        reg_term = 0
        validation_size = self.y_validate.shape[0]
        
        for (key,value) in self.weights.items():
          reg_term += (np.sum(self.weights[key] ** 2))
        reg_term = (self.weight_decay / (2 * validation_size)) * reg_term

        return reg_term

    # function returning the loss function value
    def loss_function(self, y_predicted, index):

        if self.lossFunction == "cross_entropy":
            t = 1e-8
            return (-1)*np.log(y_predicted[index] + t)

        if self.lossFunction == "mean_squared_error":
            y = self.oneHotVector(size=self.no_of_label, index=index)
            return np.sum((y_predicted - y) ** 2)

    # forward propagation - computes pre_activation vector,post_activation vector for each layer and predicts y at last layer
    def forward_propagation(self, input, index):

        # Populating pre_activation and post_activation vectors to dictionary in each layer for input[index]
        for k in range(1, self.L):

            # for first layer,post activation will be input
            if (k == 1):
                ''' flattening the input: 
                    -input(60000,28,28)
                    -input[index] size = (28,28)
                    -flattening input[index] gives size (784,1) = (d,1) where d is dimension of input
                    post_activation[h0] size = (d,1)
                    bias[b1] size = (nnl,1)
                    weights[w1] size = (nnl,d)
                    Therefore we get pre_activation[a1] size = (nnl,1) for all layer except last layer
                '''
                self.post_activation["h" + str(k - 1)] = input[index].flatten()

            # computing a(k) = b(k) + w(k)*h(k - 1) for each input[index]
            self.pre_activation["a" + str(k)] = self.bias["b" + str(k)] + np.dot(self.weights["w" + str(k)], self.post_activation["h" + str(k - 1)])
           
            # computing h(k) = g(a(k)) where g is activation function
            self.post_activation["h" + str(k)] = self.activation_func(self.pre_activation["a" + str(k)])

        # computing pre_activation for last layer
        self.pre_activation["a" + str(self.L)] = self.bias["b" + str(self.L)] + np.dot(self.weights["w" + str(self.L)], self.post_activation["h" + str(self.L - 1)])

        # prediction y (y_hat) = O(a(L)) where O is output function
        # self.pre_activation["a" + str(self.L)] = self.pre_activation["a" + str(self.L)] / np.linalg.norm(self.pre_activation["a" + str(self.L)])
        self.post_activation["h" +str(self.L)] = self.output_func(self.pre_activation["a" + str(self.L)])

    # performs back propagation and returns gradients of weights and bias
    def backward_propagation(self, index, actual_y):

        grad_pre_activation = {}
        grad_post_activation = {}
        grad_weights = {}
        grad_bias = {}

        predicted_y = self.post_activation["h" + str(self.L)]

        # Computing output gradient
        one_hot_vector = self.oneHotVector(self.no_of_label, actual_y[index])
        if self.lossFunction == "cross_entropy" :
          grad_pre_activation["a" + str(self.L)] = (predicted_y - one_hot_vector)
        else :
          grad_pre_activation["a" + str(self.L)] = -2 * (one_hot_vector - predicted_y) * (predicted_y * (np.ones(self.no_of_label) - predicted_y))
       
        
        k = self.L
        while k > 0:

            # Computing gradient w.r.t parameters - weight and bais
            '''
              np.reshape(grad_pre_activation["a" + str(L)],(-1,1)) = (k,1)
              np.reshape(post_activation["h" + str(L - 1)],(1,-1)) = (1,nnl)
            '''
            grad_weights["w" + str(k)] = np.dot(np.reshape(grad_pre_activation["a" + str(k)], (-1, 1)), np.reshape(self.post_activation["h" + str(k - 1)], (1, -1)))
            grad_bias["b" + str(k)] = grad_pre_activation["a" + str(k)]

            if k != 1:
                # Computing gradient w.r.t layer below (post_activation)
                grad_post_activation["h" + str(k - 1)] = np.dot(self.weights["w" + str(k)].T, np.reshape(grad_pre_activation["a" + str(k)], (-1, 1))).flatten()

                # Computing gradient w.r.t layer below (pre_activation)
                g_dash = self.differentiation(self.pre_activation["a" + str(k - 1)])
                grad_pre_activation["a" +str(k - 1)] = grad_post_activation["h" + str(k - 1)] * g_dash

            k = k - 1
        return grad_weights, grad_bias

    # function returning zero-initialized accumulated gradients
    def make_accumalate_zero(self):

        acc_grad_weights = {}
        acc_grad_bias = {}

        # accumalated weights are set to zero
        acc_grad_weights["w1"] = np.zeros((self.nnl, self.d))
        for i in range(2, self.L):
            acc_grad_weights["w" + str(i)] = np.zeros((self.nnl, self.nnl))
        acc_grad_weights["w" + str(self.L)] = np.zeros((self.k, self.nnl))

        # accumalated bias are set to zero
        for i in range(1, self.L):
            acc_grad_bias["b" + str(i)] = np.zeros((self.nnl))
        acc_grad_bias["b" + str(self.L)] = np.zeros((self.k))

        return acc_grad_weights, acc_grad_bias

    # runs stochastic gradient descent for one epoch
    def oneEpochSGD(self, epoch):
        ''' Executes A Single Epoch for Stochastic Gradient Descent Algorithm.
            Returns the training loss,training accuracy,validaiton loss and validation accuracy,averaged over all points. '''
        # print("in sgd")
        n = self.train_n_samples
        
        # randomizing batches
        idx = np.random.permutation(self.train_n_samples)
        self.x_train = self.x_train[idx]
        self.y_train = self.y_train[idx]

        input = self.x_train
        actual_y = self.y_train

        # total Loss for epoch
        loss_input = 0
        count = 0

        # execute one epoch for all datapoints in train set
        for index in range(n):

            # perform forward propagation
            self.forward_propagation(input, index)
            predicted_y = self.post_activation["h" + str(self.L)]

            # compute loss
            loss_input += self.loss_function(predicted_y, actual_y[index])

            # perform backward propagation
            grad_weights, grad_bias = self.backward_propagation(index, actual_y)

            # compute the number of datapoints which are correctly classified
            indexWithMaxProb = np.argmax(predicted_y)
            if (actual_y[index] == (indexWithMaxProb)):
                count = count + 1

            # update weights and bias if the number of datapoints in batch_size are divisble by batch_size
            if ((index + 1) % self.batch_size == 0):
                # update weights
                for (key, value) in self.weights.items():
                    self.weights[key] = self.weights[key] - ((self.learningRate / self.batch_size) * grad_weights[key])

                # update bias
                for (key, value) in self.bias.items():
                    self.bias[key] = self.bias[key] - ((self.learningRate / self.batch_size) * grad_bias[key])

        # if the number of datapoints in batch is not divisible by batch_size update weights and bias 
        if n % self.batch_size != 0:
            # update weights
            for (key, value) in self.weights.items():
                self.weights[key] = self.weights[key] - ((self.learningRate / self.batch_size) * grad_weights[key])

            # update bias
            for (key, value) in self.bias.items():
                self.bias[key] = self.bias[key] - ((self.learningRate / self.batch_size) * grad_bias[key])

        # compute trainAccuracy,trainLoss averaged over train size
        trainAccuracy = count / n
        trainLoss = loss_input / n + self.regularize()

        # compute validationAccuracy,validationLoss avergaed over test size 
        # print("in test")
        validationLoss, validationAccuracy = self.computeTestLossAndAccuracy()
        
        return trainLoss, trainAccuracy, validationLoss, validationAccuracy

    # runs momentum gradient descent for one epoch
    def oneEpochMOMENTUM(self, epoch):
        ''' Executes A Single Epoch for Momentum Gradient Descent Algorithm.
            Returns the training loss,training accuracy,validaiton loss and validation accuracy,averaged over all points. '''

        n = self.train_n_samples
        
        # randomizing batches
        idx = np.random.permutation(self.train_n_samples)
        self.x_train = self.x_train[idx]
        self.y_train = self.y_train[idx]

        input = self.x_train
        actual_y = self.y_train

        # maintaining previous history for weights and bias
        self.prev_wHistory, self.prev_bHistory = self.wHistory, self.bHistory

        # Total Loss for epoch
        loss_input = 0
        count = 0  
        beta = self.parameters["momentum"]

        # set accumalated gradients to zero
        acc_grad_weights, acc_grad_bias = self.make_accumalate_zero()

        # execute one epoch for all datapoints in train set
        for index in range(n):

            # perform forward propagation
            self.forward_propagation(input, index)
            predicted_y = self.post_activation["h" + str(self.L)]

            # compute loss
            loss_input += self.loss_function(predicted_y, actual_y[index])

            # perform backward propagation
            grad_weights, grad_bias = self.backward_propagation(
                index, actual_y)

            # compute the number of datapoints which are correctly classified
            indexWithMaxProb = np.argmax(predicted_y)
            if (actual_y[index] == (indexWithMaxProb)):
                count = count + 1

            # accumulate grad_weights and grad_bais for each input
            for (key, value) in grad_weights.items():
                acc_grad_weights[key] = acc_grad_weights[key] + grad_weights[key]

            for (key, value) in grad_bias.items():
                acc_grad_bias[key] = acc_grad_bias[key] + grad_bias[key]


            # update weights and bias if the number of datapoints in batch_size are divisble by batch_size
            if ((index + 1) % self.batch_size == 0):

                # update weight history
                for (key, value) in self.wHistory.items():
                    self.wHistory[key] = beta * self.prev_wHistory[key] + \
                        ((self.learningRate / self.batch_size) * acc_grad_weights[key])

                # update bias history
                for (key, value) in self.bHistory.items():
                    self.bHistory[key] = beta * self.prev_bHistory[key] + \
                        ((self.learningRate / self.batch_size) * acc_grad_bias[key])

                # update weights
                for (key, value) in self.weights.items():
                    self.weights[key] = self.weights[key] - self.wHistory[key]

                # update bias
                for (key, value) in self.bias.items():
                    self.bias[key] = self.bias[key] - self.bHistory[key]

                # updating histroy for weights and bias
                self.prev_wHistory = self.wHistory
                self.prev_bHistory = self.bHistory

                # set accumalated gradients to zero
                acc_grad_weights, acc_grad_bias = self.make_accumalate_zero()

            # if the number of datapoints in batch is not divisible by batch_size update weights and bias 
        if n % self.batch_size != 0:
            
            # update weight history
            for (key, value) in self.wHistory.items():
                self.wHistory[key] = beta * self.prev_wHistory[key] + \
                    ((self.learningRate / self.batch_size) * acc_grad_weights[key])

            # update bias history
            for (key, value) in self.bHistory.items():
                self.bHistory[key] = beta * self.prev_bHistory[key] + \
                    ((self.learningRate / self.batch_size) * acc_grad_bias[key])

            # update weights
            for (key, value) in self.weights.items():
                self.weights[key] = self.weights[key] - self.wHistory[key]

            # update bias
            for (key, value) in self.bias.items():
                self.bias[key] = self.bias[key] - self.bHistory[key]

            # updating histroy for weights and bias
            self.prev_wHistory = self.wHistory
            self.prev_bHistory = self.bHistory

            # set accumalated gradients to zero
            acc_grad_weights, acc_grad_bias = self.make_accumalate_zero()

        # compute trainAccuracy,trainLoss averaged over train size
        trainAccuracy = count / n
        trainLoss = loss_input / n + self.regularize()

        # compute validationAccuracy,validationLoss averaged over test size 
        validationLoss, validationAccuracy = self.computeTestLossAndAccuracy()
        
        return trainLoss, trainAccuracy, validationLoss, validationAccuracy

    # runs Nesterov accelerated gradient descent for one epoch
    def oneEpochNAG(self, epoch):
        ''' Executes A Single Epoch for Nesterov Accelerated Gradient Descent Algorithm.
            Returns the training loss,training accuracy,validaiton loss and validation accuracy,averaged over all points. '''
        
        n = self.train_n_samples
        
        # randomizing batches
        idx = np.random.permutation(self.train_n_samples)
        self.x_train = self.x_train[idx]
        self.y_train = self.y_train[idx]

        input = self.x_train
        actual_y = self.y_train

        # maintaining previous history for weights and bias
        self.prev_wHistory, self.prev_bHistory = self.wHistory, self.bHistory
        
        # Total Loss for epoch
        loss_input = 0
        count = 0
        beta = self.parameters["momentum"]

        # set accumalated gradients to zero
        acc_grad_weights, acc_grad_bias = self.make_accumalate_zero()
        
        # computing partial values
        self.partial_wHistory = {}
        self.partial_bHistory = {}

        for (key, value) in self.wHistory.items():
            self.partial_wHistory[key] = beta * self.prev_wHistory[key]

        for (key, value) in self.bHistory.items():
            self.partial_bHistory[key] = beta * self.prev_bHistory[key]

        # execute one epoch for all datapoints in train set
        for index in range(n):

            # perfrom forward propagation
            self.forward_propagation(input, index)
            predicted_y = self.post_activation["h" + str(self.L)]

            # storing weights and bias in temperory values
            temp_weights = copy.deepcopy(self.weights)
            temp_bias = copy.deepcopy(self.bias)

            # update weights and bias
            for (key, value) in self.weights.items():
                self.weights[key] = self.weights[key] -  self.partial_wHistory[key]

            for (key, value) in self.bias.items():
                self.bias[key] = self.bias[key] - self.partial_bHistory[key]

            # perform backward propagation
            grad_weights, grad_bias = self.backward_propagation(index,actual_y)

            # update weights and bias 
            self.weights = temp_weights
            self.bias = temp_bias

            # accumulate grad_weights and grad_bais for each input
            for (key, value) in grad_weights.items():
                acc_grad_weights[key] = acc_grad_weights[key] + grad_weights[key]

            for (key, value) in grad_bias.items():
                acc_grad_bias[key] = acc_grad_bias[key] + grad_bias[key]

            # compute loss
            loss_input += self.loss_function(predicted_y, self.y_train[index])

            # compute the number of datapoints which are correctly classified
            indexWithMaxProb = np.argmax(predicted_y)
            if (actual_y[index] == (indexWithMaxProb)):
                count = count + 1

            # update weights and bias if the number of datapoints in batch_size are divisble by batch_size
            if ((index + 1) % self.batch_size == 0):
    
                # update weight history
                for (key, value) in self.wHistory.items():
                    self.wHistory[key] = beta * self.prev_wHistory[key] + ((self.learningRate / self.batch_size) * acc_grad_weights[key])

                # update bias history
                for (key, value) in self.bHistory.items():
                    self.bHistory[key] = beta * self.prev_bHistory[key] + ((self.learningRate / self.batch_size) * acc_grad_bias[key])

                # update weights
                for (key, value) in self.weights.items():
                    self.weights[key] = self.weights[key] - self.wHistory[key]

                for (key, value) in self.bias.items():
                    self.bias[key] = self.bias[key] - self.bHistory[key]

                # updating histroy for weights and bias
                self.prev_wHistory = self.wHistory
                self.prev_bHistory = self.bHistory
                
                # set accumalated gradients to zero
                acc_grad_weights, acc_grad_bias = self.make_accumalate_zero()
            
        # if the number of datapoints in batch is not divisible by batch_size update weights and bias 
        if n % self.batch_size != 0:

            # update weight history
            for (key, value) in self.wHistory.items():
                self.wHistory[key] = beta * self.prev_wHistory[key] + ((self.learningRate / self.batch_size) * acc_grad_weights[key])

            # update bias history
            for (key, value) in self.bHistory.items():
                self.bHistory[key] = beta * self.prev_bHistory[key] + ((self.learningRate / self.batch_size) * acc_grad_bias[key])

            # update weights
            for (key, value) in self.weights.items():
                self.weights[key] = self.weights[key] - self.wHistory[key]

            for (key, value) in self.bias.items():
                self.bias[key] = self.bias[key] - self.bHistory[key]

            # updating histroy for weights and bias
            self.prev_wHistory = self.wHistory
            self.prev_bHistory = self.bHistory
            
            # set accumalated gradients to zero
            acc_grad_weights, acc_grad_bias = self.make_accumalate_zero()
                
        # compute trainAccuracy,trainLoss averaged over train size
        trainAccuracy = count / n
        trainLoss = loss_input / n + self.regularize()

        # compute validationAccuracy,validationLoss averaged over test size 
        validationLoss, validationAccuracy = self.computeTestLossAndAccuracy()
        
        return trainLoss, trainAccuracy, validationLoss, validationAccuracy

    # runs RMSPROP for one epoch
    def oneEpochRMSPROP(self,epoch):
        ''' Executes A Single Epoch for RMSPROP Gradient Descent Algorithm.
            Returns the training loss,training accuracy,validaiton loss and validation accuracy,averaged over all points. 
        '''
        n = self.train_n_samples
        
        # randomizing batches
        idx = np.random.permutation(self.train_n_samples)
        self.x_train = self.x_train[idx]
        self.y_train = self.y_train[idx]

        input = self.x_train
        actual_y = self.y_train


        # maintaining previous history for weights and bias
        prev_wHistory, prev_bHistory = self.wHistory, self.bHistory

        # Total Loss for epoch
        loss_input = 0
        beta = self.parameters["beta"]
        eps = self.parameters["epsilon"]
        count = 0
        
        # set accumalated gradients to zero
        acc_grad_weights, acc_grad_bias = self.make_accumalate_zero()

        # execute one epoch for all datapoints in train set
        for index in range(n):

            # perfrom forward propagation
            self.forward_propagation(input, index)
            predicted_y = self.post_activation["h" + str(self.L)]

            # perform backward propagation
            grad_weights, grad_bias = self.backward_propagation(index,actual_y)

            # compute loss
            loss_input += self.loss_function(predicted_y, self.y_train[index])

            # compute the number of datapoints which are correctly classified
            indexWithMaxProb = np.argmax(predicted_y)
            if (actual_y[index] == (indexWithMaxProb)):
                count = count + 1

            # accumulate grad_weights and grad_bais for each input
            for (key, value) in grad_weights.items():
                acc_grad_weights[key] = acc_grad_weights[key] + grad_weights[key]

            for (key, value) in grad_bias.items():
                acc_grad_bias[key] = acc_grad_bias[key] + grad_bias[key]
        

            # update weights and bias if the number of datapoints in batch_size are divisble by batch_size
            if ((index + 1) % self.batch_size == 0):
    
                # update weight history
                for (key, value) in self.wHistory.items():
                    self.wHistory[key] = beta * prev_wHistory[key] +  (1 - beta) * acc_grad_weights[key] ** 2

                # update bias history
                for (key, value) in self.bHistory.items():
                    self.bHistory[key] = beta * prev_bHistory[key] + (1 - beta) * acc_grad_bias[key] ** 2

                # update weights
                for (key, value) in self.weights.items():
                    self.weights[key] = self.weights[key] -  (self.learningRate / self.batch_size) * acc_grad_weights[key] / (np.sqrt(self.wHistory[key] + eps))

                # update bias
                for (key, value) in self.bias.items():
                    self.bias[key] = self.bias[key] - (self.learningRate / self.batch_size) * acc_grad_bias[key] / (np.sqrt(self.bHistory[key] + eps))
                
                # updating histroy for weights and bias
                self.prev_wHistory = self.wHistory
                self.prev_bHistory = self.bHistory

                # set accumalated gradients to zero
                acc_grad_weights, acc_grad_bias = self.make_accumalate_zero()

        # if the number of datapoints in batch is not divisible by batch_size update weights and bias 
        if n % self.batch_size != 0:
            
            # update weight history
            for (key, value) in self.wHistory.items():
                self.wHistory[key] = beta * prev_wHistory[key] +  (1 - beta) * acc_grad_weights[key] ** 2

            # update bias history
            for (key, value) in self.bHistory.items():
                self.bHistory[key] = beta * prev_bHistory[key] + (1 - beta) * acc_grad_bias[key] ** 2

            # update weights
            for (key, value) in self.weights.items():
                self.weights[key] = self.weights[key] -  (self.learningRate / self.batch_size) * acc_grad_weights[key] / (np.sqrt(self.wHistory[key] + eps))

            # update bias
            for (key, value) in self.bias.items():
                self.bias[key] = self.bias[key] - (self.learningRate / self.batch_size) * acc_grad_bias[key] / (np.sqrt(self.bHistory[key] + eps))
            
            # updating histroy for weights and bias
            self.prev_wHistory = self.wHistory
            self.prev_bHistory = self.bHistory

            # set accumalated gradients to zero
            acc_grad_weights, acc_grad_bias = self.make_accumalate_zero()

        # compute trainAccuracy,trainLoss averaged over train size
        trainAccuracy = count / n
        trainLoss = loss_input / n + self.regularize()

        # compute validationAccuracy,validationLoss averaged over test size 
        validationLoss, validationAccuracy = self.computeTestLossAndAccuracy()
        
        return trainLoss, trainAccuracy, validationLoss, validationAccuracy

    # runs ADAM gradient descent for one epoch
    def oneEpochADAM(self,epoch):
        ''' Executes A Single Epoch for ADAM Gradient Descent Algorithm.
            Returns the training loss,training accuracy,validaiton loss and validation accuracy,averaged over all points. '''

        n = self.train_n_samples
        # randomizing batches
        idx = np.random.permutation(self.train_n_samples)
        self.x_train = self.x_train[idx]
        self.y_train = self.y_train[idx]

        input = self.x_train
        actual_y = self.y_train

        # maintaining previous history and momentum for weights and bias
        prev_wMomentum, prev_bMomentum = self.wMomentum, self.bMomentum
        prev_wHistory, prev_bHistory = self.wHistory, self.bHistory

        wMomentum_hat = {} 
        bMomentum_hat = {}
        wHistory_hat = {} 
        bHistory_hat = {}

        # Total Loss for epoch
        loss_input = 0
        count = 0
        beta1 = self.parameters["beta1"]
        beta2 = self.parameters["beta2"]
        epsilon = self.parameters["epsilon"]
        
        # set accumalated gradients to zero
        acc_grad_weights, acc_grad_bias = self.make_accumalate_zero()

        # execute one epoch for all datapoints in train set
        for index in range(n):

            # perform forward propagation

            self.forward_propagation(input, index)
            predicted_y = self.post_activation["h" + str(self.L)]

            # perform backward propagation
            grad_weights, grad_bias = self.backward_propagation(index,actual_y)

            # compute loss
            loss_input += self.loss_function(predicted_y, actual_y[index])

            # compute the number of datapoints which are correctly classified
            indexWithMaxProb = np.argmax(predicted_y)
            if (actual_y[index] == (indexWithMaxProb)):
                count = count + 1

            # accumulate grad_weights and grad_bais for each input
            for (key, value) in grad_weights.items():
                acc_grad_weights[key] = acc_grad_weights[key] + grad_weights[key]

            for (key, value) in grad_bias.items():
                acc_grad_bias[key] = acc_grad_bias[key] + grad_bias[key]

            # update weights and bias if the number of datapoints in batch_size are divisble by batch_size
            if ((index + 1) % self.batch_size == 0):
        
                # update weight momentum
                for (key, value) in self.wMomentum.items():
                    self.wMomentum[key] = beta1*prev_wMomentum[key] +  (1 - beta1) * acc_grad_weights[key]

                # update bias momentum
                for (key, value) in self.bMomentum.items():
                    self.bMomentum[key] = beta1*prev_bMomentum[key] + (1 - beta1) * acc_grad_bias[key]

                # update weight history
                for (key, value) in self.wHistory.items():
                    self.wHistory[key] = beta2 * prev_wHistory[key] + (1 - beta2) * acc_grad_weights[key] ** 2

                # update bias history
                for (key, value) in self.bHistory.items():
                    self.bHistory[key] = beta2 * prev_bHistory[key] + (1 - beta2) * acc_grad_bias[key] ** 2

                
                # compute intermediate values
                for (key, value) in self.weights.items():
                    wMomentum_hat[key] = self.wMomentum[key] / (1 - np.power(beta1, epoch + 1))

                for (key, value) in self.bias.items():
                    bMomentum_hat[key] = self.bMomentum[key] / (1 - np.power(beta1, epoch + 1))

                for (key, value) in self.weights.items():
                    wHistory_hat[key] = self.wHistory[key] / (1 - np.power(beta2, epoch + 1))

                for (key, value) in self.bias.items():
                    bHistory_hat[key] = self.bHistory[key] / (1 - np.power(beta2, epoch + 1))

                # update weights
                for (key, value) in self.weights.items():
                    temp = (self.learningRate / self.batch_size) * wMomentum_hat[key] / (np.sqrt(wHistory_hat[key] + epsilon))
                    self.weights[key] = self.weights[key] - temp
                
                # update bias
                for (key, value) in self.bias.items():
                    temp = (self.learningRate / self.batch_size) * bMomentum_hat[key] / (np.sqrt(bHistory_hat[key] + epsilon))
                    self.bias[key] = self.bias[key] - temp

                # set accumalated gradients to zero
                acc_grad_weights, acc_grad_bias = self.make_accumalate_zero()
            
        # if the number of datapoints in batch is not divisible by batch_size update weights and bias 
        if n % self.batch_size != 0:
            
            # update weight momentum
            for (key, value) in self.wMomentum.items():
                self.wMomentum[key] = beta1*prev_wMomentum[key] +  (1 - beta1) * acc_grad_weights[key]

            # update bias momentum
            for (key, value) in self.bMomentum.items():
                self.bMomentum[key] = beta1*prev_bMomentum[key] + (1 - beta1) * acc_grad_bias[key]

            # update weight history
            for (key, value) in self.wHistory.items():
                self.wHistory[key] = beta2 * prev_wHistory[key] + (1 - beta2) * acc_grad_weights[key] ** 2

            # update bias history
            for (key, value) in self.bHistory.items():
                self.bHistory[key] = beta2 * prev_bHistory[key] + (1 - beta2) * acc_grad_bias[key] ** 2

            
            # compute intermediate values
            for (key, value) in self.weights.items():
                wMomentum_hat[key] = self.wMomentum[key] / (1 - np.power(beta1, epoch + 1))

            for (key, value) in self.bias.items():
                bMomentum_hat[key] = self.bMomentum[key] / (1 - np.power(beta1, epoch + 1))

            for (key, value) in self.weights.items():
                wHistory_hat[key] = self.wHistory[key] / (1 - np.power(beta2, epoch + 1))

            for (key, value) in self.bias.items():
                bHistory_hat[key] = self.bHistory[key] / (1 - np.power(beta2, epoch + 1))

            # update weights
            for (key, value) in self.weights.items():
                temp = (self.learningRate / self.batch_size) * wMomentum_hat[key] / (np.sqrt(wHistory_hat[key] + epsilon))
                self.weights[key] = self.weights[key] - temp
            
            # update bias
            for (key, value) in self.bias.items():
                temp = (self.learningRate / self.batch_size) * bMomentum_hat[key] / (np.sqrt(bHistory_hat[key] + epsilon))
                self.bias[key] = self.bias[key] - temp

            # set accumalated gradients to zero
            acc_grad_weights, acc_grad_bias = self.make_accumalate_zero()
        
        # compute trainAccuracy,trainLoss averaged over train size
        trainAccuracy = count / n
        trainLoss = loss_input / n + self.regularize()
        
        # compute validationAccuracy,validationLoss averaged over test size 
        validationLoss, validationAccuracy = self.computeTestLossAndAccuracy()
        
        return trainLoss, trainAccuracy, validationLoss, validationAccuracy

    # runs NADAM gradient descent for one epoch
    def oneEpochNADAM(self,epoch):
        ''' Executes A Single Epoch for NADAM Gradient Descent Algorithm.
            Returns the training loss,training accuracy,validaiton loss and validation accuracy,averaged over all points. '''

        n = self.train_n_samples
        
        # randomizing batches
        idx = np.random.permutation(self.train_n_samples)
        self.x_train = self.x_train[idx]
        self.y_train = self.y_train[idx]

        input = self.x_train
        actual_y = self.y_train

        # maintaining previous history and momentum for weights and bias
        prev_wMomentum, prev_bMomentum = self.wMomentum, self.bMomentum
        prev_wHistory, prev_bHistory = self.wHistory, self.bHistory

        wMomentum_hat = {} 
        bMomentum_hat = {}
        wHistory_hat = {} 
        bHistory_hat = {}

        # Total Loss for epoch
        loss_input = 0
        count = 0
        beta1 = self.parameters["beta1"]
        beta2 = self.parameters["beta2"]
        epsilon = self.parameters["epsilon"]


        # set accumalated gradients to zero
        acc_grad_weights, acc_grad_bias = self.make_accumalate_zero()

        for index in range(n):

            # perform forward propagation
            self.forward_propagation(input, index)
            predicted_y = self.post_activation["h" + str(self.L)]

            # perform backward propagation
            grad_weights, grad_bias = self.backward_propagation(index,actual_y)

            # compute loss
            loss_input += self.loss_function(predicted_y, actual_y[index])

            # compute the number of datapoints which are correctly classified
            indexWithMaxProb = np.argmax(predicted_y)
            if (actual_y[index] == (indexWithMaxProb)):
                count = count + 1

            # accumulate grad_weights and grad_bais for each input
            for (key, value) in grad_weights.items():
                acc_grad_weights[key] = acc_grad_weights[key] + grad_weights[key]

            for (key, value) in grad_bias.items():
                acc_grad_bias[key] = acc_grad_bias[key] + grad_bias[key]

            # update weights and bias if the number of datapoints in batch_size are divisble by batch_size
            if ((index + 1) % self.batch_size == 0):
                
                # update weight momentum
                for (key, value) in self.wMomentum.items():
                    self.wMomentum[key] = beta1*prev_wMomentum[key] + (1 - beta1) * acc_grad_weights[key]

                # update bias momentum
                for (key, value) in self.bMomentum.items():
                    self.bMomentum[key] = beta1*prev_bMomentum[key] + (1 - beta1) * acc_grad_bias[key]

                # update weight history
                for (key, value) in self.wHistory.items():
                    self.wHistory[key] = beta2 * prev_wHistory[key] + (1 - beta2) * acc_grad_weights[key] ** 2

                # update bias history
                for (key, value) in self.bHistory.items():
                    self.bHistory[key] = beta2 * prev_bHistory[key] + (1 - beta2) * acc_grad_bias[key] ** 2


                # compute intermediate values
                for (key, value) in self.weights.items():
                    wMomentum_hat[key] = self.wMomentum[key] / (1 - np.power(beta1, epoch + 1))
                
                for (key, value) in self.bias.items():
                    bMomentum_hat[key] = self.bMomentum[key] / (1 - np.power(beta1, epoch + 1))

                for (key, value) in self.weights.items():
                    wHistory_hat[key] = self.wHistory[key] / (1 - np.power(beta2, epoch + 1))

                for (key, value) in self.bias.items():
                    bHistory_hat[key] = self.bHistory[key] / (1 - np.power(beta2, epoch + 1))

                # update weights
                for (key, value) in self.weights.items():
                    num1 = ((self.learningRate / self.batch_size) / np.sqrt(wHistory_hat[key] + epsilon))
                    num2 = beta1 * wMomentum_hat[key] + ((1 - beta1) * acc_grad_weights[key] / (1 - beta1 ** (epoch + 1)))
                    self.weights[key] = self.weights[key] - num1*num2

                # update bias
                for (key, value) in self.bias.items():
                    num1 = ((self.learningRate / self.batch_size) / np.sqrt(bHistory_hat[key] + epsilon))
                    num2 = beta1 * bMomentum_hat[key] + ((1 - beta1) * acc_grad_bias[key] / (1 - beta1 ** (epoch + 1)))
                    self.bias[key] = self.bias[key] - num1*num2

                # set accumalated gradients to zero
                acc_grad_weights, acc_grad_bias = self.make_accumalate_zero()
            
            # if the number of datapoints in batch is not divisible by batch_size update weights and bias 
        if n % self.batch_size != 0:
            
            # update weight momentum
            for (key, value) in self.wMomentum.items():
                self.wMomentum[key] = beta1*prev_wMomentum[key] + (1 - beta1) * acc_grad_weights[key]

            # update bias momentum
            for (key, value) in self.bMomentum.items():
                self.bMomentum[key] = beta1*prev_bMomentum[key] + (1 - beta1) * acc_grad_bias[key]

            # update weight history
            for (key, value) in self.wHistory.items():
                self.wHistory[key] = beta2 * prev_wHistory[key] + (1 - beta2) * acc_grad_weights[key] ** 2

            # update bias history
            for (key, value) in self.bHistory.items():
                self.bHistory[key] = beta2 * prev_bHistory[key] + (1 - beta2) * acc_grad_bias[key] ** 2


            # compute intermediate values
            for (key, value) in self.weights.items():
                wMomentum_hat[key] = self.wMomentum[key] / (1 - np.power(beta1, epoch + 1))
            
            for (key, value) in self.bias.items():
                bMomentum_hat[key] = self.bMomentum[key] / (1 - np.power(beta1, epoch + 1))

            for (key, value) in self.weights.items():
                wHistory_hat[key] = self.wHistory[key] / (1 - np.power(beta2, epoch + 1))

            for (key, value) in self.bias.items():
                bHistory_hat[key] = self.bHistory[key] / (1 - np.power(beta2, epoch + 1))

            # update weights
            for (key, value) in self.weights.items():
                num1 = ((self.learningRate / self.batch_size) / np.sqrt(wHistory_hat[key] + epsilon))
                num2 = beta1 * wMomentum_hat[key] + ((1 - beta1) * acc_grad_weights[key] / (1 - beta1 ** (epoch + 1)))
                self.weights[key] = self.weights[key] - num1*num2

            # update bias
            for (key, value) in self.bias.items():
                num1 = ((self.learningRate / self.batch_size) / np.sqrt(bHistory_hat[key] + epsilon))
                num2 = beta1 * bMomentum_hat[key] + ((1 - beta1) * acc_grad_bias[key] / (1 - beta1 ** (epoch + 1)))
                self.bias[key] = self.bias[key] - num1*num2

            # set accumalated gradients to zero
            acc_grad_weights, acc_grad_bias = self.make_accumalate_zero()
        
                
        # compute trainAccuracy,trainLoss averaged over train size
        trainAccuracy = count / n
        trainLoss = loss_input / n + self.regularize()
        
        # compute validationAccuracy,validationLoss averaged over test size 
        validationLoss, validationAccuracy = self.computeTestLossAndAccuracy()
        return trainLoss, trainAccuracy, validationLoss, validationAccuracy

    # runs gradient descent for one epoch
    def oneEpochGD(self, epoch):
        ''' Executes A Single Epoch for Vanilla Gradient Descent Algorithm.
            Returns the training loss,training accuracy,validaiton loss and validation accuracy,averaged over all points. '''

        n = self.train_n_samples
        
        # randomizing batches
        idx = np.random.permutation(self.train_n_samples)
        self.x_train = self.x_train[idx]
        self.y_train = self.y_train[idx]

        input = self.x_train
        actual_y = self.y_train

        # Total Loss for epoch
        loss_input = 0
        count = 0

        # set accumalated gradients to zero
        acc_grad_weights, acc_grad_bias = self.make_accumalate_zero()

        # execute one epoch for all datapoints in train set
        for index in range(n):

            # perform forward propagation
            self.forward_propagation(input, index)
            predicted_y = self.post_activation["h" + str(self.L)]

            # compute loss
            loss_input += self.loss_function(predicted_y, actual_y[index])

            # perofrm backward propagation
            grad_weights, grad_bias = self.backward_propagation(index, actual_y)

            # compute the number of datapoints which are correctly classified
            indexWithMaxProb = np.argmax(predicted_y)
            if (actual_y[index] == (indexWithMaxProb)):
                count = count + 1

            # accumulate grad_weights and grad_bais for each input
            for (key, value) in grad_weights.items():
                acc_grad_weights[key] = acc_grad_weights[key] + \
                    grad_weights[key]

            for (key, value) in grad_bias.items():
                acc_grad_bias[key] = acc_grad_bias[key] + grad_bias[key]

            # update weights and bias if the number of datapoints in batch_size are divisble by batch_size
            if ((index + 1) % self.batch_size == 0):

                # update weights
                for (key, value) in self.weights.items():
                    self.weights[key] = self.weights[key] - ((self.learningRate / self.batch_size) * acc_grad_weights[key])
                
                # update bias
                for (key, value) in self.bias.items():
                    self.bias[key] = self.bias[key] - ((self.learningRate / self.batch_size) * acc_grad_bias[key])

                # set accumalated gradients to zero
                acc_grad_weights, acc_grad_bias = self.make_accumalate_zero()

        # if the number of datapoints in batch is not divisible by batch_size update weights and bias 
        if n % self.batch_size != 0:

            # update weights
            for (key, value) in self.weights.items():
                self.weights[key] = self.weights[key] - ((self.learningRate / self.batch_size) * acc_grad_weights[key])
            
            # update bias
            for (key, value) in self.bias.items():
                self.bias[key] = self.bias[key] - ((self.learningRate / self.batch_size) * acc_grad_bias[key])

            # set accumalated gradients to zero
            acc_grad_weights, acc_grad_bias = self.make_accumalate_zero()

        # compute trainAccuracy,trainLoss averaged over train size
        trainAccuracy = count / n
        trainLoss = loss_input / n + self.regularize()
        
        # compute validationAccuracy,validationLoss averaged over test size 
        validationLoss, validationAccuracy = self.computeTestLossAndAccuracy()
        
        return trainLoss, trainAccuracy, validationLoss, validationAccuracy

    # executes a single epoch of the FeedForward NN according to the optimizer function. 
    def executeOneEpoch(self,epoch):
        
        if self.optimizer == "sgd":
            return self.oneEpochSGD(epoch)

        if self.optimizer == "momentum":
            return self.oneEpochMOMENTUM(epoch)

        if self.optimizer == "nestrov":
            return self.oneEpochNAG(epoch)

        if self.optimizer == "rmsprop":
            return self.oneEpochRMSPROP(epoch)

        if self.optimizer == "adam":
            return self.oneEpochADAM(epoch)

        if self.optimizer == "nadam":
            return self.oneEpochNADAM(epoch)

        if self.optimizer == "gd":
            return self.oneEpochGD(epoch)

    # computes validation loss and validation accuracy 
    def computeTestLossAndAccuracy(self):
        
        validation_size = self.y_validate.shape[0]
        test_loss = 0
        count = 0

        input = self.x_validate
        actual_y = self.y_validate

        for index in range(0, validation_size):

            # perform forward propagation
            self.forward_propagation(input, index)
            predicted_y = self.post_activation["h" + str(self.L)]

            # compute loss
            test_loss += self.loss_function(predicted_y, actual_y[index])

            # compute the number of datapoints which are correctly classified
            indexWithMaxProb = np.argmax(predicted_y)
            if (actual_y[index] == (indexWithMaxProb)):
                count = count + 1

        # compute validationAccuracy,validationLoss averaged over validation size 
        validationAccuracy = count / validation_size
        validationLoss = test_loss / validation_size + self.regularize()
        
        return validationLoss, validationAccuracy

    '''<----------------------------Question 1------------------------------------->'''
    def question_1(self):
        ''' Logs one sample image per class label of the training set to wandb.

            Training samples are drawn at random; the first sample seen for each label is
            kept and uploaded under the key "Images", captioned with self.title[label].
            Sampling stops as soon as every label present in self.y_train has an image,
            and gives up after self.train_n_samples draws at the latest.
        '''
        wandb.init(
                # set the wandb project where this run will be logged;
                # read from this object rather than from a module-level instance
                project = self.parameters["wandb_project"],
        )

        # number of distinct labels we are looking for
        num_classes = len(np.unique(self.y_train))

        # label -> feature of the sample chosen for that label
        labels_added = {}
        images = []

        for _ in range(self.train_n_samples):
            # every class already has its picture: no need to keep sampling
            if len(labels_added) == num_classes:
                break

            index = random.randrange(self.train_n_samples)
            label = self.y_train[index]

            # each label is shown only once
            if label in labels_added:
                continue

            labels_added[label] = self.x_train[index]
            images.append(wandb.Image(
                labels_added[label], caption=f"{self.title[label]}"))

        wandb.log({"Images": images})
    
    '''<----------------------------Question 2------------------------>'''

    def feed_forward_q2(self): 
        # initialization of weights
        self.weightsAndBiasInitializer()
 
        # train the data
        for i in range(1, self.epoch + 1):
            (train_Loss, train_Accuracy, validation_Loss,
             validation_Accuracy) = self.executeOneEpoch(i)
            
        # generating the random index to test the model and finding the y for that
        index = np.random.randint(self.y_validate.shape[0])
        input = self.x_validate
        self.forward_propagation(input, index)
        predicted_y = self.post_activation["h" + str(self.L)]
        print(predicted_y)

    '''<----------------------------Question 3-4------------------------------------->'''
    def feed_forward_q3_4(self):
        
        self.weights = dict()
        self.bias = dict()
        self.wHistory = dict()
        self.bHistory = dict()
        self.wMomentum = dict()
        self.bMomentum = dict()
        
        # initialization of weights and bias
        self.weightsAndBiasInitializer()

        # initializing history for weights and bias
        self.historyInitializer()

        # initializing momentum for weights and bias
        self.momentumInitializer()

        self.validation_Accuracy = 0


        # run feedforward NN 
        for i in range(1, self.epoch + 1):
              (train_Loss, train_Accuracy, validation_Loss,self.validation_Accuracy) = self.executeOneEpoch(i)
              print("epoch:{epoch}, train loss:{train_l}, train accuracy:{train_ac}, validation loss:{validation_l}, validation accuracy:{validation_ac}".\
                  format(epoch = i,train_l = train_Loss,train_ac = train_Accuracy,validation_l = validation_Loss,validation_ac = self.validation_Accuracy))
            
              wandb.log({'train loss':train_Loss, 'train accuracy':train_Accuracy,'validation loss':validation_Loss, 'validation accuracy':self.validation_Accuracy})
            


In [None]:
# Single model object shared by the notebook cells below; FeedForward.__init__ loads the
# dataset chosen in its default parameters, normalizes it and carves out a validation split.
feed_forward = FeedForward()

In [None]:
# Definition of the wandb hyperparameter sweep: Bayesian search ('grid' and 'random' are the
# other strategies mentioned for 'method') that tries to maximize the logged 'validation accuracy'.
sweep_config = {
    'method': 'bayes',
    'name': 'bayes_sweep cross_entropy',
    'metric': {'name': 'validation accuracy', 'goal': 'maximize'},
    # every hyperparameter is a discrete choice, stored as {'values': [...]}
    'parameters': {
        hyperparameter: {'values': candidates}
        for hyperparameter, candidates in (
            ('epochs', [5, 10]),
            ('number_of_hidden_layer', [3, 4, 5]),
            ('size_of_hidden_layer', [32, 64, 128]),
            ('weight_decay', [0, 0.0005, 0.5]),
            ('learning_rate', [1e-3, 1e-4]),
            ('optimizer', ['sgd', 'momentum', 'nestrov', 'rmsprop', 'adam', 'nadam']),
            ('batch_size', [16, 32, 64]),
            ('weight_initialization', ['random', 'Xavier']),
            ('activation', ['sigmoid', 'tanh', 'ReLU']),
        )
    },
}

# register the sweep in the project configured on the model object
sweep_id = wandb.sweep(
    sweep=sweep_config,
    project=feed_forward.parameters["wandb_project"],
)

Create sweep with ID: cuhjfznh
Sweep URL: https://wandb.ai/cs22m019/DL%20Final%20Assignment%201/sweeps/cuhjfznh


In [None]:
def train():
    """Run a single sweep trial on the shared ``feed_forward`` instance.

    Meant to be handed to ``wandb.agent`` as its ``function``. It starts a
    W&B run, copies the hyperparameters chosen for this trial onto
    ``feed_forward``, gives the run a descriptive name and then calls
    ``feed_forward.feed_forward_q3_4()``.

    Takes no arguments and returns nothing; all state lives on the
    module-level ``feed_forward`` object and in the active W&B run.
    """
    # When launched by a sweep agent, the run receives its hyperparameters
    # (and project) from the sweep itself. The sweep *definition* must not be
    # passed as ``config``: doing so only adds its "method", "name", "metric"
    # and "parameters" entries to every run's config.
    run = wandb.init()
    config = run.config

    feed_forward.epoch = config.epochs
    feed_forward.nnl = config.size_of_hidden_layer
    feed_forward.learningRate = config.learning_rate
    feed_forward.optimizer = config.optimizer
    feed_forward.batch_size = config.batch_size
    feed_forward.weightInitialization = config.weight_initialization
    feed_forward.activationFunction = config.activation
    # L is set to the number of hidden layers plus one.
    feed_forward.L = config.number_of_hidden_layer + 1
    # Both spellings are assigned because the attribute the class actually
    # reads is not visible from this cell. TODO: confirm and drop the unused one.
    feed_forward.weightDecay = config.weight_decay
    feed_forward.weight_decay = config.weight_decay

    # Encode the most distinguishing hyperparameters in the run name so runs
    # are easy to tell apart in the W&B UI.
    run.name = (
        f"optimizer_{config.optimizer}"
        f"_hl_{config.number_of_hidden_layer}"
        f"_bs_{config.batch_size}"
        f"_ac_{config.activation}"
    )
    feed_forward.feed_forward_q3_4()

In [None]:
# Start a sweep agent in this process: it repeatedly asks the sweep identified
# by `sweep_id` for a hyperparameter set and calls train() for each one,
# stopping after 42 runs.
wandb.agent(sweep_id=sweep_id,function = train,count = 42)

[34m[1mwandb[0m: Agent Starting Run: iue6d4wi with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	number_of_hidden_layer: 5
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	size_of_hidden_layer: 32
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_initialization: random
[34m[1mwandb[0m: Currently logged in as: [33mcs22m019[0m. Use [1m`wandb login --relogin`[0m to force relogin


epoch:1, train loss:8.084183199339416, train accuracy:0.38942592592592595, validation loss:4.459932895915803, validation accuracy:0.49433333333333335
epoch:2, train loss:3.3293499991008484, train accuracy:0.5576481481481481, validation loss:2.6642070740319634, validation accuracy:0.5811666666666667
epoch:3, train loss:2.2872293369870738, train accuracy:0.6082592592592593, validation loss:2.027670439455047, validation accuracy:0.6235
epoch:4, train loss:1.8083029311491818, train accuracy:0.6369074074074074, validation loss:1.6806093328134972, validation accuracy:0.6463333333333333
epoch:5, train loss:1.525278875187279, train accuracy:0.6603703703703704, validation loss:1.4563037889984742, validation accuracy:0.6673333333333333


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train accuracy,▁▅▇▇█
train loss,█▃▂▁▁
validation accuracy,▁▅▆▇█
validation loss,█▄▂▂▁

0,1
train accuracy,0.66037
train loss,1.52528
validation accuracy,0.66733
validation loss,1.4563


[34m[1mwandb[0m: Agent Starting Run: pwiaqf5x with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	number_of_hidden_layer: 4
[34m[1mwandb[0m: 	optimizer: nestrov
[34m[1mwandb[0m: 	size_of_hidden_layer: 128
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_initialization: Xavier


epoch:1, train loss:2.317239646551535, train accuracy:0.10014814814814815, validation loss:2.303247758455665, validation accuracy:0.09933333333333333
epoch:2, train loss:2.3084889314901464, train accuracy:0.1012962962962963, validation loss:2.309652197039912, validation accuracy:0.09766666666666667
epoch:3, train loss:2.304887189487476, train accuracy:0.1032962962962963, validation loss:2.3037470525140065, validation accuracy:0.1005
epoch:4, train loss:2.304770238210802, train accuracy:0.10307407407407407, validation loss:2.306703312936343, validation accuracy:0.09716666666666667
epoch:5, train loss:2.3093202911426562, train accuracy:0.1002962962962963, validation loss:2.3129513506162986, validation accuracy:0.09766666666666667
epoch:6, train loss:2.3062795578854733, train accuracy:0.10031481481481481, validation loss:2.3082721355859865, validation accuracy:0.10433333333333333
epoch:7, train loss:2.3062658887118856, train accuracy:0.1029074074074074, validation loss:2.306407593976245, 

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train accuracy,▁▃▆▅▁▁▅█▂▇
train loss,█▄▃▂▅▃▃▂▂▁
validation accuracy,▂▁▃▁▁▆▁█▁▁
validation loss,▁▆▁▃█▅▃▃▅▁

0,1
train accuracy,0.10424
train loss,2.3014
validation accuracy,0.09767
validation loss,2.30345


[34m[1mwandb[0m: Agent Starting Run: nvddsmga with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	number_of_hidden_layer: 5
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	size_of_hidden_layer: 32
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_initialization: random


epoch:1, train loss:2.4867225064167506, train accuracy:0.10562962962962963, validation loss:2.3746112710133893, validation accuracy:0.11983333333333333
epoch:2, train loss:2.3386891497482742, train accuracy:0.1333888888888889, validation loss:2.3135865289751028, validation accuracy:0.15666666666666668
epoch:3, train loss:2.302702833029946, train accuracy:0.16442592592592592, validation loss:2.2967224569556453, validation accuracy:0.15866666666666668
epoch:4, train loss:2.292579612831134, train accuracy:0.16164814814814815, validation loss:2.291426535879565, validation accuracy:0.161
epoch:5, train loss:2.288591419730181, train accuracy:0.16677777777777777, validation loss:2.2883486329661764, validation accuracy:0.16416666666666666
epoch:6, train loss:2.2856911475043877, train accuracy:0.1715, validation loss:2.285676614419287, validation accuracy:0.16833333333333333
epoch:7, train loss:2.2829097657180397, train accuracy:0.1842962962962963, validation loss:2.282852996965865, validation 

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train accuracy,▁▃▅▄▅▅▆▆▇█
train loss,█▃▂▂▁▁▁▁▁▁
validation accuracy,▁▄▄▄▄▄▅▆▇█
validation loss,█▄▃▂▂▂▂▁▁▁

0,1
train accuracy,0.22098
train loss,2.27403
validation accuracy,0.2215
validation loss,2.27399


[34m[1mwandb[0m: Agent Starting Run: cab0l47u with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	number_of_hidden_layer: 3
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	size_of_hidden_layer: 32
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_initialization: Xavier


epoch:1, train loss:1.9736835821282648, train accuracy:0.3138518518518518, validation loss:1.6493396296927547, validation accuracy:0.41633333333333333
epoch:2, train loss:1.3901082351716538, train accuracy:0.5293333333333333, validation loss:1.1853852549111914, validation accuracy:0.6055
epoch:3, train loss:1.0464078682638651, train accuracy:0.6239074074074074, validation loss:0.9647764361576867, validation accuracy:0.6353333333333333
epoch:4, train loss:0.8902336072579298, train accuracy:0.6532592592592592, validation loss:0.8525429846725171, validation accuracy:0.6713333333333333
epoch:5, train loss:0.8107246655194909, train accuracy:0.6830740740740741, validation loss:0.8061122295830091, validation accuracy:0.6898333333333333
epoch:6, train loss:0.7733852778530169, train accuracy:0.7061111111111111, validation loss:0.7722379248516971, validation accuracy:0.7048333333333333
epoch:7, train loss:0.7526456001666493, train accuracy:0.7225925925925926, validation loss:0.7557726774053249, 

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train accuracy,▁▄▆▆▇▇▇███
train loss,█▅▃▂▁▁▁▁▁▁
validation accuracy,▁▅▅▆▇▇▇▇██
validation loss,█▄▃▂▂▁▁▁▁▁

0,1
train accuracy,0.75878
train loss,0.73035
validation accuracy,0.75983
validation loss,0.73312


[34m[1mwandb[0m: Agent Starting Run: bsho1cd2 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	number_of_hidden_layer: 5
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	size_of_hidden_layer: 64
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_initialization: Xavier


epoch:1, train loss:2.078808526977658, train accuracy:0.3637037037037037, validation loss:1.6992344584417027, validation accuracy:0.43716666666666665
epoch:2, train loss:1.308027755686263, train accuracy:0.6203888888888889, validation loss:1.0351003591795926, validation accuracy:0.6966666666666667
epoch:3, train loss:0.902991411461968, train accuracy:0.7204814814814815, validation loss:0.8180761914636422, validation accuracy:0.7321666666666666
epoch:4, train loss:0.7624561124501479, train accuracy:0.7421851851851852, validation loss:0.7296034968588583, validation accuracy:0.7505
epoch:5, train loss:0.6952056808927749, train accuracy:0.7577222222222222, validation loss:0.678616522706193, validation accuracy:0.763


0,1
train accuracy,▁▆▇██
train loss,█▄▂▁▁
validation accuracy,▁▇▇██
validation loss,█▃▂▁▁

0,1
train accuracy,0.75772
train loss,0.69521
validation accuracy,0.763
validation loss,0.67862


[34m[1mwandb[0m: Agent Starting Run: 67hszstt with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	number_of_hidden_layer: 5
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	size_of_hidden_layer: 32
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_initialization: Xavier


epoch:1, train loss:2.0597965422116555, train accuracy:0.31544444444444447, validation loss:1.692719469715832, validation accuracy:0.43333333333333335
epoch:2, train loss:1.3073022508673764, train accuracy:0.5723333333333334, validation loss:1.0073489046052926, validation accuracy:0.6526666666666666
epoch:3, train loss:0.9125742851517874, train accuracy:0.6752222222222222, validation loss:0.8620099055190851, validation accuracy:0.6958333333333333
epoch:4, train loss:0.8345605557813113, train accuracy:0.7028148148148148, validation loss:0.8143915250233666, validation accuracy:0.7128333333333333
epoch:5, train loss:0.805709382358577, train accuracy:0.7172037037037037, validation loss:0.7938970828765344, validation accuracy:0.729
epoch:6, train loss:0.7957045885773633, train accuracy:0.7260925925925926, validation loss:0.7873891450179908, validation accuracy:0.7338333333333333
epoch:7, train loss:0.7905796827960857, train accuracy:0.7348703703703704, validation loss:0.7771072828879618, va

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train accuracy,▁▅▇▇▇█████
train loss,█▄▂▁▁▁▁▁▁▁
validation accuracy,▁▆▇▇▇▇████
validation loss,█▃▂▁▁▁▁▁▁▁

0,1
train accuracy,0.75639
train loss,0.77727
validation accuracy,0.7645
validation loss,0.76685


[34m[1mwandb[0m: Agent Starting Run: xh96qzg7 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	number_of_hidden_layer: 5
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	size_of_hidden_layer: 32
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_initialization: Xavier


epoch:1, train loss:2.2441428225673175, train accuracy:0.17737037037037037, validation loss:2.0751125731565345, validation accuracy:0.241
epoch:2, train loss:1.8908769755816548, train accuracy:0.2855740740740741, validation loss:1.7053098635771478, validation accuracy:0.37383333333333335
epoch:3, train loss:1.539732319815318, train accuracy:0.45501851851851854, validation loss:1.4028099009637303, validation accuracy:0.49383333333333335
epoch:4, train loss:1.292702706508745, train accuracy:0.5222037037037037, validation loss:1.2002433662191858, validation accuracy:0.549
epoch:5, train loss:1.1161376339481088, train accuracy:0.5969444444444445, validation loss:1.0449925045151864, validation accuracy:0.642


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train accuracy,▁▃▆▇█
train loss,█▆▄▂▁
validation accuracy,▁▃▅▆█
validation loss,█▅▃▂▁

0,1
train accuracy,0.59694
train loss,1.11614
validation accuracy,0.642
validation loss,1.04499


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 78klxnb9 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	number_of_hidden_layer: 3
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	size_of_hidden_layer: 32
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_initialization: Xavier


epoch:1, train loss:1.9356995697118957, train accuracy:0.3089074074074074, validation loss:1.5647006443441125, validation accuracy:0.4671666666666667
epoch:2, train loss:1.3684271784903408, train accuracy:0.5064444444444445, validation loss:1.212628676599758, validation accuracy:0.5548333333333333
epoch:3, train loss:1.0941724611880008, train accuracy:0.6082592592592593, validation loss:0.999325811562412, validation accuracy:0.6421666666666667
epoch:4, train loss:0.9240379797236258, train accuracy:0.6658888888888889, validation loss:0.8674001381241468, validation accuracy:0.6828333333333333
epoch:5, train loss:0.8284790355652697, train accuracy:0.6967592592592593, validation loss:0.8042590256456368, validation accuracy:0.7055
epoch:6, train loss:0.7750918981772332, train accuracy:0.7192037037037037, validation loss:0.7700083419974065, validation accuracy:0.7145
epoch:7, train loss:0.7395253290791761, train accuracy:0.7345925925925926, validation loss:0.7326607454225469, validation accu

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train accuracy,▁▄▆▆▇▇▇███
train loss,█▅▃▂▂▁▁▁▁▁
validation accuracy,▁▃▅▆▇▇▇███
validation loss,█▅▃▂▂▂▁▁▁▁

0,1
train accuracy,0.7677
train loss,0.69145
validation accuracy,0.7665
validation loss,0.68976


[34m[1mwandb[0m: Agent Starting Run: ru874kwi with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	number_of_hidden_layer: 5
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	size_of_hidden_layer: 32
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_initialization: Xavier


epoch:1, train loss:1.9545501366016205, train accuracy:0.3943333333333333, validation loss:1.5897255697196768, validation accuracy:0.558
epoch:2, train loss:1.3678490423912242, train accuracy:0.6140925925925926, validation loss:1.1826083512717407, validation accuracy:0.6745
epoch:3, train loss:1.0622888967517492, train accuracy:0.710574074074074, validation loss:0.9627497234925663, validation accuracy:0.7225
epoch:4, train loss:0.8931618638618672, train accuracy:0.7396111111111111, validation loss:0.8338552646057472, validation accuracy:0.7485
epoch:5, train loss:0.7924499703687473, train accuracy:0.7556851851851852, validation loss:0.754961577521582, validation accuracy:0.7616666666666667


0,1
train accuracy,▁▅▇██
train loss,█▄▃▂▁
validation accuracy,▁▅▇██
validation loss,█▅▃▂▁

0,1
train accuracy,0.75569
train loss,0.79245
validation accuracy,0.76167
validation loss,0.75496


[34m[1mwandb[0m: Agent Starting Run: iodw9rnq with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	number_of_hidden_layer: 3
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	size_of_hidden_layer: 32
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_initialization: random


epoch:1, train loss:13.343452642924714, train accuracy:0.1377962962962963, validation loss:11.395466226922657, validation accuracy:0.1525
epoch:2, train loss:9.769201102606043, train accuracy:0.18724074074074074, validation loss:8.20878786648465, validation accuracy:0.23433333333333334
epoch:3, train loss:7.211856384519156, train accuracy:0.2819259259259259, validation loss:6.09457397932195, validation accuracy:0.3175
epoch:4, train loss:5.460665601177655, train accuracy:0.3532037037037037, validation loss:4.770824357465145, validation accuracy:0.38333333333333336
epoch:5, train loss:4.51738397260716, train accuracy:0.4051666666666667, validation loss:4.08480043814051, validation accuracy:0.43533333333333335


0,1
train accuracy,▁▂▅▇█
train loss,█▅▃▂▁
validation accuracy,▁▃▅▇█
validation loss,█▅▃▂▁

0,1
train accuracy,0.40517
train loss,4.51738
validation accuracy,0.43533
validation loss,4.0848


[34m[1mwandb[0m: Agent Starting Run: bbwdter4 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	number_of_hidden_layer: 5
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	size_of_hidden_layer: 32
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_initialization: Xavier


epoch:1, train loss:1.0795053966617212, train accuracy:0.634425925925926, validation loss:0.6890046001715449, validation accuracy:0.7553333333333333
epoch:2, train loss:0.6324685150109904, train accuracy:0.7738333333333334, validation loss:0.5827150905177431, validation accuracy:0.795
epoch:3, train loss:0.565042568603253, train accuracy:0.8040185185185185, validation loss:0.5390167626998343, validation accuracy:0.8185
epoch:4, train loss:0.5334726686189296, train accuracy:0.8190925925925926, validation loss:0.517584772769997, validation accuracy:0.8215
epoch:5, train loss:0.5127295623775378, train accuracy:0.8292962962962963, validation loss:0.4998111395442674, validation accuracy:0.8325


0,1
train accuracy,▁▆▇██
train loss,█▂▂▁▁
validation accuracy,▁▅▇▇█
validation loss,█▄▂▂▁

0,1
train accuracy,0.8293
train loss,0.51273
validation accuracy,0.8325
validation loss,0.49981


[34m[1mwandb[0m: Agent Starting Run: 4dqvryf0 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	number_of_hidden_layer: 5
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	size_of_hidden_layer: 32
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_initialization: Xavier


epoch:1, train loss:2.0912473542729737, train accuracy:0.25033333333333335, validation loss:1.6455957212281977, validation accuracy:0.5165
epoch:2, train loss:1.2436931223466292, train accuracy:0.6271851851851852, validation loss:0.9663438118961027, validation accuracy:0.6923333333333334
epoch:3, train loss:0.8913637035079969, train accuracy:0.7140925925925926, validation loss:0.8311979054778015, validation accuracy:0.73
epoch:4, train loss:0.8233354650366262, train accuracy:0.7386111111111111, validation loss:0.799511932967088, validation accuracy:0.7443333333333333
epoch:5, train loss:0.8084520736522053, train accuracy:0.7515, validation loss:0.7898156003996002, validation accuracy:0.7545


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train accuracy,▁▆▇██
train loss,█▃▁▁▁
validation accuracy,▁▆▇██
validation loss,█▂▁▁▁

0,1
train accuracy,0.7515
train loss,0.80845
validation accuracy,0.7545
validation loss,0.78982


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 6djtooj2 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	number_of_hidden_layer: 5
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	size_of_hidden_layer: 32
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_initialization: Xavier


epoch:1, train loss:1.3228535210530454, train accuracy:0.5675555555555556, validation loss:0.7438028525762325, validation accuracy:0.754
epoch:2, train loss:0.6467335404020383, train accuracy:0.7714074074074074, validation loss:0.5809659711278641, validation accuracy:0.7956666666666666
epoch:3, train loss:0.5542496559478393, train accuracy:0.8022777777777778, validation loss:0.5170488916280055, validation accuracy:0.8191666666666667
epoch:4, train loss:0.5156155856352986, train accuracy:0.8201851851851852, validation loss:0.49323995238782814, validation accuracy:0.8291666666666667
epoch:5, train loss:0.49466603256857344, train accuracy:0.8299814814814814, validation loss:0.4763043396459254, validation accuracy:0.8308333333333333


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train accuracy,▁▆▇██
train loss,█▂▂▁▁
validation accuracy,▁▅▇██
validation loss,█▄▂▁▁

0,1
train accuracy,0.82998
train loss,0.49467
validation accuracy,0.83083
validation loss,0.4763


[34m[1mwandb[0m: Agent Starting Run: 1htki50d with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	number_of_hidden_layer: 5
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	size_of_hidden_layer: 32
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_initialization: Xavier


epoch:1, train loss:2.141617810015851, train accuracy:0.2137962962962963, validation loss:1.8042871294295126, validation accuracy:0.43766666666666665
epoch:2, train loss:1.5049022711584032, train accuracy:0.5790740740740741, validation loss:1.262690530438377, validation accuracy:0.6335
epoch:3, train loss:1.1066911908382568, train accuracy:0.6561481481481481, validation loss:1.0163215306868867, validation accuracy:0.661
epoch:4, train loss:0.9250243374747351, train accuracy:0.6803888888888889, validation loss:0.8938417703626397, validation accuracy:0.689
epoch:5, train loss:0.8415339805576642, train accuracy:0.7056851851851852, validation loss:0.8498269590924923, validation accuracy:0.7153333333333334


0,1
train accuracy,▁▆▇██
train loss,█▅▂▁▁
validation accuracy,▁▆▇▇█
validation loss,█▄▂▁▁

0,1
train accuracy,0.70569
train loss,0.84153
validation accuracy,0.71533
validation loss,0.84983


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: b3osa96r with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	number_of_hidden_layer: 5
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	size_of_hidden_layer: 128
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_initialization: Xavier


epoch:1, train loss:0.8237126347743972, train accuracy:0.7375185185185186, validation loss:0.7000956038484706, validation accuracy:0.8056666666666666
epoch:2, train loss:0.7146046511083347, train accuracy:0.8088148148148148, validation loss:0.6904742052344531, validation accuracy:0.8175
epoch:3, train loss:0.7276326860971586, train accuracy:0.8161296296296296, validation loss:0.6965941001884555, validation accuracy:0.8236666666666667
epoch:4, train loss:0.7419912084386128, train accuracy:0.820962962962963, validation loss:0.7267868861362332, validation accuracy:0.8208333333333333
epoch:5, train loss:0.7561695332607258, train accuracy:0.8257407407407408, validation loss:0.7069271419545035, validation accuracy:0.8361666666666666


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train accuracy,▁▇▇██
train loss,█▁▂▃▄
validation accuracy,▁▄▅▄█
validation loss,▃▁▂█▄

0,1
train accuracy,0.82574
train loss,0.75617
validation accuracy,0.83617
validation loss,0.70693


[34m[1mwandb[0m: Agent Starting Run: rmkyng1r with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	number_of_hidden_layer: 5
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	size_of_hidden_layer: 128
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_initialization: Xavier


epoch:1, train loss:0.8042676286232019, train accuracy:0.7352962962962963, validation loss:0.7125622081376861, validation accuracy:0.784
epoch:2, train loss:0.6900628376015044, train accuracy:0.8112592592592592, validation loss:0.715956892432881, validation accuracy:0.8145
epoch:3, train loss:0.7017863586887985, train accuracy:0.8231111111111111, validation loss:0.7411639204829673, validation accuracy:0.8188333333333333
epoch:4, train loss:0.718973178577908, train accuracy:0.8277222222222222, validation loss:0.6844911769181652, validation accuracy:0.832
epoch:5, train loss:0.7344314466189147, train accuracy:0.8327407407407408, validation loss:0.7264208246951315, validation accuracy:0.8251666666666667


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train accuracy,▁▆▇██
train loss,█▁▂▃▄
validation accuracy,▁▅▆█▇
validation loss,▄▅█▁▆

0,1
train accuracy,0.83274
train loss,0.73443
validation accuracy,0.82517
validation loss,0.72642


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 3e4af5vr with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	number_of_hidden_layer: 5
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	size_of_hidden_layer: 32
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_initialization: Xavier


epoch:1, train loss:1.4094066640989484, train accuracy:0.483, validation loss:0.8963588020553449, validation accuracy:0.688
epoch:2, train loss:0.8493509872455963, train accuracy:0.7245555555555555, validation loss:0.8084906526082479, validation accuracy:0.7451666666666666
epoch:3, train loss:0.8133550671860057, train accuracy:0.7595555555555555, validation loss:0.7986167999224288, validation accuracy:0.7595
epoch:4, train loss:0.8067722320184315, train accuracy:0.7781296296296296, validation loss:0.8059904957202357, validation accuracy:0.7831666666666667
epoch:5, train loss:0.8089973291509901, train accuracy:0.7878888888888889, validation loss:0.8158989646539045, validation accuracy:0.789


0,1
train accuracy,▁▇▇██
train loss,█▁▁▁▁
validation accuracy,▁▅▆██
validation loss,█▂▁▂▂

0,1
train accuracy,0.78789
train loss,0.809
validation accuracy,0.789
validation loss,0.8159


[34m[1mwandb[0m: Agent Starting Run: n9wnv3c5 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	number_of_hidden_layer: 5
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	size_of_hidden_layer: 128
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_initialization: Xavier


epoch:1, train loss:0.6854448337294661, train accuracy:0.7662592592592593, validation loss:0.4752404779326549, validation accuracy:0.8275
epoch:2, train loss:0.4584546904104416, train accuracy:0.8405185185185186, validation loss:0.42034248807740104, validation accuracy:0.8538333333333333
epoch:3, train loss:0.4201586557159391, train accuracy:0.8534814814814815, validation loss:0.41469425848082864, validation accuracy:0.8535
epoch:4, train loss:0.4033797312330614, train accuracy:0.859574074074074, validation loss:0.3857728841790507, validation accuracy:0.8638333333333333
epoch:5, train loss:0.3952465714434801, train accuracy:0.8641481481481481, validation loss:0.382929301115162, validation accuracy:0.8703333333333333


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train accuracy,▁▆▇██
train loss,█▃▂▁▁
validation accuracy,▁▅▅▇█
validation loss,█▄▃▁▁

0,1
train accuracy,0.86415
train loss,0.39525
validation accuracy,0.87033
validation loss,0.38293


[34m[1mwandb[0m: Agent Starting Run: xyw4lnhc with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	number_of_hidden_layer: 4
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	size_of_hidden_layer: 128
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_initialization: Xavier


epoch:1, train loss:0.7833518731452311, train accuracy:0.7648333333333334, validation loss:0.7100560411692466, validation accuracy:0.807
epoch:2, train loss:0.7407015579142708, train accuracy:0.8215925925925925, validation loss:0.7098677376757788, validation accuracy:0.8343333333333334
epoch:3, train loss:0.7587709601556472, train accuracy:0.8331851851851851, validation loss:0.7063167172553272, validation accuracy:0.8385
epoch:4, train loss:0.7477359975071415, train accuracy:0.8382777777777778, validation loss:0.8105559155985207, validation accuracy:0.8363333333333334
epoch:5, train loss:0.7749747186917436, train accuracy:0.8427222222222223, validation loss:0.8974629238777305, validation accuracy:0.8406666666666667


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train accuracy,▁▆▇██
train loss,█▁▄▂▇
validation accuracy,▁▇█▇█
validation loss,▁▁▁▅█

0,1
train accuracy,0.84272
train loss,0.77497
validation accuracy,0.84067
validation loss,0.89746


[34m[1mwandb[0m: Agent Starting Run: 3etupl53 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	number_of_hidden_layer: 4
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	size_of_hidden_layer: 128
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_initialization: Xavier


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01718333333337796, max=1.0)…

epoch:1, train loss:0.8095997665489583, train accuracy:0.7539444444444444, validation loss:0.8352467665141263, validation accuracy:0.7963333333333333
epoch:2, train loss:0.7894140962068873, train accuracy:0.8124074074074074, validation loss:0.938989800759212, validation accuracy:0.7913333333333333
epoch:3, train loss:0.779316970484511, train accuracy:0.8260555555555555, validation loss:0.8057772187029386, validation accuracy:0.8223333333333334
epoch:4, train loss:0.7745930448836162, train accuracy:0.8316111111111111, validation loss:0.7264136825188102, validation accuracy:0.8396666666666667
epoch:5, train loss:0.7852044639917282, train accuracy:0.8369444444444445, validation loss:0.8542300025959294, validation accuracy:0.834


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train accuracy,▁▆▇██
train loss,█▄▂▁▃
validation accuracy,▂▁▅█▇
validation loss,▅█▄▁▅

0,1
train accuracy,0.83694
train loss,0.7852
validation accuracy,0.834
validation loss,0.85423


[34m[1mwandb[0m: Agent Starting Run: 3ccb6m38 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	number_of_hidden_layer: 5
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	size_of_hidden_layer: 32
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_initialization: Xavier


epoch:1, train loss:0.9264518043659488, train accuracy:0.6962592592592592, validation loss:0.7551125203268881, validation accuracy:0.7706666666666667
epoch:2, train loss:0.7258615944242947, train accuracy:0.7895925925925926, validation loss:0.7178610179394368, validation accuracy:0.7983333333333333
epoch:3, train loss:0.742171230610622, train accuracy:0.8018703703703703, validation loss:0.7322252418171943, validation accuracy:0.812
epoch:4, train loss:0.7651931190839385, train accuracy:0.8098888888888889, validation loss:0.7381197351251788, validation accuracy:0.817
epoch:5, train loss:0.7917249710333287, train accuracy:0.8156666666666667, validation loss:0.7661131252856508, validation accuracy:0.8191666666666667


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train accuracy,▁▆▇██
train loss,█▁▂▂▃
validation accuracy,▁▅▇██
validation loss,▆▁▃▄█

0,1
train accuracy,0.81567
train loss,0.79172
validation accuracy,0.81917
validation loss,0.76611


[34m[1mwandb[0m: Agent Starting Run: nbi56kwm with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	number_of_hidden_layer: 5
[34m[1mwandb[0m: 	optimizer: nestrov
[34m[1mwandb[0m: 	size_of_hidden_layer: 32
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_initialization: Xavier


epoch:1, train loss:2.2730318468670214, train accuracy:0.20951851851851852, validation loss:2.147460539324381, validation accuracy:0.2165
epoch:2, train loss:1.8513488673319265, train accuracy:0.3209814814814815, validation loss:1.5073828286060276, validation accuracy:0.4761666666666667
epoch:3, train loss:1.2527376701259334, train accuracy:0.5894259259259259, validation loss:1.0654651556722676, validation accuracy:0.6405
epoch:4, train loss:0.9574733834396992, train accuracy:0.6859814814814815, validation loss:0.8704802971600383, validation accuracy:0.699
epoch:5, train loss:0.8079783950141023, train accuracy:0.7253333333333334, validation loss:0.7504433558570881, validation accuracy:0.7371666666666666


0,1
train accuracy,▁▃▆▇█
train loss,█▆▃▂▁
validation accuracy,▁▄▇▇█
validation loss,█▅▃▂▁

0,1
train accuracy,0.72533
train loss,0.80798
validation accuracy,0.73717
validation loss,0.75044


[34m[1mwandb[0m: Agent Starting Run: do63d723 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	number_of_hidden_layer: 4
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	size_of_hidden_layer: 64
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_initialization: Xavier


epoch:1, train loss:1.957420557154426, train accuracy:0.3378703703703704, validation loss:1.5605632467673225, validation accuracy:0.5141666666666667
epoch:2, train loss:1.3205233369927805, train accuracy:0.5776666666666667, validation loss:1.1027470899995389, validation accuracy:0.6493333333333333
epoch:3, train loss:0.9670671659941379, train accuracy:0.6580555555555555, validation loss:0.8829804255040895, validation accuracy:0.6633333333333333
epoch:4, train loss:0.8410457960245548, train accuracy:0.6722037037037037, validation loss:0.8242988820943109, validation accuracy:0.6768333333333333
epoch:5, train loss:0.8064678220719406, train accuracy:0.6853518518518519, validation loss:0.8039897691993398, validation accuracy:0.6886666666666666


0,1
train accuracy,▁▆▇██
train loss,█▄▂▁▁
validation accuracy,▁▆▇██
validation loss,█▄▂▁▁

0,1
train accuracy,0.68535
train loss,0.80647
validation accuracy,0.68867
validation loss,0.80399


[34m[1mwandb[0m: Agent Starting Run: 9kro00cu with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	number_of_hidden_layer: 5
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	size_of_hidden_layer: 64
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_initialization: Xavier


epoch:1, train loss:0.9462779821780891, train accuracy:0.6716851851851852, validation loss:0.8217865666646078, validation accuracy:0.7655
epoch:2, train loss:0.8144696007017364, train accuracy:0.7852222222222223, validation loss:0.8060847918041646, validation accuracy:0.787
epoch:3, train loss:0.8585035370216221, train accuracy:0.7996481481481481, validation loss:0.8475291274365416, validation accuracy:0.804
epoch:4, train loss:0.8766488568118427, train accuracy:0.8087592592592593, validation loss:0.8649834908074955, validation accuracy:0.8036666666666666
epoch:5, train loss:0.8903542123559585, train accuracy:0.8137592592592593, validation loss:0.8514757108293577, validation accuracy:0.8195


0,1
train accuracy,▁▇▇██
train loss,█▁▃▄▅
validation accuracy,▁▄▆▆█
validation loss,▃▁▆█▆

0,1
train accuracy,0.81376
train loss,0.89035
validation accuracy,0.8195
validation loss,0.85148


[34m[1mwandb[0m: Agent Starting Run: 53rbku75 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	number_of_hidden_layer: 5
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	size_of_hidden_layer: 64
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_initialization: Xavier


epoch:1, train loss:1.938069900377563, train accuracy:0.3380185185185185, validation loss:1.497391982429797, validation accuracy:0.4603333333333333
epoch:2, train loss:1.211233564676678, train accuracy:0.5783518518518519, validation loss:1.008533769251504, validation accuracy:0.6556666666666666
epoch:3, train loss:0.9022937827504424, train accuracy:0.6788333333333333, validation loss:0.8329626565317119, validation accuracy:0.702
epoch:4, train loss:0.782894670800924, train accuracy:0.7245555555555555, validation loss:0.7496819129350648, validation accuracy:0.7425
epoch:5, train loss:0.7147644346098105, train accuracy:0.7525185185185185, validation loss:0.6930237813440993, validation accuracy:0.7638333333333334


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train accuracy,▁▅▇██
train loss,█▄▂▁▁
validation accuracy,▁▆▇██
validation loss,█▄▂▁▁

0,1
train accuracy,0.75252
train loss,0.71476
validation accuracy,0.76383
validation loss,0.69302


[34m[1mwandb[0m: Agent Starting Run: xfpt4uzk with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	number_of_hidden_layer: 5
[34m[1mwandb[0m: 	optimizer: nestrov
[34m[1mwandb[0m: 	size_of_hidden_layer: 32
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_initialization: Xavier


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016916666666656966, max=1.0…

epoch:1, train loss:2.2520699021291515, train accuracy:0.18464814814814814, validation loss:2.100650432648524, validation accuracy:0.25583333333333336
epoch:2, train loss:1.8137664678643404, train accuracy:0.32672222222222225, validation loss:1.5383847560764334, validation accuracy:0.44416666666666665
epoch:3, train loss:1.2879625608880807, train accuracy:0.5443518518518519, validation loss:1.0840124648920395, validation accuracy:0.6196666666666667
epoch:4, train loss:0.9795832820943616, train accuracy:0.645, validation loss:0.8915176924250272, validation accuracy:0.6721666666666667
epoch:5, train loss:0.8472156248308592, train accuracy:0.6970740740740741, validation loss:0.802304402773518, validation accuracy:0.7011666666666667


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train accuracy,▁▃▆▇█
train loss,█▆▃▂▁
validation accuracy,▁▄▇██
validation loss,█▅▃▁▁

0,1
train accuracy,0.69707
train loss,0.84722
validation accuracy,0.70117
validation loss,0.8023


[34m[1mwandb[0m: Agent Starting Run: y65l4ndy with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	number_of_hidden_layer: 5
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	size_of_hidden_layer: 128
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_initialization: Xavier


epoch:1, train loss:0.8363303151345036, train accuracy:0.7486296296296296, validation loss:0.7755799311144369, validation accuracy:0.8158333333333333
epoch:2, train loss:0.7838021256264233, train accuracy:0.8153703703703704, validation loss:0.7451645576473998, validation accuracy:0.8335
epoch:3, train loss:0.7954106430573266, train accuracy:0.8256851851851852, validation loss:0.8372568451366169, validation accuracy:0.8291666666666667
epoch:4, train loss:0.8258448950393849, train accuracy:0.8295925925925925, validation loss:0.7957808299712181, validation accuracy:0.8405
epoch:5, train loss:0.8384702431631119, train accuracy:0.8336296296296296, validation loss:0.813979547190952, validation accuracy:0.8455


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train accuracy,▁▆▇██
train loss,█▁▂▆█
validation accuracy,▁▅▄▇█
validation loss,▃▁█▅▆

0,1
train accuracy,0.83363
train loss,0.83847
validation accuracy,0.8455
validation loss,0.81398


[34m[1mwandb[0m: Agent Starting Run: hkigture with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	number_of_hidden_layer: 4
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	size_of_hidden_layer: 128
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_initialization: Xavier


epoch:1, train loss:0.9241266376489358, train accuracy:0.6822407407407407, validation loss:0.6816739602251831, validation accuracy:0.7726666666666666
epoch:2, train loss:0.7170168019539432, train accuracy:0.7867777777777778, validation loss:0.7370856956291165, validation accuracy:0.803
epoch:3, train loss:0.7380906169865765, train accuracy:0.8054814814814815, validation loss:0.6887219771762441, validation accuracy:0.814
epoch:4, train loss:0.7476546677520542, train accuracy:0.8129074074074074, validation loss:0.7211586703797919, validation accuracy:0.8148333333333333
epoch:5, train loss:0.7760931414153514, train accuracy:0.8183148148148148, validation loss:0.8003562365739215, validation accuracy:0.8043333333333333


0,1
train accuracy,▁▆▇██
train loss,█▁▂▂▃
validation accuracy,▁▆██▆
validation loss,▁▄▁▃█

0,1
train accuracy,0.81831
train loss,0.77609
validation accuracy,0.80433
validation loss,0.80036


[34m[1mwandb[0m: Agent Starting Run: wm54qxjz with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	number_of_hidden_layer: 3
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	size_of_hidden_layer: 128
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_initialization: Xavier


epoch:1, train loss:0.7725598945758048, train accuracy:0.7600925925925925, validation loss:0.7262629142581101, validation accuracy:0.8078333333333333
epoch:2, train loss:0.7308539760007282, train accuracy:0.8167777777777778, validation loss:0.6849817120503108, validation accuracy:0.8308333333333333
epoch:3, train loss:0.773353845305697, train accuracy:0.8247222222222222, validation loss:0.6954274440849886, validation accuracy:0.8316666666666667


[34m[1mwandb[0m: Network error (ConnectionError), entering retry loop.


epoch:4, train loss:0.7957772098407764, train accuracy:0.8317777777777777, validation loss:0.7486404841704205, validation accuracy:0.839
epoch:5, train loss:0.811012214421897, train accuracy:0.8360740740740741, validation loss:0.7903786097931428, validation accuracy:0.843


0,1
train accuracy,▁▆▇██
train loss,▅▁▅▇█
validation accuracy,▁▆▆▇█
validation loss,▄▁▂▅█

0,1
train accuracy,0.83607
train loss,0.81101
validation accuracy,0.843
validation loss,0.79038


[34m[1mwandb[0m: Agent Starting Run: ao320wjv with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	number_of_hidden_layer: 5
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	size_of_hidden_layer: 128
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_initialization: Xavier


epoch:1, train loss:0.8349614330892571, train accuracy:0.727462962962963, validation loss:0.6918913438865528, validation accuracy:0.8021666666666667
epoch:2, train loss:0.7166338128285495, train accuracy:0.8099074074074074, validation loss:0.796011373200869, validation accuracy:0.8128333333333333
epoch:3, train loss:0.7233171133983421, train accuracy:0.8204814814814815, validation loss:0.7657387600242418, validation accuracy:0.8223333333333334
epoch:4, train loss:0.7272475621217189, train accuracy:0.8277777777777777, validation loss:0.6696451433784657, validation accuracy:0.838
epoch:5, train loss:0.7348373403207223, train accuracy:0.8316666666666667, validation loss:0.79826757861975, validation accuracy:0.8296666666666667


VBox(children=(Label(value='0.001 MB of 0.018 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.061833…

0,1
train accuracy,▁▇▇██
train loss,█▁▁▂▂
validation accuracy,▁▃▅█▆
validation loss,▂█▆▁█

0,1
train accuracy,0.83167
train loss,0.73484
validation accuracy,0.82967
validation loss,0.79827


[34m[1mwandb[0m: Agent Starting Run: id8vl73p with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	number_of_hidden_layer: 3
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	size_of_hidden_layer: 128
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_initialization: Xavier


epoch:1, train loss:0.7888971791794431, train accuracy:0.7423888888888889, validation loss:0.6465107708984326, validation accuracy:0.7935
epoch:2, train loss:0.6482745556458877, train accuracy:0.8100740740740741, validation loss:0.6193487251911246, validation accuracy:0.8268333333333333
epoch:3, train loss:0.6613289556624964, train accuracy:0.8220555555555555, validation loss:0.6533638783878032, validation accuracy:0.8205
epoch:4, train loss:0.6842332619880621, train accuracy:0.828925925925926, validation loss:0.6632757037943675, validation accuracy:0.8368333333333333
epoch:5, train loss:0.6931145367614523, train accuracy:0.8327037037037037, validation loss:0.811554688482032, validation accuracy:0.819


0,1
train accuracy,▁▆▇██
train loss,█▁▂▃▃
validation accuracy,▁▆▅█▅
validation loss,▂▁▂▃█

0,1
train accuracy,0.8327
train loss,0.69311
validation accuracy,0.819
validation loss,0.81155


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: s8t85rt6 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	number_of_hidden_layer: 4
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	size_of_hidden_layer: 128
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_initialization: Xavier


epoch:1, train loss:0.8331989486337239, train accuracy:0.7243888888888889, validation loss:0.6971393999689742, validation accuracy:0.7913333333333333
epoch:2, train loss:0.6929073491233678, train accuracy:0.8048333333333333, validation loss:0.6816019327396287, validation accuracy:0.8166666666666667
epoch:3, train loss:0.7140091796284428, train accuracy:0.8174814814814815, validation loss:0.7846050973343507, validation accuracy:0.8191666666666667
epoch:4, train loss:0.7371687746536036, train accuracy:0.8226481481481481, validation loss:0.7120551730281635, validation accuracy:0.833
epoch:5, train loss:0.7438114749760577, train accuracy:0.8290555555555555, validation loss:0.7635235971109403, validation accuracy:0.8246666666666667


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train accuracy,▁▆▇██
train loss,█▁▂▃▄
validation accuracy,▁▅▆█▇
validation loss,▂▁█▃▇

0,1
train accuracy,0.82906
train loss,0.74381
validation accuracy,0.82467
validation loss,0.76352


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
502 response executing GraphQL.

<html><head>
<meta http-equiv="content-type" content="text/html;charset=utf-8">
<title>502 Server Error</title>
</head>
<body text=#000000 bgcolor=#ffffff>
<h1>Error: Server Error</h1>
<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>
<h2></h2>
</body></html>

[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 47a3jrnc with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	number_of_hidden_layer: 5
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	size_of_hidden_layer: 128
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_initialization: Xavier


epoch:1, train loss:0.7540030274070251, train accuracy:0.7399444444444444, validation loss:0.5239493434167283, validation accuracy:0.8136666666666666
epoch:2, train loss:0.4806083551419472, train accuracy:0.830925925925926, validation loss:0.44945112818223554, validation accuracy:0.841
epoch:3, train loss:0.44214602438625134, train accuracy:0.8432777777777778, validation loss:0.4176578248526562, validation accuracy:0.8533333333333334
epoch:4, train loss:0.42088629012549167, train accuracy:0.8519444444444444, validation loss:0.4440770756720643, validation accuracy:0.8448333333333333
epoch:5, train loss:0.4134453187373555, train accuracy:0.8566296296296296, validation loss:0.42158734208808724, validation accuracy:0.8613333333333333


0,1
train accuracy,▁▆▇██
train loss,█▂▂▁▁
validation accuracy,▁▅▇▆█
validation loss,█▃▁▃▁

0,1
train accuracy,0.85663
train loss,0.41345
validation accuracy,0.86133
validation loss,0.42159


[34m[1mwandb[0m: Agent Starting Run: d44qtek7 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	number_of_hidden_layer: 3
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	size_of_hidden_layer: 128
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_initialization: Xavier


epoch:1, train loss:0.667493518282994, train accuracy:0.779537037037037, validation loss:0.4700452830873328, validation accuracy:0.8353333333333334
epoch:2, train loss:0.4518322182942846, train accuracy:0.840574074074074, validation loss:0.4238791103290023, validation accuracy:0.8531666666666666
epoch:3, train loss:0.41711922595337525, train accuracy:0.8532037037037037, validation loss:0.40491582963084916, validation accuracy:0.86
epoch:4, train loss:0.3989084673616135, train accuracy:0.859462962962963, validation loss:0.4016600281623518, validation accuracy:0.8601666666666666
epoch:5, train loss:0.38864114209851386, train accuracy:0.8647407407407407, validation loss:0.38927821179669114, validation accuracy:0.8656666666666667


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train accuracy,▁▆▇██
train loss,█▃▂▁▁
validation accuracy,▁▅▇▇█
validation loss,█▄▂▂▁

0,1
train accuracy,0.86474
train loss,0.38864
validation accuracy,0.86567
validation loss,0.38928


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: vyv9z9vl with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	number_of_hidden_layer: 3
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	size_of_hidden_layer: 128
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_initialization: Xavier


epoch:1, train loss:0.8434067830462618, train accuracy:0.7087592592592593, validation loss:0.6874328025170956, validation accuracy:0.7911666666666667
epoch:2, train loss:0.6746353449105454, train accuracy:0.8037407407407408, validation loss:0.6610112276580001, validation accuracy:0.8135
epoch:3, train loss:0.6886050052356213, train accuracy:0.8178888888888889, validation loss:0.6920221393419576, validation accuracy:0.8223333333333334
epoch:4, train loss:0.7059846402020648, train accuracy:0.8244259259259259, validation loss:0.7575761155728381, validation accuracy:0.8253333333333334
epoch:5, train loss:0.7258445724532303, train accuracy:0.8295370370370371, validation loss:0.7274477538896511, validation accuracy:0.836


0,1
train accuracy,▁▇▇██
train loss,█▁▂▂▃
validation accuracy,▁▄▆▆█
validation loss,▃▁▃█▆

0,1
train accuracy,0.82954
train loss,0.72584
validation accuracy,0.836
validation loss,0.72745


[34m[1mwandb[0m: Agent Starting Run: 8hqcez4a with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	number_of_hidden_layer: 3
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	size_of_hidden_layer: 128
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_initialization: Xavier


epoch:1, train loss:0.7954806870140272, train accuracy:0.7647222222222222, validation loss:0.8295067373197907, validation accuracy:0.7928333333333333
epoch:2, train loss:0.7535988081220403, train accuracy:0.8221851851851851, validation loss:0.725297040287681, validation accuracy:0.8381666666666666
epoch:3, train loss:0.7701385151825867, train accuracy:0.8321481481481482, validation loss:0.797080500679502, validation accuracy:0.8343333333333334
epoch:4, train loss:0.7799879013542241, train accuracy:0.8378888888888889, validation loss:0.7530577902245223, validation accuracy:0.846
epoch:5, train loss:0.7914431467872658, train accuracy:0.8412037037037037, validation loss:0.7386290602758098, validation accuracy:0.849


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train accuracy,▁▆▇██
train loss,█▁▄▅▇
validation accuracy,▁▇▆██
validation loss,█▁▆▃▂

0,1
train accuracy,0.8412
train loss,0.79144
validation accuracy,0.849
validation loss,0.73863


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: ugw535rd with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	number_of_hidden_layer: 4
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	size_of_hidden_layer: 128
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_initialization: Xavier


epoch:1, train loss:0.708199392186897, train accuracy:0.7696481481481482, validation loss:0.5032205083455491, validation accuracy:0.8313333333333334
epoch:2, train loss:0.4822863551691275, train accuracy:0.8392407407407407, validation loss:0.46015614478218914, validation accuracy:0.8535
epoch:3, train loss:0.44827264944276185, train accuracy:0.8521851851851852, validation loss:0.44062220595787194, validation accuracy:0.8561666666666666
epoch:4, train loss:0.4302627266880961, train accuracy:0.8596666666666667, validation loss:0.44793031295976904, validation accuracy:0.8508333333333333
epoch:5, train loss:0.42194418447522636, train accuracy:0.8649629629629629, validation loss:0.41611984227474214, validation accuracy:0.8711666666666666


0,1
train accuracy,▁▆▇██
train loss,█▂▂▁▁
validation accuracy,▁▅▅▄█
validation loss,█▅▃▄▁

0,1
train accuracy,0.86496
train loss,0.42194
validation accuracy,0.87117
validation loss,0.41612


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: ckkn4wur with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	number_of_hidden_layer: 5
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	size_of_hidden_layer: 128
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_initialization: Xavier


epoch:1, train loss:0.6993037031231185, train accuracy:0.7611481481481481, validation loss:0.49022794506448913, validation accuracy:0.8273333333333334
epoch:2, train loss:0.454871765065167, train accuracy:0.8406296296296296, validation loss:0.41542184411225214, validation accuracy:0.8538333333333333
epoch:3, train loss:0.415873635176631, train accuracy:0.8535, validation loss:0.4338884580380648, validation accuracy:0.8473333333333334
epoch:4, train loss:0.39886607037172045, train accuracy:0.8598333333333333, validation loss:0.39804067443995617, validation accuracy:0.864
epoch:5, train loss:0.3902627000151748, train accuracy:0.8643888888888889, validation loss:0.4022191928753815, validation accuracy:0.8668333333333333


0,1
train accuracy,▁▆▇██
train loss,█▂▂▁▁
validation accuracy,▁▆▅▇█
validation loss,█▂▄▁▁

0,1
train accuracy,0.86439
train loss,0.39026
validation accuracy,0.86683
validation loss,0.40222


[34m[1mwandb[0m: Agent Starting Run: 82wg1qtg with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	number_of_hidden_layer: 3
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	size_of_hidden_layer: 128
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_initialization: Xavier


epoch:1, train loss:0.8318098469219205, train accuracy:0.719925925925926, validation loss:0.7094479579254124, validation accuracy:0.7911666666666667
epoch:2, train loss:0.6873449446036934, train accuracy:0.8062222222222222, validation loss:0.6805614950258779, validation accuracy:0.8163333333333334
epoch:3, train loss:0.6966587623726833, train accuracy:0.8180740740740741, validation loss:0.6902034825730876, validation accuracy:0.8201666666666667
epoch:4, train loss:0.7165820332381732, train accuracy:0.8242592592592592, validation loss:0.6942133268713334, validation accuracy:0.8286666666666667
epoch:5, train loss:0.7414357330230238, train accuracy:0.8277037037037037, validation loss:0.6960130792702979, validation accuracy:0.8368333333333333


0,1
train accuracy,▁▇▇██
train loss,█▁▁▂▄
validation accuracy,▁▅▅▇█
validation loss,█▁▃▄▅

0,1
train accuracy,0.8277
train loss,0.74144
validation accuracy,0.83683
validation loss,0.69601


[34m[1mwandb[0m: Agent Starting Run: nkq8rvr8 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	number_of_hidden_layer: 4
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	size_of_hidden_layer: 128
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_initialization: Xavier


epoch:1, train loss:0.8008307621277222, train accuracy:0.7395185185185185, validation loss:0.6335533603487422, validation accuracy:0.8108333333333333
epoch:2, train loss:0.6861383179167921, train accuracy:0.8125185185185185, validation loss:0.6900220913200451, validation accuracy:0.8205
epoch:3, train loss:0.7087005461867962, train accuracy:0.8252222222222222, validation loss:0.7345480626506181, validation accuracy:0.8183333333333334
epoch:4, train loss:0.7146947228816481, train accuracy:0.8318518518518518, validation loss:0.7376103770090229, validation accuracy:0.834


[34m[1mwandb[0m: Network error (ConnectionError), entering retry loop.


epoch:5, train loss:0.7194159166099839, train accuracy:0.8342222222222222, validation loss:0.7074875723924342, validation accuracy:0.842


0,1
train accuracy,▁▆▇██
train loss,█▁▂▃▃
validation accuracy,▁▃▃▆█
validation loss,▁▅██▆

0,1
train accuracy,0.83422
train loss,0.71942
validation accuracy,0.842
validation loss,0.70749


[34m[1mwandb[0m: Agent Starting Run: uvcw7dhy with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	number_of_hidden_layer: 3
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	size_of_hidden_layer: 128
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_initialization: Xavier


epoch:1, train loss:0.6849805687563485, train accuracy:0.7697777777777778, validation loss:0.49217342717690815, validation accuracy:0.8268333333333333
epoch:2, train loss:0.4538480031167666, train accuracy:0.8403703703703703, validation loss:0.41640914202797197, validation accuracy:0.8525
epoch:3, train loss:0.4185574913677427, train accuracy:0.8537037037037037, validation loss:0.41293827127738303, validation accuracy:0.8565
epoch:4, train loss:0.3988106077638303, train accuracy:0.8603703703703703, validation loss:0.3876560393212241, validation accuracy:0.8648333333333333
epoch:5, train loss:0.3850889049086975, train accuracy:0.8649444444444444, validation loss:0.3909001265914625, validation accuracy:0.8623333333333333


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train accuracy,▁▆▇██
train loss,█▃▂▁▁
validation accuracy,▁▆▆██
validation loss,█▃▃▁▁

0,1
train accuracy,0.86494
train loss,0.38509
validation accuracy,0.86233
validation loss,0.3909


[34m[1mwandb[0m: Agent Starting Run: phtvd6mo with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	number_of_hidden_layer: 5
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	size_of_hidden_layer: 128
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_initialization: Xavier


epoch:1, train loss:0.8599916037273996, train accuracy:0.7383333333333333, validation loss:0.7347011112017289, validation accuracy:0.809
epoch:2, train loss:0.8456985168343978, train accuracy:0.8064629629629629, validation loss:0.8046632872216161, validation accuracy:0.8263333333333334
