# Import libraries

In [None]:
import random
import time
import warnings
import numpy as np
import matplotlib.pyplot as plt
from numpy.core.fromnumeric import size
%matplotlib inline

# Import Fashion-MNIST

In [None]:
from tensorflow.keras.datasets import fashion_mnist

# Load datasets

In [None]:
# Fetch the Fashion-MNIST train and test splits; each split is an
# (images, labels) pair.
train_split, test_split = fashion_mnist.load_data()
x_train, y_train = train_split
x_test, y_test = test_split

# Plot a 2x2 grid of four random Fashion-MNIST samples

In [None]:
# Show four randomly chosen training images in a 2x2 grid.
# Indices are drawn without replacement from the actual length of x_train
# instead of a hard-coded 60000, so the cell also works on a subset.
rand_list = random.sample(range(len(x_train)), 4)
for i, sample_index in enumerate(rand_list):
    # Subplot positions are 1-based: 1..4 fill the grid row by row.
    plt.subplot(2, 2, i + 1)
    plt.imshow(x_train[sample_index])
plt.show()

# Implementation of activation functions

### 1-1 Sigmoid

In [None]:
def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)) element-wise.

    The exponential is only ever evaluated on non-positive values, so large
    negative inputs no longer overflow ``np.exp``.

    Args:
        x: scalar or array-like of real numbers.

    Returns:
        A NumPy scalar for scalar input, otherwise an array shaped like ``x``.
    """
    x = np.asarray(x, dtype=float)
    # exp(-|x|) lies in (0, 1], so it cannot overflow.
    decay = np.exp(-np.abs(x))
    # For x >= 0 use 1/(1+e^-x); for x < 0 use the equivalent e^x/(1+e^x).
    result = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () turns a 0-d array back into a scalar and leaves
    # higher-dimensional arrays untouched.
    return result[()]

### 1-2 ReLU

In [None]:
def relu(x):
    """Rectified linear unit: element-wise maximum of 0.0 and ``x``."""
    floor = 0.0
    return np.maximum(floor, x)

### 1-3 Softmax

In [None]:
def softmax(x, axis=None):
    """Return the softmax of ``x``.

    Args:
        x: array-like of scores.
        axis: axis along which the outputs are normalised to sum to 1.
            The default ``None`` normalises over all elements, which is the
            right choice for a single 1-D score vector. Pass e.g. ``axis=1``
            to normalise each row of a (batch, classes) array independently.

    Returns:
        Array with the same shape as ``x``.
    """
    x = np.asarray(x, dtype=float)
    # Subtracting the maximum leaves the result unchanged mathematically but
    # keeps every exponent <= 0, so np.exp cannot overflow.
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=axis, keepdims=True)

# Implementation of activation function derivatives

### 2-1 Sigmoid Derivative

In [None]:
def Sigmoid_D(x):
    """Return ``x * (1 - x)`` element-wise.

    This equals the sigmoid derivative only when ``x`` is already the sigmoid
    output (the activation), not the pre-activation input.
    """
    complement = 1 - x
    return np.multiply(x, complement)

### 2-2 ReLU Derivative

In [None]:
def relu_D(x):
    """Return 1 where ``x`` is strictly positive and 0 elsewhere."""
    is_positive = x > 0
    # Multiplying by 1 converts the boolean mask to integers.
    return 1 * is_positive

### 2-3 Softmax Derivative

In [None]:
def Softmax_D(x):
    """Return ``s * (1 - s)`` where ``s`` is the softmax of ``x`` along axis 0.

    The result is the diagonal of the softmax Jacobian; the off-diagonal
    terms are not computed. The shift uses the maximum over all of ``x``.
    """
    shifted_exps = np.exp(x - x.max())
    probs = shifted_exps / np.sum(shifted_exps, axis=0)
    return probs * (1 - probs)

# Loss function

### Categorical Cross entropy

In [None]:
def Multi_cross_entropy(y, ypred):
    """Return the categorical cross-entropy ``-sum(y * log(ypred))``.

    Terms whose target ``y`` is zero contribute exactly 0, even when the
    matching prediction is 0. Evaluating ``0 * log(0)`` directly would give
    NaN and poison the whole loss.

    Args:
        y: target distribution (e.g. a one-hot vector).
        ypred: predicted probabilities, same shape as ``y``.

    Returns:
        The loss summed along axis 0: a scalar for 1-D inputs.
    """
    y = np.asarray(y, dtype=float)
    ypred = np.asarray(ypred, dtype=float)
    contributes = y != 0
    # Where the target is zero, substitute 1.0 so that log() yields 0 instead
    # of -inf; the product with y is 0 there anyway.
    safe_pred = np.where(contributes, ypred, 1.0)
    return -np.sum(y * np.log(safe_pred), axis=0)

# Implementation of SGD and Momentum

### SGD Optimizer

In [None]:
def SGD(deri_b1,deri_b2,deri_b3,deri_w1,deri_w2,deri_w3,lr,b1,b2,b3,w1,w2,w3):
    """Apply one plain gradient-descent step: ``param -= lr * gradient``.

    Args:
        deri_b1, deri_b2, deri_b3: gradients of the three bias vectors.
        deri_w1, deri_w2, deri_w3: gradients of the three weight matrices.
        lr: learning rate.
        b1, b2, b3, w1, w2, w3: parameters to update.

    Returns:
        The tuple ``(b1, b2, b3, w1, w2, w3)`` after the step. NumPy arrays
        are also modified in place, so callers holding the arrays see the
        update without using the return value.
    """
    # Operate on the arguments directly; this is a free function, so there
    # is no ``self`` to read the parameters from.
    b3 -= lr*deri_b3
    w3 -= lr*deri_w3
    b2 -= lr*deri_b2
    w2 -= lr*deri_w2
    b1 -= lr*deri_b1
    w1 -= lr*deri_w1
    return b1, b2, b3, w1, w2, w3

### Momentum Optimizer

In [None]:
def Momentum(deri_b1,deri_b2,deri_b3,deri_w1,deri_w2,deri_w3,momentum_b1,momentum_b2,momentum_b3,
            momentum_w1,momentum_w2,momentum_w3,beta1,beta2,b1,b2,b3,w1,w2,w3,lr):
    """Apply one momentum step to every parameter.

    For each parameter ``p`` with gradient ``g`` and velocity ``m``:

        m = beta1 * m + (1 - beta1) * g
        p = p - lr * m

    Args:
        deri_*: gradients of the biases and weights.
        momentum_*: running velocities, one per parameter.
        beta1: decay factor of the running average.
        beta2: accepted for signature compatibility; not used here.
        b1, b2, b3, w1, w2, w3: parameters to update.
        lr: learning rate applied to the velocity.

    Returns:
        ``(momentum_b1, momentum_b2, momentum_b3, momentum_w1, momentum_w2,
        momentum_w3, b1, b2, b3, w1, w2, w3)`` after the step. NumPy arrays
        are also updated in place, so velocity state survives between calls.
    """
    def _step(velocity, gradient, parameter):
        # Augmented assignment mutates NumPy arrays in place, which is what
        # lets the caller's velocity buffer carry over to the next call.
        velocity *= beta1
        velocity += (1-beta1)*gradient
        parameter -= lr*velocity
        return velocity, parameter

    momentum_b3, b3 = _step(momentum_b3, deri_b3, b3)
    momentum_w3, w3 = _step(momentum_w3, deri_w3, w3)
    momentum_b2, b2 = _step(momentum_b2, deri_b2, b2)
    momentum_w2, w2 = _step(momentum_w2, deri_w2, w2)
    momentum_b1, b1 = _step(momentum_b1, deri_b1, b1)
    momentum_w1, w1 = _step(momentum_w1, deri_w1, w1)
    return (momentum_b1, momentum_b2, momentum_b3,
            momentum_w1, momentum_w2, momentum_w3,
            b1, b2, b3, w1, w2, w3)

# Create a Neural Network (784,128,64,10)

In [None]:
class MLP():
    """Fully connected network with two hidden layers and manual backprop.

    ``sizes`` lists the layer widths as [input, hidden1, hidden2, output].
    ``backProp`` accumulates gradients in the ``deri_*`` attributes and
    ``Update_weight`` applies them with the optimizer chosen at construction
    ('SGD', 'momentum' or 'adam') and then clears them.

    Hidden layers accept 'sigmoid' or 'relu'. The output error is computed as
    ``output - target``.
    """
    ################initialize parameters###########
    samples = 60000       # sample count used to size the loss history
    batch_size = 1        # samples processed per call to train()
    epochs = 3
    beta1 = 0.9           # decay of the first-moment (momentum) average
    beta2 = 0.999         # decay of the second-moment average (adam)
    constant = 1e-8       # added to the adam denominator to avoid division by zero
    lr = 1e-3
    Lambda = 0            # not referenced by any method of this class
    verbose = True        # print per-sample error/accuracy during train()

    # Parameter attribute names in the order the optimizers visit them.
    _PARAM_NAMES = ('b3', 'w3', 'b2', 'w2', 'b1', 'w1')

    def __init__(self, sizes, optimizer, hiddenActivation, sechiddenActivation, outputActivation):
        """Allocate parameters, gradient accumulators and optimizer state.

        Args:
            sizes: layer widths [input, hidden1, hidden2, output].
            optimizer: 'SGD', 'momentum' or 'adam'.
            hiddenActivation: activation of the first hidden layer.
            sechiddenActivation: activation of the second hidden layer.
            outputActivation: activation of the output layer.
        """
        self.sizes = sizes

        self.optimizer = optimizer
        self.hiddenActivation = hiddenActivation
        self.sechiddenActivation = sechiddenActivation
        self.outputActivation = outputActivation

        # Loss history: x holds 1-based batch numbers, y is filled by the
        # training driver with the value returned by train().
        self.x = np.arange(1, ((self.samples/self.batch_size)*self.epochs)+1)
        self.y = np.empty(int(((self.samples/self.batch_size)*self.epochs)))
        self.secondLayerNeurons = np.zeros(sizes[1])
        self.thirdLayerNeurons = np.zeros(sizes[2])
        self.outputNeurons = np.zeros(sizes[3])

        # Weights are drawn uniformly from [-1, 1); biases start at zero.
        self.w1 = np.random.rand(sizes[1], sizes[0]) * 2 - 1
        self.w2 = np.random.rand(sizes[2], sizes[1]) * 2 - 1
        self.w3 = np.random.rand(sizes[3], sizes[2]) * 2 - 1
        self.b1 = np.zeros([sizes[1]])
        self.b2 = np.zeros([sizes[2]])
        self.b3 = np.zeros([sizes[3]])

        # For every parameter create: deri_<p> (gradient accumulator),
        # momentum_<p> / RMS_prob_<p> (running moments) and their
        # bias-corrected *_revision counterparts.
        for name in self._PARAM_NAMES:
            shape = getattr(self, name).shape
            setattr(self, 'deri_' + name, np.zeros(shape))
            for state in ('momentum_', 'RMS_prob_'):
                setattr(self, state + name, np.zeros(shape))
                setattr(self, state + name + '_revision', np.zeros(shape))

        # Number of adam updates applied so far (drives bias correction).
        self.adam_step = 0

        self.hiddenLayerErrors = np.zeros(sizes[1])
        self.sechiddenLayerErrors = np.zeros(sizes[2])
        self.outputLayerErrors = np.zeros(sizes[3])
        self.cceoutputLayerErrors = np.zeros(sizes[3])
    ################define activations and their derivatives###########

    def Multi_cross_entropy(self, y, ypred):
        """Return ``-sum(y * log(ypred))``; zero-target terms contribute 0."""
        y = np.asarray(y, dtype=float)
        ypred = np.asarray(ypred, dtype=float)
        # Replace predictions at zero-target positions by 1 so log() gives 0
        # there instead of -inf (0 * -inf would be NaN).
        safe_pred = np.where(y != 0, ypred, 1.0)
        return -np.sum(y * np.log(safe_pred), axis=0)

    def sigmoid(self, x):
        """Logistic function, evaluated without overflowing np.exp."""
        x = np.asarray(x, dtype=float)
        decay = np.exp(-np.abs(x))  # always in (0, 1]
        return np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

    def sigmoidDerivative(self, x):
        """Sigmoid derivative expressed in terms of the sigmoid output ``x``."""
        return np.multiply(x, (1-x))

    def relu(self, x):
        """Element-wise max(0, x)."""
        return np.maximum(0.0, x)

    def reluDerivative(self, x):
        """1 where ``x`` is positive, 0 elsewhere."""
        return 1 * (x > 0)

    def softmax(self, x):
        """Softmax over all elements of ``x`` (shifted by max for stability)."""
        exps = np.exp(x - np.max(x))
        return exps / np.sum(exps)

    def _activate(self, name, z):
        """Apply the activation called ``name`` to the pre-activation ``z``."""
        if name == 'sigmoid':
            return self.sigmoid(z)
        if name == 'relu':
            return self.relu(z)
        if name == 'softmax':
            return self.softmax(z)
        raise ValueError("unsupported activation: %r" % (name,))

    def _activation_derivative(self, name, activation):
        """Derivative of a hidden activation, given the layer's output."""
        if name == 'sigmoid':
            return self.sigmoidDerivative(activation)
        if name == 'relu':
            return self.reluDerivative(activation)
        raise ValueError("no derivative implemented for hidden activation: %r" % (name,))

    ################forward path###########
    def forwardProp(self, inputs):
        """Propagate a flat input vector through all three layers.

        Results are stored in ``secondLayerNeurons``, ``thirdLayerNeurons``
        and ``outputNeurons``.
        """
        self.secondLayerNeurons = self._activate(
            self.hiddenActivation, self.w1 @ inputs + self.b1)
        self.thirdLayerNeurons = self._activate(
            self.sechiddenActivation, self.w2 @ self.secondLayerNeurons + self.b2)
        self.outputNeurons = self._activate(
            self.outputActivation, self.w3 @ self.thirdLayerNeurons + self.b3)

    ################backward path###########
    def backProp(self, inputs, revision_output):
        """Accumulate gradients for one sample.

        Must be called after ``forwardProp`` for the same ``inputs``.

        Args:
            inputs: flat input vector of length ``sizes[0]``.
            revision_output: target vector of length ``sizes[3]``.
        """
        self.cceoutputLayerErrors = self.Multi_cross_entropy(revision_output, self.outputNeurons)
        self.outputLayerErrors = np.subtract(self.outputNeurons, revision_output)
        # Push the error back through each layer: W^T @ delta, scaled by the
        # derivative of that layer's activation.
        self.sechiddenLayerErrors = np.multiply(
            np.dot(self.w3.T, self.outputLayerErrors),
            self._activation_derivative(self.sechiddenActivation, self.thirdLayerNeurons))
        self.hiddenLayerErrors = np.multiply(
            np.dot(self.w2.T, self.sechiddenLayerErrors),
            self._activation_derivative(self.hiddenActivation, self.secondLayerNeurons))

        # Weight gradient of a layer is the outer product of its error with
        # the activations feeding it.
        self.deri_b3 += self.outputLayerErrors
        self.deri_w3 += np.outer(self.outputLayerErrors, self.thirdLayerNeurons)
        self.deri_b2 += self.sechiddenLayerErrors
        self.deri_w2 += np.outer(self.sechiddenLayerErrors, self.secondLayerNeurons)
        self.deri_b1 += self.hiddenLayerErrors
        self.deri_w1 += np.outer(self.hiddenLayerErrors, inputs)

    ################update weight###########
    def Update_weight(self):
        """Apply the accumulated gradients with the configured optimizer.

        The gradient accumulators are reset to zero afterwards.

        Raises:
            ValueError: if ``self.optimizer`` is not a supported name.
        """
        if self.optimizer == 'momentum':
            for name in self._PARAM_NAMES:
                velocity = (self.beta1 * getattr(self, 'momentum_' + name)
                            + self.lr * getattr(self, 'deri_' + name))
                setattr(self, 'momentum_' + name, velocity)
                param = getattr(self, name)
                param -= velocity  # in place: updates self.<name>

        elif self.optimizer == 'SGD':
            for name in self._PARAM_NAMES:
                param = getattr(self, name)
                param -= self.lr * getattr(self, 'deri_' + name)

        elif self.optimizer == 'adam':
            # Bias correction depends on how many updates have been applied,
            # not on the batch size.
            self.adam_step = getattr(self, 'adam_step', 0) + 1
            first_correction = 1 - np.power(self.beta1, self.adam_step)
            second_correction = 1 - np.power(self.beta2, self.adam_step)
            for name in self._PARAM_NAMES:
                grad = getattr(self, 'deri_' + name)
                first = self.beta1 * getattr(self, 'momentum_' + name) + (1-self.beta1) * grad
                second = (self.beta2 * getattr(self, 'RMS_prob_' + name)
                          + (1-self.beta2) * np.square(grad))
                first_hat = first / first_correction
                second_hat = second / second_correction
                setattr(self, 'momentum_' + name, first)
                setattr(self, 'RMS_prob_' + name, second)
                setattr(self, 'momentum_' + name + '_revision', first_hat)
                setattr(self, 'RMS_prob_' + name + '_revision', second_hat)
                param = getattr(self, name)
                param -= self.lr * (first_hat / (np.sqrt(second_hat) + self.constant))

        else:
            raise ValueError("unsupported optimizer: %r" % (self.optimizer,))

        for name in self._PARAM_NAMES:
            setattr(self, 'deri_' + name, np.zeros(getattr(self, name).shape))

    ################training phase###########
    def train(self, trainImages, trainLabels):
        """Run forward/backward on one batch and apply a single update.

        At most ``batch_size`` samples are used; a shorter batch is processed
        as-is instead of indexing past its end.

        Args:
            trainImages: sequence of images; each is flattened before use.
            trainLabels: integer class labels aligned with ``trainImages``.

        Returns:
            Mean cross-entropy over the processed samples (0.0 if none).
        """
        err_sum = 0.0
        avg_err = 0.0
        revision = 0  # number of correctly classified samples so far

        for m in range(min(self.batch_size, len(trainImages))):
            revision_output = np.zeros([self.sizes[3]])
            revision_output[trainLabels[m]] = 1.0
            flat_image = trainImages[m].flatten()
            self.forwardProp(flat_image)
            self.backProp(flat_image, revision_output)

            if np.argmax(self.outputNeurons) == int(trainLabels[m]):
                revision += 1

            error = np.amax(np.absolute(self.cceoutputLayerErrors))
            err_sum += error
            avg_err = err_sum / (m+1)
            if self.verbose:
                acc = str(int((revision/(m+1))*100)) + '%'
                print("Error:")
                print(error)
                print("Accuracy:")
                print(acc)

        self.Update_weight()
        return avg_err

    ################prediction###########
    def predict(self, testImage):
        """Return ``(predicted class index, output vector)`` for a flat image."""
        self.forwardProp(testImage)
        return np.argmax(self.outputNeurons), self.outputNeurons



# Training phase based on sigmoid activation and SGD optimizer

In [None]:
# Train the (784, 128, 64, 10) network with sigmoid hidden layers and SGD,
# recording the mean cross-entropy on the train and test splits per epoch.
(train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()
# The network is trained on pixels divided by 255, so the test split must be
# scaled identically before it is used for evaluation.
train_images = train_images/255
test_images = test_images/255
inst_mlp = MLP([784, 128,64, 10], 'SGD', 'sigmoid','sigmoid', 'softmax')
train_err=[]
test_err=[]
batches_per_epoch = int(inst_mlp.samples/inst_mlp.batch_size)
for i in range (inst_mlp.epochs):
    print('----------------------'+'Epoch'+str(i+1)+'----------------------')
    for j in range(batches_per_epoch):
        print("Epoch", str(i+1) + "/" + str(inst_mlp.epochs) + ":" )
        start = j * inst_mlp.batch_size
        stop = start + inst_mlp.batch_size
        inst_mlp.y[j + i*batches_per_epoch] = inst_mlp.train(train_images[start:stop], train_labels[start:stop])
    size1 = test_images.shape[0]
    temp_err=0
    # A separate index keeps the epoch counter `i` intact.
    for k in range (size1):
        prediction,yp = inst_mlp.predict(test_images[k].flatten())
        yt = np.zeros([10])
        yt[test_labels[k]] = 1.0
        cost=Multi_cross_entropy(yt,yp)
        temp_err=temp_err+np.amax(np.absolute(cost))

    # Mean test loss for this epoch.
    test_err.append(temp_err/size1)
test_err=np.array(test_err)
for i in range(inst_mlp.epochs):
    # Slice ends are exclusive, so this covers every batch of epoch i.
    train_err.append(np.mean(inst_mlp.y[batches_per_epoch*i:batches_per_epoch*(i+1)]))

# Train Accuracy

In [None]:
# Count the training samples whose predicted class differs from the label
# and report the share of correct predictions in percent.
size1 = train_images.shape[0]
yfalse = 0
for idx in range(size1):
    predicted_class, _ = inst_mlp.predict(train_images[idx].flatten())
    yfalse += int(predicted_class != int(train_labels[idx]))

correct_fraction = (size1-yfalse)/size1
print('Train_Accuracy:')
print(correct_fraction*100)



# Test Accuracy

In [None]:
# Count the test samples whose predicted class differs from the label and
# report the share of correct predictions in percent.
# NOTE(review): test_images should be scaled the same way as the training
# data before this cell runs - verify upstream.
size1 = test_images.shape[0]
yfalse = 0
for idx in range(size1):
    predicted_class, _ = inst_mlp.predict(test_images[idx].flatten())
    yfalse += int(predicted_class != int(test_labels[idx]))

correct_fraction = (size1-yfalse)/size1
print('Test_Accuracy:')
print(correct_fraction*100)



# plot cost function for testing phase 

In [None]:
# Test loss per epoch; the x axis is the 1-based epoch number.
epoch_axis = np.arange(1, inst_mlp.epochs+1)
plt.plot(epoch_axis, np.array(test_err))
plt.xlabel('Epochs')
plt.ylabel('Test Error')
plt.show()

# plot cost function for training phase 

In [None]:
# Training loss per epoch; the x axis is the 1-based epoch number.
epoch_axis = np.arange(1, inst_mlp.epochs+1)
plt.plot(epoch_axis, np.array(train_err))
plt.xlabel('Epochs')
plt.ylabel('Train Error')
plt.show()

# Training phase based on ReLU activation and SGD optimizer

In [None]:
# Train the (784, 128, 64, 10) network with ReLU hidden layers and SGD,
# recording the mean cross-entropy on the train and test splits per epoch.
(train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()
# The network is trained on pixels divided by 255, so the test split must be
# scaled identically before it is used for evaluation.
train_images = train_images/255
test_images = test_images/255
inst_mlp = MLP([784, 128,64, 10], 'SGD', 'relu','relu', 'softmax')
train_err=[]
test_err=[]
batches_per_epoch = int(inst_mlp.samples/inst_mlp.batch_size)
for i in range (inst_mlp.epochs):
    print('----------------------'+'Epoch'+str(i+1)+'----------------------')
    for j in range(batches_per_epoch):
        print("Epoch", str(i+1) + "/" + str(inst_mlp.epochs) + ":" )
        start = j * inst_mlp.batch_size
        stop = start + inst_mlp.batch_size
        inst_mlp.y[j + i*batches_per_epoch] = inst_mlp.train(train_images[start:stop], train_labels[start:stop])
    size1 = test_images.shape[0]
    temp_err=0
    # A separate index keeps the epoch counter `i` intact.
    for k in range (size1):
        prediction,yp = inst_mlp.predict(test_images[k].flatten())
        yt = np.zeros([10])
        yt[test_labels[k]] = 1.0
        cost=Multi_cross_entropy(yt,yp)
        temp_err=temp_err+np.amax(np.absolute(cost))

    # Mean test loss for this epoch.
    test_err.append(temp_err/size1)
test_err=np.array(test_err)
for i in range(inst_mlp.epochs):
    # Slice ends are exclusive, so this covers every batch of epoch i.
    train_err.append(np.mean(inst_mlp.y[batches_per_epoch*i:batches_per_epoch*(i+1)]))

# Train Accuracy

In [None]:
# Count the training samples whose predicted class differs from the label
# and report the share of correct predictions in percent.
size1 = train_images.shape[0]
yfalse = 0
for idx in range(size1):
    predicted_class, _ = inst_mlp.predict(train_images[idx].flatten())
    yfalse += int(predicted_class != int(train_labels[idx]))

correct_fraction = (size1-yfalse)/size1
print('Train_Accuracy:')
print(correct_fraction*100)



# Test Accuracy

In [None]:
# Count the test samples whose predicted class differs from the label and
# report the share of correct predictions in percent.
# NOTE(review): test_images should be scaled the same way as the training
# data before this cell runs - verify upstream.
size1 = test_images.shape[0]
yfalse = 0
for idx in range(size1):
    predicted_class, _ = inst_mlp.predict(test_images[idx].flatten())
    yfalse += int(predicted_class != int(test_labels[idx]))

correct_fraction = (size1-yfalse)/size1
print('Test_Accuracy:')
print(correct_fraction*100)



# plot cost function for testing phase 

In [None]:
# Test loss per epoch; the x axis is the 1-based epoch number.
epoch_axis = np.arange(1, inst_mlp.epochs+1)
plt.plot(epoch_axis, np.array(test_err))
plt.xlabel('Epochs')
plt.ylabel('Test Error')
plt.show()

# plot cost function for training phase 

In [None]:
# Training loss per epoch; the x axis is the 1-based epoch number.
epoch_axis = np.arange(1, inst_mlp.epochs+1)
plt.plot(epoch_axis, np.array(train_err))
plt.xlabel('Epochs')
plt.ylabel('Train Error')
plt.show()

# Training phase based on sigmoid activation and momentum optimizer

In [None]:
# Train the (784, 128, 64, 10) network with sigmoid hidden layers and the
# momentum optimizer, recording the mean cross-entropy on the train and test
# splits per epoch.
(train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()
# The network is trained on pixels divided by 255, so the test split must be
# scaled identically before it is used for evaluation.
train_images = train_images/255
test_images = test_images/255
inst_mlp = MLP([784, 128,64, 10], 'momentum', 'sigmoid','sigmoid', 'softmax')
train_err=[]
test_err=[]
batches_per_epoch = int(inst_mlp.samples/inst_mlp.batch_size)
for i in range (inst_mlp.epochs):
    print('----------------------'+'Epoch'+str(i+1)+'----------------------')
    for j in range(batches_per_epoch):
        print("Epoch", str(i+1) + "/" + str(inst_mlp.epochs) + ":" )
        start = j * inst_mlp.batch_size
        stop = start + inst_mlp.batch_size
        inst_mlp.y[j + i*batches_per_epoch] = inst_mlp.train(train_images[start:stop], train_labels[start:stop])
    size1 = test_images.shape[0]
    temp_err=0
    # A separate index keeps the epoch counter `i` intact.
    for k in range (size1):
        prediction,yp = inst_mlp.predict(test_images[k].flatten())
        yt = np.zeros([10])
        yt[test_labels[k]] = 1.0
        cost=Multi_cross_entropy(yt,yp)
        temp_err=temp_err+np.amax(np.absolute(cost))

    # Mean test loss for this epoch.
    test_err.append(temp_err/size1)
test_err=np.array(test_err)
for i in range(inst_mlp.epochs):
    # Slice ends are exclusive, so this covers every batch of epoch i.
    train_err.append(np.mean(inst_mlp.y[batches_per_epoch*i:batches_per_epoch*(i+1)]))

# Train Accuracy

In [None]:
# Count the training samples whose predicted class differs from the label
# and report the share of correct predictions in percent.
size1 = train_images.shape[0]
yfalse = 0
for idx in range(size1):
    predicted_class, _ = inst_mlp.predict(train_images[idx].flatten())
    yfalse += int(predicted_class != int(train_labels[idx]))

correct_fraction = (size1-yfalse)/size1
print('Train_Accuracy:')
print(correct_fraction*100)



# Test Accuracy

In [None]:
# Count the test samples whose predicted class differs from the label and
# report the share of correct predictions in percent.
# NOTE(review): test_images should be scaled the same way as the training
# data before this cell runs - verify upstream.
size1 = test_images.shape[0]
yfalse = 0
for idx in range(size1):
    predicted_class, _ = inst_mlp.predict(test_images[idx].flatten())
    yfalse += int(predicted_class != int(test_labels[idx]))

correct_fraction = (size1-yfalse)/size1
print('Test_Accuracy:')
print(correct_fraction*100)



# plot cost function for testing phase 

In [None]:
# Test loss per epoch; the x axis is the 1-based epoch number.
epoch_axis = np.arange(1, inst_mlp.epochs+1)
plt.plot(epoch_axis, np.array(test_err))
plt.xlabel('Epochs')
plt.ylabel('Test Error')
plt.show()

# plot cost function for training phase

In [None]:
# Training loss per epoch; the x axis is the 1-based epoch number.
epoch_axis = np.arange(1, inst_mlp.epochs+1)
plt.plot(epoch_axis, np.array(train_err))
plt.xlabel('Epochs')
plt.ylabel('Train Error')
plt.show()

# Create a Neural Network (784,128,64,64,10)

In [None]:
class MLP():
    """Fully connected network with three hidden layers and manual backprop.

    ``sizes`` lists the layer widths as
    [input, hidden1, hidden2, hidden3, output]. ``backProp`` accumulates
    gradients in the ``deri_*`` attributes and ``Update_weight`` applies them
    with the optimizer chosen at construction ('SGD', 'momentum' or 'adam')
    and then clears them.

    Hidden layers accept 'sigmoid' or 'relu'. The output error is computed as
    ``output - target``.
    """
    ################initialize parameters###########
    samples = 60000       # sample count used to size the loss history
    batch_size = 1        # samples processed per call to train()
    epochs = 3
    beta1 = 0.9           # decay of the first-moment (momentum) average
    beta2 = 0.999         # decay of the second-moment average (adam)
    constant = 1e-8       # added to the adam denominator to avoid division by zero
    lr = 1e-3
    Lambda = 0            # not referenced by any method of this class
    verbose = True        # print per-sample error/accuracy during train()

    # Parameter attribute names in the order the optimizers visit them.
    _PARAM_NAMES = ('b4', 'w4', 'b3', 'w3', 'b2', 'w2', 'b1', 'w1')

    def __init__(self, sizes, optimizer, hiddenActivation, sechiddenActivation,thihiddenActivation, outputActivation):
        """Allocate parameters, gradient accumulators and optimizer state.

        Args:
            sizes: layer widths [input, hidden1, hidden2, hidden3, output].
            optimizer: 'SGD', 'momentum' or 'adam'.
            hiddenActivation: activation of the first hidden layer.
            sechiddenActivation: activation of the second hidden layer.
            thihiddenActivation: activation of the third hidden layer.
            outputActivation: activation of the output layer.
        """
        self.sizes = sizes

        self.optimizer = optimizer
        self.hiddenActivation = hiddenActivation
        self.sechiddenActivation = sechiddenActivation
        self.thihiddenActivation = thihiddenActivation
        self.outputActivation = outputActivation

        # Loss history: x holds 1-based batch numbers, y is filled by the
        # training driver with the value returned by train().
        self.x = np.arange(1, ((self.samples/self.batch_size)*self.epochs)+1)
        self.y = np.empty(int(((self.samples/self.batch_size)*self.epochs)))
        self.secondLayerNeurons = np.zeros(sizes[1])
        self.thirdLayerNeurons = np.zeros(sizes[2])
        self.fourLayerNeurons = np.zeros(sizes[3])
        self.outputNeurons = np.zeros(sizes[4])

        # Weights are drawn uniformly from [-1, 1); biases start at zero.
        self.w1 = np.random.rand(sizes[1], sizes[0]) * 2 - 1
        self.w2 = np.random.rand(sizes[2], sizes[1]) * 2 - 1
        self.w3 = np.random.rand(sizes[3], sizes[2]) * 2 - 1
        self.w4 = np.random.rand(sizes[4], sizes[3]) * 2 - 1
        self.b1 = np.zeros([sizes[1]])
        self.b2 = np.zeros([sizes[2]])
        self.b3 = np.zeros([sizes[3]])
        self.b4 = np.zeros([sizes[4]])

        # For every parameter create: deri_<p> (gradient accumulator),
        # momentum_<p> / RMS_prob_<p> (running moments) and their
        # bias-corrected *_revision counterparts.
        for name in self._PARAM_NAMES:
            shape = getattr(self, name).shape
            setattr(self, 'deri_' + name, np.zeros(shape))
            for state in ('momentum_', 'RMS_prob_'):
                setattr(self, state + name, np.zeros(shape))
                setattr(self, state + name + '_revision', np.zeros(shape))

        # Number of adam updates applied so far (drives bias correction).
        self.adam_step = 0

        self.hiddenLayerErrors = np.zeros(sizes[1])
        self.sechiddenLayerErrors = np.zeros(sizes[2])
        self.thirhiddenLayerErrors = np.zeros(sizes[3])
        self.outputLayerErrors = np.zeros(sizes[4])
        self.cceoutputLayerErrors = np.zeros(sizes[4])
    ################define activations and their derivatives###########

    def Multi_cross_entropy(self, y, ypred):
        """Return ``-sum(y * log(ypred))``; zero-target terms contribute 0."""
        y = np.asarray(y, dtype=float)
        ypred = np.asarray(ypred, dtype=float)
        # Replace predictions at zero-target positions by 1 so log() gives 0
        # there instead of -inf (0 * -inf would be NaN).
        safe_pred = np.where(y != 0, ypred, 1.0)
        return -np.sum(y * np.log(safe_pred), axis=0)

    def sigmoid(self, x):
        """Logistic function, evaluated without overflowing np.exp."""
        x = np.asarray(x, dtype=float)
        decay = np.exp(-np.abs(x))  # always in (0, 1]
        return np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

    def sigmoidDerivative(self, x):
        """Sigmoid derivative expressed in terms of the sigmoid output ``x``."""
        return np.multiply(x, (1-x))

    def relu(self, x):
        """Element-wise max(0, x)."""
        return np.maximum(0.0, x)

    def reluDerivative(self, x):
        """1 where ``x`` is positive, 0 elsewhere."""
        return 1 * (x > 0)

    def softmax(self, x):
        """Softmax over all elements of ``x`` (shifted by max for stability)."""
        exps = np.exp(x - np.max(x))
        return exps / np.sum(exps)

    def _activate(self, name, z):
        """Apply the activation called ``name`` to the pre-activation ``z``."""
        if name == 'sigmoid':
            return self.sigmoid(z)
        if name == 'relu':
            return self.relu(z)
        if name == 'softmax':
            return self.softmax(z)
        raise ValueError("unsupported activation: %r" % (name,))

    def _activation_derivative(self, name, activation):
        """Derivative of a hidden activation, given the layer's output."""
        if name == 'sigmoid':
            return self.sigmoidDerivative(activation)
        if name == 'relu':
            return self.reluDerivative(activation)
        raise ValueError("no derivative implemented for hidden activation: %r" % (name,))

    ################forward path###########
    def forwardProp(self, inputs):
        """Propagate a flat input vector through all four layers.

        Results are stored in ``secondLayerNeurons``, ``thirdLayerNeurons``,
        ``fourLayerNeurons`` and ``outputNeurons``.
        """
        self.secondLayerNeurons = self._activate(
            self.hiddenActivation, self.w1 @ inputs + self.b1)
        self.thirdLayerNeurons = self._activate(
            self.sechiddenActivation, self.w2 @ self.secondLayerNeurons + self.b2)
        self.fourLayerNeurons = self._activate(
            self.thihiddenActivation, self.w3 @ self.thirdLayerNeurons + self.b3)
        self.outputNeurons = self._activate(
            self.outputActivation, self.w4 @ self.fourLayerNeurons + self.b4)

    ################backward path###########
    def backProp(self, inputs, revision_output):
        """Accumulate gradients for one sample.

        Must be called after ``forwardProp`` for the same ``inputs``.

        Args:
            inputs: flat input vector of length ``sizes[0]``.
            revision_output: target vector of length ``sizes[4]``.
        """
        self.cceoutputLayerErrors = self.Multi_cross_entropy(revision_output, self.outputNeurons)
        self.outputLayerErrors = np.subtract(self.outputNeurons, revision_output)

        # Push the error back through each layer: W^T @ delta, scaled by the
        # derivative of that layer's activation.
        self.thirhiddenLayerErrors = np.multiply(
            np.dot(self.w4.T, self.outputLayerErrors),
            self._activation_derivative(self.thihiddenActivation, self.fourLayerNeurons))
        self.sechiddenLayerErrors = np.multiply(
            np.dot(self.w3.T, self.thirhiddenLayerErrors),
            self._activation_derivative(self.sechiddenActivation, self.thirdLayerNeurons))
        self.hiddenLayerErrors = np.multiply(
            np.dot(self.w2.T, self.sechiddenLayerErrors),
            self._activation_derivative(self.hiddenActivation, self.secondLayerNeurons))

        # Weight gradient of a layer is the outer product of its error with
        # the activations feeding it.
        self.deri_b4 += self.outputLayerErrors
        self.deri_w4 += np.outer(self.outputLayerErrors, self.fourLayerNeurons)
        self.deri_b3 += self.thirhiddenLayerErrors
        self.deri_w3 += np.outer(self.thirhiddenLayerErrors, self.thirdLayerNeurons)
        self.deri_b2 += self.sechiddenLayerErrors
        self.deri_w2 += np.outer(self.sechiddenLayerErrors, self.secondLayerNeurons)
        self.deri_b1 += self.hiddenLayerErrors
        self.deri_w1 += np.outer(self.hiddenLayerErrors, inputs)

    ################update weight###########
    def Update_weight(self):
        """Apply the accumulated gradients with the configured optimizer.

        The gradient accumulators are reset to zero afterwards.

        Raises:
            ValueError: if ``self.optimizer`` is not a supported name.
        """
        if self.optimizer == 'momentum':
            for name in self._PARAM_NAMES:
                velocity = (self.beta1 * getattr(self, 'momentum_' + name)
                            + self.lr * getattr(self, 'deri_' + name))
                setattr(self, 'momentum_' + name, velocity)
                param = getattr(self, name)
                param -= velocity  # in place: updates self.<name>

        elif self.optimizer == 'SGD':
            for name in self._PARAM_NAMES:
                param = getattr(self, name)
                param -= self.lr * getattr(self, 'deri_' + name)

        elif self.optimizer == 'adam':
            # Bias correction depends on how many updates have been applied.
            self.adam_step = getattr(self, 'adam_step', 0) + 1
            first_correction = 1 - np.power(self.beta1, self.adam_step)
            second_correction = 1 - np.power(self.beta2, self.adam_step)
            for name in self._PARAM_NAMES:
                grad = getattr(self, 'deri_' + name)
                first = self.beta1 * getattr(self, 'momentum_' + name) + (1-self.beta1) * grad
                second = (self.beta2 * getattr(self, 'RMS_prob_' + name)
                          + (1-self.beta2) * np.square(grad))
                first_hat = first / first_correction
                second_hat = second / second_correction
                setattr(self, 'momentum_' + name, first)
                setattr(self, 'RMS_prob_' + name, second)
                setattr(self, 'momentum_' + name + '_revision', first_hat)
                setattr(self, 'RMS_prob_' + name + '_revision', second_hat)
                param = getattr(self, name)
                param -= self.lr * (first_hat / (np.sqrt(second_hat) + self.constant))

        else:
            raise ValueError("unsupported optimizer: %r" % (self.optimizer,))

        for name in self._PARAM_NAMES:
            setattr(self, 'deri_' + name, np.zeros(getattr(self, name).shape))

    ################training phase###########
    def train(self, trainImages, trainLabels):
        """Run forward/backward on one batch and apply a single update.

        At most ``batch_size`` samples are used; a shorter batch is processed
        as-is instead of indexing past its end.

        Args:
            trainImages: sequence of images; each is flattened before use.
            trainLabels: integer class labels aligned with ``trainImages``.

        Returns:
            Mean cross-entropy over the processed samples (0.0 if none).
        """
        err_sum = 0.0
        avg_err = 0.0
        revision = 0  # number of correctly classified samples so far

        for m in range(min(self.batch_size, len(trainImages))):
            revision_output = np.zeros([self.sizes[4]])
            revision_output[trainLabels[m]] = 1.0
            flat_image = trainImages[m].flatten()
            self.forwardProp(flat_image)
            self.backProp(flat_image, revision_output)

            if np.argmax(self.outputNeurons) == int(trainLabels[m]):
                revision += 1

            error = np.amax(np.absolute(self.cceoutputLayerErrors))
            err_sum += error
            avg_err = err_sum / (m+1)
            if self.verbose:
                acc = str(int((revision/(m+1))*100)) + '%'
                print("Error:")
                print(error)
                print("Accuracy:")
                print(acc)

        self.Update_weight()
        return avg_err

    ################prediction###########
    def predict(self, testImage):
        """Return ``(predicted class index, output vector)`` for a flat image."""
        self.forwardProp(testImage)
        return np.argmax(self.outputNeurons), self.outputNeurons



# Training phase based on sigmoid activation and SGD optimizer

In [None]:
# Train the (784, 128, 64, 64, 10) network with sigmoid hidden layers and
# SGD, recording the mean cross-entropy on the train and test splits per
# epoch.
(train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()
# The network is trained on pixels divided by 255, so the test split must be
# scaled identically before it is used for evaluation.
train_images = train_images/255
test_images = test_images/255
inst_mlp = MLP([784, 128,64,64, 10], 'SGD', 'sigmoid','sigmoid','sigmoid', 'softmax')
train_err=[]
test_err=[]
batches_per_epoch = int(inst_mlp.samples/inst_mlp.batch_size)
for i in range (inst_mlp.epochs):
    print('----------------------'+'Epoch'+str(i+1)+'----------------------')
    for j in range(batches_per_epoch):
        print("Epoch", str(i+1) + "/" + str(inst_mlp.epochs) + ":" )
        start = j * inst_mlp.batch_size
        stop = start + inst_mlp.batch_size
        inst_mlp.y[j + i*batches_per_epoch] = inst_mlp.train(train_images[start:stop], train_labels[start:stop])
    size1 = test_images.shape[0]
    temp_err=0
    # A separate index keeps the epoch counter `i` intact.
    for k in range (size1):
        prediction,yp = inst_mlp.predict(test_images[k].flatten())
        yt = np.zeros([10])
        yt[test_labels[k]] = 1.0
        cost=Multi_cross_entropy(yt,yp)
        temp_err=temp_err+np.amax(np.absolute(cost))

    # Mean test loss for this epoch.
    test_err.append(temp_err/size1)
test_err=np.array(test_err)
for i in range(inst_mlp.epochs):
    # Slice ends are exclusive, so this covers every batch of epoch i.
    train_err.append(np.mean(inst_mlp.y[batches_per_epoch*i:batches_per_epoch*(i+1)]))

# Train Accuracy

In [None]:
# Count the training samples whose predicted class differs from the label
# and report the share of correct predictions in percent.
size1 = train_images.shape[0]
yfalse = 0
for idx in range(size1):
    predicted_class, _ = inst_mlp.predict(train_images[idx].flatten())
    yfalse += int(predicted_class != int(train_labels[idx]))

correct_fraction = (size1-yfalse)/size1
print('Train_Accuracy:')
print(correct_fraction*100)



# Test Accuracy

In [None]:
# Count the test samples whose predicted class differs from the label and
# report the share of correct predictions in percent.
# NOTE(review): test_images should be scaled the same way as the training
# data before this cell runs - verify upstream.
yfalse = 0
size1 = test_images.shape[0]
for i in range (size1):
    prediction,t = inst_mlp.predict(test_images[i].flatten())
    ytrue = int(test_labels[i])
    if ytrue != prediction:
        yfalse += 1

# This cell evaluates the test split, so label the output accordingly.
print('Test_Accuracy:')
print(((size1-yfalse)/size1)*100)



# plot cost function for training phase 

In [None]:
# Training loss per epoch; the y-axis label names the plotted series
# (train_err).
plt.plot(np.arange(1,inst_mlp.epochs+1), np.array(train_err))
plt.ylabel('Train Error')
plt.xlabel('Epochs')
plt.show()

# plot cost function for testing phase 

In [None]:
# Test loss per epoch; the y-axis label names the plotted series (test_err).
plt.plot(np.arange(1,inst_mlp.epochs+1), np.array(test_err))
plt.ylabel('Test Error')
plt.xlabel('Epochs')
plt.show()

# MLP with PyTorch (784,128,64,10)

### Import Library

In [None]:
import torch
from torchvision import datasets
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F


### Load DataSet

In [None]:
# `transforms` is needed below but the PyTorch import cell only brings in
# `datasets` from torchvision, so import it here to avoid a NameError.
from torchvision import transforms

# Convert each image to a tensor; no further preprocessing is applied.
transform = transforms.Compose([
                transforms.ToTensor()
            ])

# The loaders are left at the DataLoader defaults (no batch_size or shuffle
# argument is passed).
# NOTE(review): this loads MNIST digits, while the NumPy part of the notebook
# uses Fashion-MNIST - confirm whether datasets.FashionMNIST was intended.
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('data', train=True, download=True, transform=transform))

test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('data', train=False, transform=transform))

### Create MLP with pytorch

In [None]:
class MLP_pytorch(nn.Module):
    """Fully connected 784-128-64-10 classifier with sigmoid hidden layers.

    The output layer is followed by a softmax, so `forward_pass` returns a
    probability distribution over the 10 classes.

    Note: `train` below shadows `nn.Module.train(mode)`. Helpers that rely
    on the stock method (for example `model.eval()`) must not be used on
    this class.
    """

    def __init__(self, epochs):
        """Create the three linear layers.

        Args:
            epochs: number of passes over the training data made by `train`.
        """
        super(MLP_pytorch, self).__init__()
        self.epochs = epochs
        self.first_layer = nn.Linear(784, 128)
        self.second_layer = nn.Linear(128, 64)
        self.third_layer = nn.Linear(64, 10)

    def forward_pass(self, x):
        """Return class probabilities for a flattened image (or a batch of them).

        Args:
            x: tensor whose last dimension has 784 features.

        Returns:
            Tensor with 10 probabilities along the last dimension.
        """
        x = torch.sigmoid(self.first_layer(x))
        x = torch.sigmoid(self.second_layer(x))
        # dim=-1 is identical to dim=0 for a 1-D input and stays correct when
        # a (batch, 784) tensor is passed.
        return torch.softmax(self.third_layer(x), dim=-1)

    def forward(self, x):
        """Standard nn.Module entry point so that `model(x)` works."""
        return self.forward_pass(x)

    def _run_epoch(self, loader, criterion, optimizer=None):
        """Make one pass over `loader`, updating weights only if `optimizer` is given.

        Each item of `loader` must hold exactly one sample (its label is read
        with `.item()`).

        Returns:
            (summed_loss, correct_predictions, samples_seen)
        """
        total_loss = 0.0
        correct = 0
        seen = 0
        for x, y in loader:
            label = y.item()
            output = self.forward_pass(torch.flatten(x))
            # One-hot target in the same dtype/device as the output so the
            # criterion never sees a float32/float64 mismatch.
            target = torch.zeros(10, dtype=output.dtype, device=output.device)
            target[label] = 1
            loss = criterion(output, target)
            if optimizer is not None:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            # Accumulate a plain float: summing the loss tensors themselves
            # would keep every sample's autograd graph referenced.
            total_loss += loss.item()
            if torch.argmax(output).item() == label:
                correct += 1
            seen += 1
        return total_loss, correct, seen

    def train(self, train_loader, test_loader, optimizer, criterion):
        """Train for `self.epochs` epochs and evaluate after each one.

        Args:
            train_loader: iterable of (image, label) pairs, one sample each.
            test_loader: iterable of (image, label) pairs, one sample each.
            optimizer: torch optimizer bound to this model's parameters.
            criterion: loss called as `criterion(output, one_hot_target)`.

        Returns:
            (train_err, train_acc, test_err, test_acc): four lists with one
            entry per epoch. Errors are mean losses per sample and accuracies
            are fractions in [0, 1], each normalised by the number of samples
            actually seen in the corresponding loader.
        """
        train_err = []
        train_acc = []
        test_err = []
        test_acc = []

        for iteration in range(self.epochs):
            tr_loss, tr_correct, tr_seen = self._run_epoch(
                train_loader, criterion, optimizer)
            # Evaluation needs no gradients; skipping them saves time/memory.
            with torch.no_grad():
                te_loss, te_correct, te_seen = self._run_epoch(
                    test_loader, criterion)

            # max(..., 1) keeps an empty loader from dividing by zero.
            epoch_train_err = tr_loss / max(tr_seen, 1)
            epoch_train_acc = tr_correct / max(tr_seen, 1)
            epoch_test_err = te_loss / max(te_seen, 1)
            epoch_test_acc = te_correct / max(te_seen, 1)

            train_acc.append(epoch_train_acc)
            train_err.append(epoch_train_err)
            test_acc.append(epoch_test_acc)
            test_err.append(epoch_test_err)
            print("Epoch "+str(iteration)+" done!")
            print("Train Error:")
            print(epoch_train_err)
            print("Test Error:")
            print(epoch_test_err)
            print("Train accuracy:")
            print(epoch_train_acc)
            print("Test accuracy:")
            print(epoch_test_acc)

        return train_err, train_acc, test_err, test_acc
    
        

In [None]:
# Three-epoch run: plain SGD over every model parameter, with
# BCE-with-logits as the training criterion.
# NOTE(review): forward_pass already ends in a softmax, while
# BCEWithLogitsLoss applies its own sigmoid to its input - confirm that this
# combination is intended.
model = MLP_pytorch(epochs=3)
optimizer = optim.SGD(params=model.parameters(), lr=0.001)
criterion = nn.BCEWithLogitsLoss()
tr_err, tr_acc, te_err, te_acc = model.train(
    train_loader=train_loader,
    test_loader=test_loader,
    optimizer=optimizer,
    criterion=criterion,
)


### Plot train cost function

In [None]:
# Training error per epoch. The x axis is sized from the recorded history so
# the plot stays valid when the number of epochs changes.
plt.plot(np.arange(1, len(tr_err) + 1), np.array(tr_err))
plt.ylabel('Train Error')
plt.xlabel('Epochs')
plt.show()

### Plot test cost function

In [None]:
# Test error per epoch. The x axis is sized from the recorded history so the
# plot stays valid when the number of epochs changes.
plt.plot(np.arange(1, len(te_err) + 1), np.array(te_err))
# The curve shows te_err, so the axis label must say "Test".
plt.ylabel('Test Error')
plt.xlabel('Epochs')
plt.show()