In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.datasets import mnist, fashion_mnist
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import warnings
warnings.filterwarnings("ignore")
import wandb
import seaborn as sns

In [None]:
batchsize = 16
no_of_features = 784
no_of_classes = 10
no_of_layers = 5
no_of_neurons_in_each_layer = [128,128,128]

In [None]:
from keras.datasets import fashion_mnist


In [None]:
(X,Y),(X_test,Y_test) = fashion_mnist.load_data()

In [None]:
visited_label = []
class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat','Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']
fig,axes = plt.subplots(2,5,figsize=(10,10))
axes = axes.flatten()
index = 0
for x in range(len(X_test)):
    if(Y_test[x] not in visited_label):
        visited_label.append(Y_test[x])
        axes[index].imshow(X_test[x],cmap = plt.cm.gray)
        axes[index].set_title("{}".format(class_names[Y_test[x]]))
        index += 1
plt.show()
        



In [None]:
X = X.reshape(len(X),784,1)
X[0].shape
X_test = X_test.reshape(len(X_test),784,1)
X_test[0].shape



In [None]:
X = X/255.0
X_test = X_test/255.0

In [None]:
X_train,X_val,Y_train,Y_val = train_test_split(X,Y,test_size=0.2,random_state=42)

In [None]:
X_train.shape

In [None]:
def one_hot(l,no_of_classes):
    temp = np.array([0]*no_of_classes)
    temp[l] = 1
    return temp

In [None]:
def compute_loss(Y_hat,Y):
    temp = []
    for x in range(len(Y)):
        temp.append(one_hot(Y[x],no_of_classes))
    temp = np.array(temp)
    return (-1.0 * np.sum(np.multiply(temp,np.log(Y_hat+1e-9).reshape(Y_hat.shape[0],Y_hat.shape[1]))))



In [None]:
def sigmoid(x):
    return 1. / (1.+np.exp(-x))

def sigmoid_derivative(x):
    return sigmoid(x) * (np.ones_like(x)-sigmoid(x))

def Relu(x):
    return np.maximum(0,x)

def Relu_derivative(x):
    return 1*(x>0) 

def tanh(x):
    return np.tanh(x)

def tanh_derivative(x):
    return (1 - (np.tanh(x)**2))

def softmax(x):
    l = []
    for i in range(len(x)):
        l.append(np.exp(x[i])/np.sum(np.exp(x[i]),axis=0))
    return np.array(l)


def softmax_derivative(x):
    for i in range(len(x)):
        x[i] = x[i] / max(x[i])
    return softmax(x) * (1-softmax(x))


In [None]:
#pass layers as no of layers including hidden,input and output
#pass no_of_neurons as no of neurons in each hidden layer
def feed_forward(data,thetas,layers):
    pre_activation = [1]*(layers)
    activation  = [1]*(layers)
    activation[0] = data
    for layer_no in range(1,layers):
        W = 'W' + str(layer_no)
        b = 'b' + str(layer_no)
        pre_activation[layer_no] = np.add(np.matmul(thetas[W],activation[layer_no - 1]),thetas[b])
        if(layer_no == layers-1):
            activation[layer_no] = softmax(pre_activation[layer_no])
        else:
            activation[layer_no] = sigmoid(pre_activation[layer_no])
    return activation,pre_activation
        
        



In [None]:
def initialization(no_of_features,no_of_classes,no_of_layers,no_of_neurons_in_each_layer):
    np.random.seed(42)
    thetas = {}
    for layer in range(1,no_of_layers):
        if(layer == 1):
            thetas['W'+str(layer)] = np.random.randn(no_of_neurons_in_each_layer[layer-1],no_of_features) #* np.sqrt(2/(no_of_neurons_in_each_layer[layer-1]+no_of_features))
            thetas['b'+str(layer)] = np.zeros((no_of_neurons_in_each_layer[layer-1],1))
        elif(layer == no_of_layers-1):
            thetas['W'+str(layer)] = np.random.randn(no_of_classes,no_of_neurons_in_each_layer[layer-2]) #* np.sqrt(2/(no_of_classes + no_of_neurons_in_each_layer[layer-2]))
            thetas['b'+str(layer)] = np.zeros((no_of_classes,1))
        else:
            thetas['W'+str(layer)] = np.random.randn(no_of_neurons_in_each_layer[layer-1],no_of_neurons_in_each_layer[layer-2]) #* np.sqrt(2/(no_of_neurons_in_each_layer[layer-1]+no_of_neurons_in_each_layer[layer-2]))
            thetas['b'+str(layer)] = np.zeros((no_of_neurons_in_each_layer[layer-1],1))
    return thetas

    

In [None]:
def initialization(no_of_features,no_of_classes,no_of_layers,no_of_neurons_in_each_layer):
    thetas = {}
    np.random.seed(42)
    for layer in range(1,no_of_layers):
        if(layer == 1):
            thetas['W'+str(layer)] = np.random.default_rng().uniform(low = -0.7,high =0.7,size = (no_of_neurons_in_each_layer[layer-1],no_of_features)) #* np.sqrt(2/(no_of_neurons_in_each_layer[layer-1]+no_of_features))
            thetas['b'+str(layer)] = np.random.default_rng().uniform(low = -0.7,high =0.7,size = (no_of_neurons_in_each_layer[layer-1],1))
        elif(layer == no_of_layers-1):
            thetas['W'+str(layer)] = np.random.default_rng().uniform(low = -0.7,high =0.7,size = (no_of_classes,no_of_neurons_in_each_layer[layer-2])) #* np.sqrt(2/(no_of_classes + no_of_neurons_in_each_layer[layer-2]))
            thetas['b'+str(layer)] = np.random.default_rng().uniform(low = -0.7,high =0.7,size = (no_of_classes,1))
        else:
            thetas['W'+str(layer)] = np.random.default_rng().uniform(low = -0.7,high =0.7,size =(no_of_neurons_in_each_layer[layer-1],no_of_neurons_in_each_layer[layer-2])) #*  np.sqrt(2/(no_of_neurons_in_each_layer[layer-1]+no_of_neurons_in_each_layer[layer-2]))
            thetas['b'+str(layer)] = np.random.default_rng().uniform(low = -0.7,high =0.7,size = (no_of_neurons_in_each_layer[layer-1],1))
    return thetas

    

In [None]:
def initialization(no_of_features,no_of_classes,no_of_layers,no_of_neurons_in_each_layer):
    thetas = {}
    np.random.seed(42)
    for layer in range(1,no_of_layers):
        if(layer == 1):
            thetas['W'+str(layer)] = np.random.uniform(low = -0.7,high =0.7,size = (no_of_neurons_in_each_layer[layer-1],no_of_features)) #* np.sqrt(2/(no_of_neurons_in_each_layer[layer-1]+no_of_features))
            thetas['b'+str(layer)] = np.zeros((no_of_neurons_in_each_layer[layer-1],1))
        elif(layer == no_of_layers-1):
            thetas['W'+str(layer)] = np.random.uniform(low = -0.7,high =0.7,size = (no_of_classes,no_of_neurons_in_each_layer[layer-2])) #* np.sqrt(2/(no_of_classes + no_of_neurons_in_each_layer[layer-2]))
            thetas['b'+str(layer)] = np.zeros((no_of_classes,1))
        else:
            thetas['W'+str(layer)] = np.random.uniform(low = -0.7,high =0.7,size =(no_of_neurons_in_each_layer[layer-1],no_of_neurons_in_each_layer[layer-2])) #*  np.sqrt(2/(no_of_neurons_in_each_layer[layer-1]+no_of_neurons_in_each_layer[layer-2]))
            thetas['b'+str(layer)] = np.zeros((no_of_neurons_in_each_layer[layer-1],1))
    return thetas



In [None]:
#grad wrt output layer preactivation
# correct
def compute_grad_preactivation_output(activation,Y):
    grads = []
    for x in range(len(activation[-1])):
        act = activation[-1][x]
        grad = np.array([0]*len(act)).reshape(len(act),1)
        index = Y[x]
        grad[index] = 1
        grads.append(-(grad - act))
    return np.array(grads)
    




In [None]:
def compute_grad_weight(grad_ak,hk_1):
    temp = []
    for x in range(len(grad_ak)):
        temp.append(np.matmul(grad_ak[x],hk_1[x].T))
    return np.array(temp)

In [None]:
def compute_grad_activation(wk,grad_ak):
    return np.matmul(wk.T,grad_ak)

In [None]:
def compute_grad_preactivation(grad_hk_1,ak_1):
    return np.multiply(grad_hk_1,sigmoid_derivative(ak_1))

In [144]:
def back_propagate(activation,preactivation,thetas,Y):
    grads = {}
    grads['a' + str(no_of_layers-1)] = compute_grad_preactivation_output(activation,Y)
    for k in range(no_of_layers-1,0,-1):
        grads['W'+str(k)] = np.sum(compute_grad_weight(grads['a' + str(k)],activation[k-1]),axis = 0)/batchsize
        grads['b'+str(k)] = np.sum(grads['a' + str(k)],axis = 0)/batchsize
        if(k == 1):
            break
        grads['h'+str(k-1)] = compute_grad_activation(thetas['W'+str(k)],grads['a'+str(k)])
        grads['a'+str(k-1)] = compute_grad_preactivation(grads['h'+str(k-1)],preactivation[k-1])
    return grads

In [273]:
def gradient_descent(optimizer):
    thetas = initialization(no_of_features,no_of_classes,no_of_layers,no_of_neurons_in_each_layer)
    max_epochs = 15
    eta = 0.001
    beta1 = 0.9
    beta2 = 0.99
    delta = 1e-5
    grads = {}
    for i in thetas.keys():
        grads[i] = 0
    for t in range(max_epochs):
        #previous_update
        ut = {}
        vt = {}
        for i in thetas.keys():
            ut[i] = 0
            vt[i] = 0
        params_look_ahead = {}
        for x in range(0,X_train.shape[0],batchsize):
            if(optimizer == 'nesterov'):
                for i in thetas.keys():
                    params_look_ahead[i] = thetas[i] - beta*ut[i]
                activation,preactivation = feed_forward(X_train[x:x+batchsize],thetas,no_of_layers)
                grads = back_propagate(activation,preactivation,params_look_ahead,Y_train[x:x+batchsize])
                for i in thetas.keys():
                    ut[i] = beta1*ut[i] + (1-beta1)*grads[i]
                    thetas[i] = thetas[i] - eta*ut[i]
            elif(optimizer == 'mgd'):
                activation,preactivation = feed_forward(X_train[x:x+batchsize],thetas,no_of_layers)
                grads = back_propagate(activation,preactivation,thetas,Y_train[x:x+batchsize])     
                for i in thetas.keys():
                    ut[i] = beta1*ut[i] + grads[i]
                    thetas[i] = thetas[i] - eta*ut[i]
            elif(optimizer == 'sgd'):
                activation,preactivation = feed_forward(X_train[x:x+batchsize],thetas,no_of_layers)
                grads = back_propagate(activation,preactivation,thetas,Y_train[x:x+batchsize])
                for i in thetas.keys():
                    thetas[i] = thetas[i] - eta*grads[i]
            elif(optimizer == 'RMSprop'):
                activation,preactivation = feed_forward(X_train[x:x+batchsize],thetas,no_of_layers)
                grads = back_propagate(activation,preactivation,thetas,Y_train[x:x+batchsize])
                for i in thetas.keys():
                    ut[i] = beta1*ut[i] + (1-beta1)*np.multiply(grads[i],grads[i])
                    thetas[i] = thetas[i] - eta*grads[i]/((np.sqrt(ut[i])+delta))
            elif(optimizer == 'adam'):
                activation,preactivation = feed_forward(X_train[x:x+batchsize],thetas,no_of_layers)
                grads = back_propagate(activation,preactivation,thetas,Y_train[x:x+batchsize])
                for i in thetas.keys():
                    ut[i] = beta1*ut[i] + (1-beta1)*grads[i]
                    uthat = ut[i]/(1 - pow(beta1,t+1))
                    vt[i] = beta2*vt[i] + (1-beta2)*np.multiply(grads[i],grads[i])
                    vthat = vt[i]/(1 - pow(beta2,t+1))
                    thetas[i] = thetas[i] - eta*uthat/((np.sqrt(vthat) + delta))
            elif(optimizer == 'nadam'):
                activation,preactivation = feed_forward(X_train[x:x+batchsize],thetas,no_of_layers)
                grads = back_propagate(activation,preactivation,thetas,Y_train[x:x+batchsize])
                for i in thetas.keys():
                    ut[i] = beta1*ut[i] + (1-beta1)*grads[i]
                    uthat = ut[i]/(1 - pow(beta1,t+1))
                    vt[i] = beta2*vt[i] + (1-beta2)*np.multiply(grads[i],grads[i])
                    vthat = vt[i]/(1 - pow(beta2,t+1))
                    thetas[i] = thetas[i] - (eta*(beta1*uthat + (1-beta1)*grads[i]/(1-pow(beta1,t+1))))/(np.sqrt(vthat) + delta)
        ac,pre = feed_forward(X_train[:],thetas,no_of_layers)
        print_accuracy(ac,Y_train)   
            
    return thetas
    

In [263]:
batchsize = 32
no_of_features = 784
no_of_classes = 10
no_of_layers = 5
no_of_neurons_in_each_layer = [128,128,128]

In [264]:
def print_accuracy(yhat,y):
    correct = 0
    for x in range(len(yhat[-1])):
        if(np.argmax(yhat[-1][x]) == y[x]):
            correct+=1
    print(correct/len(y)*100)

In [274]:
t = gradient_descent(optimizer='nadam')

86.46458333333334
87.73333333333333
88.42291666666667
89.12916666666668
89.60625
90.10208333333334
90.5
90.84791666666666
91.19583333333333
91.52083333333333
91.80833333333334
92.06041666666667
92.24375
92.48333333333333
92.72500000000001


In [275]:
a,h = feed_forward(X_test[:],t,no_of_layers)

In [276]:
print_accuracy(a,Y_test)

88.03


In [277]:
a,h = feed_forward(X_train[:],t,no_of_layers)

In [278]:
print_accuracy(a,Y_train)

92.72500000000001
