The network is trained using minibatch stochastic gradient descent.


Network specification:

1.   Input layer - one hidden layer - output layer
2.   Activation functions: for hidden layer "ReLU" and for output layer "softmax"
3.   Loss function: categorical cross-entropy

In [6]:
import numpy as np
from keras.datasets import mnist
from matplotlib import pyplot
import math
from keras.utils.np_utils import to_categorical
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [234]:
from sklearn.datasets import fetch_openml

x, y = fetch_openml('mnist_784', version=1, return_X_y=True)

In [235]:
#x = (x/255).astype('float32')
y = to_categorical(y)

In [236]:
train_X, test_X , train_y, test_y = train_test_split(x, y, test_size=0.15, random_state=42)
print('X_train: ' + str(train_X.shape))
print('Y_train: ' + str(train_y.shape))

X_train: (59500, 784)
Y_train: (59500, 10)


In [237]:
x1 = x
y1 = y

In [256]:
def softmax(X):
    exps = np.exp(X - np.max(X))
    return exps/np.sum(exps,axis=0)

def d_softmax(x):
    exp_element=np.exp(x-x.max())
    return exp_element/np.sum(exp_element,axis=0)*(1-exp_element/np.sum(exp_element,axis=0))

def D_relu(x):
    x[x <= 0] = 0
    x[x > 0] = 1
    return x

def relu(X):
    return np.maximum(0.0, X)

def L2(w1, w2, rate = 0.00001):
    w1 = np.array(w1)**2
    w2 = np.array(w2)**2
    return (rate * (np.sum(w1) + np.sum(w2)))/2

def sigmoid(x):
    return 1/(np.exp(-x)+1)    

def d_sigmoid(x):
    return (np.exp(-x))/((np.exp(-x)+1)**2)

def loss(predicted, target):
    return target * np.log(predicted + 1e-8)

def d_loss(predicted, target):
    return predicted - target

In [257]:
def generateWeights(s, e):
    we = []
    for w in range(0, e):
        f = []
        for u in range(0, s):
            f.append(np.random.rand())
        we.append(f)  
    return we

def batchGenarator(data, target,  batchSize = 10):
    sample=np.random.randint(0,data.shape[0],size=(batchSize))
    x=data[sample]
    y=target[sample]
    return x, y

In [496]:
def evaluate(weightsInputLayer,
             weightsOutputLayer,
             data, target, alpha, logs = False):

    H, A, H_OUTPUT, A_OUTPUT, loss_entropy = forward_pass(weightsInputLayer,
                                                          weightsOutputLayer,
                                                          data, target, logs = logs)
    pred = np.argmax(A_OUTPUT, axis=1)
    Y = np.argmax(target, axis=1)
    accuracy = (pred == Y).mean()
    
    return accuracy

def getNumberForBatch(data, batchSize):
    all_batches = data.shape[0] // batchSize 
    if data.shape[0] % batchSize != 0:
        all_batches += 1    
        
    return all_batches

def L2_regu(weightsInputLayer, lambda_L2 = 0.0001, isL2=False):
    return weightsInputLayer * lambda_L2

def network_init(data, outputLayerNeurons = 3, hiddenLayerNeurons = 5):
    weightsOutputLayer = generateWeights(outputLayerNeurons, hiddenLayerNeurons)
    weightsInputLayer = generateWeights(hiddenLayerNeurons, len(data[0][:]))
    return weightsInputLayer, weightsOutputLayer

def forward_pass(weightsInputLayer,
                 weightsOutputLayer,
                 data, target,
                 isL2 = True, L2rate = 0.0001, logs = False):
    
    H = data.dot(weightsInputLayer)
    if logs:
        print("H: " + str(H.shape))     

    A = relu(H)
    if logs:                
        print("A: " + str(A.shape))
          
    H_OUTPUT = A.dot(weightsOutputLayer)
    if logs:
        print("H OUTPUT: " + str(H_OUTPUT.shape)) 
            
    A_OUTPUT = softmax(H_OUTPUT)
    if logs:
        print("A OUTPUT: " + str(A_OUTPUT.shape))
        
    if isL2:
        L2_ = L2(weightsOutputLayer, weightsInputLayer, L2rate)
        
    loss_entropy = loss(A_OUTPUT, target)
    if logs:
        print("Loss: " + str(loss_entropy))
        
    return H, A, H_OUTPUT, A_OUTPUT, loss_entropy

def backward_pass(H, A,
                  H_OUTPUT,
                  A_OUTPUT,
                  loss,
                   weightsInputLayer,
                   weightsOutputLayer,
                  data, target, alpha,
                  opt, hyper_param,
                  logs = False,
                  isL2 = False,
                  lambda_L2 = 0.001):
    
    REL_D = D_relu(H)
    if logs:
        print("REL_D: " + str(REL_D.shape))
        
    SOFT_D = d_softmax(H_OUTPUT)
    if logs:
        print("SOFT_D: " + str(SOFT_D.shape))
        
    LOSS_D = d_loss(A_OUTPUT, target)
    if logs:
        print("LOSS_D: " + str(LOSS_D.shape))   
        
    g_out, g_input = copute_gradient(target,
                          LOSS_D,
                          A,
                          A_OUTPUT,
                          REL_D,
                          SOFT_D,
                          weightsInputLayer,
                          weightsOutputLayer,
                          data, alpha, logs,
                           hyper_param, opt,
                           isL2 = isL2,
                           lambda_L2 = lambda_L2)
    
    return REL_D, SOFT_D, LOSS_D, g_out, g_input
                
def copute_gradient(target,
                          loss,
                          A,
                          A_OUTPUT,
                          REL_D,
                          SOFT_D,
                          weightsInputLayer,
                          weightsOutputLayer,
                          data, alpha, logs,
                          hyper_param, 
                          opt = 'sgd',
                          isL2 = False,
                          lambda_L2 = 0.001):
    
    arrL2_IN = L2_regu(weightsInputLayer, lambda_L2)
    arrL2_Out = L2_regu(weightsOutputLayer, lambda_L2) 
    
    gradient = loss * SOFT_D
    gradient_out = (A.T @ gradient) 
    if isL2:
        gradient_out + arrL2_Out
    
    gradient_rel = (np.array(weightsOutputLayer).dot(gradient.T)).T * REL_D
    gradient_input =  data.T @ gradient_rel  
    if isL2:
        gradient_input + arrL2_IN
        
    op1 = hyper_param[0]
    op2 = hyper_param[1]
    
    if opt == 'momentum':
        gradient_out, v1 =  momentum(gradient_out, alpha, op1[0], rho = 0.9)
        gradient_input, v2 =  momentum(gradient_input, alpha, op2[0], rho = 0.9)
        hyper_param[0][0] = v1
        hyper_param[1][0] = v2
        
    elif  (opt == 'adam'):
        gradient_out, first_moment, second_moment =  adam(gradient_out, alpha, op1[0], op1[1], b1 = 0.9, b2 = 0.99)
        gradient_input, first_moment1, second_moment1 =  adam(gradient_input, alpha, op2[0], op2[1], b1 = 0.9, b2 = 0.99)
        hyper_param = [[first_moment, second_moment], [first_moment1, second_moment1]]

    elif  opt == 'rmspro':
        gradient_out, ad = rmspro(gradient_out, alpha, op1[0], t = 0.5)
        gradient_input, ad1 = rmspro(gradient_input, alpha, op2[0], t = 0.5)
        hyper_param[0][0] = ad
        hyper_param[1][0] = ad1
        
    elif  opt == 'adaGrad':
        gradient_out, ad = adaGrad(gradient_out, alpha, op1[0], t = 0.5)
        gradient_input, ad1 = adaGrad(gradient_input, alpha, op2[0], t = 0.5)
        hyper_param[0][0] = ad
        hyper_param[1][0] = ad1    
        
    else :
        gradient_out = alpha * gradient_out
        gradient_input = alpha * gradient_input
        
    return gradient_out, gradient_input
###

def momentum(dx, alpha = 0.001, vx = 0, rho = 0.9):
    vx = rho * vx + dx
    x = alpha * vx
    return x, vx

def adam(dx, alpha, first_moment = 0, second_moment = 0, b1 = 0.9, b2 = 0.99):      
    first_moment = b1 * first_moment + (1 - b1) * dx
    second_moment = b2 * second_moment + (1 - b2) * dx * dx
    x = alpha * first_moment/(np.sqrt(second_moment) + 1e-7)
    return x, first_moment, second_moment  

def rmspro(dx, alpha, grad_squ = 0, t = 0.5):
    decay_rate = alpha - t * alpha
    grad_squ = decay_rate * grad_squ + (1 - decay_rate) * dx * dx
    x = alpha * dx / (np.sqrt(grad_squ) + 1e-7)
    return x, grad_squ

def adaGrad(dx, alpha, grad_squ = 0, t = 0.5):
    grad_squ += dx * dx
    x = alpha * dx / (np.sqrt(grad_squ) + 1e-7)
    return x, grad_squ

###
    
def train(data, target, batchSize, batches,
          weightsInputLayer, weightsOutputLayer, epochs, 
          logs, alpha = 0.001, opt = "rmspro", isL2 = False, lambda_L2 = 0.001):
    
    loss_w, accuracies = [], []
    hyper_param = [[0, 0],[0, 0]]
    
    for epoch in range(epochs):
        if logs:
            print("\n")
            print("Epoch: " + str(epoch))

        data_batch, target_batch  =  batchGenarator(data.copy(), target.copy(),
                                                         batchSize)

        H, A, H_OUTPUT, A_OUTPUT, loss_E = forward_pass(weightsInputLayer.copy(),
                                                            weightsOutputLayer.copy(),
                                                            data_batch.copy(),
                                                            target_batch.copy(),                                                
                                                            logs = logs)
        loss_w.append((loss_E.mean()))

        REL_D, SOFT_D, LOSS_D, g_out, g_input = backward_pass(H.copy(),
                                                  A.copy(),
                                                  H_OUTPUT.copy(),
                                                  A_OUTPUT.copy(),
                                                  loss_E,                                                   
                                                  weightsInputLayer.copy(),                    
                                                  weightsOutputLayer.copy(),
                                                  data_batch.copy(),
                                                  target_batch.copy(),
                                                  alpha , opt, hyper_param, logs = logs,
                                                  isL2 = isL2, lambda_L2 = lambda_L2)
            
        weightsOutputLayer -= g_out
        if logs:
            print("Weights out: " + str(weightsOutputLayer))
                
        weightsInputLayer -= g_input
        if logs:
            print("Weights in: " + str(weightsInputLayer))
                
        category=np.argmax(A_OUTPUT ,axis=1)
        y_=np.argmax(target_batch,axis=1)
        accuracy=(category==y_).mean()
        accuracies.append(accuracy)
        
        if logs:
            print("Accurancy : " + str(accuracy))
            
    return weightsInputLayer, weightsOutputLayer, loss_w, accuracies

In [634]:
logs = False
batchSize = 24
outputLayerNeurons = 10
hiddenLayerNeurons = 16

def init(x,y):
    layer=np.random.uniform(-1.,1.,size=(x,y))/np.sqrt(x*y)
    return layer.astype(np.float32)

np.random.seed(42)
l1=init(28*28,hiddenLayerNeurons)
l2=init(hiddenLayerNeurons,10)

train_X, test_X , train_y, test_y = train_test_split(x, y, test_size=0.10, random_state=11)

w1, w2 = network_init(train_X,
                               outputLayerNeurons = outputLayerNeurons,
                               hiddenLayerNeurons = hiddenLayerNeurons)

In [635]:
wi, wo, loss_, acc_ = train(train_X,
                            train_y,
                      batchSize = batchSize,
                      batches = batches,
                      weightsInputLayer = l1.copy(),
                      weightsOutputLayer = l2.copy(),
                      epochs = 10,
                      logs=True, alpha = 0.0001, opt = "sdg", isL2 = True)



Epoch: 0
H: (24, 16)
A: (24, 16)
H OUTPUT: (24, 10)
A OUTPUT: (24, 10)
Loss: [[1.73886373e-03 9.16552028e-01 1.60529690e-03 4.78940033e-03
  6.62794170e-04 6.82250236e-05 5.99606049e-04 8.09146041e-04
  1.21915596e-03 5.76416224e-05]
 [1.32872848e-03 9.36619246e-01 1.59632105e-03 1.29294396e-03
  1.17395714e-03 6.16963798e-04 1.22616775e-03 5.10740723e-04
  5.58818797e-04 4.95150506e-04]
 [6.61887046e-04 8.14446877e-05 8.52238099e-04 9.38897336e-01
  2.07321452e-03 1.44673678e-02 2.65256377e-04 1.40433828e-05
  1.20427277e-04 6.17309874e-04]
 [9.61085571e-04 8.69063556e-05 8.76830448e-01 1.32162220e-04
  1.94624712e-04 2.44038821e-03 2.60615827e-04 3.42709253e-04
  2.09860726e-03 1.42386229e-03]
 [9.72616691e-01 3.89573335e-05 3.53142103e-03 1.12122958e-03
  6.93433462e-04 8.05689059e-03 1.67662645e-03 5.54445217e-05
  2.78094931e-04 8.46219313e-03]
 [6.32097805e-04 4.71029482e-05 8.69186762e-01 6.66678427e-05
  1.59682398e-03 1.45018991e-02 8.03572838e-05 1.83893252e-04
  1.86443321

H: (24, 16)
A: (24, 16)
H OUTPUT: (24, 10)
A OUTPUT: (24, 10)
Loss: [[9.55904395e-01 1.32438293e-03 1.24587927e-04 1.28906157e-03
  5.93215332e-04 1.68608710e-04 3.83074893e-03 2.74656534e-03
  8.24530250e-04 4.86222044e-05]
 [1.36829904e-03 3.36946859e-04 5.03466738e-03 9.92724924e-04
  1.54477681e-03 1.22052688e-03 3.81772408e-04 2.22957649e-04
  9.52547724e-01 6.56377874e-04]
 [7.31170378e-04 9.62534044e-01 5.68825208e-04 9.77720400e-04
  5.07021688e-04 4.54403583e-04 1.09987412e-03 2.56063503e-03
  2.30605653e-03 6.64417393e-05]
 [2.41900613e-03 9.84281703e-01 1.44499022e-03 8.56315288e-03
  4.23789590e-04 1.86797784e-03 1.28731439e-03 5.63886529e-04
  4.04010765e-03 8.13921301e-04]
 [2.89495211e-03 2.31875219e-03 9.73700983e-01 2.03320941e-03
  3.21200703e-03 3.87621180e-04 1.37679195e-03 5.55790554e-04
  1.10592180e-03 1.33081107e-04]
 [6.55074761e-04 1.28817439e-04 3.77098097e-03 2.83894759e-05
  2.11477386e-04 2.33166220e-03 9.64315703e-01 1.87921782e-03
  5.90441858e-03 1.1534

H: (24, 16)
A: (24, 16)
H OUTPUT: (24, 10)
A OUTPUT: (24, 10)
Loss: [[1.31292336e-03 2.56072940e-04 8.31879386e-01 1.34254686e-05
  1.28324239e-03 5.94774806e-03 3.29601289e-04 7.35029997e-04
  8.73640839e-03 4.83266723e-03]
 [7.19924050e-05 1.32983234e-03 5.58920058e-03 1.91863722e-06
  8.84181796e-04 1.88337068e-04 8.03115680e-04 4.51460304e-02
  6.22360976e-03 8.32361009e-01]
 [9.11560549e-01 1.12977008e-04 6.20604815e-05 2.17068458e-04
  4.18142371e-05 3.55036676e-03 7.27716236e-03 9.85583547e-04
  5.84834102e-03 1.96438970e-04]
 [1.94855662e-03 6.67408818e-03 2.13660454e-03 1.65501988e-04
  2.88541517e-03 6.56378480e-04 1.19235648e-03 1.45888036e-03
  9.43874687e-01 1.46729367e-03]
 [7.62861956e-04 2.86596933e-03 4.54630707e-04 1.02738315e-04
  9.56296610e-01 1.75324950e-04 9.56340299e-03 5.71370622e-03
  1.23704622e-03 8.03968621e-04]
 [3.24500825e-04 1.94886074e-03 1.42557434e-03 6.43382265e-05
  9.57446757e-01 1.72061627e-04 1.98087072e-03 4.62327834e-03
  1.78239475e-03 2.0922

H: (24, 16)
A: (24, 16)
H OUTPUT: (24, 10)
A OUTPUT: (24, 10)
Loss: [[4.38463099e-03 2.52638218e-02 6.90403044e-04 2.68381070e-06
  1.09333915e-04 3.46442939e-05 7.59081577e-03 6.67603012e-01
  1.18264099e-02 3.11216444e-03]
 [2.97866719e-03 1.18158747e-03 4.53413009e-03 4.06893103e-05
  3.60820371e-03 1.49961068e-03 9.64083562e-01 9.31695402e-04
  1.74076682e-03 3.71048704e-04]
 [9.41734849e-01 5.31364369e-05 7.65431425e-04 8.03818448e-03
  8.91432416e-04 1.94146229e-03 1.93849813e-03 3.49737740e-05
  1.94663935e-04 6.53182381e-04]
 [1.35180055e-04 8.23684507e-04 2.33505486e-02 1.70485332e-07
  9.03486960e-04 1.54298083e-03 9.63161029e-04 8.87132633e-03
  6.19023756e-03 4.94210006e-01]
 [3.35942473e-03 1.06327667e-03 3.64918827e-03 2.83030467e-05
  2.82776433e-03 8.59683630e-04 9.61158153e-01 6.61312126e-04
  3.19867703e-03 1.81438341e-03]
 [1.32565503e-03 5.09803182e-05 2.51936046e-04 2.05573194e-01
  6.13545579e-03 6.45966961e-03 7.44258962e-04 1.07114909e-06
  6.68225620e-06 5.3834

H: (24, 16)
A: (24, 16)
H OUTPUT: (24, 10)
A OUTPUT: (24, 10)
Loss: [[9.38260686e-04 9.14956059e-04 5.93757577e-04 1.75649318e-05
  1.85253228e-04 1.20362187e-04 1.94404994e-03 2.04382070e-03
  8.89687506e-01 6.44875616e-04]
 [2.41923873e-03 6.45248191e-05 3.05030548e-03 9.89746256e-01
  3.58360400e-04 4.17518906e-03 3.82860868e-04 8.17845638e-05
  8.76578720e-04 1.78120579e-04]
 [5.28145729e-03 6.16410049e-04 1.38603278e-04 5.69566563e-01
  4.94676837e-02 6.57356136e-04 6.39829260e-04 5.51594410e-06
  6.23746088e-05 5.08723315e-05]
 [2.63233929e-03 2.76386165e-04 4.01106324e-04 2.61312313e-05
  2.65044270e-05 9.70094904e-01 1.33696099e-03 7.57290851e-04
  4.00028580e-03 2.50320119e-04]
 [2.36901049e-04 2.16358799e-02 1.07753857e-03 1.46444239e-06
  1.65760718e-03 8.48273475e-05 6.35712676e-03 5.20401877e-02
  1.95612680e-03 8.84625175e-01]
 [9.65268630e-01 2.12992513e-04 1.34932734e-03 3.25484768e-05
  4.24020660e-04 7.01491402e-04 1.44705189e-03 7.31822454e-04
  1.03532472e-03 5.8038

In [636]:
EVE = evaluate(wi.copy(), wo.copy(), train_X, train_y, alpha = 0.001, logs = logs)    
print("Accuracy : " + str(EVE))

Accuracy : 0.17095238095238094


In [520]:
wi, wo, loss_, acc_ = train(train_X,
                            train_y,
                      batchSize = batchSize,
                      batches = batches,
                      weightsInputLayer = l1.copy(),
                      weightsOutputLayer = l2.copy(),
                      epochs = 100,
                      logs=False, alpha = 0.0001, opt = "sdg", isL2 = True)

In [521]:
EVE = evaluate(wi.copy(), wo.copy(), train_X, train_y, alpha = 0.001, logs = logs)    
print("Accuracy : " + str(EVE))

Accuracy : 0.3578253968253968


In [575]:
wi, wo, loss_, acc_ = train(train_X,
                            train_y,
                      batchSize = batchSize,
                      batches = batches,
                      weightsInputLayer = l1.copy(),
                      weightsOutputLayer = l2.copy(),
                      epochs = 200,
                      logs=False, alpha = 0.0001, opt = "sdg")

In [576]:
EVE = evaluate(wi.copy(), wo.copy(), train_X, train_y, alpha = 0.001, logs = logs)    
print("Accuracy : " + str(EVE))

Accuracy : 0.26165079365079363


In [577]:
wi, wo, loss_, acc_ = train(train_X,
                            train_y,
                      batchSize = batchSize,
                      batches = batches,
                      weightsInputLayer = l1.copy(),
                      weightsOutputLayer = l2.copy(),
                      epochs = 500,
                      logs=False, alpha = 0.0001, opt = "sdg")

In [578]:
EVE = evaluate(wi.copy(), wo.copy(), train_X, train_y, alpha = 0.001, logs = logs)    
print("Accuracy : " + str(EVE))

Accuracy : 0.305952380952381


In [579]:
wi, wo, loss_, acc_ = train(train_X,
                            train_y,
                      batchSize = batchSize,
                      batches = batches,
                      weightsInputLayer = l1.copy(),
                      weightsOutputLayer = l2.copy(),
                      epochs = 300,
                      logs=False, alpha = 0.0001, opt = "sdg", isL2 = True)

In [580]:
EVE = evaluate(wi.copy(), wo.copy(), train_X, train_y, alpha = 0.001, logs = logs)    
print("Accuracy : " + str(EVE))

Accuracy : 0.24722222222222223


In [528]:
wi, wo, loss_, acc_ = train(train_X,
                            train_y,
                      batchSize = batchSize,
                      batches = batches,
                      weightsInputLayer = l1.copy(),
                      weightsOutputLayer = l2.copy(),
                      epochs = 500,
                      logs=False, alpha = 0.0001, opt = "adam")

EVE = evaluate(wi.copy(), wo.copy(), train_X, train_y, alpha = 0.001, logs = logs)    
print("Accuracy : " + str(EVE))

Accuracy : 0.3902380952380952


In [529]:
wi, wo, loss_, acc_ = train(train_X,
                            train_y,
                      batchSize = batchSize,
                      batches = batches,
                      weightsInputLayer = l1.copy(),
                      weightsOutputLayer = l2.copy(),
                      epochs = 100,
                      logs=False, alpha = 0.001, opt = "adam")

EVE = evaluate(wi.copy(), wo.copy(), train_X, train_y, alpha = 0.001, logs = logs)    
print("Accuracy : " + str(EVE))

Accuracy : 0.18752380952380954


In [582]:
wi, wo, loss_, acc_ = train(train_X,
                            train_y,
                      batchSize = batchSize,
                      batches = batches,
                      weightsInputLayer = l1.copy(),
                      weightsOutputLayer = l2.copy(),
                      epochs = 100,
                      logs=False, alpha = 0.001, opt = "adam")

EVE = evaluate(wi.copy(), wo.copy(), train_X, train_y, alpha = 0.001, logs = logs)    
print("Accuracy : " + str(EVE))

Accuracy : 0.378047619047619


In [584]:
wi, wo, loss_, acc_ = train(train_X,
                            train_y,
                      batchSize = batchSize,
                      batches = batches,
                      weightsInputLayer = l1.copy(),
                      weightsOutputLayer = l2.copy(),
                      epochs = 50,
                      logs=False, alpha = 0.0001, opt = "momentum")

EVE = evaluate(wi.copy(), wo.copy(), train_X, train_y, alpha = 0.001, logs = logs)    
print("Accuracy : " + str(EVE))

Accuracy : 0.2614761904761905


In [534]:
wi, wo, loss_, acc_ = train(train_X,
                            train_y,
                      batchSize = batchSize,
                      batches = batches,
                      weightsInputLayer = l1.copy(),
                      weightsOutputLayer = l2.copy(),
                      epochs = 500,
                      logs=False, alpha = 0.0001, opt = "rmspro")

EVE = evaluate(wi.copy(), wo.copy(), train_X, train_y, alpha = 0.001, logs = logs)    
print("Accuracy : " + str(EVE))

Accuracy : 0.4408888888888889


In [585]:
wi, wo, loss_, acc_ = train(train_X,
                            train_y,
                      batchSize = batchSize,
                      batches = batches,
                      weightsInputLayer = l1.copy(),
                      weightsOutputLayer = l2.copy(),
                      epochs = 30,
                      logs=False, alpha = 0.0001, opt = "momentum")

EVE = evaluate(wi.copy(), wo.copy(), train_X, train_y, alpha = 0.001, logs = logs)    
print("Accuracy : " + str(EVE))

Accuracy : 0.149


In [537]:
wi, wo, loss_, acc_ = train(train_X,
                            train_y,
                      batchSize = batchSize,
                      batches = batches,
                      weightsInputLayer = l1.copy(),
                      weightsOutputLayer = l2.copy(),
                      epochs = 500,
                      logs=False, alpha = 0.0001, opt = "adaGrad")

EVE = evaluate(wi.copy(), wo.copy(), train_X, train_y, alpha = 0.001, logs = logs)    
print("Accuracy : " + str(EVE))

Accuracy : 0.3964920634920635


In [538]:
wi, wo, loss_, acc_ = train(train_X,
                            train_y,
                      batchSize = batchSize,
                      batches = batches,
                      weightsInputLayer = l1.copy(),
                      weightsOutputLayer = l2.copy(),
                      epochs = 100,
                      logs=False, alpha = 0.0001, opt = "adaGrad")

EVE = evaluate(wi.copy(), wo.copy(), train_X, train_y, alpha = 0.001, logs = logs)    
print("Accuracy : " + str(EVE))

Accuracy : 0.24171428571428571


In [586]:
wi, wo, loss_, acc_ = train(train_X,
                            train_y,
                      batchSize = batchSize,
                      batches = batches,
                      weightsInputLayer = l1.copy(),
                      weightsOutputLayer = l2.copy(),
                      epochs = 30,
                      logs=False, alpha = 0.01, opt = "adaGrad")

EVE = evaluate(wi.copy(), wo.copy(), train_X, train_y, alpha = 0.001, logs = logs)    
print("Accuracy : " + str(EVE))

Accuracy : 0.13766666666666666


In [631]:
wi, wo, loss_, acc_ = train(train_X,
                            train_y,
                      batchSize = batchSize,
                      batches = batches,
                      weightsInputLayer = l1.copy(),
                      weightsOutputLayer = l2.copy(),
                      epochs = 30,
                      logs=False, alpha = 0.0001, opt = "momentum")

EVE = evaluate(wi.copy(), wo.copy(), train_X, train_y, alpha = 0.001, logs = logs)    
print("Accuracy : " + str(EVE))

Accuracy : 0.21384126984126983


In [632]:
wi, wo, loss_, acc_ = train(train_X,
                            train_y,
                      batchSize = batchSize,
                      batches = batches,
                      weightsInputLayer = l1.copy(),
                      weightsOutputLayer = l2.copy(),
                      epochs = 200,
                      logs=False, alpha = 0.0001, opt = "adaGrad")

EVE = evaluate(wi.copy(), wo.copy(), train_X, train_y, alpha = 0.001, logs = logs)    
print("Accuracy : " + str(EVE))

Accuracy : 0.36823809523809525


In [542]:
wi, wo, loss_, acc_ = train(train_X,
                            train_y,
                      batchSize = batchSize,
                      batches = batches,
                      weightsInputLayer = l1.copy(),
                      weightsOutputLayer = l2.copy(),
                      epochs = 50,
                      logs=False, alpha = 0.001, opt = "adaGrad")

EVE = evaluate(wi.copy(), wo.copy(), train_X, train_y, alpha = 0.001, logs = logs)    
print("Accuracy : " + str(EVE))

Accuracy : 0.4603968253968254


In [543]:
wi, wo, loss_, acc_ = train(train_X,
                            train_y,
                      batchSize = batchSize,
                      batches = batches,
                      weightsInputLayer = l1.copy(),
                      weightsOutputLayer = l2.copy(),
                      epochs = 150,
                      logs=False, alpha = 0.0001, opt = "rmspro")

EVE = evaluate(wi.copy(), wo.copy(), train_X, train_y, alpha = 0.001, logs = logs)    
print("Accuracy : " + str(EVE))

Accuracy : 0.4935079365079365


In [544]:
wi, wo, loss_, acc_ = train(train_X,
                            train_y,
                      batchSize = batchSize,
                      batches = batches,
                      weightsInputLayer = l1.copy(),
                      weightsOutputLayer = l2.copy(),
                      epochs = 50,
                      logs=False, alpha = 0.001, opt = "rmspro")

EVE = evaluate(wi.copy(), wo.copy(), train_X, train_y, alpha = 0.001, logs = logs)    
print("Accuracy : " + str(EVE))

Accuracy : 0.21787301587301589


In [545]:
wi, wo, loss_, acc_ = train(train_X,
                            train_y,
                      batchSize = batchSize,
                      batches = batches,
                      weightsInputLayer = l1.copy(),
                      weightsOutputLayer = l2.copy(),
                      epochs = 200,
                      logs=False, alpha = 0.0001, opt = "rmspro")

EVE = evaluate(wi.copy(), wo.copy(), train_X, train_y, alpha = 0.001, logs = logs)    
print("Accuracy : " + str(EVE))

Accuracy : 0.5929523809523809


In [638]:
wi, wo, loss_, acc_ = train(train_X,
                            train_y,
                      batchSize = batchSize,
                      batches = batches,
                      weightsInputLayer = l1.copy(),
                      weightsOutputLayer = l2.copy(),
                      epochs = 200,
                      logs=False, alpha = 0.0001, opt = "adam")

EVE = evaluate(wi.copy(), wo.copy(), train_X, train_y, alpha = 0.001, logs = logs)    
print("Accuracy : " + str(EVE))

Accuracy : 0.3832857142857143


In [546]:
wi, wo, loss_, acc_ = train(train_X,
                            train_y,
                      batchSize = batchSize,
                      batches = batches,
                      weightsInputLayer = l1.copy(),
                      weightsOutputLayer = l2.copy(),
                      epochs = 50,
                      logs=False, alpha = 0.001, opt = "rmspro")

EVE = evaluate(wi.copy(), wo.copy(), train_X, train_y, alpha = 0.001, logs = logs)    
print("Accuracy : " + str(EVE))

Accuracy : 0.23766666666666666


In [637]:
wi, wo, loss_, acc_ = train(train_X,
                            train_y,
                      batchSize = batchSize,
                      batches = batches,
                      weightsInputLayer = l1.copy(),
                      weightsOutputLayer = l2.copy(),
                      epochs = 200,
                      logs=False, alpha = 0.0001, opt = "adaGrad")

EVE = evaluate(wi.copy(), wo.copy(), train_X, train_y, alpha = 0.001, logs = logs)    
print("Accuracy : " + str(EVE))

Accuracy : 0.5407301587301587


In [587]:
logs = False
batchSize = 256
outputLayerNeurons = 10
hiddenLayerNeurons = 32

train_X, test_X , train_y, test_y = train_test_split(x, y, test_size=0.10, random_state=11)

w1, w2 = network_init(train_X,
                               outputLayerNeurons = outputLayerNeurons,
                               hiddenLayerNeurons = hiddenLayerNeurons)


np.random.seed(42)
l1=init(28*28,hiddenLayerNeurons)
l2=init(hiddenLayerNeurons,10)

In [548]:
wi, wo, loss_, acc_ = train(train_X,
                            train_y,
                      batchSize = batchSize,
                      batches = batches,
                      weightsInputLayer = l1.copy(),
                      weightsOutputLayer = l2.copy(),
                      epochs = 200,
                      logs=False, alpha = 0.0001, opt = "sgd")

EVE = evaluate(wi.copy(), wo.copy(), train_X, train_y, alpha = 0.001, logs = logs)    
print("Accuracy : " + str(EVE))

Accuracy : 0.3757142857142857


In [588]:
wi, wo, loss_, acc_ = train(train_X,
                            train_y,
                      batchSize = batchSize,
                      batches = batches,
                      weightsInputLayer = l1.copy(),
                      weightsOutputLayer = l2.copy(),
                      epochs = 300,
                      logs=False, alpha = 0.0001, opt = "sgd")

EVE = evaluate(wi.copy(), wo.copy(), train_X, train_y, alpha = 0.001, logs = logs)    
print("Accuracy : " + str(EVE))

Accuracy : 0.264031746031746


In [589]:
wi, wo, loss_, acc_ = train(train_X,
                            train_y,
                      batchSize = batchSize,
                      batches = batches,
                      weightsInputLayer = l1.copy(),
                      weightsOutputLayer = l2.copy(),
                      epochs = 200,
                      logs=False, alpha = 0.0001, opt = "sgd")

EVE = evaluate(wi.copy(), wo.copy(), train_X, train_y, alpha = 0.001, logs = logs)    
print("Accuracy : " + str(EVE))

Accuracy : 0.30842857142857144


In [614]:
logs = False
batchSize = 128
outputLayerNeurons = 10
hiddenLayerNeurons = 32

train_X, test_X , train_y, test_y = train_test_split(x, y, test_size=0.10, random_state=11)

w1, w2 = network_init(train_X,
                               outputLayerNeurons = outputLayerNeurons,
                               hiddenLayerNeurons = hiddenLayerNeurons)


np.random.seed(42)
l1=init(28*28,hiddenLayerNeurons)
l2=init(hiddenLayerNeurons,10)

In [603]:
wi, wo, loss_, acc_ = train(train_X,
                            train_y,
                      batchSize = batchSize,
                      batches = batches,
                      weightsInputLayer = l1.copy(),
                      weightsOutputLayer = l2.copy(),
                      epochs = 100,
                      logs=False, alpha = 0.0001, opt = "sgd")

EVE = evaluate(wi.copy(), wo.copy(), train_X, train_y, alpha = 0.001, logs = logs)    
print("Accuracy : " + str(EVE))

Accuracy : 0.35603174603174603


In [604]:
wi, wo, loss_, acc_ = train(train_X,
                            train_y,
                      batchSize = batchSize,
                      batches = batches,
                      weightsInputLayer = l1.copy(),
                      weightsOutputLayer = l2.copy(),
                      epochs = 200,
                      logs=False, alpha = 0.0001, opt = "sgd", isL2 = True)

EVE = evaluate(wi.copy(), wo.copy(), train_X, train_y, alpha = 0.001, logs = logs)    
print("Accuracy : " + str(EVE))

Accuracy : 0.34525396825396826


In [615]:
logs = False
batchSize = 64
outputLayerNeurons = 10
hiddenLayerNeurons = 32

train_X, test_X , train_y, test_y = train_test_split(x, y, test_size=0.10, random_state=11)

w1, w2 = network_init(train_X,
                               outputLayerNeurons = outputLayerNeurons,
                               hiddenLayerNeurons = hiddenLayerNeurons)

np.random.seed(42)
l1=init(28*28,hiddenLayerNeurons)
l2=init(hiddenLayerNeurons,10)

In [607]:
wi, wo, loss_, acc_ = train(train_X,
                            train_y,
                      batchSize = batchSize,
                      batches = batches,
                      weightsInputLayer = l1.copy(),
                      weightsOutputLayer = l2.copy(),
                      epochs = 200,
                      logs=False, alpha = 0.0001, opt = "sgd")

EVE = evaluate(wi.copy(), wo.copy(), train_X, train_y, alpha = 0.001, logs = logs)    
print("Accuracy : " + str(EVE))

Accuracy : 0.20803174603174604


In [628]:
logs = False
batchSize = 256
outputLayerNeurons = 10
hiddenLayerNeurons = 128

train_X, test_X , train_y, test_y = train_test_split(x, y, test_size=0.10, random_state=11)

w1, w2 = network_init(train_X,
                               outputLayerNeurons = outputLayerNeurons,
                               hiddenLayerNeurons = hiddenLayerNeurons)


np.random.seed(42)
l1=init(28*28,hiddenLayerNeurons)
l2=init(hiddenLayerNeurons,10)

In [618]:
wi, wo, loss_, acc_ = train(train_X,
                            train_y,
                      batchSize = batchSize,
                      batches = batches,
                      weightsInputLayer = l1.copy(),
                      weightsOutputLayer = l2.copy(),
                      epochs = 200,
                      logs=False, alpha = 0.0001, opt = "adaGrad", isL2 = True)

EVE = evaluate(wi.copy(), wo.copy(), train_X, train_y, alpha = 0.001, logs = logs)    
print("Accuracy : " + str(EVE))

Accuracy : 0.3051111111111111


In [619]:
wi, wo, loss_, acc_ = train(train_X,
                            train_y,
                      batchSize = batchSize,
                      batches = batches,
                      weightsInputLayer = l1.copy(),
                      weightsOutputLayer = l2.copy(),
                      epochs = 100,
                      logs=False, alpha = 0.0001, opt = "adaGrad", isL2 = True)

EVE = evaluate(wi.copy(), wo.copy(), train_X, train_y, alpha = 0.001, logs = logs)    
print("Accuracy : " + str(EVE))

Accuracy : 0.25066666666666665


In [621]:
wi, wo, loss_, acc_ = train(train_X,
                            train_y,
                      batchSize = batchSize,
                      batches = batches,
                      weightsInputLayer = l1.copy(),
                      weightsOutputLayer = l2.copy(),
                      epochs = 800,
                      logs=False, alpha = 0.00001, opt = "sgd", isL2 = True)

EVE = evaluate(wi.copy(), wo.copy(), train_X, train_y, alpha = 0.001, logs = logs)    
print("Accuracy : " + str(EVE))

Accuracy : 0.3078888888888889


In [626]:
logs = False
batchSize = 64
outputLayerNeurons = 10
hiddenLayerNeurons = 16

train_X, test_X , train_y, test_y = train_test_split(x, y, test_size=0.10, random_state=11)

w1, w2 = network_init(train_X,
                               outputLayerNeurons = outputLayerNeurons,
                               hiddenLayerNeurons = hiddenLayerNeurons)

np.random.seed(42)
l1=init(28*28,hiddenLayerNeurons)
l2=init(hiddenLayerNeurons,10)

In [624]:
wi, wo, loss_, acc_ = train(train_X,
                            train_y,
                      batchSize = batchSize,
                      batches = batches,
                      weightsInputLayer = l1.copy(),
                      weightsOutputLayer = l2.copy(),
                      epochs = 200,
                      logs=False, alpha = 0.0001, opt = "sgd", isL2 = False)

EVE = evaluate(wi.copy(), wo.copy(), train_X, train_y, alpha = 0.001, logs = logs)    
print("Accuracy : " + str(EVE))

Accuracy : 0.2284920634920635


In [567]:
wi, wo, loss_, acc_ = train(train_X,
                            train_y,
                      batchSize = batchSize,
                      batches = batches,
                      weightsInputLayer = l1.copy(),
                      weightsOutputLayer = l2.copy(),
                      epochs = 100,
                      logs=False, alpha = 0.0001, opt = "sgd", isL2 = False)

EVE = evaluate(wi.copy(), wo.copy(), train_X, train_y, alpha = 0.001, logs = logs)    
print("Accuracy : " + str(EVE))

Accuracy : 0.34485714285714286


In [None]:
logs = False
batchSize = 32
outputLayerNeurons = 10
hiddenLayerNeurons = 16

train_X, test_X , train_y, test_y = train_test_split(x, y, test_size=0.10, random_state=11)

w1, w2 = network_init(train_X,
                               outputLayerNeurons = outputLayerNeurons,
                               hiddenLayerNeurons = hiddenLayerNeurons)

np.random.seed(42)
l1=init(28*28,hiddenLayerNeurons)
l2=init(hiddenLayerNeurons,10)

In [570]:
wi, wo, loss_, acc_ = train(train_X,
                            train_y,
                      batchSize = batchSize,
                      batches = batches,
                      weightsInputLayer = l1.copy(),
                      weightsOutputLayer = l2.copy(),
                      epochs = 400,
                      logs=False, alpha = 0.0001, opt = "sgd", isL2 = False)

EVE = evaluate(wi.copy(), wo.copy(), train_X, train_y, alpha = 0.001, logs = logs)    
print("Accuracy : " + str(EVE))

Accuracy : 0.48103174603174603


In [613]:
wi, wo, loss_, acc_ = train(train_X,
                            train_y,
                      batchSize = batchSize,
                      batches = batches,
                      weightsInputLayer = l1.copy(),
                      weightsOutputLayer = l2.copy(),
                      epochs = 500,
                      logs=False, alpha = 0.0001, opt = "sgd", isL2 = True)

EVE = evaluate(wi.copy(), wo.copy(), train_X, train_y, alpha = 0.001, logs = logs)    
print("Accuracy : " + str(EVE))

Accuracy : 0.36717460317460315


In [592]:
wi, wo, loss_, acc_ = train(train_X,
                            train_y,
                      batchSize = batchSize,
                      batches = batches,
                      weightsInputLayer = l1.copy(),
                      weightsOutputLayer = l2.copy(),
                      epochs = 1000,
                      logs=False, alpha = 0.0001, opt = "rmspro", isL2 = False)

EVE = evaluate(wi.copy(), wo.copy(), train_X, train_y, alpha = 0.001, logs = logs)    
print("Accuracy : " + str(EVE))

Accuracy : 0.24653968253968253


In [593]:
wi, wo, loss_, acc_ = train(train_X,
                            train_y,
                      batchSize = batchSize,
                      batches = batches,
                      weightsInputLayer = l1.copy(),
                      weightsOutputLayer = l2.copy(),
                      epochs = 1000,
                      logs=False, alpha = 0.0001, opt = "rmspro", isL2 = True)

EVE = evaluate(wi.copy(), wo.copy(), train_X, train_y, alpha = 0.0001, logs = logs)    
print("Accuracy : " + str(EVE))

Accuracy : 0.31546031746031744


In [594]:
wi, wo, loss_, acc_ = train(train_X,
                            train_y,
                      batchSize = batchSize,
                      batches = batches,
                      weightsInputLayer = l1.copy(),
                      weightsOutputLayer = l2.copy(),
                      epochs = 500,
                      logs=False, alpha = 0.0001, opt = "adaGrad", isL2 = True)

EVE = evaluate(wi.copy(), wo.copy(), train_X, train_y, alpha = 0.001, logs = logs)    
print("Accuracy : " + str(EVE))

Accuracy : 0.17817460317460318


In [595]:
wi, wo, loss_, acc_ = train(train_X,
                            train_y,
                      batchSize = batchSize,
                      batches = batches,
                      weightsInputLayer = l1.copy(),
                      weightsOutputLayer = l2.copy(),
                      epochs = 500,
                      logs=False, alpha = 0.0001, opt = "adaGrad", isL2 = True)

EVE = evaluate(wi.copy(), wo.copy(), train_X, train_y, alpha = 0.001, logs = logs)    
print("Accuracy : " + str(EVE))

Accuracy : 0.1918888888888889


In [596]:
wi, wo, loss_, acc_ = train(train_X,
                            train_y,
                      batchSize = batchSize,
                      batches = batches,
                      weightsInputLayer = l1.copy(),
                      weightsOutputLayer = l2.copy(),
                      epochs = 1000,
                      logs=False, alpha = 0.0001, opt = "adam", isL2 = False)

EVE = evaluate(wi.copy(), wo.copy(), train_X, train_y, alpha = 0.001, logs = logs)    
print("Accuracy : " + str(EVE))

Accuracy : 0.3417142857142857


In [597]:
wi, wo, loss_, acc_ = train(train_X,
                            train_y,
                      batchSize = batchSize,
                      batches = batches,
                      weightsInputLayer = l1.copy(),
                      weightsOutputLayer = l2.copy(),
                      epochs = 500,
                      logs=False, alpha = 0.0001, opt = "sgd", isL2 = True, lambda_L2 = 0.01)

EVE = evaluate(wi.copy(), wo.copy(), train_X, train_y, alpha = 0.001, logs = logs)    
print("Accuracy : " + str(EVE))

Accuracy : 0.39776190476190476


In [599]:
wi, wo, loss_, acc_ = train(train_X,
                            train_y,
                      batchSize = batchSize,
                      batches = batches,
                      weightsInputLayer = l1.copy(),
                      weightsOutputLayer = l2.copy(),
                      epochs = 1000,
                      logs=False, alpha = 0.0001, opt = "rmspro", isL2 = True)

EVE = evaluate(wi.copy(), wo.copy(), train_X, train_y, alpha = 0.001, logs = logs)    
print("Accuracy : " + str(EVE))

Accuracy : 0.2999047619047619


In [600]:
logs = False
batchSize = 24
outputLayerNeurons = 10
hiddenLayerNeurons = 8

train_X, test_X , train_y, test_y = train_test_split(x, y, test_size=0.10, random_state=11)

w1, w2 = network_init(train_X,
                               outputLayerNeurons = outputLayerNeurons,
                               hiddenLayerNeurons = hiddenLayerNeurons)


np.random.seed(42)
l1=init(28*28,hiddenLayerNeurons)
l2=init(hiddenLayerNeurons,10)

In [454]:
wi, wo, loss_, acc_ = train(train_X,
                            train_y,
                      batchSize = batchSize,
                      batches = batches,
                      weightsInputLayer = l1.copy(),
                      weightsOutputLayer = l2.copy(),
                      epochs = 3000,
                      logs=False, alpha = 0.0001, opt = "rmspro", isL2 = True, lambda_L2 = 0.01)

EVE = evaluate(wi.copy(), wo.copy(), train_X, train_y, alpha = 0.001, logs = logs)    
print("Accurancy : " + str(EVE))

Accurancy : 0.7263809523809523


### Conclusions: 
* L2 regularization have a big impact on network - the accuracy was higher,
* The best accuracy was 72%, with mini batch size 8, 16 hidden neurons, learning rate 0.001 and optimizer RMSpro, L2 regularization with lambda 0.01 and 3000 numbers of epochs ,
* Bad impact on network had momentum optimizer with accuracy 20% after 500 epochs,
* Good impact on network had Adam optimizer with accuracy 39% after 500 epochs,
* Similar impact on network had adaGrad optimizer with accuracy 39% after 500 epochs,
* Better impact on network had RMSpro optimizer with accuracy 49% after 150 epochs
* The batch size was important - big batch size was having impact on execution time - too much computing. Smaller size had short execution time and the result was faster and in most cases better. 
* The number of hidden layer neurons also  had a big impact on the network, the smaller size was better 