# 3 Layer Network on MNIST

In [6]:
import sys         # sys is used for printing progress (not strictly necessary but helps show training progress)
import numpy as np # NumPy for numerical operations

from keras.datasets import mnist
# Keras has a built-in MNIST loader that returns (x_train, y_train), (x_test, y_test)

# ----------------------
# 1) LOAD AND PREPARE DATA
# ----------------------
(x_train, y_train), (x_test, y_test) = mnist.load_data()
# x_train, x_test: arrays of shape (N, 28, 28) containing the pixel data
# y_train, y_test: integer labels [0..9] for each image

# We only use the first 1000 images from the training set
images = x_train[0:1000].reshape(1000, 28*28) / 255
#  - reshape(1000,28*28) flattens each 28x28 image into a 784-dim vector
#  - dividing by 255 scales the pixel values to [0..1]

labels = y_train[0:1000]
# labels are still integers from 0..9

# Convert integer labels to one-hot vectors (size = 10 for digits 0..9)
one_hot_labels = np.zeros((len(labels), 10))
for i, l in enumerate(labels):
    one_hot_labels[i][l] = 1
labels = one_hot_labels
# Now 'labels' is an array of shape (1000,10),
# where each row is a one-hot vector of the correct digit.

# ----------------------
# 2) PREPARE TEST DATA (OPTIONAL FOR EVALUATION)
# ----------------------
test_images = x_test.reshape(len(x_test), 28*28) / 255
# Flatten test images to shape (N,784) and scale pixel values to [0..1]

test_labels = np.zeros((len(y_test), 10))
for i, l in enumerate(y_test):
    test_labels[i][l] = 1
# Convert test labels to one-hot vectors as well

# ----------------------
# 3) CREATE ACTIVATION FUNCTIONS
# ----------------------
np.random.seed(1)  # fix random seed for reproducibility of results

relu = lambda x: (x >= 0) * x
# ReLU: returns x if x >= 0, else 0
# Applied elementwise

relu2deriv = lambda x: (x >= 0)
# Derivative of ReLU:
# returns 1 if x >= 0, else 0
# Applied elementwise

# ----------------------
# 4) SET HYPERPARAMETERS
# ----------------------
alpha = 0.005          # learning rate
iterations = 350       # how many times we'll loop over entire training set
hidden_size = 40       # number of neurons in hidden layer
pixels_per_image = 784 # each image is 28*28
num_labels = 10        # digits 0..9

# ----------------------
# 5) INIT WEIGHTS
# ----------------------
# weights_0_1: connects input layer (784) -> hidden layer (40)
# weights_1_2: connects hidden layer (40) -> output layer (10)
# We randomly initialize them in range [-0.1, 0.1]
weights_0_1 = 0.2*np.random.random((pixels_per_image, hidden_size)) - 0.1
weights_1_2 = 0.2*np.random.random((hidden_size, num_labels)) - 0.1

# ----------------------
# 6) TRAINING LOOP
# ----------------------
for j in range(iterations):
    # We'll track total error across all 1000 samples and how many are correctly classified
    error = 0.0
    correct_cnt = 0

    # --- 6a) LOOP OVER EACH TRAINING EXAMPLE ---
    for i in range(len(images)):

        # -------------------
        # FORWARD PASS
        # -------------------
        layer_0 = images[i:i+1]    # shape (1, 784) for the current sample
        layer_1 = relu(np.dot(layer_0, weights_0_1))
        # shape (1, 40)
        # dot the input with the (784,40) weights -> (1,40), then apply ReLU

        layer_2 = np.dot(layer_1, weights_1_2)
        # shape (1, 10)
        # final output is a linear combination from the hidden layer to output

        # -------------------
        # ERROR & ACCURACY
        # -------------------
        # measure squared error for this sample's output vs. label
        error += np.sum((labels[i:i+1] - layer_2) ** 2)

        # check if predicted digit matches the true label
        # np.argmax finds the index of the largest value in each vector (the predicted digit)
        # we compare it to the index of the largest value in the actual label
        correct_cnt += int(np.argmax(layer_2) == np.argmax(labels[i:i+1]))

        # -------------------
        # BACKPROPAGATION
        # -------------------
        # 1) Output layer delta:
        layer_2_delta = (labels[i:i+1] - layer_2)
        # derivative of squared error wrt layer_2 = (target - output)

        # 2) Hidden layer delta:
        # push the delta back through weights_1_2, then apply ReLU derivative
        layer_1_delta = layer_2_delta.dot(weights_1_2.T) * relu2deriv(layer_1)

        # -------------------
        # WEIGHT UPDATES
        # -------------------
        # 1) Update weights_1_2
        # Transpose (1,40) to (40,1) so we can multiply with (1,10) for a (40,10) update
        weights_1_2 += alpha * layer_1.T.dot(layer_2_delta)

        # 2) Update weights_0_1
        # Transpose (1,784) to (784,1) so we can multiply with (1,40) for a (784,40) update
        weights_0_1 += alpha * layer_0.T.dot(layer_1_delta)

    # Print training progress after each iteration
    # \r will overwrite the line each time
    sys.stdout.write(
        "\r I:" + str(j) +
        " Train-Err:" + str(error / float(len(images)))[:5] +  # average error per sample
        " Train-Acc:" + str(correct_cnt / float(len(images)))  # fraction of correctly classified
    )


 I:349 Train-Err:0.108 Train-Acc:1.099

In [5]:
# This if-statement triggers every 10th training epoch, or on the final epoch
if (j % 10 == 0) or (j == iterations - 1):
    # Initialize counters to accumulate total test error and correct classifications
    error, correct_cnt = (0.0, 0)

    # Loop over all test images
    for i in range(len(test_images)):

        # 1) layer_0 is a single test image (reshaped to 1 x 784)
        layer_0 = test_images[i:i+1]
        
        # 2) layer_1 is the hidden layer output after applying ReLU
        #    dot product: (1 x 784) · (784 x hidden_size) -> (1 x hidden_size)
        layer_1 = relu(np.dot(layer_0, weights_0_1))

        # 3) layer_2 is the final output (1 x 10) after the dot product
        #    dot product: (1 x hidden_size) · (hidden_size x 10) -> (1 x 10)
        layer_2 = np.dot(layer_1, weights_1_2)

        # 4) Accumulate error: sum of squared differences between
        #    the predicted output (layer_2) and the one-hot test label
        error += np.sum((test_labels[i:i+1] - layer_2) ** 2)

        # 5) Check if we predicted the correct digit.
        #    np.argmax(...) finds which index in the vector has the largest value.
        #    If the predicted index matches the true label's index, it's correct.
        correct_cnt += int(np.argmax(layer_2) == np.argmax(test_labels[i:i+1]))

    # Print the test error (averaged by number of test samples) and accuracy
    sys.stdout.write(
        " Test-Err:" + str(error / float(len(test_images)))[:5] +  # truncate to 5 characters
        " Test-Acc:" + str(correct_cnt / float(len(test_images))) + "\n"
    )
    print()  # print a blank line


 Test-Err:0.653 Test-Acc:0.7073



In [28]:
import sys, numpy as np
from keras.datasets import mnist

(x_train, y_train), (x_test, y_test) = mnist.load_data()

images, labels = (x_train[0:1000].reshape(1000,28*28) / 255, y_train[0:1000])

one_hot_labels = np.zeros((len(labels),10))
for i,l in enumerate(labels):
    one_hot_labels[i][l] = 1
labels = one_hot_labels

test_images = x_test.reshape(len(x_test),28*28) / 255
test_labels = np.zeros((len(y_test),10))
for i,l in enumerate(y_test):
    test_labels[i][l] = 1

np.random.seed(1)
relu = lambda x:(x>=0) * x # returns x if x > 0, return 0 otherwise
relu2deriv = lambda x: x>=0 # returns 1 for input > 0, return 0 otherwise
alpha, iterations, hidden_size, pixels_per_image, num_labels = (0.005, 350, 40, 784, 10)

weights_0_1 = 0.2*np.random.random((pixels_per_image,hidden_size)) - 0.1
weights_1_2 = 0.2*np.random.random((hidden_size,num_labels)) - 0.1

for j in range(iterations):
    error, correct_cnt = (0.0, 0)
    
    for i in range(len(images)):
        layer_0 = images[i:i+1]
        layer_1 = relu(np.dot(layer_0,weights_0_1))
        layer_2 = np.dot(layer_1,weights_1_2)

        error += np.sum((labels[i:i+1] - layer_2) ** 2)
        correct_cnt += int(np.argmax(layer_2) == \
                                        np.argmax(labels[i:i+1]))

        layer_2_delta = (labels[i:i+1] - layer_2)
        layer_1_delta = layer_2_delta.dot(weights_1_2.T)\
                                    * relu2deriv(layer_1)
        weights_1_2 += alpha * layer_1.T.dot(layer_2_delta)
        weights_0_1 += alpha * layer_0.T.dot(layer_1_delta)

    sys.stdout.write("\r I:"+str(j)+ \
                     " Train-Err:" + str(error/float(len(images)))[0:5] +\
                     " Train-Acc:" + str(correct_cnt/float(len(images))))
    
    if(j % 10 == 0 or j == iterations-1):
        error, correct_cnt = (0.0, 0)

        for i in range(len(test_images)):

            layer_0 = test_images[i:i+1]
            layer_1 = relu(np.dot(layer_0,weights_0_1))
            layer_2 = np.dot(layer_1,weights_1_2)

            error += np.sum((test_labels[i:i+1] - layer_2) ** 2)
            correct_cnt += int(np.argmax(layer_2) == \
                                            np.argmax(test_labels[i:i+1]))
        sys.stdout.write(" Test-Err:" + str(error/float(len(test_images)))[0:5] +\
                         " Test-Acc:" + str(correct_cnt/float(len(test_images))))
        print()

 I:0 Train-Err:0.722 Train-Acc:0.537 Test-Err:0.601 Test-Acc:0.6488
 I:10 Train-Err:0.312 Train-Acc:0.901 Test-Err:0.420 Test-Acc:0.8114
 I:20 Train-Err:0.260 Train-Acc:0.93 Test-Err:0.414 Test-Acc:0.8111
 I:30 Train-Err:0.232 Train-Acc:0.946 Test-Err:0.417 Test-Acc:0.8066
 I:40 Train-Err:0.215 Train-Acc:0.956 Test-Err:0.426 Test-Acc:0.8019
 I:50 Train-Err:0.204 Train-Acc:0.966 Test-Err:0.437 Test-Acc:0.7982
 I:60 Train-Err:0.194 Train-Acc:0.967 Test-Err:0.448 Test-Acc:0.7921
 I:70 Train-Err:0.186 Train-Acc:0.975 Test-Err:0.458 Test-Acc:0.7864
 I:80 Train-Err:0.179 Train-Acc:0.979 Test-Err:0.466 Test-Acc:0.7817
 I:90 Train-Err:0.172 Train-Acc:0.981 Test-Err:0.474 Test-Acc:0.7758
 I:100 Train-Err:0.166 Train-Acc:0.984 Test-Err:0.482 Test-Acc:0.7706
 I:110 Train-Err:0.161 Train-Acc:0.984 Test-Err:0.489 Test-Acc:0.7686
 I:120 Train-Err:0.157 Train-Acc:0.986 Test-Err:0.496 Test-Acc:0.766
 I:130 Train-Err:0.153 Train-Acc:0.99 Test-Err:0.502 Test-Acc:0.7622
 I:140 Train-Err:0.149 Train-Acc:0

# Dropout In Code

In [7]:
i = 0
# We'll examine just the first training example (index = 0).

layer_0 = images[i:i+1]
# layer_0 is the input vector (1 x number_of_pixels) for the first image.
# Slicing with i:i+1 ensures it remains a 2D array of shape (1, 784).

dropout_mask = np.random.randint(2, size=layer_1.shape)
# Generate a dropout mask for layer_1: random 0s or 1s, same shape as layer_1.
# Each neuron has a 50% chance of being “dropped” (set to zero).

layer_1 *= dropout_mask * 2
# Apply dropout to layer_1 by multiplying elementwise with the dropout mask.
# Multiplying by 2 compensates for the 50% of neurons dropped
# (thus maintaining a similar total activation on average).

layer_2 = np.dot(layer_1, weights_1_2)
# Calculate the output layer (layer_2) by matrix-multiplying
# the dropout-adjusted layer_1 with weights_1_2.

error += np.sum((labels[i:i+1] - layer_2) ** 2)
# Accumulate the squared error for this training example.

correct_cnt += int(np.argmax(layer_2) == np.argmax(labels[i:i+1]))
# Check if the predicted class (index of max in layer_2) matches
# the true class (index of max in labels[i:i+1]). If yes, increment correct_cnt.

layer_2_delta = (labels[i:i+1] - layer_2)
# Compute the output layer's delta (gradient):
# derivative of squared error wrt. layer_2 is (target - output).

layer_1_delta = layer_2_delta.dot(weights_1_2.T) * relu2deriv(layer_1)
# Backpropagate from the output layer to the hidden layer:
# multiply output delta by the transpose of weights_1_2,
# then apply the derivative of ReLU (relu2deriv) to layer_1.

layer_1_delta *= dropout_mask
# Zero out the deltas for any neurons that were dropped. 
# They get no updates since they were “off” in this forward pass.

weights_1_2 += alpha * layer_1.T.dot(layer_2_delta)
# Update weights from hidden layer to output layer.
# The adjustment is proportional to layer_1^T ⋅ layer_2_delta scaled by alpha.

weights_0_1 += alpha * layer_0.T.dot(layer_1_delta)
# Update weights from input layer to hidden layer.
# The adjustment is proportional to layer_0^T ⋅ layer_1_delta scaled by alpha.


In [8]:
import numpy, sys

# Fix the random seed so results are repeatable
np.random.seed(1)

def relu(x):
    # ReLU activation function:
    # returns x if x >= 0, otherwise 0
    return (x >= 0) * x

def relu2deriv(output):
    # Derivative of ReLU:
    # returns 1 if output > 0, else 0
    return output >= 0

# Hyperparameters
alpha          = 0.005   # learning rate
iterations     = 300     # how many epochs we'll train
hidden_size    = 100     # number of neurons in hidden layer
pixels_per_image = 784   # each MNIST image is 28 x 28
num_labels     = 10      # digits from 0..9

# Initialize weights
# weights_0_1: from input layer (784) to hidden layer (100)
# weights_1_2: from hidden layer (100) to output layer (10)
weights_0_1 = 0.2*np.random.random((pixels_per_image, hidden_size)) - 0.1
weights_1_2 = 0.2*np.random.random((hidden_size, num_labels))       - 0.1

for j in range(iterations):
    # Track training error and correct classification count this epoch
    error, correct_cnt = (0.0, 0)

    # Loop over each training example
    for i in range(len(images)):
        # 1) Forward pass
        layer_0 = images[i:i+1]                        # shape (1, 784)
        layer_1 = relu(np.dot(layer_0, weights_0_1))    # shape (1, 100)

        # 2) Create dropout mask for layer_1
        dropout_mask = np.random.randint(2, size=layer_1.shape)
        # Multiply layer_1 by the mask, then scale by 2 so
        # the "active" neurons are effectively doubled
        layer_1 *= dropout_mask * 2

        # 3) Continue forward pass from hidden to output
        layer_2 = np.dot(layer_1, weights_1_2)          # shape (1, 10)

        # 4) Accumulate training error (sum of squared errors)
        error += np.sum((labels[i:i+1] - layer_2) ** 2)

        # 5) Check classification correctness
        correct_cnt += int(np.argmax(layer_2) == np.argmax(labels[i:i+1]))

        # 6) Backpropagation
        #    a) Output layer delta: derivative of MSE w.r.t. layer_2
        layer_2_delta = (labels[i:i+1] - layer_2)

        #    b) Hidden layer delta: push error back, then apply ReLU derivative
        layer_1_delta = layer_2_delta.dot(weights_1_2.T) * relu2deriv(layer_1)

        #    c) Apply the same dropout mask on the hidden layer deltas
        layer_1_delta *= dropout_mask

        # 7) Weight updates: gradient descent
        #    a) Hidden→Output
        weights_1_2 += alpha * layer_1.T.dot(layer_2_delta)
        #    b) Input→Hidden
        weights_0_1 += alpha * layer_0.T.dot(layer_1_delta)

    # Every 10 epochs, evaluate on test set
    if (j % 10 == 0):
        test_error      = 0.0
        test_correct_cnt = 0

        # Perform forward pass on each test sample (without dropout)
        for i in range(len(test_images)):
            layer_0 = test_images[i:i+1]
            layer_1 = relu(np.dot(layer_0, weights_0_1))
            layer_2 = np.dot(layer_1, weights_1_2)

            test_error += np.sum((test_labels[i:i+1] - layer_2) ** 2)
            test_correct_cnt += int(np.argmax(layer_2) == np.argmax(test_labels[i:i+1]))

        # Print performance metrics
        sys.stdout.write(
            "\n" + \
            "I:" + str(j) + \
            " Test-Err:" + str(test_error / float(len(test_images)))[:5] + \
            " Test-Acc:" + str(test_correct_cnt / float(len(test_images))) + \
            " Train-Err:" + str(error / float(len(images)))[:5] + \
            " Train-Acc:" + str(correct_cnt / float(len(images)))
        )



I:0 Test-Err:0.641 Test-Acc:0.6333 Train-Err:0.891 Train-Acc:0.413
I:10 Test-Err:0.458 Test-Acc:0.787 Train-Err:0.472 Train-Acc:0.764
I:20 Test-Err:0.415 Test-Acc:0.8133 Train-Err:0.430 Train-Acc:0.809
I:30 Test-Err:0.421 Test-Acc:0.8114 Train-Err:0.415 Train-Acc:0.811
I:40 Test-Err:0.419 Test-Acc:0.8112 Train-Err:0.413 Train-Acc:0.827
I:50 Test-Err:0.409 Test-Acc:0.8133 Train-Err:0.392 Train-Acc:0.836
I:60 Test-Err:0.412 Test-Acc:0.8236 Train-Err:0.402 Train-Acc:0.836
I:70 Test-Err:0.412 Test-Acc:0.8033 Train-Err:0.383 Train-Acc:0.857
I:80 Test-Err:0.410 Test-Acc:0.8054 Train-Err:0.386 Train-Acc:0.854
I:90 Test-Err:0.411 Test-Acc:0.8144 Train-Err:0.376 Train-Acc:0.868
I:100 Test-Err:0.411 Test-Acc:0.7903 Train-Err:0.369 Train-Acc:0.864
I:110 Test-Err:0.411 Test-Acc:0.8003 Train-Err:0.371 Train-Acc:0.868
I:120 Test-Err:0.402 Test-Acc:0.8046 Train-Err:0.353 Train-Acc:0.857
I:130 Test-Err:0.408 Test-Acc:0.8091 Train-Err:0.352 Train-Acc:0.867
I:140 Test-Err:0.405 Test-Acc:0.8083 Train-Er

# Batch Gradient Descent

In [9]:
import numpy as np
import sys

np.random.seed(1)  # Fix random seed for reproducible results

def relu(x):
    # ReLU activation function:
    # returns x if x >= 0, else 0 (elementwise)
    return (x >= 0) * x

def relu2deriv(output):
    # Derivative of ReLU:
    # returns 1 if output > 0, else 0 (elementwise)
    return output >= 0

batch_size = 100       # Number of samples to process in each mini-batch
alpha = 0.001          # Learning rate
iterations = 300       # Number of epochs (full passes through the dataset)
pixels_per_image = 784 # Each MNIST image is 28 x 28
num_labels = 10        # Digits 0..9 (10 possible classes)
hidden_size = 100      # Number of neurons in the hidden layer

# Initialize weights with small random values in the range [-0.1, 0.1]
weights_0_1 = 0.2 * np.random.random((pixels_per_image, hidden_size)) - 0.1
weights_1_2 = 0.2 * np.random.random((hidden_size, num_labels)) - 0.1

for j in range(iterations):
    # Track training error and number of correct predictions this epoch
    error, correct_cnt = 0.0, 0

    # Loop over the training set in mini-batches
    # int(len(images) / batch_size) is how many mini-batches we can form
    for i in range(int(len(images) / batch_size)):

        # Identify the slice of the dataset for this batch
        batch_start = i * batch_size
        batch_end   = (i + 1) * batch_size

        # 1) Forward pass (Input --> Hidden)
        # layer_0 has shape (batch_size, 784)
        layer_0 = images[batch_start:batch_end]

        # layer_1 has shape (batch_size, 100)
        # We apply ReLU to the matrix multiplication of layer_0 and weights_0_1
        layer_1 = relu(np.dot(layer_0, weights_0_1))

        # Apply dropout in the hidden layer
        # dropout_mask is a matrix of 0s/1s of the same shape as layer_1
        dropout_mask = np.random.randint(2, size=layer_1.shape)

        # Multiply layer_1 by the mask and scale by 2 (compensates 50% dropout)
        layer_1 *= dropout_mask * 2

        # 2) Forward pass (Hidden --> Output)
        # layer_2 has shape (batch_size, 10)
        layer_2 = np.dot(layer_1, weights_1_2)

        # 3) Measure error over the entire batch
        # Sum of squared errors between predictions and true labels
        error += np.sum((labels[batch_start:batch_end] - layer_2) ** 2)

        # 4) Count how many samples in the batch are predicted correctly
        for k in range(batch_size):
            # Compare argmax of the output vs. the label for each sample
            correct_cnt += int(
                np.argmax(layer_2[k:k+1]) == np.argmax(labels[batch_start + k : batch_start + k + 1])
            )

        # 5) Backpropagation (calculate deltas for the entire batch)
        # layer_2_delta is the gradient at the output layer
        # We divide by batch_size to get average gradient for the batch
        layer_2_delta = (labels[batch_start:batch_end] - layer_2) / batch_size

        # layer_1_delta is the gradient at the hidden layer
        # We multiply by the ReLU derivative and also push error back
        layer_1_delta = layer_2_delta.dot(weights_1_2.T) * relu2deriv(layer_1)

        # Apply the same dropout mask to the hidden layer deltas
        layer_1_delta *= dropout_mask

        # 6) Weight updates:
        #    a) Hidden -> Output
        weights_1_2 += alpha * layer_1.T.dot(layer_2_delta)

        #    b) Input -> Hidden
        weights_0_1 += alpha * layer_0.T.dot(layer_1_delta)

    # Every 10 epochs, evaluate on the test set (without dropout)
    if (j % 10 == 0):
        test_error = 0.0
        test_correct_cnt = 0

        # Loop over each example in the test set
        for i in range(len(test_images)):
            # Forward pass only (no dropout in testing)
            layer_0 = test_images[i:i+1]
            layer_1 = relu(np.dot(layer_0, weights_0_1))
            layer_2 = np.dot(layer_1, weights_1_2)

            # Accumulate test error (sum of squared differences)
            test_error += np.sum((test_labels[i:i+1] - layer_2) ** 2)

            # Increment correct predictions if argmax matches
            test_correct_cnt += int(
                np.argmax(layer_2) == np.argmax(test_labels[i:i+1])
            )

        # Print progress: epoch index, test/train error, test/train accuracy
        sys.stdout.write(
            "\n"
            + "I:" + str(j)
            + " Test-Err:" + str(test_error / float(len(test_images)))[:5]
            + " Test-Acc:" + str(test_correct_cnt / float(len(test_images)))
            + " Train-Err:" + str(error / float(len(images)))[:5]
            + " Train-Acc:" + str(correct_cnt / float(len(images)))
        )



I:0 Test-Err:1.513 Test-Acc:0.0667 Train-Err:2.080 Train-Acc:0.084
I:10 Test-Err:1.107 Test-Acc:0.118 Train-Err:1.534 Train-Acc:0.097
I:20 Test-Err:1.020 Test-Acc:0.1621 Train-Err:1.347 Train-Acc:0.147
I:30 Test-Err:0.974 Test-Acc:0.1973 Train-Err:1.268 Train-Acc:0.147
I:40 Test-Err:0.938 Test-Acc:0.2319 Train-Err:1.225 Train-Acc:0.141
I:50 Test-Err:0.909 Test-Acc:0.2609 Train-Err:1.127 Train-Acc:0.19
I:60 Test-Err:0.885 Test-Acc:0.2943 Train-Err:1.102 Train-Acc:0.213
I:70 Test-Err:0.863 Test-Acc:0.3233 Train-Err:1.062 Train-Acc:0.221
I:80 Test-Err:0.845 Test-Acc:0.3509 Train-Err:1.028 Train-Acc:0.253
I:90 Test-Err:0.828 Test-Acc:0.3755 Train-Err:0.992 Train-Acc:0.271
I:100 Test-Err:0.814 Test-Acc:0.3953 Train-Err:0.975 Train-Acc:0.28
I:110 Test-Err:0.801 Test-Acc:0.4166 Train-Err:0.937 Train-Acc:0.312
I:120 Test-Err:0.789 Test-Acc:0.4347 Train-Err:0.923 Train-Acc:0.316
I:130 Test-Err:0.779 Test-Acc:0.451 Train-Err:0.910 Train-Acc:0.33
I:140 Test-Err:0.769 Test-Acc:0.4686 Train-Err:0.