# Upgrading our MNIST Network

![](./images/img_1.png)

![](./images/img.png)

In [2]:
import numpy as np, sys
np.random.seed(1)  # Fix the random seed so that results are reproducible

from keras.datasets import mnist

# -------------------------
# 1) Load and Preprocess Data
# -------------------------
(x_train, y_train), (x_test, y_test) = mnist.load_data()
# x_train, x_test: shape (N, 28, 28) containing pixel data
# y_train, y_test: integer labels [0..9]

# We'll only use the first 1000 training images for demonstration
images = x_train[0:1000].reshape(1000, 28*28) / 255
# Flatten each 28 x 28 image into a single 784-dimensional row,
# and scale pixels from [0..255] to [0..1]

labels = y_train[0:1000]
# These are the integer labels for the first 1000 training examples

# Convert integer labels to one-hot vectors
one_hot_labels = np.zeros((len(labels), 10))
for i, l in enumerate(labels):
    one_hot_labels[i][l] = 1
labels = one_hot_labels
# Now 'labels' is (1000, 10) where each row is a one-hot encoding of the digit

# Preprocess the entire test set similarly
test_images = x_test.reshape(len(x_test), 28*28) / 255
test_labels = np.zeros((len(y_test), 10))
for i, l in enumerate(y_test):
    test_labels[i][l] = 1
# test_labels is now (len(x_test), 10) in one-hot format

# -------------------------
# 2) Define Activation Functions
# -------------------------
def tanh(x):
    # Hyperbolic tangent activation:
    # output in range (-1, 1)
    return np.tanh(x)

def tanh2deriv(output):
    # Derivative of tanh:
    # If y = tanh(x), derivative = 1 - y^2
    return 1 - (output ** 2)

def softmax(x):
    # Softmax function:
    # exponentiate each value and divide by the sum of exponents
    temp = np.exp(x)
    return temp / np.sum(temp, axis=1, keepdims=True)

# -------------------------
# 3) Hyperparameters
# -------------------------
alpha = 2            # learning rate
iterations = 300     # total epochs to train
hidden_size = 100    # number of neurons in the hidden layer
pixels_per_image = 784  # 28 x 28
num_labels = 10         # digits from 0..9
batch_size = 100        # mini-batch size

# -------------------------
# 4) Weight Initialization
# -------------------------
# weights_0_1: shape (784, 100) connects input -> hidden
# weights_1_2: shape (100, 10)  connects hidden -> output
weights_0_1 = 0.02 * np.random.random((pixels_per_image, hidden_size)) - 0.01
weights_1_2 = 0.2  * np.random.random((hidden_size, num_labels))      - 0.1

# -------------------------
# 5) Training Loop
# -------------------------
for j in range(iterations):
    correct_cnt = 0  # track how many training samples in each epoch are classified correctly

    # Process the training data in batches
    for i in range(int(len(images) / batch_size)):
        # Identify the start and end indices of this batch
        batch_start, batch_end = ((i * batch_size), ((i+1) * batch_size))

        # 1) Forward Pass
        layer_0 = images[batch_start:batch_end]         # shape (batch_size, 784)
        layer_1 = tanh(np.dot(layer_0, weights_0_1))    # shape (batch_size, 100)
        
        # Apply dropout in the hidden layer:
        dropout_mask = np.random.randint(2, size=layer_1.shape)
        # Multiply by 2 to compensate for ~50% dropout
        layer_1 *= dropout_mask * 2

        # Output layer uses softmax
        layer_2 = softmax(np.dot(layer_1, weights_1_2)) # shape (batch_size, 10)

        # 2) Accuracy: compare predicted and true labels
        for k in range(batch_size):
            # For each example in this batch, check if predicted digit matches actual
            correct_cnt += int(
                np.argmax(layer_2[k:k+1]) == np.argmax(labels[batch_start + k : batch_start + k + 1])
            )

        # 3) Backpropagation
        # layer_2_delta: derivative of loss wrt. layer_2
        # (labels - predictions) / (batch_size * layer_2.shape[0]) to scale for batch
        layer_2_delta = (labels[batch_start:batch_end] - layer_2) / (batch_size * layer_2.shape[0])

        # layer_1_delta: push errors back through weights_1_2,
        # then multiply by derivative of tanh
        layer_1_delta = layer_2_delta.dot(weights_1_2.T) * tanh2deriv(layer_1)

        # Zero out deltas for dropped neurons so they don't update weights
        layer_1_delta *= dropout_mask

        # 4) Weight Updates
        weights_1_2 += alpha * layer_1.T.dot(layer_2_delta)
        weights_0_1 += alpha * layer_0.T.dot(layer_1_delta)

    # -------------------------
    # 6) Test/Evaluation Loop
    # -------------------------
    test_correct_cnt = 0
    for i in range(len(test_images)):
        # No dropout for test images
        layer_0 = test_images[i:i+1]         # shape (1, 784)
        layer_1 = tanh(np.dot(layer_0, weights_0_1))  # shape (1, 100)
        layer_2 = np.dot(layer_1, weights_1_2)        # shape (1, 10)

        # Increment test_correct_cnt if predicted digit matches the label
        test_correct_cnt += int(np.argmax(layer_2) == np.argmax(test_labels[i:i+1]))

    # Print status every 10 epochs
    if (j % 10) == 0:
        sys.stdout.write(
            "\n"
            + "I:" + str(j)
            + " Test-Acc:" + str(test_correct_cnt / float(len(test_images)))
            + " Train-Acc:" + str(correct_cnt / float(len(images)))
        )



I:0 Test-Acc:0.394 Train-Acc:0.156
I:10 Test-Acc:0.6867 Train-Acc:0.723
I:20 Test-Acc:0.7025 Train-Acc:0.732
I:30 Test-Acc:0.734 Train-Acc:0.763
I:40 Test-Acc:0.7663 Train-Acc:0.794
I:50 Test-Acc:0.7913 Train-Acc:0.819
I:60 Test-Acc:0.8102 Train-Acc:0.849
I:70 Test-Acc:0.8228 Train-Acc:0.864
I:80 Test-Acc:0.831 Train-Acc:0.867
I:90 Test-Acc:0.8364 Train-Acc:0.885
I:100 Test-Acc:0.8407 Train-Acc:0.883
I:110 Test-Acc:0.845 Train-Acc:0.891
I:120 Test-Acc:0.8481 Train-Acc:0.901
I:130 Test-Acc:0.8505 Train-Acc:0.901
I:140 Test-Acc:0.8526 Train-Acc:0.905
I:150 Test-Acc:0.8555 Train-Acc:0.914
I:160 Test-Acc:0.8577 Train-Acc:0.925
I:170 Test-Acc:0.8596 Train-Acc:0.918
I:180 Test-Acc:0.8619 Train-Acc:0.933
I:190 Test-Acc:0.863 Train-Acc:0.933
I:200 Test-Acc:0.8642 Train-Acc:0.926
I:210 Test-Acc:0.8653 Train-Acc:0.931
I:220 Test-Acc:0.8668 Train-Acc:0.93
I:230 Test-Acc:0.8672 Train-Acc:0.937
I:240 Test-Acc:0.8681 Train-Acc:0.938
I:250 Test-Acc:0.8687 Train-Acc:0.937
I:260 Test-Acc:0.8684 Train-