# Upgrading our MNIST Network

![](./images/img_4.png)

![](./images/img_5.png)

![](./images/img_6.png)

![](./images/img_7.png)

In [1]:
import numpy as np, sys
np.random.seed(1)  # Fix the random seed for reproducibility

from keras.datasets import mnist

# --------------------------------------------------------------------
# 1) Load and prepare the MNIST data
# --------------------------------------------------------------------
(x_train, y_train), (x_test, y_test) = mnist.load_data()
# 'x_train' and 'x_test' each are arrays of shape (N, 28, 28),
# containing the pixel data for the images (0..255).
# 'y_train' and 'y_test' each are arrays of shape (N,),
# containing integer labels (0..9) for each image.

# We limit ourselves to the first 1000 training samples to speed up this demo
images = x_train[0:1000].reshape(1000, 28*28) / 255.0
# 1) .reshape(1000, 28*28): Flatten each 28x28 image into 784 pixels per row.
# 2) / 255.0: Scale pixel values to the range [0..1].

labels = y_train[0:1000]
# 'labels' are still integers in [0..9].

# Convert these integer labels to one-hot vectors of length 10.
one_hot_labels = np.zeros((len(labels), 10))
for i, l in enumerate(labels):
    one_hot_labels[i][l] = 1
labels = one_hot_labels
# Now 'labels' is shape (1000, 10). Each row might look like [0,0,0,0,1,0,0,0,0,0] for digit '4'.

# Prepare the entire test set in the same way: flatten, scale, one-hot
test_images = x_test.reshape(len(x_test), 28*28) / 255.0
test_labels = np.zeros((len(y_test), 10))
for i, l in enumerate(y_test):
    test_labels[i][l] = 1

# --------------------------------------------------------------------
# 2) Define Activation Functions
# --------------------------------------------------------------------
def tanh(x):
    # Hyperbolic tangent: maps real numbers to (-1, 1)
    return np.tanh(x)

def tanh2deriv(output):
    # Derivative of tanh: given output = tanh(x),
    # derivative = 1 - (tanh(x))^2 = 1 - output^2
    return 1 - (output ** 2)

def softmax(x):
    # Softmax turns a vector of numbers into a probability distribution.
    # 1) exponentiate each number
    # 2) divide by sum of all exponents
    temp = np.exp(x)
    return temp / np.sum(temp, axis=1, keepdims=True)

# --------------------------------------------------------------------
# 3) Hyperparameters and Dimensions
# --------------------------------------------------------------------
alpha = 2          # learning rate
iterations = 300   # total number of epochs (full passes over the data)
pixels_per_image = 784  # each image is 28x28
num_labels = 10         # digits 0..9
batch_size = 128        # mini-batch size

# We'll do a "manual convolution" with 3x3 kernels on 28x28 images
input_rows = 28
input_cols = 28
kernel_rows = 3
kernel_cols = 3
num_kernels = 16
# We have 16 different kernels, each of size 3x3.
# This means each kernel has 9 weights.

# "hidden_size" is how large the hidden layer is after we convolve
#   - For each image, we can slide a 3x3 patch across 26 steps horizontally
#     and 26 steps vertically (since 28-3=25 +1 => 26 possible positions).
#   - That means 26 * 26 = 676 patches per image.
#   - Each patch is processed by 16 kernels => total 676*16 = 10816 "neuron inputs."
hidden_size = ((input_rows - kernel_rows) *
               (input_cols - kernel_cols)) * num_kernels

# --------------------------------------------------------------------
# 4) Initialize Weights (Kernels and Final Layer)
# --------------------------------------------------------------------
# 'kernels': shape (9, 16) => 9 weights per kernel, 16 kernels
kernels = 0.02 * np.random.random((kernel_rows * kernel_cols, num_kernels)) - 0.01

# 'weights_1_2': shape (hidden_size, 10)
# connects the flatten "convolution output" to the 10 output classes
weights_1_2 = 0.2 * np.random.random((hidden_size, num_labels)) - 0.1

# --------------------------------------------------------------------
# 5) Helper Function to Extract a Section of an Image
# --------------------------------------------------------------------
def get_image_section(layer, row_from, row_to, col_from, col_to):
    """
    Given 'layer' of shape (batch_size, 28, 28),
    extract a slice from row_from..row_to, col_from..col_to for each image,
    returning shape (batch_size, 1, (row_to-row_from), (col_to-col_from)).
    This is used to gather 3x3 patches from each image.
    """
    section = layer[:, row_from:row_to, col_from:col_to]
    return section.reshape(-1, 1, (row_to - row_from), (col_to - col_from))

# --------------------------------------------------------------------
# 6) Main Training Loop
# --------------------------------------------------------------------
for j in range(iterations):  # Repeat for 'iterations' epochs
    correct_cnt = 0  # Track how many training samples we classify correctly this epoch

    # We'll split the data into mini-batches
    # The number of mini-batches = len(images)/batch_size
    for i in range(int(len(images) / batch_size)):

        # Identify the start and end indices of this batch
        batch_start = i * batch_size
        batch_end   = (i + 1) * batch_size

        # 1) Reshape from (batch_size, 784) to (batch_size, 28, 28)
        layer_0 = images[batch_start:batch_end]
        layer_0 = layer_0.reshape(layer_0.shape[0], 28, 28)

        # 2) For each image, extract all 3x3 "patches"
        # We'll store them in 'sects'
        sects = []
        for row_start in range(layer_0.shape[1] - kernel_rows):
            for col_start in range(layer_0.shape[2] - kernel_cols):
                # get_image_section returns shape (batch_size, 1, 3, 3)
                sect = get_image_section(layer_0,
                                         row_start,
                                         row_start + kernel_rows,
                                         col_start,
                                         col_start + kernel_cols)
                sects.append(sect)

        # 3) Concatenate these patches along the second dimension
        # so we get shape (batch_size, number_of_patches, 3, 3)
        # number_of_patches = 26*26=676 if 28x28 -> 3x3
        expanded_input = np.concatenate(sects, axis=1)
        es = expanded_input.shape
        # expanded_input: (batch_size, 676, 3, 3)

        # 4) Flatten each 3x3 patch into a length-9 vector
        #    So final shape is (batch_size * 676, 9)
        flattened_input = expanded_input.reshape(es[0]*es[1], -1)

        # 5) Multiply these 9-element patches by our kernel matrix (shape (9,16))
        #    => kernel_output: (batch_size*676, 16)
        kernel_output = flattened_input.dot(kernels)

        # 6) Activation (tanh) + Dropout
        #    Reshape from (batch_size*676, 16) to (batch_size, 676*16)
        layer_1 = tanh(kernel_output.reshape(es[0], -1))
        
        # Apply dropout: random 0 or 1, then multiply by 2 to keep same scale
        dropout_mask = np.random.randint(2, size=layer_1.shape)
        layer_1 *= dropout_mask * 2

        # 7) Final layer: multiply by weights_1_2 and apply softmax
        layer_2 = softmax(np.dot(layer_1, weights_1_2))
        # layer_2 now has shape (batch_size, 10), representing class probabilities

        # 8) Calculate training accuracy: how many predictions are correct
        for k in range(batch_size):
            # 'labelset' is the true one-hot label for this sample
            labelset = labels[batch_start + k : batch_start + k + 1]
            # Compare argmax
            _inc = int(np.argmax(layer_2[k : k+1]) == np.argmax(labelset))
            correct_cnt += _inc

        # 9) Backpropagation
        #    a) Output layer delta:
        #       'layer_2_delta' = (true_label - prediction)
        #       We divide by (batch_size * layer_2.shape[0]) to average the gradient
        layer_2_delta = (labels[batch_start:batch_end] - layer_2) / (batch_size * layer_2.shape[0])

        #    b) Hidden layer delta:
        #       push the error back through 'weights_1_2',
        #       then multiply by derivative of tanh
        layer_1_delta = layer_2_delta.dot(weights_1_2.T) * tanh2deriv(layer_1)

        #    c) Zero out deltas for neurons that were dropped (0) in dropout
        layer_1_delta *= dropout_mask

        # 10) Weight Updates
        #     a) hidden -> output
        weights_1_2 += alpha * layer_1.T.dot(layer_2_delta)

        #     b) kernels
        #        reshape 'layer_1_delta' back to (batch_size*676, 16),
        #        same shape as kernel_output,
        #        so we can multiply with 'flattened_input' to get gradient wrt. kernels
        l1d_reshape = layer_1_delta.reshape(kernel_output.shape)
        k_update = flattened_input.T.dot(l1d_reshape)
        # We do 'kernels -= alpha * k_update' (a gradient descent step)
        kernels -= alpha * k_update

    # ----------------------------------------------------------------
    # After each epoch, test on the entire test set (no dropout here)
    # ----------------------------------------------------------------
    test_correct_cnt = 0
    for i in range(len(test_images)):
        layer_0 = test_images[i : i+1]
        # reshape from (1, 784) to (1, 28, 28)
        layer_0 = layer_0.reshape(layer_0.shape[0], 28, 28)
        
        # Extract 3x3 patches from this single image
        sects = []
        for row_start in range(layer_0.shape[1] - kernel_rows):
            for col_start in range(layer_0.shape[2] - kernel_cols):
                sect = get_image_section(layer_0,
                                         row_start,
                                         row_start + kernel_rows,
                                         col_start,
                                         col_start + kernel_cols)
                sects.append(sect)

        expanded_input = np.concatenate(sects, axis=1)
        es = expanded_input.shape

        # Flatten from (1, #patches, 3, 3) to (#patches, 9)
        flattened_input = expanded_input.reshape(es[0]*es[1], -1)

        # Dot with 'kernels' => shape (#patches, 16)
        kernel_output = flattened_input.dot(kernels)
        # Then reshape to (1, #patches*16)
        layer_1 = tanh(kernel_output.reshape(es[0], -1))

        # final layer (no dropout in testing)
        layer_2 = np.dot(layer_1, weights_1_2)

        # Check if predicted class == true label
        test_correct_cnt += int(np.argmax(layer_2) == np.argmax(test_labels[i : i+1]))

    # Print results every epoch
    sys.stdout.write(
        "\n"
        + "I:" + str(j)  # which epoch
        + " Test-Acc:" + str(test_correct_cnt / float(len(test_images)))
        + " Train-Acc:" + str(correct_cnt / float(len(images)))
    )


2025-03-21 10:43:27.641652: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-21 10:43:27.655793: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-21 10:43:27.750387: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-21 10:43:27.837043: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742546607.947245  128400 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742546607.97


I:0 Test-Acc:0.0288 Train-Acc:0.055
I:1 Test-Acc:0.0273 Train-Acc:0.037
I:2 Test-Acc:0.028 Train-Acc:0.037
I:3 Test-Acc:0.0292 Train-Acc:0.04
I:4 Test-Acc:0.0339 Train-Acc:0.046
I:5 Test-Acc:0.0478 Train-Acc:0.068
I:6 Test-Acc:0.076 Train-Acc:0.083
I:7 Test-Acc:0.1316 Train-Acc:0.096
I:8 Test-Acc:0.2137 Train-Acc:0.127
I:9 Test-Acc:0.2941 Train-Acc:0.148
I:10 Test-Acc:0.3563 Train-Acc:0.181
I:11 Test-Acc:0.4023 Train-Acc:0.209
I:12 Test-Acc:0.4358 Train-Acc:0.238
I:13 Test-Acc:0.4473 Train-Acc:0.286
I:14 Test-Acc:0.4389 Train-Acc:0.274
I:15 Test-Acc:0.3951 Train-Acc:0.257
I:16 Test-Acc:0.2222 Train-Acc:0.243
I:17 Test-Acc:0.0613 Train-Acc:0.112
I:18 Test-Acc:0.0266 Train-Acc:0.035
I:19 Test-Acc:0.0127 Train-Acc:0.026
I:20 Test-Acc:0.0133 Train-Acc:0.022
I:21 Test-Acc:0.0185 Train-Acc:0.038
I:22 Test-Acc:0.0363 Train-Acc:0.038
I:23 Test-Acc:0.0928 Train-Acc:0.067
I:24 Test-Acc:0.1994 Train-Acc:0.081
I:25 Test-Acc:0.3086 Train-Acc:0.154
I:26 Test-Acc:0.4276 Train-Acc:0.204
I:27 Test-Acc

KeyboardInterrupt: 