In [1]:
import numpy as np

Utility functions

In [2]:

def relu(x):
    return np.maximum(0, x)

def relu_derivative(x):
    return (x > 0).astype(float)

def softmax(z):
    # Subtract max for numerical stability
    exps = np.exp(z - np.max(z, axis=1, keepdims=True))
    return exps / np.sum(exps, axis=1, keepdims=True)

def to_one_hot(y, num_classes):
    """Converts integer labels (0, 1, 2) to one-hot ([[1,0,0], ...])"""
    return np.eye(num_classes)[y]

ANN tailored for our CNN, backprop should return the gradients for input of flatten layer, same logic as in ann

In [3]:
class NeuralNet():
    def __init__(self, layers):
        self.layers = layers
        self.input_len = self.layers[0]
        
        # Initialize weights
        self.weights = [np.random.randn(self.layers[i], self.layers[i+1]) * 0.1 
                        for i in range(len(self.layers)-1)] 
        self.biases = [np.zeros((1, self.layers[i+1])) 
                       for i in range(len(self.layers)-1)]

    def forward(self, x):
        self.A_cache = [x]
        self.Z_cache = []
        A = x
        for i in range(len(self.weights)):
            Z = np.dot(A, self.weights[i]) + self.biases[i]
            self.Z_cache.append(Z)
            
            # --- Activation Logic ---
            if i < len(self.weights) - 1:
                # Hidden Layers: ReLU
                A = relu(Z)
            else:
                # Output Layer: Softmax (For Classification)
                A = softmax(Z)
                
            self.A_cache.append(A)
        return A
    
    def backward(self, x, y_true, lr):
        """
        y_true: Must be One-Hot Encoded shape (Batch, Num_Classes)
        """
        y_pred = self.forward(x)
        L = len(self.weights)
        m = y_true.shape[0]
        dW = [None] * L
        dB = [None] * L
        dZ = [None] * L

        # --- Output Layer Gradients ---
        # Derivative of Cross-Entropy with respect to Softmax input Z is simply: (Pred - Truth)
        dZ[L-1] = y_pred - y_true
        
        dW[L-1] = np.matmul(self.A_cache[L-1].T, dZ[L-1]) / m
        dB[L-1] = np.sum(dZ[L-1], axis=0, keepdims=True) / m

        # --- Hidden Layers Gradients ---
        for l in range(L-2, -1, -1):
            dZ[l] = np.matmul(dZ[l+1], self.weights[l+1].T) * relu_derivative(self.Z_cache[l])
            dW[l] = np.matmul(self.A_cache[l].T, dZ[l]) / m
            dB[l] = np.sum(dZ[l], axis=0, keepdims=True) / m

        # --- Update Weights ---
        for l in range(L):
            self.weights[l] -= lr * dW[l]
            self.biases[l]  -= lr * dB[l]
            
        # --- Return Gradient to CNN ---
        # dX = dZ[0] dot W[0].T
        d_input = np.matmul(dZ[0], self.weights[0].T)
        return d_input

Main CNN class - One convolution layer then flatten then ANN to do digit classification on MNIST dataset

In [4]:
class CNN():
    def __init__(self, input_shape, num_filters, kernel_size, dense_layers, padding=0, stride=1):
        """
        dense_layers: e.g. [64, 10] (where 10 is number of classes, 64 hidden neurons)
        """
        self.n_c, self.h_in, self.w_in = input_shape  # Channels, Height, Width
        self.n_f = num_filters
        self.k = kernel_size
        self.p = padding
        self.s = stride

        # Compute Output Dimensions of convolution layer
        self.h_out = ((self.h_in - self.k + 2 * self.p) // self.s) + 1
        self.w_out = ((self.w_in - self.k + 2 * self.p) // self.s) + 1
        
        if self.h_out <= 0 or self.w_out <= 0:
            raise ValueError(f"Invalid output size. Check params.")

        # Initialize Convolution Filters
        self.conv_w = np.random.randn(self.n_f, self.n_c, self.k, self.k) * 0.1
        self.conv_b = np.zeros((self.n_f, 1))

        # The flatten layer size
        fc_input_len = self.n_f * self.h_out * self.w_out
        
        # Structure: [Flattened_Size, Hidden..., Num_Classes]
        ann_architecture = [fc_input_len] + dense_layers
        self.ann = NeuralNet(ann_architecture)
        
        self.num_classes = dense_layers[-1] #  for one-hot conversion
        print(f"CNN Initialized. Output Classes: {self.num_classes}")

    def convolve_step(self, image, kernel):
        # helper function to perform a single convolution step

        # extract dimensions
        h_in, w_in = image.shape
        k_size = kernel.shape[0]

        # Apply padding if needed
        if self.p > 0:
            image = np.pad(image, ((self.p, self.p), (self.p, self.p)), mode='constant')

        out = np.zeros((self.h_out, self.w_out))

        # loop thorugh every pizel of output
        for i in range(self.h_out):
            for j in range(self.w_out):

                #check where sliding window starts
                h_start, w_start = i * self.s, j * self.s
                
                # slice out our region to do convolution on
                region = image[h_start : h_start+k_size, w_start : w_start+k_size]

                # element wise multuiply and sum
                out[i, j] = np.sum(region * kernel)
        return out

    def forward(self, X):
        # forward pass through CNN
        m = X.shape[0]

        # cache for backprop
        self.X_cache = X 
        self.Z_conv = np.zeros((m, self.n_f, self.h_out, self.w_out))

        # loop through every image
        for i in range(m):
            # loop through every filter
            for f in range(self.n_f):
                filter_sum = 0
                # loop through every channel
                for c in range(self.n_c):
                    filter_sum += self.convolve_step(X[i, c], self.conv_w[f, c])
                self.Z_conv[i, f] = filter_sum + self.conv_b[f] # add bias

        self.flattened = self.Z_conv.reshape(m, -1) # m to keep batch size and -1 to squish all other into one dimension
        return self.ann.forward(self.flattened)

    def backward(self, Y_true, lr=0.01):
        """
        Y_true: One-Hot Encoded
        """
        # Backprop through ANN (Classification)
        d_flat = self.ann.backward(self.flattened, Y_true, lr)
        
        #  Reshape back or say deflatten
        d_Z_conv = d_flat.reshape(self.X_cache.shape[0], self.n_f, self.h_out, self.w_out)
        
        #  Conv Gradients
        m = self.X_cache.shape[0]

        # grad has same size as weights and biases
        d_conv_w = np.zeros_like(self.conv_w)
        d_conv_b = np.zeros_like(self.conv_b)

        # for each image in batch
        for i in range(m):

            for f in range(self.n_f): # for each filter

                d_conv_b[f] += np.sum(d_Z_conv[i, f]) # bias gradient is sum of all dZs for that filter

                for c in range(self.n_c): # for each channel

                    img_slice = self.X_cache[i, c] # original image slice

                    if self.p > 0: img_slice = np.pad(img_slice, ((self.p,self.p),(self.p,self.p)), 'constant') # reapplies padding if necessary
                    
                    # loop through each weight in filter
                    for h in range(self.k):
                        for w in range(self.k):

                            # define the region of conv operation and extract that slice.. 
                            vert_start = h
                            vert_end = vert_start + self.s * self.h_out
                            horiz_start = w
                            horiz_end = horiz_start + self.s * self.w_out

                            patch = img_slice[vert_start:vert_end:self.s, horiz_start:horiz_end:self.s]

                            # multiply input pixel by error grad , basically convolution of dZ with input image
                            if patch.shape == d_Z_conv[i, f].shape:
                                d_conv_w[f, c, h, w] += np.sum(patch * d_Z_conv[i, f])

        # Update Conv Weights and Biases
        self.conv_w -= lr * (d_conv_w / m)
        self.conv_b -= lr * (d_conv_b / m)


    def train(self, X, Y, lr=0.01, epochs=10):
        """
        Y can be shape (m,) [Integers] or (m, num_classes) [One-Hot]
        """
        # Convert Y to one-hot if it isn't already
        if Y.ndim == 1:
            Y_one_hot = to_one_hot(Y.astype(int), self.num_classes)
        else:
            Y_one_hot = Y
            
        for epoch in range(epochs):
            # Forward
            y_probs = self.forward(X)
            
            # Loss Calculation (Cross Entropy)
            epsilon = 1e-15 # prevent log(0) or negative infinity
            y_probs_clipped = np.clip(y_probs, epsilon, 1 - epsilon) # forces all pprobs to be within [epsilon, 1-epsilon]

            # Cross-Entropy Loss
            loss = -np.mean(np.sum(Y_one_hot * np.log(y_probs_clipped), axis=1))
            
            # Accuracy Calculation
            predictions = np.argmax(y_probs, axis=1)
            true_labels = np.argmax(Y_one_hot, axis=1)
            acc = np.mean(predictions == true_labels)
            
            # Backward
            self.backward(Y_one_hot, lr)
            
            print(f"Epoch {epoch+1}: Loss = {loss:.4f}, Accuracy = {acc*100:.2f}%")

In [5]:
from tensorflow.keras.datasets import mnist

In [6]:
def run_mnist_demo():
    print("Loading MNIST data...")
    (x_train, y_train), (x_test, y_test) = mnist.load_data()

    # --- Preprocessing ---
    #  Normalize (0-255 -> 0-1)
    x_train = x_train.astype('float32') / 255.0
    x_test = x_test.astype('float32') / 255.0

    #  Reshape to (Batch, Channels, Height, Width)
    # Batch:-1 Calculate this dimension automatically based on the total number of items
    # MNIST is grayscale, so it has 1 channel
    x_train = x_train.reshape(-1, 1, 28, 28) 
    x_test = x_test.reshape(-1, 1, 28, 28)

    #  SUBSET DATA
    # We will use only 500 images for training to keep runtime fast. 
    TRAIN_SIZE = 500
    TEST_SIZE = 100
    
    x_train_sub = x_train[:TRAIN_SIZE]
    y_train_sub = y_train[:TRAIN_SIZE]
    x_test_sub = x_test[:TEST_SIZE]
    y_test_sub = y_test[:TEST_SIZE]

    print(f"Training on {TRAIN_SIZE} images. Input shape: {x_train_sub.shape}")

    # --- Configuration ---
    # Input: 1 channel, 28x28
    # Conv: 4 filters of size 3x3
    # Dense: 64 hidden neurons -> 10 output classes
    input_shape = (1, 28, 28)
    num_filters = 4   #  4 different feature maps   
    kernel_size = 3   # Size of each filter
    dense_layers = [64, 10] 
    
    # Initialize CNN
    cnn = CNN(input_shape, num_filters, kernel_size, dense_layers, padding=0, stride=1)

    # Train
    print("\nStarting Training ")
    # Higher learning rate because batch size is small and data is normalized
    cnn.train(x_train_sub, y_train_sub, lr=0.1, epochs=25,)

    # --- Test ---
    print("\nEvaluatin on Test Set...")
    probs = cnn.forward(x_test_sub)
    preds = np.argmax(probs, axis=1)
    acc = np.mean(preds == y_test_sub) ## average of boolean array of correct predictions
    print(f"Test Accuracy: {acc*100:.2f}%")

In [7]:
np.random.seed(42)
run_mnist_demo()

Loading MNIST data...
Training on 500 images. Input shape: (500, 1, 28, 28)
CNN Initialized. Output Classes: 10

Starting Training 
Epoch 1: Loss = 2.3651, Accuracy = 10.80%
Epoch 2: Loss = 2.2936, Accuracy = 15.00%
Epoch 3: Loss = 2.2488, Accuracy = 16.40%
Epoch 4: Loss = 2.2089, Accuracy = 19.60%
Epoch 5: Loss = 2.1676, Accuracy = 22.20%
Epoch 6: Loss = 2.1203, Accuracy = 24.40%
Epoch 7: Loss = 2.0632, Accuracy = 27.20%
Epoch 8: Loss = 1.9929, Accuracy = 31.80%
Epoch 9: Loss = 1.9056, Accuracy = 37.00%
Epoch 10: Loss = 1.7994, Accuracy = 45.60%
Epoch 11: Loss = 1.6739, Accuracy = 53.40%
Epoch 12: Loss = 1.5343, Accuracy = 57.20%
Epoch 13: Loss = 1.3895, Accuracy = 61.80%
Epoch 14: Loss = 1.2481, Accuracy = 66.20%
Epoch 15: Loss = 1.1173, Accuracy = 69.40%
Epoch 16: Loss = 1.0019, Accuracy = 72.40%
Epoch 17: Loss = 0.9030, Accuracy = 75.60%
Epoch 18: Loss = 0.8195, Accuracy = 78.40%
Epoch 19: Loss = 0.7503, Accuracy = 81.40%
Epoch 20: Loss = 0.6927, Accuracy = 82.40%
Epoch 21: Loss = 