In [86]:
import numpy as np

# Activation functions
def relu(x):
    """ReLU activation: max(0, x) applied elementwise."""
    return np.maximum(0, x)

def tanh(x):
    """tanh activation: (e^x - e^{-x})/(e^x + e^{-x}) applied elementwise."""
    return np.tanh(x)

def softmax(x):
    """Softmax activation for output layer (assumes x is 1D or 2D batch of scores)."""
    # For numeric stability, subtract max
    x_shifted = x - np.max(x, axis=-1, keepdims=True)
    exp_x = np.exp(x_shifted)
    return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

In [87]:
def conv_forward(X, W, b):
    """
    Convolutional layer forward pass.
    X: input array of shape (C_in, H_in, W_in)
    W: weight/filter array of shape (C_out, C_in, kH, kW)
    b: bias array of shape (C_out,)
    Returns: output feature map of shape (C_out, H_out, W_out)

    TODO: use numpy for simplifying for loops
    """
    C_out, C_in, kH, kW = W.shape
    _, H_in, W_in = X.shape
    H_out = H_in - kH + 1  # using stride 1, no padding
    W_out = W_in - kW + 1
    # Initialize output volume
    out = np.zeros((C_out, H_out, W_out))
    # Convolution: slide each filter over the input
    for oc in range(C_out):             # for each output channel (filter)
        for i in range(H_out):          # slide vertically
            for j in range(W_out):      # slide horizontally
                # current region of input of shape (C_in, kH, kW)
                region = X[:, i:i+kH, j:j+kW]
                # element-wise multiply and sum -> dot product
                out[oc, i, j] = np.sum(region * W[oc]) + b[oc]
    return out

# Example usage:
# X has shape (1, 32, 32) for a single grayscale image, W shape (6, 1, 5, 5), b shape (6,)
X = np.random.rand(1, 32, 32)  # dummy input
W1 = np.random.rand(6, 1, 5, 5)
b1 = np.random.rand(6,)
out1 = conv_forward(X, W1, b1)
print(out1)  # should be (6, 28, 28)

out1_act = tanh(out1)


[[[ 8.65269689  9.42854675  8.95199431 ...  8.74085145  9.24134948
    9.55777101]
  [ 8.82670369  9.53657871  9.78077039 ...  8.09002482  7.8895319
    7.93820106]
  [ 9.2575507   9.44096558  8.817653   ...  7.8226503   7.86533669
    7.35286737]
  ...
  [ 7.98316966  8.64130349  7.83604953 ...  7.82073939  7.49803918
    7.97315672]
  [ 7.49586556  8.15670343  8.11818799 ...  7.86417731  7.86818697
    7.27293307]
  [ 8.25721098  8.72984269  8.56161828 ...  8.60501633  8.44472628
    7.9733721 ]]

 [[ 7.18332206  7.70161094  7.4187746  ...  7.5267895   8.05437829
    6.9330012 ]
  [ 7.67976425  8.18695511  7.58049322 ...  7.00137078  7.5586372
    6.63283405]
  [ 7.98354905  8.9939599   7.91851696 ...  6.71862261  7.09160049
    6.51825187]
  ...
  [ 6.67493552  7.3215337   7.05382329 ...  7.19265886  6.15742912
    6.21267981]
  [ 6.25791461  7.55573582  6.86366544 ...  6.77531094  6.50329463
    6.59781375]
  [ 6.12297282  7.19130921  7.03384585 ...  7.20716243  6.97956501
    6.88

In [88]:
def avg_pool_forward(X):
    """
    Average pooling forward pass (2x2 pool, stride 2).
    X: input array of shape (C, H, W)
    Returns: output array of shape (C, H/2, W/2)
    """
    C, H, W = X.shape
    # Assuming H and W are even and divisible by 2 for simplicity
    out = np.zeros((C, H//2, W//2))
    for c in range(C):
        for i in range(0, H, 2):         # step by 2
            for j in range(0, W, 2):     # step by 2
                patch = X[c, i:i+2, j:j+2]        # 2x2 region
                out[c, i//2, j//2] = np.mean(patch)
    return out

# Example usage:
X_pool_in = np.random.rand(6, 28, 28)  # e.g. output from conv layer (6,28,28)
out_pool = avg_pool_forward(X_pool_in)
print(out_pool.shape)  # should be (6, 14, 14)

(6, 14, 14)


In [89]:
def fc_forward(x, W, b):
    """
    Fully-connected layer forward pass.
    x: input vector of shape (N_in,)
    W: weight matrix of shape (N_out, N_in)
    b: bias vector of shape (N_out,)
    Returns: output vector of shape (N_out,)
    """
    return W.dot(x) + b

# Example usage:
x = np.random.rand(120,)    # input from previous layer (flattened conv output)
W4 = np.random.rand(84, 120)
b4 = np.random.rand(84,)
out_fc = fc_forward(x, W4, b4)
print(out_fc.shape)  # should be (84,)


(84,)


In [90]:
def lenet5_forward(image):
    """
    Forward pass for the entire LeNet-5 model on a single image.
    image: input image array of shape (1, 32, 32)  - 1 channel, 32x32 (assuming already zero-padded to 32x32 if originally 28x28).
    Returns: output probabilities for 10 classes (softmax output vector of shape (10,))
    """
    # Layer C1: Conv 5x5 -> 6 feature maps, then activation
    out_c1 = conv_forward(image, W1, b1)         # shape (6, 28, 28)
    out_c1 = tanh(out_c1)                        # apply tanh or relu
    
    # Layer S2: 2x2 Pooling -> 6 feature maps
    out_s2 = avg_pool_forward(out_c1)            # shape (6, 14, 14)
    
    # Layer C3: Conv 5x5 -> 16 feature maps, then activation
    out_c3 = conv_forward(out_s2, W2, b2)        # shape (16, 10, 10)
    out_c3 = tanh(out_c3)
    
    # Layer S4: 2x2 Pooling -> 16 feature maps
    out_s4 = avg_pool_forward(out_c3)            # shape (16, 5, 5)
    
    # Layer C5: Conv 5x5 -> 120 feature maps (1x1 each), then activation
    out_c5 = conv_forward(out_s4, W3, b3)        # shape (120, 1, 1)
    out_c5 = tanh(out_c5)
    # Flatten output of C5 to a vector of length 120:
    out_c5_flat = out_c5.reshape(-1)             # shape (120,)
    
    # Layer F6: Fully connected -> 84, then activation
    out_f6 = fc_forward(out_c5_flat, W4, b4)     # shape (84,)
    out_f6 = tanh(out_f6)
    
    # Output layer: Fully connected -> 10, then softmax
    out_out = fc_forward(out_f6, W5, b5)         # shape (10,)
    probs = softmax(out_out)                     # shape (10,) probabilities
    
    return probs


In [91]:
def fc_backward(dout, x, W):
    """
    Backprop for a fully-connected layer.
    dout: gradient of loss w.r.t. layer output (shape: N_out,)
    x: input to the layer from forward pass (shape: N_in,)
    W: weight matrix of this layer (shape: N_out, N_in)
    Returns: (dx, dW, db)
    """
    # Gradient w.rt input
    dx = W.T.dot(dout)         # shape: (N_in,)
    # Gradient w.rt weights
    dW = np.outer(dout, x)     # outer product of dout (N_out,) and x (N_in,) to get (N_out, N_in)
    # Gradient w.rt bias
    db = dout.copy()           # shape: (N_out,) (for a single sample, just copy dout; for batch, we'd sum dout over batch)
    return dx, dW, db

# Example usage:
dout = np.random.rand(84,)   # say gradient from next layer for F6 output (84,)
x = np.random.rand(120,)     # the input that was fed into this FC layer (flattened conv output)
W4 = np.random.rand(84, 120)
dx, dW, db = fc_backward(dout, x, W4)
print(dW.shape, db.shape, dx.shape)  # (84, 120), (84,), (120,)

(84, 120) (84,) (120,)


In [92]:
def conv_backward(dout, X, W):
    """
    Backprop for a convolutional layer (stride=1, no padding).
    dout: gradient of loss w.rt. conv layer output (shape: C_out, H_out, W_out)
    X: input to the conv layer from forward pass (shape: C_in, H_in, W_in)
    W: conv layer weight matrix (shape: C_out, C_in, kH, kW)
    Returns: (dX, dW, db)
    """
    C_out, C_in, kH, kW = W.shape
    _, H_in, W_in = X.shape
    _, H_out, W_out = dout.shape

    # Initialize gradients to zero
    dW = np.zeros_like(W)
    db = np.zeros(C_out)
    dX = np.zeros_like(X)
    # Compute gradients
    for oc in range(C_out):
        # Bias grad: sum of dout for this output channel
        db[oc] += np.sum(dout[oc])
        for i in range(H_out):
            for j in range(W_out):
                # The gradient from this output element
                grad_val = dout[oc, i, j]
                # The corresponding input region that contributed to this output
                input_region = X[:, i:i+kH, j:j+kW]
                # Weight gradients: add input_region * grad_val
                dW[oc] += grad_val * input_region
                # Input gradients: add filter weights * grad_val
                dX[:, i:i+kH, j:j+kW] += grad_val * W[oc]
    return dX, dW, db

# Example usage:
# dout has shape (6, 28, 28) matching conv1 output, X was (1, 32, 32), W1 was (6, 1, 5, 5)
dout = np.random.rand(6, 28, 28)
X = np.random.rand(1, 32, 32)
W1 = np.random.rand(6, 1, 5, 5)
dX, dW, db = conv_backward(dout, X, W1)
print(dW.shape, db.shape, dX.shape)  # (6,1,5,5), (6,), (1,32,32)


(6, 1, 5, 5) (6,) (1, 32, 32)


In [93]:
def avg_pool_backward(dout, X):
    """
    Backprop for 2x2 average pooling layer.
    dout: gradient of loss w.rt pooling output (shape: C, H_out, W_out)
    X: input to pooling layer from forward pass (shape: C, H_in, W_in)
    Returns: dX (same shape as X)
    """
    C, H_in, W_in = X.shape
    # H_out, W_out are half of H_in, W_in
    dX = np.zeros_like(X)
    # Each output gradient splits evenly to 4 inputs
    for c in range(C):
        for i in range(0, H_in, 2):
            for j in range(0, W_in, 2):
                # gradient for this 2x2 block in output
                grad_val = dout[c, i//2, j//2]
                # distribute it to each of the 4 input cells
                dX[c, i:i+2, j:j+2] += grad_val / 4.0
    return dX

# Example usage:
dout = np.random.rand(6, 14, 14)  # gradient from next layer (S2 output shape)
X = np.random.rand(6, 28, 28)     # original input to pooling (C1 output)
dX = avg_pool_backward(dout, X)
print(dX.shape)  # (6, 28, 28)


(6, 28, 28)


In [94]:
# Determine fan_in and fan_out for each layer
# Layer C1: 6 filters of size 5x5x1 -> fan_in = 1*5*5, fan_out = 6*5*5 (if considering each filter connects to 5x5 region in one output)
fan_in_c1 = 1 * 5 * 5
fan_out_c1 = 6 * 5 * 5
# Using Xavier (tanh) for conv layers as in original LeNet
W1 = np.random.randn(6, 1, 5, 5) * np.sqrt(2.0 / (fan_in_c1 + fan_out_c1))
b1 = np.zeros(6)

# Layer C3: 16 filters of size 5x5x6
fan_in_c3 = 6 * 5 * 5
fan_out_c3 = 16 * 5 * 5
W2 = np.random.randn(16, 6, 5, 5) * np.sqrt(2.0 / (fan_in_c3 + fan_out_c3))
b2 = np.zeros(16)

# Layer C5: 120 filters of size 5x5x16
fan_in_c5 = 16 * 5 * 5
fan_out_c5 = 120 * 5 * 5
W3 = np.random.randn(120, 16, 5, 5) * np.sqrt(2.0 / (fan_in_c5 + fan_out_c5))
b3 = np.zeros(120)

# Fully connected F6: 120 -> 84
fan_in_f6 = 120
fan_out_f6 = 84
W4 = np.random.randn(84, 120) * np.sqrt(2.0 / (fan_in_f6 + fan_out_f6))
b4 = np.zeros(84)

# Output layer: 84 -> 10
fan_in_out = 84
fan_out_out = 10
W5 = np.random.randn(10, 84) * np.sqrt(2.0 / (fan_in_out + fan_out_out))
b5 = np.zeros(10)


In [95]:
def cross_entropy_loss(logits, y_true):
    """
    Compute cross-entropy loss for classification.
    
    Parameters:
    - logits: Raw model outputs of shape (batch_size, num_classes)
    - y_true: True labels, can be either:
              - Class indices of shape (batch_size,)
              - One-hot encoded vectors of shape (batch_size, num_classes)
    
    Returns:
    - Average cross-entropy loss over the batch
    """
    # Apply softmax to get probabilities
    # Shift by max for numerical stability
    shifted_logits = logits - np.max(logits, axis=1, keepdims=True)
    exp_logits = np.exp(shifted_logits)
    probs = exp_logits / np.sum(exp_logits, axis=1, keepdims=True)
    
    batch_size = logits.shape[0]
    
    # Handle both one-hot and class index formats
    if len(y_true.shape) == 1:
        # y_true contains class indices
        batch_indices = np.arange(batch_size)
        true_probs = probs[batch_indices, y_true]
        loss = -np.mean(np.log(true_probs + 1e-12))  # Add epsilon to avoid log(0)
    else:
        # y_true is one-hot encoded
        loss = -np.sum(y_true * np.log(probs + 1e-12)) / batch_size
    
    return loss

In [96]:
from tensorflow.keras.datasets import mnist

# Load MNIST dataset (it returns train and test splits)
(X_train, y_train), (X_test, y_test) = mnist.load_data()
print("MNIST data shape:", X_train.shape, y_train.shape, X_test.shape, y_test.shape)

MNIST data shape: (60000, 28, 28) (60000,) (10000, 28, 28) (10000,)


In [97]:
# Normalize pixel values to 0-1
X_train = X_train.astype(np.float32) / 255.0
X_test = X_test.astype(np.float32) / 255.0

# Select a smaller subset of training data
train_subset_size = 5000  # Adjust as needed
indices_train = np.random.choice(X_train.shape[0], train_subset_size, replace=False)
X_train = X_train[indices_train]
y_train = y_train[indices_train]
print(f"Using {train_subset_size} training examples instead of 60000")

# Select a smaller subset of test data
test_subset_size = 5000  # Adjust as needed
indices_test = np.random.choice(X_test.shape[0], test_subset_size, replace=False)
X_test = X_test[indices_test]
y_test = y_test[indices_test]
print(f"Using {test_subset_size} test examples instead of 10000")

# Pad images from 28x28 to 32x32
X_train_padded = np.pad(X_train, ((0,0),(2,2),(2,2)), mode='constant')
X_test_padded = np.pad(X_test, ((0,0),(2,2),(2,2)), mode='constant')
print("After padding:", X_train_padded.shape, X_test_padded.shape)

# Reshape to add channel dimension
X_train_padded = X_train_padded.reshape(-1, 1, 32, 32)
X_test_padded = X_test_padded.reshape(-1, 1, 32, 32)
print("After adding channel dim:", X_train_padded.shape, X_test_padded.shape)

Using 5000 training examples instead of 60000
Using 5000 test examples instead of 10000
After padding: (5000, 32, 32) (5000, 32, 32)
After adding channel dim: (5000, 1, 32, 32) (5000, 1, 32, 32)


In [None]:
# Hyperparameters
learning_rate = 0.01
num_epochs = 5

for epoch in range(num_epochs):
    # Shuffle training data (optional for SGD)
    permutation = np.random.permutation(X_train_padded.shape[0])
    X_train_padded = X_train_padded[permutation]
    y_train = y_train[permutation]
    
    total_loss = 0.0
    for n in range(X_train_padded.shape[0]):
        x = X_train_padded[n]        # shape (1,32,32)
        y = y_train[n]              # true label 0-9
        
        # ===== Forward pass =====
        out_c1 = tanh(conv_forward(x, W1, b1))
        out_s2 = avg_pool_forward(out_c1)
        out_c3 = tanh(conv_forward(out_s2, W2, b2))
        out_s4 = avg_pool_forward(out_c3)
        out_c5 = tanh(conv_forward(out_s4, W3, b3))    # shape (120,1,1)
        out_c5_flat = out_c5.reshape(-1)               # flatten to (120,)
        out_f6 = tanh(fc_forward(out_c5_flat, W4, b4)) # shape (84,)
        out_final = fc_forward(out_f6, W5, b5)         # shape (10,)
        probs = softmax(out_final)                    # shape (10,)
        
        # Compute cross-entropy loss and initial gradient
        # Create one-hot vector for true label
        y_onehot = np.zeros(10)
        y_onehot[y] = 1.0
        # Loss for this sample (cross-entropy): -sum(y_onehot * log(probs))
        loss = -np.sum(y_onehot * np.log(probs + 1e-8))
        total_loss += loss
        
        # Gradient of loss w.rt. pre-softmax scores
        dout = probs - y_onehot  # shape (10,)
        
        # ===== Backward pass =====
        # Backprop through output layer (FC 10)
        d_out_f6, dW5, db5 = fc_backward(dout, out_f6, W5)
        # Backprop through activation (tanh) at F6
        d_out_f6 *= (1 - out_f6**2)
        
        # F6 layer (FC 84) backward
        d_out_c5_flat, dW4, db4 = fc_backward(d_out_f6, out_c5_flat, W4)
        # Backprop through activation (tanh) at C5 output
        out_c5_vec = out_c5.reshape(-1)       # 120,
        d_out_c5_vec = d_out_c5_flat * (1 - out_c5_vec**2)
        d_out_c5 = d_out_c5_vec.reshape(120, 1, 1)  # reshape to (120,1,1) to match conv output shape
        
        # C5 conv layer backward
        d_out_s4, dW3, db3 = conv_backward(d_out_c5, out_s4, W3)
        # Backprop through activation (tanh) at C3 output
        d_out_s4 = d_out_s4  # S4 output was before activation, actually C5 input = S4 output, no activation directly on S4.
        
        # S4 pooling backward
        d_out_c3 = avg_pool_backward(d_out_s4, out_c3)
        # Backprop through activation (tanh) at C3
        d_out_c3 *= (1 - out_c3**2)
        
        # C3 conv layer backward
        d_out_s2, dW2, db2 = conv_backward(d_out_c3, out_s2, W2)
        # S2 pooling backward
        d_out_c1 = avg_pool_backward(d_out_s2, out_c1)
        # Backprop through activation (tanh) at C1
        d_out_c1 *= (1 - out_c1**2)
        
        # C1 conv layer backward
        _, dW1, db1 = conv_backward(d_out_c1, x, W1)  # we don't need dX for input layer further
        
        # ===== Update weights with SGD =====
        W5 -= learning_rate * dW5; b5 -= learning_rate * db5
        W4 -= learning_rate * dW4; b4 -= learning_rate * db4
        W3 -= learning_rate * dW3; b3 -= learning_rate * db3
        W2 -= learning_rate * dW2; b2 -= learning_rate * db2
        W1 -= learning_rate * dW1; b1 -= learning_rate * db1
    # End of epoch
    avg_loss = total_loss / X_train_padded.shape[0]
    print(f"Epoch {epoch+1}: average loss = {avg_loss:.4f}")


   


Epoch 1: average loss = 0.5404
Epoch 2: average loss = 0.2573
Epoch 3: average loss = 0.1679
Epoch 4: average loss = 0.1143
Epoch 5: average loss = 0.0725


In [None]:
correct = 0
total = X_test_padded.shape[0]
for n in range(total):
    x = X_test_padded[n]
    y_true = y_test[n]
    # Forward pass (using the same lenet5_forward or expanded steps as before)
    probs = lenet5_forward(x)         # get softmax probabilities for this test image
    y_pred = np.argmax(probs)         # predicted class is index of max probability
    if y_pred == y_true:
        correct += 1
accuracy = correct / total
print(f"Test Accuracy = {accuracy*100:.2f}%")

Test Accuracy = 95.80%


In [106]:
image_path = "./image.png"
img = Image.open(image_path).convert('L')  # Convert to grayscale

# 2. Resize to 28x28 (MNIST standard size)
img = img.resize((28, 28))

# 3. Convert to numpy array and normalize
img_array = np.array(img).astype(np.float32) / 255.0

# 4. Invert colors if needed (assuming black digit on white background)
# MNIST has white digits on black background
img_array = 1.0 - img_array  # Comment this out if your image is already white digit on black

# 6. Pad to 32x32 (to match your model's expected input)
img_padded = np.pad(img_array, ((2, 2), (2, 2)), mode='constant')

img_input = img_padded.reshape(1, 32, 32)

probs = lenet5_forward(img_input)
y_pred = np.argmax(probs)

# 5. Display prediction result
print(f"Model prediction: {y_pred}")
print(f"Confidence: {probs[y_pred]:.4f}")

Model prediction: 4
Confidence: 0.9943
