1a

In [1]:

import numpy as np

class FourLayerMLP:
    """
    A simple 4-layer MLP with fully connected layers:
      - Layer 1: Input -> Hidden1
      - Layer 2: Hidden1 -> Hidden2
      - Layer 3: Hidden2 -> Hidden3
      - Layer 4: Hidden3 -> Output (linear; no activation is applied)

    The three hidden layers share one activation ('relu' or 'sigmoid').
    Training is full-batch gradient descent on a squared-error loss.
    """

    # Activation names accepted by the constructor and by _activate().
    SUPPORTED_ACTIVATIONS = ('relu', 'sigmoid')

    def __init__(self, input_dim, h1_dim, h2_dim, h3_dim, output_dim, activation='relu'):
        """
        input_dim:  size of input features
        h1_dim:     number of neurons in hidden layer 1
        h2_dim:     number of neurons in hidden layer 2
        h3_dim:     number of neurons in hidden layer 3
        output_dim: number of output neurons
        activation: 'relu' or 'sigmoid'

        Raises ValueError if `activation` is not one of the supported names.
        """
        # Fail early: an unknown name would otherwise only surface later as a
        # confusing error deep inside forward().
        if activation not in self.SUPPORTED_ACTIVATIONS:
            raise ValueError(
                f"Unsupported activation {activation!r}; "
                f"expected one of {self.SUPPORTED_ACTIVATIONS}"
            )
        self.activation = activation

        # Initialize weight & bias parameters.
        # Weights are small Gaussian noise (std 0.01); biases start at zero.
        #   W1 -> shape (input_dim, h1_dim),   b1 -> shape (1, h1_dim)
        #   W2 -> shape (h1_dim, h2_dim),      b2 -> shape (1, h2_dim)
        #   W3 -> shape (h2_dim, h3_dim),      b3 -> shape (1, h3_dim)
        #   W4 -> shape (h3_dim, output_dim),  b4 -> shape (1, output_dim)
        self.W1 = 0.01 * np.random.randn(input_dim, h1_dim)
        self.b1 = np.zeros((1, h1_dim))
        self.W2 = 0.01 * np.random.randn(h1_dim, h2_dim)
        self.b2 = np.zeros((1, h2_dim))
        self.W3 = 0.01 * np.random.randn(h2_dim, h3_dim)
        self.b3 = np.zeros((1, h3_dim))
        self.W4 = 0.01 * np.random.randn(h3_dim, output_dim)
        self.b4 = np.zeros((1, output_dim))

    def _activate(self, z, derivative=False):
        """
        Applies the activation function (ReLU or Sigmoid) element-wise to z.
        If derivative=True, returns the local gradient w.r.t. z instead.

        Raises ValueError if self.activation is not a supported name.
        """
        if self.activation == 'relu':
            if derivative:
                return (z > 0).astype(float)
            return np.maximum(0, z)
        if self.activation == 'sigmoid':
            # Overflow-free sigmoid: exp() only ever sees non-positive
            # arguments, so large |z| cannot overflow.
            e = np.exp(-np.abs(z))
            s = np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))
            if derivative:
                return s * (1.0 - s)
            return s
        raise ValueError(f"Unsupported activation {self.activation!r}")

    def forward(self, X):
        """
        Forward pass:
          Z1 = X W1 + b1   -> A1 = activate(Z1)
          Z2 = A1 W2 + b2  -> A2 = activate(Z2)
          Z3 = A2 W3 + b3  -> A3 = activate(Z3)
          Z4 = A3 W4 + b4  -> Output = A4 = Z4 (linear)

        All intermediate values are cached on the instance for backward().
        Returns A4.
        """
        self.X0 = X  # store input
        self.Z1 = X @ self.W1 + self.b1
        self.A1 = self._activate(self.Z1)

        self.Z2 = self.A1 @ self.W2 + self.b2
        self.A2 = self._activate(self.Z2)

        self.Z3 = self.A2 @ self.W3 + self.b3
        self.A3 = self._activate(self.Z3)

        self.Z4 = self.A3 @ self.W4 + self.b4
        self.A4 = self.Z4  # final layer is linear

        return self.A4

    def backward(self, X, Y, lr=0.001):
        """
        Backpropagation for a squared-error loss, followed by one gradient step.

        Uses the values cached by the most recent forward() call, so forward()
        must have been called first. X is only used for its batch size.

          dLoss/dA4 = A4 - Y
        and every parameter gradient is averaged over the batch, i.e. the
        gradients are those of sum((A4 - Y)^2) / (2 * batch_size).

        All gradients are computed before any parameter is modified.
        """
        m = X.shape[0]  # batch size
        dA4 = (self.A4 - Y)  # dLoss/dA4

        # Layer 4 (no activation on the output, so dZ4 == dA4)
        dZ4 = dA4
        dW4 = (self.A3.T @ dZ4) / m
        db4 = np.sum(dZ4, axis=0, keepdims=True) / m

        # backprop to layer 3
        dA3 = dZ4 @ self.W4.T
        dZ3 = dA3 * self._activate(self.Z3, derivative=True)
        dW3 = (self.A2.T @ dZ3) / m
        db3 = np.sum(dZ3, axis=0, keepdims=True) / m

        # backprop to layer 2
        dA2 = dZ3 @ self.W3.T
        dZ2 = dA2 * self._activate(self.Z2, derivative=True)
        dW2 = (self.A1.T @ dZ2) / m
        db2 = np.sum(dZ2, axis=0, keepdims=True) / m

        # backprop to layer 1
        dA1 = dZ2 @ self.W2.T
        dZ1 = dA1 * self._activate(self.Z1, derivative=True)
        dW1 = (self.X0.T @ dZ1) / m
        db1 = np.sum(dZ1, axis=0, keepdims=True) / m

        # Update parameters
        self.W4 -= lr * dW4
        self.b4 -= lr * db4
        self.W3 -= lr * dW3
        self.b3 -= lr * db3
        self.W2 -= lr * dW2
        self.b2 -= lr * db2
        self.W1 -= lr * dW1
        self.b1 -= lr * db1

    def train(self, X, Y, epochs=100, lr=0.001, verbose=True):
        """
        Train with gradient descent for 'epochs' full-batch passes over (X, Y).

        When verbose is true, the MSE is printed every 10th epoch and on the
        last epoch. The printed value is computed from the forward pass made
        at the start of that epoch, i.e. before that epoch's parameter update.
        """
        for epoch in range(epochs):
            self.forward(X)
            self.backward(X, Y, lr=lr)
            if verbose and (epoch % 10 == 0 or epoch == epochs-1):
                loss = np.mean((self.A4 - Y)**2)
                print(f"Epoch {epoch} | MSE Loss: {loss:.4f}")


def demo():
    """
    Train a FourLayerMLP (2 -> 3 -> 4 -> 3 -> 1, ReLU) on a small synthetic
    regression problem and print the loss as training progresses.
    """
    # Seed before anything draws random numbers, so that both the data and
    # the network's initial weights are reproducible from run to run.
    np.random.seed(42)

    # Fake data: X in R^2, Y in R^1
    X = np.random.randn(100, 2)
    Y = (X[:, 0:1]**2 + 0.5 * X[:, 1:2] - 1.0)  # some arbitrary target

    # We want a 4-layer MLP: input-> h1-> h2-> h3-> output
    net = FourLayerMLP(input_dim=2, h1_dim=3, h2_dim=4, h3_dim=3, output_dim=1, activation='relu')

    net.train(X, Y, epochs=50, lr=0.01, verbose=True)

if __name__ == "__main__":
    demo()


Epoch 0 | MSE Loss: 1.4333
Epoch 10 | MSE Loss: 1.4225
Epoch 20 | MSE Loss: 1.4137
Epoch 30 | MSE Loss: 1.4064
Epoch 40 | MSE Loss: 1.4005
Epoch 49 | MSE Loss: 1.3961


1b

In [2]:
#!/usr/bin/env python

import numpy as np

class MLP:
    """
    A simple MLP that supports multiple layers with either ReLU or Sigmoid activation.
    Uses NumPy for forward and backward propagation, illustrating the use of np.einsum
    for the batched outer product in the backward pass.

    Note that the chosen activation is applied to every layer, including the
    last one, so the network output lies in the activation's range.
    """

    # Activation names accepted by the constructor and by _activate().
    SUPPORTED_ACTIVATIONS = ('relu', 'sigmoid')

    def __init__(self, layer_sizes, activation='relu'):
        """
        layer_sizes: list of layer dimensions, e.g. [input_dim, hidden1_dim, hidden2_dim, output_dim].
        activation:  'relu' or 'sigmoid'.

        Raises ValueError if `activation` is not one of the supported names.
        """
        if activation not in self.SUPPORTED_ACTIVATIONS:
            raise ValueError(
                f"Unsupported activation {activation!r}; "
                f"expected one of {self.SUPPORTED_ACTIVATIONS}"
            )
        self.activation = activation
        self.num_layers = len(layer_sizes) - 1
        # Initialize weights and biases
        # weights[i] has shape (layer_sizes[i], layer_sizes[i+1]), std-0.01 Gaussian
        # biases[i]  has shape (layer_sizes[i+1], ), zeros
        self.weights = []
        self.biases = []
        for i in range(self.num_layers):
            w = 0.01 * np.random.randn(layer_sizes[i], layer_sizes[i+1])
            b = np.zeros(layer_sizes[i+1])
            self.weights.append(w)
            self.biases.append(b)

    def _activate(self, z, derivative=False):
        """
        Applies either ReLU or Sigmoid. If derivative=True, returns derivative w.r.t. z.

        Raises ValueError if self.activation is not a supported name.
        """
        if self.activation == 'relu':
            if derivative:
                return (z > 0).astype(float)
            return np.maximum(0, z)
        if self.activation == 'sigmoid':
            # Overflow-free sigmoid: exp() only ever sees non-positive
            # arguments, so large |z| cannot overflow.
            e = np.exp(-np.abs(z))
            s = np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))
            if derivative:
                return s * (1.0 - s)
            return s
        raise ValueError(f"Unsupported activation {self.activation!r}")

    def forward(self, X):
        """
        Forward pass through all layers. Store pre-activation (Z) and activation (A) for each layer.
        A[0] = X (input), A[L] = final output.
        """
        self.A = [X]          # Activations for each layer
        self.Z = []           # Pre-activations for each layer
        current = X

        for i in range(self.num_layers):
            # Z_i = A_{i} @ W_i + b_i
            # shape of current: (batch_size, layer_sizes[i])
            # shape of weights[i]: (layer_sizes[i], layer_sizes[i+1])
            z_i = current @ self.weights[i] + self.biases[i]
            self.Z.append(z_i)
            # Activation
            a_i = self._activate(z_i)
            self.A.append(a_i)
            current = a_i

        return self.A[-1]  # final layer activation

    def backward(self, Y, lr=0.01):
        """
        Backprop for a squared-error loss, followed by one gradient step per layer.

        Starts from dLoss/dA[L] = A[L] - Y and propagates backwards through the
        values cached by the most recent forward() call. Parameter gradients
        are averaged over the batch. np.einsum forms the sum of per-sample
        outer products for the weight gradient in a single call.

        Y shape: (batch_size, layer_sizes[-1])
        """
        batch_size = Y.shape[0]
        # Output layer error: dLoss/dA[L] = (A[L] - Y)
        dA = (self.A[-1] - Y)

        # We'll go layer by layer in reverse
        for i in reversed(range(self.num_layers)):
            # dZ = dA * activation'(Z)
            dZ = dA * self._activate(self.Z[i], derivative=True)

            # Weight gradient: sum over the batch of outer(A[i][b], dZ[b]).
            # shape(A[i]) = (batch_size, layer_sizes[i])
            # shape(dZ)   = (batch_size, layer_sizes[i+1])
            # result      = (layer_sizes[i], layer_sizes[i+1])
            dW = np.einsum('bi,bj->ij', self.A[i], dZ) / batch_size
            db = np.sum(dZ, axis=0) / batch_size

            # Propagate to the previous layer BEFORE touching weights[i]:
            # the chain rule needs the weights that produced this forward pass.
            if i > 0:
                # dA_{i-1} = dZ @ W_i^T
                # shape(dZ) = (batch_size, layer_sizes[i+1])
                # shape(W_i^T) = (layer_sizes[i+1], layer_sizes[i])
                dA = dZ @ self.weights[i].T

            # Update weights and biases
            self.weights[i] -= lr * dW
            self.biases[i]  -= lr * db

    def train(self, X, Y, epochs=100, lr=0.01, verbose=True):
        """
        Simple training loop: forward -> backward -> update.
        X shape: (batch_size, input_dim)
        Y shape: (batch_size, output_dim)

        When verbose is true, the loss from before that epoch's update is
        printed every 10th epoch.
        """
        for e in range(epochs):
            # Forward
            pred = self.forward(X)
            # Compute MSE
            loss = np.mean(0.5 * (pred - Y)**2)
            # Backprop
            self.backward(Y, lr=lr)

            if verbose and e % 10 == 0:
                print(f"Epoch {e}, MSE Loss: {loss:.5f}")


def demo():
    """
    Train a small sigmoid MLP ([2 -> 4 -> 1]) on synthetic data whose target
    is a linear combination of the two inputs, printing the loss as it goes.
    """
    # Seed before anything draws random numbers, so that both the data and
    # the network's initial weights are reproducible from run to run.
    np.random.seed(42)

    # Create dummy data: input X of shape (100, 2), Y of shape (100, 1)
    X = np.random.randn(100, 2)
    Y = (X[:, :1] * 0.5 + X[:, 1:] * (-0.3))  # some linear combination

    # A small MLP with 1 hidden layer [2 -> 4 -> 1];
    # use e.g. [2, 4, 4, 1] for 2 hidden layers.
    layer_sizes = [2, 4, 1]
    net = MLP(layer_sizes, activation='sigmoid')

    # Train
    net.train(X, Y, epochs=50, lr=0.01, verbose=True)

if __name__ == "__main__":
    demo()


Epoch 0, MSE Loss: 0.29671
Epoch 10, MSE Loss: 0.29264
Epoch 20, MSE Loss: 0.28866
Epoch 30, MSE Loss: 0.28479
Epoch 40, MSE Loss: 0.28101


2A 2B

In [3]:
#!/usr/bin/env python

import numpy as np

def relu(x, derivative=False):
    """
    Rectified linear unit, applied element-wise.

    Returns max(0, x), or — when derivative is true — a float mask that is
    1.0 where x is strictly positive and 0.0 elsewhere.
    """
    if not derivative:
        return np.maximum(0, x)
    positive_mask = x > 0
    return positive_mask.astype(float)

def sigmoid(x, derivative=False):
    """
    Logistic sigmoid 1 / (1 + exp(-x)), applied element-wise.

    When derivative is true, returns s * (1 - s) where s = sigmoid(x).

    The value is computed from exp(-|x|), which never exceeds 1, so inputs of
    large magnitude cannot overflow exp().
    """
    e = np.exp(-np.abs(x))
    # For x >= 0: 1 / (1 + exp(-x)); for x < 0 the equivalent exp(x) / (1 + exp(x)).
    s = np.where(x >= 0, 1.0 / (1.0 + e), e / (1.0 + e))
    if derivative:
        return s * (1.0 - s)
    return s

class SimpleCNN:
    """
    A minimal CNN with:
      - One convolutional layer (with a small number of filters).
      - One hidden fully connected layer.
      - One linear fully connected output layer for classification/regression.
      - ReLU or Sigmoid activation after the conv layer and the hidden FC layer.
    """

    def __init__(self, 
                 in_channels=1, 
                 out_channels=2, 
                 kernel_size=3, 
                 fc_size=4, 
                 num_classes=2,
                 activation='relu'):
        """
        in_channels:   number of channels in the input image (e.g. 1 for grayscale, 3 for RGB).
        out_channels:  number of filters in the conv layer.
        kernel_size:   size of each filter (assume square).
        fc_size:       number of hidden units in the fully connected layer.
        num_classes:   output dimension (e.g. number of classes).
        activation:    'relu' or 'sigmoid'.

        Raises ValueError if `activation` is neither 'relu' nor 'sigmoid'.
        """
        # Reject unknown names instead of silently falling back to sigmoid.
        if activation not in ('relu', 'sigmoid'):
            raise ValueError(
                f"Unsupported activation {activation!r}; expected 'relu' or 'sigmoid'"
            )
        self.activation_name = activation
        self.act_fn = relu if activation == 'relu' else sigmoid

        # Convolution filter weights: shape (out_channels, in_channels, kernel_size, kernel_size)
        # Bias for each filter: shape (out_channels,)
        self.conv_w = 0.01 * np.random.randn(out_channels, in_channels, kernel_size, kernel_size)
        self.conv_b = np.zeros((out_channels,))

        # The conv output size depends on the input height/width, so the fully
        # connected layers are created lazily on the first forward() call.
        self.fc_initialized = False
        self.fc_w = None  # will be (flattened_dim, fc_size)
        self.fc_b = None
        self.fc_w2 = None # final layer (fc_size, num_classes)
        self.fc_b2 = None
        self.num_classes = num_classes
        self.fc_size = fc_size

    def _conv2d_forward(self, x):
        """
        Forward pass for a 'valid' convolution (no padding, stride=1).
        x shape: (batch_size, in_channels, H, W)
        returns: (batch_size, out_channels, H_out, W_out)

        Loops over output positions only; the batch and output-channel
        dimensions are handled by a single einsum per position.
        """
        batch_size, in_c, H, W = x.shape
        out_c, _, K, _ = self.conv_w.shape
        H_out = H - K + 1
        W_out = W - K + 1

        out = np.zeros((batch_size, out_c, H_out, W_out))

        for i in range(H_out):
            for j in range(W_out):
                # Window under the kernel for every sample: (batch, in_c, K, K)
                patch = x[:, :, i:i+K, j:j+K]
                # Multiply with every filter and sum over (in_c, K, K) -> (batch, out_c)
                out[:, :, i, j] = np.einsum('nckl,ockl->no', patch, self.conv_w) + self.conv_b
        return out

    def _conv2d_backward(self, x, d_out):
        """
        Backward pass for the conv layer.
        x shape: (batch_size, in_channels, H, W)
        d_out shape: (batch_size, out_channels, H_out, W_out) => gradient wrt conv output
        returns:
          dx: gradient wrt input x
          dW: gradient wrt conv_w
          db: gradient wrt conv_b

        Gradients are summed (not averaged) over the batch.
        """
        out_c, _, K, _ = self.conv_w.shape
        _, _, H_out, W_out = d_out.shape

        dx = np.zeros_like(x)
        dW = np.zeros_like(self.conv_w)

        for i in range(H_out):
            for j in range(W_out):
                grad = d_out[:, :, i, j]          # (batch, out_c)
                patch = x[:, :, i:i+K, j:j+K]     # (batch, in_c, K, K)
                # Each output value contributes filter * grad to its input window...
                dx[:, :, i:i+K, j:j+K] += np.einsum('no,ockl->nckl', grad, self.conv_w)
                # ...and window * grad to the filter that produced it.
                dW += np.einsum('no,nckl->ockl', grad, patch)
        # The bias is added at every output position of every sample.
        db = d_out.sum(axis=(0, 2, 3))
        return dx, dW, db

    def forward(self, x):
        """
        Forward pass:
         1) Convolution -> activation
         2) Flatten
         3) Fully connected layer -> activation
         4) Fully connected layer -> final output (logits or regression)

        Intermediate values are cached on the instance for backward().
        Raises ValueError if the flattened conv output no longer matches the
        size the fully connected layer was created for.
        """
        self.x_in = x
        # 1) Conv
        self.z_conv = self._conv2d_forward(x)
        self.a_conv = self.act_fn(self.z_conv)

        # 2) Flatten
        self.batch_size = x.shape[0]
        self.conv_out_shape = self.a_conv.shape  # (batch, out_c, H_out, W_out)
        flat_dim = int(np.prod(self.conv_out_shape[1:]))  # out_c*H_out*W_out

        self.a_flat = self.a_conv.reshape(self.batch_size, flat_dim)

        # Lazy init for FC layers if needed
        if not self.fc_initialized:
            # first FC
            self.fc_w = 0.01 * np.random.randn(flat_dim, self.fc_size)
            self.fc_b = np.zeros((self.fc_size,))
            # second FC
            self.fc_w2 = 0.01 * np.random.randn(self.fc_size, self.num_classes)
            self.fc_b2 = np.zeros((self.num_classes,))
            self.fc_initialized = True
        elif self.fc_w.shape[0] != flat_dim:
            # The FC weights were sized for the first input seen.
            raise ValueError(
                f"Input produces {flat_dim} flattened features but the fully "
                f"connected layer was initialised for {self.fc_w.shape[0]}"
            )

        # 3) First FC
        self.z_fc = self.a_flat @ self.fc_w + self.fc_b  # shape (batch_size, fc_size)
        self.a_fc = self.act_fn(self.z_fc)

        # 4) Second FC (final)
        self.z_out = self.a_fc @ self.fc_w2 + self.fc_b2  # shape (batch_size, num_classes)
        # If classification, might want softmax here. We'll keep it linear for demonstration.
        self.out = self.z_out
        return self.out

    def backward(self, d_out, lr=0.001):
        """
        Backprop through the values cached by the most recent forward() call,
        then apply one gradient-descent step to every parameter.

          d_out shape: (batch_size, num_classes) => gradient w.r.t. final output

        All gradients are computed before any parameter is updated.
        Returns the gradient w.r.t. the input x.
        """
        # 1) Backprop final FC: out = a_fc @ fc_w2 + fc_b2
        d_a_fc = d_out @ self.fc_w2.T
        d_w2 = self.a_fc.T @ d_out
        d_b2 = np.sum(d_out, axis=0)

        # 2) Backprop first FC: a_fc = activation(z_fc), z_fc = a_flat @ fc_w + fc_b
        d_z_fc = d_a_fc * self.act_fn(self.z_fc, derivative=True)
        d_a_flat = d_z_fc @ self.fc_w.T
        d_w = self.a_flat.T @ d_z_fc
        d_b = np.sum(d_z_fc, axis=0)

        # 3) Reshape d_a_flat back to conv output shape
        d_a_conv = d_a_flat.reshape(self.conv_out_shape)

        # 4) Backprop activation from conv layer: a_conv = activation(z_conv)
        d_z_conv = d_a_conv * self.act_fn(self.z_conv, derivative=True)

        # 5) Backprop conv: x_in -> z_conv
        dx_conv, d_conv_w, d_conv_b = self._conv2d_backward(self.x_in, d_z_conv)

        # Update parameters
        self.fc_w2 -= lr * d_w2
        self.fc_b2 -= lr * d_b2
        self.fc_w  -= lr * d_w
        self.fc_b  -= lr * d_b
        self.conv_w -= lr * d_conv_w
        self.conv_b -= lr * d_conv_b

        return dx_conv  # in case we want the gradient wrt input

    def train(self, x, y, epochs=10, lr=0.001, verbose=True):
        """
        A simple full-batch training loop with a squared-error loss.
        y shape: (batch_size, num_classes).

        The gradient passed to backward() is (out - y) / batch_size. The
        printed loss is 0.5 * mean((out - y)^2) over all elements, so it
        differs from the optimised objective by a constant factor.
        When verbose is true the loss is printed every epoch.
        """
        for e in range(epochs):
            out = self.forward(x)
            loss = 0.5 * np.mean((out - y)**2)
            d_out = (out - y) / x.shape[0]  # average over batch

            self.backward(d_out, lr=lr)
            if verbose:
                print(f"Epoch {e}, Loss = {loss:.5f}")


# -------------------------------
# Demo usage
# -------------------------------
def demo():
    """
    Fit a SimpleCNN for five epochs on ten random single-channel 6x6 inputs
    with random two-dimensional targets, printing the loss each epoch.
    """
    np.random.seed(42)

    # Inputs: (batch_size=10, in_channels=1, H=6, W=6); targets: (10, 2)
    images = np.random.randn(10, 1, 6, 6)
    targets = np.random.randn(10, 2)

    # One conv layer -> one hidden FC layer -> linear output
    model = SimpleCNN(
        in_channels=1,
        out_channels=2,
        kernel_size=3,
        fc_size=4,
        num_classes=2,
        activation='relu',
    )
    model.train(images, targets, epochs=5, lr=0.01)

if __name__ == "__main__":
    demo()


Epoch 0, Loss = 0.47075
Epoch 1, Loss = 0.46744
Epoch 2, Loss = 0.46419
Epoch 3, Loss = 0.46101
Epoch 4, Loss = 0.45789


3

In [12]:
#
# This is a sample Notebook to demonstrate how to read "MNIST Dataset"
#
import numpy as np # linear algebra
import struct
from array import array
from os.path  import join

#
# MNIST Data Loader Class
#
class MnistDataloader(object):
    def __init__(self, training_images_filepath,training_labels_filepath,
                 test_images_filepath, test_labels_filepath):
        self.training_images_filepath = training_images_filepath
        self.training_labels_filepath = training_labels_filepath
        self.test_images_filepath = test_images_filepath
        self.test_labels_filepath = test_labels_filepath
    
    def read_images_labels(self, images_filepath, labels_filepath):        
        labels = []
        with open(labels_filepath, 'rb') as file:
            magic, size = struct.unpack(">II", file.read(8))
            if magic != 2049:
                raise ValueError('Magic number mismatch, expected 2049, got {}'.format(magic))
            labels = array("B", file.read())        
        
        with open(images_filepath, 'rb') as file:
            magic, size, rows, cols = struct.unpack(">IIII", file.read(16))
            if magic != 2051:
                raise ValueError('Magic number mismatch, expected 2051, got {}'.format(magic))
            image_data = array("B", file.read())        
        images = []
        for i in range(size):
            images.append([0] * rows * cols)
        for i in range(size):
            img = np.array(image_data[i * rows * cols:(i + 1) * rows * cols])
            img = img.reshape(28, 28)
            images[i][:] = img            
        
        return images, labels
            
    def load_data(self):
        x_train, y_train = self.read_images_labels(self.training_images_filepath, self.training_labels_filepath)
        x_test, y_test = self.read_images_labels(self.test_images_filepath, self.test_labels_filepath)
        return (x_train, y_train),(x_test, y_test) 

In [13]:
#
# Verify Reading Dataset via MnistDataloader class
#
%matplotlib inline
import random
import matplotlib.pyplot as plt

#
# Locations of the four MNIST IDX files. Each file sits inside a directory
# that shares its name, under `input_path`, which is resolved relative to the
# notebook's working directory. If the files are not present there, loading
# the data fails with FileNotFoundError.
#
input_path = '../input'
training_images_filepath = join(input_path, 'train-images-idx3-ubyte/train-images-idx3-ubyte')
training_labels_filepath = join(input_path, 'train-labels-idx1-ubyte/train-labels-idx1-ubyte')
test_images_filepath = join(input_path, 't10k-images-idx3-ubyte/t10k-images-idx3-ubyte')
test_labels_filepath = join(input_path, 't10k-labels-idx1-ubyte/t10k-labels-idx1-ubyte')

#
# Helper function to show a list of images with their relating titles
#
def show_images(images, title_texts):
    """
    Draw the given images on a 5-column grid in a new 30x20 figure.

    images:      sequence of 2-D image data accepted by plt.imshow.
    title_texts: one title per image; an empty string leaves that subplot
                 untitled. Images and titles are paired with zip(), so any
                 surplus in the longer sequence is ignored.
    """
    cols = 5
    # Ceiling division: exactly as many rows as needed, with no blank extra
    # row when the image count is a multiple of `cols`.
    rows = (len(images) + cols - 1) // cols
    plt.figure(figsize=(30,20))
    for index, (image, title_text) in enumerate(zip(images, title_texts), start=1):
        plt.subplot(rows, cols, index)
        plt.imshow(image, cmap=plt.cm.gray)
        if title_text != '':
            plt.title(title_text, fontsize = 15)

#
# Load MNIST dataset
#
mnist_dataloader = MnistDataloader(training_images_filepath, training_labels_filepath, test_images_filepath, test_labels_filepath)
(x_train, y_train), (x_test, y_test) = mnist_dataloader.load_data()

#
# Show some random training and test images 
#
# random.randrange(n) draws from 0..n-1, so every image can be picked and the
# index can never run past the end of the list (random.randint includes its
# upper bound). Using len() also avoids hard-coding the dataset sizes.
images_2_show = []
titles_2_show = []
for i in range(0, 10):
    r = random.randrange(len(x_train))
    images_2_show.append(x_train[r])
    titles_2_show.append('training image [' + str(r) + '] = ' + str(y_train[r]))    

for i in range(0, 5):
    r = random.randrange(len(x_test))
    images_2_show.append(x_test[r])        
    titles_2_show.append('test image [' + str(r) + '] = ' + str(y_test[r]))    

show_images(images_2_show, titles_2_show)


FileNotFoundError: [Errno 2] No such file or directory: '../input/train-labels-idx1-ubyte/train-labels-idx1-ubyte'