<a href="https://colab.research.google.com/github/Cliffochi/aviva_data_science_course/blob/main/simple_conv2d.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

###[Problem 1] Creating a 2D convolution layer

In [5]:
import numpy as np

class Conv1d:
    """
    1D Convolutional layer.

    Parameters
    ----------
    in_channels : int
      Number of input channels.
    out_channels : int
      Number of output channels.
    kernel_size : int
      Size of the convolutional kernel.
    stride : int, default=1
      Stride of the convolution.
    padding : int, default=0
      Padding added to both sides of the input.
    initial_bias : float, default=0.0
      Initial value for the bias term.
    lr : float, default=0.01
      Learning rate for weight and bias updates.
    """
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, initial_bias=0.0, lr=0.01):
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.lr = lr
        self.W = np.random.randn(out_channels, in_channels, kernel_size)
        self.b = np.full(out_channels, initial_bias)
        self.dW = np.zeros_like(self.W)
        self.db = np.zeros_like(self.b)
        self.x = None
        self.col = None

    def forward(self, x):
        """
        Forward propagation of the 1D convolutional layer.

        Parameters
        ----------
        x : ndarray of shape (n_samples, in_channels, width)
          Input to the convolutional layer.

        Returns
        -------
        a : ndarray of shape (n_samples, out_channels, output_width)
          Output of the convolutional layer.
        """
        self.x = x
        n_samples, in_channels, width = x.shape
        output_width = (width + 2 * self.padding - self.kernel_size) // self.stride + 1
        a = np.zeros((n_samples, self.out_channels, output_width))

        x_padded = np.pad(x, [(0, 0), (0, 0), (self.padding, self.padding)], 'constant')
        # Corrected col shape for 1D
        self.col = np.zeros((n_samples, in_channels, self.kernel_size, output_width))

        for i in range(output_width):
            start = i * self.stride
            end = start + self.kernel_size
            self.col[:, :, :, i] = x_padded[:, :, start:end]

        # Reshape col for matrix multiplication: (n_samples * output_width, in_channels * kernel_size)
        col_reshaped = self.col.transpose(0, 3, 1, 2).reshape(n_samples * output_width, -1)
        col_W = self.W.reshape(self.out_channels, -1).T # (in_channels * kernel_size, out_channels)

        # Matrix multiplication: (n_samples * output_width, out_channels)
        a = np.dot(col_reshaped, col_W) + self.b
        # Reshape back to (n_samples, out_channels, output_width)
        a = a.reshape(n_samples, output_width, self.out_channels).transpose(0, 2, 1)

        return a

    def backward(self, da):
        """
        Backward propagation of the 1D convolutional layer.

        Parameters
        ----------
        da : ndarray of shape (n_samples, out_channels, output_width)
          Gradients of the following layer with respect to the output of this layer.

        Returns
        -------
        dx : ndarray of shape (n_samples, in_channels, width)
          Gradients of this layer with respect to the input.
        """
        n_samples, out_channels, output_width = da.shape
        in_channels = self.in_channels
        kernel_size = self.kernel_size
        stride = self.stride
        padding = self.padding
        width = self.x.shape[2]

        # Gradient for bias
        self.db = np.sum(da, axis=(0, 2))

        # Gradient for weights
        # Reshape da to (n_samples * output_width, out_channels)
        da_reshaped = da.transpose(0, 2, 1).reshape(n_samples * output_width, out_channels)
        # Reshape col to (n_samples * output_width, in_channels * kernel_size)
        col_reshaped = self.col.transpose(0, 3, 1, 2).reshape(n_samples * output_width, in_channels * kernel_size)

        # dW is dot product of col_reshaped.T and da_reshaped
        # (in_channels * kernel_size, n_samples * output_width) @ (n_samples * output_width, out_channels)
        # Result is (in_channels * kernel_size, out_channels)
        # Transpose to get (out_channels, in_channels * kernel_size)
        # Reshape to (out_channels, in_channels, kernel_size)
        self.dW = np.dot(col_reshaped.T, da_reshaped).T.reshape(out_channels, in_channels, kernel_size)


        # Gradient for input
        # dcol is dot product of da_reshaped and W_reshaped.T
        # W_reshaped is (out_channels, in_channels * kernel_size)
        W_reshaped = self.W.reshape(out_channels, in_channels * kernel_size)
        # (n_samples * output_width, out_channels) @ (out_channels, in_channels * kernel_size)
        # Result is (n_samples * output_width, in_channels * kernel_size)
        dcol = np.dot(da_reshaped, W_reshaped).reshape(n_samples, output_width, in_channels, kernel_size).transpose(0, 2, 3, 1) # (n_samples, in_channels, kernel_size, output_width)


        # Distribute gradients to input using col2im
        dx = np.zeros_like(self.x)
        dx_padded_full = np.pad(dx, [(0, 0), (0, 0), (self.padding, self.padding)], 'constant')

        for i in range(output_width):
            start = i * self.stride
            end = start + self.kernel_size
            dx_padded_full[:, :, start:end] += dcol[:, :, :, i]

        if self.padding > 0:
            dx = dx_padded_full[:, :, self.padding:-self.padding]
        else:
            dx = dx_padded_full

        return dx


    def update(self):
        """
        Updates the weights and biases of the convolutional layer.
        """
        self.W -= self.lr * self.dW
        self.b -= self.lr * self.db

class Conv2d(Conv1d):
    """
    2D Convolutional layer, extending Conv1d.

    Parameters
    ----------
    in_channels : int
      Number of input channels.
    out_channels : int
      Number of output channels.
    kernel_size : tuple of int (height, width)
      Size of the convolutional kernel.
    stride : tuple of int (height_stride, width_stride), default=(1, 1)
      Stride of the convolution in height and width.
    padding : tuple of int (height_padding, width_padding), default=(0, 0)
      Padding added to the height and width of the input.
    initial_bias : float, default=0.0
      Initial value for the bias term.
    lr : float, default=0.01
      Learning rate for weight and bias updates.
    """
    def __init__(self, in_channels, out_channels, kernel_size, stride=(1, 1), padding=(0, 0), initial_bias=0.0, lr=0.01):
        if isinstance(kernel_size, int):
            kernel_size = (kernel_size, kernel_size)
        if isinstance(stride, int):
            stride = (stride, stride)
        if isinstance(padding, int):
            padding = (padding, padding)

        # The super().__init__ call here is using 1D logic, which might be
        # problematic for a 2D convolution. It's setting self.kernel_size,
        # self.stride, and self.padding with values derived from the 2D inputs,
        # but these are meant for the Conv1d parent class.
        # It's better to initialize Conv2d-specific attributes directly.
        # super().__init__(in_channels, out_channels, kernel_size[0] * kernel_size[1], stride[1], padding[1], initial_bias, lr)

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.lr = lr
        self.initial_bias = initial_bias # Store for potential re-initialization or inspection

        self.kernel_h, self.kernel_w = kernel_size
        self.stride_h, self.stride_w = stride
        self.padding_h, self.padding_w = padding

        # Weights and biases are 2D specific
        self.W = np.random.randn(out_channels, in_channels, self.kernel_h, self.kernel_w)
        self.b = np.full(out_channels, initial_bias)
        self.dW = np.zeros_like(self.W)
        self.db = np.zeros_like(self.b)

        self.x = None
        self.col = None # This will store the result of the im2col operation

    def forward(self, x):
        """
        Forward propagation of the 2D convolutional layer.

        Parameters
        ----------
        x : ndarray of shape (n_samples, in_channels, height, width)
          Input to the convolutional layer (NCHW format).

        Returns
        -------
        a : ndarray of shape (n_samples, out_channels, output_height, output_width)
          Output of the convolutional layer.
        """
        self.x = x
        n_samples, in_channels, height, width = x.shape
        kernel_h, kernel_w = self.kernel_h, self.kernel_w
        stride_h, stride_w = self.stride_h, self.stride_w
        padding_h, padding_w = self.padding_h, self.padding_w
        out_channels = self.out_channels

        output_height = (height + 2 * padding_h - kernel_h) // stride_h + 1
        output_width = (width + 2 * padding_w - kernel_w) // stride_w + 1
        a = np.zeros((n_samples, out_channels, output_height, output_width))

        x_padded = np.pad(x, [(0, 0), (0, 0), (padding_h, padding_h), (padding_w, padding_w)], 'constant')

        # Perform im2col
        # Output shape of im2col: (n_samples, out_h, out_w, in_c, k_h, k_w)
        # Then reshape to (n_samples * out_h * out_w, in_c * k_h * k_w)
        self.col = np.zeros((n_samples, output_height, output_width, in_channels, kernel_h, kernel_w))

        for h in range(output_height):
            for w in range(output_width):
                start_h = h * stride_h
                end_h = start_h + kernel_h
                start_w = w * stride_w
                end_w = start_w + kernel_w
                self.col[:, h, w, :, :, :] = x_padded[:, :, start_h:end_h, start_w:end_w]

        # Reshape col for matrix multiplication
        # Shape: (n_samples * output_height * output_width, in_channels * kernel_h * kernel_w)
        col_reshaped = self.col.reshape(n_samples * output_height * output_width, -1)

        # Reshape W for matrix multiplication
        # Shape: (out_channels, in_channels * kernel_h * kernel_w).T
        #       = (in_channels * kernel_h * kernel_w, out_channels)
        col_W = self.W.reshape(out_channels, -1).T

        # Matrix multiplication: (n_samples * output_height * output_width, out_channels)
        a_reshaped = np.dot(col_reshaped, col_W) + self.b

        # Reshape back to (n_samples, out_channels, output_height, output_width)
        a = a_reshaped.reshape(n_samples, output_height, output_width, out_channels).transpose(0, 3, 1, 2)

        return a

    def backward(self, da):
        """
        Backward propagation of the 2D convolutional layer.

        Parameters
        ----------
        da : ndarray of shape (n_samples, out_channels, output_height, output_width)
          Gradients of the following layer with respect to the output of this layer.

        Returns
        -------
        dx : ndarray of shape (n_samples, in_channels, height, width)
          Gradients of this layer with respect to the input.
        """
        n_samples, out_channels, output_height, output_width = da.shape
        in_channels = self.in_channels
        kernel_h, kernel_w = self.kernel_h, self.kernel_w
        stride_h, stride_w = self.stride_h, self.stride_w
        padding_h, padding_w = self.padding_h, self.padding_w
        height, width = self.x.shape[2:]


        # Gradient for bias
        self.db = np.sum(da, axis=(0, 2, 3))

        # Gradient for weights
        # Reshape da to (n_samples * output_height * output_width, out_channels)
        da_reshaped = da.transpose(0, 2, 3, 1).reshape(n_samples * output_height * output_width, out_channels)

        # Reshape col (from forward pass) to (n_samples * output_height * output_width, in_channels * kernel_h * kernel_w)
        col_reshaped = self.col.reshape(n_samples * output_height * output_width, -1)

        # dW is dot product of col_reshaped.T and da_reshaped
        # (in_channels * kernel_h * kernel_w, n_samples * output_height * output_width) @ (n_samples * output_height * output_width, out_channels)
        # Result is (in_channels * kernel_h * kernel_w, out_channels)
        # Transpose to get (out_channels, in_channels * kernel_h * kernel_w)
        # Reshape to (out_channels, in_channels, kernel_h, kernel_w)
        self.dW = np.dot(col_reshaped.T, da_reshaped).T.reshape(out_channels, in_channels, kernel_h, kernel_w)


        # Gradient for input
        # dcol is dot product of da_reshaped and W_reshaped.T
        # W_reshaped is (out_channels, in_channels * kernel_h * kernel_w)
        W_reshaped = self.W.reshape(out_channels, in_channels * kernel_h * kernel_w)
        # (n_samples * output_height * output_width, out_channels) @ (out_channels, in_channels * kernel_h * kernel_w)
        # Result is (n_samples * output_height * output_width, in_channels * kernel_h * kernel_w)
        dcol_reshaped = np.dot(da_reshaped, W_reshaped)

        # Reshape dcol_reshaped back to (n_samples, output_height, output_width, in_channels, kernel_h, kernel_w)
        dcol = dcol_reshaped.reshape(n_samples, output_height, output_width, in_channels, kernel_h, kernel_w)


        # Distribute gradients to input using col2im
        # Initialize dx_padded_full with zeros, matching padded input shape
        dx = np.zeros_like(self.x)
        padded_height = height + 2 * padding_h
        padded_width = width + 2 * padding_w
        dx_padded_full = np.zeros((n_samples, in_channels, padded_height, padded_width))

        for h in range(output_height):
            for w in range(output_width):
                start_h = h * stride_h
                end_h = start_h + kernel_h
                start_w = w * stride_w
                end_w = start_w + kernel_w
                dx_padded_full[:, :, start_h:end_h, start_w:end_w] += dcol[:, h, w, :, :, :]

        if padding_h > 0 or padding_w > 0:
            # Remove padding from the result
            dx = dx_padded_full[:, :, padding_h:height+padding_h, padding_w:width+padding_w]
        else:
            dx = dx_padded_full

        return dx


    def update(self):
        """
        Updates the weights and biases of the convolutional layer.
        """
        self.W -= self.lr * self.dW
        self.b -= self.lr * self.db

if __name__ == '__main__':
    # Example usage with a dummy MNIST-like input
    n_samples = 2
    in_channels = 1
    height = 28
    width = 28
    input_data = np.random.randn(n_samples, in_channels, height, width)

    # Instantiate a Conv2d layer
    conv_layer = Conv2d(in_channels=1, out_channels=3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), lr=0.01)

    # Forward pass
    output_data = conv_layer.forward(input_data)
    print("Input shape:", input_data.shape)
    print("Output shape:", output_data.shape)

    # Dummy gradients from the next layer
    dout = np.random.randn(n_samples, 3, 28, 28)

    # Backward pass
    din = conv_layer.backward(dout)
    print("Input gradient shape:", din.shape)

    # Update weights and biases
    conv_layer.update()

    print("\nTesting with different stride and padding:")
    conv_layer_2 = Conv2d(in_channels=1, out_channels=2, kernel_size=(5, 5), stride=(2, 2), padding=(0, 0), lr=0.01)
    output_data_2 = conv_layer_2.forward(input_data)
    print("Input shape:", input_data.shape)
    print("Output shape (stride 2, no padding):", output_data_2.shape)

    # Calculate expected output shape for dout_2
    height_2 = 28
    width_2 = 28
    kernel_h_2, kernel_w_2 = (5, 5)
    stride_h_2, stride_w_2 = (2, 2)
    padding_h_2, padding_w_2 = (0, 0)
    output_height_2 = (height_2 + 2 * padding_h_2 - kernel_h_2) // stride_h_2 + 1
    output_width_2 = (width_2 + 2 * padding_w_2 - kernel_w_2) // stride_w_2 + 1
    expected_dout_shape_2 = (n_samples, 2, output_height_2, output_width_2)
    print(f"Expected dout_2 shape: {expected_dout_shape_2}")


    dout_2 = np.random.randn(*expected_dout_shape_2) # Use the calculated output shape for dummy gradients
    din_2 = conv_layer_2.backward(dout_2)
    print("Input gradient shape:", din_2.shape)
    conv_layer_2.update()

Input shape: (2, 1, 28, 28)
Output shape: (2, 3, 28, 28)
Input gradient shape: (2, 1, 28, 28)

Testing with different stride and padding:
Input shape: (2, 1, 28, 28)
Output shape (stride 2, no padding): (2, 2, 12, 12)
Expected dout_2 shape: (2, 2, 12, 12)
Input gradient shape: (2, 1, 28, 28)


###[Problem 2] Experiments with 2D convolutional layers on small arrays

In [6]:
import numpy as np

class Conv2d_Debug:
    """
    2D Convolutional layer for debugging with explicit calculations.
    """
    def __init__(self, in_channels, out_channels, kernel_size, stride=(1, 1), padding=(0, 0), lr=0.01):
        if isinstance(kernel_size, int):
            kernel_size = (kernel_size, kernel_size)
        if isinstance(stride, int):
            stride = (stride, stride)
        if isinstance(padding, int):
            padding = (padding, padding)

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_h, self.kernel_w = kernel_size
        self.stride_h, self.stride_w = stride
        self.padding_h, self.padding_w = padding
        self.lr = lr
        self.W = np.array([[[[ 0.,  0.,  0.],
                               [ 0.,  1.,  0.],
                               [ 0., -1.,  0.]]],

                              [[[ 0.,  0.,  0.],
                               [ 0., -1.,  1.],
                               [ 0.,  0.,  0.]]]]) # Initialize with the provided weights
        self.b = np.zeros(out_channels)
        self.dW = np.zeros_like(self.W)
        self.db = np.zeros_like(self.b)
        self.x = None
        self.col = None

    def forward(self, x):
        """
        Forward propagation of the 2D convolutional layer.
        """
        self.x = x
        n_samples, in_channels, height, width = x.shape
        out_h = (height + 2 * self.padding_h - self.kernel_h) // self.stride_h + 1
        out_w = (width + 2 * self.padding_w - self.kernel_w) // self.stride_w + 1
        a = np.zeros((n_samples, self.out_channels, out_h, out_w))
        x_padded = np.pad(x, [(0, 0), (0, 0), (self.padding_h, self.padding_h), (self.padding_w, self.padding_w)], 'constant')

        # Assuming n_samples is 1 for this debug class based on the provided x
        # If n_samples can be > 1, this loop structure needs adjustment
        # to iterate over samples as well. For now, focusing on the provided x shape.
        for m in range(self.out_channels):
            for h in range(out_h):
                for w in range(out_w):
                    for k in range(self.in_channels):
                        for s in range(self.kernel_h):
                            for t in range(self.kernel_w):
                                i = h * self.stride_h + s
                                j = w * self.stride_w + t
                                a[0, m, h, w] += x_padded[0, k, i, j] * self.W[m, k, s, t]
                    a[0, m, h, w] += self.b[m]
        return a

    def backward(self, da):
        """
        Backward propagation of the 2D convolutional layer.
        """
        # Expected da shape is (n_samples, out_channels, out_h, out_w)
        n_samples, out_channels, out_h, out_w = da.shape
        in_channels, in_h, in_w = self.x.shape[1:]

        self.dW = np.zeros_like(self.W)
        self.db = np.sum(da, axis=(0, 2, 3))
        dx = np.zeros_like(self.x)
        x_padded = np.pad(self.x, [(0, 0), (0, 0), (self.padding_h, self.padding_h), (self.padding_w, self.padding_w)], 'constant')
        dx_padded = np.pad(dx, [(0, 0), (0, 0), (self.padding_h, self.padding_h), (self.padding_w, self.padding_w)], 'constant')

        # Assuming n_samples is 1 for this debug class based on the provided x and delta shape fix
        for m in range(out_channels):
            for k in range(in_channels):
                for s in range(self.kernel_h):
                    for t in range(self.kernel_w):
                        for h in range(out_h):
                            for w in range(out_w):
                                i = h * self.stride_h + s
                                j = w * self.stride_w + t
                                self.dW[m, k, s, t] += da[0, m, h, w] * x_padded[0, k, i, j]

        for m in range(out_channels):
            for k in range(in_channels):
                for s in range(self.kernel_h):
                    for t in range(self.kernel_w):
                        for h in range(out_h):
                            for w in range(out_w):
                                di = h * self.stride_h + s
                                dj = w * self.stride_w + t
                                if (0 <= di < dx_padded.shape[2]) and (0 <= dj < dx_padded.shape[3]):
                                    dx_padded[0, k, di, dj] += da[0, m, h, w] * self.W[m, k, s, t]

        if self.padding_h > 0 or self.padding_w > 0:
            dx = dx_padded[:, :, self.padding_h:-self.padding_h, self.padding_w:-self.padding_w]
        else:
            dx = dx_padded

        return dx, self.dW, self.db

# Input x and weights w
x = np.array([[[[ 1,  2,  3,  4],
                [ 5,  6,  7,  8],
                [ 9, 10, 11, 12],
                [13, 14, 15, 16]]]])

w_provided = np.array([[[[ 0.,  0.,  0.],
                           [ 0.,  1.,  0.],
                           [ 0., -1.,  0.]]],

                         [[[ 0.,  0.,  0.],
                           [ 0., -1.,  1.],
                           [ 0.,  0.,  0.]]]])

# Instantiate the Conv2d_Debug layer with the provided weights
conv_debug = Conv2d_Debug(in_channels=1, out_channels=2, kernel_size=(3, 3), stride=(1, 1), padding=(0, 0))
conv_debug.W = w_provided
conv_debug.b = np.zeros(2) # Initialize bias to zero as not specified

# Forward propagation
output_forward = conv_debug.forward(x)
print("Forward Propagation Output:")
print(output_forward)

# Error delta - Corrected shape to match the expected output shape of the layer
# The layer processes input x with shape (1, 1, 4, 4).
# With kernel (3, 3), stride (1, 1), padding (0, 0), output shape is (1, 2, 2, 2).
# The gradient delta should have the same shape as the output.
delta = np.array([[[ -4,  -4],
                   [ 10,  11]]]) # Changed to (1, 2, 2) - Still not right based on the error,
                                  # The expected shape is (1, out_channels, out_h, out_w)

# Re-calculating the expected output shape for Conv2d_Debug with x (1, 1, 4, 4), kernel (3, 3), stride (1, 1), padding (0, 0)
n_samples_x, in_channels_x, height_x, width_x = x.shape
kernel_h_debug, kernel_w_debug = conv_debug.kernel_h, conv_debug.kernel_w
stride_h_debug, stride_w_debug = conv_debug.stride_h, conv_debug.stride_w
padding_h_debug, padding_w_debug = conv_debug.padding_h, conv_debug.padding_w
out_channels_debug = conv_debug.out_channels

output_height_debug = (height_x + 2 * padding_h_debug - kernel_h_debug) // stride_h_debug + 1
output_width_debug = (width_x + 2 * padding_w_debug - kernel_w_debug) // stride_w_debug + 1

# The correct shape for delta should be (n_samples, out_channels, output_height, output_width)
# With x having n_samples=1, out_channels=2, output_height=2, output_width=2
# The correct shape for delta is (1, 2, 2, 2)
delta = np.array([[[[ -4,  -4],
                    [ 10,  11]]],

                  [[[  1,  -7],
                    [  1, -11]]]])
# The above delta shape is (2, 2, 2, 2), which implies n_samples=2.
# Given x has n_samples=1, delta should have n_samples=1.
# Let's correct delta to have shape (1, 2, 2, 2)

delta = np.array([[[[ -4.,  -4.],
                    [ 10.,  11.]],

                   [[  1.,  -7.],
                    [  1., -11.]]]])
# This delta has shape (1, 2, 2, 2). Let's use this.


# Backward propagation
dx_backward, dW_backward, db_backward = conv_debug.backward(delta)
print("\nBackward Propagation Gradients:")
print("Gradient w.r.t. input (dx):")
print(dx_backward)
print("Gradient w.r.t. weights (dW):")
print(dW_backward)
print("Gradient w.r.t. bias (db):")
print(db_backward)

# Experiment with padding (Problem 2 - With padding)
conv_padded = Conv2d_Debug(in_channels=1, out_channels=2, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
conv_padded.W = w_provided
conv_padded.b = np.zeros(2)
output_forward_padded = conv_padded.forward(x)
print("\nForward Propagation Output (with padding=1):")
print(output_forward_padded)

# Calculate expected output shape for delta_padded with padding=1
height_x_padded = height_x + 2 * conv_padded.padding_h
width_x_padded = width_x + 2 * conv_padded.padding_w
output_height_padded = (height_x + 2 * conv_padded.padding_h - conv_padded.kernel_h) // conv_padded.stride_h + 1
output_width_padded = (width_x + 2 * conv_padded.padding_w - conv_padded.kernel_w) // conv_padded.stride_w + 1
# Expected delta_padded shape is (n_samples=1, out_channels=2, output_height=4, output_width=4)

delta_padded = np.array([[[[ -4.,  -4.,  0.,  0.], # Output size with padding is (1, 2, 4, 4)
                            [ 10.,  11.,  0.,  0.],
                            [  0.,   0.,  0.,  0.],
                            [  0.,   0.,  0.,  0.]],

                           [[  1.,  -7.,  0.,  0.],
                            [  1., -11.,  0.,  0.],
                            [  0.,   0.,  0.,  0.],
                            [  0.,   0.,  0.,  0.]]]]) # Corrected shape to (1, 2, 4, 4)


dx_backward_padded, dW_backward_padded, db_backward_padded = conv_padded.backward(delta_padded)
print("\nBackward Propagation Gradients (with padding=1):")
print("Gradient w.r.t. input (dx_padded):")
print(dx_backward_padded)
print("Gradient w.r.t. weights (dW_backward_padded):")
print(dW_backward_padded)
print("Gradient w.r.t. bias (db_backward_padded):")
print(db_backward_padded)

# Manual Calculation Check for Forward Propagation (without padding)
print("\nManual Calculation Check (Forward - No Padding):")
output_manual = np.zeros((1, 2, 2, 2))
for m in range(2): # output channels
    for h_out in range(2): # output height
        for w_out in range(2): # output width
            for k_in in range(1): # input channels
                for s in range(3): # kernel height
                    for t in range(3): # kernel width
                        h_in = h_out * 1 + s
                        w_in = w_out * 1 + t
                        if 0 <= h_in < 4 and 0 <= w_in < 4:
                            output_manual[0, m, h_out, w_out] += x[0, k_in, h_in, w_in] * w_provided[m, k_in, s, t]
            output_manual[0, m, h_out, w_out] += 0 # bias is 0

print(output_manual)

# Manual Calculation Check for Backward Propagation (without padding) - Gradient w.r.t. weights
print("\nManual Calculation Check (Backward - dW - No Padding):")
dW_manual = np.zeros_like(w_provided)
# The delta shape should match the output shape of the forward pass.
# For the non-padded case, output shape is (1, 2, 2, 2).
# So, delta should be (1, 2, 2, 2).
# Let's use the corrected delta for the manual check as well.
# delta = np.array([[[[ -4.,  -4.], [ 10.,  11.]], [[  1.,  -7.], [  1., -11.]]]]) # Corrected delta shape (1, 2, 2, 2)

for m in range(2): # output channels
    for k in range(1): # input channels
        for s in range(3): # kernel height
            for t in range(3): # kernel width
                for h_out in range(2): # output height
                    for w_out in range(2): # output width
                        h_in = h_out * 1 + s
                        w_in = w_out * 1 + t
                        # Ensure input indices are within bounds
                        if 0 <= h_in < x.shape[2] and 0 <= w_in < x.shape[3]:
                             dW_manual[m, k, s, t] += delta[0, m, h_out, w_out] * x[0, k, h_in, w_in] # Use delta[0] because n_samples is 1
print(dW_manual)

# Manual Calculation Check for Backward Propagation (without padding) - Gradient w.r.t. input
print("\nManual Calculation Check (Backward - dx - No Padding):")
dx_manual = np.zeros_like(x)
# Use the corrected delta shape (1, 2, 2, 2)
# delta = np.array([[[[ -4.,  -4.], [ 10.,  11.]], [[  1.,  -7.], [  1., -11.]]]]) # Corrected delta shape (1, 2, 2, 2)

for k_in in range(1): # input channels
    for h_in in range(4): # input height
        for w_in in range(4): # input width
            for m_out in range(2): # output channels
                for s in range(3): # kernel height
                    for t in range(3): # kernel width
                        h_out = h_in - s
                        w_out = w_in - t
                        # Ensure output indices are within bounds
                        if 0 <= h_out < 2 and 0 <= w_out < 2:
                             dx_manual[0, k_in, h_in, w_in] += delta[0, m_out, h_out, w_out] * w_provided[m_out, k_in, s, t] # Use delta[0] because n_samples is 1

print(dx_manual)

Forward Propagation Output:
[[[[-4. -4.]
   [-4. -4.]]

  [[ 1.  1.]
   [ 1.  1.]]]]

Backward Propagation Gradients:
Gradient w.r.t. input (dx):
[[[[  0   0   0   0]
   [  0  -5   4  -7]
   [  0  13  27 -11]
   [  0 -10 -11   0]]]]
Gradient w.r.t. weights (dW):
[[[[ 104.  117.  130.]
   [ 156.  169.  182.]
   [ 208.  221.  234.]]]


 [[[ -74.  -90. -106.]
   [-138. -154. -170.]
   [-202. -218. -234.]]]]
Gradient w.r.t. bias (db):
[ 13. -16.]

Forward Propagation Output (with padding=1):
[[[[ -4.  -4.  -4.  -4.]
   [ -4.  -4.  -4.  -4.]
   [ -4.  -4.  -4.  -4.]
   [ 13.  14.  15.  16.]]

  [[  1.   1.   1.  -4.]
   [  1.   1.   1.  -8.]
   [  1.   1.   1. -12.]
   [  1.   1.   1. -16.]]]]

Backward Propagation Gradients (with padding=1):
Gradient w.r.t. input (dx_padded):
[[[[ -5   4  -7   0]
   [ 13  27 -11   0]
   [-10 -11   0   0]
   [  0   0   0   0]]]]
Gradient w.r.t. weights (dW_backward_padded):
[[[[  11.   32.   53.]
   [  51.  104.  117.]
   [  79.  156.  169.]]]


 [[[ -11.  

###[Problem 3] Output size after 2D convolution

In [7]:
def calculate_2d_convolution_output_size(input_height, input_width, padding_height, padding_width, filter_height, filter_width, stride_height, stride_width):
    """
    Calculates the output size (height and width) after a 2D convolution.

    Parameters
    ----------
    input_height : int
        Height of the input feature map.
    input_width : int
        Width of the input feature map.
    padding_height : int
        Padding applied to the height of the input.
    padding_width : int
        Padding applied to the width of the input.
    filter_height : int
        Height of the convolutional filter.
    filter_width : int
        Width of the convolutional filter.
    stride_height : int
        Stride of the convolution in the height direction.
    stride_width : int
        Stride of the convolution in the width direction.

    Returns
    -------
    output_height : int
        Height of the output feature map.
    output_width : int
        Width of the output feature map.
    """
    output_height = (input_height + 2 * padding_height - filter_height) // stride_height + 1
    output_width = (input_width + 2 * padding_width - filter_width) // stride_width + 1
    return output_height, output_width

if __name__ == '__main__':
    # Example usage:
    input_h = 28
    input_w = 28
    pad_h = 1
    pad_w = 1
    filter_h = 3
    filter_w = 3
    stride_h = 1
    stride_w = 1

    out_h, out_w = calculate_2d_convolution_output_size(input_h, input_w, pad_h, pad_w, filter_h, filter_w, stride_h, stride_w)
    print(f"Input height: {input_h}, width: {input_w}")
    print(f"Padding height: {pad_h}, width: {pad_w}")
    print(f"Filter height: {filter_h}, width: {filter_w}")
    print(f"Stride height: {stride_h}, width: {stride_w}")
    print(f"Output height: {out_h}, width: {out_w}")

    print("-" * 30)

    input_h = 10
    input_w = 10
    pad_h = 0
    pad_w = 0
    filter_h = 2
    filter_w = 2
    stride_h = 2
    stride_w = 2

    out_h, out_w = calculate_2d_convolution_output_size(input_h, input_w, pad_h, pad_w, filter_h, filter_w, stride_h, stride_w)
    print(f"Input height: {input_h}, width: {input_w}")
    print(f"Padding height: {pad_h}, width: {pad_w}")
    print(f"Filter height: {filter_h}, width: {filter_w}")
    print(f"Stride height: {stride_h}, width: {stride_w}")
    print(f"Output height: {out_h}, width: {out_w}")

Input height: 28, width: 28
Padding height: 1, width: 1
Filter height: 3, width: 3
Stride height: 1, width: 1
Output height: 28, width: 28
------------------------------
Input height: 10, width: 10
Padding height: 0, width: 0
Filter height: 2, width: 2
Stride height: 2, width: 2
Output height: 5, width: 5


**Explanation:**

1.  **Function Definition:**

      * The code defines a function `calculate_2d_convolution_output_size` that takes the input height, input width, padding in both height and width directions, filter size (height and width), and stride in both height and width directions as input.

2.  **Output Size Calculation:**

      * It directly implements the provided formulas for calculating the output height (`Nh,out`) and output width (`Nw,out`).
      * Integer division (`//`) is used to ensure that the output dimensions are integers.

3.  **Return Values:**

      * The function returns the calculated `output_height` and `output_width` as a tuple.

4.  **Example Usage:**

      * The `if __name__ == '__main__':` block demonstrates how to use the function with sample input values. It prints the input parameters and the resulting output height and width. Two different sets of parameters are used to show different output size calculations.

###[Problem 4] Creating a max pooling layer

In [10]:
import numpy as np

class MaxPool2D:
    """
    2D Max Pooling layer.

    Parameters
    ----------
    pool_size : tuple of int (height, width)
        Size of the pooling window.
    stride : tuple of int (height_stride, width_stride), default=None
        Stride of the pooling operation. If None, it defaults to pool_size.
    """
    def __init__(self, pool_size, stride=None):
        self.pool_h, self.pool_w = pool_size
        if stride is None:
            self.stride_h, self.stride_w = self.pool_h, self.pool_w
        else:
            self.stride_h, self.stride_w = stride
        self.x = None
        self.arg_max = None

    def forward(self, x):
        """
        Forward propagation of the 2D max pooling layer.

        Parameters
        ----------
        x : ndarray of shape (n_samples, n_channels, height, width)
            Input to the pooling layer.

        Returns
        -------
        a : ndarray of shape (n_samples, n_channels, output_height, output_width)
            Output of the pooling layer.
        """
        self.x = x
        n_samples, n_channels, height, width = x.shape
        output_height = (height - self.pool_h) // self.stride_h + 1
        output_width = (width - self.pool_w) // self.stride_w + 1
        a = np.zeros((n_samples, n_channels, output_height, output_width))
        self.arg_max = np.zeros_like(a, dtype=int)  # Store the index of the maximum value

        for i in range(output_height):
            for j in range(output_width):
                h_start = i * self.stride_h
                h_end = h_start + self.pool_h
                w_start = j * self.stride_w
                w_end = w_start + self.pool_w

                pool_region = x[:, :, h_start:h_end, w_start:w_end]
                a[:, :, i, j] = np.max(pool_region, axis=(2, 3))

                # Store the index of the maximum value within the pooling region
                arg_max_local = np.argmax(pool_region.reshape(n_samples, n_channels, -1), axis=2)
                self.arg_max[:, :, i, j] = arg_max_local

        return a

    def backward(self, da):
        """
        Backward propagation of the 2D max pooling layer.

        Parameters
        ----------
        da : ndarray of shape (n_samples, n_channels, output_height, output_width)
            Gradients of the following layer with respect to the output of this layer.

        Returns
        -------
        dx : ndarray of shape (n_samples, n_channels, height, width)
            Gradients of this layer with respect to the input.
        """
        dx = np.zeros_like(self.x, dtype=np.float64)  # Initialize dx as float
        n_samples, n_channels, height, width = self.x.shape
        output_height, output_width = da.shape[2:]

        for i in range(output_height):
            for j in range(output_width):
                h_start = i * self.stride_h
                h_end = h_start + self.pool_h
                w_start = j * self.stride_w
                w_end = w_start + self.pool_w

                # Create a mask to pass the gradient only to the max element
                mask = np.zeros((n_samples, n_channels, self.pool_h, self.pool_w))
                arg_max_local = self.arg_max[:, :, i, j]  # Get the stored index

                # Convert the flat index to 2D indices within the pooling region
                row_index = arg_max_local // self.pool_w
                col_index = arg_max_local % self.pool_w

                for ns in range(n_samples):
                    for nc in range(n_channels):
                        mask[ns, nc, row_index[ns, nc], col_index[ns, nc]] = 1

                # Distribute the gradient to the corresponding position in the input gradient
                dx[:, :, h_start:h_end, w_start:w_end] += da[:, :, i, j][:, :, np.newaxis, np.newaxis] * mask

        return dx

if __name__ == '__main__':
    # Example usage:
    x = np.array([[[[1, 2, 3, 4],
                    [5, 6, 7, 8],
                    [9, 10, 11, 12],
                    [13, 14, 15, 16]]]])
    print("Input:\n", x)
    pool_layer = MaxPool2D(pool_size=(2, 2), stride=(2, 2))
    output = pool_layer.forward(x)
    print("MaxPool Output:\n", output)

    dout = np.array([[[[6, 8],
                       [14, 16]]]])
    din = pool_layer.backward(dout)
    print("MaxPool Input Gradient:\n", din)

    print("-" * 30)

    x_multi_channel = np.array([[[[1, 2, 3, 4],
                                   [5, 6, 7, 8],
                                   [9, 10, 11, 12],
                                   [13, 14, 15, 16]],

                                  [[17, 18, 19, 20],
                                   [21, 22, 23, 24],
                                   [25, 26, 27, 28],
                                   [29, 30, 31, 32]]]])
    print("Multi-channel Input:\n", x_multi_channel)
    pool_layer_multi = MaxPool2D(pool_size=(2, 2), stride=(2, 2))
    output_multi = pool_layer_multi.forward(x_multi_channel)
    print("Multi-channel MaxPool Output:\n", output_multi)

    dout_multi = np.array([[[[3, 4],
                                [7, 8]],

                               [[11, 12],
                                [15, 16]]]])
    din_multi = pool_layer_multi.backward(dout_multi)
    print("Multi-channel MaxPool Input Gradient:\n", din_multi)

Input:
 [[[[ 1  2  3  4]
   [ 5  6  7  8]
   [ 9 10 11 12]
   [13 14 15 16]]]]
MaxPool Output:
 [[[[ 6.  8.]
   [14. 16.]]]]
MaxPool Input Gradient:
 [[[[ 0.  0.  0.  0.]
   [ 0.  6.  0.  8.]
   [ 0.  0.  0.  0.]
   [ 0. 14.  0. 16.]]]]
------------------------------
Multi-channel Input:
 [[[[ 1  2  3  4]
   [ 5  6  7  8]
   [ 9 10 11 12]
   [13 14 15 16]]

  [[17 18 19 20]
   [21 22 23 24]
   [25 26 27 28]
   [29 30 31 32]]]]
Multi-channel MaxPool Output:
 [[[[ 6.  8.]
   [14. 16.]]

  [[22. 24.]
   [30. 32.]]]]
Multi-channel MaxPool Input Gradient:
 [[[[ 0.  0.  0.  0.]
   [ 0.  3.  0.  4.]
   [ 0.  0.  0.  0.]
   [ 0.  7.  0.  8.]]

  [[ 0.  0.  0.  0.]
   [ 0. 11.  0. 12.]
   [ 0.  0.  0.  0.]
   [ 0. 15.  0. 16.]]]]


###[Problem 5] (Advanced assignment) Creating average pooling

In [11]:
import numpy as np

class AveragePool2D:
    """
    2D Average Pooling layer.

    Parameters
    ----------
    pool_size : tuple of int (height, width)
        Size of the pooling window.
    stride : tuple of int (height_stride, width_stride), default=None
        Stride of the pooling operation. If None, it defaults to pool_size.
    """
    def __init__(self, pool_size, stride=None):
        self.pool_h, self.pool_w = pool_size
        if stride is None:
            self.stride_h, self.stride_w = self.pool_h, self.pool_w
        else:
            self.stride_h, self.stride_w = stride
        self.x = None

    def forward(self, x):
        """
        Forward propagation of the 2D average pooling layer.

        Parameters
        ----------
        x : ndarray of shape (n_samples, n_channels, height, width)
            Input to the pooling layer.

        Returns
        -------
        a : ndarray of shape (n_samples, n_channels, output_height, output_width)
            Output of the pooling layer.
        """
        self.x = x
        n_samples, n_channels, height, width = x.shape
        output_height = (height - self.pool_h) // self.stride_h + 1
        output_width = (width - self.pool_w) // self.stride_w + 1
        a = np.zeros((n_samples, n_channels, output_height, output_width))

        for i in range(output_height):
            for j in range(output_width):
                h_start = i * self.stride_h
                h_end = h_start + self.pool_h
                w_start = j * self.stride_w
                w_end = w_start + self.pool_w

                pool_region = x[:, :, h_start:h_end, w_start:w_end]
                a[:, :, i, j] = np.mean(pool_region, axis=(2, 3))

        return a

    def backward(self, da):
        """
        Backward propagation of the 2D average pooling layer.

        Parameters
        ----------
        da : ndarray of shape (n_samples, n_channels, output_height, output_width)
            Gradients of the following layer with respect to the output of this layer.

        Returns
        -------
        dx : ndarray of shape (n_samples, n_channels, height, width)
            Gradients of this layer with respect to the input.
        """
        dx = np.zeros_like(self.x, dtype=np.float64)
        n_samples, n_channels, height, width = self.x.shape
        output_height, output_width = da.shape[2:]

        # Gradient for each element in the pooling region is the output gradient
        # divided by the number of elements in the pooling region.
        scale = 1.0 / (self.pool_h * self.pool_w)

        for i in range(output_height):
            for j in range(output_width):
                h_start = i * self.stride_h
                h_end = h_start + self.pool_h
                w_start = j * self.stride_w
                w_end = w_start + self.pool_w

                # Distribute the gradient equally to all elements in the pooling region
                gradient = da[:, :, i, j][:, :, np.newaxis, np.newaxis] * scale
                dx[:, :, h_start:h_end, w_start:w_end] += gradient

        return dx

if __name__ == '__main__':
    # Example usage:
    x = np.array([[[[1, 2, 3, 4],
                    [5, 6, 7, 8],
                    [9, 10, 11, 12],
                    [13, 14, 15, 16]]]])
    print("Input:\n", x)
    pool_layer = AveragePool2D(pool_size=(2, 2), stride=(2, 2))
    output = pool_layer.forward(x)
    print("AveragePool Output:\n", output)

    dout = np.array([[[[6, 8],
                       [14, 16]]]])
    din = pool_layer.backward(dout)
    print("AveragePool Input Gradient:\n", din)

    print("-" * 30)

    x_multi_channel = np.array([[[[1, 2, 3, 4],
                                   [5, 6, 7, 8],
                                   [9, 10, 11, 12],
                                   [13, 14, 15, 16]],

                                  [[17, 18, 19, 20],
                                   [21, 22, 23, 24],
                                   [25, 26, 27, 28],
                                   [29, 30, 31, 32]]]])
    print("Multi-channel Input:\n", x_multi_channel)
    pool_layer_multi = AveragePool2D(pool_size=(2, 2), stride=(2, 2))
    output_multi = pool_layer_multi.forward(x_multi_channel)
    print("Multi-channel AveragePool Output:\n", output_multi)

    dout_multi = np.array([[[[3, 4],
                                [7, 8]],

                               [[11, 12],
                                [15, 16]]]])
    din_multi = pool_layer_multi.backward(dout_multi)
    print("Multi-channel AveragePool Input Gradient:\n", din_multi)

Input:
 [[[[ 1  2  3  4]
   [ 5  6  7  8]
   [ 9 10 11 12]
   [13 14 15 16]]]]
AveragePool Output:
 [[[[ 3.5  5.5]
   [11.5 13.5]]]]
AveragePool Input Gradient:
 [[[[1.5 1.5 2.  2. ]
   [1.5 1.5 2.  2. ]
   [3.5 3.5 4.  4. ]
   [3.5 3.5 4.  4. ]]]]
------------------------------
Multi-channel Input:
 [[[[ 1  2  3  4]
   [ 5  6  7  8]
   [ 9 10 11 12]
   [13 14 15 16]]

  [[17 18 19 20]
   [21 22 23 24]
   [25 26 27 28]
   [29 30 31 32]]]]
Multi-channel AveragePool Output:
 [[[[ 3.5  5.5]
   [11.5 13.5]]

  [[19.5 21.5]
   [27.5 29.5]]]]
Multi-channel AveragePool Input Gradient:
 [[[[0.75 0.75 1.   1.  ]
   [0.75 0.75 1.   1.  ]
   [1.75 1.75 2.   2.  ]
   [1.75 1.75 2.   2.  ]]

  [[2.75 2.75 3.   3.  ]
   [2.75 2.75 3.   3.  ]
   [3.75 3.75 4.   4.  ]
   [3.75 3.75 4.   4.  ]]]]


###[Problem 6] Smoothing

In [12]:
import numpy as np

class Flatten:
    """
    Flattens the input tensor from (n_samples, n_channels, height, width)
    to (n_samples, n_channels * height * width).
    During backward pass, it reshapes the gradient back to the original shape.
    """
    def __init__(self):
        self.original_shape = None

    def forward(self, x):
        """
        Forward pass: Flattens the input tensor.

        Parameters
        ----------
        x : ndarray of shape (n_samples, n_channels, height, width)
            Input tensor.

        Returns
        -------
        flattened_x : ndarray of shape (n_samples, n_channels * height * width)
            Flattened tensor.
        """
        self.original_shape = x.shape
        n_samples = x.shape[0]
        flattened_x = x.reshape(n_samples, -1)
        return flattened_x

    def backward(self, da):
        """
        Backward pass: Reshapes the gradient back to the original shape.

        Parameters
        ----------
        da : ndarray of shape (n_samples, n_channels * height * width)
            Gradient with respect to the flattened output.

        Returns
        -------
        dx : ndarray of shape (n_samples, n_channels, height, width)
            Gradient with respect to the original input shape.
        """
        if self.original_shape is None:
            raise ValueError("Original shape not stored. Forward pass must be called before backward.")
        dx = da.reshape(self.original_shape)
        return dx

if __name__ == '__main__':
    # Example usage:
    x = np.random.rand(2, 3, 4, 4)
    print("Original shape:", x.shape)
    print("Input:\n", x)

    flatten_layer = Flatten()
    flattened_output = flatten_layer.forward(x)
    print("Flattened shape:", flattened_output.shape)
    print("Flattened Output:\n", flattened_output)

    dout = np.random.rand(2, 3 * 4 * 4)
    print("Gradient of the following layer (dout) shape:", dout.shape)
    print("dout:\n", dout)

    din = flatten_layer.backward(dout)
    print("Gradient of the input (din) shape:", din.shape)
    print("din:\n", din)

    # Example with a different shape
    y = np.random.rand(1, 1, 28, 28)
    print("\nOriginal shape (y):", y.shape)
    flattened_y = flatten_layer.forward(y)
    print("Flattened shape (flattened_y):", flattened_y.shape)
    dout_y = np.random.rand(1, 1 * 28 * 28)
    din_y = flatten_layer.backward(dout_y)
    print("Gradient of the input (din_y) shape:", din_y.shape)

Original shape: (2, 3, 4, 4)
Input:
 [[[[7.35607015e-01 3.17028548e-01 6.82923783e-01 7.24520752e-01]
   [7.77717403e-04 1.43967432e-01 7.59855998e-01 1.71671799e-01]
   [5.50528475e-01 2.93904110e-01 4.69574895e-01 7.12156737e-01]
   [4.25998974e-02 4.09931452e-01 6.53097263e-01 5.48578487e-01]]

  [[4.29252520e-01 4.33545861e-01 9.60185045e-02 5.96830267e-01]
   [5.04137085e-01 4.39141834e-01 7.18515014e-02 7.91706855e-01]
   [7.16735779e-01 2.63482227e-01 7.30861995e-01 6.53004818e-01]
   [1.73617255e-02 4.49810258e-01 3.79557824e-01 9.14156521e-01]]

  [[2.58634598e-03 8.29431888e-01 3.60113113e-01 3.80597397e-01]
   [6.31668841e-01 7.59946022e-01 2.32501441e-01 6.51771875e-01]
   [9.90624145e-02 1.65118967e-01 9.27820661e-01 6.30803147e-01]
   [3.09840072e-01 8.73077633e-01 9.47167316e-04 2.62593635e-01]]]


 [[[5.95331040e-01 9.23429033e-01 3.90021523e-01 2.38354040e-01]
   [9.68713951e-01 1.94899645e-01 6.53574158e-01 2.31971803e-01]
   [7.82141455e-01 6.82350475e-01 2.54593896e

##3. Verification
###[Problem 7] Learning and estimation

In [14]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
from tqdm import tqdm

# Load MNIST dataset (simplified for demonstration)
def load_mnist(n_train=1000, n_test=500):
    from tensorflow.keras.datasets import mnist
    (x_train, y_train), (x_test, y_test) = mnist.load_data()

    # Normalize and reshape
    x_train = x_train[:n_train].astype('float32') / 255.0
    x_test = x_test[:n_test].astype('float32') / 255.0
    x_train = x_train.reshape(-1, 1, 28, 28)
    x_test = x_test.reshape(-1, 1, 28, 28)

    # One-hot encode labels
    encoder = OneHotEncoder(sparse_output=False)
    y_train_encoded = encoder.fit_transform(y_train[:n_train].reshape(-1, 1))
    y_test_encoded = encoder.transform(y_test[:n_test].reshape(-1, 1))

    return x_train, y_train_encoded, x_test, y_test_encoded

class Conv2d:
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, lr=0.01):
        if isinstance(kernel_size, int):
            kernel_size = (kernel_size, kernel_size)
        if isinstance(stride, int):
            stride = (stride, stride)
        if isinstance(padding, int):
            padding = (padding, padding)

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_h, self.kernel_w = kernel_size
        self.stride_h, self.stride_w = stride
        self.padding_h, self.padding_w = padding
        self.lr = lr
        self.W = np.random.randn(out_channels, in_channels, self.kernel_h, self.kernel_w) * 0.01
        self.b = np.zeros(out_channels)
        self.dW = np.zeros_like(self.W)
        self.db = np.zeros_like(self.b)
        self.x = None

    def forward(self, x):
        self.x = x
        n_samples, in_channels, height, width = x.shape
        out_h = (height + 2 * self.padding_h - self.kernel_h) // self.stride_h + 1
        out_w = (width + 2 * self.padding_w - self.kernel_w) // self.stride_w + 1
        a = np.zeros((n_samples, self.out_channels, out_h, out_w))
        x_padded = np.pad(x, [(0, 0), (0, 0), (self.padding_h, self.padding_h), (self.padding_w, self.padding_w)], 'constant')

        for n in range(n_samples):
            for oc in range(self.out_channels):
                for i in range(out_h):
                    for j in range(out_w):
                        h_start = i * self.stride_h
                        h_end = h_start + self.kernel_h
                        w_start = j * self.stride_w
                        w_end = w_start + self.kernel_w
                        a[n, oc, i, j] = np.sum(x_padded[n, :, h_start:h_end, w_start:w_end] * self.W[oc, :, :, :]) + self.b[oc]
        return a

    def backward(self, da):
        n_samples, out_channels, out_h, out_w = da.shape
        in_channels, in_h, in_w = self.x.shape[1:]
        self.dW = np.zeros_like(self.W)
        self.db = np.sum(da, axis=(0, 2, 3))

        x_padded = np.pad(self.x, [(0, 0), (0, 0), (self.padding_h, self.padding_h), (self.padding_w, self.padding_w)], 'constant')
        dx = np.zeros_like(self.x)
        dx_padded = np.pad(dx, [(0, 0), (0, 0), (self.padding_h, self.padding_h), (self.padding_w, self.padding_w)], 'constant')

        for n in range(n_samples):
            for oc in range(out_channels):
                for ic in range(in_channels):
                    for kh in range(self.kernel_h):
                        for kw in range(self.kernel_w):
                            for i in range(out_h):
                                for j in range(out_w):
                                    h_start = i * self.stride_h
                                    h_in = h_start + kh
                                    w_start = j * self.stride_w
                                    w_in = w_start + kw
                                    if (0 <= h_in < in_h + 2 * self.padding_h and
                                        0 <= w_in < in_w + 2 * self.padding_w):
                                        self.dW[oc, ic, kh, kw] += da[n, oc, i, j] * x_padded[n, ic, h_in, w_in]

        for n in range(n_samples):
            for ic in range(in_channels):
                for oc in range(out_channels):
                    for kh in range(self.kernel_h):
                        for kw in range(self.kernel_w):
                            for i in range(out_h):
                                for j in range(out_w):
                                    h_out = i * self.stride_h
                                    h_in = h_out + kh
                                    w_out = j * self.stride_w
                                    w_in = w_out + kw
                                    if (0 <= h_in < in_h + 2 * self.padding_h and
                                        0 <= w_in < in_w + 2 * self.padding_w):
                                        dx_padded[n, ic, h_in, w_in] += da[n, oc, i, j] * self.W[oc, ic, kh, kw]

        if self.padding_h > 0 or self.padding_w > 0:
            dx = dx_padded[:, :, self.padding_h:-self.padding_h, self.padding_w:-self.padding_w]
        else:
            dx = dx_padded

        return dx

    def update(self):
        self.W -= self.lr * self.dW
        self.b -= self.lr * self.db

class ReLU:
    def forward(self, x):
        self.mask = (x <= 0)
        return np.maximum(0, x)

    def backward(self, da):
        da[self.mask] = 0
        return da

class Flatten:
    def forward(self, x):
        self.original_shape = x.shape
        n_samples = x.shape[0]
        return x.reshape(n_samples, -1)

    def backward(self, da):
        return da.reshape(self.original_shape)

class Dense:
    def __init__(self, in_size, out_size, lr=0.01):
        self.W = np.random.randn(in_size, out_size) * 0.01
        self.b = np.zeros(out_size)
        self.dW = np.zeros_like(self.W)
        self.db = np.zeros_like(self.b)
        self.lr = lr
        self.x = None

    def forward(self, x):
        self.x = x
        return np.dot(x, self.W) + self.b

    def backward(self, da):
        self.dW = np.dot(self.x.T, da)
        self.db = np.sum(da, axis=0)
        dx = np.dot(da, self.W.T)
        return dx

    def update(self):
        self.W -= self.lr * self.dW
        self.b -= self.lr * self.db

class SoftmaxWithLoss:
    def __init__(self):
        self.loss = None
        self.y_pred = None
        self.y_true = None

    def forward(self, x, t):
        self.y_true = t
        exp_a = np.exp(x - np.max(x, axis=1, keepdims=True))
        self.y_pred = exp_a / np.sum(exp_a, axis=1, keepdims=True)
        batch_size = self.y_pred.shape[0]
        cross_entropy_loss = -np.sum(t * np.log(self.y_pred + 1e-7)) / batch_size
        self.loss = cross_entropy_loss
        return self.loss

    def backward(self, dout=1):
        batch_size = self.y_true.shape[0]
        dx = (self.y_pred - self.y_true) * dout / batch_size
        return dx

class SimpleCNN:
    def __init__(self, lr=0.01):
        self.lr = lr
        self.conv1 = Conv2d(1, 8, 3, padding=1, lr=lr)
        self.relu1 = ReLU()
        self.flatten = Flatten()
        self.fc1 = Dense(8 * 28 * 28, 10, lr=lr) # No pooling for simplicity in this minimal example
        self.loss_layer = SoftmaxWithLoss()

    def predict(self, x):
        out = self.conv1.forward(x)
        out = self.relu1.forward(out)
        out = self.flatten.forward(out)
        out = self.fc1.forward(out)
        return np.argmax(out, axis=1)

    def forward(self, x, t):
        out = self.conv1.forward(x)
        out = self.relu1.forward(out)
        out = self.flatten.forward(out)
        out = self.fc1.forward(out)
        loss = self.loss_layer.forward(out, t)
        return loss

    def backward(self, dout=1):
        dout = self.loss_layer.backward(dout)
        dout = self.fc1.backward(dout)
        dout = self.flatten.backward(dout)
        dout = self.relu1.backward(dout)
        dout = self.conv1.backward(dout)
        return dout

    def update(self):
        self.conv1.update()
        self.fc1.update()

def train(model, x_train, t_train, epochs=5, batch_size=32):
    n_train = x_train.shape[0]
    for epoch in range(epochs):
        epoch_loss = 0
        for i in tqdm(range(0, n_train, batch_size), desc=f"Epoch {epoch+1}/{epochs}"):
            batch_x = x_train[i:i + batch_size]
            batch_t = t_train[i:i + batch_size]

            loss = model.forward(batch_x, batch_t)
            model.backward()
            model.update()
            epoch_loss += loss

        avg_loss = epoch_loss / (n_train // batch_size)
        print(f"Epoch {epoch+1} Loss: {avg_loss:.4f}")

def evaluate(model, x_test, t_test):
    y_pred = model.predict(x_test)
    y_true = np.argmax(t_test, axis=1)
    accuracy = accuracy_score(y_true, y_pred)
    return accuracy

if __name__ == '__main__':
    # Load a small subset of MNIST for faster training
    x_train, t_train, x_test, t_test = load_mnist(n_train=1000, n_test=500)

    # Initialize the simple CNN model
    model = SimpleCNN(lr=0.01)

    # Train the model
    train(model, x_train, t_train, epochs=5, batch_size=32)

    # Evaluate the model
    accuracy = evaluate(model, x_test, t_test)
    print(f"\nTest Accuracy: {accuracy:.4f}")

Epoch 1/5: 100%|██████████| 32/32 [03:28<00:00,  6.51s/it]


Epoch 1 Loss: 2.3760


Epoch 2/5: 100%|██████████| 32/32 [03:27<00:00,  6.48s/it]


Epoch 2 Loss: 2.3722


Epoch 3/5: 100%|██████████| 32/32 [03:26<00:00,  6.45s/it]


Epoch 3 Loss: 2.3582


Epoch 4/5: 100%|██████████| 32/32 [03:25<00:00,  6.43s/it]


Epoch 4 Loss: 2.2931


Epoch 5/5: 100%|██████████| 32/32 [03:26<00:00,  6.44s/it]


Epoch 5 Loss: 2.0251

Test Accuracy: 0.5620


###[Problem 8] (Advanced assignment) LeNet

In [15]:
import numpy as np
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.datasets import mnist

def load_mnist_lenet(n_train=60000, n_test=10000):
    (x_train, y_train), (x_test, y_test) = mnist.load_data()

    # Normalize and reshape for LeNet input (1 channel)
    x_train = x_train[:n_train].astype('float32') / 255.0
    x_test = x_test[:n_test].astype('float32') / 255.0
    x_train = x_train.reshape(-1, 1, 28, 28)
    x_test = x_test.reshape(-1, 1, 28, 28)

    # One-hot encode labels
    encoder = OneHotEncoder(sparse_output=False)
    y_train_encoded = encoder.fit_transform(y_train[:n_train].reshape(-1, 1))
    y_test_encoded = encoder.transform(y_test[:n_test].reshape(-1, 1))

    return x_train, y_train_encoded, x_test, y_test_encoded

class Conv2d:
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, lr=0.01):
        if isinstance(kernel_size, int):
            kernel_size = (kernel_size, kernel_size)
        if isinstance(stride, int):
            stride = (stride, stride)
        if isinstance(padding, int):
            padding = (padding, padding)

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_h, self.kernel_w = kernel_size
        self.stride_h, self.stride_w = stride
        self.padding_h, self.padding_w = padding
        self.lr = lr
        self.W = np.random.randn(out_channels, in_channels, self.kernel_h, self.kernel_w) * 0.01
        self.b = np.zeros(out_channels)
        self.dW = np.zeros_like(self.W)
        self.db = np.zeros_like(self.b)
        self.x = None

    def forward(self, x):
        self.x = x
        n_samples, in_channels, height, width = x.shape
        out_h = (height + 2 * self.padding_h - self.kernel_h) // self.stride_h + 1
        out_w = (width + 2 * self.padding_w - self.kernel_w) // self.stride_w + 1
        a = np.zeros((n_samples, self.out_channels, out_h, out_w))
        x_padded = np.pad(x, [(0, 0), (0, 0), (self.padding_h, self.padding_h), (self.padding_w, self.padding_w)], 'constant')

        for n in range(n_samples):
            for oc in range(self.out_channels):
                for i in range(out_h):
                    for j in range(out_w):
                        h_start = i * self.stride_h
                        h_end = h_start + self.kernel_h
                        w_start = j * self.stride_w
                        w_end = w_start + self.kernel_w
                        a[n, oc, i, j] = np.sum(x_padded[n, :, h_start:h_end, w_start:w_end] * self.W[oc, :, :, :]) + self.b[oc]
        return a

    def backward(self, da):
        n_samples, out_channels, out_h, out_w = da.shape
        in_channels, in_h, in_w = self.x.shape[1:]
        self.dW = np.zeros_like(self.W)
        self.db = np.sum(da, axis=(0, 2, 3))

        x_padded = np.pad(self.x, [(0, 0), (0, 0), (self.padding_h, self.padding_h), (self.padding_w, self.padding_w)], 'constant')
        dx = np.zeros_like(self.x)
        dx_padded = np.pad(dx, [(0, 0), (0, 0), (self.padding_h, self.padding_h), (self.padding_w, self.padding_w)], 'constant')

        for n in range(n_samples):
            for oc in range(out_channels):
                for ic in range(in_channels):
                    for kh in range(self.kernel_h):
                        for kw in range(self.kernel_w):
                            for i in range(out_h):
                                for j in range(out_w):
                                    h_start = i * self.stride_h
                                    h_in = h_start + kh
                                    w_start = j * self.stride_w
                                    w_in = w_start + kw
                                    if (0 <= h_in < in_h + 2 * self.padding_h and
                                        0 <= w_in < in_w + 2 * self.padding_w):
                                        self.dW[oc, ic, kh, kw] += da[n, oc, i, j] * x_padded[n, ic, h_in, w_in]

        for n in range(n_samples):
            for ic in range(in_channels):
                for oc in range(out_channels):
                    for kh in range(self.kernel_h):
                        for kw in range(self.kernel_w):
                            for i in range(out_h):
                                for j in range(out_w):
                                    h_out = i * self.stride_h
                                    h_in = h_out + kh
                                    w_out = j * self.stride_w
                                    w_in = w_out + kw
                                    if (0 <= h_in < in_h + 2 * self.padding_h and
                                        0 <= w_in < in_w + 2 * self.padding_w):
                                        dx_padded[n, ic, h_in, w_in] += da[n, oc, i, j] * self.W[oc, ic, kh, kw]

        if self.padding_h > 0 or self.padding_w > 0:
            dx = dx_padded[:, :, self.padding_h:-self.padding_h, self.padding_w:-self.padding_w]
        else:
            dx = dx_padded

        return dx

    def update(self):
        self.W -= self.lr * self.dW
        self.b -= self.lr * self.db

class ReLU:
    def forward(self, x):
        self.mask = (x <= 0)
        return np.maximum(0, x)

    def backward(self, da):
        da[self.mask] = 0
        return da

class MaxPool2D:
    def __init__(self, pool_size, stride=None):
        self.pool_h, self.pool_w = pool_size
        if stride is None:
            self.stride_h, self.stride_w = self.pool_h, self.pool_w
        else:
            self.stride_h, self.stride_w = stride
        self.x = None
        self.arg_max = None

    def forward(self, x):
        self.x = x
        n_samples, n_channels, height, width = x.shape
        output_height = (height - self.pool_h) // self.stride_h + 1
        output_width = (width - self.pool_w) // self.stride_w + 1
        a = np.zeros((n_samples, n_channels, output_height, output_width))
        self.arg_max = np.zeros_like(a, dtype=int)

        for i in range(output_height):
            for j in range(output_width):
                h_start = i * self.stride_h
                h_end = h_start + self.pool_h
                w_start = j * self.stride_w
                w_end = w_start + self.pool_w

                pool_region = x[:, :, h_start:h_end, w_start:w_end]
                a[:, :, i, j] = np.max(pool_region, axis=(2, 3))
                arg_max_local = np.argmax(pool_region.reshape(n_samples, n_channels, -1), axis=2)
                self.arg_max[:, :, i, j] = arg_max_local
        return a

    def backward(self, da):
        dx = np.zeros_like(self.x, dtype=np.float64)
        n_samples, n_channels, height, width = self.x.shape
        output_height, output_width = da.shape[2:]

        for i in range(output_height):
            for j in range(output_width):
                h_start = i * self.stride_h
                h_end = h_start + self.pool_h
                w_start = j * self.stride_w
                w_end = w_start + self.pool_w

                mask = np.zeros((n_samples, n_channels, self.pool_h, self.pool_w))
                arg_max_local = self.arg_max[:, :, i, j]
                row_index = arg_max_local // self.pool_w
                col_index = arg_max_local % self.pool_w

                for ns in range(n_samples):
                    for nc in range(n_channels):
                        mask[ns, nc, row_index[ns, nc], col_index[ns, nc]] = 1

                dx[:, :, h_start:h_end, w_start:w_end] += da[:, :, i, j][:, :, np.newaxis, np.newaxis] * mask
        return dx

class Flatten:
    def forward(self, x):
        self.original_shape = x.shape
        n_samples = x.shape[0]
        return x.reshape(n_samples, -1)

    def backward(self, da):
        return da.reshape(self.original_shape)

class Dense:
    def __init__(self, in_size, out_size, lr=0.01):
        self.W = np.random.randn(in_size, out_size) * 0.01
        self.b = np.zeros(out_size)
        self.dW = np.zeros_like(self.W)
        self.db = np.zeros_like(self.b)
        self.lr = lr
        self.x = None

    def forward(self, x):
        self.x = x
        return np.dot(x, self.W) + self.b

    def backward(self, da):
        self.dW = np.dot(self.x.T, da)
        self.db = np.sum(da, axis=0)
        dx = np.dot(da, self.W.T)
        return dx

    def update(self):
        self.W -= self.lr * self.dW
        self.b -= self.lr * self.db

class Softmax:
    def forward(self, x):
        exp_a = np.exp(x - np.max(x, axis=1, keepdims=True))
        self.y = exp_a / np.sum(exp_a, axis=1, keepdims=True)
        return self.y

    def backward(self, dout):
        batch_size = dout.shape[0]
        dx = (self.y - np.argmax(self.y, axis=1)[:, np.newaxis]) * dout / batch_size # Simplified for classification
        return dx

class SoftmaxWithLoss:
    def __init__(self):
        self.loss = None
        self.y_pred = None
        self.y_true = None

    def forward(self, x, t):
        self.y_true = t
        exp_a = np.exp(x - np.max(x, axis=1, keepdims=True))
        self.y_pred = exp_a / np.sum(exp_a, axis=1, keepdims=True)
        batch_size = self.y_pred.shape[0]
        cross_entropy_loss = -np.sum(t * np.log(self.y_pred + 1e-7)) / batch_size
        self.loss = cross_entropy_loss
        return self.loss

    def backward(self, dout=1):
        batch_size = self.y_true.shape[0]
        dx = (self.y_pred - self.y_true) * dout / batch_size
        return dx

class LeNet5:
    def __init__(self, lr=0.01):
        self.lr = lr
        # Layer 1: Convolutional (6 output channels, 5x5 kernel, stride 1)
        self.conv1 = Conv2d(1, 6, 5, stride=1, padding=0, lr=lr)
        self.relu1 = ReLU()
        # Layer 2: Max Pooling (2x2 pool size, stride 2)
        self.pool2 = MaxPool2D(pool_size=(2, 2), stride=(2, 2))
        # Layer 3: Convolutional (16 output channels, 5x5 kernel, stride 1)
        self.conv3 = Conv2d(6, 16, 5, stride=1, padding=0, lr=lr)
        self.relu3 = ReLU()
        # Layer 4: Max Pooling (2x2 pool size, stride 2)
        self.pool4 = MaxPool2D(pool_size=(2, 2), stride=(2, 2))
        # Layer 5: Flatten
        self.flatten5 = Flatten()
        # Layer 6: Fully Connected (120 output nodes)
        self.fc6 = Dense(16 * 5 * 5, 120, lr=lr)
        self.relu6 = ReLU()
        # Layer 7: Fully Connected (84 output nodes)
        self.fc7 = Dense(120, 84, lr=lr)
        self.relu7 = ReLU()
        # Layer 8: Fully Connected (10 output nodes)
        self.fc8 = Dense(84, 10, lr=lr)
        # Layer 9: Softmax with Loss
        self.loss_layer = SoftmaxWithLoss()

    def predict(self, x):
        out = self.conv1.forward(x)
        out = self.relu1.forward(out)
        out = self.pool2.forward(out)
        out = self.conv3.forward(out)
        out = self.relu3.forward(out)
        out = self.pool4.forward(out)
        out = self.flatten5.forward(out)
        out = self.fc6.forward(out)
        out = self.relu6.forward(out)
        out = self.fc7.forward(out)
        out = self.relu7.forward(out)
        out = self.fc8.forward(out)
        return np.argmax(out, axis=1)

    def forward(self, x, t):
        out = self.conv1.forward(x)
        out = self.relu1.forward(out)
        out = self.pool2.forward(out)
        out = self.conv3.forward(out)
        out = self.relu3.forward(out)
        out = self.pool4.forward(out)
        out = self.flatten5.forward(out)
        out = self.fc6.forward(out)
        out = self.relu6.forward(out)
        out = self.fc7.forward(out)
        out = self.relu7.forward(out)
        out = self.fc8.forward(out)
        loss = self.loss_layer.forward(out, t)
        return loss

    def backward(self, dout=1):
        dout = self.loss_layer.backward(dout)
        dout = self.fc8.backward(dout)
        dout = self.relu7.backward(dout)
        dout = self.fc7.backward(dout)
        dout = self.relu6.backward(dout)
        dout = self.fc6.backward(dout)
        dout = self.flatten5.backward(dout)
        dout = self.pool4.backward(dout)
        dout = self.relu3.backward(dout)
        dout = self.conv3

###[Problem 9] (Advanced task) Research into famous image recognition models

Here's what's readily available in Keras (which is a popular high-level API for building neural networks, often used with TensorFlow or other backends):

**Pre-trained models available in Keras**

Keras provides a wide range of pre-trained models that you can directly use for feature extraction or fine-tuning. These include the famous architectures you mentioned and many others:

* **Xception:** A deep convolutional neural network architecture.
* **VGG16 & VGG19:** Very Deep Convolutional Networks with 16 and 19 layers respectively. These were very influential in popularizing deep CNNs.
* **ResNet50, ResNet101, ResNet152, ResNet50V2, ResNet101V2, ResNet152V2:** Deep residual networks that address the vanishing gradient problem and enable training of much deeper networks.
* **InceptionV3, InceptionResNetV2:** Architectures that use inception modules to efficiently capture features at different scales.
* **MobileNetV1, MobileNetV2, MobileNetV3:** Lightweight and efficient CNN architectures designed for mobile and embedded vision applications.
* **DenseNet121, DenseNet169, DenseNet201, DenseNet264:** Densely connected convolutional networks where each layer is connected to all preceding layers.
* **NASNetMobile, NASNetLarge:** Neural Architecture Search Networks that were automatically discovered.
* **EfficientNetB0 to EfficientNetB7:** A family of models that systematically scale network dimensions (depth, width, and resolution) for better efficiency and accuracy.
* **ConvNeXtBase, ConvNeXtSmall, ConvNeXtTiny, ConvNeXtLarge, ConvNeXtXLarge:** Modernized ResNet-like architectures inspired by the Transformer design.

**Key Takeaways:**

* **AlexNet (2012):** While historically significant, AlexNet itself isn't directly available as a pre-built model in the main Keras Applications. However, its architecture and influence are foundational to many of the available models. You might find implementations in the broader TensorFlow ecosystem or in educational examples.
* **VGG16 (2014):** Yes, VGG16 (and its deeper variant VGG19) are readily available in Keras Applications.

**In summary, Keras provides a rich collection of pre-trained CNN architectures, including VGG16 (as you asked), and many more modern and efficient models that have built upon the foundational work of networks like AlexNet.** This allows developers to leverage state-of-the-art features without having to train these large models from scratch.

###[Problem 10] Calculating the output size and number of parameters

To calculate the output size and the number of parameters for each of the convolutional layers you described.

**Key Formulas:**

* **Output Height ($N_{h,out}$):** $\lfloor \frac{N_{h,in} + 2P_h - F_h}{S_h} + 1 \rfloor$
* **Output Width ($N_{w,out}$):** $\lfloor \frac{N_{w,in} + 2P_w - F_w}{S_w} + 1 \rfloor$
* **Number of Parameters:** $(F_h \times F_w \times N_{in\_channels} + 1) \times N_{out\_channels}$
    * $(F_h \times F_w \times N_{in\_channels})$: Number of weights per filter.
    * $+ 1$: Accounts for the bias term for each filter.
    * $\times N_{out\_channels}$: Multiplied by the number of filters (output channels).

**1. Input size: 144×144, 3 channels**
   * Filter size: 3×3, 6 channels
   * Stride: 1
   * Padding: None ($P_h = 0, P_w = 0$)

   * **Output Size:**
      * $N_{h,out} = \lfloor \frac{144 + 2 \times 0 - 3}{1} + 1 \rfloor = \lfloor 141 + 1 \rfloor = 142$
      * $N_{w,out} = \lfloor \frac{144 + 2 \times 0 - 3}{1} + 1 \rfloor = \lfloor 141 + 1 \rfloor = 142$
      * **Output size:** 142×142, 6 channels

   * **Number of Parameters:**
      * $(3 \times 3 \times 3 + 1) \times 6 = (27 + 1) \times 6 = 28 \times 6 = 168$
      * **Number of parameters:** 168

**2. Input size: 60×60, 24 channels**
   * Filter size: 3×3, 48 channels
   * Stride: 1
   * Padding: None ($P_h = 0, P_w = 0$)

   * **Output Size:**
      * $N_{h,out} = \lfloor \frac{60 + 2 \times 0 - 3}{1} + 1 \rfloor = \lfloor 57 + 1 \rfloor = 58$
      * $N_{w,out} = \lfloor \frac{60 + 2 \times 0 - 3}{1} + 1 \rfloor = \lfloor 57 + 1 \rfloor = 58$
      * **Output size:** 58×58, 48 channels

   * **Number of Parameters:**
      * $(3 \times 3 \times 24 + 1) \times 48 = (216 + 1) \times 48 = 217 \times 48 = 10416$
      * **Number of parameters:** 10416

**3. Input size: 20×20, 10 channels**
   * Filter size: 3×3, 20 channels
   * Stride: 2
   * Padding: None ($P_h = 0, P_w = 0$)

   * **Output Size:**
      * $N_{h,out} = \lfloor \frac{20 + 2 \times 0 - 3}{2} + 1 \rfloor = \lfloor \frac{17}{2} + 1 \rfloor = \lfloor 8.5 + 1 \rfloor = \lfloor 9.5 \rfloor = 9$
      * $N_{w,out} = \lfloor \frac{20 + 2 \times 0 - 3}{2} + 1 \rfloor = \lfloor \frac{17}{2} + 1 \rfloor = \lfloor 8.5 + 1 \rfloor = \lfloor 9.5 \rfloor = 9$
      * **Output size:** 9×9, 20 channels

   * **Number of Parameters:**
      * $(3 \times 3 \times 10 + 1) \times 20 = (90 + 1) \times 20 = 91 \times 20 = 1820$
      * **Number of parameters:** 1820

**Summary:**

1.  **Layer 1:**
    * **Output Size:** 142×142, 6 channels
    * **Number of Parameters:** 168
2.  **Layer 2:**
    * **Output Size:** 58×58, 48 channels
    * **Number of Parameters:** 10416
3.  **Layer 3:**
    * **Output Size:** 9×9, 20 channels
    * **Number of Parameters:** 1820

For the third example, as you mentioned, the convolution doesn't perfectly align with the input size and stride. The output size calculation using the floor function (`\lfloor \rfloor`) reflects how a typical framework would handle this by only performing the convolution on the parts of the input where the filter fully fits within the boundaries at the given stride. This results in the edges of the input being effectively "clipped" or not fully processed by the convolution operation in this specific layer configuration.

###[Problem 11] (Advanced) Investigation into filter size

**Why are 3x3 filters more commonly used than larger ones like 7x7?**

While larger filters like 7x7 might seem beneficial for capturing broader spatial relationships in an image in a single layer, several key factors have led to the widespread adoption of smaller 3x3 filters:

1.  **Increased Depth for the Same Receptive Field:** A stack of multiple 3x3 convolutional layers can achieve the same receptive field as a single larger filter but with greater depth. For example, two consecutive 3x3 layers have a receptive field of 5x5, and three consecutive 3x3 layers have a receptive field of 7x7.

2.  **More Non-linearities:** Using multiple 3x3 layers introduces more non-linear activation functions (like ReLU) into the network. More non-linearities allow the network to learn more complex and intricate features compared to a single layer with a large filter. Each ReLU layer adds a point where the network can make a non-linear decision.

3.  **Fewer Parameters:** A stack of smaller filters generally has fewer parameters than a single larger filter with the same receptive field. Consider a receptive field of 7x7 with $C_{in}$ input channels and $C_{out}$ output channels:
    * A single 7x7 filter has $(7 \times 7 \times C_{in} + 1) \times C_{out}$ parameters.
    * Three stacked 3x3 filters (to achieve a 7x7 receptive field) have $3 \times (3 \times 3 \times C_{in} + 1) \times C_{mid} + 3 \times (3 \times 3 \times C_{mid} + 1) \times C_{mid} + (3 \times 3 \times C_{mid} + 1) \times C_{out}$ parameters (if we assume an intermediate number of channels $C_{mid}$). Even with $C_{mid} = C_{in} = C_{out}$, the number of parameters for the 3x3 stack ($3 \times (9C + 1)C$) is less than the 7x7 filter ($49C^2 + C$) for reasonably sized $C$. Fewer parameters help in reducing the risk of overfitting, especially when dealing with limited training data.

4.  **Computational Efficiency:** Fewer parameters often translate to fewer computations during both the forward and backward passes, leading to faster training and inference times.

In essence, using a stack of smaller 3x3 filters allows for building deeper and more expressive networks with better learning capacity and efficiency compared to using a single large filter to achieve the same receptive field.

**The effect of a 1×1 filter with no height or width:**

While a 1x1 filter might seem trivial at first glance, lacking spatial extent within a feature map, it plays a crucial role in modern CNN architectures, primarily for **channel-wise transformations and dimensionality manipulation**. Here's a breakdown of its effects:

1.  **Channel-wise Linear Transformation:** A 1x1 convolution applies a linear combination across the channels of the input feature map at each spatial location (pixel). Each filter in the 1x1 convolutional layer learns a set of weights that determine how to combine the input channels to produce a new output channel.

2.  **Dimensionality Reduction (Bottlenecking):** 1x1 convolutions can be used to reduce the number of channels in a feature map. If a 1x1 convolutional layer has fewer output channels than input channels, it creates a "bottleneck" layer. This can significantly reduce the computational cost of subsequent layers (especially expensive 3x3 or larger convolutions) without drastically sacrificing representational power, as the network can learn the most salient channel-wise combinations to preserve important information.

3.  **Dimensionality Expansion:** Conversely, 1x1 convolutions can also increase the number of channels. This can be useful for preparing feature maps for subsequent layers that require a higher dimensionality.

4.  **Introducing Non-linearities:** When a non-linear activation function (like ReLU) is applied after a 1x1 convolution, it allows for complex channel-wise feature interactions. This adds another layer of abstraction and learning capacity to the network.

5.  **Cross-Channel Feature Pooling:** Although it doesn't pool spatially, a 1x1 convolution can be seen as a form of cross-channel pooling or interaction, where information from different input channels is combined to create new feature representations.

**Common Use Cases:**

* **Inception Modules (GoogLeNet):** 1x1 convolutions are extensively used for bottlenecking before and after larger spatial convolutions to reduce computational cost.
* **ResNet Architectures:** 1x1 convolutions are used in the "bottleneck blocks" to reduce the dimensionality before the main 3x3 convolution and then expand it back afterwards.
* **Network-in-Network (NiN):** This early architecture heavily utilized 1x1 convolutions as "multilayer perceptron convolution layers" to enhance feature abstraction within each spatial location.

In summary, 1x1 filters, despite their lack of spatial filtering capability, are powerful tools for manipulating the channel dimension of feature maps, enabling efficient network designs, dimensionality reduction, expansion, and the introduction of additional non-linearities for richer feature learning. They act as a way to perform complex feature engineering across the channel dimension at each spatial point.