In [2]:
import numpy as np
#【Problem 1】Creating a 2-D convolutional layer

class Conv1d:
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, learning_rate=0.01):
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.lr = learning_rate

        self.W = np.random.randn(out_channels, in_channels, kernel_size) * 0.01
        self.b = np.zeros(out_channels)

    def forward(self, x):
        self.x = x
        batch_size, in_channels, in_width = x.shape
        assert in_channels == self.in_channels

        x_padded = np.pad(x, ((0,0), (0,0), (self.padding, self.padding)), mode='constant')

        out_width = (in_width + 2*self.padding - self.kernel_size) // self.stride + 1
        out = np.zeros((batch_size, self.out_channels, out_width))

        for n in range(batch_size):
            for m in range(self.out_channels):
                for i in range(out_width):
                    region = x_padded[n, :, i*self.stride:i*self.stride+self.kernel_size]
                    out[n, m, i] = np.sum(region * self.W[m]) + self.b[m]

        self.out = out
        return out

    def backward(self, dout):
        batch_size, _, out_width = dout.shape
        dW = np.zeros_like(self.W)
        db = np.zeros_like(self.b)
        dx = np.zeros_like(self.x)

        x_padded = np.pad(self.x, ((0,0), (0,0), (self.padding, self.padding)), mode='constant')
        dx_padded = np.pad(dx, ((0,0), (0,0), (self.padding, self.padding)), mode='constant')

        for n in range(batch_size):
            for m in range(self.out_channels):
                for i in range(out_width):
                    region = x_padded[n, :, i*self.stride:i*self.stride+self.kernel_size]
                    dW[m] += dout[n, m, i] * region
                    db[m] += dout[n, m, i]
                    dx_padded[n, :, i*self.stride:i*self.stride+self.kernel_size] += dout[n, m, i] * self.W[m]

        if self.padding != 0:
            dx = dx_padded[:, :, self.padding:-self.padding]
        else:
            dx = dx_padded

        self.W -= self.lr * dW
        self.b -= self.lr * db

        return dx
class Conv2d:
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, learning_rate=0.01):
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size if isinstance(kernel_size, tuple) else (kernel_size, kernel_size)
        self.stride = stride
        self.padding = padding
        self.lr = learning_rate

        Fh, Fw = self.kernel_size
        self.W = np.random.randn(out_channels, in_channels, Fh, Fw) * 0.01
        self.b = np.zeros(out_channels)

    def forward(self, x):
        self.x = x
        batch_size, in_channels, in_h, in_w = x.shape
        assert in_channels == self.in_channels

        x_padded = np.pad(x, ((0,0), (0,0), (self.padding, self.padding), (self.padding, self.padding)), mode='constant')

        Fh, Fw = self.kernel_size
        out_h = (in_h + 2*self.padding - Fh) // self.stride + 1
        out_w = (in_w + 2*self.padding - Fw) // self.stride + 1
        out = np.zeros((batch_size, self.out_channels, out_h, out_w))

        for n in range(batch_size):
            for m in range(self.out_channels):
                for i in range(out_h):
                    for j in range(out_w):
                        region = x_padded[n, :, i*self.stride:i*self.stride+Fh, j*self.stride:j*self.stride+Fw]
                        out[n, m, i, j] = np.sum(region * self.W[m]) + self.b[m]

        self.out = out
        return out

    def backward(self, dout):
        batch_size, _, out_h, out_w = dout.shape
        Fh, Fw = self.kernel_size
        dW = np.zeros_like(self.W)
        db = np.zeros_like(self.b)
        dx = np.zeros_like(self.x)

        x_padded = np.pad(self.x, ((0,0), (0,0), (self.padding, self.padding), (self.padding, self.padding)), mode='constant')
        dx_padded = np.pad(dx, ((0,0), (0,0), (self.padding, self.padding), (self.padding, self.padding)), mode='constant')

        for n in range(batch_size):
            for m in range(self.out_channels):
                for i in range(out_h):
                    for j in range(out_w):
                        region = x_padded[n, :, i*self.stride:i*self.stride+Fh, j*self.stride:j*self.stride+Fw]
                        dW[m] += dout[n, m, i, j] * region
                        db[m] += dout[n, m, i, j]
                        dx_padded[n, :, i*self.stride:i*self.stride+Fh, j*self.stride:j*self.stride+Fw] += dout[n, m, i, j] * self.W[m]

        if self.padding != 0:
            dx = dx_padded[:, :, self.padding:-self.padding, self.padding:-self.padding]
        else:
            dx = dx_padded

        self.W -= self.lr * dW
        self.b -= self.lr * db

        return dx


In [3]:
#[Problem 2] Experiments with 2D convolutional layers on small arrays
x = np.array([[[[1, 2, 3, 4],
                [5, 6, 7, 8],
                [9, 10, 11, 12],
                [13, 14, 15, 16]]]], dtype=float)

w = np.array([
    [[0.0, 0.0, 0.0],
     [0.0, 1.0, 0.0],
     [0.0, -1.0, 0.0]],
    [[0.0, 0.0, 0.0],
     [0.0, -1.0, 1.0],
     [0.0, 0.0, 0.0]]
], dtype=float)

b = np.zeros(2)

def conv2d_forward(x, w, b):
    batch, in_c, h, w_in = x.shape
    out_c, k_h, k_w = w.shape
    out_h = h - k_h + 1
    out_w = w_in - k_w + 1
    out = np.zeros((batch, out_c, out_h, out_w))
    for n in range(batch):
        for m in range(out_c):
            for i in range(out_h):
                for j in range(out_w):
                    window = x[n, 0, i:i+k_h, j:j+k_w]
                    out[n, m, i, j] = np.sum(window * w[m]) + b[m]
    return out

out = conv2d_forward(x, w, b)
print("Forward output:\n", out[0])

delta = np.array([[[-4, -4], [10, 11]], [[1, -7], [1, -11]]])

def conv2d_backward_input(delta, w, x_shape):
    batch, out_c, out_h, out_w = delta.shape
    _, k_h, k_w = w.shape
    dx = np.zeros(x_shape)
    for n in range(batch):
        for c in range(x_shape[1]):
            for i in range(x_shape[2]):
                for j in range(x_shape[3]):
                    grad_sum = 0
                    for m in range(out_c):
                        for s in range(k_h):
                            for t in range(k_w):
                                i_out = i - s
                                j_out = j - t
                                if 0 <= i_out < out_h and 0 <= j_out < out_w:
                                    grad_sum += delta[n, m, i_out, j_out] * w[m, s, t]
                    dx[n, c, i, j] = grad_sum
    return dx

dx = conv2d_backward_input(delta.reshape(1, 2, 2, 2), w, x.shape)
print("Backward dx:\n", dx[0,0])



Forward output:
 [[[-4. -4.]
  [-4. -4.]]

 [[ 1.  1.]
  [ 1.  1.]]]
Backward dx:
 [[  0.   0.   0.   0.]
 [  0.  -5.   4.  -7.]
 [  0.  13.  27. -11.]
 [  0. -10. -11.   0.]]


In [4]:
#[Problem 3] Output size after 2-dimensional convolution

def conv2d_output_size(Nh_in, Nw_in, Fh, Fw, Ph=0, Pw=0, Sh=1, Sw=1):
    Nh_out = (Nh_in + 2 * Ph - Fh) // Sh + 1
    Nw_out = (Nw_in + 2 * Pw - Fw) // Sw + 1
    return Nh_out, Nw_out


In [5]:
#[Problem 4] Creation of maximum pooling layer
class MaxPool2D:
    def __init__(self, kernel_size=2, stride=2):
        self.kernel_size = kernel_size
        self.stride = stride

    def forward(self, x):
        self.x = x
        batch, channels, h, w = x.shape
        Kh, Kw = self.kernel_size, self.kernel_size
        Sh, Sw = self.stride, self.stride
        Nh_out = (h - Kh) // Sh + 1
        Nw_out = (w - Kw) // Sw + 1
        out = np.zeros((batch, channels, Nh_out, Nw_out))
        self.mask = np.zeros_like(x)

        for n in range(batch):
            for c in range(channels):
                for i in range(Nh_out):
                    for j in range(Nw_out):
                        h_start = i * Sh
                        w_start = j * Sw
                        window = x[n, c, h_start:h_start+Kh, w_start:w_start+Kw]
                        max_val = np.max(window)
                        out[n, c, i, j] = max_val
                        max_pos = (window == max_val)
                        self.mask[n, c, h_start:h_start+Kh, w_start:w_start+Kw] += max_pos
        return out

    def backward(self, dout):
        dx = np.zeros_like(self.x)
        batch, channels, Nh_out, Nw_out = dout.shape
        Kh, Kw = self.kernel_size, self.kernel_size
        Sh, Sw = self.stride, self.stride

        for n in range(batch):
            for c in range(channels):
                for i in range(Nh_out):
                    for j in range(Nw_out):
                        h_start = i * Sh
                        w_start = j * Sw
                        dx[n, c, h_start:h_start+Kh, w_start:w_start+Kw] += dout[n, c, i, j] * self.mask[n, c, h_start:h_start+Kh, w_start:w_start+Kw]
        return dx




def GetMiniBatch(X, y, batch_size=1):
    N = X.shape[0]
    for i in range(0, N, batch_size):
        yield X[i:i+batch_size], y[i:i+batch_size]


class SimpleConv2d:
    def __init__(self, F, C, FH, FW, P=0, S=1):
        self.F, self.C, self.FH, self.FW = F, C, FH, FW
        self.P, self.S = P, S
        limit = 1 / np.sqrt(C * FH * FW)
        self.W = np.random.uniform(-limit, limit, (F, C, FH, FW))
        self.b = np.zeros(F)
    def forward(self, x):
        self.x = x
        N, C, H, W = x.shape
        out_h = (H + 2*self.P - self.FH)//self.S + 1
        out_w = (W + 2*self.P - self.FW)//self.S + 1
        x_pad = np.pad(x, ((0,0),(0,0),(self.P,self.P),(self.P,self.P)), 'constant')
        out = np.zeros((N, self.F, out_h, out_w))
        for n in range(N):
            for f in range(self.F):
                for i in range(out_h):
                    for j in range(out_w):
                        h_start = i*self.S
                        w_start = j*self.S
                        window = x_pad[n, :, h_start:h_start+self.FH, w_start:w_start+self.FW]
                        out[n,f,i,j] = np.sum(window * self.W[f]) + self.b[f]
        self.out = out
        return out
    def backward(self, dout, lr=0.01):
        N, C, H, W = self.x.shape
        _, F, out_h, out_w = dout.shape
        x_pad = np.pad(self.x, ((0,0),(0,0),(self.P,self.P),(self.P,self.P)), 'constant')
        dx_pad = np.zeros_like(x_pad)
        dW = np.zeros_like(self.W)
        db = np.zeros_like(self.b)
        for n in range(N):
            for f in range(F):
                for i in range(out_h):
                    for j in range(out_w):
                        h_start = i*self.S
                        w_start = j*self.S
                        window = x_pad[n, :, h_start:h_start+self.FH, w_start:w_start+self.FW]
                        dW[f] += dout[n,f,i,j] * window
                        db[f] += dout[n,f,i,j]
                        dx_pad[n, :, h_start:h_start+self.FH, w_start:w_start+self.FW] += dout[n,f,i,j] * self.W[f]
        if self.P > 0:
            dx = dx_pad[:, :, self.P:-self.P, self.P:-self.P]
        else:
            dx = dx_pad
        self.W -= lr * dW
        self.b -= lr * db
        return dx



In [6]:
#[Problem 5] (Advance task) Creating average pooling

class AveragePool2D:
    def __init__(self, kernel_size=2, stride=2):
        self.kernel_size = kernel_size
        self.stride = stride

    def forward(self, x):
        self.x = x
        batch, channels, h, w = x.shape
        Kh, Kw = self.kernel_size, self.kernel_size
        Sh, Sw = self.stride, self.stride
        Nh_out = (h - Kh) // Sh + 1
        Nw_out = (w - Kw) // Sw + 1
        out = np.zeros((batch, channels, Nh_out, Nw_out))

        for n in range(batch):
            for c in range(channels):
                for i in range(Nh_out):
                    for j in range(Nw_out):
                        h_start = i * Sh
                        w_start = j * Sw
                        window = x[n, c, h_start:h_start+Kh, w_start:w_start+Kw]
                        out[n, c, i, j] = np.mean(window)
        return out

    def backward(self, dout):
        dx = np.zeros_like(self.x)
        batch, channels, Nh_out, Nw_out = dout.shape
        Kh, Kw = self.kernel_size, self.kernel_size
        Sh, Sw = self.stride, self.stride

        for n in range(batch):
            for c in range(channels):
                for i in range(Nh_out):
                    for j in range(Nw_out):
                        h_start = i * Sh
                        w_start = j * Sw
                        dx[n, c, h_start:h_start+Kh, w_start:w_start+Kw] += dout[n, c, i, j] / (Kh * Kw)
        return dx


In [7]:
#[Problem 6] Smoothing
class Flatten:
    def forward(self, x):
        self.input_shape = x.shape
        return x.reshape(x.shape[0], -1)

    def backward(self, dout):
        return dout.reshape(self.input_shape)
class FC:
    def __init__(self, input_dim, output_dim):
        limit = 1/np.sqrt(input_dim)
        self.W = np.random.uniform(-limit, limit, (input_dim, output_dim))
        self.b = np.zeros(output_dim)
    def forward(self, x):
        self.x = x
        return x @ self.W + self.b
    def backward(self, dout, lr=0.01):
        dW = self.x.T @ dout
        db = np.sum(dout, axis=0)
        dx = dout @ self.W.T
        self.W -= lr * dW
        self.b -= lr * db
        return dx

class ReLU:
    def forward(self, x):
        self.mask = (x>0)
        return x * self.mask
    def backward(self, dout):
        dx = dout.copy()
        dx[self.mask] = 0
        return dx

class Softmax:
    def forward(self, x):
        e = np.exp(x - np.max(x, axis=1, keepdims=True))
        self.out = e / np.sum(e, axis=1, keepdims=True)
        return self.out
    def backward(self, dout):
        return dout



In [8]:
#[Problem 7] Learning and estimation

class Scratch2dCNNClassifier:
    def __init__(self, NN, CNN, n_epoch=5, n_batch=20, verbose=False):
        self.NN = NN
        self.CNN = CNN
        self.n_epoch = n_epoch
        self.n_batch = n_batch
        self.verbose = verbose
        self.log_loss = np.zeros(n_epoch)
        self.log_acc = np.zeros(n_epoch)
        self.flt = Flatten()
        self.softmax = Softmax()

    def loss_function(self, y, yt):
        delta = 1e-7
        return -np.mean(yt * np.log(y + delta))

    def accuracy(self, Z, Y):
        return accuracy_score(Y, Z)

    def fit(self, X, y, X_val=False, y_val=False, lr=0.01):
        for epoch in range(self.n_epoch):
            get_mini_batch = list(GetMiniBatch(X, y, batch_size=self.n_batch))
            self.loss = 0

            for mini_X_train, mini_y_train in get_mini_batch:
                forward_data = mini_X_train[:, np.newaxis, :, :]
                for i in range(len(self.CNN)):
                    forward_data = self.CNN[i].forward(forward_data)

                flt = Flatten()
                forward_data = flt.forward(forward_data)

                for i in range(len(self.NN)):
                    forward_data = self.NN[i].forward(forward_data)

                Z = forward_data
                dout = (Z - mini_y_train) / self.n_batch

                for i in reversed(range(len(self.NN))):
                    layer = self.NN[i]
                    if isinstance(layer, FC) or isinstance(layer, SimpleConv2d):
                        dout = layer.backward(dout, lr)
                    else:
                        dout = layer.backward(dout)

                dout = flt.backward(dout)

                for i in reversed(range(len(self.CNN))):
                    layer = self.CNN[i]
                    if isinstance(layer, FC) or isinstance(layer, SimpleConv2d):
                        dout = layer.backward(dout, lr)
                    else:
                        dout = layer.backward(dout)

                self.loss += self.loss_function(Z, mini_y_train)

                if self.verbose:
                    print()

        if self.verbose:
            pred_train = self.predict(X)
            acc_train = self.accuracy(pred_train, np.argmax(y, axis=1))
            print(f"Epoch {epoch+1}/{self.n_epoch} - Loss: {self.loss / len(get_mini_batch):.6f} - Accuracy: {acc_train:.4f}")

        self.log_loss[epoch] = self.loss / len(get_mini_batch)
        self.log_acc[epoch] = self.accuracy(self.predict(X), np.argmax(y, axis=1))


    def predict(self, X):
        pred_data = X[:, np.newaxis, :, :]
        for i in range(len(self.CNN)):
            pred_data = self.CNN[i].forward(pred_data)
        pred_data = self.flt.forward(pred_data)
        for i in range(len(self.NN)):
            pred_data = self.NN[i].forward(pred_data)
        return np.argmax(pred_data, axis=1)

In [10]:
from sklearn.metrics import accuracy_score
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train = x_train.astype(np.float32) / 255.0
x_test = x_test.astype(np.float32) / 255.0
y_train_one_hot = to_categorical(y_train, 10)
y_test_one_hot = to_categorical(y_test, 10)


NN = {
    0: FC(10*13*13, 200),
    1: ReLU(),
    2: FC(200, 200),
    3: ReLU(),
    4: FC(200, 10)
}

CNN = {
    0: SimpleConv2d(F=10, C=1, FH=3, FW=3, P=0, S=1),
    1: MaxPool2D(kernel_size=2, stride=2)
}


cnn1 = Scratch2dCNNClassifier(NN=NN, CNN=CNN, n_epoch=1, n_batch=64, verbose=True)
cnn1.fit(x_train[:1000], y_train_one_hot[:1000], lr=0.01)

y_pred = cnn1.predict(x_test[:200])
acc = accuracy_score(np.argmax(y_test_one_hot[:200], axis=1), y_pred)
print(f"Accuracy on test set (1000 samples): {acc:.4f}")

  return -np.mean(yt * np.log(y + delta))


















Epoch 1/1 - Loss: nan - Accuracy: 0.1230
Accuracy on test set (1000 samples): 0.0800


In [11]:
#[Problem 8] (Advance assignment) LeNet
class ScratchLeNet:
    def __init__(self):
        # CNN layers
        self.CNN = {
            0: SimpleConv2d(F=6, C=1, FH=5, FW=5, P=0, S=1),  # 28x28 -> 24x24
            1: ReLU(),
            2: MaxPool2D(kernel_size=2, stride=2),             # 24x24 -> 12x12
            3: SimpleConv2d(F=16, C=6, FH=5, FW=5, P=0, S=1), # 12x12 -> 8x8
            4: ReLU(),
            5: MaxPool2D(kernel_size=2, stride=2),             # 8x8 -> 4x4
        }
        self.flatten = Flatten()
        # Fully connected layers
        self.NN = {
            0: FC(16*4*4, 120),
            1: ReLU(),
            2: FC(120, 84),
            3: ReLU(),
            4: FC(84, 10),
            5: Softmax()
        }

        self.n_epoch = 5
        self.n_batch = 64
        self.log_loss = np.zeros(self.n_epoch)
        self.log_acc = np.zeros(self.n_epoch)

    def loss_function(self, y, yt):
        delta = 1e-7
        return -np.mean(yt * np.log(y + delta))

    def accuracy(self, Z, Y):
        return accuracy_score(Y, Z)

    def fit(self, X, y, lr=0.01):
        for epoch in range(self.n_epoch):
            get_mini_batch = GetMiniBatch(X, y, batch_size=self.n_batch)
            loss_total = 0
            for mini_X, mini_y in get_mini_batch:
                # Forward pass CNN
                forward_data = mini_X[:, np.newaxis, :, :]  # add channel dim
                for i in range(len(self.CNN)):
                    forward_data = self.CNN[i].forward(forward_data)

                # Flatten
                forward_data = self.flatten.forward(forward_data)

                # Forward pass FC
                for i in range(len(self.NN)):
                    forward_data = self.NN[i].forward(forward_data)

                Z = forward_data

                # Backward
                dout = (Z - mini_y) / self.n_batch

                for i in reversed(range(len(self.NN))):
                    layer = self.NN[i]
                    if isinstance(layer, FC):
                        dout = layer.backward(dout, lr)
                    else:
                        dout = layer.backward(dout)

                dout = self.flatten.backward(dout)

                for i in reversed(range(len(self.CNN))):
                    layer = self.CNN[i]
                    if isinstance(layer, SimpleConv2d):
                        dout = layer.backward(dout, lr)
                    else:
                        dout = layer.backward(dout)

                loss_total += self.loss_function(Z, mini_y)

            self.log_loss[epoch] = loss_total / len(get_mini_batch)
            pred = self.predict(X)
            self.log_acc[epoch] = self.accuracy(pred, np.argmax(y, axis=1))
            print(f"Epoch {epoch+1}: Loss {self.log_loss[epoch]:.4f}, Accuracy {self.log_acc[epoch]:.4f}")

    def predict(self, X):
        forward_data = X[:, np.newaxis, :, :]
        for i in range(len(self.CNN)):
            forward_data = self.CNN[i].forward(forward_data)
        forward_data = self.flatten.forward(forward_data)
        for i in range(len(self.NN)):
            forward_data = self.NN[i].forward(forward_data)
        return np.argmax(forward_data, axis=1)


#[Problem 9] (Advance assignment) Survey of famous image recognition models
###AlexNet (2012)

AlexNet was developed by Alex Krizhevsky, Ilya Sutskever, and Geoffrey Hinton and is often credited with launching the deep learning revolution in computer vision. It won the ImageNet Large Scale Visual Recognition Challenge (ILSVRC) in 2012 by a significant margin. The architecture consists of 8 layers, including 5 convolutional layers followed by 3 fully connected layers. It introduced the use of ReLU activation functions, which helped speed up training compared to previous sigmoid or tanh activations. AlexNet uses max-pooling layers for downsampling and incorporates dropout in the fully connected layers to reduce overfitting. The model also employed data augmentation and image preprocessing techniques to improve generalization. Because of its size and computational demands, it was trained on two GPUs. AlexNet demonstrated the effectiveness of deep convolutional neural networks trained on large datasets with GPUs, setting a new standard for image classification models.

###VGG16 (2014)

VGG16 was created by Karen Simonyan and Andrew Zisserman from the Visual Geometry Group at Oxford University. It achieved second place in the ILSVRC 2014 competition but is known for its simple and uniform architecture that emphasizes the importance of network depth. The model is 16 layers deep, composed of 13 convolutional layers and 3 fully connected layers. VGG16 uses very small 3x3 convolutional filters with stride 1 and padding 1. Multiple convolutional layers are stacked before each max-pooling layer, which increases the network’s expressiveness and non-linearity. The model uses ReLU activations throughout and ends with fully connected layers for classification. VGG16 demonstrated that very deep networks with small filters could significantly improve performance. Its simple and modular design makes it easy to implement and modify, and it is commonly used as a baseline or feature extractor in transfer learning.

###Other Notable CNN Architectures (brief mentions)
LeNet (1998): One of the earliest CNNs, designed for digit recognition (MNIST). Simpler and shallower than later models.

GoogLeNet / Inception (2014): Introduced the Inception module with multiple filter sizes in parallel, deeper and more computationally efficient.

ResNet (2015): Introduced residual connections to train very deep networks (50, 101, 152 layers) by addressing vanishing gradients.

DenseNet, MobileNet, EfficientNet: More recent architectures focusing on parameter efficiency, speed, and accuracy trade-offs.

In [12]:
#[Problem 10] Calculation of output size and number of parameters
def conv2d_output_size(H_in, W_in, Fh, Fw, stride=1, pad=0):
    H_out = (H_in - Fh + 2*pad)//stride + 1
    W_out = (W_in - Fw + 2*pad)//stride + 1
    return H_out, W_out

def conv2d_params(Fh, Fw, C_in, C_out, bias=True):
    params = Fh * Fw * C_in * C_out
    if bias:
        params += C_out
    return params

layers = [
    {"H_in":144, "W_in":144, "C_in":3, "Fh":3, "Fw":3, "C_out":6, "stride":1, "pad":0},
    {"H_in":60,  "W_in":60,  "C_in":24,"Fh":3, "Fw":3, "C_out":48,"stride":1, "pad":0},
    {"H_in":20,  "W_in":20,  "C_in":10,"Fh":3, "Fw":3, "C_out":20,"stride":2, "pad":0}
]

for i, l in enumerate(layers, 1):
    H_out, W_out = conv2d_output_size(l["H_in"], l["W_in"], l["Fh"], l["Fw"], l["stride"], l["pad"])
    params = conv2d_params(l["Fh"], l["Fw"], l["C_in"], l["C_out"])
    print(f"Layer {i}: Output size = {H_out}x{W_out}x{l['C_out']}, Params = {params}")


Layer 1: Output size = 142x142x6, Params = 168
Layer 2: Output size = 58x58x48, Params = 10416
Layer 3: Output size = 9x9x20, Params = 1820


#[Problem 11] (Advance assignment) Survey on filter size
####3×3 convolution filters

Preferred over fully connected networks because they reduce computational costs and allow weight sharing, resulting in fewer weights for backpropagation.

Can be seen as a series of one-dimensional convolutions, making them cost-efficient.

Work well in general and are often the popular choice for convolution layers.

####1×1 convolution filters

Proposed in the Network-in-Network paper and widely used in Google Inception.

Advantages include:

Dimensionality reduction for efficient computation

Low-dimensional embedding or feature pooling

Adding nonlinearity again after convolution