In [1]:
import numpy as np

## Komponen Utama ResNet18 Native yang Dibuat

1. **Conv2D** ✅ 
   - Konvolusi 2D dengan parameter kernel, stride, padding  
   - Forward & Backward propagation  
   - Mendukung input multi-channel (RGB)

2. **BatchNorm2D** ✅ 
   - Normalisasi batch untuk stabilisasi dan percepatan training  
   - Forward & Backward propagation  

3. **ReLU (Rectified Linear Unit)** ✅ 
   - Fungsi aktivasi non-linear $ \max(0, x) $  
   - Forward & Backward propagation  

4. **MaxPool2D** ✅ 
   - Pooling maksimal dengan kernel dan stride tertentu  
   - Mengurangi dimensi spasial (height & width)  
   - Forward & Backward propagation  

5. **GlobalAveragePooling2D (GAP)** ✅ 
   - Pooling rata-rata global untuk mereduksi spatial dimension ke 1 nilai per channel  
   - Forward & Backward propagation  

6. **ResidualBlock (BasicBlock)** ✅ 
   - Blok residual dengan dua layer Conv2D + BatchNorm + ReLU  
   - Skip connection (identity atau projection)  
   - Forward & Backward propagation  

7. **Dense (Fully Connected Layer)** ✅ 
   - Layer linear terakhir untuk klasifikasi  
   - Forward & Backward propagation  

8. **Dropout** ✅ 
   - Regularisasi dengan menonaktifkan neuron secara acak saat training  
   - Forward & Backward propagation  

9. **Loss Function: SoftmaxCrossEntropyLoss** ✅ 
   - Fungsi loss klasifikasi multi-kelas  
   - Forward & Backward propagation  

10. **Optimizer (contoh: Adam, SGD)** ✅ 
    - Update parameter berdasarkan gradien dan algoritma optimasi  
    - Mempunyai persamaan matematis terkait update bobot
    
--- 

### Conv2D - Persamaan Matematis

Untuk setiap filter $ f $ dan posisi $ (i, j) $, output dihitung dengan:

$$
y_{i,j,f} = \sum_{c=1}^{C_{in}} \sum_{m=1}^{k} \sum_{n=1}^{k} W_{f,c,m,n} \cdot x_{i+m, j+n, c} + b_f
$$

Keterangan:

- $C_{in}$: jumlah channel input (contoh: 3 untuk RGB)  
- $W$: bobot filter (kernel) bentuk $(F, C_{in}, k, k)$  
- $b_f$: bias untuk filter ke-$f$  
- $x$: input citra dengan ukuran height $\times$ width  

#### 🔁 Backward: Gradien terhadap parameter dan input

- Gradien terhadap bobot $ W $:

$$
\frac{\partial L}{\partial W_{f,c,m,n}} = \sum_{i,j,b} \frac{\partial L}{\partial y_{i,j,f}} \cdot x_{i+m,j+n,c}
$$

- Gradien terhadap input $ x $:

$$
\frac{\partial L}{\partial x_{i,j,c}} = \sum_{f} \sum_{m,n} \frac{\partial L}{\partial y_{i-m,j-n,f}} \cdot W_{f,c,m,n}
$$


In [4]:
import numpy as np

class Conv2D:
    """2-D convolution over channels-last (NHWC) inputs, written with NumPy only.

    Weights are stored as (out_channels, in_channels, kernel_size, kernel_size);
    activations use (batch, height, width, channels).
    """

    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding=1):
        """
        in_channels: number of input channels (e.g. 3 for RGB)
        out_channels: number of filters
        kernel_size: side length of the square kernel
        stride: step of the sliding window
        padding: zero padding added on every side of the spatial dimensions
        """
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding

        # He initialisation: std = sqrt(2 / fan_in), fan_in = in_channels * k * k.
        fan_in = in_channels * kernel_size * kernel_size
        self.W = np.random.randn(out_channels, in_channels, kernel_size, kernel_size) * np.sqrt(2. / fan_in)
        self.b = np.zeros(out_channels)

        # Gradient buffers; backward() overwrites them in place so references stay valid.
        self.dW = np.zeros_like(self.W)
        self.db = np.zeros_like(self.b)

    def _kernel_hwcf(self):
        """Return the weights as a (k, k, in_channels, out_channels) view that lines up with NHWC windows."""
        return self.W.transpose(2, 3, 1, 0)

    def forward(self, x):
        """
        x: input of shape (batch_size, height, width, in_channels)
        returns: array of shape (batch_size, new_height, new_width, out_channels)
        raises: ValueError if the last axis of x is not in_channels
        """
        if x.shape[-1] != self.in_channels:
            raise ValueError(
                f"expected {self.in_channels} input channels on the last axis, got {x.shape[-1]}"
            )
        self.x = x
        batch_size, h_in, w_in, _ = x.shape
        k = self.kernel_size
        s = self.stride
        p = self.padding

        # Zero-pad only the two spatial axes.
        x_padded = np.pad(x, ((0, 0), (p, p), (p, p), (0, 0)), mode='constant')
        self.x_padded = x_padded

        h_out = (h_in + 2*p - k) // s + 1
        w_out = (w_in + 2*p - k) // s + 1
        out = np.zeros((batch_size, h_out, w_out, self.out_channels))

        kernel = self._kernel_hwcf()
        for i in range(h_out):
            h_start = i * s
            for j in range(w_out):
                w_start = j * s
                # (B, k, k, C) window contracted with (k, k, C, F) kernel -> (B, F)
                region = x_padded[:, h_start:h_start+k, w_start:w_start+k, :]
                out[:, i, j, :] = np.tensordot(region, kernel, axes=3) + self.b
        return out

    def backward(self, grad_output, lr=None):
        """
        grad_output: gradient from the next layer, shape (batch, h_out, w_out, out_channels)
        lr: optional learning rate. When given, W and b are updated in place with one
            plain gradient step; when None, gradients are only stored in dW / db.
        returns: gradient with respect to the input, same shape as the forward input
        """
        _, h_out, w_out, _ = grad_output.shape
        k = self.kernel_size
        s = self.stride
        p = self.padding

        kernel = self._kernel_hwcf()
        dkernel = np.zeros(kernel.shape)
        dx_padded = np.zeros(self.x_padded.shape)

        for i in range(h_out):
            h_start = i * s
            for j in range(w_out):
                w_start = j * s
                region = self.x_padded[:, h_start:h_start+k, w_start:w_start+k, :]
                g = grad_output[:, i, j, :]  # (B, F)

                # Sum over the batch: (B, k, k, C) x (B, F) -> (k, k, C, F)
                dkernel += np.tensordot(region, g, axes=([0], [0]))
                # Sum over the filters: (B, F) x (k, k, C, F) -> (B, k, k, C)
                dx_padded[:, h_start:h_start+k, w_start:w_start+k, :] += np.tensordot(g, kernel, axes=([1], [3]))

        # Back to the (F, C, k, k) storage layout, written in place.
        self.dW[...] = dkernel.transpose(3, 2, 0, 1)
        self.db[...] = grad_output.sum(axis=(0, 1, 2))

        # The input gradient above used the pre-update weights; update only afterwards.
        if lr is not None:
            self.W -= lr * self.dW
            self.b -= lr * self.db

        # Remove padding
        if p > 0:
            dx = dx_padded[:, p:-p, p:-p, :]
        else:
            dx = dx_padded
        return dx


### BatchNorm2D - Persamaan Matematis


Mean dan variansi:

$$
\mu_c = \frac{1}{N} \sum_{i=1}^{N} x_{i,c}, \quad
\sigma_c^2 = \frac{1}{N} \sum_{i=1}^{N} (x_{i,c} - \mu_c)^2
$$

Normalisasi:

$$
\hat{x}_{i,c} = \frac{x_{i,c} - \mu_c}{\sqrt{\sigma_c^2 + \epsilon}}
$$

Skalasi dan translasi:

$$
y_{i,c} = \gamma_c \hat{x}_{i,c} + \beta_c
$$

#### 🔁 Backward:

- Gradien terhadap skala dan bias:

$$
\frac{\partial L}{\partial \gamma_c} = \sum_i \frac{\partial L}{\partial y_{i,c}} \cdot \hat{x}_{i,c}
\quad ; \quad
\frac{\partial L}{\partial \beta_c} = \sum_i \frac{\partial L}{\partial y_{i,c}}
$$


In [6]:
class BatchNorm2D:
    """Per-channel batch normalisation for channels-last (NHWC) activations."""

    def __init__(self, num_features, momentum=0.9, epsilon=1e-5):
        """
        num_features: number of feature channels (the size of the last input axis)
        momentum: weight of the previous value in the running mean / variance
        epsilon: small constant added to the variance before the square root
        """
        self.num_features = num_features
        self.momentum = momentum
        self.epsilon = epsilon

        # Trainable scale and shift
        self.gamma = np.ones(num_features)
        self.beta = np.zeros(num_features)

        # Gradient buffers, overwritten in place by backward()
        self.dgamma = np.zeros_like(self.gamma)
        self.dbeta = np.zeros_like(self.beta)

        # Running statistics used when training=False
        self.running_mean = np.zeros(num_features)
        self.running_var = np.ones(num_features)

        # Remembers which statistics the last forward() normalised with.
        self._batch_mode = True

    def forward(self, x, training=True):
        """
        x: input of shape (batch, height, width, channels)
        training: True -> normalise with batch statistics and update the running ones;
                  False -> normalise with the stored running statistics
        returns: array with the same shape as x
        """
        self.x = x
        self._batch_mode = bool(training)
        if training:
            self.mean = x.mean(axis=(0, 1, 2))
            self.var = x.var(axis=(0, 1, 2))

            self.running_mean = self.momentum * self.running_mean + (1 - self.momentum) * self.mean
            self.running_var = self.momentum * self.running_var + (1 - self.momentum) * self.var

            mean, var = self.mean, self.var
        else:
            mean, var = self.running_mean, self.running_var

        self.inv_std = 1.0 / np.sqrt(var + self.epsilon)
        self.x_norm = (x - mean) * self.inv_std

        return self.gamma * self.x_norm + self.beta

    def backward(self, grad_output, lr=None):
        """
        grad_output: gradient from the next layer, shape (batch, height, width, channels)
        lr: optional learning rate. When given, gamma and beta are updated in place;
            when None, gradients are only stored in dgamma / dbeta.
        returns: gradient with respect to the input of the last forward() call
        """
        N, H, W, C = grad_output.shape
        m = N * H * W  # number of values averaged per channel

        dx_norm = grad_output * self.gamma
        if self._batch_mode:
            # Closed form of the gradient through mean and variance:
            # dx = inv_std / m * (m * dxn - sum(dxn) - x_norm * sum(dxn * x_norm))
            sum_dxn = dx_norm.sum(axis=(0, 1, 2))
            sum_dxn_xn = (dx_norm * self.x_norm).sum(axis=(0, 1, 2))
            dx = (self.inv_std / m) * (m * dx_norm - sum_dxn - self.x_norm * sum_dxn_xn)
        else:
            # Running statistics are constants, so the layer is a per-channel affine map.
            dx = dx_norm * self.inv_std

        self.dgamma[...] = np.sum(grad_output * self.x_norm, axis=(0, 1, 2))
        self.dbeta[...] = np.sum(grad_output, axis=(0, 1, 2))

        if lr is not None:
            self.gamma -= lr * self.dgamma
            self.beta -= lr * self.dbeta

        return dx


### ReLU (Fungsi Aktivasi) - Persamaan Matematis

Fungsi aktivasi:

$$
y = \max(0, x)
$$

#### 🔁 Backward:
Turunan (backpropagation):

$$
\frac{dy}{dx} =
\begin{cases}
1, & \text{jika } x > 0 \\
0, & \text{jika } x \leq 0
\end{cases}
$$


In [8]:
class ReLU:
    """Element-wise rectifier that remembers which inputs were positive."""

    def __init__(self):
        # Boolean array marking the strictly positive inputs of the last forward pass.
        self.mask = None

    def forward(self, x):
        """Zero out the non-positive entries of x and cache the positivity mask."""
        positive = np.greater(x, 0)
        self.mask = positive
        return np.multiply(x, positive)

    def backward(self, grad_output):
        """Let the incoming gradient through only where the forward input was positive."""
        return np.multiply(grad_output, self.mask)


### MaxPooling 2D - Persamaan Matematis

Misalkan input feature map $ x $ dengan ukuran $(H \times W)$, kernel pooling berukuran $ k \times k $, dan stride $ s $.

Output feature map $ y $ dengan ukuran $(H_{out} \times W_{out})$ dihitung dengan:

$$
y(i,j) = \max_{(m,n) \in R(i,j)} x(m,n)
$$

di mana

$$
R(i,j) = \{ (m,n) \mid m = s \cdot i + a, \quad n = s \cdot j + b, \quad 0 \leq a,b < k \}
$$

dan ukuran output adalah

$$
H_{out} = \left\lfloor \frac{H - k}{s} \right\rfloor + 1, \quad W_{out} = \left\lfloor \frac{W - k}{s} \right\rfloor + 1
$$

#### 🔁 Backward:
Gradien terhadap input $ x $ hanya diteruskan ke posisi maksimal dalam setiap area pooling, yaitu

$$
\frac{\partial L}{\partial x(m,n)} = 
\begin{cases}
\frac{\partial L}{\partial y(i,j)}, & \text{jika } (m,n) = \arg\max_{(a,b) \in R(i,j)} x(a,b) \\
0, & \text{lainnya}
\end{cases}
$$


In [10]:
class MaxPool2D:
    """Max pooling over the two trailing spatial axes of a (batch, channels, height, width) array."""

    def __init__(self, kernel_size=2, stride=2, padding=0):
        """
        kernel_size: side length of the square pooling window
        stride: step between consecutive windows
        padding: number of cells added on every side of height and width; padded
                 cells hold -inf so they can never be selected as the maximum
        """
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding

    def forward(self, x):
        """
        x: numpy array with shape (batch_size, channels, height, width)
        returns: array with shape (batch_size, channels, out_height, out_width)
        """
        self.input = x
        batch_size, channels, height, width = x.shape
        k = self.kernel_size
        s = self.stride
        p = self.padding

        if p > 0:
            # -inf needs a floating dtype.
            if not np.issubdtype(x.dtype, np.floating):
                x = x.astype(float)
            x = np.pad(x, ((0, 0), (0, 0), (p, p), (p, p)), mode='constant', constant_values=-np.inf)

        out_height = (height + 2*p - k) // s + 1
        out_width = (width + 2*p - k) // s + 1
        self.output = np.zeros((batch_size, channels, out_height, out_width))
        # Flat (row-major) index of the maximum inside each k x k window.
        self.argmax = np.zeros(self.output.shape, dtype=int)

        for i in range(out_height):
            h_start = i * s
            for j in range(out_width):
                w_start = j * s
                window = x[:, :, h_start:h_start+k, w_start:w_start+k].reshape(batch_size, channels, -1)
                self.argmax[:, :, i, j] = window.argmax(axis=2)
                self.output[:, :, i, j] = window.max(axis=2)
        return self.output

    def backward(self, grad_output):
        """
        grad_output: numpy array with the same shape as self.output
        returns: gradient with the shape of the forward input; each output gradient
                 is routed to the position that held the window maximum
        """
        batch_size, channels, height, width = self.input.shape
        k = self.kernel_size
        s = self.stride
        p = self.padding

        grad_output = np.asarray(grad_output)
        grad_padded = np.zeros((batch_size, channels, height + 2*p, width + 2*p), dtype=float)

        out_height, out_width = grad_output.shape[2], grad_output.shape[3]
        b_idx = np.arange(batch_size)[:, None]
        c_idx = np.arange(channels)[None, :]

        for i in range(out_height):
            for j in range(out_width):
                # Turn the flat window index back into (row, col) offsets.
                rows, cols = np.divmod(self.argmax[:, :, i, j], k)
                # Each (batch, channel) pair is hit once per window, so += accumulates safely.
                grad_padded[b_idx, c_idx, i * s + rows, j * s + cols] += grad_output[:, :, i, j]

        if p > 0:
            return grad_padded[:, :, p:-p, p:-p]
        return grad_padded


### Global Average Pooling (GAP)

Diberikan input $ x \in \mathbb{R}^{B \times C \times H \times W} $, maka output GAP:

$$
y_{b,c} = \frac{1}{H \cdot W} \sum_{i=1}^{H} \sum_{j=1}^{W} x_{b,c,i,j}
$$

Hasil: Tensor ukuran $ B \times C $


In [12]:
class GlobalAveragePooling2D:
    """Averages each channel of a (B, C, H, W) array over its spatial axes."""

    def forward(self, x):
        """Cache x and return the per-channel spatial mean, shape (B, C)."""
        self.input = x
        spatial_axes = (2, 3)
        return np.mean(x, axis=spatial_axes)

    def backward(self, grad_output):
        """Spread every (B, C) gradient evenly over the H * W positions it was averaged from."""
        B, C, H, W = self.input.shape
        spread = grad_output[:, :, None, None] * np.ones((B, C, H, W))
        cells = H * W
        return spread / cells


### Dense Layer / Fully Connected

Output:

$$
y = xW + b
$$

Dengan:
- $ x \in \mathbb{R}^{B \times D} $ 
- $ W \in \mathbb{R}^{D \times K} $, $ b \in \mathbb{R}^{K} $
- $ y \in \mathbb{R}^{B \times K} $, jumlah kelas = $ K $


In [14]:
class Dense:
    """Fully connected layer computing ``x @ W + b``."""

    def __init__(self, input_dim, output_dim):
        # He-style scaling of the random weights: std = sqrt(2 / input_dim).
        he_std = np.sqrt(2. / input_dim)
        self.W = np.random.randn(input_dim, output_dim) * he_std
        self.b = np.zeros(output_dim)

    def forward(self, x):
        """Cache the (B, D) input and return the (B, K) affine output."""
        self.input = x
        projected = np.matmul(x, self.W)
        return projected + self.b

    def backward(self, grad_output):
        """Store grad_W (D, K) and grad_b (K,) and return the (B, D) input gradient."""
        self.grad_W = np.matmul(self.input.T, grad_output)
        self.grad_b = np.sum(grad_output, axis=0)
        return np.matmul(grad_output, self.W.T)


### Residual Block (ResNet BasicBlock)

Cabang utama (residual) dari blok menghitung:

$$
F(x) = \text{BN}_2(\text{Conv}_2(\text{ReLU}(\text{BN}_1(\text{Conv}_1(x)))))
$$

Lalu hasil akhir diperoleh dengan menjumlahkan shortcut dan menerapkan ReLU **setelah** penjumlahan:

$$
y = \text{ReLU}(F(x) + x)
$$

Jika terjadi perubahan dimensi (misal karena stride > 1 atau jumlah channel berubah), maka shortcut diubah:

$$
x' = \text{BN}_{\text{proj}}(\text{Conv}_{1 \times 1}(x))
$$

Sehingga:

$$
y = \text{ReLU}(F(x) + x')
$$


In [16]:
class ResidualBlock:
    """ResNet BasicBlock on channels-last activations: two 3x3 conv + BN stages plus a skip connection.

    Computes ``ReLU(BN2(Conv2(ReLU(BN1(Conv1(x))))) + shortcut(x))`` where the
    shortcut is the identity or a 1x1 conv + BN projection.
    """

    def __init__(self, in_channels, out_channels, stride=1, use_projection=None):
        """
        in_channels / out_channels: channel counts before and after the block
        stride: stride of the first convolution (and of the projection shortcut)
        use_projection: None (default) enables the projection only when the stride or
            the channel count changes; True forces it. A projection that is required
            by the shapes is always created, even if False is passed.
        """
        self.conv1 = Conv2D(in_channels, out_channels, kernel_size=3, stride=stride, padding=1)
        self.bn1 = BatchNorm2D(out_channels)
        self.relu = ReLU()       # activation after bn1
        self.relu_out = ReLU()   # activation after the addition; needs its own mask

        self.conv2 = Conv2D(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
        self.bn2 = BatchNorm2D(out_channels)

        shape_changes = (stride != 1 or in_channels != out_channels)
        self.use_projection = shape_changes or bool(use_projection)
        if self.use_projection:
            # padding=0: a 1x1 kernel with the default padding of 1 would produce a
            # different spatial size than the 3x3 / padding=1 main path.
            self.shortcut_conv = Conv2D(in_channels, out_channels, kernel_size=1, stride=stride, padding=0)
            self.shortcut_bn = BatchNorm2D(out_channels)

    def forward(self, x, training=True):
        """
        x: input of shape (batch, height, width, in_channels)
        training: forwarded to every BatchNorm2D layer of the block
        returns: activations of shape (batch, new_height, new_width, out_channels)
        """
        out = self.conv1.forward(x)
        out = self.bn1.forward(out, training=training)
        out = self.relu.forward(out)

        out = self.conv2.forward(out)
        out = self.bn2.forward(out, training=training)

        if self.use_projection:
            shortcut = self.shortcut_conv.forward(x)
            shortcut = self.shortcut_bn.forward(shortcut, training=training)
        else:
            shortcut = x  # identity: shapes already match

        return self.relu_out.forward(out + shortcut)

    def backward(self, grad_output, lr=0.0):
        """
        grad_output: gradient with respect to the block output
        lr: learning rate handed to every Conv2D / BatchNorm2D backward call. The
            default 0.0 leaves the parameters unchanged, so the layers only store
            their gradients and the update can be applied elsewhere.
        returns: gradient with respect to the block input
        """
        # Gradient at the addition node; it feeds both branches unchanged.
        grad_sum = self.relu_out.backward(grad_output)

        # Main path, in reverse order of forward()
        grad_main = self.bn2.backward(grad_sum, lr)
        grad_main = self.conv2.backward(grad_main, lr)
        grad_main = self.relu.backward(grad_main)
        grad_main = self.bn1.backward(grad_main, lr)
        grad_main = self.conv1.backward(grad_main, lr)

        # Shortcut path
        if self.use_projection:
            grad_shortcut = self.shortcut_bn.backward(grad_sum, lr)
            grad_shortcut = self.shortcut_conv.backward(grad_shortcut, lr)
        else:
            grad_shortcut = grad_sum

        return grad_main + grad_shortcut


### Softmax

Diberikan output logits $ z \in \mathbb{R}^K $ maka softmax-nya:

$$
\text{softmax}(z_i) = \frac{e^{z_i}}{\sum_{j=1}^K e^{z_j}}
$$




### Cross-Entropy Loss

Untuk ground truth one-hot $ y $, dan prediksi softmax $ \hat{y} $:

$$
\mathcal{L} = -\sum_{i=1}^{K} y_i \log(\hat{y}_i)
$$

In [19]:
class SoftmaxCrossEntropyLoss:
    """Softmax followed by mean cross-entropy against one-hot labels."""

    def forward(self, logits, labels):
        """Return the batch-averaged loss for (B, K) logits and (B, K) one-hot labels."""
        self.logits, self.labels = logits, labels

        # Subtracting the row maximum keeps exp() from overflowing.
        row_max = np.max(logits, axis=1, keepdims=True)
        exp_shifted = np.exp(logits - row_max)
        normaliser = np.sum(exp_shifted, axis=1, keepdims=True)
        self.probs = exp_shifted / normaliser

        # The 1e-12 offset guards log() against a probability of exactly zero.
        log_probs = np.log(self.probs + 1e-12)
        n_samples = logits.shape[0]
        return -np.sum(labels * log_probs) / n_samples

    def backward(self):
        """Return d(loss)/d(logits): (probs - labels) divided by the batch size."""
        n_samples = self.labels.shape[0]
        return (self.probs - self.labels) / n_samples


# 🧠 Dropout

### 🔹 Forward Pass saat Training

Diberikan input $ x \in \mathbb{R}^n $, kita sampling mask $ r \in \{0,1\}^n $ dari distribusi Bernoulli:

$$
r_i \sim \text{Bernoulli}(1 - p)
$$

Kemudian output $ y $ dihitung sebagai:

$$
y_i = \frac{x_i \cdot r_i}{1 - p}
$$

> Di sini $ p $ adalah probabilitas dropout (misal: 0.5).

### 🔹 Forward Pass saat Inferensi

$$
y_i = x_i
$$

Tidak ada dropout saat inferensi — semua neuron aktif.

#### 🔁 Backward:
Gradien terhadap input saat backpropagation:

$$
\frac{\partial L}{\partial x_i} = \frac{\partial L}{\partial y_i} \cdot \frac{r_i}{1 - p}
$$


In [21]:
class Dropout:
    """Inverted dropout: zeroes random activations while training and rescales the rest."""

    def __init__(self, dropout_rate=0.5):
        """
        dropout_rate: probability of dropping an activation, in [0, 1)
        raises: ValueError for a rate outside [0, 1); a rate of 1 would divide by zero
        """
        if not 0.0 <= dropout_rate < 1.0:
            raise ValueError(f"dropout_rate must be in [0, 1), got {dropout_rate}")
        self.dropout_rate = dropout_rate
        self.mask = None
        self.training = True  # Default mode

    def forward(self, x):
        """Apply a fresh random mask when self.training is True; otherwise return x unchanged."""
        if not self.training:
            # Clear the mask so backward() does not reuse one from an earlier pass.
            self.mask = None
            return x  # No dropout during evaluation
        self.mask = (np.random.rand(*x.shape) > self.dropout_rate).astype(np.float32)
        return x * self.mask / (1.0 - self.dropout_rate)  # Scale to keep expectation

    def backward(self, grad_output):
        """Mirror the last forward(): masked and rescaled if it dropped units, identity otherwise."""
        if self.mask is None:
            return grad_output
        return grad_output * self.mask / (1.0 - self.dropout_rate)


# 🧠 Arsitektur ResNet-18 (Residual Network)

**ResNet-18** adalah model Convolutional Neural Network (CNN) dengan 18 lapisan berparameter yang menggunakan **skip connection** (shortcut/identity/projection) untuk mengatasi masalah **degradasi gradien** saat melatih jaringan yang sangat dalam.

---

## 📐 Struktur ResNet-18

### 🔹 Layer Awal:
- **Conv2D**: 7×7, 64 filter, stride=2, padding=3  
- **BatchNorm2D**  
- **ReLU**  
- **MaxPooling2D**: 3×3, stride=2, padding=1  

### 🔸 Residual Blocks:

#### 🧩 Stage 1:
- 2 × BasicBlock(64, 64)

#### 🧩 Stage 2:
- 1 × BasicBlock(64 → 128, stride=2, projection)
- 1 × BasicBlock(128, 128)

#### 🧩 Stage 3:
- 1 × BasicBlock(128 → 256, stride=2, projection)
- 1 × BasicBlock(256, 256)

#### 🧩 Stage 4:
- 1 × BasicBlock(256 → 512, stride=2, projection)
- 1 × BasicBlock(512, 512)

---

### 🔚 Akhir:
- **Global Average Pooling (GAP)**  
- **Dropout** (p=0.5, saat training)  
- **Dense Layer**: 512 ‚Üí `num_classes`  
- **Softmax Cross Entropy Loss**

---

## 🔢 Total Layer Berparameter:
- 1 × Conv awal  
- 16 × Conv layer di dalam 8 BasicBlock  
- 1 × Dense (FC)  
→ **Total: 18 weight layers**

---

## ✅ Tambahan Komponen:
- 🔄 **Skip Connection**:  
  - Identity mapping (dimensi sama)  
  - 1×1 projection (untuk penyesuaian channel atau stride)  
- 🛑 **Dropout**: digunakan sebelum Dense untuk regularisasi  
- 🧠 **Global Average Pooling (GAP)**: mengurangi dimensi spasial tanpa parameter  
- ✖️ **Flatten**: tidak diperlukan karena output GAP sudah berupa vektor 1D

---

## 🔧 Ringkasan Keunggulan:
- Memungkinkan pelatihan jaringan dalam dengan kestabilan
- Mengurangi overfitting dengan **Dropout**
- Tidak memerlukan Flatten karena adanya **GAP**
- Kompatibel dengan **data RGB (3 channel)**

In [23]:
class ResNet18:
    """ResNet-18 assembled from the NumPy layers defined in this notebook.

    Inputs are channels-last, (batch, height, width, input_channels), which is the
    layout Conv2D and BatchNorm2D use. MaxPool2D and GlobalAveragePooling2D work on
    (batch, channels, height, width), so activations are transposed around them.
    """

    def __init__(self, input_channels=3, num_classes=3):
        self.conv1 = Conv2D(input_channels, 64, kernel_size=7, stride=2, padding=3)
        self.bn1 = BatchNorm2D(64)
        self.relu = ReLU()
        self.maxpool = MaxPool2D(kernel_size=3, stride=2)
        # Padding for the stem pooling is applied in forward() and stripped in backward().
        self.pool_padding = 1
        self.dropout = Dropout(dropout_rate=0.5)

        # Residual stages (2 blocks per stage, as in ResNet-18). Blocks that change the
        # stride or channel count switch to a projection shortcut on their own.
        self.layer1 = [ResidualBlock(64, 64) for _ in range(2)]
        self.layer2 = [ResidualBlock(64, 128, stride=2), ResidualBlock(128, 128)]
        self.layer3 = [ResidualBlock(128, 256, stride=2), ResidualBlock(256, 256)]
        self.layer4 = [ResidualBlock(256, 512, stride=2), ResidualBlock(512, 512)]

        self.global_pool = GlobalAveragePooling2D()
        self.fc = Dense(512, num_classes)
        self.loss_fn = SoftmaxCrossEntropyLoss()

    def forward(self, x, y=None, training=True):
        """
        x: images of shape (batch, height, width, input_channels)
        y: optional one-hot labels of shape (batch, num_classes)
        training: selects batch statistics in BatchNorm2D and enables dropout
        returns: logits, or (logits, loss) when y is given
        """
        x = self.conv1.forward(x)
        x = self.bn1.forward(x, training=training)
        x = self.relu.forward(x)

        # Stem pooling: NHWC -> NCHW, pad with -inf (never selected), pool, NCHW -> NHWC.
        p = self.pool_padding
        x = np.transpose(x, (0, 3, 1, 2))
        x = np.pad(x, ((0, 0), (0, 0), (p, p), (p, p)), mode='constant', constant_values=-np.inf)
        x = self.maxpool.forward(x)
        x = np.transpose(x, (0, 2, 3, 1))

        # NOTE: ResidualBlock.forward has to accept the `training` flag so that its
        # BatchNorm2D layers use running statistics during evaluation.
        for block in self.layer1: x = block.forward(x, training=training)
        for block in self.layer2: x = block.forward(x, training=training)
        for block in self.layer3: x = block.forward(x, training=training)
        for block in self.layer4: x = block.forward(x, training=training)

        # Global pooling expects NCHW and returns (batch, 512).
        x = self.global_pool.forward(np.transpose(x, (0, 3, 1, 2)))

        # Dropout is always called; its own flag decides whether units are dropped,
        # which keeps forward() and backward() in the same mode.
        self.dropout.training = training
        x = self.dropout.forward(x)

        logits = self.fc.forward(x)

        if y is not None:
            loss = self.loss_fn.forward(logits, y)
            return logits, loss
        return logits

    def backward(self):
        """Back-propagate the loss of the last forward(x, y) call through the network.

        The stem layers are called with a learning rate of 0.0, so this pass only
        stores gradients on the layers; parameters are left for an optimizer to update.
        returns: gradient with respect to the input images (NHWC)
        """
        grad = self.loss_fn.backward()
        grad = self.fc.backward(grad)
        grad = self.dropout.backward(grad)

        grad = self.global_pool.backward(grad)          # (B, C, H, W)
        grad = np.transpose(grad, (0, 2, 3, 1))         # back to NHWC for the blocks

        for block in reversed(self.layer4): grad = block.backward(grad)
        for block in reversed(self.layer3): grad = block.backward(grad)
        for block in reversed(self.layer2): grad = block.backward(grad)
        for block in reversed(self.layer1): grad = block.backward(grad)

        # Undo the stem pooling: NHWC -> NCHW, un-pool, drop the padding ring, NCHW -> NHWC.
        grad = self.maxpool.backward(np.transpose(grad, (0, 3, 1, 2)))
        p = self.pool_padding
        if p > 0:
            grad = grad[:, :, p:-p, p:-p]
        grad = np.transpose(grad, (0, 2, 3, 1))

        grad = self.relu.backward(grad)
        grad = self.bn1.backward(grad, 0.0)
        grad = self.conv1.backward(grad, 0.0)
        return grad


### Stochastic Gradient Descent (SGD)

Optimasi SGD melakukan update parameter $ \theta $ dengan aturan:

$$
\theta_{t+1} = \theta_t - \eta \cdot \nabla_{\theta} J(\theta_t)
$$

- $ \theta_t $ : parameter pada iterasi ke-$ t $  
- $ \eta $ : learning rate  
- $ \nabla_{\theta} J(\theta_t) $ : gradien fungsi loss terhadap parameter $ \theta $ pada iterasi ke-$ t $


In [25]:
class SGD:
    """Plain stochastic gradient descent: ``param <- param - lr * param.grad``."""

    def __init__(self, parameters, lr=0.01):
        self.lr = lr
        # Parameters (weights & biases); only entries exposing `.grad` are updated.
        self.parameters = parameters

    def step(self):
        """Apply one in-place update to every parameter that has a ``grad`` attribute."""
        for tensor in self.parameters:
            if not hasattr(tensor, 'grad'):
                continue
            tensor -= self.lr * tensor.grad


### Update Bobot

In [27]:
class _ParamView(np.ndarray):
    """ndarray view of a layer parameter whose ``grad`` reads the owning layer's gradient."""

    @property
    def grad(self):
        # Looked up on every access because some layers rebind their gradient array
        # in backward(). Raises AttributeError (so hasattr() is False) until the
        # layer has produced a gradient.
        return getattr(self._layer, self._grad_attr)


def _bind_param(layer, param_attr, grad_attr):
    """Return a memory-sharing view of ``layer.<param_attr>`` tied to ``layer.<grad_attr>``."""
    view = getattr(layer, param_attr).view(_ParamView)
    view._layer = layer
    view._grad_attr = grad_attr
    return view


def gather_params(model):
    """Collect every trainable array of ``model`` for use by an optimizer.

    Each returned item is an ndarray view that shares memory with the layer's own
    array, so an in-place update (``param -= ...``) changes the layer, and exposes
    the layer's current gradient as ``param.grad``. The views stay valid only
    while layers modify their parameters in place instead of rebinding them.

    model: object with conv1, bn1, layer1..layer4 (lists of residual blocks) and fc
    returns: list of parameter views (convolution, batch-norm and dense parameters)
    """
    params = []

    def add_conv(conv):
        params.append(_bind_param(conv, 'W', 'dW'))
        params.append(_bind_param(conv, 'b', 'db'))

    def add_bn(bn):
        params.append(_bind_param(bn, 'gamma', 'dgamma'))
        params.append(_bind_param(bn, 'beta', 'dbeta'))

    # Stem
    add_conv(model.conv1)
    add_bn(model.bn1)

    # Every residual block, including the projection shortcut when present
    for block in model.layer1 + model.layer2 + model.layer3 + model.layer4:
        add_conv(block.conv1)
        add_bn(block.bn1)
        add_conv(block.conv2)
        add_bn(block.bn2)
        if block.use_projection:
            add_conv(block.shortcut_conv)
            add_bn(block.shortcut_bn)

    # Dense layer
    params.append(_bind_param(model.fc, 'W', 'grad_W'))
    params.append(_bind_param(model.fc, 'b', 'grad_b'))
    return params


In [28]:
def train(model, optimizer, data_loader, epochs=1):
    """Run ``epochs`` passes over ``data_loader`` and print loss / accuracy per epoch.

    model: object with forward(x, y) -> (logits, loss) and backward()
    optimizer: object with step(), called once per batch after backward()
    data_loader: re-iterable of (x_batch, y_batch) pairs with one-hot y_batch
    epochs: number of passes over data_loader
    returns: list with one (average_loss, accuracy) tuple per epoch; both values
             are NaN for an epoch in which the loader produced no samples
    """
    history = []
    for epoch in range(epochs):
        total_loss = 0.0
        total_correct = 0
        total_samples = 0

        for x_batch, y_batch in data_loader:
            logits, loss = model.forward(x_batch, y_batch)
            model.backward()

            optimizer.step()

            # The loss is a batch mean, so weight it by the batch size.
            n = x_batch.shape[0]
            total_loss += loss * n
            preds = np.argmax(logits, axis=1)
            labels = np.argmax(y_batch, axis=1)
            total_correct += np.sum(preds == labels)
            total_samples += n

        if total_samples > 0:
            avg_loss = float(total_loss / total_samples)
            accuracy = float(total_correct / total_samples)
        else:
            # Empty loader: report NaN instead of dividing by zero.
            avg_loss = accuracy = float('nan')

        print(f"Epoch {epoch+1}: Loss={avg_loss:.4f}, Accuracy={accuracy:.4f}")
        history.append((avg_loss, accuracy))
    return history


### Adam (Adaptive Moment Estimation)

Adam menggabungkan momentum dan adaptasi learning rate menggunakan estimasi momen pertama dan kedua:

- Hitung gradien saat ini:
$$
g_t = \nabla_{\theta} J(\theta_t)
$$

- Update momen pertama (mean gradien):
$$
m_t = \beta_1 m_{t-1} + (1 - \beta_1) g_t
$$

- Update momen kedua (mean kuadrat gradien):
$$
v_t = \beta_2 v_{t-1} + (1 - \beta_2) g_t^2
$$

- Koreksi bias momen:
$$
\hat{m}_t = \frac{m_t}{1 - \beta_1^t}, \quad
\hat{v}_t = \frac{v_t}{1 - \beta_2^t}
$$

- Update parameter:
$$
\theta_{t+1} = \theta_t - \eta \cdot \frac{\hat{m}_t}{\sqrt{\hat{v}_t} + \epsilon}
$$

- Di mana:
  - $ \beta_1, \beta_2 $ adalah hyperparameter decay rate (biasanya 0.9 dan 0.999)  
  - $ \epsilon $ adalah nilai kecil untuk menghindari pembagian nol (biasanya $10^{-8}$)  
  - $ \eta $ adalah learning rate


In [30]:
class Adam:
    """Adam optimizer with bias-corrected first and second moment estimates."""

    def __init__(self, parameters, lr=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.parameters = parameters
        self.lr, self.epsilon = lr, epsilon
        self.beta1, self.beta2 = beta1, beta2

        # One zero-initialised moment buffer per parameter, plus the step counter.
        self.m = [np.zeros_like(param) for param in self.parameters]
        self.v = [np.zeros_like(param) for param in self.parameters]
        self.t = 0

    def step(self):
        """Advance the step counter and update every parameter that has a ``grad`` attribute."""
        self.t += 1
        # The bias-correction denominators depend only on the step count.
        first_correction = 1 - self.beta1 ** self.t
        second_correction = 1 - self.beta2 ** self.t

        for idx, tensor in enumerate(self.parameters):
            if not hasattr(tensor, 'grad'):
                continue
            grad = tensor.grad

            self.m[idx] = self.beta1 * self.m[idx] + (1 - self.beta1) * grad
            self.v[idx] = self.beta2 * self.v[idx] + (1 - self.beta2) * (grad ** 2)

            m_hat = self.m[idx] / first_correction
            v_hat = self.v[idx] / second_correction

            tensor -= self.lr * m_hat / (np.sqrt(v_hat) + self.epsilon)


In [31]:
def evaluate(model, data_loader):
    """Compute average loss and accuracy of ``model`` over ``data_loader`` without training.

    The model is run with ``training=False`` so dropout is disabled and batch
    normalisation uses (and does not update) its running statistics.

    model: object with forward(x, y, training=...) -> (logits, loss)
    data_loader: iterable of (x_batch, y_batch) pairs with one-hot y_batch
    returns: (avg_loss, accuracy); both are NaN when the loader yields no samples
    """
    total_loss = 0.0
    total_correct = 0
    total_samples = 0

    for x_batch, y_batch in data_loader:
        logits, loss = model.forward(x_batch, y_batch, training=False)
        preds = np.argmax(logits, axis=1)
        labels = np.argmax(y_batch, axis=1)

        # The loss is a batch mean, so weight it by the batch size.
        n = x_batch.shape[0]
        total_loss += loss * n
        total_correct += np.sum(preds == labels)
        total_samples += n

    if total_samples > 0:
        avg_loss = total_loss / total_samples
        accuracy = total_correct / total_samples
    else:
        # Empty loader: report NaN instead of dividing by zero.
        avg_loss = accuracy = float('nan')

    print(f"Validation: Loss={avg_loss:.4f}, Accuracy={accuracy:.4f}")
    return avg_loss, accuracy
