In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
from torchvision import datasets, transforms


# Location of the already-downloaded CIFAR-10 files (download=False below, so
# nothing is fetched). NOTE(review): Windows-style relative path — confirm it
# resolves on the machine running the notebook.
data_path = ".\\CIFAR10\\"

# Per-channel normalization constants. The SAME values are applied to the
# training and validation splits so both are presented to the model on an
# identical scale (previously the validation split used slightly different means).
# Presumably these are the channel statistics of the CIFAR-10 training images
# — TODO confirm.
cifar10_mean = (0.4914, 0.4822, 0.4465)
cifar10_std = (0.2470, 0.2435, 0.2616)

cifar10_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(cifar10_mean, cifar10_std)
])

cifar10 = datasets.CIFAR10(
    data_path, train = True, download = False,
    transform = cifar10_transform)

cifar10_val = datasets.CIFAR10(
    data_path, train = False, download = False,
    transform = cifar10_transform)


# Keep only the classes with original labels 0 and 2 and remap them to 0 / 1.
label_map = {0: 0, 2: 1}
class_names = ['airplane', 'bird']

# Filter on label_map itself so the kept classes and the remapping cannot drift apart.
cifar2 = [(img, label_map[label])
          for img, label in cifar10
          if label in label_map]

cifar2_val = [(img, label_map[label])
              for img, label in cifar10_val
              if label in label_map]

# Model Memory Efficient    (Soft Coding > Hard Coding)

```
각 계층의 채널 수와 피처 수를 나타내는 숫자들은 모델의 파라미터 수(용량)에 직접적인 영향을 준다. 이 값들을 인자로 받도록 작성(Soft Coding)하면 클래스를 다시 작성하지 않고도 모델의 폭과 용량을 쉽게 조절할 수 있다.
```

## Hard Coding    ->    일일이 파라미터 값을 부여

In [3]:
class NetWidth(nn.Module):
    """Small CNN whose layer widths are all written as literal numbers.

    Two 3x3 convolutions (3 -> 32 -> 16 channels), each followed by tanh and
    a 2x2 max-pool, feed a 32-unit tanh hidden layer and a 2-way output layer.
    The flatten step expects 16 feature maps of 8x8 per sample.
    """

    def __init__(self):
        super().__init__()
        # padding=1 with a 3x3 kernel keeps the spatial size unchanged.
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=16, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(in_features=16 * 8 * 8, out_features=32)
        self.fc2 = nn.Linear(in_features=32, out_features=2)

    def forward(self, x):
        """Return the two unnormalized class scores for each sample in `x`."""
        pooled1 = F.max_pool2d(self.conv1(x).tanh(), 2)
        pooled2 = F.max_pool2d(self.conv2(pooled1).tanh(), 2)
        # Collapse channels and spatial positions into one feature vector.
        flat = pooled2.view(-1, 16 * 8 * 8)
        hidden = self.fc1(flat).tanh()
        return self.fc2(hidden)

## Soft Coding    ->   n_chans1 변수에 저장된 값을 이용

In [8]:
class NetWidth(nn.Module):
    """Small CNN whose width is controlled by a single argument.

    Two 3x3 convolutions (3 -> n_chans1 -> n_chans1 // 2 channels), each
    followed by tanh and a 2x2 max-pool, feed a 32-unit tanh hidden layer and
    a 2-way output layer.

    Args:
        n_chans1: number of output channels of the first convolution. The
            second convolution produces ``n_chans1 // 2`` channels.
    """

    def __init__(self, n_chans1 = 32):
        super().__init__()
        self.n_chans1 = n_chans1
        # Channel count after conv2. Computed once so that the flatten size below
        # always matches conv2's output: `8 * 8 * n_chans1 // 2` evaluates as
        # `(64 * n_chans1) // 2`, which is wrong when n_chans1 is odd.
        n_chans2 = n_chans1 // 2
        # Features per sample after the second pooling (8x8 maps per channel).
        self.n_flat_features = 8 * 8 * n_chans2
        self.conv1 = nn.Conv2d(3, n_chans1, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(n_chans1, n_chans2, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(self.n_flat_features, 32)
        self.fc2 = nn.Linear(32, 2)

    def forward(self, x):
        """Return the two unnormalized class scores for each sample in `x`."""
        out = F.max_pool2d(torch.tanh(self.conv1(x)), 2)
        out = F.max_pool2d(torch.tanh(self.conv2(out)), 2)
        out = out.view(-1, self.n_flat_features)
        out = torch.tanh(self.fc1(out))
        out = self.fc2(out)

        return out

# Prevention Method of Overfitting

## 1. Add Regularization term (ex. L1, L2)

In [5]:
# Use the GPU when CUDA reports one as available; otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(f"Training on device {device}")

Training on device cuda


In [10]:
import datetime

def training_loop_l2reg(n_epochs, optimizer, model, loss_fn, train_loader,
                        l2_lambda = 0.001, device = None):
    """Train `model` with an explicit L2 penalty added to the loss.

    For every batch the loss is ``loss_fn(outputs, labels)`` plus
    ``l2_lambda`` times the sum of squares of ALL model parameters.
    The mean batch loss (penalty included) is printed for epoch 1 and for
    every 10th epoch.

    Args:
        n_epochs: number of passes over `train_loader`.
        optimizer: optimizer already bound to `model`'s parameters.
        model: the module to train (updated in place).
        loss_fn: callable ``loss_fn(outputs, labels)`` returning a scalar tensor.
        train_loader: sized iterable yielding ``(imgs, labels)`` batches.
        l2_lambda: weight of the L2 penalty (default matches the previous
            hard-coded value).
        device: device the batches are moved to. When omitted, the device of
            the model's first parameter is used, so the function no longer
            depends on a notebook-global ``device`` variable.

    Returns:
        None.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        if first_param is not None:
            device = first_param.device

    n_batches = len(train_loader)

    for epoch in range(1, n_epochs + 1):
        loss_train = 0.0

        for imgs, labels in train_loader:
            # With no explicit device and a parameter-free model there is
            # nothing to align with, so batches are left where they are.
            if device is not None:
                imgs = imgs.to(device)
                labels = labels.to(device)
            outputs = model(imgs)

            loss = loss_fn(outputs, labels)

            # Sum of squared entries over every parameter tensor.
            l2_norm = sum(p.pow(2.0).sum() for p in model.parameters())
            loss = loss + l2_lambda * l2_norm

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            loss_train += loss.item()

        if epoch == 1 or epoch % 10 == 0:
            # An empty loader has no mean loss; report NaN instead of raising
            # ZeroDivisionError.
            mean_loss = loss_train / n_batches if n_batches else float('nan')
            print(f"{datetime.datetime.now()} Epoch {epoch}, Training Loss {mean_loss}")

In [11]:
import torch.optim as optim

# Shuffled mini-batches of 64 samples drawn from the two-class subset.
train_loader = torch.utils.data.DataLoader(
    dataset = cifar2,
    batch_size = 64,
    shuffle = True,
)

# Default-width network, moved to the selected device and optimized with plain SGD.
model = NetWidth()
model = model.to(device = device)
optimizer = optim.SGD(params = model.parameters(), lr = 0.01)
loss_fn = nn.CrossEntropyLoss()

# 100 epochs using the training loop that adds the L2 penalty to the loss.
training_loop_l2reg(
    train_loader = train_loader,
    loss_fn = loss_fn,
    model = model,
    optimizer = optimizer,
    n_epochs = 100,
)

2023-01-14 18:39:47.241464 Epoch 1, Training Loss 0.5816713928417035
2023-01-14 18:39:52.197775 Epoch 10, Training Loss 0.3492789014129882
2023-01-14 18:39:57.672336 Epoch 20, Training Loss 0.3076192063700621
2023-01-14 18:40:03.121422 Epoch 30, Training Loss 0.2772480789453361
2023-01-14 18:40:08.594407 Epoch 40, Training Loss 0.25395985080558026
2023-01-14 18:40:14.080248 Epoch 50, Training Loss 0.236167131335872
2023-01-14 18:40:19.617275 Epoch 60, Training Loss 0.21687189423164743
2023-01-14 18:40:25.501859 Epoch 70, Training Loss 0.20440235278408997
2023-01-14 18:40:30.951893 Epoch 80, Training Loss 0.1896446351982226
2023-01-14 18:40:36.433873 Epoch 90, Training Loss 0.1755782100520316
2023-01-14 18:40:41.917731 Epoch 100, Training Loss 0.16544538195345812


## 2. Dropout

In [12]:
class NetDropout(nn.Module):
    """Width-parameterized CNN with channel dropout after each pooled conv stage.

    Same layout as the two-convolution network (3 -> n_chans1 -> n_chans1 // 2
    channels, tanh, 2x2 max-pool), with ``nn.Dropout2d(p=0.4)`` applied to the
    pooled output of each convolution.

    Args:
        n_chans1: number of output channels of the first convolution. The
            second convolution produces ``n_chans1 // 2`` channels.
    """

    def __init__(self, n_chans1 = 32):
        super().__init__()
        self.n_chans1 = n_chans1
        # Channel count after conv2. Computed once so that the flatten size below
        # always matches conv2's output: `8 * 8 * n_chans1 // 2` evaluates as
        # `(64 * n_chans1) // 2`, which is wrong when n_chans1 is odd.
        n_chans2 = n_chans1 // 2
        # Features per sample after the second pooling (8x8 maps per channel).
        self.n_flat_features = 8 * 8 * n_chans2
        self.conv1 = nn.Conv2d(3, n_chans1, kernel_size=3, padding=1)
        self.conv1_dropout = nn.Dropout2d(p = 0.4)
        self.conv2 = nn.Conv2d(n_chans1, n_chans2, kernel_size=3, padding=1)
        self.conv2_dropout = nn.Dropout2d(p = 0.4)
        self.fc1 = nn.Linear(self.n_flat_features, 32)
        self.fc2 = nn.Linear(32, 2)

    def forward(self, x):
        """Return the two unnormalized class scores for each sample in `x`."""
        out = F.max_pool2d(torch.tanh(self.conv1(x)), 2)
        out = self.conv1_dropout(out)
        out = F.max_pool2d(torch.tanh(self.conv2(out)), 2)
        out = self.conv2_dropout(out)
        out = out.view(-1, self.n_flat_features)
        out = torch.tanh(self.fc1(out))
        out = self.fc2(out)

        return out

## 3. Batch Normalization

In [13]:
class BatchNorm(nn.Module):
    """Width-parameterized CNN with batch normalization after each convolution.

    Each convolution output (3 -> n_chans1 -> n_chans1 // 2 channels) is passed
    through ``nn.BatchNorm2d`` before the tanh activation and 2x2 max-pool.

    Args:
        n_chans1: number of output channels of the first convolution. The
            second convolution produces ``n_chans1 // 2`` channels.
    """

    def __init__(self, n_chans1 = 32):
        super().__init__()
        self.n_chans1 = n_chans1
        # Channel count after conv2. Computed once so that the flatten size below
        # always matches conv2's output: `8 * 8 * n_chans1 // 2` evaluates as
        # `(64 * n_chans1) // 2`, which is wrong when n_chans1 is odd.
        n_chans2 = n_chans1 // 2
        # Features per sample after the second pooling (8x8 maps per channel).
        self.n_flat_features = 8 * 8 * n_chans2
        self.conv1 = nn.Conv2d(3, n_chans1, kernel_size=3, padding=1)
        self.conv1_batchnorm = nn.BatchNorm2d(num_features=n_chans1)
        self.conv2 = nn.Conv2d(n_chans1, n_chans2, kernel_size=3, padding=1)
        self.conv2_batchnorm = nn.BatchNorm2d(num_features=n_chans2)
        self.fc1 = nn.Linear(self.n_flat_features, 32)
        self.fc2 = nn.Linear(32, 2)

    def forward(self, x):
        """Return the two unnormalized class scores for each sample in `x`."""
        out = self.conv1_batchnorm(self.conv1(x))
        out = F.max_pool2d(torch.tanh(out), 2)
        out = self.conv2_batchnorm(self.conv2(out))
        out = F.max_pool2d(torch.tanh(out), 2)
        out = out.view(-1, self.n_flat_features)
        out = torch.tanh(self.fc1(out))
        out = self.fc2(out)

        return out

In [14]:
import torch.optim as optim

# Shuffled mini-batches of 64 samples drawn from the two-class subset.
train_loader = torch.utils.data.DataLoader(
    dataset = cifar2,
    batch_size = 64,
    shuffle = True,
)

# NOTE(review): this cell instantiates NetWidth, not the BatchNorm class
# defined in the preceding cell — confirm which model was meant to be trained here.
model = NetWidth()
model = model.to(device = device)
optimizer = optim.SGD(params = model.parameters(), lr = 0.01)
loss_fn = nn.CrossEntropyLoss()

# 100 epochs using the training loop that adds the L2 penalty to the loss.
training_loop_l2reg(
    train_loader = train_loader,
    loss_fn = loss_fn,
    model = model,
    optimizer = optimizer,
    n_epochs = 100,
)

2023-01-14 19:30:44.712239 Epoch 1, Training Loss 0.5604936668447628
2023-01-14 19:30:50.042916 Epoch 10, Training Loss 0.3402536412712875
2023-01-14 19:30:55.569086 Epoch 20, Training Loss 0.3031249007411823
2023-01-14 19:31:01.168389 Epoch 30, Training Loss 0.2760226851815631
2023-01-14 19:31:06.689887 Epoch 40, Training Loss 0.2554647217795348
2023-01-14 19:31:12.172260 Epoch 50, Training Loss 0.2400734574551795
2023-01-14 19:31:17.659029 Epoch 60, Training Loss 0.22504083294967178
2023-01-14 19:31:23.145827 Epoch 70, Training Loss 0.20992670654301432
2023-01-14 19:31:28.627116 Epoch 80, Training Loss 0.19661309513126968
2023-01-14 19:31:34.125183 Epoch 90, Training Loss 0.18382295097704907
2023-01-14 19:31:39.624311 Epoch 100, Training Loss 0.17265565793035895


# For Complex Structure    ->    Skip Connection

```
매우 깊은 신경망에서는 역전파 시 손실을 각 파라미터로 미분하기 위해 연쇄 법칙(chain rule)에 따라 많은 수를 연달아 곱해야 한다. 이 과정에서 매우 작은 값이나 매우 큰 값이 만들어질 수 있고, 부동소수점 근사 과정에서 작은 값들이 소멸될 수도 있다.

즉, Gradient Vanishing 문제가 발생될 수 있으며, 이를 해결해주는 것이 바로, Skip Connection이다.
```

In [3]:
class NetRes(nn.Module):
    """Three-convolution CNN with one skip connection around the third convolution.

    Channels go 3 -> n_chans1 -> n_chans1 // 2 -> n_chans1 // 2, with ReLU and
    a 2x2 max-pool after each stage. The input of conv3 is added to its output
    before the activation.

    Args:
        n_chans1: number of output channels of the first convolution. The
            later convolutions produce ``n_chans1 // 2`` channels.
    """

    def __init__(self, n_chans1 = 32):
        super().__init__()
        self.n_chans1 = n_chans1
        # Channel count after conv2/conv3. Computed once so that the flatten size
        # always matches the conv output: `4 * 4 * n_chans1 // 2` evaluates as
        # `(16 * n_chans1) // 2`, which is wrong when n_chans1 is odd.
        n_chans2 = n_chans1 // 2
        # Features per sample after the third pooling (4x4 maps per channel).
        self.n_flat_features = 4 * 4 * n_chans2
        self.conv1 = nn.Conv2d(3, n_chans1, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(n_chans1, n_chans2, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(n_chans2, n_chans2, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(self.n_flat_features, 32)
        self.fc2 = nn.Linear(32, 2)

    def forward(self, x):
        """Return the two unnormalized class scores for each sample in `x`."""
        out = F.max_pool2d(torch.relu(self.conv1(x)), 2)
        out = F.max_pool2d(torch.relu(self.conv2(out)), 2)
        # Skip connection: conv3's input is added back to its output.
        out1 = out
        out = F.max_pool2d(torch.relu(self.conv3(out) + out1), 2)
        out = out.view(-1, self.n_flat_features)
        out = torch.relu(self.fc1(out))
        out = self.fc2(out)
        return out

## Using Building Block 

In [5]:
class ResBlock(nn.Module):
    """Residual unit: bias-free 3x3 conv -> batch norm -> ReLU, plus the input.

    The channel count is preserved, so the output has the same shape as the input.

    Args:
        n_chans1: number of input (and output) channels.
    """

    def __init__(self, n_chans1):
        super().__init__()
        # The convolution is created without a bias term.
        self.conv = nn.Conv2d(
            in_channels=n_chans1,
            out_channels=n_chans1,
            kernel_size=3,
            padding=1,
            bias=False,
        )
        self.batch_norm = nn.BatchNorm2d(n_chans1)
        # Custom initialization: Kaiming-normal conv weights (ReLU gain),
        # batch-norm scale fixed at 0.5 and shift at 0.
        nn.init.kaiming_normal_(self.conv.weight, nonlinearity='relu')
        nn.init.constant_(self.batch_norm.weight, 0.5)
        nn.init.zeros_(self.batch_norm.bias)

    def forward(self, x):
        """Apply conv, batch norm and ReLU, then add the untouched input."""
        activated = torch.relu(self.batch_norm(self.conv(x)))
        return activated + x

In [6]:
class NetResDeep(nn.Module):
    """Deep residual CNN: one convolution, a stack of ResBlocks, two linear layers.

    Args:
        n_chans1: number of channels produced by the first convolution and
            kept by every residual block.
        n_blocks: number of residual blocks in the stack.
    """

    def __init__(self, n_chans1 = 32, n_blocks = 10):
        super().__init__()
        # Keep the integer width; forward() uses it to size the flatten step.
        # (Previously the Conv2d itself was stored under this name, leaving
        # self.conv1 undefined and breaking the arithmetic in forward().)
        self.n_chans1 = n_chans1
        self.conv1 = nn.Conv2d(3, n_chans1, kernel_size=3, padding=1)
        # Build a NEW ResBlock per position. `n_blocks * [ResBlock(...)]` would
        # repeat one instance, so every "block" would share the same weights.
        self.resblocks = nn.Sequential(
            *(ResBlock(n_chans1 = n_chans1) for _ in range(n_blocks)))
        # nn.Linear takes (in_features, out_features): 8x8 maps per channel -> 32.
        self.fc1 = nn.Linear(8 * 8 * n_chans1, 32)
        self.fc2 = nn.Linear(32, 2)

    def forward(self, x):
        """Return the two unnormalized class scores for each sample in `x`."""
        out = F.max_pool2d(torch.relu(self.conv1(x)), 2)
        out = self.resblocks(out)
        out = F.max_pool2d(out, 2)
        out = out.view(-1, 8 * 8 * self.n_chans1)
        out = torch.relu(self.fc1(out))
        out = self.fc2(out)
        return out