In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets
from torchvision import transforms

from datetime import datetime as dt

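# We can limit overfitting with weight penalties: adding a term to the loss that grows with the size of the weights
# keeps the weights small, so the loss surface is smoother => there is less to gain from fitting individual samples.
# Here we use L2 regularization: the sum of the squared parameters, scaled by a small factor.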
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def training_loop_l2reg(n_epochs, optimizer, model, loss_fn, train_loader, l2_lambda=0.001):
    """Train ``model`` with an explicit L2 penalty added to the loss.

    Args:
        n_epochs: Number of passes over ``train_loader``.
        optimizer: Optimizer used to update the parameters of ``model``.
        model: Module being trained; it is called on every batch of images.
        loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar loss.
        train_loader: Iterable yielding ``(imgs, labels)`` batches.
        l2_lambda: Strength of the L2 penalty (hyper-parameter).

    Each batch is moved to the module-level ``device``; the model itself is
    not moved here, so the caller has to place it on the same device.
    Prints the mean per-batch training loss (penalty included) for the first
    epoch and for every 10th epoch. Returns nothing.
    """
    for epoch in range(1, n_epochs + 1):
        loss_train = 0.0
        n_batches = 0
        for imgs, labels in train_loader:
            # Tensor.to() is not in-place: it returns the moved tensor,
            # so the result has to be bound back to the name.
            imgs = imgs.to(device=device)
            labels = labels.to(device=device)
            outputs = model(imgs)
            loss = loss_fn(outputs, labels)

            # L2 penalty: sum of the squared values of every parameter.
            l2_norm = sum(p.pow(2.0).sum() for p in model.parameters())
            loss = loss + l2_lambda * l2_norm

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            loss_train += loss.item()
            n_batches += 1

        if epoch == 1 or epoch % 10 == 0:
            # Count batches ourselves so an empty (or unsized) loader does not crash the report.
            mean_loss = loss_train / n_batches if n_batches else float('nan')
            print(f'{dt.now()} Epoch {epoch}, Training loss {mean_loss}')

In [4]:
# We can also use dropout: randomly switching off some activations during training, which acts like augmentation applied to the network instead of the dataset.
# Dropout is active in training and disabled when predicting => remember to call model.train() or model.eval() to switch modes.

class NetDropout(nn.Module):
    """Small two-convolution classifier with dropout after each pooling stage.

    Layout: conv -> tanh -> 2x max-pool -> Dropout2d(p=0.4), applied twice,
    followed by two linear layers that produce 2 output values per sample.
    The flatten width of ``8 * 8 * (n_channel // 2)`` corresponds to feature
    maps that are 8x8 after the two 2x poolings (e.g. 32x32 inputs).

    Args:
        n_channel: Output channels of the first convolution; the second
            convolution produces ``n_channel // 2`` channels.
    """

    def __init__(self, n_channel=32) -> None:
        super().__init__()
        half_channels = n_channel // 2
        self.n_channel = n_channel
        self.conv1 = nn.Conv2d(3, n_channel, kernel_size=3, padding=1)
        self.conv1_dropout = nn.Dropout2d(p=0.4)
        self.conv2 = nn.Conv2d(n_channel, half_channels, kernel_size=3, padding=1)
        self.conv2_dropout = nn.Dropout2d(p=0.4)
        self.fc1 = nn.Linear(8 * 8 * half_channels, 32)
        self.fc2 = nn.Linear(32, 2)

    def forward(self, x):
        """Return the 2-column output for the image batch ``x``."""
        # Stage 1: convolution, activation, down-sampling, then dropout.
        stage1 = F.max_pool2d(self.conv1(x).tanh(), kernel_size=2)
        stage1 = self.conv1_dropout(stage1)
        # Stage 2: same pattern on the reduced feature maps.
        stage2 = F.max_pool2d(self.conv2(stage1).tanh(), kernel_size=2)
        stage2 = self.conv2_dropout(stage2)
        # Flatten to one row per sample for the fully connected head.
        flat = stage2.view(-1, 8 * 8 * (self.n_channel // 2))
        hidden = self.fc1(flat).tanh()
        return self.fc2(hidden)

In [None]:
# Or use batch normalization, which normalizes a layer's activations using the statistics of the current mini-batch.

class NetBatchNorm(nn.Module):
    """Small two-convolution classifier with batch normalization after each convolution.

    Layout: conv -> BatchNorm2d -> tanh -> 2x max-pool, applied twice,
    followed by two linear layers that produce 2 output values per sample.
    The flatten width of ``8 * 8 * (n_channel // 2)`` corresponds to feature
    maps that are 8x8 after the two 2x poolings (e.g. 32x32 inputs).

    Args:
        n_channel: Output channels of the first convolution; the second
            convolution produces ``n_channel // 2`` channels.
    """

    def __init__(self, n_channel=32) -> None:
        super().__init__()
        self.n_channel = n_channel
        self.conv1 = nn.Conv2d(3, n_channel, kernel_size=3, padding=1)
        self.conv1_batchnorm = nn.BatchNorm2d(num_features=n_channel)
        self.conv2 = nn.Conv2d(n_channel, n_channel // 2, kernel_size=3, padding=1)
        self.conv2_batchnorm = nn.BatchNorm2d(num_features=n_channel // 2)
        self.fc1 = nn.Linear(8*8*(n_channel // 2), 32)
        self.fc2 = nn.Linear(32, 2)

    def forward(self, x):
        """Return the 2-column output for the image batch ``x``."""
        out = self.conv1_batchnorm(self.conv1(x))
        out = F.max_pool2d(torch.tanh(out), 2)
        # The second stage consumes the first stage's output, not the raw input.
        out = self.conv2_batchnorm(self.conv2(out))
        out = F.max_pool2d(torch.tanh(out), 2)
        # No dropout here: this network defines no dropout layers.
        out = out.view(-1, 8*8*(self.n_channel // 2))
        out = torch.tanh(self.fc1(out))
        out = self.fc2(out)
        return out

In [None]:
# Or increase network depth... but a deep network applies many operations in series, so the gradients reaching the layers far from the loss tend to vanish and those layers are barely updated.
# ResNet addressed this issue by adding "skip connections", which add the input of a layer directly to its output:

class NetRes(nn.Module):
    """Three-convolution classifier with one skip connection around the third convolution.

    Each of the three stages ends in a 2x max-pool, so the spatial size is
    divided by 8 overall; the flatten width of ``4 * 4 * (n_channel // 2)``
    corresponds to 4x4 final feature maps (e.g. 32x32 inputs).

    Args:
        n_channel: Output channels of the first convolution; the second and
            third convolutions produce ``n_channel // 2`` channels.
    """

    def __init__(self, n_channel=32) -> None:
        super().__init__()
        self.n_channel = n_channel
        self.conv1 = nn.Conv2d(3, n_channel, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(n_channel, n_channel // 2, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(n_channel // 2, n_channel // 2, kernel_size=3, padding=1)
        # Must match the flatten in forward(): three poolings leave 4x4 maps.
        self.fc1 = nn.Linear(4*4*(n_channel // 2), 32)
        self.fc2 = nn.Linear(32, 2)

    def forward(self, x):
        """Return the 2-column output for the image batch ``x``."""
        out = F.max_pool2d(torch.relu(self.conv1(x)), 2)
        out = F.max_pool2d(torch.relu(self.conv2(out)), 2)
        out1 = out
        out = F.max_pool2d(torch.relu(self.conv3(out)) + out1, 2) # skip connection
        out = out.view(-1, 4*4*(self.n_channel // 2))
        out = torch.relu(self.fc1(out))
        out = self.fc2(out)
        return out

In [5]:
# How do we write a really deep (>100 layers) network in PyTorch?

class ResBlock(nn.Module):
    """Residual block: conv -> batch norm -> ReLU, with the input added back to the result.

    The convolution keeps both the channel count and (with 3x3 kernels and
    padding of 1) the spatial size, so the sum with the input is well defined.

    Args:
        n_channel: Number of input and output channels.
    """

    def __init__(self, n_channel) -> None:
        super().__init__()
        # padding is an integer amount of zero-padding (1 keeps the size for a 3x3 kernel), not a flag.
        self.conv = nn.Conv2d(n_channel, n_channel, kernel_size=3, padding=1, bias=False) # bias would be neutralized by batch norm
        self.batch_norm = nn.BatchNorm2d(num_features=n_channel)
        torch.nn.init.kaiming_normal_(self.conv.weight, nonlinearity='relu') # initialize random parameters as done in ResNet paper
        torch.nn.init.constant_(self.batch_norm.weight, 0.5)
        torch.nn.init.zeros_(self.batch_norm.bias)

    def forward(self, x):
        """Return ``relu(batch_norm(conv(x))) + x``."""
        out = self.conv(x)
        out = self.batch_norm(out)
        out = torch.relu(out)
        return out + x

class NetResDeep(nn.Module):
    """Deep classifier: one convolution, a stack of ``ResBlock`` modules, then two linear layers.

    The input is pooled 2x after the first convolution and 2x after the
    residual stack; the flatten width of ``8 * 8 * n_channel`` corresponds to
    8x8 final feature maps (e.g. 32x32 inputs).

    Args:
        n_channel: Number of channels used by the first convolution and by
            every residual block.
        n_blocks: Number of residual blocks in the stack.
    """

    def __init__(self, n_channel=32, n_blocks=10) -> None:
        super().__init__()
        self.n_channel = n_channel
        # padding is an integer amount of zero-padding, not a flag.
        self.conv1 = nn.Conv2d(3, n_channel, kernel_size=3, padding=1)
        # Build a fresh block per position: `n_blocks * [ResBlock(...)]` would repeat
        # one instance, making every block in the stack share the same weights.
        self.resblocks = nn.Sequential(*(ResBlock(n_channel=n_channel) for _ in range(n_blocks)))
        self.fc1 = nn.Linear(8*8*n_channel, 32)
        self.fc2 = nn.Linear(32, 2)

    def forward(self, x):
        """Return the 2-column output for the image batch ``x``."""
        out = F.max_pool2d(torch.relu(self.conv1(x)), 2)
        out = self.resblocks(out)
        out = F.max_pool2d(out, 2)
        # The residual blocks keep n_channel channels, so the flatten width must match fc1.
        out = out.view(-1, 8*8*self.n_channel)
        out = torch.relu(self.fc1(out))
        out = self.fc2(out)
        return out