# ДЗ №3 
## Обучение моделей глубокого обучения на PyTorch

In [3]:
import torch
import torchvision
import numpy as np
import matplotlib.pyplot as plt

from typing import Tuple, List, Type, Dict, Any

In [4]:
torch.manual_seed(0)
np.random.seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [5]:
class Perceptron(torch.nn.Module):
    
    def __init__(self, 
                 input_resolution: Tuple[int, int] = (28, 28),
                 input_channels: int = 1, 
                 hidden_layer_features: List[int] = [256, 256, 256],
                 activation: Type[torch.nn.Module] = torch.nn.ReLU,
                 num_classes: int = 10):

        super().__init__()
        
        self.layer_1 = torch.nn.Linear(in_features=784, out_features=256, bias=True)
        self.activation_1 = activation()
        self.layer_2 = torch.nn.Linear(in_features=256, out_features=128, bias=True)
        self.activation_2 = activation()
        self.layer_3 = torch.nn.Linear(in_features=128, out_features=64, bias=True)
        self.activation_3 = activation()
        self.layer_4 = torch.nn.Linear(in_features=64, out_features=num_classes, bias=True)
        
    
    def forward(self, x):
        x = self.layer_1(x)
        x = self.activation_1(x)
        x = self.layer_2(x)
        x = self.activation_2(x)
        x = self.layer_3(x)
        x = self.activation_3(x)
        x = self.layer_4(x)
        return x

In [6]:
model = Perceptron()
print(model)
print('Total number of trainable parameters', 
      sum(p.numel() for p in model.parameters() if p.requires_grad))

Perceptron(
  (layer_1): Linear(in_features=784, out_features=256, bias=True)
  (activation_1): ReLU()
  (layer_2): Linear(in_features=256, out_features=128, bias=True)
  (activation_2): ReLU()
  (layer_3): Linear(in_features=128, out_features=64, bias=True)
  (activation_3): ReLU()
  (layer_4): Linear(in_features=64, out_features=10, bias=True)
)
Total number of trainable parameters 242762


In [7]:
train_transforms = torchvision.transforms.Compose([                                                 
    torchvision.transforms.RandomRotation(24),   
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize((0, ), (0.3, ))
])

val_transforms = torchvision.transforms.Compose([
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize((0, ), (0.3, ))
])

In [None]:
train_dataset = torchvision.datasets.MNIST(root='./mnist', 
                                           train=True, 
                                           download=True,
                                           transform=train_transforms)

val_dataset = torchvision.datasets.MNIST(root='./mnist', 
                                         train=False, 
                                         download=True,
                                         transform=val_transforms)

In [None]:
indices = np.random.randint(0, len(train_dataset), size=16)

fig, axes = plt.subplots(nrows=4, ncols=4, figsize=(32, 32))
for i, row in enumerate(axes):
    for j, ax in enumerate(row):
        sample_index = indices[i*4+j]
        sample, label = train_dataset[sample_index]
        ax.imshow(sample[0])
        ax.set_title(label)

In [10]:
def train_model(model: torch.nn.Module, 
                train_dataset: torch.utils.data.Dataset,
                val_dataset: torch.utils.data.Dataset,
                loss_function: torch.nn.Module = torch.nn.CrossEntropyLoss(),
                optimizer_class: Type[torch.optim.Optimizer] = torch.optim,
                optimizer_params: Dict = {},
                initial_lr = 0.01,
                lr_scheduler_class: Any = torch.optim.lr_scheduler.ReduceLROnPlateau,
                lr_scheduler_params: Dict = {},
                batch_size = 64,
                max_epochs = 1000,
                early_stopping_patience = 20):
    optimizer = torch.optim.Adam(model.parameters(), lr=initial_lr, **optimizer_params)
    lr_scheduler = lr_scheduler_class(optimizer, **lr_scheduler_params)
    
    train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size)

    best_val_loss = None
    best_epoch = None

    for epoch in range(max_epochs):
        
        print(f'Epoch {epoch}')
        
        train_single_epoch(model, optimizer, loss_function, train_loader)
        val_metrics = validate_single_epoch(model, loss_function, val_loader)

        print(f'Validation metrics: \n{val_metrics}')
        
        lr_scheduler.step(val_metrics['loss'])
        
        if best_val_loss is None or best_val_loss > val_metrics['loss']:
            print(f'Best model yet, saving')
            best_val_loss = val_metrics['loss']
            best_epoch = epoch
            torch.save(model, './best_model.pth')
            
        if epoch - best_epoch > early_stopping_patience:
            print('Early stopping triggered')
            return
            

In [11]:
def train_single_epoch(model: torch.nn.Module,
                       optimizer: torch.optim.Optimizer, 
                       loss_function: torch.nn.Module, 
                       data_loader: torch.utils.data.DataLoader):
    loss = None
    """
    У меня была идея связанная с тем, как сворачивать матрицу картинки в массив.
    https://www.youtube.com/watch?v=3s7h2MHQtxc
    Например, как показано в этом видеоролике, с помощью space filling curves, 
    мы смогли бы сохранить пространственное свойство картинок. 

    Однако, т.к. мы работаем с картинками одного и того же разрешения, нам это 
    никак не поможет. Но всё-таки идея, как мне кажется неплохая, поэтому я хотел бы узнать,
    что вы думаете по этому поводу. Как сильно я ошибаюсь и в чем не прав.
    """
    for x_batch, y_batch in data_loader:
        x_batch = x_batch.view(x_batch.shape[0], -1)

        y_pred = model(x_batch)
        loss = loss_function(y_pred, y_batch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(loss.item())


In [12]:
def validate_single_epoch(model: torch.nn.Module,
                          loss_function: torch.nn.Module, 
                          data_loader: torch.utils.data.DataLoader):
    loss_history = []
    acc = 0

    
    for x_batch, y_batch in data_loader:
        x_batch = x_batch.view(x_batch.shape[0], -1)

        y_pred = model(x_batch)

        loss = loss_function(y_pred, y_batch)
        loss_history.append(loss.item())

        for i, pred in enumerate(y_pred):
            if torch.argmax(pred) == y_batch[i]:
                acc += 1 
    acc /= ((len(data_loader) - 1) * 64 + 32)

    return {"loss": np.mean(loss_history), "accuracy": acc}
    

In [None]:
train_model(model, 
            train_dataset=train_dataset, 
            val_dataset=val_dataset, 
            loss_function=torch.nn.CrossEntropyLoss(), 
            initial_lr=0.01,
            lr_scheduler_class = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts,
            lr_scheduler_params = {"T_0" : 100})
"""
Данные последней лучшей модели:
    Epoch 26
    Validation metrics: 
    {'loss': 0.11119568567803859, 'accuracy': 0.9728434504792333}
    Best model yet, saving
"""

In [None]:
train_model(model, 
            train_dataset=train_dataset, 
            val_dataset=val_dataset, 
            loss_function=torch.nn.CrossEntropyLoss(), 
            initial_lr=0.01,
            lr_scheduler_class = torch.optim.lr_scheduler.StepLR,
            lr_scheduler_params = {"step_size" : 50})
"""
Данные последней лучшей модели:
    Epoch 31
    Validation metrics: 
    {'loss': 0.13994677272880343, 'accuracy': 0.9656549520766773}
    Best model yet, saving
"""

"""
Я поменял только способ изменения lr. Loss возрос на 0.02 и не был стабилен.
На протяжении обучения, модель показывала loss вплоть до 0.31(...). Когда модель
с теплым рестартом на lr имела стабильный loss и accuracy у неё по итогу выше.
"""

In [19]:
model1 = Perceptron(activation = torch.nn.LeakyReLU)
print(model1)
print('Total number of trainable parameters', 
      sum(p.numel() for p in model.parameters() if p.requires_grad))

Perceptron(
  (layer_1): Linear(in_features=784, out_features=256, bias=True)
  (activation_1): LeakyReLU(negative_slope=0.01)
  (layer_2): Linear(in_features=256, out_features=128, bias=True)
  (activation_2): LeakyReLU(negative_slope=0.01)
  (layer_3): Linear(in_features=128, out_features=64, bias=True)
  (activation_3): LeakyReLU(negative_slope=0.01)
  (layer_4): Linear(in_features=64, out_features=10, bias=True)
)
Total number of trainable parameters 242762


In [20]:
model2 = Perceptron(activation = torch.nn.Softplus)
print(model2)
print('Total number of trainable parameters', 
      sum(p.numel() for p in model.parameters() if p.requires_grad))

Perceptron(
  (layer_1): Linear(in_features=784, out_features=256, bias=True)
  (activation_1): Softplus(beta=1, threshold=20)
  (layer_2): Linear(in_features=256, out_features=128, bias=True)
  (activation_2): Softplus(beta=1, threshold=20)
  (layer_3): Linear(in_features=128, out_features=64, bias=True)
  (activation_3): Softplus(beta=1, threshold=20)
  (layer_4): Linear(in_features=64, out_features=10, bias=True)
)
Total number of trainable parameters 242762


In [None]:
"""
Теперь я возьму метод обычения с lr_schedule = CosineAnnealingWarmRestarts
и попробую узнать, какая функция активация лучше. Для тестов я взял LeakyReLU
и Softplus как родственников ReLU. Возможно стоит ещё проверить что-нибудь из 
класса сигмоид
"""
train_model(model1, 
            train_dataset=train_dataset, 
            val_dataset=val_dataset, 
            loss_function=torch.nn.CrossEntropyLoss(), 
            initial_lr=0.01,
            lr_scheduler_class = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts,
            lr_scheduler_params = {"T_0" : 100})
"""
Epoch 7
Validation metrics: 
{'loss': 0.10283183453893392, 'accuracy': 0.9702476038338658}
Best model yet, saving
"""

In [None]:
train_model(model2, 
            train_dataset=train_dataset, 
            val_dataset=val_dataset, 
            loss_function=torch.nn.CrossEntropyLoss(), 
            initial_lr=0.01,
            lr_scheduler_class = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts,
            lr_scheduler_params = {"T_0" : 100})
"""
Epoch 12
Validation metrics: 
{'loss': 0.12079034244662423, 'accuracy': 0.9681509584664537}
Best model yet, saving
"""