## Exercise 1.1-1.2

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from functools import reduce
import torch
from torch.utils.tensorboard import SummaryWriter
import wandb
from torchvision.datasets import MNIST
from torch.utils.data import Subset
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import wandb
from typing import Any, Callable, List, Optional, Type, Union
from torch import Tensor

#### Training hyperparameters.

In [2]:
batch_size = 64
lr = 0.01
momentum=0.9
weight_decay=1e-04
epochs = 25
if torch.cuda.is_available():
    device = torch.device("cuda") # to use the GPU
else:
    device = torch.device("cpu")

#### Data preparation

Dataset loading, validation splitting code for CIFAR10.

In [3]:
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),  
    transforms.RandomHorizontalFlip(), 
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

dataset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform_train)
# create a split for train/validation. We can use early stop
trainset, valset = torch.utils.data.random_split(dataset, [40000, 10000])  # train (40000 images) e validation (10000 images)

trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                          shuffle=True, num_workers=2,
                                          drop_last=True)
valloader = torch.utils.data.DataLoader(valset, batch_size=batch_size,
                                          shuffle=False, num_workers=2,
                                          drop_last=False)

testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform_test)
testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                         shuffle=False, num_workers=2,
                                          drop_last=False)

Files already downloaded and verified
Files already downloaded and verified


#### Training and evaluation Functions 

Training, evaluation, and plotting code.

In [4]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score, classification_report

# Function to train a model for a single epoch over the data loader.
def train_epoch(model, dl, opt, criterion, epoch='Unknown', device='cpu'):
    model.train()
    losses = []
    for (xs, ys) in tqdm(dl, desc=f'Training epoch {epoch}', leave=True):
        xs = xs.to(device)
        ys = ys.to(device)
        opt.zero_grad()
        logits = model(xs)
        loss = criterion(logits, ys)
        loss.backward()
        opt.step()   
        losses.append(loss.item())
    # print('Train Loss: {:.6f}'.format(np.mean(losses)))
    wandb.log({'Train Loss': np.mean(losses)})

# Function to evaluate model over all samples in the data loader.
def evaluate_model(model, dl, criterion, device='cpu', val=False):
    model.eval()
    test_loss = 0
    predictions = []
    gts = []
    for (xs, ys) in tqdm(dl, desc='Evaluating', leave=False):
        xs = xs.to(device)
        output = model(xs)
        preds = torch.argmax(output, dim=1)
        test_loss += criterion(output, ys.to(device)).item()
        gts.append(ys)
        predictions.append(preds.detach().cpu().numpy())
    mode = "Val" if val else "Test"
    # print('\{} set: Average loss: {:.4f}, Accuracy: {:.0f}%\n'.format(
    #     mode,
    #     test_loss/len(dl), accuracy_score(np.hstack(gts), np.hstack(predictions))))   
    # Return accuracy score and classification report.
    wandb.log({'Test Loss': test_loss/len(dl), 'Test Accuracy': round(accuracy_score(np.hstack(gts), np.hstack(predictions)), 2)})
    


# Simple function to plot the loss curve and validation accuracy.
def plot_validation_curves(losses_and_accs):
    losses = [x for (x, _) in losses_and_accs]
    accs = [x for (_, x) in losses_and_accs]
    plt.figure(figsize=(16, 8))
    plt.subplot(1, 2, 1)
    plt.plot(losses)
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Average Training Loss per Epoch')
    plt.subplot(1, 2, 2)
    plt.plot(accs)
    plt.xlabel('Epoch')
    plt.ylabel('Validation Accuracy')
    plt.title(f'Best Accuracy = {np.max(accs)} @ epoch {np.argmax(accs)}')
    
def count_trainable_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

### Exercise 1.1: A baseline MLP

Here there is a *simple* Multilayer Perceptron to classify the 10 classes of CIFAR10. Here we exploit the training pipeline above. This training pipeline monitors the loss and accuracy on the training and validation sets for every epoch using weights and biases.

##### Model definition

In [52]:
class Dumb_MLP(nn.Module):
    def __init__(self, dim=64):
        super().__init__()
        self.fc1 = nn.Linear(32*32*3, dim)
        self.fc2 = nn.Linear(dim, dim*2)
        self.fc3 = nn.Linear(dim*2, dim*4)
        self.fc4 = nn.Linear(dim*4, dim*8)
        self.fc5 = nn.Linear(dim*8, 10)
        
    def forward(self, x):
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        x = F.relu(self.fc5(x))
        return x

In [53]:
wandb.init(
    # set the wandb project where this run will be logged
    project="Lab1-DLA",
    name="MLP",
    # track hyperparameters and run metadata
    config={
    "learning_rate": lr,
    "architecture": "MLP",
    "dataset": "CIFAR-10",
    "epochs": epochs,
    }
)

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011288888888884685, max=1.0…

In [54]:
model_mlp = Dumb_MLP().to(device)
optimizer = torch.optim.SGD(model_mlp.parameters(), lr=lr, momentum=momentum, weight_decay=weight_decay)
criterion = nn.CrossEntropyLoss()
wandb.watch(model_mlp, log='all')
print('MLP trainable parameters: ', count_trainable_parameters(model_mlp))

MLP trainable parameters:  374730


In [55]:
for epoch in range(1, epochs+1):
    train_epoch(model_mlp, trainloader, optimizer, criterion, epoch, device=device)
    evaluate_model(model_mlp, valloader, criterion, device=device)
    
wandb.finish()

Training epoch 1: 100%|██████████| 625/625 [00:29<00:00, 21.26it/s]
Training epoch 2: 100%|██████████| 625/625 [00:25<00:00, 24.89it/s]
Training epoch 3: 100%|██████████| 625/625 [00:22<00:00, 27.79it/s]
Training epoch 4: 100%|██████████| 625/625 [00:22<00:00, 28.17it/s]
Training epoch 5: 100%|██████████| 625/625 [00:22<00:00, 28.09it/s]
Training epoch 6: 100%|██████████| 625/625 [00:21<00:00, 29.10it/s]
Training epoch 7: 100%|██████████| 625/625 [00:22<00:00, 27.66it/s]
Training epoch 8: 100%|██████████| 625/625 [00:24<00:00, 25.34it/s]
Training epoch 9: 100%|██████████| 625/625 [00:21<00:00, 29.31it/s]
Training epoch 10: 100%|██████████| 625/625 [00:21<00:00, 28.54it/s]
Training epoch 11: 100%|██████████| 625/625 [00:22<00:00, 28.02it/s]
Training epoch 12: 100%|██████████| 625/625 [00:22<00:00, 28.22it/s]
Training epoch 13: 100%|██████████| 625/625 [00:22<00:00, 28.31it/s]
Training epoch 14: 100%|██████████| 625/625 [00:21<00:00, 28.64it/s]
Training epoch 15: 100%|██████████| 625/625

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Test Accuracy,▁▃▄▄▅▆▆▆▇▇▇▇▇█▇▇█████████
Test Loss,█▆▅▅▄▄▄▃▃▃▃▂▂▂▂▂▂▁▂▂▂▁▁▂▁
Train Loss,█▅▅▄▄▃▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁

0,1
Test Accuracy,0.5
Test Loss,1.38887
Train Loss,1.36913


### Exercise 1.2: Rinse and Repeat

Now we repeat the same experiments with **Convolutional** Neural Networks using the same pipeline as above. The objective is to show that **deeper** CNNs *without* residual connections do not always work better and **even deeper** ones *with* residual connections.
The convolutional neural network is defined from its blocks which can be both Basic Blocks and Bottleneck that are the typical building blocks for ResNets.

![\label{ResBlocks}](images/img.png)

The `skip`  parameter allows the skip connections. In this way we can compare the same architecture with or without residual connections.

#### Basic code for 3x3 and 1x1 convolutions

In [5]:
def conv3x3(in_planes: int, out_planes: int, stride: int = 1, groups: int = 1, dilation: int = 1) -> nn.Conv2d:
    """3x3 convolution with padding"""
    return nn.Conv2d(
        in_planes,
        out_planes,
        kernel_size=3,
        stride=stride,
        padding=dilation,
        groups=groups,
        bias=False,
        dilation=dilation,
    )


def conv1x1(in_planes: int, out_planes: int, stride: int = 1) -> nn.Conv2d:
    """1x1 convolution"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)

#### Convolutional Blocks

In [23]:
class BasicBlock(nn.Module):
    expansion: int = 1

    def __init__(
        self,
        inplanes: int,
        planes: int,
        stride: int = 1,
        downsample: Optional[nn.Module] = None,
        groups: int = 1,
        base_width: int = 64,
        dilation: int = 1,
        norm_layer: Optional[Callable[..., nn.Module]] = None,
        skip: bool = False
    ) -> None:
        super().__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        if groups != 1 or base_width != 64:
            raise ValueError("BasicBlock only supports groups=1 and base_width=64")
        if dilation > 1:
            raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
        # Both self.conv1 and self.downsample layers downsample the input when stride != 1
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = norm_layer(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = norm_layer(planes)
        self.skip = skip
        self.downsample = None
        if self.skip:
            self.downsample = downsample
        self.stride = stride

    def forward(self, x: Tensor) -> Tensor:
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        
        if self.skip:
            if self.downsample is not None:
                identity = self.downsample(x)
            out += identity
        
        out = self.relu(out)

        return out

#### Main model backbone
This module allows to choose the number of blocks of type [BasicBlock, Bottleneck] per layer.

In [32]:
class ConvNet(nn.Module):
    def __init__(
        self,
        layers: List[int],
        num_classes: int = 10,
        zero_init_residual: bool = False,
        groups: int = 1,
        width_per_group: int = 64,
        skip: bool = False,
        replace_stride_with_dilation: Optional[List[bool]] = None,
        norm_layer: Optional[Callable[..., nn.Module]] = None,
        
    ) -> None:
        super().__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        self._norm_layer = norm_layer

        self.inplanes = 64
        self.dilation = 1
        if replace_stride_with_dilation is None:
            # each element in the tuple indicates if we should replace
            # the 2x2 stride with a dilated convolution instead
            replace_stride_with_dilation = [False, False, False]
        if len(replace_stride_with_dilation) != 3:
            raise ValueError(
                "replace_stride_with_dilation should be None "
                f"or a 3-element tuple, got {replace_stride_with_dilation}"
            )
        self.groups = groups
        self.base_width = width_per_group
        self.skip = skip
        self.stem = nn.Sequential(
            nn.Conv2d(3, self.inplanes, kernel_size=3, stride=1, padding=1, bias=False),
            norm_layer(self.inplanes),
            nn.ReLU(inplace=True))
        self.layer1 = self._make_layer(24, layers[0])
        self.layer2 = self._make_layer(48, layers[1], stride=2, dilate=replace_stride_with_dilation[0])
        self.layer3 = self._make_layer(96, layers[2], stride=2, dilate=replace_stride_with_dilation[1])
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(96 * BasicBlock.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        # Zero-initialize the last BN in each residual branch,
        # so that the residual branch starts with zeros, and each residual block behaves like an identity.
        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
        if zero_init_residual:
            for m in self.modules():
                if isinstance(m, BasicBlock) and m.bn2.weight is not None:
                    nn.init.constant_(m.bn2.weight, 0)  # type: ignore[arg-type]

    def _make_layer(
        self,
        planes: int,
        blocks: int,
        stride: int = 1,
        dilate: bool = False,
    ) -> nn.Sequential:
        norm_layer = self._norm_layer
        downsample = None
        previous_dilation = self.dilation
        if dilate:
            self.dilation *= stride
            stride = 1
        if stride != 1 or self.inplanes != planes * BasicBlock.expansion:
            downsample = nn.Sequential(
                conv1x1(self.inplanes, planes * BasicBlock.expansion, stride),
                norm_layer(planes * BasicBlock.expansion),
            )

        layers = []
        layers.append(
            BasicBlock(
                self.inplanes, planes, stride, downsample, self.groups, self.base_width, previous_dilation, norm_layer, self.skip
            )
        )
        self.inplanes = planes * BasicBlock.expansion
        for _ in range(1, blocks):
            layers.append(
                BasicBlock(
                    self.inplanes,
                    planes,
                    groups=self.groups,
                    base_width=self.base_width,
                    dilation=self.dilation,
                    norm_layer=norm_layer,
                    skip=self.skip
                )
            )

        return nn.Sequential(*layers)

    def _forward_impl(self, x: Tensor) -> Tensor:
        # See note [TorchScript super()]
        x = self.stem(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        # x = self.layer4(x)

        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)

        return x

    def forward(self, x: Tensor) -> Tensor:
        return self._forward_impl(x)

In [33]:
def _convnet(
    layers: List[int],
    num_classes: int,
    groups: int = 1,
    width_per_group: int = 64,
    skip: bool = False,
    **kwargs: Any,
) -> ConvNet:

    model = ConvNet(layers, num_classes, False, groups, width_per_group, skip, **kwargs)

    return model

#### Models
Here we define the models for our experiments.

In [34]:
def small_convnet(num_classes) -> ConvNet:
    return _convnet([1, 1, 1], num_classes, groups=1, width_per_group=64, skip=False)
def small_resnet(num_classes) -> ConvNet:
    return _convnet([1, 2, 2], num_classes, groups=1, width_per_group=64, skip=True)

In [35]:
convnet = small_convnet(10)
print(convnet)
print('ConvNet trainable parameters: ', count_trainable_parameters(convnet))

ConvNet(
  (stem): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
  )
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(24, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (layer2): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(24, 48, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(48, 48, kernel_s

In [38]:
resnet = small_resnet(10)
print(resnet)

ConvNet(
  (stem): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
  )
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(24, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (downsample): Sequential(
        (0): Conv2d(64, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (1): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
  )
  (layer2): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(24, 48, kernel_size=(3, 3),

In [30]:
wandb.init(
    # set the wandb project where this run will be logged
    project="Lab1-DLA",
    name='CNN',
    # track hyperparameters and run metadata
    config={
    "learning_rate": lr,
    "architecture": "CNN",
    "dataset": "CIFAR-10",
    "epochs": epochs,
    }
)

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01127777777777131, max=1.0)…

In [37]:
convnet = small_convnet(10).to(device)
optimizer = torch.optim.SGD(convnet.parameters(), lr=lr, momentum=momentum, weight_decay=weight_decay)
criterion = nn.CrossEntropyLoss()
wandb.watch(convnet, log='all')

for epoch in range(1, epochs+1):
    train_epoch(convnet, trainloader, optimizer, criterion, epoch, device=device)
    evaluate_model(convnet, valloader, criterion, device=device)
    
wandb.finish()

Training epoch 1: 100%|██████████| 625/625 [00:22<00:00, 27.20it/s]
Training epoch 2: 100%|██████████| 625/625 [00:21<00:00, 28.85it/s]
Training epoch 3: 100%|██████████| 625/625 [00:21<00:00, 28.51it/s]
Training epoch 4: 100%|██████████| 625/625 [00:28<00:00, 21.86it/s]
Training epoch 5: 100%|██████████| 625/625 [00:25<00:00, 24.95it/s]
Training epoch 6: 100%|██████████| 625/625 [00:25<00:00, 24.04it/s]
Training epoch 7: 100%|██████████| 625/625 [00:24<00:00, 25.28it/s]
Training epoch 8: 100%|██████████| 625/625 [00:24<00:00, 25.77it/s]
Training epoch 9: 100%|██████████| 625/625 [00:25<00:00, 24.58it/s]
Training epoch 10: 100%|██████████| 625/625 [00:23<00:00, 26.33it/s]
Training epoch 11: 100%|██████████| 625/625 [00:24<00:00, 25.17it/s]
Training epoch 12: 100%|██████████| 625/625 [00:22<00:00, 27.50it/s]
Training epoch 13: 100%|██████████| 625/625 [00:22<00:00, 27.55it/s]
Training epoch 14: 100%|██████████| 625/625 [00:22<00:00, 27.24it/s]
Training epoch 15: 100%|██████████| 625/625

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Test Accuracy,▁▃▄▃▅▅▆▆▆▆▇▇▇▇▇▇█▇█████▇█
Test Loss,█▆▅▆▄▄▃▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▂▁
Train Loss,█▆▅▄▄▃▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁

0,1
Test Accuracy,0.82
Test Loss,0.53215
Train Loss,0.4579


In [40]:
wandb.init(
    # set the wandb project where this run will be logged
    project="Lab1-DLA",
    name='ResNet',
    # track hyperparameters and run metadata
    config={
    "learning_rate": lr,
    "architecture": "ResNet",
    "dataset": "CIFAR-10",
    "epochs": epochs,
    }
)

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01127777777777131, max=1.0)…

In [41]:
resnet = small_resnet(10).to(device)
optimizer = torch.optim.SGD(resnet.parameters(), lr=lr, momentum=momentum, weight_decay=weight_decay)
wandb.watch(resnet, log='all')

for epoch in range(1, epochs+1):
    train_epoch(resnet, trainloader, optimizer, criterion, epoch, device=device)
    evaluate_model(resnet, valloader, criterion, device=device)
    
wandb.finish()

Training epoch 1: 100%|██████████| 625/625 [00:26<00:00, 23.37it/s]
Training epoch 2: 100%|██████████| 625/625 [00:28<00:00, 21.68it/s]
Training epoch 3: 100%|██████████| 625/625 [00:27<00:00, 22.43it/s]
Training epoch 4: 100%|██████████| 625/625 [00:30<00:00, 20.26it/s]
Training epoch 5: 100%|██████████| 625/625 [00:27<00:00, 22.59it/s]
Training epoch 6: 100%|██████████| 625/625 [00:27<00:00, 22.99it/s]
Training epoch 7: 100%|██████████| 625/625 [00:28<00:00, 21.75it/s]
Training epoch 8: 100%|██████████| 625/625 [00:28<00:00, 22.22it/s]
Training epoch 9: 100%|██████████| 625/625 [00:27<00:00, 22.49it/s]
Training epoch 10: 100%|██████████| 625/625 [00:27<00:00, 22.99it/s]
Training epoch 11: 100%|██████████| 625/625 [00:28<00:00, 21.72it/s]
Training epoch 12: 100%|██████████| 625/625 [00:27<00:00, 22.80it/s]
Training epoch 13: 100%|██████████| 625/625 [00:26<00:00, 23.44it/s]
Training epoch 14: 100%|██████████| 625/625 [00:26<00:00, 23.16it/s]
Training epoch 15: 100%|██████████| 625/625

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Test Accuracy,▁▃▄▅▆▆▆▆▇▇▇▇▇▇▇▇█▇▇▇▇████
Test Loss,█▆▅▄▃▃▃▃▂▃▂▂▂▂▂▂▁▂▂▂▁▂▁▁▁
Train Loss,█▆▅▄▄▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁

0,1
Test Accuracy,0.86
Test Loss,0.4149
Train Loss,0.32385


## Results
The figure below shows the Validation accuracy during training of ResNet, ConvNet and MLP. The performances of the MLP are much worse than the other two models. The ResNet which is deeper offers a gain in performance of 4 percentage points at the end of training. 
<img src="images/Val_acc.png" alt="drawing" width="1200"/>
