In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import shutil
from collections import OrderedDict

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import transforms, datasets
from torchsummary import summary
from torch.utils.data import Dataset, DataLoader, random_split



#### Lambda Layer
- **Purpose:** This layer allows you to define custom operations or transformations within the network. This is useful for operations that aren't standard layers, like identity mapping with zero-padding.


In [3]:


class LambdaLayer(nn.Module):
    """
    This class defines a Lambda Layer, which applies a custom function to the input tensor.

    Attributes:
        lambd (function): A function that defines the operation to be applied to the input tensor.
    """

    def __init__(self, lambd):
        """
        Initialize the Lambda Layer.

        Args:
            lambd (function): The function that defines the operation to be applied to the input tensor.
        """
        super(LambdaLayer, self).__init__()
        self.lambd = lambd

    def forward(self, x):
        """
        Forward pass for the Lambda Layer.

        Args:
            x (torch.Tensor): The input tensor.

        Returns:
            torch.Tensor: The output tensor after applying the custom function.
        """
        return self.lambd(x)



  
#### Basic Convolution Block
- **Components:**
  1. **Convolutional Layers:** Two convolutional layers that reduce and then restore the spatial dimensions of the input tensor.
  2. **Batch Normalization:** Applied after each convolutional layer to stabilize the learning process.
  3. **ReLU Activation:** Non-linear activation function that introduces non-linearity into the model.
  4. **Shortcut Connection:** Ensures that the input and output dimensions match. It adds the input directly to the output of the block, which helps in training deeper networks by mitigating the vanishing gradient problem.
  
- **Shortcut Connection Options:**
  - **Option A:** Uses an identity shortcut with zero-padding to increase the channel dimensions when required.
  - **Option B:** Uses a 1x1 convolution to adjust the dimensions directly (projection shortcut). This is more computationally expensive but often more effective in deeper networks.

This implementation ensures that the model is flexible and can handle different cases of input-output dimension mismatches while maintaining computational efficiency.

In [4]:
class BasicConvBlock(nn.Module):
    """
    This class defines a Basic Convolution Block for ResNet.

    The block consists of two convolutional layers followed by Batch Normalization and ReLU activation.
    Additionally, a shortcut connection is added to ensure the input and output dimensions match.

    Attributes:
        features (nn.Sequential): The sequence of convolutional layers, Batch Normalization, and ReLU activations.
        shortcut (nn.Sequential or LambdaLayer): The shortcut connection to match the dimensions.
    """

    def __init__(self, in_channels, out_channels, stride=1, option='A'):
        """
        Initialize the Basic Convolution Block.

        Args:
            in_channels (int): Number of input channels.
            out_channels (int): Number of output channels.
            stride (int, optional): Stride for the convolution operation. Default is 1.
            option (str, optional): Option for the shortcut connection to match dimensions. 'A' or 'B'. Default is 'A'.
        """
        super(BasicConvBlock, self).__init__()

        # Define the sequence of convolutional layers, Batch Normalization, and ReLU activation.
        self.features = nn.Sequential(OrderedDict([
            ('conv1', nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)),
            ('bn1', nn.BatchNorm2d(out_channels)),
            ('relu1', nn.ReLU(inplace=True)),
            ('conv2', nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False)),
            ('bn2', nn.BatchNorm2d(out_channels))
        ]))

        # Initialize an empty shortcut connection by default.
        self.shortcut = nn.Sequential()

        # If the input and output dimensions do not match, apply the appropriate shortcut connection.
        if stride != 1 or in_channels != out_channels:
            if option == 'A':
                # Option A: Use identity shortcuts with zero padding to match dimensions.
                pad_to_add = (out_channels - in_channels) // 2
                self.shortcut = LambdaLayer(lambda x: F.pad(
                    x[:, :, ::stride, ::stride],
                    (0, 0, 0, 0, pad_to_add, pad_to_add, 0, 0)
                ))
            elif option == 'B':
                # Option B: Use 1x1 convolution to match dimensions (projection shortcut).
                self.shortcut = nn.Sequential(OrderedDict([
                    ('s_conv', nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False)),
                    ('s_bn', nn.BatchNorm2d(out_channels))
                ]))

    def forward(self, x):
        """
        Forward pass for the Basic Convolution Block.

        Args:
            x (torch.Tensor): The input tensor.

        Returns:
            torch.Tensor: The output tensor after applying the convolutional layers and the shortcut connection.
        """
        out = self.features(x)
        out += self.shortcut(x)  # Add the shortcut connection output to the features output.
        out = F.relu(out)  # Apply ReLU activation to the combined output.
        return out


### Understanding `F.pad` and the ResNet Code

The `F.pad` function in PyTorch is used to apply padding to a tensor. In the context of a ResNet architecture, padding is often applied to match the dimensions of the input and output tensors when using shortcut connections (also known as skip connections).

Let's break down the line of code and understand how `F.pad` works in this context:

```python
F.pad(x[:, :, ::2, ::2], (0,0, 0,0, pad,pad, 0,0))
```

### 1. **Input Tensor**
The input tensor `x` is assumed to be a 4D tensor with dimensions `[batch_size, channels, height, width]`, typical for image data.

- `x[:, :, ::2, ::2]`:
  - This slicing operation down-samples the tensor by taking every second element in both the height and width dimensions (`::2`).
  - The result is a tensor with reduced spatial dimensions, effectively down-sampling the image by a factor of 2.

### 2. **Understanding the Padding in `F.pad`**
`F.pad` requires the padding amounts for each dimension of the tensor. The padding values are specified in reverse order, with two values for each dimension (one for padding at the beginning and one for padding at the end).

- **Padding Values**: `(0, 0, 0, 0, pad, pad, 0, 0)`
  - The padding is applied as follows:
    - **(0, 0)**: No padding is applied to the last dimension (batch dimension).
    - **(pad, pad)**: Padding is applied to the channel dimension. `pad` is added at both the beginning and end of the channels dimension.
    - **(0, 0)**: No padding is applied to the spatial dimensions (height and width).

### 3. **Padding Explained**
The specific padding order corresponds to the dimensions in the tensor:

- **Width (Spatial dimension)**: `(0, 0)` means no padding is applied to the width.
- **Height (Spatial dimension)**: `(0, 0)` means no padding is applied to the height.
- **Channels**: `(pad, pad)` adds `pad` number of channels at both the beginning and end of the channels dimension.
- **Batch**: `(0, 0)` means no padding is applied to the batch dimension.

### 4. **Example of `F.pad` in Context**
Assume we have an input tensor `x` with shape `[batch_size, 64, 16, 16]` (i.e., batch size of `batch_size`, 64 channels, and spatial dimensions 16x16).

- After applying `x[:, :, ::2, ::2]`, the tensor would have dimensions `[batch_size, 64, 8, 8]`, reducing the height and width by half.
- Now, if `pad = 32`, the `F.pad` function would add 32 channels on both sides of the channel dimension.
- The resulting tensor would have dimensions `[batch_size, 128, 8, 8]`, where 128 is the result of 64 original channels plus 32 padded channels at the beginning and 32 padded channels at the end.

### 5. **Use Case in ResNet**
In the context of a ResNet, if the input and output channels differ, padding is necessary to match the dimensions so that the input can be added directly to the output of the residual block.

- **Option A (Identity Shortcut with Zero Padding)**: The channel dimension is increased by adding zero-padding, ensuring that the input and output tensors have the same number of channels.
- **Option B (1x1 Convolution Shortcut)**: Instead of padding, a 1x1 convolution is used to project the input tensor to the desired number of channels.

This padding ensures that when the input tensor is added to the output of the residual block, the dimensions align correctly, allowing for the addition operation to take place without errors.


- **`F.pad`**: Pads the tensor to ensure the input and output dimensions match in the ResNet architecture.
- **Zero Padding**: Adds zeros to increase the number of channels.
- **Slicing (`::2`)**: Down-samples the spatial dimensions by a factor of 2.
- **Use in ResNet**: Facilitates the shortcut connection when the number of channels differs between the input and output.



In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import OrderedDict

class ResNet(nn.Module):
    """
    ResNet-56 architecture for CIFAR-10 Dataset of shape 32x32x3.

    Args:
        block_type (nn.Module): The type of residual block to use.
        num_blocks (list): List containing the number of blocks for each layer.
        num_classes (int, optional): Number of output classes. Default is 10.

    Attributes:
        in_channels (int): Number of input channels.
        conv0 (nn.Conv2d): Initial convolutional layer.
        bn0 (nn.BatchNorm2d): Batch normalization layer.
        block1 (nn.Sequential): First block layer.
        block2 (nn.Sequential): Second block layer.
        block3 (nn.Sequential): Third block layer.
        avgpool (nn.AdaptiveAvgPool2d): Adaptive average pooling layer.
        linear (nn.Linear): Linear layer for classification.
    """
    def __init__(self, block_type, num_blocks, num_classes=10):
        super(ResNet, self).__init__()

        self.in_channels = 16  # Initial number of channels

        # Initial convolutional and batch normalization layers
        self.conv0 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn0 = nn.BatchNorm2d(16)

        # Residual blocks for each layer
        self.block1 = self.__build_layer(block_type, 16, num_blocks[0], starting_stride=1)
        self.block2 = self.__build_layer(block_type, 32, num_blocks[1], starting_stride=2)
        self.block3 = self.__build_layer(block_type, 64, num_blocks[2], starting_stride=2)

        # Adaptive average pooling and linear layer for classification
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.linear = nn.Linear(64, num_classes)

        # Initialize weights
        self._initialize_weights()

    def __build_layer(self, block_type, out_channels, num_blocks, starting_stride):
        """
        Build a layer consisting of multiple residual blocks.

        Args:
            block_type (nn.Module): The type of residual block to use.
            out_channels (int): Number of output channels.
            num_blocks (int): Number of blocks in the layer.
            starting_stride (int): Stride value for the first block.

        Returns:
            nn.Sequential: Sequential container of the residual blocks.
        """
        strides_list = [starting_stride] + [1] * (num_blocks - 1)
        layers = []

        for stride in strides_list:
            layers.append(block_type(self.in_channels, out_channels, stride))
            self.in_channels = out_channels  # Update in_channels for the next block

        return nn.Sequential(*layers)

    def forward(self, x):
        """
        Forward pass of the ResNet model.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            torch.Tensor: Output tensor.
        """
        # Initial convolution and normalization
        out = F.relu(self.bn0(self.conv0(x)))

        # Pass through residual blocks
        out = self.block1(out)
        out = self.block2(out)
        out = self.block3(out)

        # Global average pooling and final classification layer
        out = self.avgpool(out)
        out = torch.flatten(out, 1)
        out = self.linear(out)

        return out

    def _initialize_weights(self):
        """
        Initialize the weights of the model using He initialization.
        """
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.constant_(m.bias, 0)


In [6]:
def ResNet56():
    return ResNet(block_type=BasicConvBlock, num_blocks=[9,9,9])

model = ResNet56()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
summary(model, (3, 32, 32))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 16, 32, 32]             432
       BatchNorm2d-2           [-1, 16, 32, 32]              32
            Conv2d-3           [-1, 16, 32, 32]           2,304
       BatchNorm2d-4           [-1, 16, 32, 32]              32
              ReLU-5           [-1, 16, 32, 32]               0
            Conv2d-6           [-1, 16, 32, 32]           2,304
       BatchNorm2d-7           [-1, 16, 32, 32]              32
    BasicConvBlock-8           [-1, 16, 32, 32]               0
            Conv2d-9           [-1, 16, 32, 32]           2,304
      BatchNorm2d-10           [-1, 16, 32, 32]              32
             ReLU-11           [-1, 16, 32, 32]               0
           Conv2d-12           [-1, 16, 32, 32]           2,304
      BatchNorm2d-13           [-1, 16, 32, 32]              32
   BasicConvBlock-14           [-1, 16,

# Load the CIFAR 10 Dataset

In [8]:
import torch
from torch.utils.data import DataLoader, random_split
from torchvision import datasets, transforms

def dataloader_cifar():
    """
    Create dataloaders for the CIFAR-10 dataset.

    Returns:
        train_loader (torch.utils.data.DataLoader): Dataloader for the training set.
        val_loader (torch.utils.data.DataLoader): Dataloader for the validation set.
        test_loader (torch.utils.data.DataLoader): Dataloader for the test set.
    """
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
    ])

    # Input Data in Google Drive (Update path if needed)
    train_dataset = datasets.CIFAR10('/content/drive/MyDrive/All_Datasets/CIFAR10', train=True, download=True, transform=transform)
    test_dataset = datasets.CIFAR10('/content/drive/MyDrive/All_Datasets/CIFAR10', train=False, download=True, transform=transform)

    # Split dataset into training set and validation set
    train_dataset, val_dataset = random_split(train_dataset, [45000, 5000])

    print("Image shape of a random sample image : {}".format(train_dataset[0][0].numpy().shape), end='\n\n')
    print("Training Set:   {} images".format(len(train_dataset)))
    print("Validation Set:   {} images".format(len(val_dataset)))
    print("Test Set:       {} images".format(len(test_dataset)))

    BATCH_SIZE = 32

    # Generate dataloaders
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=10000, shuffle=False)

    return train_loader, val_loader, test_loader

train_loader, val_loader, test_loader = dataloader_cifar()


Files already downloaded and verified
Files already downloaded and verified
Image shape of a random sample image : (3, 32, 32)

Training Set:   45000 images
Validation Set:   5000 images
Test Set:       10000 images


Here's a detailed breakdown of your training function, formatted in markdown with explanations on what each part does and why it's important:

---

## Training Function Breakdown

### 1. **Imports and Setup**



- **`criterion`**: Defines the loss function. `nn.CrossEntropyLoss` is commonly used for classification tasks. It combines `nn.LogSoftmax()` and `nn.NLLLoss()` in one single class.
- **`optimizer`**: Sets up the optimizer. `optim.Adam` is a popular optimization algorithm that adjusts the learning rate based on the first and second moments of the gradients.


In [9]:

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)



### 2. **Training Function Definition**


- **`EPOCHS`**: The number of times the entire dataset will pass through the model.
- **`train_samples_num` and `val_samples_num`**: Number of samples in the training and validation sets respectively. These values are used to compute the average loss and accuracy.
- **`train_costs` and `val_costs`**: Lists to store the training and validation losses for each epoch.


---


### 3. **Training Phase**

- **`model.train().cuda()`**: Sets the model to training mode and moves it to GPU (if available).
- **`optimizer.zero_grad()`**: Clears old gradients from the last step.
- **`prediction = model(inputs)`**: Performs the forward pass.
- **`loss.backward()`**: Computes the gradient of the loss with respect to model parameters.
- **`optimizer.step()`**: Updates the model parameters using the computed gradients.
- **`_, predicted_outputs = torch.max(prediction.data, 1)`**: Gets the predicted class labels.
- **`correct_train += (predicted_outputs == labels).float().sum().item()`**: Counts correct predictions.
- **`train_running_loss += (loss.data.item() * inputs.shape[0])`**: Accumulates the loss for averaging later.

---

### 4. **Validation Phase**


- **`model.eval().cuda()`**: Sets the model to evaluation mode and moves it to GPU.
- **`with torch.no_grad()`**: Disables gradient calculation, which saves memory and computations during evaluation.
- **`val_running_loss += (loss.data.item() * inputs.shape[0])`**: Accumulates the validation loss.








In [10]:
def train_model():
    EPOCHS = 15
    train_samples_num = 45000
    val_samples_num = 5000
    train_costs, val_costs = [], []

    #Training phase.
    for epoch in range(EPOCHS):

        train_running_loss = 0
        correct_train = 0

        model.train().cuda()

        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            """ for every mini-batch during the training phase, we typically want to explicitly set the gradients
            to zero before starting to do backpropragation """
            optimizer.zero_grad()

            # Start the forward pass
            prediction = model(inputs)

            loss = criterion(prediction, labels)

            # do backpropagation and update weights with step()
            loss.backward()
            optimizer.step()

            # print('outputs on which to apply torch.max ', prediction)
            # find the maximum along the rows, use dim=1 to torch.max()
            _, predicted_outputs = torch.max(prediction.data, 1)

            # Update the running corrects
            correct_train += (predicted_outputs == labels).float().sum().item()

            ''' Compute batch loss
            multiply each average batch loss with batch-length.
            The batch-length is inputs.size(0) which gives the number total images in each batch.
            Essentially I am un-averaging the previously calculated Loss '''
            train_running_loss += (loss.data.item() * inputs.shape[0])


        train_epoch_loss = train_running_loss / train_samples_num

        train_costs.append(train_epoch_loss)

        train_acc =  correct_train / train_samples_num

        # Now check trained weights on the validation set
        val_running_loss = 0
        correct_val = 0

        model.eval().cuda()

        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)

                # Forward pass.
                prediction = model(inputs)

                # Compute the loss.
                loss = criterion(prediction, labels)

                # Compute validation accuracy.
                _, predicted_outputs = torch.max(prediction.data, 1)
                correct_val += (predicted_outputs == labels).float().sum().item()

            # Compute batch loss.
            val_running_loss += (loss.data.item() * inputs.shape[0])

            val_epoch_loss = val_running_loss / val_samples_num
            val_costs.append(val_epoch_loss)
            val_acc =  correct_val / val_samples_num

        info = "[Epoch {}/{}]: train-loss = {:0.6f} | train-acc = {:0.3f} | val-loss = {:0.6f} | val-acc = {:0.3f}"

        print(info.format(epoch+1, EPOCHS, train_epoch_loss, train_acc, val_epoch_loss, val_acc))

        torch.save(model.state_dict(), '/content/checkpoint_gpu_{}'.format(epoch + 1))

    torch.save(model.state_dict(), '/content/resnet-56_weights_gpu')

    return train_costs, val_costs



### 5. **Logging and Saving the Model**


- **`Logging`**: Prints the training and validation losses and accuracies for each epoch.
- **`torch.save`**: Saves the model weights. Saving the model at each epoch helps to recover from potential interruptions and monitor the model's progress.


In [11]:
# !pwd
train_costs, val_costs = train_model()

[Epoch 1/15]: train-loss = 1.646446 | train-acc = 0.380 | val-loss = 0.001274 | val-acc = 0.512
[Epoch 2/15]: train-loss = 1.103993 | train-acc = 0.603 | val-loss = 0.000800 | val-acc = 0.657
[Epoch 3/15]: train-loss = 0.827500 | train-acc = 0.709 | val-loss = 0.000243 | val-acc = 0.694
[Epoch 4/15]: train-loss = 0.679794 | train-acc = 0.763 | val-loss = 0.000091 | val-acc = 0.770
[Epoch 5/15]: train-loss = 0.574650 | train-acc = 0.801 | val-loss = 0.000653 | val-acc = 0.776
[Epoch 6/15]: train-loss = 0.489664 | train-acc = 0.831 | val-loss = 0.000556 | val-acc = 0.798
[Epoch 7/15]: train-loss = 0.418733 | train-acc = 0.854 | val-loss = 0.000457 | val-acc = 0.809
[Epoch 8/15]: train-loss = 0.352605 | train-acc = 0.877 | val-loss = 0.000038 | val-acc = 0.801
[Epoch 9/15]: train-loss = 0.301473 | train-acc = 0.894 | val-loss = 0.000097 | val-acc = 0.803
[Epoch 10/15]: train-loss = 0.250305 | train-acc = 0.913 | val-loss = 0.000025 | val-acc = 0.810
[Epoch 11/15]: train-loss = 0.215240 | 

In [12]:
#Restore the model.
model = ResNet56()
model.load_state_dict(torch.load('/content/resnet-56_weights_gpu'))

<All keys matched successfully>

# Testing

In [13]:
test_samples_num = 10000
correct = 0

model.eval().cuda()

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        # Make predictions.
        prediction = model(inputs)

        # Retrieve predictions indexes.
        _, predicted_class = torch.max(prediction.data, 1)

        # Compute number of correct predictions.
        correct += (predicted_class == labels).float().sum().item()

test_accuracy = correct / test_samples_num
print('Test accuracy: {}'.format(test_accuracy))


Test accuracy: 0.8046


# Implementation on a Custom Dataset

[Link to Jelly Fish Dataset](https://www.kaggle.com/datasets/anshtanwar/jellyfish-types)

In [11]:
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, random_split

def dataloader_jellyfish():
    """
    Create dataloaders for the Jelly-Fish dataset.

    Returns:
        train_loader (torch.utils.data.DataLoader): Dataloader for the training set.
        val_loader (torch.utils.data.DataLoader): Dataloader for the validation set.
        test_loader (torch.utils.data.DataLoader): Dataloader for the test set.
    """
    # Define transformations for the dataset
    transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
])


    # Load the dataset
    dataset = datasets.ImageFolder('/content/drive/MyDrive/All_Datasets/Jelly-Fish', transform=transform)

    # Split the dataset into training, validation, and test sets
    train_size = int(0.7 * len(dataset))
    val_size = int(0.15 * len(dataset))
    test_size = len(dataset) - train_size - val_size
    train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])

    print(f"Training Set: {len(train_dataset)} images")
    print(f"Validation Set: {len(val_dataset)} images")
    print(f"Test Set: {len(test_dataset)} images")

    BATCH_SIZE = 32

    # Generate dataloaders
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)

    return train_loader, val_loader, test_loader

train_loader, val_loader, test_loader = dataloader_jellyfish()


Training Set: 630 images
Validation Set: 135 images
Test Set: 135 images


In [8]:
def ResNet56(num_classes=6):
    return ResNet(block_type=BasicConvBlock, num_blocks=[9, 9, 9], num_classes=num_classes)


model = ResNet56()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
summary(model, (3, 32, 32))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 16, 32, 32]             432
       BatchNorm2d-2           [-1, 16, 32, 32]              32
            Conv2d-3           [-1, 16, 32, 32]           2,304
       BatchNorm2d-4           [-1, 16, 32, 32]              32
              ReLU-5           [-1, 16, 32, 32]               0
            Conv2d-6           [-1, 16, 32, 32]           2,304
       BatchNorm2d-7           [-1, 16, 32, 32]              32
    BasicConvBlock-8           [-1, 16, 32, 32]               0
            Conv2d-9           [-1, 16, 32, 32]           2,304
      BatchNorm2d-10           [-1, 16, 32, 32]              32
             ReLU-11           [-1, 16, 32, 32]               0
           Conv2d-12           [-1, 16, 32, 32]           2,304
      BatchNorm2d-13           [-1, 16, 32, 32]              32
   BasicConvBlock-14           [-1, 16,

# Training

In [12]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

def train_model():
    EPOCHS = 15
    train_samples_num = 630
    val_samples_num = 135
    train_costs, val_costs = [], []

    for epoch in range(EPOCHS):
        train_running_loss = 0
        correct_train = 0

        model.train().cuda()

        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()

            prediction = model(inputs)
            loss = criterion(prediction, labels)

            loss.backward()
            optimizer.step()

            _, predicted_outputs = torch.max(prediction.data, 1)
            correct_train += (predicted_outputs == labels).float().sum().item()

            train_running_loss += (loss.data.item() * inputs.shape[0])

        train_epoch_loss = train_running_loss / train_samples_num
        train_costs.append(train_epoch_loss)
        train_acc = correct_train / train_samples_num

        val_running_loss = 0
        correct_val = 0

        model.eval().cuda()

        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                prediction = model(inputs)
                loss = criterion(prediction, labels)
                _, predicted_outputs = torch.max(prediction.data, 1)
                correct_val += (predicted_outputs == labels).float().sum().item()

            val_running_loss += (loss.data.item() * inputs.shape[0])
            val_epoch_loss = val_running_loss / val_samples_num
            val_costs.append(val_epoch_loss)
            val_acc = correct_val / val_samples_num

        print(f"[Epoch {epoch + 1}/{EPOCHS}]: train-loss = {train_epoch_loss:.6f} | train-acc = {train_acc:.3f} | val-loss = {val_epoch_loss:.6f} | val-acc = {val_acc:.3f}")

        torch.save(model.state_dict(), f'/content/checkpoint_gpu_{epoch + 1}')

    torch.save(model.state_dict(), '/content/resnet-56_weights_gpu')

    return train_costs, val_costs

train_costs, val_costs = train_model()


[Epoch 1/15]: train-loss = 1.583307 | train-acc = 0.373 | val-loss = 0.074027 | val-acc = 0.393
[Epoch 2/15]: train-loss = 1.480552 | train-acc = 0.417 | val-loss = 0.087986 | val-acc = 0.289
[Epoch 3/15]: train-loss = 1.402477 | train-acc = 0.422 | val-loss = 0.062517 | val-acc = 0.444
[Epoch 4/15]: train-loss = 1.419549 | train-acc = 0.427 | val-loss = 0.054468 | val-acc = 0.407
[Epoch 5/15]: train-loss = 1.359981 | train-acc = 0.424 | val-loss = 0.083270 | val-acc = 0.341
[Epoch 6/15]: train-loss = 1.325599 | train-acc = 0.487 | val-loss = 0.103237 | val-acc = 0.400
[Epoch 7/15]: train-loss = 1.338629 | train-acc = 0.483 | val-loss = 0.076869 | val-acc = 0.378
[Epoch 8/15]: train-loss = 1.366661 | train-acc = 0.446 | val-loss = 0.080729 | val-acc = 0.422
[Epoch 9/15]: train-loss = 1.322707 | train-acc = 0.444 | val-loss = 0.048672 | val-acc = 0.356
[Epoch 10/15]: train-loss = 1.306893 | train-acc = 0.460 | val-loss = 0.083231 | val-acc = 0.400
[Epoch 11/15]: train-loss = 1.282570 | 

In [13]:
def test_model():
    test_samples_num = 135
    correct = 0

    model.eval().cuda()

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            prediction = model(inputs)
            _, predicted_class = torch.max(prediction.data, 1)
            correct += (predicted_class == labels).float().sum().item()

    test_accuracy = correct / test_samples_num
    print(f'Test accuracy: {test_accuracy:.3f}')

test_model()


Test accuracy: 0.444
