# PyTorch Tutorial 15 - Transfer Learning

In [6]:
# Image Folder
# Scheduler
# Transfer Learning

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os
import copy

In [9]:
# Single definition of the compute device: the first CUDA GPU when one is
# available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Per-channel mean / std handed to transforms.Normalize below (the values the
# ImageNet-pretrained torchvision models are documented to expect). imshow()
# reuses them to undo the normalization before plotting.
mean = np.array([0.485, 0.456, 0.406])
std = np.array([0.229, 0.224, 0.225])


data_transforms = {
    # Training: random crop and horizontal flip, then tensor + normalize.
    'train': transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean, std)
    ]),
    # Validation: fixed resize and center crop, then tensor + normalize.
    'val': transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean, std)
    ]),
}


# The dataset root is expected to contain one sub-folder per phase
# ('train' and 'val'), each read with ImageFolder.
data_dir = 'data/hymenoptera_data'
phases = ['train', 'val']

image_datasets = {
    phase: datasets.ImageFolder(os.path.join(data_dir, phase),
                                data_transforms[phase])
    for phase in phases
}
dataloaders = {
    phase: torch.utils.data.DataLoader(image_datasets[phase], batch_size=4,
                                       shuffle=True, num_workers=0)
    for phase in phases
}
dataset_sizes = {phase: len(image_datasets[phase]) for phase in phases}
class_names = image_datasets['train'].classes

print(class_names)


['ants', 'bees']


In [10]:
def imshow(inp, title=None):
    """Display a normalized image tensor with matplotlib.

    Args:
        inp: Image tensor in channels-first layout; it is transposed with
            ``(1, 2, 0)`` so the channel axis comes last for plotting.
        title: Optional figure title. No title is set when it is ``None``.
    """
    # detach()/cpu() let the helper accept tensors that live on the GPU or
    # still track gradients; a plain CPU tensor is unaffected.
    inp = inp.detach().cpu().numpy().transpose((1, 2, 0))
    # Undo transforms.Normalize(mean, std) using the module-level statistics.
    inp = std * inp + mean
    # Clamp to [0, 1] so matplotlib receives values in its float image range.
    inp = np.clip(inp, 0, 1)
    plt.imshow(inp)
    if title is not None:
        plt.title(title)
    plt.show()


# Pull one mini-batch (images and their label indices) from the training loader.
train_batch = next(iter(dataloaders['train']))
inputs, classes = train_batch

# Tile the images of that batch into a single grid image.
out = torchvision.utils.make_grid(inputs)

# imshow(out, title=[class_names[x] for x in classes])

In [11]:
def train_model(model, criterion, optimizer, scheduler, num_epochs=25,
                data_loaders=None, target_device=None):
    """Train ``model`` and return it loaded with its best validation weights.

    Every epoch runs a 'train' phase (gradients enabled, optimizer stepped
    per batch, scheduler stepped once at the end of the phase) followed by a
    'val' phase (gradients disabled). The weights of the epoch with the
    highest validation accuracy are restored before returning.

    Args:
        model: Network to train; batches are moved to ``target_device``, so
            the model is expected to already live there.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: Optimizer updating the trainable parameters.
        scheduler: Learning-rate scheduler, stepped once per epoch.
        num_epochs: Number of epochs to run.
        data_loaders: Optional mapping with 'train' and 'val' iterables of
            ``(inputs, labels)`` batches. Defaults to the module-level
            ``dataloaders``.
        target_device: Optional device for the batches. Defaults to the
            module-level ``device``.

    Returns:
        The same ``model`` object, holding the best validation weights.
    """
    # Fall back to the notebook-level objects so existing calls keep working.
    if data_loaders is None:
        data_loaders = dataloaders
    if target_device is None:
        target_device = device

    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            is_train = phase == 'train'
            # train(True) is training mode, train(False) is evaluate mode.
            model.train(is_train)

            running_loss = 0.0
            running_corrects = 0
            samples_seen = 0

            # Iterate over data.
            for inputs, labels in data_loaders[phase]:
                inputs = inputs.to(target_device)
                labels = labels.to(target_device)

                # forward; track history only in the training phase
                with torch.set_grad_enabled(is_train):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if is_train:
                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()

                # statistics
                # NOTE(review): scaling by the batch size assumes the
                # criterion returns a per-batch mean - confirm for custom
                # losses.
                batch_size = inputs.size(0)
                running_loss += loss.item() * batch_size
                running_corrects += torch.sum(preds == labels).item()
                samples_seen += batch_size

            if is_train:
                scheduler.step()

            # Normalise by the samples actually iterated; max() avoids a
            # ZeroDivisionError when a loader yields no batches.
            denom = max(samples_seen, 1)
            epoch_loss = running_loss / denom
            epoch_acc = running_corrects / denom

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:.4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

# 1st option


Overall, this transfer learning approach leverages the features learned by a pretrained ResNet-18 model and fine-tunes them for the specific binary classification task. By replacing the final fully connected layer, updating all of the network's parameters, and using a step-wise learning rate scheduler, it aims to efficiently train the model while preventing overfitting and ensuring convergence.

In [13]:
#### Finetuning the convnet ####
# Load a pretrained model and reset final fully connected layer.

# NOTE(review): newer torchvision releases deprecate `pretrained=True` in
# favour of the `weights=` argument - confirm against the installed version.
model = models.resnet18(pretrained=True)

# Replace the final fully connected layer (model.fc) to adapt the network to
# this classification task. The new layer keeps the original number of input
# features (num_ftrs) and outputs one score per class (2 here).
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, len(class_names))

model = model.to(device)

criterion = nn.CrossEntropyLoss()

# Observe that all parameters are being optimized
optimizer = optim.SGD(model.parameters(), lr=0.001)

# StepLR multiplies the learning rate of each parameter group by gamma every
# step_size epochs. With step_size=7 and gamma=0.1 the learning rate drops to
# one tenth of its value every 7 epochs (0.001 -> 0.0001 after 7 epochs).
step_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

# Learning rate scheduling should be applied after optimizer's update
# e.g., you should write your code this way:
# for epoch in range(100):
#     train(...)
#     validate(...)
#     scheduler.step()


model = train_model(model, criterion, optimizer, step_lr_scheduler, num_epochs=2)


Epoch 0/1
----------
train Loss: 0.6024 Acc: 0.7131
val Loss: 0.4225 Acc: 0.8758

Epoch 1/1
----------
train Loss: 0.5311 Acc: 0.7623
val Loss: 0.3124 Acc: 0.9281

Training complete in 2m 50s
Best val Acc: 0.928105


# 2nd option

* Freezing Pretrained Layers:

Unlike the first approach, where the entire model was fine-tuned, in this approach, we freeze all the layers of the pretrained model except for the final fully connected layer.
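The code first iterates over all parameters of the model (model_conv.parameters()) and sets requires_grad = False for each of them. The final fully connected layer (model_conv.fc) is then replaced by a newly constructed layer, whose parameters have requires_grad = True by default, so it is the only part of the network that remains trainable.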
Freezing the pretrained layers prevents their parameters from being updated during training, effectively treating them as fixed feature extractors.



* Replacing Final Fully Connected Layer:

It replaces the final fully connected layer (model_conv.fc) with a new one suitable for the specific classification task. The new layer keeps the same number of input features (num_ftrs) as the layer it replaces and outputs one score per class.
By replacing only the final layer, the model retains the ability to extract features from the input images while adapting the classification layer to the new task.


* Training the Model:

Finally, the model is trained (train_model function) for a specified number of epochs (num_epochs). During training, only the parameters of the final fully connected layer are updated, while the parameters of the pretrained layers remain fixed.
This approach allows the model to leverage the features learned by the pretrained layers while adapting the final layer to the specific classification task, making it computationally efficient and potentially less prone to overfitting.

In [14]:
#### ConvNet as fixed feature extractor ####
# Here, we need to freeze all the network except the final layer.
# We set requires_grad = False to freeze the parameters so that the gradients
# are not computed in backward()
# NOTE(review): newer torchvision releases deprecate `pretrained=True` in
# favour of the `weights=` argument - confirm against the installed version.
model_conv = torchvision.models.resnet18(pretrained=True)
for param in model_conv.parameters():
    param.requires_grad = False

# Parameters of newly constructed modules have requires_grad=True by default,
# so the replacement head below is the only trainable part of the network.
num_ftrs = model_conv.fc.in_features
model_conv.fc = nn.Linear(num_ftrs, len(class_names))

model_conv = model_conv.to(device)

criterion = nn.CrossEntropyLoss()

# Observe that only parameters of final layer are being optimized as
# opposed to before.
optimizer_conv = optim.SGD(model_conv.fc.parameters(), lr=0.001, momentum=0.9)

# Decay LR by a factor of 0.1 every 7 epochs
# (the variable is named exp_lr_scheduler, but it is a StepLR schedule)
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_conv, step_size=7, gamma=0.1)

model_conv = train_model(model_conv, criterion, optimizer_conv,
                         exp_lr_scheduler, num_epochs=2)

Epoch 0/1
----------
train Loss: 0.6795 Acc: 0.6475
val Loss: 0.3155 Acc: 0.8627

Epoch 1/1
----------
train Loss: 0.4967 Acc: 0.7623
val Loss: 0.2262 Acc: 0.9085

Training complete in 1m 47s
Best val Acc: 0.908497


# Transfer Learning Approach 1: Fine-tuning the ConvNet 
### Updating both: Conv + fc (fully connected layer)

* Description: 
In this approach, the entire pretrained model is fine-tuned by updating all of its parameters, including both the pretrained layers and the final fully connected layer.
* Similarities:
Both approaches use a pretrained ResNet-18 model as the basis for feature extraction.
They both involve replacing the final fully connected layer to adapt the model to the specific classification task.
Both approaches utilize a step-wise learning rate scheduler to gradually decrease the learning rate during training.
* Differences:
* * Fine-tuning: This approach fine-tunes all layers of the pretrained model, allowing the model to learn task-specific features as well as retaining previously learned features.
* * Parameter Updates: All parameters of the model, including both pretrained layers and the final fully connected layer, are updated during training.
* * Training Time: Fine-tuning the entire model may require more computational resources and training time compared to freezing pretrained layers.
* Pros:

* * Flexibility: Fine-tuning allows the model to adapt to the specific characteristics of the new dataset, potentially leading to improved performance.
* * Better Feature Learning: By updating all parameters, the model can learn task-specific features that may not be present in the original pretrained model.
* * Potentially Higher Accuracy: Fine-tuning the entire model may lead to higher accuracy compared to freezing pretrained layers, especially when the new dataset is significantly different from the original dataset used for pretraining.
* Cons:

* * Computational Cost: Fine-tuning the entire model requires more computational resources and training time compared to freezing pretrained layers.
* * Risk of Overfitting: Fine-tuning may increase the risk of overfitting, especially if the new dataset is small or if the model is trained for too many epochs.

*************************

# Transfer Learning Approach 2: ConvNet as a Fixed Feature Extractor
### Updating only fc (fully connected layer) but not Convolutional layer

* Description: In this approach, only the final fully connected layer of the pretrained model is updated, while the parameters of the pretrained layers are frozen.
* Similarities:
Both approaches involve using a pretrained ResNet-18 model and replacing the final fully connected layer.
They both utilize a step-wise learning rate scheduler to adjust the learning rate during training.

* Differences:
* * Freezing Pretrained Layers: In this approach, the parameters of the pretrained layers are frozen, preventing them from being updated during training.
* * Parameter Updates: Only the parameters of the final fully connected layer are updated during training, allowing the model to adapt to the new classification task while retaining the features learned by the pretrained layers.
* * Reduced Training Time: Freezing pretrained layers reduces the computational cost and training time compared to fine-tuning the entire model.
* Pros:

* * Computational Efficiency: Freezing pretrained layers reduces the computational cost and training time compared to fine-tuning the entire model.
* * Reduced Risk of Overfitting: By keeping the pretrained layers fixed, there is a reduced risk of overfitting, especially when the new dataset is small.
* * Preservation of Learned Features: Freezing pretrained layers allows the model to retain the features learned during pretraining, potentially leading to better generalization.
* Cons:

* * Less Flexibility: By keeping the pretrained layers fixed, the model may not adapt as well to the specific characteristics of the new dataset compared to fine-tuning the entire model.
* * Limited Feature Learning: The model may be limited to the features learned by the pretrained layers and may not learn task-specific features as effectively as fine-tuning the entire model.

************************
In summary, the choice between the two transfer learning approaches depends on factors such as the size of the new dataset, computational resources available, and the desired balance between performance and computational cost. Fine-tuning the entire model offers more flexibility and potentially higher accuracy but requires more computational resources, while freezing pretrained layers is computationally efficient and reduces the risk of overfitting but may limit the adaptability of the model.







In [None]:
# if we want to apply more layers, we can do it as below

# Freeze pretrained layers
for param in model.parameters():
    param.requires_grad = False

# One output per class; derived from the dataset instead of relying on an
# undefined name.
num_classes = len(class_names)

# Add more layers at the end. The layers created here are new modules, so
# their parameters are trainable even though the backbone above is frozen.
num_ftrs = model.fc.in_features
model.fc = nn.Sequential(
    nn.Linear(num_ftrs, 512),
    nn.ReLU(),
    nn.Dropout(0.5),
    nn.Linear(512, 256),
    nn.ReLU(),
    nn.Dropout(0.5),
    nn.Linear(256, num_classes)  # num_classes is the number of output classes
)

# The new head is created on the CPU; move it to the same device as the rest
# of the model. An optimizer for this model should be built after this point,
# e.g. over model.fc.parameters().
model = model.to(device)
