# CNN with transfer learning

In this notebook, I'll build a CNN model to detect the American Sign Language (ASL). Then, try to use transfer learning (ResNet-50) to improve the performance.

**Key Words:** PyTorch, CNN, ResNet-50

## Import and Set up

In [21]:
from google.colab import drive
import os
import torch
from torchvision import datasets, transforms, models
from torch.utils.data import DataLoader
from torch import nn
import torch.nn.functional as F
from tqdm import tqdm
import itertools
from sklearn.metrics import roc_auc_score, confusion_matrix
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from PIL import Image

In [6]:
# Global variables
DATASET_PATH = '/content/drive/MyDrive/machine-learning/assignments/midterm-project/midterm-project-individual/ASL-data'
batch_size = 128
torch.manual_seed(936)

<torch._C.Generator at 0x7e57a809e430>

In [3]:
drive.mount('/content/drive/')

Mounted at /content/drive/


In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


## Dataset

This ASL dataset was made by my group. It only includes letters from A to E.

In [7]:
transform_normal = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) # Approximately between -1 an 1
])

train_data = datasets.ImageFolder(root=os.path.join(DATASET_PATH, "train"), transform=transform_normal)
val_data = datasets.ImageFolder(root=os.path.join(DATASET_PATH, "val"), transform=transform_normal)
test_data = datasets.ImageFolder(root=os.path.join(DATASET_PATH, "test"), transform=transform_normal)

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size)
test_loader = DataLoader(test_data, batch_size=batch_size)

## CNN

I'm going to build a CNN model with 2 convolution layers.

In [8]:
# Build a CNN class
class MyCNN(nn.Module):
    def __init__(self, num_filters=[16, 32], dropout=0.2):
        super(MyCNN, self).__init__()
        self.num_filters = num_filters
        self.convol_1 = nn.Conv2d(3, num_filters[0], kernel_size=3, padding=1)
        self.convol_2 = nn.Conv2d(num_filters[0], num_filters[1], kernel_size=3, padding=1)
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.dropout = nn.Dropout(dropout)
        self.func_1 = nn.Linear(num_filters[1] * 8 * 8, 128)
        self.func_2 = nn.Linear(128, 5)

    def forward(self, x):
        x = self.convol_1(x)
        x = self.relu(x)
        x = self.pool(x)
        x = self.convol_2(x)
        x = self.relu(x)
        x = self.pool(x)
        x = x.view(-1, self.num_filters[1] * 8 * 8)
        x = self.func_1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.func_2(x)
        return x

Prepare to train the model

In [9]:
my_model = MyCNN()
my_model = my_model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(my_model.parameters(), lr=0.01)

Train the model

While training the model, we also do validation in each epoch. And use early stopping to avoid overfitting.

In [10]:
# Do validation during training
def validate(model, val_loader, criterion):
    model.eval()
    val_loss = float(0)

    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

    return val_loss / len(val_loader) # Return average loss in one epoch

In [11]:
def model_train(my_model, train_loader, val_loader, criterion, optimizer):
    train_losses = []
    val_losses = []

    # Variables for early stopping
    best_val_loss = float('inf')
    patience = 3 # Three times for no improvement
    counter = 0

    for epoch in range(100):
        my_model.train()
        train_loss = float(0)

        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = my_model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        # Get loss on training set
        train_loss /= len(train_loader)
        train_losses.append(train_loss)

        # Get loss on validation set
        val_loss = validate(my_model, val_loader, criterion)
        val_losses.append(val_loss)

        # Early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            counter = 0
            torch.save(my_model.state_dict(), 'best_model.pth')
        else:
            counter += 1

            if counter >= patience:
                break

In [12]:
model_train(my_model, train_loader, val_loader, criterion, optimizer)

Evaluate the model

In [14]:
def evaluate(my_model, val_loader):
    my_model.eval()
    correct, total = 0, 0

    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = my_model(images)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    return 100 * correct / total

print(f'Accuracy: {evaluate(my_model, val_loader):.2f}%')

Accuracy: 85.00%


Accuracy 85.00%

## Tune hyperparameters

Hyperparameters we are going to tune:

- Batch size
- Dropout or not
- Number of filters

We make a tune grid to tune them all.

In [15]:
# Build tuning grid
batch_sizes = [32, 64]
dropouts = [0.2, 0.5]
filter_sets = [[16, 32], [32, 64]]

all_grid = list(itertools.product(batch_sizes, dropouts, filter_sets))

In [16]:
# Tune
tune_result = []
best_accuracy = float(0)

for batch_size, dropout, filters in tqdm(all_grid):
    train_loader_tune = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    val_loader_tune = DataLoader(val_data, batch_size=batch_size)

    my_model_tune = MyCNN(num_filters=filters, dropout=dropout)
    my_model_tune = my_model_tune.to(device)

    optimizer_tune = torch.optim.Adam(my_model_tune.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()

    model_train(my_model_tune, train_loader_tune, val_loader_tune, criterion, optimizer_tune)

    eval_tune = evaluate(my_model_tune, val_loader_tune)

    if eval_tune > best_accuracy:
        torch.save(my_model_tune.state_dict(), 'best_model_tune.pth')
        best_accuracy = eval_tune

    tune_result.append((batch_size, dropout, filters, eval_tune))

100%|██████████| 8/8 [08:51<00:00, 66.40s/it]


In [20]:
# Print the table of tuning result
for batch_size, dropout, filters, eval_tune in tune_result:
    print(f'Batch size: {batch_size}, Dropout: {dropout}, Number of filters: {filters}, Accuracy: {eval_tune:.2f}%')

Batch size: 32, Dropout: 0.2, Number of filters: [16, 32], Accuracy: 85.00%
Batch size: 32, Dropout: 0.2, Number of filters: [32, 64], Accuracy: 95.00%
Batch size: 32, Dropout: 0.5, Number of filters: [16, 32], Accuracy: 90.00%
Batch size: 32, Dropout: 0.5, Number of filters: [32, 64], Accuracy: 91.67%
Batch size: 64, Dropout: 0.2, Number of filters: [16, 32], Accuracy: 76.67%
Batch size: 64, Dropout: 0.2, Number of filters: [32, 64], Accuracy: 93.33%
Batch size: 64, Dropout: 0.5, Number of filters: [16, 32], Accuracy: 83.33%
Batch size: 64, Dropout: 0.5, Number of filters: [32, 64], Accuracy: 88.33%


The best hyper-parameters are:

- Batch size: 32
- Dropout: 0.2
- Number of filters: [32, 64]
- Accuracy: 95.00%

## CNN with ResNet-50

Get rid of my own convolution layer. Use ResNet-50 as a pre-trained model. Fine-tune the last layer (layer 4).

### Normalization

Normalize the data with ResNet requirement.

In [22]:
transform_resnet = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

train_data_resnet = datasets.ImageFolder(root=os.path.join(DATASET_PATH, "train"), transform=transform_resnet)
val_data_resnet = datasets.ImageFolder(root=os.path.join(DATASET_PATH, "val"), transform=transform_resnet)
test_data_resnet = datasets.ImageFolder(root=os.path.join(DATASET_PATH, "test"), transform=transform_resnet)

train_loader_resnet = DataLoader(train_data_resnet, batch_size=batch_size, shuffle=True)
val_loader_resnet = DataLoader(val_data_resnet, batch_size=batch_size)
test_loader_resnet = DataLoader(test_data_resnet, batch_size=batch_size)

Load the ResNet-50 and modify it.

In [34]:
# Load ResNet-50
resnet_model = models.resnet50(pretrained=True)

# Freeze all parameters first
for param in resnet_model.parameters():
    param.requires_grad = False

# Unfreeze the last block (layer4) for fine-tuning
for param in resnet_model.layer4.parameters():
    param.requires_grad = True

# Replace the final fully connected layer with two layers (Hidden + Output)
num_features = resnet_model.fc.in_features
hidden_layer_size = 128

resnet_model.fc = nn.Sequential(
    nn.Linear(num_features, hidden_layer_size),
    nn.ReLU(),
    nn.Dropout(0.2),
    nn.Linear(hidden_layer_size, 5)
)

resnet_model = resnet_model.to(device)



Train the model.

In [35]:
# Train the model
criterion_resnet = nn.CrossEntropyLoss()
optimizer_resnet = torch.optim.Adam(resnet_model.parameters(), lr=0.001)

# Train using the previously defined training function
model_train(resnet_model, train_loader_resnet, val_loader_resnet, criterion_resnet, optimizer_resnet)

Evaluate the model.

In [39]:
# Evaluate the model
# Load the best weights saved during training
resnet_model.load_state_dict(torch.load('best_model.pth'))

# Validation set
accuracy = evaluate(resnet_model, val_loader_resnet)
print(f'ResNet-50 Accuracy on Validation set: {accuracy:.2f}%')

# Test set
accuracy = evaluate(resnet_model, test_loader_resnet)
print(f'ResNet-50 Accuracy on Test set: {accuracy:.2f}%')

ResNet-50 Accuracy on Validation set: 96.67%
ResNet-50 Accuracy on Test set: 91.53%


In [41]:
# Calculate AUC
y_probs = []
y_true_auc = []

resnet_model.eval()
with torch.no_grad():
    for images, labels in test_loader_resnet:
        images = images.to(device)
        # We need true labels on CPU for sklearn
        y_true_auc.extend(labels.numpy())

        outputs = resnet_model(images)
        # Apply softmax to get probabilities
        probs = F.softmax(outputs, dim=1)
        y_probs.extend(probs.cpu().numpy())

# Calculate Macro-Average AUC (One-vs-Rest)
auc_score = roc_auc_score(y_true_auc, y_probs, multi_class='ovr', average='macro')
print(f"ResNet-50 Macro AUC: {auc_score:.4f}")

ResNet-50 Macro AUC: 0.9865
