**(CAMs) Learning Deep Features for Discriminative Localization**   
*Bolei Zhou, Aditya Khosla, Agata Lapedriza, Aude Oliva, Antonio Torralba*   
[[paper](https://arxiv.org/abs/1512.04150)]    
CVPR 2016  

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import torchvision.models as models

from torchvision.datasets import CIFAR100
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2

from tqdm import tqdm

In [2]:
# For VGGnet, we removed the layers after conv5-3 (i.e., pool5 to prob), resulting in a mapping resolution of 14 x 14.
class CAMs_VGG16(nn.Module):
    """VGG16-based classifier laid out for Class Activation Mapping (CAM).

    The pretrained VGG16 convolutional stack (without its last pooling layer)
    is followed by an extra 3x3 convolution with 1024 channels, global average
    pooling, and a single linear classifier.

    Args:
        num_classes: Number of output classes of the linear classifier.
    """

    def __init__(self, num_classes) -> None:
        super().__init__()

        # Keep every layer of the pretrained feature extractor except the
        # last one (the final pooling layer), so the feature map stays larger.
        vgg = models.vgg16(weights=models.VGG16_Weights.DEFAULT)
        self.backbone = nn.Sequential(*list(vgg.features.children())[:-1])

        # Extra 3x3 conv (stride 1, pad 1, 1024 units) followed by a GAP layer
        # and a linear classifier.
        self.conv = nn.Conv2d(512, 1024, kernel_size=3, stride=1, padding=1)
        self.avgpool = nn.AdaptiveAvgPool2d(output_size=1)
        # Not applied inside forward(): forward() returns raw logits. Kept as
        # an attribute so callers can convert logits into class probabilities.
        self.softmax = nn.Softmax(dim=1)
        self.linear = nn.Linear(1024, num_classes)

    def forward(self, x):
        """Classify a batch of images.

        Args:
            x: Input batch accepted by the VGG16 backbone.

        Returns:
            A tuple ``(out, features)`` where ``out`` holds the raw class
            logits of shape ``(B, num_classes)`` and ``features`` is the
            output of the extra convolution, shape ``(B, 1024, H, W)``.
        """
        # CNN encoder
        h = self.backbone(x)
        features = self.conv(h)  # (B, 1024, H, W)

        # Global average pooling -> (B, 1024). flatten(…, 1) keeps the batch
        # dimension even when B == 1, unlike a bare squeeze().
        pooled = torch.flatten(self.avgpool(features), 1)

        # The classifier works directly on the pooled features; softmax is
        # left to the loss function (or to the caller at inference time).
        out = self.linear(pooled)

        return out, features

In [3]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
# Define train/test data loaders.
# Both splits are resized to 224 px and normalized with the same channel
# statistics (the usual ImageNet mean/std) so that evaluation sees inputs at
# the same scale as training. Only the training split is augmented (random
# horizontal flip) to mitigate overfitting.
train_transform = transforms.Compose([
    transforms.Resize(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
    ])

test_transform = transforms.Compose([
    transforms.Resize(224),  # must match the training resolution
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
    ])

train_dataset = CIFAR100('dataset/cifar100', download=True, train=True, transform=train_transform)
test_dataset = CIFAR100('dataset/cifar100', download=True, train=False, transform=test_transform)

# Training batches are shuffled and the last incomplete batch is dropped;
# evaluation keeps every sample in its original order.
train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True, drop_last=True)
test_dataloader = DataLoader(test_dataset, batch_size=128, shuffle=False, drop_last=False)

Files already downloaded and verified
Files already downloaded and verified


In [5]:
def train_net(net, optimizer, criterion, epochs=100):
    """Train ``net`` and evaluate it on the test set after every epoch.

    Uses the module-level ``train_dataloader``, ``test_dataloader`` and
    ``device``.

    Args:
        net: Model whose forward pass returns ``(logits, features)``.
        optimizer: Optimizer that updates the parameters of ``net``.
        criterion: Loss called as ``criterion(logits, targets)``; the
            per-epoch test loss assumes it returns the mean over the batch.
        epochs: Number of passes over the training set (default 100).

    Returns:
        The highest test accuracy (fraction in [0, 1]) seen across all epochs.
    """
    best_accuracy = 0.

    for epoch in range(epochs):
        # Here starts the train loop.
        net.train()
        # Wrapping the loader directly lets tqdm read its length and show
        # real progress instead of a bare iteration counter.
        for x, y in tqdm(train_dataloader):
            # Send `x` and `y` to either cpu or gpu using `device` variable.
            x = x.to(device=device)
            y = y.to(device=device)

            logit, _ = net(x)
            loss = criterion(logit, y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Evaluation pass: no gradients, sample-weighted running totals.
        net.eval()
        test_loss = 0.
        test_correct = 0
        test_num_data = 0
        with torch.no_grad():
            for x, y in tqdm(test_dataloader):
                x = x.to(device=device)
                y = y.to(device=device)

                logit, _ = net(x)

                batch_size = x.shape[0]
                # Re-weight the batch-mean loss by the batch size so the
                # final (possibly smaller) batch is not over-counted.
                test_loss += criterion(logit, y).item() * batch_size
                test_correct += (logit.argmax(dim=1) == y).sum().item()
                test_num_data += batch_size

        test_loss /= test_num_data
        test_accuracy = test_correct / test_num_data

        # Track the best epoch so the return value is meaningful.
        best_accuracy = max(best_accuracy, test_accuracy)

        print(f'Test result of epoch {epoch}/{epochs} || loss : {test_loss:.3f} acc : {test_accuracy:.3f} ')

    return best_accuracy


In [6]:
# Build the 100-class CAM model and place its parameters on the selected device.
model = CAMs_VGG16(num_classes=100)
model.to(device)

# Cross-entropy objective optimized with Adam over all model parameters.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Run training; the value returned by train_net is kept for later inspection.
best_accuracy = train_net(model, optimizer, criterion)

49it [01:29,  1.57it/s]