**(CAMs) Learning Deep Features for Discriminative Localization**   
*Bolei Zhou, Aditya Khosla, Agata Lapedriza, Aude Oliva, Antonio Torralba*   
[[paper](https://arxiv.org/abs/1512.04150)]    
CVPR 2016  

In [146]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import torchvision.models as models


import numpy as np
import cv2
from torchvision.datasets import CIFAR100, CIFAR10, imagenet
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2

from tqdm import tqdm

import matplotlib.pyplot as plt

In [147]:
# Following the paper's VGGnet recipe: the layers after conv5-3 (i.e., pool5 to prob) are removed,
# resulting in a mapping resolution of 14 x 14.
class CAMs_VGG16(nn.Module):
    """VGG16-based classifier that also exposes the feature maps needed for CAMs.

    The pretrained VGG16 convolutional stack (without its last pooling layer) is
    followed by a 3x3 convolution with 1024 units, global average pooling and a
    linear classifier over ``num_classes`` outputs.
    """

    def __init__(self, num_classes) -> None:
        super().__init__()

        # Keep every layer of the pretrained feature extractor except the last
        # pooling layer.
        vgg16 = models.vgg16(weights=models.VGG16_Weights.DEFAULT)
        self.backbone = nn.Sequential(*list(vgg16.features.children())[:-1])

        # Extra 3x3 conv (stride 1, pad 1, 1024 units), then GAP, then the
        # linear layer producing the class scores.
        self.conv = nn.Conv2d(512, 1024, kernel_size=3, stride=1, padding=1)
        self.avgpool = nn.AdaptiveAvgPool2d(output_size=1)
        self.linear = nn.Linear(1024, num_classes)

    def forward(self, x):
        """Return ``(class scores, rectified conv feature maps)`` for a batch ``x``."""
        batch_size = x.shape[0]

        # CNN encoder followed by the added convolution and ReLU.
        activation = F.relu(self.conv(self.backbone(x)))  # (B, 1024, H, W)

        # Global average pooling flattened to (B, 1024), then the classifier.
        pooled = self.avgpool(activation).view(batch_size, -1)
        scores = self.linear(pooled)

        return scores, activation

device(type='cuda')

In [149]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

# Define the train/test data pipelines.
# Both splits are resized to 224 and normalised identically; only the training
# split gets a random horizontal flip as augmentation to mitigate overfitting.
_tensor_and_normalize = [
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
]

train_transform = transforms.Compose(
    [transforms.Resize(224), transforms.RandomHorizontalFlip(), *_tensor_and_normalize]
)
test_transform = transforms.Compose(
    [transforms.Resize(224), *_tensor_and_normalize]
)

_cifar_root = 'dataset/cifar10'
train_dataset = CIFAR10(_cifar_root, train=True, transform=train_transform, download=True)
test_dataset = CIFAR10(_cifar_root, train=False, transform=test_transform, download=True)

# Training batches are shuffled and the incomplete last batch is dropped;
# the test split is iterated in order and in full.
train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True, drop_last=True)
test_dataloader = DataLoader(test_dataset, batch_size=128, shuffle=False, drop_last=False)

Files already downloaded and verified
Files already downloaded and verified


In [151]:
def train_net(net, optimizer, criterion, epochs=20, train_loader=None, test_loader=None):
    """Train ``net`` and evaluate it on the test split after every epoch.

    Args:
        net: module whose forward pass returns a ``(logits, features)`` pair.
        optimizer: optimizer stepping the parameters of ``net``.
        criterion: loss called as ``criterion(logits, targets)``.
        epochs: number of passes over the training data (default 20).
        train_loader: iterable of ``(x, y)`` training batches; falls back to
            the module-level ``train_dataloader`` when omitted.
        test_loader: iterable of ``(x, y)`` evaluation batches; falls back to
            the module-level ``test_dataloader`` when omitted.

    Returns:
        The highest per-epoch test accuracy observed (0 when ``epochs`` is 0).

    Batches are moved to the module-level ``device``. The network is left in
    eval mode when the function returns.
    """
    if train_loader is None:
        train_loader = train_dataloader
    if test_loader is None:
        test_loader = test_dataloader

    best_accuracy = 0

    for epoch in tqdm(range(epochs)):
        # Switch back to training mode at the start of every epoch: the
        # evaluation pass below calls net.eval(), which would otherwise stay
        # in effect for all epochs after the first.
        net.train()
        for x, y in train_loader:
            x = x.to(device=device)
            y = y.to(device=device)

            logit, _ = net(x)
            loss = criterion(logit, y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Evaluation pass: accumulate sample-weighted loss and correct counts
        # so a smaller final batch is weighted properly.
        net.eval()
        with torch.no_grad():
            test_loss = 0.
            test_correct = 0.
            test_num_data = 0.
            for x, y in tqdm(test_loader, leave=False):
                x = x.to(device=device)
                y = y.to(device=device)

                logit, _ = net(x)
                loss = criterion(logit, y)

                test_loss += loss.item() * x.shape[0]
                test_correct += (logit.argmax(dim=1) == y).sum().item()
                test_num_data += x.shape[0]

            test_loss /= test_num_data
            test_accuracy = test_correct / test_num_data

            print(f'Test result of epoch {epoch}/{epochs} || loss : {test_loss:.3f} acc : {test_accuracy:.3f} ')

        # Track the best epoch so the return value is meaningful.
        best_accuracy = max(best_accuracy, test_accuracy)

    return best_accuracy


In [None]:
# Build the 10-class CAM network on the selected device, then train it with
# Adam and cross-entropy.
model = CAMs_VGG16(num_classes=10).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

best_accuracy = train_net(model, optimizer, criterion)

# Persist the trained parameters.
# NOTE(review): this is an absolute, machine-specific path; the cell that
# reloads the weights uses the same literal, so change both together.
torch.save(model.state_dict(), 'E:/git/model-implementation/Convolution/model/CAMs_weights.pth')

In [139]:
# Read the saved parameters (remapped onto the active device) and load them into the model.
checkpoint_state = torch.load('E:/git/model-implementation/Convolution/model/CAMs_weights.pth', map_location=device)
model.load_state_dict(checkpoint_state)

<All keys matched successfully>

In [140]:
# Take the first test sample and push it through the network as a batch of one.
test_img, test_label = test_dataset[0]
pred, feature = model(test_img.to(device).unsqueeze(0))
# Weights of the final linear layer, one row per class.
weights = model.linear.weight

In [141]:
def return_CAM(feature_conv, weight, class_idx, size_upsample=(256, 256)):
    """Build class activation maps from conv features and classifier weights.

    Args:
        feature_conv: 4-D array ``(b, nc, h, w)``. It is reshaped to
            ``(nc, h * w)``, so it must hold a single sample (``b == 1``);
            larger batches make the reshape raise.
        weight: array indexed by class, where ``weight[idx]`` has ``nc``
            entries (one weight per feature channel).
        class_idx: iterable of class indices to produce maps for.
        size_upsample: size passed to ``cv2.resize`` for every map
            (default ``(256, 256)``).

    Returns:
        A list with one ``uint8`` map per entry of ``class_idx``, each scaled
        to the 0-255 range and resized to ``size_upsample``.
    """
    _, nc, h, w = feature_conv.shape
    # The flattened feature matrix is the same for every class; build it once.
    flat_features = feature_conv.reshape((nc, h * w))

    output_cam = []
    for idx in class_idx:
        # Weighted sum of the feature channels for this class.
        cam = np.matmul(weight[idx], flat_features).reshape(h, w)

        # Min-max scale to [0, 1]. A constant map has a zero peak after the
        # shift; leave it at zero instead of dividing by zero.
        cam = cam - np.min(cam)
        peak = np.max(cam)
        if peak > 0:
            cam = cam / peak

        cam_img = np.uint8(255 * cam)
        output_cam.append(cv2.resize(cam_img, size_upsample))
    return output_cam

In [142]:
# Report the ground-truth vs. predicted class and the tensor shapes involved.
print(f'class label: {test_label}, pred label: {pred.argmax(1).item()}')
print(f'img size: {test_img.shape}')
print(f'feature map size: {feature[0].shape}, weights shape: {weights.shape}')

# Move features and classifier weights to host memory as NumPy arrays.
feature_np = feature.detach().cpu().numpy()
weights_np = weights.detach().cpu().numpy()

# NOTE(review): the map is built for the ground-truth label (test_label),
# not for the top-1 prediction.
CAMs = return_CAM(feature_np, weights_np, [test_label])

class label: 3, pred label: 4
img size: torch.Size([3, 224, 224])
feature map size: torch.Size([1024, 14, 14]), weights shape: torch.Size([10, 1024])


In [143]:
# Undo the input normalisation: applying Normalize(mean=-m/s, std=1/s) to
# (x - m) / s gives back x. The constants must match the forward transform
# exactly; the third std is 0.225 (it was previously mistyped as 0.255, which
# overshoots the blue channel and wraps around in the uint8 cast).
norm_mean = (0.485, 0.456, 0.406)
norm_std = (0.229, 0.224, 0.225)
inv_normalize = transforms.Normalize(
    mean=[-m / s for m, s in zip(norm_mean, norm_std)],
    std=[1 / s for s in norm_std]
)
# Clip to [0, 1] before scaling so float round-off cannot wrap around when
# cast to uint8. Note: test_img is rebound from a tensor to a CHW uint8 array,
# so this cell is not meant to be re-run without re-running the cell that
# loads test_img.
test_img = (inv_normalize(test_img).numpy().clip(0, 1) * 255).astype(np.uint8)

In [145]:
print(f'output CAM.jpg for the top1 prediction: {pred.argmax(1)}')
channel, height, width = test_img.shape

# test_img is channel-first (C, H, W) in RGB order. OpenCV works with
# (H, W, C) images in BGR order, so move the channel axis last and reverse it.
# (A plain .T would give (W, H, C), i.e. a spatially transposed picture that
# no longer lines up with the heatmap.)
test_img_bgr = np.ascontiguousarray(test_img.transpose(1, 2, 0)[:, :, ::-1])

# Resize the activation map to the image size and colourise it.
heatmap = cv2.applyColorMap(cv2.resize(CAMs[0], (width, height)), cv2.COLORMAP_JET)
# Blended overlay of heatmap and image (kept for inspection; not written out).
result = heatmap * 0.3 + test_img_bgr * 0.5

# Save the image and its heatmap side by side.
cv2.imwrite('CAM.jpg', np.hstack([test_img_bgr, heatmap]))

output CAM.jpg for the top1 prediction: tensor([4], device='cuda:0')


True