## **Intro**
**Installation of Dependencies**

Run the below cell to import PyTorch and mount Google Drive for this notebook

In [None]:
import torch

# Mount Google Drive at /gdrive (only works inside Google Colab).
# force_remount=True is passed so that re-running this cell mounts again.
from google.colab import drive
drive.mount('/gdrive', force_remount=True)

## **Useful Functions**
Run the below cell to initialize the provided functions that will be used in this assignment

In [None]:
from os import path
import wheel
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torch.autograd import Variable
from torch.autograd import Function

import urllib
import cv2
import numpy as np
import os, sys, math, random, subprocess
import matplotlib.pyplot as plt
from scipy.ndimage.filters import gaussian_filter
from IPython.display import clear_output, Image, display, HTML
from google.protobuf import text_format
from io import StringIO
import PIL.Image
%matplotlib inline
# Notebook-wide matplotlib defaults. Individual helpers below may still pass
# their own figsize to plt.figure().
plt.rcParams['figure.figsize'] = (10, 10)        # large images
plt.rcParams['image.interpolation'] = 'nearest'  # don't interpolate: show square pixels
plt.rcParams['image.cmap'] = 'gray'  # use grayscale output rather than a (potentially misleading) color heatmap

def get_n_params(module):
  """Count the scalar entries held by all parameters of `module`.

  :param module: object exposing named_parameters() (e.g. an nn.Module)
  :return: (int) sum of the element counts of every parameter tensor
  """
  # numel() is the product of a tensor's dimension sizes
  return sum(tensor.numel() for _, tensor in module.named_parameters())

def get_model_params(model):
  """Return the total number of scalar parameters in `model`.

  The previous version summed a recursive count over every entry of
  named_modules(). That iterator yields the model itself as well as each
  descendant, so parameters of nested modules were counted several times.
  Each parameter tensor is now counted exactly once.

  :param model: an nn.Module
  :return: (int) number of scalar parameters
  """
  return sum(param.numel() for param in model.parameters())

def np_img_from_url(url):
  """Download an image and decode it with OpenCV, keeping its stored format.

  :param url: (str) address of the encoded image file
  :return: the array produced by cv2.imdecode (channel order and depth as
      stored in the file), or None if OpenCV cannot decode the bytes
  """
  # `urllib.urlopen` only existed in Python 2. Import the submodule explicitly
  # because a bare `import urllib` does not guarantee `urllib.request` is loaded.
  import urllib.request

  # The context manager closes the HTTP response once the bytes are read
  with urllib.request.urlopen(url) as url_response:
    img_array = np.array(bytearray(url_response.read()), dtype=np.uint8)

  # Flag -1 is cv2.IMREAD_UNCHANGED: no colour conversion, alpha is preserved
  img = cv2.imdecode(img_array, -1)
  return img

def to_numpy_image(tensor_or_variable):
  """Convert a PyTorch image (or batch of images) to a numpy array.

  :param tensor_or_variable: numpy array, Tensor or Variable. Tensors are
      interpreted as CxHxW (single image) or BxCxHxW (batch).
  :return: the input itself if it is already a numpy array (including ndarray
      subclasses); otherwise a BxHxWxC numpy array.
  """
  # isinstance also accepts ndarray subclasses, which the former
  # `type(...) == np.ndarray` test let fall through to `.detach()` and crash.
  if isinstance(tensor_or_variable, np.ndarray):
    return tensor_or_variable

  # detach() drops autograd history for plain tensors and Variables alike, so
  # no separate Variable branch is needed. cpu() is a no-op for CPU tensors.
  np_img = tensor_or_variable.detach().cpu().numpy()

  # If there is no batch dimension, add one
  if np_img.ndim == 3:
    np_img = np_img[np.newaxis, ...]

  # Convert from BxCxHxW (PyTorch convention) to BxHxWxC (OpenCV/numpy convention)
  return np_img.transpose(0, 2, 3, 1)


def to_pytorch_image(np_image):
  """Turn a numpy image (HxWxC) or image batch (BxHxWxC) into a float tensor.

  :param np_image: numpy array laid out channels-last
  :return: torch float tensor laid out BxCxHxW
  """
  # A lone image gets a leading batch axis of size 1
  batched = np.expand_dims(np_image, axis=0) if np_image.ndim == 3 else np_image

  # Reorder axes: BxHxWxC (OpenCV/numpy) -> BxCxHxW (PyTorch)
  channels_first = np.transpose(batched, (0, 3, 1, 2))

  return torch.from_numpy(channels_first).float()
  
def draw_border(image_np, color):
  """Return a copy of an HxWxC image with a 5-pixel frame painted on it.

  The right-hand edge used to be addressed with the image height, so it landed
  in the wrong columns whenever the image was not square. Rows and columns
  are now handled with their own extents.

  :param image_np: numpy array of shape (H, W, C); it is not modified
  :param color: sequence of C values painted into every border pixel
  :return: new numpy array with the same shape as image_np
  """
  thickness = 5
  pixel = np.asarray(color)[np.newaxis, np.newaxis, :]
  height, width = image_np.shape[0], image_np.shape[1]
  framed = image_np.copy()

  # Top and bottom bands (max() keeps the slice valid for very small images)
  framed[0:thickness, :, :] = pixel
  framed[max(height - thickness, 0):height, :, :] = pixel

  # Left and right bands
  framed[:, 0:thickness, :] = pixel
  framed[:, max(width - thickness, 0):width, :] = pixel
  return framed

def normalize_zero_one_range(tensor_like):
  """Rescale values so the minimum maps to 0 and the maximum to (about) 1.

  :param tensor_like: object supporting .min(), .max() and arithmetic
      (numpy array or torch tensor)
  :return: rescaled copy; a constant input yields all zeros
  """
  lowest = tensor_like.min()
  shifted = tensor_like - lowest
  # The small constant avoids dividing by zero for constant inputs
  span = shifted.max() + 1e-9
  return shifted / span


def prep_for_showing(image):
  """Produce a displayable numpy image with values rescaled into [0, 1].

  :param image: numpy array, Tensor or Variable accepted by to_numpy_image
  :return: numpy array; when the converted array has more than three axes,
      only its first entry (the first image of the batch) is kept
  """
  converted = to_numpy_image(image)
  first_image = converted[0] if converted.ndim > 3 else converted
  return normalize_zero_one_range(first_image)

  
def show_image(tensor_var_or_np, title=None, bordercolor=None):
  """Display one image in its own 4x4 inch figure.

  :param tensor_var_or_np: image accepted by prep_for_showing
  :param title: optional figure title; falsy values mean no title
  :param bordercolor: optional colour passed to draw_border to frame the image
  """
  displayable = prep_for_showing(tensor_var_or_np)
  if bordercolor is not None:
    displayable = draw_border(displayable, bordercolor)

  plt.figure(figsize=(4,4))
  # squeeze() removes size-1 axes (e.g. a single channel) before plotting
  plt.imshow(displayable.squeeze())
  plt.axis('off')
  if title:
    plt.title(title)
  plt.show()
    
def show_images(images, correct_list=None, size=128, titles=None):
  """Display a sequence of images one after another.

  :param images: iterable of images accepted by show_image
  :param correct_list: optional per-image flags; truthy gives a green border,
      falsy a red one. When omitted (or empty) no border is drawn.
  :param size: accepted but not used by this function
  :param titles: optional per-image titles
  """
  for position, picture in enumerate(images):
    if correct_list:
      frame = [0,1,0] if correct_list[position] else [1,0,0]
    else:
      frame = None
    caption = titles[position] if titles else None
    show_image(picture, bordercolor=frame, title=caption)
    
def show_image_rows(image_lists):
  """Display each inner sequence of images side by side in one figure row.

  :param image_lists: iterable of sequences of images accepted by
      prep_for_showing; one figure is produced per non-empty sequence
  """
  for row in image_lists:
    # plt.subplots cannot build a grid with zero columns
    if len(row) == 0:
      continue

    # squeeze=False always returns a 2-D array of axes. With the default, a
    # one-image row yields a bare Axes object and indexing it fails.
    _, axes = plt.subplots(1, len(row), squeeze=False)
    for axis, img in zip(axes[0], row):
      axis.imshow(prep_for_showing(img).squeeze())
      axis.axis('off')
    plt.show()

from bs4 import BeautifulSoup
import requests

def listFD(url, ext=''):
  """List the links found on an HTML directory-listing page.

  :param url: (str) address of the listing page
  :param ext: (str) keep only links whose href ends with this suffix
  :return: list of strings of the form "<url>/<href>"
  """
  page = requests.get(url).text
  soup = BeautifulSoup(page, 'html.parser')

  # Avoid a doubled slash when the caller's URL already ends with one
  base = url.rstrip('/')

  # href=True skips anchors without an href attribute, which previously
  # produced None and crashed on .endswith()
  return [
    base + '/' + node['href']
    for node in soup.find_all('a', href=True)
    if node['href'].endswith(ext)
  ]

def url_to_image(url):
  """Download an image and return it as an RGB numpy array.

  :param url: (str) address of the encoded image file
  :return: uint8 numpy array of shape (H, W, 3) in RGB channel order
  :raises ValueError: if OpenCV cannot decode the downloaded bytes
  """
  # A bare `import urllib` does not guarantee the `request` submodule is loaded
  import urllib.request

  # Download the raw bytes; the context manager closes the HTTP response
  with urllib.request.urlopen(url) as resp:
    encoded = np.asarray(bytearray(resp.read()), dtype="uint8")

  image = cv2.imdecode(encoded, cv2.IMREAD_COLOR)
  if image is None:
    # imdecode signals failure with None; fail here with a readable message
    # rather than with a TypeError on the indexing below
    raise ValueError(f"Could not decode an image from {url}")

  # Convert BGR (OpenCV order) to RGB
  return image[:, :, [2,1,0]]

def url_to_text(url):
  """Fetch a URL and return its body decoded as UTF-8 text.

  :param url: (str) address to fetch
  :return: (str) decoded response body
  """
  # A bare `import urllib` does not guarantee the `request` submodule is loaded
  import urllib.request

  # The context manager closes the response once the body has been read
  with urllib.request.urlopen(url) as resp:
    body = resp.read()
  return body.decode("utf-8")

## **Dataset**

The following cell implements a Dataset that loads the images that will be used later in this assignment.

It's a subclass of torch.utils.data.Dataset, which is the base class for datasets in PyTorch, and it is compatible with torch.utils.data.DataLoader. A DataLoader is useful because it allows us to easily load data in multiple background workers without writing any threading code. This is important, because one of the biggest bottlenecks in neural network training is the rate at which data can be fed into the model.

A dataset implements __len__ and __getitem__, which means samples from the dataset can be obtained by indexing it like sample = dataset[5].

A dataloader can be iterated and returns batches: batch_0 = next(iter(dataloader))

torchvision is a useful suite of utilities for computer vision work with PyTorch. It already provides implementations for many of the most popular computer vision datasets. In this notebook, we have rolled our own dataset that loads images from a URL.



In [None]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

# Base URL of the image folders and the names of the two sub-folders that are
# appended to it when listing images.
DATA_URL = "http://www.cs.cornell.edu/courses/cs5670/2019sp/projects/pa5/assignment_data/dataset/"
# Text file with one class name per line (only the first 1000 lines are used below)
IMAGENET_LABELS_URL = "http://www.cs.cornell.edu/courses/cs5670/2019sp/projects/pa5/assignment_data/imagenet_classes.txt"
DOGS_DIR = "test-dog"
FOOD_DIR = "test-food"

# Per-channel statistics used to normalize images in FoodAndDogDataset.__getitem__.
# The images there are RGB, so the three entries are presumably in R, G, B order.
IMAGENET_MEAN = torch.FloatTensor([0.485, 0.456, 0.406])
IMAGENET_STD = torch.FloatTensor([0.229, 0.224, 0.225])

class FoodAndDogDataset(Dataset):
  """
  Dataset of dog and food photos fetched from the CS5670 class website.

  Every selected image is downloaded once, at construction time, and cached as
  a float32 numpy array divided by 255. Indexing returns a (tensor, label)
  pair: the tensor is normalized with the ImageNet mean/std and center-cropped
  to 3x224x224, and the label is 0 for dogs and 1 for food.
  """
  def __init__(self, include_classes="both"):
    # Both directory listings are fetched regardless of the selection
    self.dog_paths = listFD(DATA_URL + DOGS_DIR, ".jpg")
    self.food_paths = listFD(DATA_URL + FOOD_DIR, ".jpg")

    self.dog_labels = [0] * len(self.dog_paths)
    self.food_labels = [1] * len(self.food_paths)

    self.img_paths = []
    self.img_labels = []

    # Dogs first, then food, so "both" keeps the classes in contiguous runs
    per_class = (
      ("dogs", self.dog_paths, self.dog_labels),
      ("food", self.food_paths, self.food_labels),
    )
    for class_key, paths, labels in per_class:
      if include_classes in ["both", class_key]:
        self.img_paths.extend(paths)
        self.img_labels.extend(labels)

    # Eager download: fine for this small dataset, a large one would
    # fetch images lazily instead.
    self.all_images_np = []
    for url in self.img_paths:
      self.all_images_np.append(url_to_image(url).astype(np.float32) / 255)

    print(f"Initialized dataset: {include_classes} with {len(self.all_images_np)} images")

  def __len__(self):
    return len(self.img_paths)

  def __getitem__(self, idx):
    cached_np = self.all_images_np[idx]
    label = self.img_labels[idx]

    # Work on a copy so the cached array is never modified
    batch = to_pytorch_image(cached_np.copy())

    # ImageNet normalization: subtract the per-channel mean, divide by the
    # per-channel standard deviation (the statistics the model was trained with)
    mean = IMAGENET_MEAN[np.newaxis, :, np.newaxis, np.newaxis].expand_as(batch)
    batch = batch - mean
    std = IMAGENET_STD[np.newaxis, :, np.newaxis, np.newaxis].expand_as(batch)
    batch = batch / std

    assert batch.size(2) == batch.size(3) == 256, "Images expected to be of shape 1x3x256x256"

    # Center crop 256x256 -> 224x224, then drop the batch axis
    cropped = batch[:, :, 16:240, 16:240]
    return cropped[0], label
    
# Each constructor call lists both remote directories and downloads the images
# it selects, so the "both" dataset downloads every image a second time.
dog_dataset = FoodAndDogDataset(include_classes="dogs")
food_dataset = FoodAndDogDataset(include_classes="food")
full_dataset = FoodAndDogDataset(include_classes="both")

## **ILSVRC2012 class labels**
In this project, we will be using an AlexNet image-classifier model pre-trained on the ILSVRC2012 dataset. In order to understand the model's output, we need a mapping from the output probabilities to the class names. We load this mapping in this cell.

In [None]:
# NOTE(review): imagenet_id_to_label is never filled or read in the visible
# part of the notebook; labels_id2word is the mapping actually used.
imagenet_id_to_label = {}
labels_file = url_to_text(IMAGENET_LABELS_URL)
# One label per line; the list position is the class index. Keep the first 1000.
labels_id2word = labels_file.split("\n")[:1000]
print(labels_id2word)

## **Notes on PyTorch**
PyTorch implements many operations on Tensors (such as torch.FloatTensor or torch.LongTensor) in much the same way numpy implements operations on ndarray. Variables (torch.autograd.Variable) wrap tensors and can be used interchangeably. When an operation is performed on a Tensor, the output is a new Tensor. When it is performed on a Variable, the output is also a Variable.

All operations involving Variables automatically build a computational graph (provided the requires_grad parameter is set to True on any of the Variables involved in the operation). This means that at any point, calling backward on any Variable will compute the gradient of that Variable with respect to all other Variables. Thus calling backward on the loss function we're trying to optimize will calculate the gradient w.r.t. the model parameters, allowing us to implement gradient descent.

We can also use .backward to calculate a gradient w.r.t the input, provided the requires_grad attribute on the input variable is set to true. After calling backward, the gradients can be accessed in the .grad attribute of the variable. Here is some fairly typical code that uses this functionality to access gradients with respect to an input variable:

Given input and target of type torch.FloatTensor. First we must zero gradients on any variable that already has gradients computed. On nn.Module, this zeroes gradients of all the model parameters.

> some_model.zero_grad()


Convert the input into a variable that requires gradient

> input_var = Variable(input, requires_grad=True)

> target_var = Variable(target)

Call the model and calculate the loss

> out = some_model(input_var)

> loss = some_loss_function(out, target_var)

Run backpropagation

> loss.backward()

Access gradients of the loss with respect to the input

> grad_input = input_var.grad

> grad_input_tensor = grad_input.data

> grad_input_numpy = grad_input_tensor.cpu().numpy()

The underlying Tensor can be accessed as the data attribute given a Variable.

PyTorch implements many neural network operations in the torch.nn package. These are usually packaged in a Module, which allows for easy modular design of neural network architectures.

## **AlexNet definition**
Below is the Module that implements AlexNet, copied from the torchvision package, with minor modifications for easier use in this assignment. Specifically, because the original implementation uses nn.Sequential containers, the individual layers are not directly accessible by name. We have made them directly accessible through the module_shortcuts attribute (a dictionary from layer name to module) and the shortcut_modules() method that iterates over it.

In [None]:
import torch.nn as nn
import torch.utils.model_zoo as model_zoo

# Download location of the pre-trained weights, looked up by alexnet()
model_urls = {
    'alexnet': 'https://download.pytorch.org/models/alexnet-owt-4df8aa71.pth',
}

# Passed as `inplace=` to every nn.ReLU in AlexNet. With False, each ReLU
# returns a new tensor instead of overwriting its input.
INPLACE = False

class AlexNet(nn.Module):
    """AlexNet classifier with name-based access to selected layers.

    The network is a convolutional stack (`features`) followed by a fully
    connected stack (`classifier`). `module_shortcuts` maps short layer names
    to individual modules inside those two containers.
    """

    def __init__(self, num_classes=1000):
        """
        :param num_classes: (int) size of the final linear layer's output
        """
        super(AlexNet, self).__init__()
        # The position of each layer inside the Sequential containers matters:
        # module_shortcuts below addresses layers by these indices.
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=INPLACE),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.ReLU(inplace=INPLACE),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=INPLACE),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=INPLACE),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=INPLACE),
            nn.MaxPool2d(kernel_size=3, stride=2),
        )
        self.classifier = nn.Sequential(
            nn.Dropout(),
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(inplace=INPLACE),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=INPLACE),
            nn.Linear(4096, num_classes),
        )

        # Short names for selected layers. Note that "fc6" and "fc7" point at
        # classifier[2] and classifier[5], which are the ReLU modules that
        # follow the first two Linear layers, while "fc8" is the final Linear
        # layer itself. NOTE(review): this looks deliberate given the question
        # asked later in the notebook - confirm before changing.
        self.module_shortcuts = {
            "conv1": self.features[0],
            "relu1": self.features[1],
            "conv2": self.features[3],
            "relu2": self.features[4],
            "conv3": self.features[6],
            "relu3": self.features[7],
            "conv4": self.features[8],
            "relu4": self.features[9],
            "conv5": self.features[10],
            "relu5": self.features[11],

            "fc6": self.classifier[2],
            "fc7": self.classifier[5],
            "fc8": self.classifier[6]
        }


    def __getitem__(self, layer_name):
      """Return the module registered under `layer_name`, or None if unknown."""
      if layer_name in self.module_shortcuts:
        return self.module_shortcuts[layer_name]
      return None


    def shortcut_modules(self):
      """Yield (name, module) pairs for every entry of module_shortcuts."""
      for name, mod in self.module_shortcuts.items():
        yield name, mod


    def forward(self, x):
        """Compute class scores for a batch of images.

        :param x: input batch; the hard-coded flatten below requires the
            feature stack to produce 256 * 6 * 6 values per image
        :return: tensor of shape (batch, num_classes) with unnormalized scores
        """
        x = self.features(x)
        # Flatten each image's feature maps into one vector for the linear layers
        x = x.view(x.size(0), 256 * 6 * 6)
        x = self.classifier(x)
        return x

def alexnet(pretrained=False, **kwargs):
    r"""Build an AlexNet (`"One weird trick..." <https://arxiv.org/abs/1404.5997>`_)
    and return it in evaluation mode.

    Args:
        pretrained (bool): If True, load the weights stored at
            ``model_urls['alexnet']`` (pre-trained on ImageNet)
        **kwargs: forwarded unchanged to the ``AlexNet`` constructor
    """
    net = AlexNet(**kwargs)
    if pretrained:
        print("Loading pre-trained weights from model zoo")
        state = model_zoo.load_url(model_urls['alexnet'])
        net.load_state_dict(state)
    # The model is always handed back in eval mode, pretrained or not
    net.eval()
    return net

# Notebook-wide model instance; several later cells rebuild it to get rid of hooks.
model = alexnet(pretrained=True, num_classes=1000)

## **Student TODO Implementations**
Implement all of your solutions in this section

**Answer TODO 1 here:**

1.

In [None]:
def convert_ilsvrc2012_probs_to_dog_vs_food_probs(probs_ilsvrc):
    """
    Convert from 1000-class ILSVRC probabilities to 2-class "dog vs food"
    probabilities.

    The probability mass of all dog classes and of all food classes is summed
    per row, and the two sums are renormalized so that each row adds up to 1.
    A row whose dog and food mass are both zero becomes [0.5, 0.5] instead of
    the NaNs that a 0 / 0 division would produce.

    :param probs_ilsvrc: shape (N, 1000) probabilities across 1000 ILSVRC classes
    :return probs: shape (N, 2): probabilities of each of the N items as being
        either dog (class 0) or food (class 1).
    """
    # in the ILSVRC2012 dataset, indices 151-268 are dogs and index 924-969 are foods
    dog_indices = slice(151, 269)
    food_indices = slice(924, 970)
    N, _ = probs_ilsvrc.shape  # also enforces a 2-D input
    ############################ TODO 2 BEGIN #################################
    dog_mass = probs_ilsvrc[:, dog_indices].sum(axis=1)
    food_mass = probs_ilsvrc[:, food_indices].sum(axis=1)

    # Column 0: dog, column 1: food
    stacked = np.stack([dog_mass, food_mass], axis=1)
    total = stacked.sum(axis=1, keepdims=True)

    # Rows with no dog/food mass keep the 0.5 fill value; every other row is
    # divided by its own total.
    probs = np.full((N, 2), 0.5, dtype=np.result_type(stacked, np.float32))
    np.divide(stacked, total, out=probs, where=total > 0)
    ############################ TODO 2 END #################################
    return probs

In [None]:
def get_prediction_descending_order_indices(probs, cidx):
    """
    Returns the ordering of probs that would sort it in descending order

    The previous hand-written selection loop stored indices in a uint8 array
    (wrapping around past 255 items), never terminated on NaN input and was
    quadratic. A stable argsort gives the same ordering, with ties still
    listed in ascending index order.

    :param probs: (N, 2) probabilities (computed in TODO 2)
    :param cidx: class index (0 or 1)
    :return array of N integer indices that sorts column cidx in descending order
    """
    ############################ TODO 3 BEGIN #################################
    column = np.asarray(probs)[:, cidx]
    # Sorting the negated values ascending equals sorting the values
    # descending; kind="stable" keeps equal values in their original order.
    t_order = np.argsort(-column, kind="stable")
    ############################ TODO 3 END #################################
    return t_order

In [None]:
def compute_dscore_dimage(scores, image, class_idx):
    """
    Returns the gradient of s_y (the score at index class_idx) with respect to
    the image (data), ds_y / dI.  Note that this is the unnormalized class
    score "s", not the probability "p".

    The gradient is obtained with torch.autograd.grad rather than
    backward() + image.grad, so a gradient left on `image` by an earlier call
    is not added into the result. As with backward(), the autograd graph of
    `scores` is freed by this call.

    :param scores: (Variable) shape (1000) the output scores from AlexNet for image
    :param image: (Variable) shape (1, 3, 224, 224) the input image
    :param class_idx: class index in range [0, 999] indicating which class to compute saliency for
    :return grad: (Tensor) shape (3, 224, 224), gradient ds_y / dI
    """
    ############################ TODO 4 BEGIN #################################
    (grad,) = torch.autograd.grad(scores[class_idx], image)
    ############################ TODO 4 END #################################
    assert tuple(grad.shape) == (1, 3, 224, 224) # expected shape
    return grad[0]

In [None]:
def normalized_sgd_with_momentum_update(image, grad, velocity, momentum, learning_rate):
    """
    One step of SGD with momentum where the gradient is scaled to unit L2 norm:

        V <- mu * V - alpha * grad / ||grad||
        I <- I + V

    An all-zero gradient contributes nothing to the velocity (instead of the
    NaNs a division by zero would produce).

    :param image: (Variable) shape (1, 3, 224, 224) the current solution
    :param grad: (Variable) gradient of the loss with respect to the image
    :param velocity: (Variable) momentum vector "V"
    :param momentum: (float) momentum parameter "mu"
    :param learning_rate: (float) learning rate "alpha"
    :return: (Variable) the updated image and momentum vector (image, velocity)
    """
    ############################ TODO 5a BEGIN #################################
    # L2 norm over all elements. It is detached so that it acts as a constant,
    # and it stays on grad's device and dtype.
    grad_norm = grad.detach().norm()
    # Lower-bound the norm by the smallest positive normal float so that a
    # zero gradient yields 0 rather than 0 / 0.
    grad_norm = grad_norm.clamp_min(torch.finfo(grad.dtype).tiny)

    velocity = momentum * velocity - learning_rate * (grad / grad_norm)
    image = image + velocity
    ############################ TODO 5a END #################################
    return image, velocity

In [None]:
def class_visualization_gradient(target_score, image, target_class, reg_lambda):
    """
    Compute the gradient for make_class_visualization (dL / dI).

    `target_score` may be the single score of the target class (as described
    below) or the full vector of class scores, in which case the entry at
    `target_class` is used. The gradient is taken with torch.autograd.grad, so
    a gradient left on `image` by an earlier call is not added to the result.

    NOTE(review): as before, this returns the plain gradient of the target
    score and does not use `reg_lambda`. The loss definition for this task is
    not visible here. If it mirrors the fooling-image loss (negated score plus
    a regularizer), the sign and a regularization term are missing - confirm
    against the assignment text.

    :param target_score: (Variable) holding the current score assigned to the target class
    :param image: (Variable) shape (1, 3, 224, 224) the current solution
    :param target_class: (int) ILSVRC class in range [0, 999]
    :param reg_lambda: (float) weight (lambda) applied to the regularizer.
    :return grad: (Variable) gradient dL / dI
    """
    ############################ TODO 6 BEGIN #################################
    if target_score.numel() == 1:
        # Already the scalar score: indexing a 0-dim tensor would raise
        score = target_score.reshape(())
    else:
        score = target_score[target_class]
    (grad,) = torch.autograd.grad(score, image)
    ############################ TODO 6 END #################################
    assert tuple(grad.shape) == (1, 3, 224, 224) # expected shape
    return grad

In [None]:
def fooling_image_gradient(target_score, orig_data, image_in, target_class, reg_lambda):
    """
    Compute the gradient for make_fooling_image (dL / dI), where

        L = -s_y(I) + 0.5 * lambda * ||I - I_orig||^2

    so that dL / dI = -ds_y / dI + lambda * (I - I_orig).

    The score gradient is taken with torch.autograd.grad, so a gradient left
    on `image_in` by an earlier call is not added to the result, and the
    returned tensor is detached from the autograd graph.

    :param target_score: (Variable) holding the current score assigned to the target class
    :param orig_data: (Variable) shape (1, 3, 224, 224) holding the original image
    :param image_in: (Variable) shape (1, 3, 224, 224) holding the current solution
    :param target_class: (int) ILSVRC class in range [0, 999]; not used here
        because target_score is already the score of that class
    :param reg_lambda: (float) weight applied to the regularizer.
    :return grad: (Variable) gradient dL / dI
    """
    ############################ TODO 5b BEGIN #################################
    (score_grad,) = torch.autograd.grad(target_score, image_in)
    # The regularizer gradient is written in closed form. The operands are
    # detached so that the result does not carry graph history.
    pull_to_original = image_in.detach() - orig_data.detach()
    grad = -score_grad + reg_lambda * pull_to_original
    ############################ TODO 5b END #################################
    assert tuple(grad.shape) == (1, 3, 224, 224) # expected shape
    return grad


## **1. AlexNet: Visualizing Structure**
The AlexNet model consists of 5 convolutional layers and three fully connected layers. Each of these layers is encapsulated in an nn.Module class that has one or more trainable parameters.

We can look at the shapes of each of the model parameters using the below cell:

In [None]:
model_num_params = 0

# Count every parameter of the model with a single call (this assignment
# replaces the 0 set above)
model_num_params = get_n_params(model)
print(f"# params in AlexNet: {model_num_params}")

# Loop through the select modules that we've named. Only modules that have a
# `weight` attribute are printed; the others are skipped by the hasattr test.
for simple_name, module in model.shortcut_modules():
  if hasattr(module, "weight"):
    print(f"Module {simple_name} weights: {module.weight.size()} bias: {module.bias.size()}")

## **Visualizing conv1 filters**
Filters in conv1 are unique in that they take RGB images as input. This means that we can visualize them as RGB images. For all other layers, we cannot view them as nice little colored squares because they are much higher dimensional.

In [None]:
def vis_square(data, title=None):
    """Show a batch of small images tiled into one roughly square grid.

    :param data: Tensor of shape (n, K, height, width). With K <= 3 each of
        the n entries is drawn as one K-channel tile; with K > 3 every channel
        of every entry becomes its own single-channel tile.
    :param title: optional figure title
    """
    if data.size(1) > 3:
        # Too many channels for a colour image: one tile per channel
        data = data.view(-1, 1, data.size(2), data.size(3))

    tiles = to_numpy_image(data)

    # Rescale all values jointly into [0, 1] for display
    lowest, highest = tiles.min(), tiles.max()
    tiles = (tiles - lowest) / (highest - lowest)

    # Grid side length: smallest n with n * n >= number of tiles
    n = int(np.ceil(np.sqrt(tiles.shape[0])))
    pad_widths = (
        (0, n ** 2 - tiles.shape[0]),  # blank tiles to fill the last row
        (0, 2),                        # 2-pixel gap below each tile
        (0, 2),                        # 2-pixel gap right of each tile
    ) + ((0, 0),) * (tiles.ndim - 3)   # trailing axes (channels) are not padded
    tiles = np.pad(tiles, pad_widths, mode='constant', constant_values=1)  # ones = white

    # (n*n, h, w, c) -> (n, n, h, w, c) -> (n, h, n, w, c) -> (n*h, n*w, c)
    axis_order = (0, 2, 1, 3) + tuple(range(4, tiles.ndim + 1))
    grid = tiles.reshape((n, n) + tiles.shape[1:]).transpose(axis_order)
    grid = grid.reshape((n * grid.shape[1], n * grid.shape[3]) + grid.shape[4:])

    plt.figure(figsize=(8, 8))
    plt.imshow(grid.squeeze())
    plt.axis('off')
    if title:
        plt.title(title)
      

# features[0] is the first convolution (3 input channels), so each filter can
# be drawn as a small colour image
weights = model.features[0].weight.data
vis_square(weights, title="Visualizing filters in conv1")

## **Visualizing AlexNet activations.**
PyTorch uses dynamic computational graphs. As a result of this, there is no way for us to directly access the intermediate activations as there is no fixed placeholder for them. In PyTorch, we can access intermediate activations using hooks. A forward hook is a function that gets called every time that the forward method on a module has been executed. To access intermediate activations, we register a forward hook that displays the activations.

Let's take an example image and pass it through AlexNet.

Visualize the activations inside all of these layers, using the code below. Lighter values have higher magnitude, and darker values have smaller magnitude.

In [None]:
# Load one example from the dog dataset (item 10). Indexing returns the
# normalized, cropped image tensor together with its label.
example_image, label = dog_dataset[10]
show_image(example_image)

In [None]:
import functools

# Re-define the model to clear any previously registered hooks: the hooks
# registered below are never removed, so re-running this cell on the same
# instance would add them again.
model = alexnet(pretrained=True, num_classes=1000)

# Define a hook that visualizes a layer output
def show_activations_hook(name, module, input, output):
  """Forward hook that plots the output of a layer.

  :param name: label used in the printout and the plot title (bound with
      functools.partial when the hook is registered)
  :param module: the module that produced `output` (unused)
  :param input: the module's input (unused)
  :param output: the module's output tensor
  """
  print(f"Visualizing layer: {name}")

  if output.dim() != 4:
    # Not an image-shaped (BxCxHxW) output, e.g. a linear layer: flatten it
    # and draw the activations as a line plot
    flat = output.data.view([-1]).cpu().numpy()
    plt.figure(figsize=(15, 3))
    plt.plot(flat)
    plt.title(f"Activations on: {name}")
    return

  # Conv/relu outputs (BxCxHxW) are tiled into a grid of images
  vis_square(output, f"Activations on: {name}")
  
# Register the hook on the select set of modules. functools.partial fixes the
# first argument (name), leaving the (module, input, output) signature that
# forward hooks are called with.
for name, module in model.shortcut_modules():
  hook = functools.partial(show_activations_hook, name)
  module.register_forward_hook(hook)
  
# PyTorch modules work on minibatches and expect the first axis to be the batch axis
# If we run a model on a single image, we must turn this into a batch of size 1
model_input = Variable(example_image[np.newaxis, ...])

# Run the forward pass on the model (this triggers every registered hook);
# [0] selects the scores of the single image in the batch
class_activations = model(model_input)[0]

## **?? Question ??**
Consider the visualizations produced by the above cell. Why does fc8 have negative values, but fc6 and fc7 are only positive?

HINT: Look at the structure of the network above, and our selection of named layers in AlexNet.module_shortcuts.

## **Looking up the class names**
The AlexNet model above outputs unconstrained class "scores". To turn these scores into a valid probability distribution over the 1000 ImageNet classes, we apply the softmax activation. We can find the index of the class with the maximum score and map that back to a description of the class in words.

In [None]:
# Load the example image: item 10 of the dog dataset (the same item used in
# the activation visualization above)
example_image, label = dog_dataset[10]
show_image(example_image)

# Re-define the model to clear any previously registered hooks
model = alexnet(pretrained=True, num_classes=1000)

# Set to eval mode to disable dropout
model.eval()

# Add the batch axis expected by the model
model_input = example_image[np.newaxis, ...]

# Work on a copy of the tensor rather than a view of the dataset's output
model_input = model_input.clone()

# Run the forward pass on the model
class_activations = model(Variable(model_input))[0]

# Compute class probabilities (softmax over the class axis of the 1-D score vector)
class_probs = F.softmax(class_activations, dim=0)

# Get the class index (and the probability of that class)
prob, class_idx = torch.max(class_probs, 0)

# Take the integer index out of the variable and tensor
class_idx = class_idx.data.item()

predicted_class_name = labels_id2word[class_idx]
print(f"Predicted class: {class_idx} - {predicted_class_name} with probability {prob.data.item()}")

## **What to expect**

We can see that this example was correctly classified, and with high confidence, despite the dog wearing a misleading costume!

## **2. Dog vs Food: Classification**
Let's classify dog vs food. We have prepared a test set of dogs dressed up like hotdogs, and hotdogs cut to look like animals. The below cell visualizes all the images in our dataset:

In [None]:
# Class names indexed by label (0 = dog, 1 = food), and the matching datasets
classes = ['dog', 'food']
class_datasets = [dog_dataset, food_dataset]

# Show the first three images of each class, titled with the class name
for cidx, cname in enumerate(classes):
  for i, (image, label) in enumerate(class_datasets[cidx]):
    if i >= 3:
      break
    show_image(image, classes[label])

## **Repurposing the ILSVRC2012 Classifier**
AlexNet was trained to recognize one of 1000 classes. We can repurpose it for our "food vs dog" task by remapping the categories.

In [None]:
# Re-define the model to clear any previously registered hooks
model = alexnet(pretrained=True, num_classes=1000)
N = len(full_dataset)
# batch_size is larger than the dataset so that everything arrives in a single
# batch; shuffle=False keeps the dataset order (dogs first, then food)
dataloader = DataLoader(
    full_dataset,
    batch_size=1024,
    shuffle=False,
    num_workers=0,
    pin_memory=False,
    drop_last=False)

assert len(dataloader) == 1, "Since batch_size is bigger than the number of examples, we should only have one batch"

# The loop body runs once. `images`, `labels_np` and `dogfood_class_probs_np`
# are read by later cells.
for batch in dataloader:
  images = batch[0]
  labels = batch[1]
  labels_np = labels.numpy()
  
  # Forward pass, then softmax over the class axis (dim=1) of the (N, 1000) scores
  class_activations = model(Variable(images))
  ilsvrc_class_probs = F.softmax(class_activations, dim=1)
  
  # Convert from Variable containing FloatTensor to numpy ndarray
  ilsvrc_class_probs_np = ilsvrc_class_probs.data.cpu().numpy()

  dogfood_class_probs_np = convert_ilsvrc2012_probs_to_dog_vs_food_probs(ilsvrc_class_probs_np)

  # Sanity checks on the student implementation: shape (N, 2), rows sum to 1
  assert list(dogfood_class_probs_np.shape) == [N, 2]
  
  np.testing.assert_almost_equal(np.sum(dogfood_class_probs_np, axis=1), np.ones(N), decimal=5)
  
  print("Seems correct!")

## **Measuring the accuracy**
You should expect to get ~90% accuracy for dogs and ~96% accuracy for food with an overall accuracy of 93%.

In [None]:
# Predicted label = column (0 dog / 1 food) with the larger probability
predicted_class = np.argmax(dogfood_class_probs_np, axis=1)
correct_mask = predicted_class == labels_np
num_correct = np.sum(correct_mask)
accuracy = 100.0 * float(num_correct) / N

print(f"Overall accuracy {accuracy} {num_correct} / {N}")

# Per-class accuracy: restrict to the images whose true label is cidx
for cidx, cname in enumerate(classes):
  cls_mask = labels_np == cidx
  predicted_cls = predicted_class[cls_mask]
  num_correct = np.sum(predicted_cls == cidx)
  cls_acc = 100.0 * float(num_correct) / cls_mask.sum()

  print(f"{cname} class accuracy {cls_acc} {num_correct} / {cls_mask.sum()}")

## **3. Dog vs Food: Visualization**
We can sort the predictions by the "dog" score and "food" score. The images are sorted according to how much AlexNet thinks the image belongs to that category.

Images are colored green/red depending on whether the prediction was correct.

For this assignment, you should expect that the incorrect predictions are all near the bottom (with the lowest score). This is a property that is very desirable -- mistakes only happen with lower scores. Note that in general real-world tasks, this does not always happen for free. If you want to be able to estimate the confidence of being correct, you need to separately predict that, and it is not easy to predict.

In [None]:
# For each class, show its images ordered by the probability assigned to that
# class (highest first), framed green when classified correctly and red otherwise
for cidx, cname in enumerate(classes):
  print(f"Predictions for class: {cname}")
  # Restrict predictions, probabilities and images to the true members of the class
  cls_mask = labels_np == cidx
  predicted_cls = predicted_class[cls_mask]
  cls_probs = dogfood_class_probs_np[cls_mask]
  correct = [p == cidx for p in predicted_cls]
  
  cls_images = [images[i] for i in range(len(images)) if cls_mask[i]]
  
  # Indices into the class-restricted arrays, sorted by descending probability
  order = get_prediction_descending_order_indices(cls_probs, cidx)
  
  print(order)
  assert len(order) == cls_mask.sum()
  
  show_images(
    images = [cls_images[i] for i in order],
    correct_list = [correct[i] for i in order],
    titles = [f"Prob: {cls_probs[i, cidx]}" for i in order],
    size=128
  )

## **4. Visualizing saliency**
Using our pre-trained AlexNet, we will compute class saliency maps as described in Section 3.1 of [2]. As mentioned in Section 2 of the paper, you should compute the gradient with respect to the image of the unnormalized class score (fc8), not of the normalized class probability (prob). You will need to use the backward method of the module to compute gradients with respect to the image.

We want to compute:$${\partial s_y \over \partial I}$$

where $s_y$ is the score for class $y$ after layer fc8, but before applying the Softmax layer.

We will then visualize the squared magnitude of this (max across color channels), to estimate the saliency of the class across the input image. See [1] for more details and intuition.

NOTE: You don't need to call model() in your function; this has already been run for you. Same for all gradient functions you implement below.

[[2] Karen Simonyan, Andrea Vedaldi, and Andrew Zisserman. "Deep Inside Convolutional Networks: Visualising Image Classification Models and Saliency Maps", ICLR Workshop 2014.](https://arxiv.org/pdf/1312.6034.pdf)

In [None]:
# Fresh pre-trained model (num_classes falls back to the AlexNet default of 1000)
model = alexnet(pretrained=True)

def visualize_saliency(image):
    """Return a saliency map for the model's top-scoring class.

    The image is run through the module-level ``model``. The gradient of the
    highest class score with respect to the input comes from
    ``compute_dscore_dimage``; it is squared element-wise and reduced with a
    max over its first axis.

    :param image: image tensor without a batch dimension
        (presumably CxHxW -- confirm against the dataset)
    :return: max over axis 0 of the squared gradient
    """
    batched = image.unsqueeze(0)
    image_in = Variable(batched, requires_grad=True)
    print(image_in.max())

    # Scores of the single image in the batch; keep only the argmax index
    cls_scores = model(image_in)[0]
    top_idx = cls_scores.max(0)[1]

    grad = compute_dscore_dimage(cls_scores, image_in, top_idx)
    print(grad.shape)

    squared = grad * grad
    return torch.max(squared, 0)[0]

class_datasets = [dog_dataset, food_dataset]

# Number of samples visualized per class
num_images = 6
for cidx, cname in enumerate(classes):

  print(f"Saliency for class: {cname}")
  in_images = []
  vis_images = []

  for i, sample in enumerate(class_datasets[cidx]):
    # Check the limit *before* processing, so exactly num_images samples are
    # shown (checking after the append displayed num_images + 1).
    if i >= num_images:
      break

    image_i = sample[0]

    vis_image = visualize_saliency(image_i)

    assert list(vis_image.shape) == [224, 224]

    # Add a leading axis so the saliency map is laid out like the input image
    vis_images.append(vis_image.unsqueeze(0))
    in_images.append(image_i)

  # One row per sample: (input image, saliency map)
  row_list = list(zip(in_images, vis_images))

  show_image_rows(row_list)

  # NOTE(review): these three values are not used in this cell; they are kept
  # only because they are notebook-level names that other cells may read.
  cls_mask = labels_np == cidx
  predicted_cls = predicted_class[cls_mask]
  cls_probs = dogfood_class_probs_np[cls_mask]

## **5. Fooling AlexNet**
For many machine learning models, it is possible to "fool" them by tweaking the image slightly so that the image is predicted to become any category [1]. Given any image, and any target class, you can perform gradient ascent to maximize the score of that target class (equivalently, gradient descent on the negative score of that class), stopping when the network confidently predicts it as the target class.

Again, maximize the score with respect to the unnormalized class score (fc8) and not the normalized class probability (prob).

In addition to maximizing the score (minimizing the negative score), also add a regularizer that penalizes the squared L2 distance between the original image and the fooling image. The final gradient will be the sum of the gradient from the regularizer and the gradient from maximizing the class score.

We can write this as a loss $L$:$$
L = -s_y(I) + R(I)
$$

where $$ R(I) = 0.5 \lambda \|I - I_\text{orig}\|_2^2, $$ $y$ is the target class, and $\lambda$ is the regularization strength.

Momentum

When optimizing functions with ConvNets, you typically use gradient descent with momentum, which has the update rule: $$V_t = \mu V_{t-1} - \alpha G$$ $$I_t = I_{t-1} + V_{t}$$

where $V$ is the velocity, $\alpha$ is the learning rate, $\mu$ is the momentum parameter, $t$ is the iteration number, and $G = \frac{\partial L}{\partial I_{t-1}}$ is the gradient.

To improve stability, we will use a slightly different update, which normalizes the gradient $G$ to have unit norm:$$V_t = \mu V_{t-1} - \alpha \frac{G}{\|G\|}$$$$I_t = I_{t-1} + V_{t}$$

The norm ${\|G\|}$ is the 2-norm of all the elements in $G$ flattened into a vector.

[1] Szegedy et al, "Intriguing properties of neural networks", ICLR 2014

In [None]:
# Build the pretrained network and switch it to eval mode in one step
# (nn.Module.eval() returns the module itself).
model = alexnet(pretrained=True).eval()

def make_fooling_image(image, target_class, learning_rate, regularization,
                       num_iter, momentum, threshold=0.9):
    """
    Perturb an image until the module-level ``model`` predicts ``target_class``.

    Each iteration scores the current image, asks ``fooling_image_gradient``
    for the gradient, and applies ``normalized_sgd_with_momentum_update``.
    The loop ends after ``num_iter`` iterations, or earlier once the softmax
    probability of the target class (measured on the image that entered the
    iteration) exceeds ``threshold``.

    :param image: starting image CxHxW tensor
    :param target_class: index of the class the image should be classified as
    :param learning_rate: forwarded unchanged to the update helper
    :param regularization: lambda parameter forwarded to the gradient helper
    :param num_iter: maximum number of iterations
    :param momentum: momentum value forwarded to the update helper
    :param threshold: target-class probability at which the loop stops
    :return: ``(fooling_image, delta)``; both keep the leading batch axis and
        ``delta`` is the fooling image minus the original
    """
    # Prepend the batch axis the model expects
    image = image[np.newaxis, ...]

    print(f"Fooling AlexNet into thinking this is a: {labels_id2word[target_class]}")

    # Untouched copy of the starting image, handed to the gradient helper
    orig_data = Variable(image.clone())

    image_in = Variable(image, requires_grad=True)
    velocity = torch.zeros_like(image_in)

    for step in range(1, num_iter + 1):
        scores = model(image_in)[0]

        # Probability of the target class as a plain float, read before the update
        target_prob = F.softmax(scores, 0)[target_class].data.item()

        grad_wrt_image = fooling_image_gradient(
            scores[target_class], orig_data, image_in, target_class, regularization)

        stepped_image, stepped_velocity = normalized_sgd_with_momentum_update(
            image_in, grad_wrt_image, velocity, momentum, learning_rate)

        # Re-wrap the raw data so the next iteration starts a fresh graph
        # instead of back-propagating through earlier iterations
        image_in = Variable(stepped_image.data, requires_grad=True)
        velocity = Variable(stepped_velocity.data, requires_grad=False)

        model.zero_grad()

        print(f"({step}/{num_iter}), {target_prob * 100} confidence")

        if target_prob > threshold:
            break

    delta = (image_in - orig_data).data
    return image_in, delta


# Number of samples fooled per class, and the class index to fool them into
num_images = 2
target_class = 113
for cidx, cname in enumerate(classes):
    dataset = class_datasets[cidx]

    images_in = []
    fooling_images = []
    deltas = []

    # `sample` rather than `input`: at notebook scope the latter would replace
    # the builtin input() for every later cell.
    for i, sample in enumerate(dataset):
        # Check the limit *before* processing, so exactly num_images samples
        # are shown (checking after the append displayed num_images + 1).
        if i >= num_images:
            break

        image_in = sample[0]

        fooling_image, delta = make_fooling_image(
            image_in,
            target_class=target_class,
            learning_rate=1e-1,
            regularization=5e-5,
            num_iter=100,
            momentum=0.9)

        # Scale the difference and shift it so that zero maps to mid-gray
        delta = 0.5 + (5.0 / 255.0) * delta

        images_in.append(image_in)
        fooling_images.append(fooling_image)
        deltas.append(delta)

    print("\nLeft: original, middle: fooling image, right: difference magnified by 5x (gray is 0).\n"
          f"AlexNet will classify the middle image in each row as {labels_id2word[target_class]!r} with high confidence")
    show_image_rows(list(zip(images_in, fooling_images, deltas)))

## **GradCam**

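Grad-CAM (Gradient-weighted Class Activation Mapping) highlights the image regions that most influence a class score. The gradient of the score with respect to the last convolutional feature maps is averaged over the spatial dimensions to give one weight per channel; the ReLU of the weighted sum of the feature maps, resized to the input resolution, is the heatmap.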

In 

In [None]:
# Mount Google Drive at /gdrive so files stored there can be read by path.
# force_remount=True asks Colab to mount again even if a mount already exists.
from google.colab import drive
drive.mount('/gdrive', force_remount=True)

In [None]:
# Blend a CAM mask over the input image and display the result
def show_cam_on_image(img, mask):
    """Display ``img`` with ``mask`` overlaid as a JET heatmap.

    :param img: image array added to the heatmap
        (presumably float BGR in [0, 1] -- confirm against the caller)
    :param mask: array scaled by 255 and colour-mapped
        (presumably HxW in [0, 1] -- confirm against the caller)
    """
    mask_u8 = np.uint8(255 * mask)
    heatmap = np.float32(cv2.applyColorMap(mask_u8, cv2.COLORMAP_JET)) / 255

    # Sum image and heatmap, then rescale so the brightest value is 1
    overlay = heatmap + np.float32(img)
    overlay = overlay / np.max(overlay)

    # Swap BGR -> RGB for matplotlib
    overlay_rgb = cv2.cvtColor(np.uint8(255 * overlay), cv2.COLOR_BGR2RGB)
    plt.imshow(overlay_rgb)
    plt.show()

# Normalize an image and pack it as a batch tensor for a pretrained model
def preprocess_image(img):
    """Convert an HxWx3 array into a normalized 1x3xHxW tensor that tracks gradients.

    The channel axis is reversed, each channel has a fixed mean subtracted and
    is divided by a fixed standard deviation, and the result is transposed to
    channel-first order with a leading batch axis. The caller's array is not
    modified.

    :param img: HxWx3 float array (presumably BGR in [0, 1], as produced by
        cv2.imread followed by a division by 255 -- confirm against the caller)
    :return: 1x3xHxW tensor with ``requires_grad`` set
    """
    means = [0.485, 0.456, 0.406]
    stds = [0.229, 0.224, 0.225]

    # Copy first so the writes below never reach the caller's data
    flipped = img.copy()[:, :, ::-1]
    for channel, (mean, std) in enumerate(zip(means, stds)):
        flipped[:, :, channel] = flipped[:, :, channel] - mean
        flipped[:, :, channel] = flipped[:, :, channel] / std

    # from_numpy cannot wrap the negatively-strided view, hence the contiguous copy
    chw = np.ascontiguousarray(np.transpose(flipped, (2, 0, 1)))
    batch = torch.from_numpy(chw).unsqueeze_(0)
    return batch.requires_grad_(True)

# Combine the gradients flowing into the last convolutional layer with that
# layer's feature maps to build the Grad-CAM mask (also used for guided Grad-CAM)
def makemask(grads_val, features, img=None, size=(224, 224)):
  """Build the Grad-CAM heatmap, display it over the image, and return it.

  Uses the ``grads_val`` and ``features`` it is given. The previous version
  overwrote both by re-running ``gradcam`` on module-level globals, and
  referenced ``img`` / ``input_img``, which are not defined in this function.

  :param grads_val: numpy array of gradients w.r.t. the feature map; indexed
      as (batch, channel, height, width), only batch entry 0 is used
  :param features: feature-map tensor with the same layout; only batch entry 0
      is used
  :param img: optional image to draw the heatmap over. When omitted it is
      loaded from the module-level ``input_path`` and resized to ``size``
  :param size: ``(width, height)`` the heatmap is resized to; the default
      matches the 224x224 resize applied to the model input elsewhere in
      this notebook
  :return: float heatmap of shape ``size[::-1]``, scaled to [0, 1] unless it
      is all zero
  """
  target = features.cpu().data.numpy()[0, :]

  # Per-channel weight = spatial mean of that channel's gradient
  weights = np.mean(grads_val, axis=(2, 3))[0, :]
  cam = np.zeros(target.shape[1:], dtype=np.float32)

  for i, w in enumerate(weights):
    cam += w * target[i, :, :]

  # Keep only positive evidence, then bring the map to the display resolution
  cam = np.maximum(cam, 0)
  cam = cv2.resize(cam, tuple(size))
  cam = cam - np.min(cam)

  # An all-zero map would otherwise turn into NaNs
  peak = np.max(cam)
  if peak > 0:
    cam = cam / peak

  if img is None:
    # Fall back to the notebook-level image path, as the other helpers do
    img = cv2.imread(input_path, 1)
    if img is None:
      raise FileNotFoundError(f"could not read image: {input_path}")
    img = np.float32(cv2.resize(img, tuple(size))) / 255

  show_cam_on_image(img, cam)

  return cam

# Turn a raw guided-backprop / guided Grad-CAM array into a displayable uint8 image
def deprocess_image(img):
    """Standardize ``img``, squeeze it around 0.5 and convert it to uint8.

    see https://github.com/jacobgil/keras-grad-cam/blob/master/grad-cam.py#L65

    :param img: numeric array of any shape
    :return: uint8 array of the same shape with values in [0, 255]
    """
    centered = img - np.mean(img)
    # The small epsilon keeps a constant image from dividing by zero
    standardized = centered / (np.std(centered) + 1e-5)

    # Scale to a standard deviation of 0.1 around 0.5, then cut off the tails
    shifted = standardized * 0.1 + 0.5
    clipped = np.clip(shifted, 0, 1)
    return np.uint8(clipped * 255)

# Take the model, the Grad-CAM mask and the input image's path, then display
# guided Grad-CAM followed by guided backprop
def Guide_gradcam(model, mask, input_path):
    """Display guided Grad-CAM and guided backprop for the model's top class.

    The ReLUs of ``model`` are replaced via ``changerelu`` -- the model is
    modified in place and stays modified after this call. The gradient of the
    top class score with respect to the input image is the guided-backprop
    map; multiplying it by ``mask`` gives guided Grad-CAM.

    :param model: network to explain; mutated by ``changerelu``
    :param mask: HxW Grad-CAM heatmap matching the 224x224 input
        (the value returned by ``makemask``)
    :param input_path: path of the image file to load
    :raises FileNotFoundError: if the image cannot be read
    """
    img = cv2.imread(input_path, 1)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising
        raise FileNotFoundError(f"could not read image: {input_path}")
    img = np.float32(cv2.resize(img, (224, 224))) / 255
    input_img = preprocess_image(img)

    model.eval()

    changerelu(model)

    # Back-propagate only the highest class score down to the input image
    output = model(input_img)
    index = int(np.argmax(output.cpu().data.numpy()))
    output[0, index].backward()

    # Gradient w.r.t. the input, rearranged from CxHxW to HxWxC
    gb = input_img.grad.cpu().data.numpy()[0, :, :, :]
    gb = gb.transpose((1, 2, 0))

    # Replicate the single-channel mask across the three colour channels.
    # (The original read an undefined name `cam` here instead of `mask`.)
    cam_mask = cv2.merge([mask, mask, mask])
    cam_gb = deprocess_image(cam_mask * gb)
    gb = deprocess_image(gb)

    plt.imshow(cv2.cvtColor(cam_gb, cv2.COLOR_BGR2RGB))
    plt.show()
    plt.imshow(cv2.cvtColor(gb, cv2.COLOR_BGR2RGB))
    plt.show()

In [None]:
# Gradients captured by hook_func, appended in the order the hooks fire.
# NOTE(review): this list is module-level state shared by every call.
save_grad = []

# To read the gradient of an intermediate tensor, register a hook on it with
# Tensor.register_hook; hook_func below is the callback handed to register_hook.
def hook_func(grad):
  """Append ``grad`` to the module-level ``save_grad`` list and return it unchanged."""
  save_grad.append(grad)
  return grad

def gradcam(model, input_path):
  """Run the model on an image and capture what Grad-CAM needs.

  The forward pass is executed child by child so that a gradient hook can be
  attached to the output of sub-block "2" of ``model.layer4``. The top class
  score is then back-propagated and the gradient recorded by the hook is
  returned together with the hooked feature map.

  :param model: network exposing ``layer4`` and an ``avgpool`` child
      (e.g. the torchvision ResNet used in this notebook)
  :param input_path: path of the image file to load
  :return: ``(grads_val, features)`` -- the gradient w.r.t. the hooked
      feature map as a numpy array, and the feature-map tensor itself
  :raises FileNotFoundError: if the image cannot be read
  :raises RuntimeError: if the target sub-block is not found in ``layer4``
  """
  img = cv2.imread(input_path, 1)
  if img is None:
    # cv2.imread signals failure by returning None instead of raising
    raise FileNotFoundError(f"could not read image: {input_path}")
  img = np.float32(cv2.resize(img, (224, 224))) / 255
  input_img = preprocess_image(img)
  x = input_img

  #### PA3 implementation start####
  ### You should take the gradient information flowing into the last convolutional layer of the CNN
  feature_module = model.layer4
  model.eval()
  target_activation = "2"

  # Discard gradients recorded by earlier calls; otherwise the value read
  # below could belong to a previous image or model.
  save_grad.clear()

  features = None
  for name, module in model._modules.items():
    if module is feature_module:
      for name2, module2 in feature_module._modules.items():
        x = module2(x)
        # Exact match: a substring test would also match unrelated names
        if name2 == target_activation:
          x.register_hook(hook_func)
          features = x
    elif "avgpool" in name.lower():
      # Flatten the pooled map before it reaches the classifier
      x = module(x)
      x = x.view(x.size(0), -1)
    else:
      x = module(x)

  output = x

  if features is None:
    raise RuntimeError(
        f"sub-module {target_activation!r} not found in model.layer4")

  # features : intermediate feature map
  # output : model's output

  # Back-propagate only the highest class score
  index = int(np.argmax(output.cpu().data.numpy()))

  feature_module.zero_grad()
  model.zero_grad()
  output[0, index].backward(retain_graph=True)

  ## grads_val : intermediate feature map's gradient
  # The hook fired during this backward pass, so the newest entry is ours
  grads_val = save_grad[-1].cpu().data.numpy()

  #### PA3 implementation end ####

  return grads_val, features


# To obtain guided backprop and guided Grad-CAM, the ReLU in the pretrained
# model has to be replaced with a custom autograd function, as described in
# the Grad-CAM paper.

# The class below is that custom function.
# Source : https://pytorch.org/tutorials/beginner/examples_autograd/two_layer_net_custom_function.html
class MyReLu(Function):
    """ReLU whose backward pass also blocks non-positive incoming gradients.

    Forward is a regular ReLU. Backward lets a gradient through only where
    the forward input was positive AND the incoming gradient is positive.
    """

    @staticmethod
    def forward(ctx, x):
        keep = (x > 0).type_as(x)
        # 0 + x * keep: zero wherever the input is not positive
        output = torch.addcmul(torch.zeros_like(x), x, keep)
        ctx.save_for_backward(x, output)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        x, _ = ctx.saved_tensors

        forward_gate = (x > 0).type_as(grad_output)
        grad_gate = (grad_output > 0).type_as(grad_output)

        # First gate by the forward activation, then by the gradient's sign
        gated = torch.addcmul(torch.zeros_like(x), grad_output, forward_gate)
        return torch.addcmul(torch.zeros_like(x), gated, grad_gate)

# Module wrapper so the guided ReLU can live inside a model like any other layer
class _GuidedReLU(nn.Module):
  """nn.Module that applies the custom ``MyReLu`` autograd function."""

  def forward(self, x):
    return MyReLu.apply(x)


# Replace the ReLU layers of a model with the guided ReLU defined above
def changerelu(model):
  """Recursively swap every ReLU child of ``model`` for a guided ReLU, in place.

  A child is replaced when it is registered under the name ``'relu'`` or is
  an ``nn.ReLU`` instance. The replacement is a real ``nn.Module``: storing a
  bare function in ``_modules`` (as before) broke later calls that walk the
  module tree, such as ``model.eval()`` and ``model.zero_grad()``, and made a
  second ``changerelu`` call crash.

  :param model: module to modify; already-replaced children are left alone,
      so calling this more than once is safe
  """
  for name, module in model._modules.items():
    # Skip empty slots and layers swapped by an earlier call
    if module is None or isinstance(module, _GuidedReLU):
      continue
    changerelu(module)
    if name == 'relu' or isinstance(module, nn.ReLU):
      # Only the value of an existing key changes, so mutating the dict while
      # iterating over it is safe
      model._modules[name] = _GuidedReLU()


In [None]:
# Load a pretrained ResNet-50 and point at the test image on the mounted Drive
model = torchvision.models.resnet50(pretrained=True)
input_path = '/gdrive/My Drive/datas/both.png'

# Gradient and activations of the hooked feature map for the top class
grads_val, features = gradcam(model, input_path)

# Grad-CAM heatmap computed (and displayed) from those two values
mask = makemask(grads_val, features)

# Guided Grad-CAM and guided backprop visualizations
Guide_gradcam(model, mask, input_path)