In [1]:
### PREAMBLE
# Chapter 4 - Adversarial training, solving the outer minimization
# linear.svg

import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.colors import LightSource

%matplotlib inline
%config InlineBackend.figure_format = 'svg'

## **DATASET**

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Directory that torchvision downloads the CIFAR-10 archive into and reads it from.
data_path = "./data"

# Train and test splits, both converted with ToTensor.
cifar_train = datasets.CIFAR10(
    root=data_path,
    train=True,
    download=True,
    transform=transforms.ToTensor(),
)
cifar_test = datasets.CIFAR10(
    root=data_path,
    train=False,
    download=True,
    transform=transforms.ToTensor(),
)
# Shape of the first raw training image as stored by the dataset object.
print(cifar_train.data[0].shape)

# Mini-batches of 100; only the training stream is shuffled.
train_loader = DataLoader(dataset=cifar_train, batch_size=100, shuffle=True)
test_loader = DataLoader(dataset=cifar_test, batch_size=100, shuffle=False)

# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


  0%|          | 0/170498071 [00:00<?, ?it/s]

Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified
(32, 32, 3)


## Attack strategies for Adversarial example generation

In [3]:
def fgsm(model, X, y, epsilon=0.1):
    """Single-step sign-gradient (FGSM) perturbation for the batch X.

    A zero perturbation is added to X, the cross-entropy loss of
    ``model(X + perturbation)`` against ``y`` is back-propagated once, and
    the sign of the gradient with respect to the perturbation is scaled by
    ``epsilon``.

    Returns:
        A detached tensor shaped like X holding ``epsilon * sign(gradient)``.

    Note:
        The backward pass also accumulates into ``.grad`` of every model
        parameter that requires gradients.
    """
    perturbation = torch.zeros_like(X, requires_grad=True)
    criterion = nn.CrossEntropyLoss()
    logits = model(X + perturbation)
    criterion(logits, y).backward()
    grad_sign = perturbation.grad.detach().sign()
    return epsilon * grad_sign

def pgd(model, X, y, epsilon=0.1, alpha=0.01, num_iter=20):
    """Projected gradient ascent on the cross-entropy loss using raw gradients.

    Starting from a zero perturbation, each of the ``num_iter`` iterations
    adds ``X.shape[0] * alpha`` times the gradient of the loss with respect
    to the perturbation, then clamps every entry to ``[-epsilon, epsilon]``.
    The batch-size factor presumably compensates for the batch averaging of
    the default ``nn.CrossEntropyLoss`` reduction -- confirm if this matters.

    Returns:
        The final perturbation, detached, with the same shape as X.
    """
    delta = torch.zeros_like(X, requires_grad=True)
    criterion = nn.CrossEntropyLoss()
    batch_size = X.shape[0]
    for _ in range(num_iter):
        criterion(model(X + delta), y).backward()
        step = batch_size * alpha * delta.grad.data
        # Writing through .data updates the leaf tensor without recording the op.
        delta.data = (delta + step).clamp(-epsilon, epsilon)
        delta.grad.zero_()
    return delta.detach()

def pgd_linf(model, X, y, epsilon=0.1, alpha=0.01, num_iter=20, randomize=False):
    """Sign-gradient PGD inside an l_inf box of radius ``epsilon``.

    The perturbation starts at zero, or -- when ``randomize`` is truthy -- at
    a uniform sample rescaled from ``[0, 1)`` to ``[-epsilon, epsilon)``.
    Each of the ``num_iter`` iterations moves every entry by ``alpha`` in the
    direction of the sign of the cross-entropy gradient and clamps the result
    back to ``[-epsilon, epsilon]``.

    Returns:
        The final perturbation, detached, with the same shape as X.
    """
    if not randomize:
        delta = torch.zeros_like(X, requires_grad=True)
    else:
        delta = torch.rand_like(X, requires_grad=True)
        delta.data = delta.data * 2 * epsilon - epsilon

    criterion = nn.CrossEntropyLoss()
    for _ in range(num_iter):
        criterion(model(X + delta), y).backward()
        signed_step = alpha * delta.grad.detach().sign()
        delta.data = (delta + signed_step).clamp(-epsilon, epsilon)
        delta.grad.zero_()
    return delta.detach()

def pgd_linf_targ2(model, X, y, y_targ=0, epsilon=0.1, alpha=0.01, num_iter=20):
    """Targeted sign-gradient PGD inside an l_inf box of radius ``epsilon``.

    The objective being ascended is ``2 * logits[:, y_targ].sum() -
    logits.sum()``, i.e. the target-class logit minus the sum of all other
    logits, totalled over the batch. ``y`` is accepted so the signature
    matches the other attacks but is not read.

    Returns:
        The final perturbation, detached, with the same shape as X.
    """
    delta = torch.zeros_like(X, requires_grad=True)
    for _ in range(num_iter):
        logits = model(X + delta)
        target_total = logits[:, y_targ].sum()
        objective = 2 * target_total - logits.sum()
        objective.backward()
        signed_step = alpha * delta.grad.detach().sign()
        delta.data = (delta + signed_step).clamp(-epsilon, epsilon)
        delta.grad.zero_()
    return delta.detach()

def norms(Z):
    """Per-example l2 norm of ``Z``, shaped so it broadcasts against ``Z``.

    Every dimension after the first is flattened and reduced with the
    2-norm. The result keeps the leading (batch) dimension and has size 1 in
    every other dimension, so a ``(B, C, H, W)`` input yields ``(B, 1, 1, 1)``
    and a ``(B, D)`` input yields ``(B, 1)``.

    ``reshape`` is used instead of ``view`` so that non-contiguous inputs
    (e.g. permuted tensors) are accepted as well.
    """
    batch = Z.shape[0]
    per_example = Z.reshape(batch, -1).norm(dim=1)
    # One singleton axis per trailing dimension of Z.
    return per_example.reshape(batch, *([1] * (Z.dim() - 1)))


def pgd_l2(model, X, y, epsilon=0.1, alpha=0.01, num_iter=20):
    """PGD with per-example l2-normalised steps and an l2 ball of radius ``epsilon``.

    Each of the ``num_iter`` iterations:
      1. back-propagates the cross-entropy loss of ``model(X + delta)``,
      2. moves ``delta`` by ``alpha`` along the gradient divided by its
         per-example l2 norm,
      3. clips ``delta`` so that ``X + delta`` stays in ``[0, 1]``,
      4. rescales any example whose perturbation norm exceeds ``epsilon``
         back onto the ball.

    Returns:
        The final perturbation, detached, with the same shape as X.
    """
    delta = torch.zeros_like(X, requires_grad=True)
    for _ in range(num_iter):
        loss = nn.CrossEntropyLoss()(model(X + delta), y)
        loss.backward()
        grad = delta.grad.detach()
        grad_norm = norms(grad)
        # An example whose gradient is exactly zero would otherwise produce
        # 0/0 = NaN and poison delta for all later iterations; dividing by 1
        # instead leaves that example's step at zero. Non-zero norms are
        # used unchanged.
        safe_norm = torch.where(grad_norm > 0, grad_norm, torch.ones_like(grad_norm))
        delta.data += alpha * grad / safe_norm
        delta.data = torch.min(torch.max(delta.detach(), -X), 1 - X)  # clip X+delta to [0,1]
        # clamp(min=epsilon) makes the factor 1 for perturbations already
        # inside the ball, so only over-long ones are shrunk.
        delta.data *= epsilon / norms(delta.detach()).clamp(min=epsilon)
        delta.grad.zero_()

    return delta.detach()

In [4]:
def epoch(loader, model, opt=None):
    """Run one standard training or evaluation pass over ``loader``.

    Args:
        loader: iterable of ``(X, y)`` batches that also exposes ``.dataset``.
        model: callable mapping a batch ``X`` to class scores.
        opt: optimizer; when given, every batch performs ``zero_grad``,
            ``backward`` and ``step``. When omitted the pass is
            evaluation-only.

    Returns:
        ``(error_rate, mean_loss)``: the number of misclassified examples and
        the batch-size-weighted sum of cross-entropy losses, each divided by
        ``len(loader.dataset)``.
    """
    training = bool(opt)
    total_loss, total_err = 0., 0.
    for X, y in loader:
        X, y = X.to(device), y.to(device)
        # Only record the autograd graph when it will actually be
        # back-propagated; evaluation needs just the numbers.
        with torch.set_grad_enabled(training):
            yp = model(X)
            loss = nn.CrossEntropyLoss()(yp, y)
        if training:
            opt.zero_grad()
            loss.backward()
            opt.step()

        total_err += (yp.max(dim=1)[1] != y).sum().item()
        # loss is a batch mean, so weight it by the batch size before summing.
        total_loss += loss.item() * X.shape[0]
    return total_err / len(loader.dataset), total_loss / len(loader.dataset)

def epoch_adversarial(loader, model, attack, opt=None, **kwargs):
    """Run one adversarial training or evaluation pass over ``loader``.

    For every batch a perturbation is obtained from
    ``attack(model, X, y, **kwargs)`` and the model is scored (and, when an
    optimizer is given, trained) on ``X + delta``.

    Args:
        loader: iterable of ``(X, y)`` batches that also exposes ``.dataset``.
        model: callable mapping a batch to class scores.
        attack: callable ``(model, X, y, **kwargs) -> delta``.
        opt: optimizer; when omitted the pass is evaluation-only.
        **kwargs: forwarded unchanged to ``attack``.

    Returns:
        ``(error_rate, mean_loss)`` on the perturbed inputs, each normalised
        by ``len(loader.dataset)``.
    """
    training = bool(opt)
    total_loss, total_err = 0., 0.
    for X, y in loader:
        X, y = X.to(device), y.to(device)
        # The attack is called outside the grad-mode guard below because it
        # may need gradients even during evaluation.
        delta = attack(model, X, y, **kwargs)
        # Only record the autograd graph when it will be back-propagated.
        with torch.set_grad_enabled(training):
            yp = model(X + delta)
            loss = nn.CrossEntropyLoss()(yp, y)
        if training:
            # zero_grad also discards any parameter gradients the attack's
            # own backward passes may have accumulated.
            opt.zero_grad()
            loss.backward()
            opt.step()

        total_err += (yp.max(dim=1)[1] != y).sum().item()
        # loss is a batch mean, so weight it by the batch size before summing.
        total_loss += loss.item() * X.shape[0]
    return total_err / len(loader.dataset), total_loss / len(loader.dataset)

## Model Creation & Adversarial training

In [19]:
from torchvision.models import resnet50

class Flatten(nn.Module):
    """Collapse every dimension after the first into a single one."""

    def forward(self, x):
        # reshape rather than view: view raises on non-contiguous inputs
        # (e.g. after permute/transpose), reshape copies only when it must.
        return x.reshape(x.shape[0], -1)

def get_cnn_model():
    """Build the convolutional classifier and move it to ``device``.

    The global torch RNG is re-seeded with 0 first, so the layers are
    initialised from the same seed on every call. The stack is four 3x3
    convolutions (the second and fourth with stride 2), a flatten step, and
    two linear layers ending in 10 outputs. The 4096 input features of the
    first linear layer equal 64 channels * 8 * 8, which is what the two
    stride-2 convolutions leave of a 32x32 input.
    """
    torch.manual_seed(0)
    layers = [
        nn.Conv2d(3, 32, 3, padding=1), nn.ReLU(),
        nn.Conv2d(32, 32, 3, padding=1, stride=2), nn.ReLU(),
        nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(),
        nn.Conv2d(64, 64, 3, padding=1, stride=2), nn.ReLU(),
        Flatten(),
        nn.Linear(4096, 100), nn.ReLU(),
        nn.Linear(100, 10),
    ]
    network = nn.Sequential(*layers)
    return network.to(device)

In [20]:
from tqdm import tqdm

def adversarial_training(model_func, attack, ckpt_name, n_epochs=5, **kwargs):
    """Adversarially train a fresh model and save its weights.

    Args:
        model_func: zero-argument callable returning the model to train.
        attack: callable ``(model, X, y, **kwargs) -> delta`` used both to
            craft the training perturbations and to evaluate robustness.
        ckpt_name: path the final ``state_dict`` is written to.
        n_epochs: number of passes over ``train_loader``.
        **kwargs: forwarded unchanged to ``attack``.

    Every epoch trains on perturbed training batches, then measures the
    clean and the adversarial error on ``test_loader``. Only the error rates
    of the last epoch are printed. Nothing is returned.
    """
    model = model_func()
    opt = optim.SGD(model.parameters(), lr=1e-1)
    # Bound up front so the summary line can still be printed (as None)
    # when n_epochs is 0 and the loop body never runs.
    train_err = test_err = adv_err = None
    for t in tqdm(range(n_epochs)):
        train_err, _ = epoch_adversarial(train_loader, model, attack, opt, **kwargs)
        test_err, _ = epoch(test_loader, model)
        adv_err, _ = epoch_adversarial(test_loader, model, attack, **kwargs)
        if t == 4:
            # Learning rate drops to 1e-2 once the fifth epoch has finished.
            # With the default n_epochs=5 no epoch follows, so the drop only
            # takes effect for longer runs.
            for param_group in opt.param_groups:
                param_group["lr"] = 1e-2
    print(f'Misclassification Rate: Train = {train_err}, Test = {test_err}, Adversarial = {adv_err}')
    torch.save(model.state_dict(), ckpt_name)

### FGSM-based Adversarial Example Generation & Training

In [21]:
# FGSM adversarial training using the attack's default epsilon (0.1).
print("FGSM: epsilon=0.1")
adversarial_training(
    model_func=get_cnn_model,
    attack=fgsm,
    ckpt_name="model_cnn_robust.pt",
)

FGSM: epsilon=0.1


100%|██████████| 5/5 [01:18<00:00, 15.78s/it]

Misclassification Rate: Train = 0.90188, Test = 0.9, Adversarial = 0.9





In [22]:
# FGSM adversarial training with a smaller perturbation budget.
print("FGSM: epsilon=0.01")
adversarial_training(
    model_func=get_cnn_model,
    attack=fgsm,
    ckpt_name="model_cnn_robust.pt",
    epsilon=0.01,
)

FGSM: epsilon=0.01


100%|██████████| 5/5 [01:16<00:00, 15.27s/it]

Misclassification Rate: Train = 0.5924, Test = 0.4803, Adversarial = 0.577





In [23]:
# FGSM adversarial training with the smallest budget tried in this section.
print("FGSM: epsilon=0.001")
adversarial_training(
    model_func=get_cnn_model,
    attack=fgsm,
    ckpt_name="model_cnn_robust.pt",
    epsilon=0.001,
)

FGSM: epsilon=0.001


100%|██████████| 5/5 [01:16<00:00, 15.36s/it]

Misclassification Rate: Train = 0.44376, Test = 0.427, Adversarial = 0.4476





In [24]:
# FGSM adversarial training with epsilon=0.05.
print("FGSM: epsilon=0.05")
adversarial_training(
    model_func=get_cnn_model,
    attack=fgsm,
    ckpt_name="model_cnn_robust.pt",
    epsilon=0.05,
)

FGSM: epsilon=0.05


100%|██████████| 5/5 [01:17<00:00, 15.40s/it]

Misclassification Rate: Train = 0.78154, Test = 0.6613, Adversarial = 0.7654





In [25]:
# FGSM adversarial training with epsilon=0.02.
print("FGSM: epsilon=0.02")
adversarial_training(
    model_func=get_cnn_model,
    attack=fgsm,
    ckpt_name="model_cnn_robust.pt",
    epsilon=0.02,
)

FGSM: epsilon=0.02


100%|██████████| 5/5 [01:16<00:00, 15.35s/it]

Misclassification Rate: Train = 0.82794, Test = 0.7241, Adversarial = 0.7663





### PGD_linf based Adversarial Generation & Training

In [26]:
# l_inf PGD adversarial training with the attack's defaults
# (alpha=0.01, epsilon=0.1, num_iter=20).
print("PGD_linf: alpha=0.01, epsilon=0.1, iters=20")
adversarial_training(
    model_func=get_cnn_model,
    attack=pgd_linf,
    ckpt_name="model_cnn_robust.pt",
)

PGD_linf: alpha=0.01, epsilon=0.1, iters=20


100%|██████████| 5/5 [06:59<00:00, 83.83s/it]

Misclassification Rate: Train = 0.90176, Test = 0.9, Adversarial = 0.9





In [27]:
# l_inf PGD with a step size equal to the whole budget.
print("PGD_linf: alpha=0.1, epsilon=0.1, iters=20")
adversarial_training(
    model_func=get_cnn_model,
    attack=pgd_linf,
    ckpt_name="model_cnn_robust.pt",
    alpha=0.1,
    epsilon=0.1,
)

PGD_linf: alpha=0.1, epsilon=0.1, iters=20


100%|██████████| 5/5 [06:58<00:00, 83.71s/it]

Misclassification Rate: Train = 0.9019, Test = 0.8627, Adversarial = 0.9043





In [28]:
# l_inf PGD with alpha=0.05 and epsilon=0.05.
print("PGD_linf: alpha=0.05, epsilon=0.05, iters=20")
adversarial_training(
    model_func=get_cnn_model,
    attack=pgd_linf,
    ckpt_name="model_cnn_robust.pt",
    alpha=0.05,
    epsilon=0.05,
)

PGD_linf: alpha=0.05, epsilon=0.05, iters=20


100%|██████████| 5/5 [06:58<00:00, 83.64s/it]

Misclassification Rate: Train = 0.78204, Test = 0.6573, Adversarial = 0.7663





In [29]:
# l_inf PGD with alpha=0.001 and epsilon=0.01.
print("PGD_linf: alpha=0.001, epsilon=0.01, iters=20")
adversarial_training(
    model_func=get_cnn_model,
    attack=pgd_linf,
    ckpt_name="model_cnn_robust.pt",
    alpha=0.001,
    epsilon=0.01,
)

PGD_linf: alpha=0.001, epsilon=0.01, iters=20


100%|██████████| 5/5 [06:57<00:00, 83.58s/it]

Misclassification Rate: Train = 0.59164, Test = 0.4835, Adversarial = 0.5783





In [30]:
# l_inf PGD with the default step and budget but twice as many iterations.
print("PGD_linf: alpha=0.01, epsilon=0.1, iters=40")
adversarial_training(
    model_func=get_cnn_model,
    attack=pgd_linf,
    ckpt_name="model_cnn_robust.pt",
    alpha=0.01,
    epsilon=0.1,
    num_iter=40,
)

PGD_linf: alpha=0.01, epsilon=0.1, iters=40


100%|██████████| 5/5 [12:58<00:00, 155.61s/it]

Misclassification Rate: Train = 0.90176, Test = 0.9, Adversarial = 0.9





### PGD based Adversarial Generation & Training

In [31]:
# Raw-gradient PGD adversarial training with the attack's defaults
# (alpha=0.01, epsilon=0.1, num_iter=20).
print("PGD: alpha=0.01, epsilon=0.1, iters=20")
adversarial_training(
    model_func=get_cnn_model,
    attack=pgd,
    ckpt_name="model_dnn_robust.pt",
)

PGD: alpha=0.01, epsilon=0.1, iters=20


100%|██████████| 5/5 [06:57<00:00, 83.51s/it]

Misclassification Rate: Train = 0.49846, Test = 0.4452, Adversarial = 0.5127





In [32]:
# Raw-gradient PGD with alpha=0.01 and epsilon=0.01.
print("PGD: alpha=0.01, epsilon=0.01, iters=20")
adversarial_training(
    model_func=get_cnn_model,
    attack=pgd,
    ckpt_name="model_dnn_robust.pt",
    alpha=0.01,
    epsilon=0.01,
)

PGD: alpha=0.01, epsilon=0.01, iters=20


100%|██████████| 5/5 [06:57<00:00, 83.44s/it]

Misclassification Rate: Train = 0.49842, Test = 0.4401, Adversarial = 0.5076





In [33]:
# Raw-gradient PGD with alpha=0.1 and epsilon=0.01.
# NOTE(review): the label used to announce epsilon=0.1 while the call passed
# epsilon=0.01. The label now reports the values that are actually used; if
# epsilon=0.1 was the intended experiment, change the call instead and re-run.
print("PGD: alpha=0.1, epsilon=0.01, iters=20")
adversarial_training(get_cnn_model, pgd, "model_dnn_robust.pt", alpha=0.1, epsilon=0.01)

PGD: alpha=0.1, epsilon=0.1, iters=20


100%|██████████| 5/5 [06:56<00:00, 83.39s/it]

Misclassification Rate: Train = 0.57592, Test = 0.473, Adversarial = 0.5688





In [34]:
# Raw-gradient PGD with alpha=0.001 and epsilon=0.1.
print("PGD: alpha=0.001, epsilon=0.1, iters=20")
adversarial_training(
    model_func=get_cnn_model,
    attack=pgd,
    ckpt_name="model_dnn_robust.pt",
    alpha=0.001,
    epsilon=0.1,
)

PGD: alpha=0.001, epsilon=0.1, iters=20


100%|██████████| 5/5 [06:57<00:00, 83.48s/it]

Misclassification Rate: Train = 0.43572, Test = 0.4242, Adversarial = 0.4439





In [35]:
# Raw-gradient PGD with alpha=0.1, epsilon=0.1 and 40 iterations.
print("PGD: alpha=0.1, epsilon=0.1, iters=40")
adversarial_training(
    model_func=get_cnn_model,
    attack=pgd,
    ckpt_name="model_cnn_robust.pt",
    alpha=0.1,
    epsilon=0.1,
    num_iter=40,
)

PGD: alpha=0.1, epsilon=0.1, iters=40


100%|██████████| 5/5 [12:59<00:00, 155.82s/it]

Misclassification Rate: Train = 0.68678, Test = 0.5044, Adversarial = 0.6748





### PGD_linf_targ based Adversarial Generation & Training

In [36]:
# Targeted l_inf PGD adversarial training with the attack's defaults
# (y_targ=0, alpha=0.01, epsilon=0.1, num_iter=20).
print("PGD_linf_targ: alpha=0.01, epsilon=0.1, iters=20")
adversarial_training(
    model_func=get_cnn_model,
    attack=pgd_linf_targ2,
    ckpt_name="model_cnn_robust.pt",
)

PGD_linf_targ: alpha=0.01, epsilon=0.1, iters=20


100%|██████████| 5/5 [07:00<00:00, 84.17s/it]

Misclassification Rate: Train = 0.51882, Test = 0.6507, Adversarial = 0.5172





In [37]:
# Targeted l_inf PGD with alpha=0.01 and epsilon=0.01.
print("PGD_linf_targ: alpha=0.01, epsilon=0.01, iters=20")
adversarial_training(
    model_func=get_cnn_model,
    attack=pgd_linf_targ2,
    ckpt_name="model_cnn_robust.pt",
    alpha=0.01,
    epsilon=0.01,
)

PGD_linf_targ: alpha=0.01, epsilon=0.01, iters=20


100%|██████████| 5/5 [07:00<00:00, 84.13s/it]

Misclassification Rate: Train = 0.42814, Test = 0.4291, Adversarial = 0.4343





In [38]:
# Targeted l_inf PGD with alpha=0.1 and epsilon=0.1.
print("PGD_linf_targ: alpha=0.1, epsilon=0.1, iters=20")
adversarial_training(
    model_func=get_cnn_model,
    attack=pgd_linf_targ2,
    ckpt_name="model_cnn_robust.pt",
    alpha=0.1,
    epsilon=0.1,
)

PGD_linf_targ: alpha=0.1, epsilon=0.1, iters=20


100%|██████████| 5/5 [07:01<00:00, 84.34s/it]

Misclassification Rate: Train = 0.50142, Test = 0.5962, Adversarial = 0.4931





In [39]:
# Targeted l_inf PGD with alpha=0.001 and epsilon=0.1.
print("PGD_linf_targ: alpha=0.001, epsilon=0.1, iters=20")
adversarial_training(
    model_func=get_cnn_model,
    attack=pgd_linf_targ2,
    ckpt_name="model_cnn_robust.pt",
    alpha=0.001,
    epsilon=0.1,
)

PGD_linf_targ: alpha=0.001, epsilon=0.1, iters=20


100%|██████████| 5/5 [07:01<00:00, 84.32s/it]

Misclassification Rate: Train = 0.42712, Test = 0.4414, Adversarial = 0.4313





In [40]:
# Targeted l_inf PGD with alpha=0.01, epsilon=0.01 and 40 iterations.
print("PGD_linf_targ: alpha=0.01, epsilon=0.01, iters=40")
adversarial_training(
    model_func=get_cnn_model,
    attack=pgd_linf_targ2,
    ckpt_name="model_cnn_robust.pt",
    alpha=0.01,
    epsilon=0.01,
    num_iter=40,
)

PGD_linf_targ: alpha=0.01, epsilon=0.01, iters=40


100%|██████████| 5/5 [13:03<00:00, 156.67s/it]

Misclassification Rate: Train = 0.42416, Test = 0.4215, Adversarial = 0.4176





### PGD_l2 based Adversarial Generation & Training

In [41]:
# l2 PGD adversarial training with the attack's defaults
# (alpha=0.01, epsilon=0.1, num_iter=20).
print("PGD_l2: alpha=0.01, epsilon=0.1, iters=20")
adversarial_training(
    model_func=get_cnn_model,
    attack=pgd_l2,
    ckpt_name="model_cnn_robust.pt",
)

PGD_l2: alpha=0.01, epsilon=0.1, iters=20


100%|██████████| 5/5 [07:06<00:00, 85.22s/it]

Misclassification Rate: Train = 0.47542, Test = 0.4332, Adversarial = 0.4769





In [42]:
# l2 PGD with alpha=0.1 and epsilon=0.1.
print("PGD_l2: alpha=0.1, epsilon=0.1, iters=20")
adversarial_training(
    model_func=get_cnn_model,
    attack=pgd_l2,
    ckpt_name="model_cnn_robust.pt",
    alpha=0.1,
    epsilon=0.1,
)

PGD_l2: alpha=0.1, epsilon=0.1, iters=20


100%|██████████| 5/5 [07:06<00:00, 85.25s/it]

Misclassification Rate: Train = 0.4941, Test = 0.4542, Adversarial = 0.4949





In [43]:
# l2 PGD with alpha=0.05 and epsilon=0.05.
print("PGD_l2: alpha=0.05, epsilon=0.05, iters=20")
adversarial_training(
    model_func=get_cnn_model,
    attack=pgd_l2,
    ckpt_name="model_cnn_robust.pt",
    alpha=0.05,
    epsilon=0.05,
)

PGD_l2: alpha=0.05, epsilon=0.05, iters=20


100%|██████████| 5/5 [07:06<00:00, 85.27s/it]

Misclassification Rate: Train = 0.45476, Test = 0.4205, Adversarial = 0.449





In [44]:
# l2 PGD with alpha=0.001 and epsilon=0.01.
print("PGD_l2: alpha=0.001, epsilon=0.01, iters=20")
adversarial_training(
    model_func=get_cnn_model,
    attack=pgd_l2,
    ckpt_name="model_cnn_robust.pt",
    alpha=0.001,
    epsilon=0.01,
)

PGD_l2: alpha=0.001, epsilon=0.01, iters=20


100%|██████████| 5/5 [07:06<00:00, 85.20s/it]

Misclassification Rate: Train = 0.43354, Test = 0.4297, Adversarial = 0.4369





In [45]:
# l2 PGD with the default step and budget but 40 iterations.
print("PGD_l2: alpha=0.01, epsilon=0.1, iters=40")
adversarial_training(
    model_func=get_cnn_model,
    attack=pgd_l2,
    ckpt_name="model_cnn_robust.pt",
    alpha=0.01,
    epsilon=0.1,
    num_iter=40,
)

PGD_l2: alpha=0.01, epsilon=0.1, iters=40


100%|██████████| 5/5 [13:11<00:00, 158.29s/it]

Misclassification Rate: Train = 0.49054, Test = 0.4504, Adversarial = 0.4898



