In [2]:
### PREAMBLE
# Chapter 4 - Adversarial training, solving the outer minimization
# linear.svg

import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.colors import LightSource

%matplotlib inline
%config InlineBackend.figure_format = 'svg'

## **DATASET**

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Directory handed to torchvision as the MNIST root; files are fetched there on first use.
data_path = "./data"

# Train and test splits, each sample converted with ToTensor().
mnist_train = datasets.MNIST(
    root=data_path, train=True, download=True, transform=transforms.ToTensor()
)
mnist_test = datasets.MNIST(
    root=data_path, train=False, download=True, transform=transforms.ToTensor()
)
# Sanity check: shape of one raw image.
print(mnist_train.data[0].shape)

# Mini-batches of 100; only the training split is shuffled.
train_loader = DataLoader(mnist_train, batch_size=100, shuffle=True)
test_loader = DataLoader(mnist_test, batch_size=100, shuffle=False)

# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/raw/train-images-idx3-ubyte.gz


  0%|          | 0/9912422 [00:00<?, ?it/s]

Extracting ./data/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/raw/train-labels-idx1-ubyte.gz


  0%|          | 0/28881 [00:00<?, ?it/s]

Extracting ./data/MNIST/raw/train-labels-idx1-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw/t10k-images-idx3-ubyte.gz


  0%|          | 0/1648877 [00:00<?, ?it/s]

Extracting ./data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz


  0%|          | 0/4542 [00:00<?, ?it/s]

Extracting ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw

torch.Size([28, 28])


## Attack strategies for Adversarial example generation

In [24]:
def fgsm(model, X, y, epsilon=0.1):
    """Construct FGSM adversarial perturbations for the examples X.

    Takes one signed-gradient step of size ``epsilon`` on the cross-entropy
    loss, starting from a zero perturbation.

    Args:
        model: callable mapping a batch of inputs to class logits.
        X: input batch.
        y: class labels for ``X``, as accepted by ``nn.CrossEntropyLoss``.
        epsilon: step size; every non-zero entry of the result has this
            magnitude.

    Returns:
        Tensor shaped like ``X`` equal to ``epsilon * sign(grad)``, detached
        from the autograd graph.
    """
    delta = torch.zeros_like(X, requires_grad=True)
    loss = nn.CrossEntropyLoss()(model(X + delta), y)
    # Differentiate w.r.t. delta only: loss.backward() would also accumulate
    # gradients into every model parameter as a side effect of the attack.
    grad, = torch.autograd.grad(loss, delta)
    return epsilon * grad.sign()

def pgd(model, X, y, epsilon=0.1, alpha=0.01, num_iter=20):
    """Construct PGD adversarial perturbations using raw-gradient ascent steps.

    Starting from a zero perturbation, repeatedly steps along the gradient of
    the cross-entropy loss and clamps the perturbation elementwise to
    ``[-epsilon, epsilon]``.

    Args:
        model: callable mapping a batch of inputs to class logits.
        X: input batch; its first dimension is used as the batch size.
        y: class labels for ``X``, as accepted by ``nn.CrossEntropyLoss``.
        epsilon: elementwise bound on the perturbation.
        alpha: step size applied to the (batch-size-rescaled) gradient.
        num_iter: number of ascent steps; 0 returns an all-zero perturbation.

    Returns:
        Tensor shaped like ``X``, detached from the autograd graph.
    """
    delta = torch.zeros_like(X, requires_grad=True)
    for _ in range(num_iter):
        loss = nn.CrossEntropyLoss()(model(X + delta), y)
        # Differentiate w.r.t. delta only so the attack does not accumulate
        # gradients into the model parameters.
        grad, = torch.autograd.grad(loss, delta)
        with torch.no_grad():
            # The loss is averaged over the batch; multiplying by the batch
            # size undoes that 1/N factor in each example's gradient.
            delta.add_(X.shape[0] * alpha * grad).clamp_(-epsilon, epsilon)
    return delta.detach()

def pgd_linf(model, X, y, epsilon=0.1, alpha=0.01, num_iter=20, randomize=False):
    """Construct l_inf PGD adversarial perturbations for the examples X.

    Repeatedly steps by ``alpha`` along the sign of the cross-entropy gradient
    and clamps the perturbation elementwise to ``[-epsilon, epsilon]``.

    Args:
        model: callable mapping a batch of inputs to class logits.
        X: input batch.
        y: class labels for ``X``, as accepted by ``nn.CrossEntropyLoss``.
        epsilon: elementwise bound on the perturbation.
        alpha: size of each signed-gradient step.
        num_iter: number of steps.
        randomize: if True, start from a uniform random point in
            ``[-epsilon, epsilon)`` instead of from zero.

    Returns:
        Tensor shaped like ``X``, detached from the autograd graph.
    """
    if randomize:
        # rand_like samples from [0, 1); rescale to [-epsilon, epsilon).
        delta = (torch.rand_like(X) * 2 * epsilon - epsilon).requires_grad_(True)
    else:
        delta = torch.zeros_like(X, requires_grad=True)

    for _ in range(num_iter):
        loss = nn.CrossEntropyLoss()(model(X + delta), y)
        # Differentiate w.r.t. delta only so the attack does not accumulate
        # gradients into the model parameters.
        grad, = torch.autograd.grad(loss, delta)
        with torch.no_grad():
            delta.add_(alpha * grad.sign()).clamp_(-epsilon, epsilon)
    return delta.detach()

def pgd_linf_targ2(model, X, y, y_targ=0, epsilon=0.1, alpha=0.01, num_iter=20):
    """Construct targeted l_inf PGD adversarial perturbations for the examples X.

    Ascends, with signed-gradient steps, the objective "logit of ``y_targ``
    minus the sum of all other logits", summed over the batch, while clamping
    the perturbation elementwise to ``[-epsilon, epsilon]``.

    Args:
        model: callable mapping a batch of inputs to class logits.
        X: input batch.
        y: not used by this attack; the objective depends only on ``y_targ``.
        y_targ: index of the class whose logit is pushed up.
        epsilon: elementwise bound on the perturbation.
        alpha: size of each signed-gradient step.
        num_iter: number of steps.

    Returns:
        Tensor shaped like ``X``, detached from the autograd graph.
    """
    delta = torch.zeros_like(X, requires_grad=True)
    for _ in range(num_iter):
        yp = model(X + delta)
        # 2 * target - sum(all) == target - sum(others).
        loss = 2 * yp[:, y_targ].sum() - yp.sum()
        # Differentiate w.r.t. delta only so the attack does not accumulate
        # gradients into the model parameters.
        grad, = torch.autograd.grad(loss, delta)
        with torch.no_grad():
            delta.add_(alpha * grad.sign()).clamp_(-epsilon, epsilon)
    return delta.detach()

def norms(Z):
    """Compute norms over all but the first dimension.

    Returns one 2-norm per leading index, shaped ``(Z.shape[0], 1, ..., 1)``
    with as many trailing singleton dimensions as needed to broadcast against
    ``Z`` (``(N, 1, 1, 1)`` for a 4-D input).
    """
    per_example = Z.reshape(Z.shape[0], -1).norm(dim=1)
    return per_example.reshape(-1, *([1] * (Z.dim() - 1)))


def pgd_l2(model, X, y, epsilon=0.1, alpha=0.01, num_iter=20):
    """Construct l_2 PGD adversarial perturbations for the examples X.

    Each iteration steps by ``alpha`` along the per-example normalised
    gradient of the cross-entropy loss, clips so that ``X + delta`` stays in
    ``[0, 1]``, and rescales any perturbation whose l_2 norm exceeds
    ``epsilon`` back onto the ``epsilon`` ball.

    Args:
        model: callable mapping a batch of inputs to class logits.
        X: input batch; the clipping step assumes its values lie in [0, 1].
        y: class labels for ``X``, as accepted by ``nn.CrossEntropyLoss``.
        epsilon: per-example l_2 bound on the perturbation.
        alpha: l_2 length of each step.
        num_iter: number of steps.

    Returns:
        Tensor shaped like ``X``, detached from the autograd graph.
    """
    delta = torch.zeros_like(X, requires_grad=True)
    for _ in range(num_iter):
        loss = nn.CrossEntropyLoss()(model(X + delta), y)
        # Differentiate w.r.t. delta only so the attack does not accumulate
        # gradients into the model parameters.
        grad, = torch.autograd.grad(loss, delta)
        with torch.no_grad():
            # Guard the division: an example with an all-zero gradient would
            # otherwise produce 0/0 = NaN and poison its perturbation.
            grad_norm = norms(grad).clamp(min=torch.finfo(grad.dtype).tiny)
            delta.add_(alpha * grad / grad_norm)
            delta.copy_(torch.min(torch.max(delta, -X), 1 - X))  # clip X+delta to [0,1]
            # Shrink only perturbations longer than epsilon; shorter ones are
            # multiplied by epsilon / epsilon = 1.
            delta.mul_(epsilon / norms(delta).clamp(min=epsilon))

    return delta.detach()

In [5]:
def epoch(loader, model, opt=None):
    """Standard training/evaluation epoch over the dataset.

    Args:
        loader: iterable of ``(X, y)`` batches that also exposes ``.dataset``
            (its length is used to normalise the totals).
        model: callable mapping a batch of inputs to class logits.
        opt: optimizer; when given, one optimisation step is taken per batch,
            otherwise the model is only evaluated.

    Returns:
        Tuple ``(error_rate, mean_loss)``: misclassified examples and summed
        per-example cross-entropy, each divided by ``len(loader.dataset)``.
    """
    training = opt is not None
    criterion = nn.CrossEntropyLoss()
    total_loss, total_err = 0., 0.
    for X, y in loader:
        X, y = X.to(device), y.to(device)
        # Only build the autograd graph when an optimizer will consume it;
        # pure evaluation does not need gradients.
        with torch.set_grad_enabled(training):
            yp = model(X)
            loss = criterion(yp, y)
        if training:
            opt.zero_grad()
            loss.backward()
            opt.step()

        total_err += (yp.max(dim=1)[1] != y).sum().item()
        # loss is a batch mean; weight it by the batch size before summing.
        total_loss += loss.item() * X.shape[0]
    return total_err / len(loader.dataset), total_loss / len(loader.dataset)

def epoch_adversarial(loader, model, attack, opt=None, **kwargs):
    """Adversarial training/evaluation epoch over the dataset.

    For every batch a perturbation is produced by ``attack`` and the model is
    trained (or evaluated) on the perturbed inputs.

    Args:
        loader: iterable of ``(X, y)`` batches that also exposes ``.dataset``
            (its length is used to normalise the totals).
        model: callable mapping a batch of inputs to class logits.
        attack: callable invoked as ``attack(model, X, y, **kwargs)``; its
            result is added to ``X``.
        opt: optimizer; when given, one optimisation step is taken per batch,
            otherwise the model is only evaluated.
        **kwargs: forwarded unchanged to ``attack``.

    Returns:
        Tuple ``(error_rate, mean_loss)`` measured on the perturbed inputs,
        each total divided by ``len(loader.dataset)``.
    """
    training = opt is not None
    criterion = nn.CrossEntropyLoss()
    total_loss, total_err = 0., 0.
    for X, y in loader:
        X, y = X.to(device), y.to(device)
        # The attack is called with gradients enabled in both modes.
        delta = attack(model, X, y, **kwargs)
        # The forward pass on the perturbed batch only needs a graph when an
        # optimizer will consume the gradients.
        with torch.set_grad_enabled(training):
            yp = model(X + delta)
            loss = criterion(yp, y)
        if training:
            opt.zero_grad()
            loss.backward()
            opt.step()

        total_err += (yp.max(dim=1)[1] != y).sum().item()
        # loss is a batch mean; weight it by the batch size before summing.
        total_loss += loss.item() * X.shape[0]
    return total_err / len(loader.dataset), total_loss / len(loader.dataset)

## Model Creation & Adversarial training

In [6]:
from torchvision.models import resnet50

class Flatten(nn.Module):
    """Layer that collapses every dimension after the first into one."""

    def forward(self, x):
        """Return a view of ``x`` with shape ``(x.shape[0], -1)``."""
        batch_size = x.shape[0]
        return x.view(batch_size, -1)

def get_cnn_model():
    """Build the convolutional classifier on ``device``.

    The global torch RNG is re-seeded with 0 before the layers are created.
    The network has four 3x3 convolutions (two with stride 2) and two linear
    layers ending in 10 outputs. The first linear layer expects 7*7*64
    features, which is what a 1-channel 28x28 input yields after the two
    stride-2 convolutions.
    """
    torch.manual_seed(0)
    layers = [
        nn.Conv2d(1, 32, 3, padding=1),
        nn.ReLU(),
        nn.Conv2d(32, 32, 3, padding=1, stride=2),
        nn.ReLU(),
        nn.Conv2d(32, 64, 3, padding=1),
        nn.ReLU(),
        nn.Conv2d(64, 64, 3, padding=1, stride=2),
        nn.ReLU(),
        Flatten(),
        nn.Linear(7*7*64, 100),
        nn.ReLU(),
        nn.Linear(100, 10),
    ]
    return nn.Sequential(*layers).to(device)

def get_dnn_model():
    """Build the fully connected classifier on ``device``.

    The global torch RNG is re-seeded with 0 before the layers are created.
    The network flattens its input to 784 features and applies
    784 -> 100 -> 10 linear layers with a ReLU in between.
    """
    torch.manual_seed(0)
    layers = [
        Flatten(),
        nn.Linear(784, 100),
        nn.ReLU(),
        nn.Linear(100, 10),
    ]
    return nn.Sequential(*layers).to(device)

In [7]:
from tqdm import tqdm

def adversarial_training(model_func, attack, ckpt_name, n_epochs=5, **kwargs):
    """Adversarially train a fresh model and save its weights.

    Each epoch trains on attacked training batches, then measures the clean
    and the attacked error on the test set. The error rates of the last epoch
    are printed once training is finished.

    Args:
        model_func: zero-argument callable returning a new model.
        attack: attack callable forwarded to ``epoch_adversarial``.
        ckpt_name: path handed to ``torch.save`` for the model's
            ``state_dict``.
        n_epochs: number of epochs; with 0 the untrained model is saved and
            the printed rates are ``nan``.
        **kwargs: forwarded unchanged to ``attack`` via ``epoch_adversarial``.
    """
    model = model_func()
    opt = optim.SGD(model.parameters(), lr=1e-1)
    # Placeholders keep the summary line well-defined when the loop never runs.
    train_err = test_err = adv_err = float("nan")
    for t in tqdm(range(n_epochs)):
        train_err, _ = epoch_adversarial(train_loader, model, attack, opt, **kwargs)
        test_err, _ = epoch(test_loader, model)
        adv_err, _ = epoch_adversarial(test_loader, model, attack, **kwargs)
        # The learning rate drops to 1e-2 once the epoch with index 4 has
        # finished, so it only affects runs with n_epochs > 5.
        if t == 4:
            for param_group in opt.param_groups:
                param_group["lr"] = 1e-2
    print(f'Misclassification Rate: Train = {train_err}, Test = {test_err}, Adversarial = {adv_err}')
    torch.save(model.state_dict(), ckpt_name)

### FGSM-based Adversarial Example Generation & Training

In [8]:
print("FGSM: epsilon=0.1")
adversarial_training(get_dnn_model, fgsm, "model_cnn_robust.pt")

FGSM: epsilon=0.1


100%|██████████| 5/5 [00:45<00:00,  9.07s/it]

Misclassification Rate: Train = 0.17741666666666667, Test = 0.196, Adversarial = 0.1524





In [9]:
print("FGSM: epsilon=0.01")
adversarial_training(get_dnn_model, fgsm, "model_cnn_robust.pt", epsilon=0.01)

FGSM: epsilon=0.01


100%|██████████| 5/5 [00:43<00:00,  8.64s/it]

Misclassification Rate: Train = 0.05736666666666667, Test = 0.0368, Adversarial = 0.0559





In [10]:
print("FGSM: epsilon=0.001")
adversarial_training(get_dnn_model, fgsm, "model_cnn_robust.pt", epsilon=0.001)

FGSM: epsilon=0.001


100%|██████████| 5/5 [00:43<00:00,  8.71s/it]

Misclassification Rate: Train = 0.044566666666666664, Test = 0.0407, Adversarial = 0.043





In [11]:
print("FGSM: epsilon=0.05")
adversarial_training(get_dnn_model, fgsm, "model_cnn_robust.pt", epsilon=0.05)

FGSM: epsilon=0.05


100%|██████████| 5/5 [00:42<00:00,  8.59s/it]

Misclassification Rate: Train = 0.13816666666666666, Test = 0.0444, Adversarial = 0.133





In [12]:
print("FGSM: epsilon=0.02")
adversarial_training(get_dnn_model, fgsm, "model_cnn_robust.pt", epsilon=0.02)

FGSM: epsilon=0.02


100%|██████████| 5/5 [00:43<00:00,  8.80s/it]

Misclassification Rate: Train = 0.07285, Test = 0.0361, Adversarial = 0.0716





### PGD_linf based Adversarial Generation & Training

In [13]:
print("PGD_linf: alpha=0.01, epsilon=0.1, iters=20")
adversarial_training(get_dnn_model, pgd_linf, "model_cnn_robust.pt")

PGD_linf: alpha=0.01, epsilon=0.1, iters=20


100%|██████████| 5/5 [01:30<00:00, 18.08s/it]

Misclassification Rate: Train = 0.33775, Test = 0.0846, Adversarial = 0.3193





In [14]:
print("PGD_linf: alpha=0.1, epsilon=0.1, iters=20")
adversarial_training(get_dnn_model, pgd_linf, "model_cnn_robust.pt", alpha=0.1, epsilon=0.1)

PGD_linf: alpha=0.1, epsilon=0.1, iters=20


100%|██████████| 5/5 [01:30<00:00, 18.03s/it]

Misclassification Rate: Train = 0.34486666666666665, Test = 0.0859, Adversarial = 0.3261





In [15]:
print("PGD_linf: alpha=0.05, epsilon=0.05, iters=20")
adversarial_training(get_dnn_model, pgd_linf, "model_cnn_robust.pt", alpha=0.05, epsilon=0.05)

PGD_linf: alpha=0.05, epsilon=0.05, iters=20


100%|██████████| 5/5 [01:30<00:00, 18.03s/it]

Misclassification Rate: Train = 0.14586666666666667, Test = 0.0445, Adversarial = 0.1388





In [16]:
print("PGD_linf: alpha=0.001, epsilon=0.01, iters=20")
adversarial_training(get_dnn_model, pgd_linf, "model_cnn_robust.pt", alpha=0.001, epsilon=0.01)

PGD_linf: alpha=0.001, epsilon=0.01, iters=20


100%|██████████| 5/5 [01:30<00:00, 18.10s/it]

Misclassification Rate: Train = 0.05855, Test = 0.0383, Adversarial = 0.0574





In [17]:
print("PGD_linf: alpha=0.01, epsilon=0.1, iters=40")
adversarial_training(get_dnn_model, pgd_linf, "model_cnn_robust.pt", alpha=0.01, epsilon=0.1, num_iter=40)

PGD_linf: alpha=0.01, epsilon=0.1, iters=40


100%|██████████| 5/5 [02:19<00:00, 27.99s/it]

Misclassification Rate: Train = 0.3429333333333333, Test = 0.0849, Adversarial = 0.3259





### PGD based Adversarial Generation & Training

In [18]:
print("PGD: alpha=0.01, epsilon=0.1, iters=20")
adversarial_training(get_dnn_model, pgd, "model_dnn_robust.pt")

PGD: alpha=0.01, epsilon=0.1, iters=20


100%|██████████| 5/5 [01:29<00:00, 17.82s/it]

Misclassification Rate: Train = 0.06716666666666667, Test = 0.0346, Adversarial = 0.0667





In [19]:
print("PGD: alpha=0.01, epsilon=0.01, iters=20")
adversarial_training(get_dnn_model, pgd, "model_dnn_robust.pt", alpha=0.01, epsilon=0.01)

PGD: alpha=0.01, epsilon=0.01, iters=20


100%|██████████| 5/5 [01:28<00:00, 17.80s/it]

Misclassification Rate: Train = 0.0563, Test = 0.0367, Adversarial = 0.0561





In [20]:
# NOTE(review): the printed label says epsilon=0.1 but the call below passes
# epsilon=0.01 — one of the two is a typo; confirm which setting was intended.
print("PGD: alpha=0.1, epsilon=0.1, iters=20")
adversarial_training(get_dnn_model, pgd, "model_dnn_robust.pt", alpha=0.1, epsilon=0.01)

PGD: alpha=0.1, epsilon=0.1, iters=20


100%|██████████| 5/5 [01:28<00:00, 17.78s/it]

Misclassification Rate: Train = 0.05775, Test = 0.0371, Adversarial = 0.0568





In [21]:
print("PGD: alpha=0.001, epsilon=0.1, iters=20")
adversarial_training(get_dnn_model, pgd, "model_dnn_robust.pt", alpha=0.001, epsilon=0.1)

PGD: alpha=0.001, epsilon=0.1, iters=20


100%|██████████| 5/5 [01:29<00:00, 17.82s/it]

Misclassification Rate: Train = 0.04716666666666667, Test = 0.0402, Adversarial = 0.0464





In [23]:
# NOTE(review): this run builds its model with get_dnn_model but saves to
# "model_cnn_robust.pt", whereas the other PGD runs in this section save to
# "model_dnn_robust.pt" — confirm the intended checkpoint name.
print("PGD: alpha=0.1, epsilon=0.1, iters=40")
adversarial_training(get_dnn_model, pgd, "model_cnn_robust.pt", alpha=0.1, epsilon=0.1, num_iter=40)

PGD: alpha=0.1, epsilon=0.1, iters=40


100%|██████████| 5/5 [02:17<00:00, 27.54s/it]

Misclassification Rate: Train = 0.22413333333333332, Test = 0.0418, Adversarial = 0.2116





### PGD_linf_targ based Adversarial Generation & Training

In [25]:
print("PGD_linf_targ: alpha=0.01, epsilon=0.1, iters=20")
adversarial_training(get_dnn_model, pgd_linf_targ2, "model_cnn_robust.pt")

PGD_linf_targ: alpha=0.01, epsilon=0.1, iters=20


100%|██████████| 5/5 [01:34<00:00, 18.83s/it]

Misclassification Rate: Train = 0.04593333333333333, Test = 0.0941, Adversarial = 0.0462





In [26]:
print("PGD_linf_targ: alpha=0.01, epsilon=0.01, iters=20")
adversarial_training(get_dnn_model, pgd_linf_targ2, "model_cnn_robust.pt", alpha=0.01, epsilon=0.01)

PGD_linf_targ: alpha=0.01, epsilon=0.01, iters=20


100%|██████████| 5/5 [01:34<00:00, 18.90s/it]

Misclassification Rate: Train = 0.04298333333333333, Test = 0.041, Adversarial = 0.0416





In [27]:
print("PGD_linf_targ: alpha=0.1, epsilon=0.1, iters=20")
adversarial_training(get_dnn_model, pgd_linf_targ2, "model_cnn_robust.pt", alpha=0.1, epsilon=0.1)

PGD_linf_targ: alpha=0.1, epsilon=0.1, iters=20


100%|██████████| 5/5 [01:34<00:00, 18.86s/it]

Misclassification Rate: Train = 0.046783333333333336, Test = 0.0848, Adversarial = 0.0453





In [28]:
print("PGD_linf_targ: alpha=0.001, epsilon=0.1, iters=20")
adversarial_training(get_dnn_model, pgd_linf_targ2, "model_cnn_robust.pt", alpha=0.001, epsilon=0.1)

PGD_linf_targ: alpha=0.001, epsilon=0.1, iters=20


100%|██████████| 5/5 [01:34<00:00, 18.91s/it]

Misclassification Rate: Train = 0.04295, Test = 0.0425, Adversarial = 0.0422





In [29]:
print("PGD_linf_targ: alpha=0.01, epsilon=0.01, iters=40")
adversarial_training(get_dnn_model, pgd_linf_targ2, "model_cnn_robust.pt", alpha=0.01, epsilon=0.01, num_iter=40)

PGD_linf_targ: alpha=0.01, epsilon=0.01, iters=40


100%|██████████| 5/5 [02:27<00:00, 29.49s/it]

Misclassification Rate: Train = 0.042833333333333334, Test = 0.0402, Adversarial = 0.0416





### PGD_l2 based Adversarial Generation & Training

In [30]:
print("PGD_l2: alpha=0.01, epsilon=0.1, iters=20")
adversarial_training(get_dnn_model, pgd_l2, "model_cnn_robust.pt")

PGD_l2: alpha=0.01, epsilon=0.1, iters=20


100%|██████████| 5/5 [01:45<00:00, 21.06s/it]

Misclassification Rate: Train = 0.04945, Test = 0.0383, Adversarial = 0.0493





In [31]:
print("PGD_l2: alpha=0.1, epsilon=0.1, iters=20")
adversarial_training(get_dnn_model, pgd_l2, "model_cnn_robust.pt", alpha=0.1, epsilon=0.1)

PGD_l2: alpha=0.1, epsilon=0.1, iters=20


100%|██████████| 5/5 [01:44<00:00, 20.99s/it]

Misclassification Rate: Train = 0.04956666666666667, Test = 0.0382, Adversarial = 0.049





In [32]:
print("PGD_l2: alpha=0.05, epsilon=0.05, iters=20")
adversarial_training(get_dnn_model, pgd_l2, "model_cnn_robust.pt", alpha=0.05, epsilon=0.05)

PGD_l2: alpha=0.05, epsilon=0.05, iters=20


100%|██████████| 5/5 [01:44<00:00, 20.83s/it]

Misclassification Rate: Train = 0.0465, Test = 0.0396, Adversarial = 0.0449





In [33]:
print("PGD_l2: alpha=0.001, epsilon=0.01, iters=20")
adversarial_training(get_dnn_model, pgd_l2, "model_cnn_robust.pt", alpha=0.001, epsilon=0.01)

PGD_l2: alpha=0.001, epsilon=0.01, iters=20


100%|██████████| 5/5 [01:45<00:00, 21.11s/it]

Misclassification Rate: Train = 0.04346666666666667, Test = 0.0413, Adversarial = 0.0419





In [34]:
print("PGD_l2: alpha=0.01, epsilon=0.1, iters=40")
adversarial_training(get_dnn_model, pgd_l2, "model_cnn_robust.pt", alpha=0.01, epsilon=0.1, num_iter=40)

PGD_l2: alpha=0.01, epsilon=0.1, iters=40


100%|██████████| 5/5 [02:48<00:00, 33.75s/it]

Misclassification Rate: Train = 0.04948333333333333, Test = 0.0383, Adversarial = 0.0494



