In [None]:
### PREAMBLE
# Chapter 4 - Adversarial training, solving the outer minimization
# linear.svg

import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.colors import LightSource

%matplotlib inline
%config InlineBackend.figure_format = 'svg'

## **DATASET**

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

data_path = "/content/data"

# Fetch MNIST into data_path (downloading when missing): training split first,
# then the test split. Images are converted to tensors by ToTensor().
mnist_train, mnist_test = (
    datasets.MNIST(data_path, train=is_train, download=True, transform=transforms.ToTensor())
    for is_train in (True, False)
)

# Mini-batches of 100; only the training loader reshuffles.
train_loader = DataLoader(mnist_train, batch_size=100, shuffle=True)
test_loader = DataLoader(mnist_test, batch_size=100, shuffle=False)

# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to /content/data/MNIST/raw/train-images-idx3-ubyte.gz


  0%|          | 0/9912422 [00:00<?, ?it/s]

Extracting /content/data/MNIST/raw/train-images-idx3-ubyte.gz to /content/data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to /content/data/MNIST/raw/train-labels-idx1-ubyte.gz


  0%|          | 0/28881 [00:00<?, ?it/s]

Extracting /content/data/MNIST/raw/train-labels-idx1-ubyte.gz to /content/data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to /content/data/MNIST/raw/t10k-images-idx3-ubyte.gz


  0%|          | 0/1648877 [00:00<?, ?it/s]

Extracting /content/data/MNIST/raw/t10k-images-idx3-ubyte.gz to /content/data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to /content/data/MNIST/raw/t10k-labels-idx1-ubyte.gz


  0%|          | 0/4542 [00:00<?, ?it/s]

Extracting /content/data/MNIST/raw/t10k-labels-idx1-ubyte.gz to /content/data/MNIST/raw



## Attack strategies for Adversarial example generation

In [None]:
def fgsm(model, X, y, epsilon=0.1):
    """Build a single-step FGSM perturbation for the batch X.

    The cross-entropy loss of ``model(X + delta)`` against labels ``y`` is
    differentiated with respect to a zero-initialised ``delta``; the result is
    ``epsilon`` times the sign of that gradient, with the same shape as ``X``.

    Note: the backward pass used here also accumulates gradients on any model
    parameters that require them, so training code should zero those
    gradients before its own backward pass.
    """
    perturbation = torch.zeros_like(X, requires_grad=True)
    criterion = nn.CrossEntropyLoss()
    adv_loss = criterion(model(X + perturbation), y)
    adv_loss.backward()
    gradient_sign = perturbation.grad.detach().sign()
    return epsilon * gradient_sign

def pgd(model, X, y, epsilon=0.1, alpha=0.01, num_iter=20):
    """Projected gradient ascent on the cross-entropy loss using raw gradients.

    Starting from a zero perturbation, each of the ``num_iter`` iterations
    moves ``delta`` along the loss gradient (scaled by ``alpha`` and by the
    batch size) and then clamps every entry to ``[-epsilon, epsilon]``.

    Returns the final perturbation, detached, with the same shape as ``X``.
    """
    delta = torch.zeros_like(X, requires_grad=True)
    criterion = nn.CrossEntropyLoss()
    batch_size = X.shape[0]
    for _ in range(num_iter):
        criterion(model(X + delta), y).backward()
        # The batch-size factor presumably offsets the loss's default
        # mean-over-batch reduction so the step size is per-example.
        ascent_step = batch_size * alpha * delta.grad.data
        delta.data = (delta + ascent_step).clamp(-epsilon, epsilon)
        delta.grad.zero_()
    return delta.detach()

def pgd_linf(model, X, y, epsilon=0.1, alpha=0.01, num_iter=20, randomize=False):
    """L-infinity PGD attack: signed-gradient ascent on the cross-entropy loss.

    ``delta`` starts at zero, or at a uniform random point in
    ``[-epsilon, epsilon)`` when ``randomize`` is truthy. Each of the
    ``num_iter`` iterations adds ``alpha * sign(gradient)`` and clamps every
    entry back into ``[-epsilon, epsilon]``.

    Returns the final perturbation, detached, with the same shape as ``X``.
    """
    if not randomize:
        delta = torch.zeros_like(X, requires_grad=True)
    else:
        delta = torch.rand_like(X, requires_grad=True)
        # Rescale the U[0, 1) draw into the epsilon-box.
        delta.data = delta.data * 2 * epsilon - epsilon

    criterion = nn.CrossEntropyLoss()
    for _ in range(num_iter):
        criterion(model(X + delta), y).backward()
        signed_step = alpha * delta.grad.detach().sign()
        delta.data = (delta + signed_step).clamp(-epsilon, epsilon)
        delta.grad.zero_()
    return delta.detach()

def pgd_linf_targ2(model, X, y, y_targ, epsilon=0.1, alpha=0.01, num_iter=20):
    """Targeted L-infinity PGD attack that pushes the model towards ``y_targ``.

    The objective being maximised is twice the summed output for class
    ``y_targ`` minus the sum of all outputs. Each iteration takes a
    signed-gradient step of size ``alpha`` and clamps ``delta`` to
    ``[-epsilon, epsilon]``.

    ``y`` is accepted for signature parity with the other attacks but is not
    used by this function.

    Returns the final perturbation, detached, with the same shape as ``X``.
    """
    delta = torch.zeros_like(X, requires_grad=True)
    for _ in range(num_iter):
        outputs = model(X + delta)
        target_total = outputs[:, y_targ].sum()
        objective = 2 * target_total - outputs.sum()
        objective.backward()
        signed_step = alpha * delta.grad.detach().sign()
        delta.data = (delta + signed_step).clamp(-epsilon, epsilon)
        delta.grad.zero_()
    return delta.detach()

def norms(Z):
    """Compute the L2 norm of each sample, over all but the first dimension.

    Args:
        Z: tensor whose first dimension indexes samples.

    Returns:
        Tensor of shape ``(Z.shape[0], 1, ..., 1)`` with one singleton axis per
        trailing dimension of ``Z``, so it broadcasts against ``Z``. For 4-D
        input this is ``(N, 1, 1, 1)``.
    """
    # reshape (rather than view) also accepts non-contiguous tensors.
    per_sample = Z.reshape(Z.shape[0], -1).norm(dim=1)
    # Add as many singleton axes as Z has trailing dimensions instead of
    # assuming Z is exactly 4-D.
    return per_sample.reshape(Z.shape[0], *([1] * (Z.dim() - 1)))


def pgd_l2(model, X, y, epsilon=0.1, alpha=0.01, num_iter=20):
    """L2 PGD attack: normalised-gradient ascent on the cross-entropy loss.

    Each of the ``num_iter`` iterations:
      1. steps ``delta`` by ``alpha`` along the per-sample unit-norm gradient,
      2. clips ``delta`` so that ``X + delta`` stays inside ``[0, 1]``,
      3. rescales any sample whose perturbation norm exceeds ``epsilon`` back
         to norm ``epsilon``.

    Args:
        model: callable mapping a batch to class scores.
        X: input batch; its shape must broadcast against what ``norms``
            returns.
        y: target class indices for the cross-entropy loss.
        epsilon: radius of the L2 ball the perturbation is projected into.
        alpha: step size along the normalised gradient.
        num_iter: number of ascent iterations.

    Returns:
        The final perturbation, detached, with the same shape as ``X``.
    """
    delta = torch.zeros_like(X, requires_grad=True)
    for t in range(num_iter):
        loss = nn.CrossEntropyLoss()(model(X + delta), y)
        loss.backward()
        grad = delta.grad.detach()
        # A sample whose gradient is exactly zero (e.g. a saturated softmax)
        # would otherwise yield 0/0 = NaN and poison delta for the rest of the
        # attack; clamping the norm turns that case into a zero step.
        delta.data += alpha*grad / norms(grad).clamp(min=1e-12)
        delta.data = torch.min(torch.max(delta.detach(), -X), 1-X) # clip X+delta to [0,1]
        # Shrink only perturbations longer than epsilon; shorter ones are
        # multiplied by epsilon/epsilon = 1.
        delta.data *= epsilon / norms(delta.detach()).clamp(min=epsilon)
        delta.grad.zero_()

    return delta.detach()

In [None]:
def epoch(loader, model, opt=None):
    """Run one standard training or evaluation pass over ``loader``.

    Args:
        loader: iterable of ``(X, y)`` batches that also exposes ``.dataset``
            (its length is used to normalise the totals).
        model: callable mapping a batch to class scores.
        opt: optimizer; when given, the model is updated after every batch.
            When ``None`` the pass is evaluation-only.

    Returns:
        ``(error_rate, mean_loss)`` over ``len(loader.dataset)`` samples.
    """
    training = opt is not None
    total_loss, total_err = 0.,0.
    for X,y in loader:
        X,y = X.to(device), y.to(device)
        # Evaluation needs no autograd graph; building one only costs memory
        # and time, so gradients are tracked only when an optimizer is given.
        with torch.set_grad_enabled(training):
            yp = model(X)
            loss = nn.CrossEntropyLoss()(yp,y)
        if training:
            opt.zero_grad()
            loss.backward()
            opt.step()

        total_err += (yp.max(dim=1)[1] != y).sum().item()
        # The loss is a batch mean, so weight it by batch size before summing.
        total_loss += loss.item() * X.shape[0]
    return total_err / len(loader.dataset), total_loss / len(loader.dataset)

def epoch_adversarial(loader, model, attack, opt=None, **kwargs):
    """Run one adversarial training or evaluation pass over ``loader``.

    For every batch a perturbation is generated with
    ``attack(model, X, y, **kwargs)`` and the model is then scored (and, when
    ``opt`` is given, updated) on ``X + delta``.

    Args:
        loader: iterable of ``(X, y)`` batches that also exposes ``.dataset``
            (its length is used to normalise the totals).
        model: callable mapping a batch to class scores.
        attack: callable ``attack(model, X, y, **kwargs)`` returning a
            perturbation to add to ``X``.
        opt: optimizer; when given, the model is updated after every batch.
            When ``None`` the pass is evaluation-only.
        **kwargs: forwarded unchanged to ``attack``.

    Returns:
        ``(error_rate, mean_loss)`` over ``len(loader.dataset)`` samples.
    """
    training = opt is not None
    total_loss, total_err = 0.,0.
    for X,y in loader:
        X,y = X.to(device), y.to(device)
        # The attack differentiates with respect to its perturbation, so it
        # must run with autograd enabled even during evaluation.
        delta = attack(model, X, y, **kwargs)
        # The scoring pass only needs a graph when the model is being updated.
        with torch.set_grad_enabled(training):
            yp = model(X+delta)
            loss = nn.CrossEntropyLoss()(yp,y)
        if training:
            # zero_grad also discards parameter gradients left by the attack.
            opt.zero_grad()
            loss.backward()
            opt.step()

        total_err += (yp.max(dim=1)[1] != y).sum().item()
        # The loss is a batch mean, so weight it by batch size before summing.
        total_loss += loss.item() * X.shape[0]
    return total_err / len(loader.dataset), total_loss / len(loader.dataset)

## Model Creation & Adversarial training

In [None]:
from torchvision.models import resnet50

class Flatten(nn.Module):
    """Module that collapses every dimension after the first into one."""

    def forward(self, x):
        """Return ``x`` viewed as ``(x.shape[0], -1)``."""
        batch_size = x.shape[0]
        return x.view(batch_size, -1)

def get_cnn_model():
    """Create a freshly initialised CNN classifier on ``device``.

    The global torch RNG is re-seeded with 0 first, so every call produces the
    same initial weights (and resets the global random stream as a side
    effect). The network is four 3x3 convolutions (the 2nd and 4th with
    stride 2), a flatten step, and two linear layers ending in 10 outputs.
    The ``7*7*64`` input size of the first linear layer corresponds to
    single-channel 28x28 inputs after the two stride-2 convolutions.
    """
    torch.manual_seed(0)
    layers = [
        nn.Conv2d(1, 32, 3, padding=1), nn.ReLU(),
        nn.Conv2d(32, 32, 3, padding=1, stride=2), nn.ReLU(),
        nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(),
        nn.Conv2d(64, 64, 3, padding=1, stride=2), nn.ReLU(),
        Flatten(),
        nn.Linear(7*7*64, 100), nn.ReLU(),
        nn.Linear(100, 10),
    ]
    return nn.Sequential(*layers).to(device)

In [None]:
from tqdm import tqdm

def adversarial_training(model_func, attack, ckpt_name, n_epochs=5,
                         lr=1e-1, lr_decay_epoch=4, lr_decayed=1e-2, **kwargs):
    """Adversarially train a fresh model, report final errors, and save it.

    Uses the notebook-level ``train_loader`` and ``test_loader``. After each
    training epoch the model is evaluated on clean test data and on test data
    perturbed by the same attack. Only the last epoch's error rates are
    printed.

    Args:
        model_func: zero-argument callable returning a new model.
        attack: callable ``attack(model, X, y, **kwargs)`` returning a
            perturbation.
        ckpt_name: path the final ``state_dict`` is saved to. An existing file
            at that path is overwritten.
        n_epochs: number of training epochs; must be at least 1.
        lr: initial SGD learning rate.
        lr_decay_epoch: zero-based epoch index after which the learning rate is
            switched to ``lr_decayed``. NOTE: with the defaults
            (``n_epochs=5``, ``lr_decay_epoch=4``) the switch happens after
            the final epoch has already trained, so it never affects training;
            pass a smaller index to make the decay take effect.
        lr_decayed: learning rate used after ``lr_decay_epoch``.
        **kwargs: forwarded unchanged to ``attack``.

    Raises:
        ValueError: if ``n_epochs`` is less than 1 (there would be no metrics
            to report).
    """
    if n_epochs < 1:
        raise ValueError(f"n_epochs must be at least 1, got {n_epochs}")

    model = model_func()
    opt = optim.SGD(model.parameters(), lr=lr)
    for t in tqdm(range(n_epochs)):
        train_err, _ = epoch_adversarial(train_loader, model, attack, opt, **kwargs)
        test_err, _ = epoch(test_loader, model)
        adv_err, _ = epoch_adversarial(test_loader, model, attack, **kwargs)
        if t == lr_decay_epoch:
            for param_group in opt.param_groups:
                param_group["lr"] = lr_decayed
    print(f'Misclassification Rate: Train = {train_err}, Test = {test_err}, Adversarial = {adv_err}')
    torch.save(model.state_dict(), ckpt_name)

### FGSM-Based Adversarial Example Generation & Training

In [None]:
# FGSM adversarial training with the attack's default epsilon (0.1).
# All experiment cells save to this same checkpoint path, so the file is overwritten.
print("FGSM: epsilon=0.1")
adversarial_training(model_func=get_cnn_model, attack=fgsm, ckpt_name="model_cnn_robust.pt")

FGSM: epsilon=0.1


100%|██████████| 5/5 [01:27<00:00, 17.58s/it]

Misclassification Rate: Train = 0.03593333333333333, Test = 0.0145, Adversarial = 0.04





In [None]:
# FGSM adversarial training at epsilon=0.01 (checkpoint path is shared and overwritten).
print("FGSM: epsilon=0.01")
adversarial_training(model_func=get_cnn_model, attack=fgsm, ckpt_name="model_cnn_robust.pt",
                     epsilon=0.01)

FGSM: epsilon=0.01


100%|██████████| 5/5 [01:18<00:00, 15.79s/it]

Misclassification Rate: Train = 0.012583333333333334, Test = 0.0123, Adversarial = 0.0171





In [None]:
# FGSM adversarial training at epsilon=0.001 (checkpoint path is shared and overwritten).
print("FGSM: epsilon=0.001")
adversarial_training(model_func=get_cnn_model, attack=fgsm, ckpt_name="model_cnn_robust.pt",
                     epsilon=0.001)

FGSM: epsilon=0.001


100%|██████████| 5/5 [01:18<00:00, 15.68s/it]

Misclassification Rate: Train = 0.01015, Test = 0.0118, Adversarial = 0.0125





In [None]:
# FGSM adversarial training at epsilon=0.05 (checkpoint path is shared and overwritten).
print("FGSM: epsilon=0.05")
adversarial_training(model_func=get_cnn_model, attack=fgsm, ckpt_name="model_cnn_robust.pt",
                     epsilon=0.05)

FGSM: epsilon=0.05


100%|██████████| 5/5 [01:18<00:00, 15.75s/it]

Misclassification Rate: Train = 0.022416666666666668, Test = 0.0112, Adversarial = 0.0267





In [None]:
# FGSM adversarial training at epsilon=0.02 (checkpoint path is shared and overwritten).
print("FGSM: epsilon=0.02")
adversarial_training(model_func=get_cnn_model, attack=fgsm, ckpt_name="model_cnn_robust.pt",
                     epsilon=0.02)

FGSM: epsilon=0.02


100%|██████████| 5/5 [01:18<00:00, 15.76s/it]

Misclassification Rate: Train = 0.0145, Test = 0.0123, Adversarial = 0.0203





### PGD_linf based Adversarial Generation & Training

In [None]:
# L-infinity PGD adversarial training with the attack's defaults
# (alpha=0.01, epsilon=0.1, num_iter=20). Checkpoint path is shared and overwritten.
print("PGD_linf: alpha=0.01, epsilon=0.1, iters=20")
adversarial_training(model_func=get_cnn_model, attack=pgd_linf, ckpt_name="model_cnn_robust.pt")

PGD_linf: alpha=0.01, epsilon=0.1, iters=20


100%|██████████| 5/5 [07:31<00:00, 90.31s/it]

Misclassification Rate: Train = 0.03495, Test = 0.0154, Adversarial = 0.0403





In [None]:
# L-infinity PGD adversarial training with step size equal to the radius
# (alpha=0.1, epsilon=0.1). Checkpoint path is shared and overwritten.
print("PGD_linf: alpha=0.1, epsilon=0.1, iters=20")
adversarial_training(model_func=get_cnn_model, attack=pgd_linf, ckpt_name="model_cnn_robust.pt",
                     alpha=0.1, epsilon=0.1)

PGD_linf: alpha=0.1, epsilon=0.1, iters=20


100%|██████████| 5/5 [07:30<00:00, 90.18s/it]

Misclassification Rate: Train = 0.03501666666666667, Test = 0.0141, Adversarial = 0.04





In [None]:
# L-infinity PGD adversarial training at alpha=0.05, epsilon=0.05.
# Checkpoint path is shared and overwritten.
print("PGD_linf: alpha=0.05, epsilon=0.05, iters=20")
adversarial_training(model_func=get_cnn_model, attack=pgd_linf, ckpt_name="model_cnn_robust.pt",
                     alpha=0.05, epsilon=0.05)

PGD_linf: alpha=0.05, epsilon=0.05, iters=20


100%|██████████| 5/5 [07:33<00:00, 90.64s/it]

Misclassification Rate: Train = 0.0222, Test = 0.0122, Adversarial = 0.0274





In [None]:
# L-infinity PGD adversarial training at alpha=0.001, epsilon=0.01.
# Checkpoint path is shared and overwritten.
print("PGD_linf: alpha=0.001, epsilon=0.01, iters=20")
adversarial_training(model_func=get_cnn_model, attack=pgd_linf, ckpt_name="model_cnn_robust.pt",
                     alpha=0.001, epsilon=0.01)

PGD_linf: alpha=0.001, epsilon=0.01, iters=20


100%|██████████| 5/5 [07:35<00:00, 91.03s/it]

Misclassification Rate: Train = 0.011666666666666667, Test = 0.0122, Adversarial = 0.0165





In [None]:
# L-infinity PGD adversarial training with twice the default attack iterations
# (num_iter=40). Checkpoint path is shared and overwritten.
print("PGD_linf: alpha=0.01, epsilon=0.1, iters=40")
adversarial_training(model_func=get_cnn_model, attack=pgd_linf, ckpt_name="model_cnn_robust.pt",
                     alpha=0.01, epsilon=0.1, num_iter=40)

PGD_linf: alpha=0.01, epsilon=0.1, iters=40


100%|██████████| 5/5 [14:17<00:00, 171.51s/it]

Misclassification Rate: Train = 0.03498333333333333, Test = 0.0147, Adversarial = 0.0419





### PGD_l2 based Adversarial Generation & Training

In [None]:
# L2 PGD adversarial training with the attack's defaults
# (alpha=0.01, epsilon=0.1, num_iter=20). Checkpoint path is shared and overwritten.
print("PGD_l2: alpha=0.01, epsilon=0.1, iters=20")
adversarial_training(model_func=get_cnn_model, attack=pgd_l2, ckpt_name="model_cnn_robust.pt")

PGD_l2: alpha=0.01, epsilon=0.1, iters=20


100%|██████████| 5/5 [07:37<00:00, 91.52s/it]

Misclassification Rate: Train = 0.0117, Test = 0.0125, Adversarial = 0.0154





In [None]:
# L2 PGD adversarial training with step size equal to the radius
# (alpha=0.1, epsilon=0.1). Checkpoint path is shared and overwritten.
print("PGD_l2: alpha=0.1, epsilon=0.1, iters=20")
adversarial_training(model_func=get_cnn_model, attack=pgd_l2, ckpt_name="model_cnn_robust.pt",
                     alpha=0.1, epsilon=0.1)

PGD_l2: alpha=0.1, epsilon=0.1, iters=20


100%|██████████| 5/5 [07:37<00:00, 91.46s/it]

Misclassification Rate: Train = 0.01165, Test = 0.0131, Adversarial = 0.0161





In [None]:
# L2 PGD adversarial training at alpha=0.05, epsilon=0.05.
# Checkpoint path is shared and overwritten.
print("PGD_l2: alpha=0.05, epsilon=0.05, iters=20")
adversarial_training(model_func=get_cnn_model, attack=pgd_l2, ckpt_name="model_cnn_robust.pt",
                     alpha=0.05, epsilon=0.05)

PGD_l2: alpha=0.05, epsilon=0.05, iters=20


100%|██████████| 5/5 [07:37<00:00, 91.46s/it]

Misclassification Rate: Train = 0.010816666666666667, Test = 0.0129, Adversarial = 0.0142





In [None]:
# L2 PGD adversarial training at alpha=0.001, epsilon=0.01.
# Checkpoint path is shared and overwritten.
print("PGD_l2: alpha=0.001, epsilon=0.01, iters=20")
adversarial_training(model_func=get_cnn_model, attack=pgd_l2, ckpt_name="model_cnn_robust.pt",
                     alpha=0.001, epsilon=0.01)

PGD_l2: alpha=0.001, epsilon=0.01, iters=20


100%|██████████| 5/5 [07:37<00:00, 91.47s/it]

Misclassification Rate: Train = 0.0098, Test = 0.0154, Adversarial = 0.016





In [None]:
# L2 PGD adversarial training with twice the default attack iterations
# (num_iter=40). Checkpoint path is shared and overwritten.
print("PGD_l2: alpha=0.01, epsilon=0.1, iters=40")
adversarial_training(model_func=get_cnn_model, attack=pgd_l2, ckpt_name="model_cnn_robust.pt",
                     alpha=0.01, epsilon=0.1, num_iter=40)

PGD_l2: alpha=0.01, epsilon=0.1, iters=40


100%|██████████| 5/5 [14:26<00:00, 173.29s/it]

Misclassification Rate: Train = 0.01185, Test = 0.0139, Adversarial = 0.0169



