In [6]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib

import torch
import torch.nn as nn
import torch.optim as optim

torch.manual_seed(0)

%matplotlib inline
%config InlineBackend.figure_format = 'svg'
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [0]:
class Flatten(nn.Module):
    def forward(self, x):
        return x.view(x.shape[0], -1)    

model_dnn_2 = nn.Sequential(Flatten(), nn.Linear(784,200), nn.ReLU(), 
                            nn.Linear(200,10)).to(device)

model_dnn_4 = nn.Sequential(Flatten(), nn.Linear(784,200), nn.ReLU(), 
                            nn.Linear(200,100), nn.ReLU(),
                            nn.Linear(100,100), nn.ReLU(),
                            nn.Linear(100,10)).to(device)

model_cnn = nn.Sequential(nn.Conv2d(1, 32, 3, padding=1), nn.ReLU(),
                          nn.Conv2d(32, 32, 3, padding=1, stride=2), nn.ReLU(),
                          nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(),
                          nn.Conv2d(64, 64, 3, padding=1, stride=2), nn.ReLU(),
                          Flatten(),
                          nn.Linear(7*7*64, 100), nn.ReLU(),
                          nn.Linear(100, 10)).to(device)


train.. trains the models 

In [0]:
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

mnist_train = datasets.MNIST("../data", train=True, download=True, transform=transforms.ToTensor())
mnist_test = datasets.MNIST("../data", train=False, download=True, transform=transforms.ToTensor())
train_loader = DataLoader(mnist_train, batch_size = 100, shuffle=True)
test_loader = DataLoader(mnist_test, batch_size = 100, shuffle=False)

In [0]:
def epoch(loader, model, opt=None):
    total_loss, total_err = 0.,0.
    for X,y in loader:
        X,y = X.to(device), y.to(device)
        yp = model(X)
        loss = nn.CrossEntropyLoss()(yp,y)
        if opt:
            opt.zero_grad()
            loss.backward()
            opt.step()
        
        total_err += (yp.max(dim=1)[1] != y).sum().item()
        total_loss += loss.item() * X.shape[0]
    return total_err / len(loader.dataset), total_loss / len(loader.dataset)

In [10]:
opt = optim.SGD(model_dnn_2.parameters(), lr=1e-1)
for _ in range(10):
    train_err, train_loss = epoch(train_loader, model_dnn_2, opt)
    test_err, test_loss = epoch(test_loader, model_dnn_2)
    print(*("{:.6f}".format(i) for i in (train_err, train_loss, test_err, test_loss)), sep="\t")

0.131833	0.510995	0.081200	0.286766
0.075933	0.264755	0.062600	0.220393
0.059733	0.210222	0.053800	0.182406
0.049333	0.173824	0.047700	0.158933
0.042233	0.148129	0.038800	0.136713
0.036483	0.128240	0.036700	0.123244
0.032017	0.113127	0.033400	0.112570
0.028550	0.100832	0.030500	0.102933
0.025017	0.090675	0.029900	0.100128
0.022583	0.082377	0.029700	0.099003


In [11]:
opt = optim.SGD(model_dnn_4.parameters(), lr=1e-1)
for _ in range(10):
    train_err, train_loss = epoch(train_loader, model_dnn_4, opt)
    test_err, test_loss = epoch(test_loader, model_dnn_4)
    print(*("{:.6f}".format(i) for i in (train_err, train_loss, test_err, test_loss)), sep="\t")

0.230483	0.760702	0.085500	0.281864
0.065100	0.220869	0.045500	0.151801
0.042217	0.143030	0.035800	0.117922
0.031800	0.106133	0.032200	0.105105
0.025883	0.084971	0.027300	0.086052
0.020483	0.068996	0.028600	0.091057
0.017200	0.057379	0.024700	0.078578
0.014883	0.049171	0.024600	0.077526
0.012600	0.041440	0.026500	0.085066
0.010817	0.035355	0.022400	0.076369


In [12]:
opt = optim.SGD(model_cnn.parameters(), lr=1e-1)
for t in range(10):
    train_err, train_loss = epoch(train_loader, model_cnn, opt)
    test_err, test_loss = epoch(test_loader, model_cnn)
    if t == 4:
        for param_group in opt.param_groups:
            param_group["lr"] = 1e-2
    print(*("{:.6f}".format(i) for i in (train_err, train_loss, test_err, test_loss)), sep="\t")

0.235417	0.674366	0.035900	0.111383
0.027317	0.089329	0.021200	0.068788
0.017850	0.057688	0.016600	0.051576
0.013917	0.043483	0.014900	0.045710
0.010217	0.032973	0.015400	0.049015
0.004683	0.015756	0.012400	0.038307
0.003467	0.012394	0.011700	0.038140
0.003083	0.010924	0.012000	0.038105
0.002600	0.009754	0.012000	0.039070
0.002367	0.008930	0.012600	0.039601


In [0]:
torch.save(model_dnn_2.state_dict(), "model_dnn_2.pt")
torch.save(model_dnn_4.state_dict(), "model_dnn_4.pt")
torch.save(model_cnn.state_dict(), "model_cnn.pt")

### Targeted attacks

What we have considered so far are "untargeted" attacks, meaning they effectively try to change the label to _any_ alternative, rather than change it to a particular alternative.  As a different task, which we saw briefly in the introduction, we can change the attack to try to convert the prediction to a particular alernative.  This is a task known as a "targetted attack", and it can be achieved using the same strategy overall strategy as we did previously.  However, in this case the only difference is that instead of maximizing the loss of the true label, we maximize the loss of the loss of the true label and also minimize the loss for the alternative label.  This is equivalent to solving the inner optimization problem
\begin{equation}
\maximize_{\|\delta\| \leq \epsilon} \left ( \ell(h_\theta(x + \delta), y) - \ell(h_\theta(x + \delta), y_{\mathrm{targ}}) \right ) \equiv \maximize_{\|\delta\| \leq \epsilon} \left ( h_\theta(x + \delta)_{y_{\mathrm{targ}}} - h_\theta(x + \delta)_y \right )
\end{equation}
Let's see what this looks like, using a PGD attack (without randomized restarts).  Note that in order to achieve our targetted class in most of these cases on MNIST, we use a slightly larger perturbation region, $\epsilon=0.2$.

In [0]:
def pgd_linf_targ(model, X, y, epsilon, alpha, num_iter, y_targ):
    """ Construct targeted adversarial examples on the examples X"""
    delta = torch.zeros_like(X, requires_grad=True)
    for t in range(num_iter):
        yp = model(X + delta)
        loss = (yp[:,y_targ] - yp.gather(1,y[:,None])[:,0]).sum()
        loss.backward()
        delta.data = (delta + alpha*delta.grad.detach().sign()).clamp(-epsilon,epsilon)
        delta.grad.zero_()
    return delta.detach()

Let's look at trying to make the class label all equation to 2.

In [15]:
delta = pgd_linf_targ(model_cnn, X, y, epsilon=0.2, alpha=1e-2, num_iter=40, y_targ=2)
yp = model_cnn(X + delta)
plot_images(X+delta, y, yp, 3, 6)

NameError: ignored

This looks pretty good: albeit with a slightly larger $\epsilon$, we can fool the classifier into predicting that all the examples are class 2 (note that the actual 2 is unchanged, because the loss function in this case is always exactly zero).  Let's try using a target class of 0 instead.

In [0]:
delta = pgd_linf_targ(model_cnn, X, y, epsilon=0.2, alpha=1e-2, num_iter=40, y_targ=0)
yp = model_cnn(X + delta)
plot_images(X+delta, y, yp, 3, 6)

While we are able to fool the classifier for all the non-zero digits, it's worth pointing out that we don't actually achieve the target class here in all cases.  This is because the optimization objective we are maximizing is the class logit for the zero minus the class logit for the true class.  But we don't actually care what happens to the other classes, and in some cases, the best way to make the class 0 logit high is to make another class logit even higher.  We can get around this by modifying our objective to maximize the target class logit and minimize _all_ the other logits, i.e.,
\begin{equation}
\maximize_{\|\delta\| \leq \epsilon} \left ( h_\theta(x + \delta)_{y_{\mathrm{targ}}} - \sum_{y' \neq y_{\mathrm{targ}}} h_\theta(x + \delta)_{y'} \right )
\end{equation}

In [0]:
def pgd_linf_targ2(model, X, y, epsilon, alpha, num_iter, y_targ):
    """ Construct targeted adversarial examples on the examples X"""
    delta = torch.zeros_like(X, requires_grad=True)
    for t in range(num_iter):
        yp = model(X + delta)
        loss = 2*yp[:,y_targ].sum() - yp.sum()
        loss.backward()
        delta.data = (delta + alpha*delta.grad.detach().sign()).clamp(-epsilon,epsilon)
        delta.grad.zero_()
    return delta.detach()

In [0]:
delta = pgd_linf_targ(model_cnn, X, y, epsilon=0.2, alpha=1e-2, num_iter=40, y_targ=0)
yp = model_cnn(X + delta)
plot_images(X+delta, y, yp, 3, 6)

This is a more difficult objective than the previous one, so we aren't able to fool the classifier as much.  But when we _do_ fool the classifier, it more consistently (even if still not perfectly) able to predict the target class.