In [1]:
from __future__ import print_function
import argparse
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR
import matplotlib.pyplot as plt
from utils import pgd_, fgsm_, gradient_information, adversarial_accuracy, gradient_norm
from Nets import MNIST_Net, Gradient_Masked_MNIST, PGD_MNIST
%load_ext autoreload
%autoreload 2
%aimport utils, Nets

## Train a NeuralNet to run experiments on

In [2]:
device = torch.device("cuda")
batch_size = 64
test_batch_size = 1000
epochs = 14
log_interval = 500

model = MNIST_Net(device=device, log_interval=log_interval, batch_size=batch_size, test_batch_size=test_batch_size)
model.train_on_data(epochs)


Test set: Average loss: 0.0002, Accuracy: 56098/60000 (93%)


Test set: Average loss: 0.0002, Accuracy: 57132/60000 (95%)


Test set: Average loss: 0.0001, Accuracy: 57703/60000 (96%)


Test set: Average loss: 0.0001, Accuracy: 58154/60000 (97%)


Test set: Average loss: 0.0001, Accuracy: 58495/60000 (97%)


Test set: Average loss: 0.0001, Accuracy: 58714/60000 (98%)


Test set: Average loss: 0.0001, Accuracy: 58876/60000 (98%)


Test set: Average loss: 0.0001, Accuracy: 59015/60000 (98%)


Test set: Average loss: 0.0000, Accuracy: 59120/60000 (99%)


Test set: Average loss: 0.0000, Accuracy: 59218/60000 (99%)


Test set: Average loss: 0.0000, Accuracy: 59305/60000 (99%)


Test set: Average loss: 0.0000, Accuracy: 59371/60000 (99%)


Test set: Average loss: 0.0000, Accuracy: 59429/60000 (99%)


Test set: Average loss: 0.0000, Accuracy: 59453/60000 (99%)



## Attack

In [3]:
adversarial_dataset = torch.utils.data.Subset(model.test_dataset, [i for i in range(1000)])
adversarial_loader = torch.utils.data.DataLoader(adversarial_dataset, batch_size=2, num_workers=2, shuffle=False)
adversarial_accuracy(model, adversarial_loader, attack=fgsm_, eps=1)

0 / 1000
200 / 1000
400 / 1000
600 / 1000
800 / 1000
16.400000000000002


## Load Gradient Masked Network

In [4]:
masked_model = Gradient_Masked_MNIST(device=device, log_interval=log_interval, batch_size=batch_size, test_batch_size=test_batch_size)
masked_model.train_on_data(epochs)


Test set: Average loss: 0.0005, Accuracy: 55055/60000 (92%)


Test set: Average loss: 0.0003, Accuracy: 57152/60000 (95%)


Test set: Average loss: 0.0002, Accuracy: 57769/60000 (96%)


Test set: Average loss: 0.0001, Accuracy: 58161/60000 (97%)


Test set: Average loss: 0.0001, Accuracy: 58355/60000 (97%)


Test set: Average loss: 0.0001, Accuracy: 58488/60000 (97%)


Test set: Average loss: 0.0001, Accuracy: 58751/60000 (98%)


Test set: Average loss: 0.0001, Accuracy: 58749/60000 (98%)


Test set: Average loss: 0.0001, Accuracy: 58973/60000 (98%)


Test set: Average loss: 0.0001, Accuracy: 59034/60000 (98%)


Test set: Average loss: 0.0001, Accuracy: 59107/60000 (99%)


Test set: Average loss: 0.0001, Accuracy: 59176/60000 (99%)


Test set: Average loss: 0.0001, Accuracy: 59172/60000 (99%)


Test set: Average loss: 0.0001, Accuracy: 59221/60000 (99%)



## Attack

In [5]:
adversarial_dataset = torch.utils.data.Subset(masked_model.test_dataset, [i for i in range(1000)])
adversarial_loader = torch.utils.data.DataLoader(adversarial_dataset, batch_size=2, num_workers=2, shuffle=False)
adversarial_accuracy(masked_model, adversarial_loader, attack=fgsm_, eps=1)

0 / 1000
200 / 1000
400 / 1000
600 / 1000
800 / 1000
88.9


## Load PGD trained network

In [23]:
pgd_model = PGD_MNIST(device=device, log_interval=log_interval, batch_size=batch_size, test_batch_size=test_batch_size, step=0.05, eps=1, iters=20)
pgd_model.train_on_data(epochs)


Test set: Average loss: 0.0009, Accuracy: 50384/60000 (84%)


Test set: Average loss: 0.0004, Accuracy: 55457/60000 (92%)


Test set: Average loss: 0.0003, Accuracy: 57132/60000 (95%)


Test set: Average loss: 0.0002, Accuracy: 57800/60000 (96%)


Test set: Average loss: 0.0001, Accuracy: 58186/60000 (97%)


Test set: Average loss: 0.0001, Accuracy: 58364/60000 (97%)


Test set: Average loss: 0.0001, Accuracy: 58545/60000 (98%)


Test set: Average loss: 0.0001, Accuracy: 58669/60000 (98%)


Test set: Average loss: 0.0001, Accuracy: 58762/60000 (98%)


Test set: Average loss: 0.0001, Accuracy: 58821/60000 (98%)


Test set: Average loss: 0.0001, Accuracy: 58929/60000 (98%)


Test set: Average loss: 0.0001, Accuracy: 58995/60000 (98%)


Test set: Average loss: 0.0001, Accuracy: 59029/60000 (98%)


Test set: Average loss: 0.0001, Accuracy: 59086/60000 (98%)



## Attack

In [24]:
adversarial_dataset = torch.utils.data.Subset(pgd_model.test_dataset, [i for i in range(1000)])
adversarial_loader = torch.utils.data.DataLoader(adversarial_dataset, batch_size=2, num_workers=2, shuffle=False)
adversarial_accuracy(pgd_model, adversarial_loader, attack=fgsm_, eps=1)

0 / 1000
200 / 1000
400 / 1000
600 / 1000
800 / 1000
90.9


## Gradient masking metric
Check alignment of gradient at adv point with (adv point - original point)

### Normal network

In [8]:
n_examples = 1000
x = torch.cat([model.train_dataset[i][0].unsqueeze(0) for i in range(n_examples)]).to(device)
y = torch.LongTensor([model.train_dataset[i][1] for i in range(n_examples)]).to(device)

In [9]:
[(i, np.nanmean(gradient_information(model, x, y, iters=i*10, clip_min=model.normalized_min, clip_max=model.normalized_max, device=device).detach().cpu().numpy())) for i in range(1, 40, 5)]

[(1, 0.20531067),
 (6, 0.5354743),
 (11, 0.550822),
 (16, 0.55232334),
 (21, 0.55256784),
 (26, 0.5516728),
 (31, 0.5516398),
 (36, 0.5519692)]

### FGSM trained network

In [58]:
n_examples = 1000
x = torch.cat([masked_model.train_dataset[i][0].unsqueeze(0) for i in range(n_examples)]).to(device)
y = torch.LongTensor([masked_model.train_dataset[i][1] for i in range(n_examples)]).to(device)

In [11]:
[(i, np.nanmean(gradient_information(masked_model, x, y, iters=i*10, clip_min=masked_model.normalized_min, clip_max=masked_model.normalized_max, device=device).detach().cpu().numpy())) for i in range(1, 40, 5)]

[(1, 0.18873714),
 (6, 0.48728827),
 (11, 0.5040706),
 (16, 0.50230885),
 (21, 0.5038546),
 (26, 0.5071839),
 (31, 0.5036746),
 (36, 0.49869025)]

### PGD trained network

In [25]:
n_examples = 1000
x = torch.cat([pgd_model.train_dataset[i][0].unsqueeze(0) for i in range(n_examples)]).to(device)
y = torch.LongTensor([pgd_model.train_dataset[i][1] for i in range(n_examples)]).to(device)

In [26]:
[(i, np.nanmean(gradient_information(pgd_model, x, y, iters=i*10, clip_min=pgd_model.normalized_min, clip_max=pgd_model.normalized_max, device=device).detach().cpu().numpy())) for i in range(1, 40, 5)]

[(1, 0.10904554),
 (6, 0.3140713),
 (11, 0.33384696),
 (16, 0.32671914),
 (21, 0.32991028),
 (26, 0.32651478),
 (31, 0.32870236),
 (36, 0.32701364)]

### Check gradient norms

In [27]:
n_examples = 1000
x = torch.cat([masked_model.train_dataset[i][0].unsqueeze(0) for i in range(n_examples)]).to(device)
y = torch.LongTensor([masked_model.train_dataset[i][1] for i in range(n_examples)]).to(device)

In [28]:
gradient_norm(model, x, y, device=device), gradient_norm(masked_model, x, y, device=device), gradient_norm(pgd_model, x, y, device=device)

(tensor(2.4784e-05, device='cuda:0'),
 tensor(2.4713e-05, device='cuda:0'),
 tensor(2.0038e-05, device='cuda:0'))

## Adversarial Training

In [48]:
def adv_train(model, device, train_loader, optimizer, epochs):
    model.train()
    criterion = nn.CrossEntropyLoss()
    for epoch in range(epochs + 1):
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            adv_data = pgd_(model, data, target, 0.1, 0.5, iters=7, targeted=False, device=device, clip_min=normalized_min, clip_max=normalized_max)
            optimizer.zero_grad()
            output = model(adv_data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            if batch_idx % log_interval == 0:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, batch_idx * len(data), len(train_loader.dataset),
                    100. * batch_idx / len(train_loader), loss.item()))

In [55]:
undefended_model = type(model)().to(device)
undefended_model.load_state_dict(model.state_dict())
adv_train(model, device, train_loader, optimizer, 4)



In [56]:
adversarial_accuracy(model, adversarial_loader)

0 / 1000
200 / 1000
400 / 1000
600 / 1000
800 / 1000
94.19999999999999


## Black Box Attack

In [29]:
def black_box_adversarial_accuracy(model, surrogate_model, dataset_loader):
    correct = 0
    for batch_idx, (data, target) in enumerate(dataset_loader):
        data, target = data.to(device), target.to(device)
        adv = fgsm_(surrogate_model, data, target, 1, targeted=False, device=device, clip_min=model.normalized_min, clip_max=model.normalized_max)
        output = model(adv)
        pred = output.argmax(dim=1, keepdim=True)
        correct += pred.eq(target.view_as(pred)).sum().item()
        if (batch_idx % 100 == 0):
            print('{} / {}'.format(batch_idx * dataset_loader.batch_size, len(dataset_loader.dataset)))
    print ((correct/len(dataset_loader.dataset) * 100))

In [31]:
surrogate_model = MNIST_Net(device=device, log_interval=log_interval, batch_size=batch_size, test_batch_size=test_batch_size, oracle=masked_model)
surrogate_model.train_on_data(epochs)
adversarial_dataset = torch.utils.data.Subset(masked_model.test_dataset, [i for i in range(1000)])
adversarial_loader = torch.utils.data.DataLoader(adversarial_dataset, batch_size=2, num_workers=2, shuffle=False)
black_box_adversarial_accuracy(masked_model, surrogate_model, adversarial_loader)


Test set: Average loss: 0.0002, Accuracy: 55863/60000 (93%)


Test set: Average loss: 0.0002, Accuracy: 57001/60000 (95%)


Test set: Average loss: 0.0001, Accuracy: 57487/60000 (96%)


Test set: Average loss: 0.0001, Accuracy: 57873/60000 (96%)


Test set: Average loss: 0.0001, Accuracy: 58089/60000 (97%)


Test set: Average loss: 0.0001, Accuracy: 58334/60000 (97%)


Test set: Average loss: 0.0001, Accuracy: 58462/60000 (97%)


Test set: Average loss: 0.0001, Accuracy: 58558/60000 (98%)


Test set: Average loss: 0.0001, Accuracy: 58609/60000 (98%)


Test set: Average loss: 0.0001, Accuracy: 58717/60000 (98%)


Test set: Average loss: 0.0001, Accuracy: 58753/60000 (98%)


Test set: Average loss: 0.0001, Accuracy: 58726/60000 (98%)


Test set: Average loss: 0.0001, Accuracy: 58797/60000 (98%)


Test set: Average loss: 0.0001, Accuracy: 58847/60000 (98%)

0 / 1000
200 / 1000
400 / 1000
600 / 1000
800 / 1000
66.10000000000001
