In [1]:
from __future__ import print_function
import argparse
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR
import matplotlib.pyplot as plt
from utils import pgd_, fgsm_, gradient_information, adversarial_accuracy
from Nets import MNIST_Net, Gradient_Masked_MNIST
%load_ext autoreload
%autoreload 2
%aimport utils, Nets

## Train a NeuralNet to run experiments on

In [2]:
device = torch.device("cuda")
batch_size = 64
test_batch_size = 1000
epochs = 14
log_interval = 500

model = MNIST_Net(device=device, log_interval=log_interval, batch_size=batch_size, test_batch_size=test_batch_size)
model.train_on_data(epochs)


Test set: Average loss: 0.0002, Accuracy: 56307/60000 (94%)


Test set: Average loss: 0.0001, Accuracy: 57352/60000 (96%)


Test set: Average loss: 0.0001, Accuracy: 57954/60000 (97%)


Test set: Average loss: 0.0001, Accuracy: 58291/60000 (97%)


Test set: Average loss: 0.0001, Accuracy: 58634/60000 (98%)


Test set: Average loss: 0.0001, Accuracy: 58883/60000 (98%)


Test set: Average loss: 0.0001, Accuracy: 58998/60000 (98%)


Test set: Average loss: 0.0000, Accuracy: 59154/60000 (99%)


Test set: Average loss: 0.0000, Accuracy: 59226/60000 (99%)


Test set: Average loss: 0.0000, Accuracy: 59268/60000 (99%)


Test set: Average loss: 0.0000, Accuracy: 59378/60000 (99%)


Test set: Average loss: 0.0000, Accuracy: 59436/60000 (99%)


Test set: Average loss: 0.0000, Accuracy: 59502/60000 (99%)


Test set: Average loss: 0.0000, Accuracy: 59512/60000 (99%)



## Attack

In [3]:
adversarial_dataset = torch.utils.data.Subset(model.test_dataset, [i for i in range(1000)])
adversarial_loader = torch.utils.data.DataLoader(adversarial_dataset, batch_size=2, num_workers=2, shuffle=False)
adversarial_accuracy(model, adversarial_loader)

0 / 1000
200 / 1000
400 / 1000
600 / 1000
800 / 1000
63.1


## Load Gradient Masked Network

In [4]:
masked_model = Gradient_Masked_MNIST(device=device, log_interval=log_interval, batch_size=batch_size, test_batch_size=test_batch_size)
masked_model.train_on_data(epochs)


Test set: Average loss: 0.0014, Accuracy: 48283/60000 (80%)


Test set: Average loss: 0.0008, Accuracy: 53500/60000 (89%)


Test set: Average loss: 0.0006, Accuracy: 55020/60000 (92%)


Test set: Average loss: 0.0005, Accuracy: 56766/60000 (95%)


Test set: Average loss: 0.0004, Accuracy: 57064/60000 (95%)


Test set: Average loss: 0.0003, Accuracy: 57077/60000 (95%)


Test set: Average loss: 0.0002, Accuracy: 57202/60000 (95%)


Test set: Average loss: 0.0002, Accuracy: 57422/60000 (96%)


Test set: Average loss: 0.0002, Accuracy: 57691/60000 (96%)


Test set: Average loss: 0.0002, Accuracy: 57776/60000 (96%)


Test set: Average loss: 0.0001, Accuracy: 57810/60000 (96%)


Test set: Average loss: 0.0001, Accuracy: 58277/60000 (97%)


Test set: Average loss: 0.0001, Accuracy: 58271/60000 (97%)


Test set: Average loss: 0.0001, Accuracy: 58131/60000 (97%)



## Attack

In [12]:
adversarial_dataset = torch.utils.data.Subset(masked_model.test_dataset, [i for i in range(1000)])
adversarial_loader = torch.utils.data.DataLoader(adversarial_dataset, batch_size=2, num_workers=2, shuffle=False)
adversarial_accuracy(masked_model, adversarial_loader)

0 / 1000
200 / 1000
400 / 1000
600 / 1000
800 / 1000
16.5


## Gradient masking metric
Check alignment of gradient at adv point with (adv point - original point)

### Normal network

In [6]:
n_examples = 1000
x = torch.cat([model.train_dataset[i][0].unsqueeze(0) for i in range(n_examples)]).to(device)
y = torch.LongTensor([model.train_dataset[i][1] for i in range(n_examples)]).to(device)

In [7]:
[(i, np.nanmean(gradient_information(model, x, y, iters=i*10, clip_min=model.normalized_min, clip_max=model.normalized_max, device=device).detach().cpu().numpy())) for i in range(1, 40, 5)]

[(1, 0.19403648),
 (6, 0.53083456),
 (11, 0.5480431),
 (16, 0.5501983),
 (21, 0.54799485),
 (26, 0.5483353),
 (31, 0.54809546),
 (36, 0.54717046)]

### FGSM trained network

In [8]:
n_examples = 1000
x = torch.cat([masked_model.train_dataset[i][0].unsqueeze(0) for i in range(n_examples)]).to(device)
y = torch.LongTensor([masked_model.train_dataset[i][1] for i in range(n_examples)]).to(device)

In [9]:
[(i, np.nanmean(gradient_information(masked_model, x, y, iters=i*10, clip_min=masked_model.normalized_min, clip_max=masked_model.normalized_max, device=device).detach().cpu().numpy())) for i in range(1, 40, 5)]

[(1, 0.19549523),
 (6, 0.43253633),
 (11, 0.44213217),
 (16, 0.4425391),
 (21, 0.44382584),
 (26, 0.44337183),
 (31, 0.44322023),
 (36, 0.44287926)]

## Adversarial Training

In [48]:
def adv_train(model, device, train_loader, optimizer, epochs):
    model.train()
    criterion = nn.CrossEntropyLoss()
    for epoch in range(epochs + 1):
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            adv_data = pgd_(model, data, target, 0.1, 0.5, iters=7, targeted=False, device=device, clip_min=normalized_min, clip_max=normalized_max)
            optimizer.zero_grad()
            output = model(adv_data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            if batch_idx % log_interval == 0:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, batch_idx * len(data), len(train_loader.dataset),
                    100. * batch_idx / len(train_loader), loss.item()))

In [55]:
undefended_model = type(model)().to(device)
undefended_model.load_state_dict(model.state_dict())
adv_train(model, device, train_loader, optimizer, 4)



In [56]:
adversarial_accuracy(model, adversarial_loader)

0 / 1000
200 / 1000
400 / 1000
600 / 1000
800 / 1000
94.19999999999999


## Black Box Attack

In [17]:
def black_box_adversarial_accuracy(model, undefended_model, dataset_loader):
    correct = 0
    for batch_idx, (data, target) in enumerate(dataset_loader):
        data, target = data.to(device), target.to(device)
        adv = pgd_(undefended_model, data, target, 0.1, 0.7, iters=20, targeted=False, device=device, clip_min=model.normalized_min, clip_max=model.normalized_max)
        output = model(adv)
        pred = output.argmax(dim=1, keepdim=True)
        correct += pred.eq(target.view_as(pred)).sum().item()
        if (batch_idx % 100 == 0):
            print('{} / {}'.format(batch_idx * dataset_loader.batch_size, len(dataset_loader.dataset)))
    print ((correct/len(dataset_loader.dataset) * 100))

In [18]:
adversarial_dataset = torch.utils.data.Subset(masked_model.test_dataset, [i for i in range(1000)])
adversarial_loader = torch.utils.data.DataLoader(adversarial_dataset, batch_size=2, num_workers=2, shuffle=False)
black_box_adversarial_accuracy(masked_model, model, adversarial_loader)

0 / 1000
200 / 1000
400 / 1000
600 / 1000
800 / 1000
80.80000000000001
