In [1]:
import gc
import random

import keras
import numpy as np
import seaborn as sns
import tensorflow as tf

tf.compat.v1.disable_eager_execution()

from art import config
from art.utils import load_dataset
from art.estimators.classification import KerasClassifier
from art.attacks.evasion import ProjectedGradientDescent
from art.data_generators import KerasDataGenerator
from art.defences.trainer import AdversarialTrainer

from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D
from keras.layers import BatchNormalization
from keras.preprocessing.image import ImageDataGenerator


from matplotlib import pyplot as plt

from tqdm.auto import tqdm

## Params

In [2]:
# Reproducibility: a single seed is pushed into every RNG this notebook relies on
# (Python's `random`, NumPy and TensorFlow).
seed = 42
for seed_rng in (random.seed, np.random.seed, tf.random.set_seed):
    seed_rng(seed)

# Training configuration.
n_epochs, batch_size = 20, 128  # epochs and mini-batch size handed to the adversarial trainer
dataset_subsample = 0.5         # used as `test_size` when subsampling the train and test splits

## Data

In [3]:
# Load MNIST through ART. `min_` / `max_` are returned alongside the data and are
# later passed to the classifier wrapper as its clip values.
(x_train, y_train), (x_test, y_test), min_, max_ = load_dataset('mnist')

# Subsample the data set for speed.
# Images and labels are split in ONE call per split, so they are row-aligned by
# construction instead of relying on four separate calls happening to share the
# same random_state / stratify arguments. The selected rows are unchanged.
_, x_train_s, _, y_train_s = train_test_split(
    x_train, y_train, test_size=dataset_subsample, random_state=seed, stratify=y_train
)
_, x_test_s, _, y_test_s = train_test_split(
    x_test, y_test, test_size=dataset_subsample, random_state=seed, stratify=y_test
)
assert len(x_train_s) == len(y_train_s), "train images/labels are misaligned"
assert len(x_test_s) == len(y_test_s), "test images/labels are misaligned"

# labels as integers (argmax over the last axis of the label arrays)
y_test_s_labels = np.argmax(y_test_s, axis=-1)

print(x_train_s.shape, x_test_s.shape, y_train_s.shape, y_test_s.shape)

(30000, 28, 28, 1) (5000, 28, 28, 1) (30000, 10) (5000, 10)


## Model

In [4]:
def build_model(input_shape, num_classes):
    """Build and compile the small CNN used throughout this notebook.

    Architecture: Conv(32, 3x3) -> MaxPool(2x2) -> Conv(64, 3x3, 'same' padding)
    -> MaxPool(2x2) -> Flatten -> Dense(1024, relu) -> Dense(num_classes, softmax).

    Args:
        input_shape: Shape of a single input sample; forwarded to the first
            convolution layer.
        num_classes: Number of units in the final softmax layer.

    Returns:
        A compiled ``Sequential`` model (categorical cross-entropy loss, Adam
        optimizer with learning rate 0.001, accuracy metric).
    """
    network = Sequential()

    # Layers are listed in forward order and appended one by one below.
    stack = [
        Conv2D(
            32,
            kernel_size=(3, 3),
            activation='relu',
            kernel_initializer='he_normal',
            input_shape=input_shape,
        ),
        MaxPool2D((2, 2)),
        Conv2D(64, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal'),
        MaxPool2D((2, 2)),
        Flatten(),
        Dense(1024, activation='relu'),
        Dense(num_classes, activation='softmax'),
    ]
    for layer in stack:
        network.add(layer)

    network.compile(
        loss=keras.losses.categorical_crossentropy,
        optimizer=keras.optimizers.Adam(learning_rate=0.001),
        metrics=['accuracy'],
    )
    return network

In [12]:
# Sanity check: confirm the architecture learns the clean task with plain
# (non-adversarial) training before using it for adversarial training.
model = build_model(input_shape=(28, 28, 1), num_classes=10)
# summary() prints the table itself and returns None, so wrapping it in print()
# would emit a stray "None" line after the table.
model.summary()
model.fit(x_train_s, y_train_s, epochs=10)
safety_pred = np.argmax(model.predict(x_test_s), axis=-1)
print('Accuracy on clean testing data')
print(classification_report(y_test_s_labels, safety_pred, digits=5))

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_2 (Conv2D)            (None, 26, 26, 32)        320       
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 13, 13, 32)        0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 13, 13, 64)        18496     
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 6, 6, 64)          0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 2304)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 1024)              2360320   
_________________________________________________________________
dense_3 (Dense)              (None, 10)               

In [13]:
# Drop the sanity-check model so the adversarial experiment starts from a fresh network.
# `del` only removes the name binding; run a collection pass so unreachable
# objects are reclaimed before the long adversarial-training run.
del model
gc.collect();

## Adversarial training

In [6]:
# Fresh, untrained network for the adversarial-training experiment.
model = build_model(num_classes=10, input_shape=(28, 28, 1))

# Expose the Keras model to ART. The network ends in a softmax layer, hence
# use_logits=False; clip_values is the (min_, max_) pair returned by load_dataset.
classifier = KerasClassifier(model=model, use_logits=False, clip_values=(min_, max_))

In [7]:
# PGD attack bound to the wrapped classifier; the same object is reused for
# adversarial training and for the final robustness evaluation.
#   eps      = 0.3   (perturbation budget)
#   eps_step = 0.01  (step size per iteration)
#   max_iter = 40    (number of iterations)
pgd = ProjectedGradientDescent(classifier, max_iter=40, eps_step=0.01, eps=0.3)

In [8]:
%%time

# Create adversarial trainer and perform adversarial training
trainer = AdversarialTrainer(
    classifier,
    pgd,
    ratio=1.0
)

trainer.fit(x_train_s, y_train_s, nb_epochs=n_epochs, batch_size=batch_size)

HBox(children=(FloatProgress(value=0.0, description='Precompute adv samples', max=1.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Adversarial training epochs', max=20.0, style=ProgressSty…

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
Please use Model.fit, which supports generators.

CPU times: user 1d 37min 39s, sys: 1h 1min 35s, total: 1d 1h 39min 15s
Wall time: 1h 45min 36s


In [9]:
# Evaluate the adversarially trained model on the clean test set
test_pred = np.argmax(classifier.predict(x_test_s), axis=-1)
print('Accuracy on clean testing data')
# zero_division=0 makes explicit what the default already reports (0.0) for
# classes that are never predicted, without raising UndefinedMetricWarning.
print(classification_report(y_test_s_labels, test_pred, digits=5, zero_division=0))

Accuracy on clean testing data
              precision    recall  f1-score   support

           0    0.00000   0.00000   0.00000       490
           1    0.11340   1.00000   0.20370       567
           2    0.00000   0.00000   0.00000       516
           3    0.00000   0.00000   0.00000       505
           4    0.00000   0.00000   0.00000       491
           5    0.00000   0.00000   0.00000       446
           6    0.00000   0.00000   0.00000       479
           7    0.00000   0.00000   0.00000       514
           8    0.00000   0.00000   0.00000       487
           9    0.00000   0.00000   0.00000       505

    accuracy                        0.11340      5000
   macro avg    0.01134   0.10000   0.02037      5000
weighted avg    0.01286   0.11340   0.02310      5000



  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
# NOTE(review): this cell is a verbatim repeat of the previous evaluation cell
# (same execution count) and is a candidate for removal. Until then it reuses the
# `test_pred` computed by that cell instead of running inference a second time.
print('Accuracy on clean testing data')
# zero_division=0 makes explicit what the default already reports (0.0) for
# classes that are never predicted, without raising UndefinedMetricWarning.
print(classification_report(y_test_s_labels, test_pred, digits=5, zero_division=0))

Accuracy on clean testing data
              precision    recall  f1-score   support

           0    0.00000   0.00000   0.00000       490
           1    0.11340   1.00000   0.20370       567
           2    0.00000   0.00000   0.00000       516
           3    0.00000   0.00000   0.00000       505
           4    0.00000   0.00000   0.00000       491
           5    0.00000   0.00000   0.00000       446
           6    0.00000   0.00000   0.00000       479
           7    0.00000   0.00000   0.00000       514
           8    0.00000   0.00000   0.00000       487
           9    0.00000   0.00000   0.00000       505

    accuracy                        0.11340      5000
   macro avg    0.01134   0.10000   0.02037      5000
weighted avg    0.01286   0.11340   0.02310      5000



  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
%%time

# Evaluate the adversarially trained model on fresh adversarial samples produced on the adversarially trained model
x_test_pgd_new = pgd.generate(x_test_s[:100])

CPU times: user 15.6 s, sys: 490 ms, total: 16.1 s
Wall time: 1.32 s


In [11]:
# Score the classifier on the freshly generated adversarial examples.
labels_pgd_new = np.argmax(classifier.predict(x_test_pgd_new), axis=-1)
# Take as many ground-truth labels as adversarial samples were generated, rather
# than repeating the literal slice size used when crafting them.
y_true_pgd = y_test_s_labels[:len(x_test_pgd_new)]
# zero_division=0 reports 0.0 for never-predicted classes without raising a warning.
print(classification_report(y_true_pgd, labels_pgd_new, digits=5, zero_division=0))

              precision    recall  f1-score   support

           0    0.00000   0.00000   0.00000        14
           1    0.13000   1.00000   0.23009        13
           2    0.00000   0.00000   0.00000        10
           3    0.00000   0.00000   0.00000         8
           4    0.00000   0.00000   0.00000        10
           5    0.00000   0.00000   0.00000         7
           6    0.00000   0.00000   0.00000        12
           7    0.00000   0.00000   0.00000        10
           8    0.00000   0.00000   0.00000         9
           9    0.00000   0.00000   0.00000         7

    accuracy                        0.13000       100
   macro avg    0.01300   0.10000   0.02301       100
weighted avg    0.01690   0.13000   0.02991       100

