In [1]:
import tensorflow as tf
from keras.callbacks import ModelCheckpoint
import numpy as np
import matplotlib.pyplot as plt
import foolbox

Using TensorFlow backend.


In [2]:
# Fetch Fashion-MNIST through the Keras dataset helper
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()

# Scale the 8-bit pixel intensities into [0, 1] as float32
x_train, x_test = (pixels.astype('float32') / 255 for pixels in (x_train, x_test))

# Hold out the first 5000 training samples for validation; the remaining 55,000 stay in train
x_valid, x_train = x_train[:5000], x_train[5000:]
y_valid, y_train = y_train[:5000], y_train[5000:]

# Keras expects an explicit channel axis: (28, 28) -> (28, 28, 1)
w, h = 28, 28
x_train, x_valid, x_test = (
    images.reshape(len(images), w, h, 1)
    for images in (x_train, x_valid, x_test)
)

# Keep the integer class labels around; they are needed later when
# adversarial images are added to the training set
y_train_class, y_valid_class, y_test_class = y_train, y_valid, y_test

# Keras expects one-hot targets (10 classes)
y_train, y_valid, y_test = (
    tf.keras.utils.to_categorical(labels, 10)
    for labels in (y_train, y_valid, y_test)
)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz


In [3]:
# Upper bound on how many adversarial examples get appended to the training set;
# the generation loop stops as soon as this many have been collected.
MAX_ADVERSARIES = 100
# Integer class label to attack: only training images carrying this label are perturbed.
# NOTE(review): index 2 is presumably "Pullover" in the Fashion-MNIST label table — confirm.
ATTACK_CLASS = 2

In [5]:
# Restore the classifier produced by running
# https://github.com/margaretmz/deep-learning/blob/master/fashion_mnist_keras.ipynb
kmodel = tf.keras.models.load_model('model.weights.best.hdf5')

# Wrap the Keras model so Foolbox can attack it; inputs live in the [0, 1] range.
# (Alternative wrapper, left disabled: foolbox.models.TensorFlowModel.from_keras)
fmodel = foolbox.models.KerasModel(
    model=kmodel,
    bounds=(0.0, 1.0),
)

In [7]:
# Build an adversarial training set on top of the original one.
# The *_adv names start out as the clean data; x_train_adv / y_train_adv are
# rebound below to "clean samples followed by adversarial samples".
x_train_adv = x_train
y_train_adv = y_train
x_test_adv = x_test
y_test_adv = y_test

# Create a Foolbox L-BFGS attack against the wrapped model
attack_lbfgs = foolbox.v1.attacks.LBFGSAttack(fmodel)
adversarial_count = 0
adversarial_indices = []   # indices into x_train of the images that were successfully attacked

# New samples are gathered in lists and concatenated once after the loop;
# calling np.append inside the loop would copy the whole training array on every hit.
adversarial_images = []
adversarial_labels = []

for i in range(len(x_train)):
    # Only attack images of the configured class
    if y_train_class[i] != ATTACK_CLASS:
        continue

    image = x_train[i]
    adversarial = attack_lbfgs(image, label=y_train_class[i])
    if adversarial is None:
        # Foolbox's v1 attacks hand back None when no adversarial was found;
        # skip the image instead of crashing on .reshape below.
        continue

    model_pred = np.argmax(kmodel.predict(image.reshape(1, w, h, 1)))
    adver_pred = np.argmax(kmodel.predict(adversarial.reshape(1, w, h, 1)))
    if model_pred == adver_pred:
        # The perturbation does not change the model's decision: not a usable adversarial
        continue

    adversarial_indices.append(i)
    adversarial_images.append(adversarial)
    # Train against the ground-truth one-hot label of the source image rather
    # than the model's own prediction on it, so a wrong clean prediction can
    # never end up as a training target.
    adversarial_labels.append(y_train[i])
    adversarial_count += 1
    if adversarial_count == MAX_ADVERSARIES:
        break

if adversarial_images:
    # Single concatenation; dtypes are pinned to the clean arrays so the
    # augmented set does not get silently upcast.
    x_train_adv = np.concatenate(
        [x_train, np.asarray(adversarial_images, dtype=x_train.dtype)], axis=0)
    y_train_adv = np.concatenate(
        [y_train, np.asarray(adversarial_labels, dtype=y_train.dtype)], axis=0)

  "Not running the attack because the original input"


In [8]:
# Percentage of the appended adversarial images that the current model gets
# wrong, judged against the labels stored next to them in y_train_adv.
first_adv = len(x_train)                  # adversarial samples sit after the clean ones
num_adv = len(x_train_adv) - first_adv    # number of images actually evaluated below
misclassified_count = 0

# Predict one image at a time, the same way each example was checked when it was generated
for i in range(first_adv, len(x_train_adv)):
    image = x_train_adv[i]
    model_one_hot_pred = kmodel.predict(image.reshape(1, w, h, 1))
    model_pred = np.argmax(model_one_hot_pred)
    actual = np.argmax(y_train_adv[i])
    if model_pred != actual:
        misclassified_count += 1

if num_adv:
    print(100*(misclassified_count/num_adv))
else:
    # Nothing was appended, so a rate would be a division by zero
    print("No adversarial examples available; misclassification rate is undefined.")



100.0


In [9]:
# Continue training the loaded model on the clean + adversarial training set.
# The checkpoint callback comes from tf.keras, the same package that built
# kmodel: callbacks from the standalone `keras` package are not guaranteed to
# work with a tf.keras model. With save_best_only=True a checkpoint is written
# only when the monitored validation loss improves.
checkpointer = tf.keras.callbacks.ModelCheckpoint(
    filepath='adv.model.weights.best.hdf5',
    verbose=1,
    save_best_only=True,
)
kmodel.fit(x_train_adv,
           y_train_adv,
           batch_size=64,
           epochs=10,
           validation_data=(x_valid, y_valid),
           callbacks=[checkpointer])

Train on 55100 samples, validate on 5000 samples
Epoch 1/10
Epoch 00001: val_loss improved from inf to 0.22619, saving model to adv.model.weights.best.hdf5
Epoch 2/10
Epoch 00002: val_loss improved from 0.22619 to 0.22164, saving model to adv.model.weights.best.hdf5
Epoch 3/10
Epoch 00003: val_loss improved from 0.22164 to 0.21721, saving model to adv.model.weights.best.hdf5
Epoch 4/10
Epoch 00004: val_loss improved from 0.21721 to 0.21721, saving model to adv.model.weights.best.hdf5
Epoch 5/10
Epoch 00005: val_loss did not improve from 0.21721
Epoch 6/10
Epoch 00006: val_loss improved from 0.21721 to 0.21236, saving model to adv.model.weights.best.hdf5
Epoch 7/10
Epoch 00007: val_loss did not improve from 0.21236
Epoch 8/10
Epoch 00008: val_loss did not improve from 0.21236
Epoch 9/10
Epoch 00009: val_loss did not improve from 0.21236
Epoch 10/10
Epoch 00010: val_loss improved from 0.21236 to 0.21009, saving model to adv.model.weights.best.hdf5


<tensorflow.python.keras.callbacks.History at 0x7facf4685860>

In [10]:
# Re-measure, after the additional training, what percentage of the appended
# adversarial images the model still gets wrong (labels come from y_train_adv).
first_adv = len(x_train)                  # adversarial samples sit after the clean ones
num_adv = len(x_train_adv) - first_adv    # number of images actually evaluated below
misclassified_count = 0

# Predict one image at a time, the same way each example was checked when it was generated
for i in range(first_adv, len(x_train_adv)):
    image = x_train_adv[i]
    model_one_hot_pred = kmodel.predict(image.reshape(1, w, h, 1))
    model_pred = np.argmax(model_one_hot_pred)
    actual = np.argmax(y_train_adv[i])
    if model_pred != actual:
        misclassified_count += 1

if num_adv:
    print(100*(misclassified_count/num_adv))
else:
    # Nothing was appended, so a rate would be a division by zero
    print("No adversarial examples available; misclassification rate is undefined.")

55.00000000000001
