## Frontier Stitching Attack

Paper: https://arxiv.org/pdf/1711.01894.pdf

In this attack we demonstrate that frontier stitching does NOT survive the black-box attack. We train an original model with the embedded watermark and a reference model without a watermark on non-overlapping datasets. Then we steal the original model and show that watermark extraction is **non-unique**, i.e. the same watermark can be extracted from the reference model.

In [1]:
import sys, os
# Make the project's `src` package importable by putting the repository root
# (the directory named 'Watermark') on sys.path.
_cwd = os.getcwd()
_root_end = _cwd.rfind('Watermark')
if _root_end != -1:
    base_path = _cwd[:_root_end] + "Watermark/"
else:
    # rfind() returned -1: slicing with it would silently chop the last character
    # off the path and produce a bogus directory. Fall back to the working
    # directory so the `src` imports are resolved relative to where we run.
    base_path = _cwd + "/"
# Re-running this cell must not pile up duplicate sys.path entries.
if base_path not in sys.path:
    sys.path.append(base_path)

%matplotlib inline 
import matplotlib.pyplot as plt
import numpy as np

import tensorflow as tf
from keras import backend as K

from cleverhans.attacks import FastGradientMethod
from cleverhans.utils_keras import KerasModelWrapper

from src.adversarial_main import adversarial_blackbox, zerobit_embed, blackmarks_embed
from src.models import get_deep_cnn_for_cifar, get_lenet_model_for_mnist
from src.preprocess_data import load_cifar_images, load_mnist_images
from src.util import plot_blackbox, merge_histories
from src.callbacks import AdditionalValidationSets

Using TensorFlow backend.


Instructions for updating:
Colocations handled automatically by placer.


In [2]:
# Seed the random number generators before any model or session is created so that
# a Restart & Run All gives comparable numbers from run to run.
# NOTE(review): the helpers imported from `src` may draw from other RNG sources
# (e.g. Python's `random`); confirm if exact reproducibility is required.
SEED = 42
np.random.seed(SEED)
tf.set_random_seed(SEED)  # TF1 graph-level seed; must precede op creation

# One explicit TF session, registered with Keras and later handed to zerobit_embed.
sess = tf.Session()
K.set_session(sess)

In [3]:
# Slice boundaries for the three non-overlapping training partitions:
#   x_train[:split1]        -> owner (watermarked) model
#   x_train[split1:split2]  -> data queried by the model-stealing attack
#   x_train[split2:split3]  -> independently trained reference model
split1, split2, split3 = 20000, 40000, 60000

In [4]:
# Load MNIST via the project helper, which returns a (train, test) pair of
# (images, labels) tuples.
train_set, test_set = load_mnist_images()
x_train, y_train = train_set
x_test, y_test = test_set

In [5]:
# Owner model: a fresh LeNet trained on the first data partition with the
# watermark embedded by zerobit_embed.
# The call returns the model, its training history and the trigger set;
# trigger['keys'] is used further down as the (inputs, labels) watermark data.
owner_x = x_train[:split1]
owner_y = y_train[:split1]

original_model, history, trigger = zerobit_embed(
    model=get_lenet_model_for_mnist(),
    x_train=owner_x,
    y_train=owner_y,
    x_test=x_test,
    y_test=y_test,
    sess=sess,
    epochs=5,      # epochs passed for the regular training phase
    wm_epochs=5,   # epochs passed for the watermark embedding phase
)

Instructions for updating:
Use tf.cast instead.
Train on 20000 samples, validate on 10000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
dim is deprecated, use axis instead
#######These are false adv########
[ 0  1  2  4  5  6  7  8  9 10 11 13 14 15 16 17 18 19 20 21 22 24 25 26
 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
 51 53]
[  3  12  23  52  59  72  77  82  84  99 111 128 136 149 151 155 179 183
 184 185 201 205 208 211 214 220 223 248 250 255 263 279 281 288 298 310
 315 325 337 341 347 349 361 377 387 394 405 406 408 432]
##################################
Train on 100 samples, validate on 10000 samples
Epoch 1/5
=> Time: : 3.968735933303833
=> watermark_val: 0.56
Epoch 2/5
=> Time: : 3.9532856941223145
=> watermark_val: 0.66
Epoch 3/5
=> Time: : 4.011355876922607
=> watermark_val: 0.8
Epoch 4/5
=> Time: : 4.079674959182739
=> watermark_val: 0.82
Epoch 5/5
=> Time

In [6]:
# Model stealing: a fresh LeNet is fitted to the original model's predictions on
# the second data partition, i.e. the attacker never uses the ground-truth labels.
stolen_model = get_lenet_model_for_mnist()

attacker_x = x_train[split1:split2]
y_pred = original_model.predict(attacker_x)

# The callback receives the trigger keys tagged 'watermark'; its `.history` is
# read by the plotting cell.
watermark_set = (trigger['keys'][0], trigger['keys'][1], 'watermark')
stolen_history = AdditionalValidationSets([watermark_set])

stolen_model.fit(attacker_x,
                 y_pred,
                 batch_size=64,
                 epochs=5,
                 validation_data=(x_test, y_test),
                 callbacks=[stolen_history],
                 verbose=1)

Train on 20000 samples, validate on 10000 samples
Epoch 1/5
=> watermark_val: 0.55
Epoch 2/5
=> watermark_val: 0.51
Epoch 3/5
=> watermark_val: 0.55
Epoch 4/5
=> watermark_val: 0.51
Epoch 5/5
=> watermark_val: 0.52


<keras.callbacks.History at 0x7fc694505278>

In [7]:
# Reference model: a fresh LeNet trained on the third data partition with its true
# labels. It never sees the owner model or the trigger set during training.
rnd_model = get_lenet_model_for_mnist()

reference_x = x_train[split2:split3]
reference_y = y_train[split2:split3]

# Same watermark tracking callback as for the stolen model, so both runs can be
# compared in the plotting cell.
watermark_set = (trigger['keys'][0], trigger['keys'][1], 'watermark')
rnd_history = AdditionalValidationSets([watermark_set])

rnd_model.fit(reference_x,
              reference_y,
              batch_size=64,
              epochs=5,
              validation_data=(x_test, y_test),
              callbacks=[rnd_history],
              verbose=1)

Train on 20000 samples, validate on 10000 samples
Epoch 1/5
=> watermark_val: 0.51
Epoch 2/5
=> watermark_val: 0.61
Epoch 3/5
=> watermark_val: 0.46
Epoch 4/5
=> watermark_val: 0.58
Epoch 5/5
=> watermark_val: 0.54


<keras.callbacks.History at 0x7fc67c6759e8>

In [8]:
# Evaluate watermark retention: score every model on the watermark trigger set.
wm_inputs = trigger['keys'][0]
wm_labels = trigger['keys'][1]

# evaluate() returns a sequence per model; index 1 is what gets reported below
# (presumably the accuracy metric the models were compiled with - TODO confirm).
original_loss, stolen_loss, random_loss = (
    candidate.evaluate(wm_inputs, wm_labels)
    for candidate in (original_model, stolen_model, rnd_model)
)

report = "Original: {}, Stolen: {}, Random: {}"
print(report.format(original_loss[1], stolen_loss[1], random_loss[1]))

Original: 0.99, Stolen: 0.52, Random: 0.54


In [1]:
# Plot graphs
# Compares test accuracy ('val_acc') and watermark accuracy ('watermark_val') of the
# owner, stolen and reference models over the training epochs.
# NOTE: this cell reads `history`, `stolen_history` and `rnd_history`, which are created
# by the training cells. On a fresh kernel run those cells first, otherwise this cell
# fails with a NameError.

params = {'legend.fontsize': 20, 'legend.handlelength': 2, 'font.size': 16}
plt.rcParams.update(params)  # applied before the axes are created so ticks use the font size

color_original_acc = "blue"
color_original_wm = "green"

color_stolen_acc = "yellow"
color_stolen_wm = "orange"  # was "green", which made it indistinguishable from the owner WM curve

color_rnd_acc = "red"
color_rnd_wm = "lightgreen"

linestyle_test_acc = "x-"    # solid line: test accuracy
linestyle_watermark = "x--"  # dashed line: watermark accuracy
linewidth = 3.0
markersize = 12

fig, ax = plt.subplots(figsize=(20, 10))
ax.set_xlabel('Epochs', fontsize=26)
ax.set_ylabel('Accuracy', fontsize=26)

# Last epoch index of the owner's training; the stolen and reference curves are
# drawn starting from here, and a vertical marker is placed at this epoch.
attack_start = len(history.history['val_acc']) - 1

curves = [
    # (legend label, values, first epoch on the x-axis, format string, colour)
    ('Owner Acc', history.history['val_acc'], 0, linestyle_test_acc, color_original_acc),
    ('Owner WM', history.history['watermark_val'], 0, linestyle_watermark, color_original_wm),
    ('Stolen Acc', stolen_history.history['val_acc'], attack_start, linestyle_test_acc, color_stolen_acc),
    ('Stolen WM', stolen_history.history['watermark_val'], attack_start, linestyle_watermark, color_stolen_wm),
    ('Ref Acc', rnd_history.history['val_acc'], attack_start, linestyle_test_acc, color_rnd_acc),
    ('Ref WM', rnd_history.history['watermark_val'], attack_start, linestyle_watermark, color_rnd_wm),
]

for label, values, first_epoch, fmt, color in curves:
    ax.plot(np.arange(first_epoch, first_epoch + len(values)),
            values,
            fmt,
            linewidth=linewidth,
            markersize=markersize,
            color=color,
            label=label)

# Unlabelled on purpose so the marker does not show up in the legend.
ax.axvline(attack_start,
           linestyle=':',
           linewidth=linewidth,
           color='red')

ax.set_ylim(0, 1.05)
ax.set_xlim(left=0)

ax.grid()

ax.legend(loc='lower left')
plt.show()
    

NameError: name 'history' is not defined