# Part 0: Imports

In [None]:
# Silence every Python warning for the rest of the session
# (presumably to keep the cell outputs readable).
import warnings
warnings.filterwarnings('ignore')
import tensorflow as tf
import numpy as np
from matplotlib import pyplot as plt

# ART (Adversarial Robustness Toolbox): Keras model wrapper and evasion attacks.
from art.estimators.classification import KerasClassifier
from art.attacks.evasion import FastGradientMethod, CarliniLInfMethod, CarliniL2Method , ProjectedGradientDescentTensorFlowV2, AutoProjectedGradientDescent,ProjectedGradientDescent

# Show the installed TensorFlow version.
print(tf.__version__)

# Part 1: Building a Model

In [None]:
# Load MNIST through Keras; each split is an (images, labels) pair.
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

# Divide the pixel values by 255 (one statement per split).
x_train = x_train / 255.0
x_test = x_test / 255.0

# Loss shared by every model in Part 1; it is configured to receive raw logits.
category_loss_from_logits = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

In [None]:
# Baseline classifier: flatten the 28x28 input, one ReLU hidden layer with
# dropout, then a 10-unit output layer.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.InputLayer(input_shape=(28, 28)))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(128, activation='relu'))
model.add(tf.keras.layers.Dropout(0.2))
# Linear output: the model emits raw logits, matching the from_logits loss.
model.add(tf.keras.layers.Dense(10, activation='linear'))

model.compile(
    optimizer='adam',
    loss=category_loss_from_logits,
    metrics=['accuracy'],
)

In [None]:
# Train the baseline model for 5 epochs; the trailing `;` hides the returned object's repr.
model.fit(x=x_train, y=y_train, epochs=5);

In [None]:
# Evaluate the baseline model on the test split and report accuracy in percent.
loss_test, accuracy_test = model.evaluate(x=x_test, y=y_test)
print('Accuracy on test data: {:4.2f}%'.format(100 * accuracy_test))

In [None]:
# Exercise 1: train, for 5 epochs, a model named "model1" with a single hidden
# layer of 64 neurons that uses only *linear* activations (no ReLU), then
# compare its test accuracy with that of the first model (`model`).


model1 = tf.keras.models.Sequential([
     # TODO(exercise): add the layers here -- the list is intentionally left empty.
])

# Same optimizer, loss and metric as the first model, so only the architecture differs.
model1.compile(optimizer='adam',
              loss=category_loss_from_logits,
              metrics=['accuracy']);

model1.fit(x_train,y_train, epochs=5);


# Report the test accuracy of model1 in percent.
loss_test, accuracy_test = model1.evaluate(x_test, y_test)
print('Accuracy on test data: {:4.2f}%'.format(accuracy_test * 100))

In [None]:
# Exercise 2: train, for 5 epochs, a model named "model2" with 5 dense layers of
# 128 neurons each, ReLU activations and no dropout, then compare its test
# accuracy with that of the first model (`model`).

model2 = tf.keras.models.Sequential([
      # TODO(exercise): add the layers here -- the list is intentionally left empty.
])

# Same optimizer, loss and metric as the first model, so only the architecture differs.
model2.compile(optimizer='adam',
              loss=category_loss_from_logits,
              metrics=['accuracy']);

model2.fit(x_train, y_train, epochs=5);
# Report the test accuracy of model2 in percent.
loss_test, accuracy_test = model2.evaluate(x_test, y_test)
print('Accuracy on test data: {:4.2f}%'.format(accuracy_test * 100))

In [None]:
#2bis/ (OPTIONAL CHALLENGE /!\) Train, for 5 epochs, a model named "model3" that reaches more than 99% accuracy on the *test* data.
# Hint: 2D convolutional layers (tf.keras.layers.Conv2D(...)) and max-pooling layers (tf.keras.layers.MaxPooling2D(...)) may help.
# NOTE(review): the images are fed as (28, 28) arrays; Conv2D presumably needs an explicit channel axis (e.g. a Reshape to (28, 28, 1)) -- confirm.
model3 = tf.keras.models.Sequential([
     # TODO(exercise): add the layers here -- the list is intentionally left empty.
])

# Same optimizer, loss and metric as the first model.
model3.compile(optimizer='adam',
              loss=category_loss_from_logits,
              metrics=['accuracy']);

model3.fit(x_train, y_train, epochs=5);
# Report the test accuracy of model3 in percent.
loss_test, accuracy_test = model3.evaluate(x_test, y_test)
print('Accuracy on test data: {:4.2f}%'.format(accuracy_test * 100))

# Part 2: Simple Whitebox Adversarial Attacks

In [None]:
# Wrap the baseline Keras model for ART; inputs are clipped to [0, 1] and the
# model output is declared to be logits.
classifier = KerasClassifier(
    model=model,
    clip_values=(0, 1),
    use_logits=True,
)
# From here on, work only with the first 100 test samples.
x_test, y_test = x_test[:100], y_test[:100]

## FGSM attacks

In [None]:
# FGSM attack against the wrapped baseline model: L-infinity norm, budget 16/255.
attack_fgsm = FastGradientMethod(
    estimator=classifier,
    norm=np.inf,
    eps=16 / 255,
)

In [None]:
# Craft adversarial versions of the test subset (no labels are passed to the attack).
x_test_adv = attack_fgsm.generate(x=x_test)

In [None]:
# Accuracy of the baseline model on the adversarial samples, plus the mean
# absolute per-pixel change introduced by the attack.
loss_test, accuracy_test = model.evaluate(x=x_test_adv, y=y_test)
perturbation = np.abs(x_test_adv - x_test).mean()

print('Accuracy on adversarial test data: {:4.2f}%'.format(100 * accuracy_test))
print('Average perturbation: {:4.2f}'.format(perturbation))

In [None]:
# Prediction and picture for the first clean test image.
# The slice keeps the batch axis, so predict() receives a batch of one.
y_pred_0 = model.predict(x_test[:1])
print(f"predicted class of original image: {y_pred_0.argmax()}")
plt.matshow(x_test[0])
plt.show()

In [None]:
# Prediction and picture for the adversarial version of the first test image.
# The slice keeps the batch axis, so predict() receives a batch of one.
y_pred_0 = model.predict(x_test_adv[:1])
print(f"predicted class of adversarial attack: {y_pred_0.argmax()}")
plt.matshow(x_test_adv[0])
plt.show()

In [None]:
#3/ Run an FGSM attack with the Inf norm and with the L2 norm using EQUIVALENT epsilons,
# e.g. eps=16/255 for the Inf norm and eps=(16/255)*sqrt(784) for the L2 norm (784 = 28*28 pixels).
# Display the results. Which attack produces the better-looking image?
# NOTE(review): this comment originally gave eps=4/255 for the Inf norm, which does not match
# the L2 example; 16/255 is assumed here -- confirm.

attack_fgsm = FastGradientMethod(
         # TODO(exercise): fill in the attack arguments (estimator, eps, norm).
)
x_test_adv = attack_fgsm.generate(x_test)

# Accuracy of the baseline model on the adversarial samples and mean absolute per-pixel change.
loss_test, accuracy_test = model.evaluate(x_test_adv, y_test)
perturbation = np.mean(np.abs((x_test_adv - x_test)))

print('Accuracy on adversarial test data: {:4.2f}%'.format(accuracy_test * 100))
print('Average perturbation: {:4.5f}'.format(perturbation))
# Prediction and picture for the first adversarial image.
y_pred_0 = model.predict(x_test_adv[0:1])
print(f"predicted class of adversarial attack: {y_pred_0.argmax()}")
plt.matshow(x_test_adv[0])
plt.show()

In [None]:
#3/ Run an FGSM attack with the Inf norm and with the L2 norm using EQUIVALENT epsilons,
# e.g. eps=16/255 for the Inf norm and eps=(16/255)*sqrt(784) for the L2 norm (784 = 28*28 pixels).
# Display the results. Which attack produces the better-looking image?
# NOTE(review): this cell repeats the previous exercise cell; presumably one cell is meant
# for each norm -- confirm.

attack_fgsm = FastGradientMethod(
         # TODO(exercise): fill in the attack arguments (estimator, eps, norm).
)
x_test_adv = attack_fgsm.generate(x_test)

# Accuracy of the baseline model on the adversarial samples and mean absolute per-pixel change.
loss_test, accuracy_test = model.evaluate(x_test_adv, y_test)
perturbation = np.mean(np.abs((x_test_adv - x_test)))

print('Accuracy on adversarial test data: {:4.2f}%'.format(accuracy_test * 100))
print('Average perturbation: {:4.5f}'.format(perturbation))
# Prediction and picture for the first adversarial image.
y_pred_0 = model.predict(x_test_adv[0:1])
print(f"predicted class of adversarial attack: {y_pred_0.argmax()}")
plt.matshow(x_test_adv[0])
plt.show()

# Part 3: Other Adversarial Attacks (Carlini-Wagner, APGD...)

In [None]:
# Carlini-Wagner L2 attack on the wrapped baseline model, with a small
# iteration budget (20 iterations).
attack_cw = CarliniL2Method(
    classifier=classifier,
    learning_rate=0.01,
    initial_const=1.0,
    max_iter=20,
)

In [None]:
# Run the CW attack on (at most) the first 100 test samples.
x_test_adv_cw = attack_cw.generate(x=x_test[:100])

In [None]:
# Inspect one CW adversarial example: picture, predicted class and its
# distance to the clean image.
idx = 0
plt.matshow(x_test_adv_cw[idx])
y_pred_adv_cw = model.predict(x_test_adv_cw[idx:idx + 1])
print(f"Predicted class: {y_pred_adv_cw.argmax()}")
print(f"L2 dist to original image {np.linalg.norm(x_test_adv_cw[idx] - x_test[idx],):.2f}")
plt.show()

In [None]:
# Accuracy of the baseline model on the CW adversarial samples.
loss_test, accuracy_test = model.evaluate(x_test_adv_cw, y_test[:100])

# Mean absolute per-pixel change introduced by the attack.
perturbation = np.mean(np.abs(x_test_adv_cw - x_test))

# One norm per image: with a 2-tuple axis and the default `ord`, np.linalg.norm
# returns the Frobenius norm of each image-shaped difference, i.e. its L2
# norm when the image is viewed as a flat vector.
l_2_perturbation = np.linalg.norm(x_test_adv_cw - x_test, axis=(1, 2))
l_2_perturbation_mean_cw = np.mean(l_2_perturbation)

print('Accuracy on adversarial test data: {:4.2f}%'.format(accuracy_test * 100))
print('Average perturbation: {:4.2f}'.format(perturbation))
# Fix: print the value computed in this cell. The former name
# `l_2_perturbation_mean` is not defined yet at this point of the notebook
# and raised NameError on a fresh kernel.
print('Average l2 perturbation: {:4.2f}'.format(l_2_perturbation_mean_cw))

In [None]:
#4/ Generate a new FGSM attack with the same L2 perturbation as the previous CW attack and
# compare their effectiveness, i.e. the success rate for a similar average L2 perturbation.

# Budget for the attack: the mean L2 perturbation measured for the CW attack.
eps= l_2_perturbation_mean_cw
attack_fgsm = FastGradientMethod(

                                      # TODO(exercise): fill in the attack arguments.
)
x_test_adv = attack_fgsm.generate(x_test)

loss_test, accuracy_test = model.evaluate(x_test_adv, y_test)

# Fix: recompute the mean absolute perturbation for THIS attack. The cell used
# to print the `perturbation` value left over from the CW cell.
perturbation = np.mean(np.abs(x_test_adv - x_test))

# One L2 (Frobenius) norm per image, then the average over the samples.
l_2_perturbation = np.linalg.norm(x_test_adv - x_test, axis=(1, 2))
l_2_perturbation_mean = np.mean(l_2_perturbation)


print('Accuracy on adversarial test data: {:4.2f}%'.format(accuracy_test * 100))
print('Average perturbation: {:4.2f}'.format(perturbation))
print('Average l2 perturbation: {:4.2f}'.format(l_2_perturbation_mean))

# Inspect one adversarial example: picture, predicted class and distance to the clean image.
idx = 0
plt.matshow(x_test_adv[idx])
y_pred_adv = model.predict(x_test_adv[idx:idx+1])
print(f"Predicted class: {y_pred_adv.argmax()}")
print(f"L2 dist to original image {np.linalg.norm(x_test_adv[idx] - x_test[idx],):.2f}")
plt.show()

In [None]:
#5/ Generate a new APGD attack (class AutoProjectedGradientDescent) with the same (or lower)
# average L2 perturbation as the previous CW and FGSM attacks, and compare their effectiveness.
# To limit computation time you can pass max_iter=10 and nb_random_init=1 to the constructor.

# Budget for the attack: the mean L2 perturbation measured for the CW attack.
eps= l_2_perturbation_mean_cw

attack_apgd = AutoProjectedGradientDescent(
         # TODO(exercise): fill in the attack arguments.

)
x_test_adv_apgd = attack_apgd.generate(x_test)

loss_test, accuracy_test = model.evaluate(x_test_adv_apgd, y_test)

# Fix: recompute the mean absolute perturbation for THIS attack. The cell used
# to print the `perturbation` value left over from an earlier cell.
perturbation = np.mean(np.abs(x_test_adv_apgd - x_test))

# One L2 (Frobenius) norm per image, then the average over the samples.
# The raw debug print of the mean was dropped: the formatted line below
# already reports the same value.
l_2_perturbation = np.linalg.norm(x_test_adv_apgd - x_test, axis=(1, 2))
l_2_perturbation_mean = np.mean(l_2_perturbation)

print('Accuracy on adversarial test data: {:4.2f}%'.format(accuracy_test * 100))
print('Average perturbation: {:4.2f}'.format(perturbation))
print('Average l2 perturbation: {:4.2f}'.format(l_2_perturbation_mean))

# Inspect one adversarial example: picture, predicted class and distance to the clean image.
idx = 0
plt.matshow(x_test_adv_apgd[idx])
y_pred_adv_apgd = model.predict(x_test_adv_apgd[idx:idx+1])
print(f"Predicted class: {y_pred_adv_apgd.argmax()}")
print(f"L2 dist to original image {np.linalg.norm(x_test_adv_apgd[idx] - x_test[idx],):.2f}")
plt.show()

In [None]:
#5bis/ /!\ OPTIONAL CHALLENGE: find another adversarial attack in the same library
# (https://adversarial-robustness-toolbox.readthedocs.io/en/latest/modules/attacks/evasion.html)
# that achieves better results with the SAME L2 perturbation.
# NOTE: the `?` characters below are placeholders for the variable name and the attack class;
# this cell is not valid Python until they are replaced.

eps= l_2_perturbation_mean_cw
attack_? = ?( eps=eps, norm=2, # TODO(exercise): pick a more efficient attack

)

# Part 4: Transfer attacks

In [None]:
# Wrap the source (model2) and target (model) networks for ART.
# Fix: both models are compiled with a from_logits loss, so their outputs are
# declared as logits, consistent with the classifier built in Part 2.
# ART's default (use_logits=False) would describe them as probabilities.
classifier_source = KerasClassifier(model=model2, clip_values=(0, 1), use_logits=True)
classifier_target = KerasClassifier(model=model, clip_values=(0, 1), use_logits=True)

# The attack is crafted against the SOURCE model only; the target model is
# evaluated on the resulting samples in the following cells.
attack_fgsm = FastGradientMethod(estimator=classifier_source, eps=16/255, norm=np.inf)

In [None]:
# Adversarial samples crafted with the attack configured on the source classifier.
x_transfer_adv = attack_fgsm.generate(x=x_test)

In [None]:
# Mean absolute per-pixel change, then accuracy of both networks on the SAME
# adversarial samples (crafted against model2, transferred to model).
perturbation = np.abs(x_transfer_adv - x_test).mean()
loss_source, accuracy_source = model2.evaluate(x=x_transfer_adv, y=y_test)
loss_target, accuracy_target = model.evaluate(x=x_transfer_adv, y=y_test)

print('Average perturbation: {:4.2f}'.format(perturbation))
print('Accuracy on adversarial test data for source model: {:4.2f}%'.format(100 * accuracy_source))
print('Accuracy on adversarial test data for target model: {:4.2f}%'.format(100 * accuracy_target))

In [None]:
#6/ Compare the transferability of the attack when using the norm 2 and eps=0.3

In [None]:
# Same transfer setup, now with an L2-norm FGSM attack of budget 0.3.
attack_fgsm = FastGradientMethod(
    estimator=classifier_source,
    norm=2,
    eps=0.3,
)

In [None]:
# Adversarial samples crafted with the L2 attack configured on the source classifier.
x_transfer_adv = attack_fgsm.generate(x=x_test)

In [None]:
# Mean absolute per-pixel change, then accuracy of both networks on the SAME
# adversarial samples (crafted against model2, transferred to model).
perturbation = np.abs(x_transfer_adv - x_test).mean()
loss_source, accuracy_source = model2.evaluate(x=x_transfer_adv, y=y_test)
loss_target, accuracy_target = model.evaluate(x=x_transfer_adv, y=y_test)

print('Average perturbation: {:4.2f}'.format(perturbation))
print('Accuracy on adversarial test data for source model: {:4.2f}%'.format(100 * accuracy_source))
print('Accuracy on adversarial test data for target model: {:4.2f}%'.format(100 * accuracy_target))

# Part 5: Adversarial Attack defenses

## Adversarial training

In [None]:
# Extra imports used by the adversarial-training part.
from art import config
from keras.models import load_model
from art.utils import get_file, load_dataset

# Reload MNIST through ART. This REBINDS x_train / y_train / x_test / y_test from Part 1
# and also returns min_ / max_, which are passed as `clip_values` to the classifiers below.
# NOTE(review): labels look 2-D here (later cells take np.argmax(y_test, axis=1)), unlike the
# integer labels used with the sparse loss in Part 1; re-running earlier cells after this
# one would presumably fail -- confirm.
(x_train, y_train), (x_test, y_test), min_, max_ = load_dataset('mnist')


In [None]:
# Fetch the pretrained "original" CNN into ART's data directory and load it.
# NOTE(review): the model file comes from an external URL -- only load it if the source is trusted.
path = get_file(
    'mnist_cnn_original.h5',
    path=config.ART_DATA_PATH,
    url='https://www.dropbox.com/s/p2nyzne9chcerid/mnist_cnn_original.h5?dl=1',
    extract=False,
)
classifier_model = load_model(path)
# Rebinds `classifier` (previously the Part 2 wrapper); the output is declared as non-logits.
classifier = KerasClassifier(
    model=classifier_model,
    clip_values=(min_, max_),
    use_logits=False,
)

In [None]:
# Print the layer-by-layer summary of the pretrained original CNN.
classifier_model.summary()

In [None]:
# Clean-data baseline for the original classifier: predicted class = argmax of
# the outputs, true class = argmax of the label rows.
x_test_pred = classifier.predict(x_test).argmax(axis=1)
nb_correct_pred = (x_test_pred == y_test.argmax(axis=1)).sum()

print("Original test data:")
print("Correctly classified: {}".format(nb_correct_pred))
print("Incorrectly classified: {}".format(len(x_test)-nb_correct_pred))
print("Accuracy: {}".format(nb_correct_pred/len(x_test)*100))

In [None]:
# FGSM attack (budget 0.5) on the original classifier; the true labels are passed to generate().
attacker = FastGradientMethod(estimator=classifier, eps=0.5)
x_test_adv = attacker.generate(x=x_test, y=y_test)

In [None]:
# Accuracy of the original classifier on its own FGSM adversarial samples.
x_test_adv_pred = classifier.predict(x_test_adv).argmax(axis=1)
nb_correct_adv_pred = (x_test_adv_pred == y_test.argmax(axis=1)).sum()

print("Adversarial test data:")
print("Correctly classified: {}".format(nb_correct_adv_pred))
print("Incorrectly classified: {}".format(len(x_test_adv)-nb_correct_adv_pred))
print("Accuracy: {}".format(nb_correct_adv_pred/len(x_test_adv)*100))

In [None]:
# Fetch the pretrained "robust" CNN into ART's data directory and load it.
# NOTE(review): the model file comes from an external URL -- only load it if the source is trusted.
path = get_file(
    'mnist_cnn_robust.h5',
    path=config.ART_DATA_PATH,
    url='https://www.dropbox.com/s/yutsncaniiy5uy8/mnist_cnn_robust.h5?dl=1',
    extract=False,
)
robust_classifier_model = load_model(path)
# Same wrapper settings as the original classifier.
robust_classifier = KerasClassifier(
    model=robust_classifier_model,
    clip_values=(min_, max_),
    use_logits=False,
)

In [None]:
# Print the layer-by-layer summary of the pretrained robust CNN.
robust_classifier_model.summary()

In [None]:
# Clean-data baseline for the robust classifier.
x_test_robust_pred = robust_classifier.predict(x_test).argmax(axis=1)
nb_correct_robust_pred = (x_test_robust_pred == y_test.argmax(axis=1)).sum()

print("Original test data:")
print("Correctly classified: {}".format(nb_correct_robust_pred))
print("Incorrectly classified: {}".format(len(x_test)-nb_correct_robust_pred))
print("Accuracy: {}".format(nb_correct_robust_pred/len(x_test)*100))

In [None]:
# FGSM attack (budget 0.5) on the robust classifier; the true labels are passed to generate().
attacker_robust = FastGradientMethod(estimator=robust_classifier, eps=0.5)
x_test_adv_robust = attacker_robust.generate(x=x_test, y=y_test)

In [None]:
# Accuracy of the robust classifier on its own FGSM adversarial samples.
x_test_adv_robust_pred = robust_classifier.predict(x_test_adv_robust).argmax(axis=1)
nb_correct_adv_robust_pred = (x_test_adv_robust_pred == y_test.argmax(axis=1)).sum()

print("Adversarial test data:")
print("Correctly classified: {}".format(nb_correct_adv_robust_pred))
print("Incorrectly classified: {}".format(len(x_test_adv_robust)-nb_correct_adv_robust_pred))
print("Accuracy: {}".format(nb_correct_adv_robust_pred/len(x_test_adv_robust)*100))

### Against stronger attacks: PGD

In [None]:
# NOTE: ProjectedGradientDescent is already imported in the first cell; this repeats it.
from art.attacks.evasion import ProjectedGradientDescent

# One PGD attacker per classifier, with identical settings. `eps` is only a
# starting value: the sweep cell overrides it through set_params().
attacker_pgd = ProjectedGradientDescent(
    estimator=classifier,
    eps=0.5,
    eps_step=0.01,
    max_iter=100,
    verbose=False,
)
attacker_robust_pgd = ProjectedGradientDescent(
    estimator=robust_classifier,
    eps=0.5,
    eps_step=0.01,
    max_iter=100,
    verbose=False,
)

In [None]:
# Perturbation budgets to sweep. A finer grid that also covers 0.01-0.04 used to be
# assigned here and was immediately overwritten (dead code); prepend those values
# to this list if a finer, slower sweep is wanted.
eps_range = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
nb_correct_original = []
nb_correct_robust = []

# Only the first nb_samples test images are attacked, to keep PGD affordable.
nb_samples = 100
x_subset = x_test[:nb_samples]
y_subset = y_test[:nb_samples]
# True classes of the subset, computed once instead of on every iteration.
labels_subset = np.argmax(y_subset, axis=1)

for eps in eps_range:
    print("Running eps {}".format(eps))
    attacker_pgd.set_params(eps=eps)
    attacker_robust_pgd.set_params(eps=eps)
    x_test_adv = attacker_pgd.generate(x_subset, y_subset)
    x_test_adv_robust = attacker_robust_pgd.generate(x_subset, y_subset)

    # Fraction of the subset still classified correctly by each model.
    x_test_adv_pred = np.argmax(classifier.predict(x_test_adv), axis=1)
    nb_correct_original.append(np.sum(x_test_adv_pred == labels_subset) / nb_samples)

    x_test_adv_robust_pred = np.argmax(robust_classifier.predict(x_test_adv_robust), axis=1)
    nb_correct_robust.append(np.sum(x_test_adv_robust_pred == labels_subset) / nb_samples)

# Prepend the clean (eps = 0) accuracies measured earlier on the FULL test set.
# The denominator is len(x_test) rather than a hard-coded 10000, so it stays
# correct if the test set size changes.
# NOTE: the eps = 0 point uses the full test set; the other points use only
# the first nb_samples images.
eps_range = [0] + eps_range
nb_correct_original = [nb_correct_pred / len(x_test)] + nb_correct_original
nb_correct_robust = [nb_correct_robust_pred / len(x_test)] + nb_correct_robust



In [None]:
# Accuracy of both classifiers as a function of the PGD budget.
fig, ax = plt.subplots()
ax.plot(np.array(eps_range), np.array(nb_correct_original), 'b--', label='Original classifier')
ax.plot(np.array(eps_range), np.array(nb_correct_robust), 'r--', label='Robust classifier')

# Use the Axes API throughout instead of mixing it with pyplot's implicit state;
# the unused `legend` variable was dropped.
ax.legend(loc='upper right', shadow=True, fontsize='large')

# Fix: the x-axis label was misspelled ("Peturbation").
ax.set_xlabel('Perturbation size (eps, L-Inf)')
ax.set_ylabel('Classification Accuracy')
plt.show()

In [None]:
#8/ What happens when max_iter is reduced to 10? Which model performs better in that case?



1.   List item
2.   List item

