1. Use a deep network for the MNIST data set. Perform at least three different types of
targeted attacks on 5 different numbers, including one attack that puts particular
emphasis on making the attacked pattern indistinguishable from the original one.
Evaluate the performance of the attacks visually (which attack does not change the
visual impression) and quantitatively (distance of the attack to the original sample,
success rate of the approach).


In [34]:
#importing all the required libraries

import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras import backend as K

In [35]:
# Fetch MNIST; load_data() yields one (images, labels) pair per split.
train_split, test_split = mnist.load_data()
x_train, y_train = train_split
x_test, y_test = test_split

In [37]:
# Preprocessing: flatten every image into a 784-element row, cast to
# float32 and divide by 255 so the network sees scaled pixel values.

x_train = x_train.reshape(len(x_train), 784).astype('float32') / 255.0
x_test = x_test.reshape(len(x_test), 784).astype('float32') / 255.0


In [38]:
# One-hot encode the labels of both splits (10 classes).
y_train, y_test = [
    tf.keras.utils.to_categorical(labels, 10) for labels in (y_train, y_test)
]

In [39]:
# Deep fully connected classifier: two 512-unit ReLU layers, each followed
# by 20% dropout, and a 10-way softmax output.
model = Sequential([
    Dense(512, activation='relu', input_shape=(784,)),
    Dropout(0.2),
    Dense(512, activation='relu'),
    Dropout(0.2),
    Dense(10, activation='softmax'),
])

In [40]:
# Configure the optimiser/loss, then train for 10 epochs; the test split is
# passed as validation data so its metrics are computed after every epoch.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.fit(
    x_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(x_test, y_test),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f606c22a110>

Now we define 3 different targeted attacks:

1. Fast Gradient Sign Method (FGSM)

2. DeepFool

3. Adversarial Examples via Transformation-Based Error Feedback (ABBA), implemented below as repeated small signed-gradient steps.

In [41]:
# Define function for FGSM attack
def fgsm_attack(input_image, target_label, epsilon=0.1):
    """Craft a targeted FGSM adversarial example against the global ``model``.

    A single signed-gradient step of size ``epsilon`` is taken in the
    direction that *decreases* the cross-entropy between the model output
    and ``target_label``, i.e. towards the target class.

    Args:
        input_image: One sample as an array-like; it is flattened into a
            single row before being fed to the model.
        target_label: Either a one-hot vector for the desired class or an
            integer class index.
        epsilon: Step size applied to every input component.

    Returns:
        numpy.ndarray with the same shape as ``input_image``, clipped to
        the range [0, 1].
    """
    original = np.asarray(input_image, dtype=np.float32)
    # The model expects a leading batch axis; a flat sample on its own is
    # rejected by the first Dense layer.
    batch = tf.convert_to_tensor(original.reshape(1, -1))

    target = np.asarray(target_label, dtype=np.float32)
    if target.ndim == 0:
        # An integer class index was given: expand it to a one-hot vector.
        num_classes = int(model.output_shape[-1])
        target = np.eye(num_classes, dtype=np.float32)[int(target)]
    target = tf.convert_to_tensor(target.reshape(1, -1))

    with tf.GradientTape() as tape:
        tape.watch(batch)
        prediction = model(batch, training=False)
        loss = K.categorical_crossentropy(target, prediction)

    gradient = tape.gradient(loss, batch)
    # Targeted attack: step *against* the gradient so the loss with respect
    # to the target class goes down (adding the step would push away from it).
    perturbed = batch - epsilon * tf.sign(gradient)
    perturbed = np.clip(perturbed.numpy(), 0.0, 1.0)

    return perturbed.reshape(original.shape)

In [42]:
def deepfool_attack(input_image, num_classes=10, overshoot=0.02, max_iter=50,
                    target_label=None):
    """Run a DeepFool-style attack against the global ``model``.

    At every iteration the decision boundary between the currently
    predicted class and each rival class is linearised, and the smallest
    step that reaches the nearest boundary is added to an accumulated
    perturbation. The adversarial sample is always rebuilt from the clean
    input plus ``(1 + overshoot)`` times that accumulated perturbation.

    The margins are computed on the model outputs as they are (the same
    quantity the previous implementation used).

    Args:
        input_image: One sample as an array-like; it is flattened into a
            single row before being fed to the model.
        num_classes: Number of output classes to consider as rivals.
        overshoot: Extra fraction added to the accumulated perturbation so
            the sample actually crosses the boundary.
        max_iter: Upper bound on the number of iterations.
        target_label: Optional one-hot vector or integer class index. When
            given, only the boundary towards that class is used and the
            loop stops once the model predicts it; when ``None`` the loop
            stops as soon as the prediction differs from the original one.

    Returns:
        numpy.ndarray with the same shape as ``input_image``, clipped to
        the range [0, 1]. If ``max_iter`` is exhausted the last iterate is
        returned even though the stopping condition was not met.
    """
    original = np.asarray(input_image, dtype=np.float32)
    # The model expects a leading batch axis.
    clean = tf.constant(original.reshape(1, -1))

    target_class = None
    if target_label is not None:
        target_arr = np.asarray(target_label)
        if target_arr.ndim == 0:
            target_class = int(target_arr)
        else:
            target_class = int(np.argmax(target_arr))

    source_class = int(tf.argmax(model(clean, training=False)[0]))

    total_shift = tf.zeros_like(clean)
    adversarial = clean

    for _ in range(max_iter):
        # Persistent tape: one gradient call is needed per rival class.
        with tf.GradientTape(persistent=True) as tape:
            tape.watch(adversarial)
            scores = model(adversarial, training=False)[0]
            current = int(tf.argmax(scores))
            if target_class is None:
                fooled = current != source_class
            else:
                fooled = current == target_class
            if not fooled:
                if target_class is None:
                    rivals = [k for k in range(num_classes) if k != current]
                else:
                    rivals = [target_class]
                margins = [scores[k] - scores[current] for k in rivals]
        if fooled:
            del tape
            break

        # Pick the rival whose linearised boundary is closest.
        best_dist, best_grad, best_norm = np.inf, None, None
        for margin in margins:
            grad = tape.gradient(margin, adversarial)
            grad_norm = float(tf.norm(grad))
            if grad_norm == 0.0:
                # A flat margin gives no usable direction; skip this rival.
                continue
            dist = abs(float(margin)) / grad_norm
            if dist < best_dist:
                best_dist, best_grad, best_norm = dist, grad, grad_norm
        del tape

        if best_grad is None:
            # Every gradient vanished; no further progress is possible.
            break

        # The small constant keeps the step non-zero when the sample sits
        # exactly on the boundary.
        total_shift = total_shift + (best_dist + 1e-4) * best_grad / best_norm
        # Rebuild from the clean input so the perturbation does not compound.
        adversarial = tf.clip_by_value(
            clean + (1 + overshoot) * total_shift, 0.0, 1.0
        )

    return adversarial.numpy().reshape(original.shape)

In [43]:
def abba_attack(input_image, target_label, num_iterations=50, epsilon=0.01):
    """Craft a targeted adversarial example with repeated signed-gradient steps.

    Starting from ``input_image``, ``num_iterations`` steps of size
    ``epsilon`` are taken, each in the direction that *decreases* the
    cross-entropy between the output of the global ``model`` and
    ``target_label``. The iterate is clipped to [0, 1] after every step.

    Args:
        input_image: One sample as an array-like; it is flattened into a
            single row before being fed to the model.
        target_label: Either a one-hot vector for the desired class or an
            integer class index.
        num_iterations: Number of gradient steps.
        epsilon: Per-step size applied to every input component.

    Returns:
        numpy.ndarray with the same shape as ``input_image``.
    """
    original = np.asarray(input_image, dtype=np.float32)
    # The model expects a leading batch axis.
    adversarial = tf.constant(original.reshape(1, -1))

    target = np.asarray(target_label, dtype=np.float32)
    if target.ndim == 0:
        # An integer class index was given: expand it to a one-hot vector.
        num_classes = int(model.output_shape[-1])
        target = np.eye(num_classes, dtype=np.float32)[int(target)]
    target = tf.constant(target.reshape(1, -1))

    for _ in range(num_iterations):
        with tf.GradientTape() as tape:
            # The iterate is a plain tensor, so it must be watched explicitly.
            tape.watch(adversarial)
            prediction = model(adversarial, training=False)
            loss = K.categorical_crossentropy(target, prediction)

        gradient = tape.gradient(loss, adversarial)
        # Targeted attack: descend the target-class loss.
        adversarial = tf.clip_by_value(
            adversarial - epsilon * tf.sign(gradient), 0.0, 1.0
        )

    # Hand back a NumPy array so callers can use ndarray methods on it.
    return adversarial.numpy().reshape(original.shape)


In [44]:
# Perform targeted attacks and evaluate results
def evaluate_attack(attack_func, attack_indices):
    successful_attacks = 0
    distance_sum = 0
    for i in attack_indices:
        original_image = x_test[i]
        target_label = (y_test[i] + 1) % 10  # Choose a different target class

        perturbed_image = attack_func(original_image, target_label)
        perturbed_label = np.argmax(model.predict(np.reshape(perturbed_image, (1, 784))))

        distance = np.mean(np.abs(original_image - perturbed_image))
        distance_sum += distance

        if perturbed_label == target_label:
            successful_attacks += 1

        # Display the images
        plt.figure()
        plt.subplot(1, 2, 1)
        plt.imshow(original_image.reshape((28, 28)), cmap='gray')
        plt.title('Original')
        plt.axis('off')
        plt.subplot(1, 2, 2)
        plt.imshow(perturbed_image.reshape((28, 28)), cmap='gray')
        plt.title('Perturbed')
        plt.axis('off')
        plt.show()

        print(f"Sample {i+1}: Original Label - {np.argmax(y_test[i])}, Perturbed Label - {perturbed_label}")

In [45]:
# Choose 5 different numbers to attack
# These are positions in the test split; evaluate_attack reads x_test[i] and
# y_test[i] for each of them.
# NOTE(review): confirm that these five positions hold five distinct digit
# classes - the list itself does not guarantee it.
attack_indices = [0, 100, 500, 1000, 1500]

In [46]:
# Run the three attacks (FGSM, DeepFool, ABBA) in turn on the same samples,
# printing a banner before each evaluation.
for banner, attack in (
    ("FGSM Attack:", fgsm_attack),
    ("\nDeepFool Attack:", deepfool_attack),
    ("ABBA Attack:", abba_attack),
):
    print(banner)
    evaluate_attack(attack, attack_indices)

FGSM Attack:


ValueError: ignored