# Implementing targeted adversarial attacks with Keras and TensorFlow

In [1]:
# import necessary packages
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.applications.resnet50 import decode_predictions
from tensorflow.keras.applications.resnet50 import preprocess_input
import tensorflow as tf
import numpy as np
import argparse
import cv2

In [2]:
def preprocess_image(image):
	# swap color channels, resize the input image, and add a batch
	# dimension
	image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
	image = cv2.resize(image, (224, 224))
	image = np.expand_dims(image, axis=0)
	# return the preprocessed image
	return image

In [3]:
def clip_eps(tensor, eps):
	# clip the values of the tensor to a given range and return it
	return tf.clip_by_value(tensor, clip_value_min=-eps, clip_value_max=eps)

* This function will be used when we construct our perturbation vector, ensuring that the noise vector we construct falls within tolerable limits, and most importantly, does not significantly impact the visual quality of the output adversarial image.

In [4]:
def generate_targeted_adversaries(model, baseImage, delta, classIdx, target, steps=500):
	# iterate over the number of steps
	for step in range(0, steps):
		# record our gradients
		with tf.GradientTape() as tape:
			# explicitly indicate that our perturbation vector should
			# be tracked for gradient updates
			tape.watch(delta)
			# add our perturbation vector to the base image and
			# preprocess the resulting image
			adversary = preprocess_input(baseImage + delta)
			# run this newly constructed image tensor through our
			# model and calculate the loss with respect to the
			# both the *original* class label and the *target*
			# class label
			predictions = model(adversary, training=False)
			originalLoss = -sccLoss(tf.convert_to_tensor([classIdx]),
				predictions)
			targetLoss = sccLoss(tf.convert_to_tensor([target]),
				predictions)
			totalLoss = originalLoss + targetLoss
			# check to see if we are logging the loss value, and if
			# so, display it to our terminal
			if step % 20 == 0:
				print("step: {}, loss: {}...".format(step, totalLoss.numpy()))
		# calculate the gradients of loss with respect to the
		# perturbation vector
		gradients = tape.gradient(totalLoss, delta)
		# update the weights, clip the perturbation vector, and
		# update its value
		optimizer.apply_gradients([(gradients, delta)])
		delta.assign_add(clip_eps(delta, eps=EPS))
	# return the perturbation vector
	return delta

##### originalLoss:
Computes the negative sparse categorical cross-entropy loss with respect to the original class label.
##### targetLoss: 
Derives the positive categorical cross-entropy loss with respect to the target class label (i.e., what we want the image adversary to be misclassified as, hence the term targeted adversarial attack). We take the negative/positive signs that way because our objective is to minimize the probability for the true class and maximize the probability of the target class.
##### totalLoss: 
Sum of the original loss and the targeted loss.


In [5]:
# construct the argument parser and parse the arguments
# construct the argument parser and parse the arguments
image_in_path = 'pyimagesearch/pig.jpg'
image_out_path = 'pyimagesearch/adversarial.png'
#"ImageNet class ID of the predicted label"
class_idx        = 341 # Hog
target_class_idx = 189 # Lakeland_terrier

In [6]:
# define the epsilon and learning rate constants
EPS = 2 / 255.0
LR = 5e-3
# load image from disk and preprocess it
print("[INFO] loading image...")
image = cv2.imread(image_in_path)
image = preprocess_image(image)
print("[INFO] loading finished size is {}".format(image.shape))

[INFO] loading image...
[INFO] loading finished size is (1, 224, 224, 3)


* our epsilon (EPS) value used for clipping tensors when constructing the adversarial image. An EPS value of 2 / 255.0 is a standard value used in adversarial publications and tutorials
* A value of LR = 5e-3 was obtained by empirical tuning — you may need to update this value when constructing your own adversarial images.

In [7]:
# load the pre-trained ResNet50 model for running inference
print("[INFO] loading pre-trained ResNet50 model...")
model = ResNet50(weights="imagenet")
# initialize optimizer and loss function
optimizer = Adam(learning_rate=LR)
sccLoss = SparseCategoricalCrossentropy()
# create a tensor based off the input image and initialize the
# perturbation vector (we will update this vector via training)
baseImage = tf.constant(image, dtype=tf.float32)
delta = tf.Variable(tf.zeros_like(baseImage), trainable=True)

[INFO] loading pre-trained ResNet50 model...


In [8]:
# generate the perturbation vector to create an adversarial example
print("[INFO] generating perturbation...")
deltaUpdated = generate_targeted_adversaries(model, baseImage, delta, class_idx, target_class_idx)
# create the adversarial example, swap color channels, and save the
# output image to disk
print("[INFO] creating targeted adversarial example...")
adverImage = (baseImage + deltaUpdated).numpy().squeeze()
adverImage = np.clip(adverImage, 0, 255).astype("uint8")
adverImage = cv2.cvtColor(adverImage, cv2.COLOR_RGB2BGR)
#cv2.imwrite(args["output"], adverImage)

[INFO] generating perturbation...
step: 0, loss: 16.11762809753418...
step: 20, loss: 14.135979652404785...
step: 40, loss: 8.167085647583008...
step: 60, loss: 4.767631530761719...
step: 80, loss: 2.3731980323791504...
step: 100, loss: 0.6528186798095703...
step: 120, loss: -0.6520843505859375...
step: 140, loss: -1.7873167991638184...
step: 160, loss: -2.8169145584106445...
step: 180, loss: -3.7497940063476562...
step: 200, loss: -4.595920085906982...
step: 220, loss: -5.411230087280273...
step: 240, loss: -6.2791290283203125...
step: 260, loss: -7.076803684234619...
step: 280, loss: -7.859404563903809...
step: 300, loss: -8.648221969604492...
step: 320, loss: -9.431211471557617...
step: 340, loss: -10.133853912353516...
step: 360, loss: -10.722539901733398...
step: 380, loss: -11.269286155700684...
step: 400, loss: -11.774765014648438...
step: 420, loss: -12.33942985534668...
step: 440, loss: -12.890159606933594...
step: 460, loss: -13.371463775634766...
step: 480, loss: -13.8769931

In [None]:
# run inference with this adversarial example, parse the results,
# and display the top-1 predicted result
print("[INFO] running inference on the adversarial example...")
preprocessedImage = preprocess_input(baseImage + deltaUpdated)
predictions = model.predict(preprocessedImage)
predictions = decode_predictions(predictions, top=3)[0]
label = predictions[0][1]
confidence = predictions[0][2] * 100
print("[INFO] label: {} confidence: {:.2f}%".format(label, confidence))
# write the top-most predicted label on the image along with the
# confidence score
text = "{}: {:.2f}%".format(label, confidence)
cv2.putText(adverImage, text, (3, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
# show the output image
cv2.imshow("Output", adverImage)
cv2.waitKey(0)

[INFO] running inference on the adversarial example...
[INFO] label: water_bottle confidence: 21.59%
