# CS 510 Project 3

In [None]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Part 0: train your own MNIST model (do not modify code in this part!)

The following training code is mainly based on [TensorFlow 2 quickstart for experts](https://www.tensorflow.org/tutorials/quickstart/advanced). The goal of this part is to familiarize yourself with the basic usage of Colab notebooks and TensorFlow. In this part, you only need to read and run the code. By the end of this part, you will have a neural network that can classify the handwritten digits with ~95% accuracy (training stops as soon as the test accuracy exceeds 95%).

This is a [Google Colaboratory](https://colab.research.google.com/notebooks/welcome.ipynb) notebook file. Python programs are run directly in the browser—a great way to learn and use TensorFlow. To follow this tutorial, run the notebook in Google Colab by clicking the button at the top of this page.

1. In Colab, connect to a Python runtime: At the top-right of the menu bar, select *CONNECT*.
2. Run all the notebook code cells: Select *Runtime* > *Run all*.


Download and install TensorFlow 2.

Note: Upgrade `pip` to install the TensorFlow 2 package. See the [install guide](https://www.tensorflow.org/install) for details.

Import TensorFlow into your program:


In [None]:
# you can skip this if you use Google colab
!pip install numpy
!pip install tensorflow
!pip install keras
!pip install matplotlib

In [None]:
import random
import numpy as np

import tensorflow as tf
print("TensorFlow version:", tf.__version__)

from tensorflow.keras.layers import Dense, Flatten, Conv2D
from tensorflow.keras import Model

# Seed the Python, TensorFlow and NumPy global random number generators with a
# fixed value so that every run starts from the same random state.
random.seed(0)
tf.random.set_seed(0)
np.random.seed(0)

In [None]:
# Never abbreviate printed NumPy arrays with "...": every element is shown.
# NOTE(review): displaying large arrays will therefore produce very long output.
np.set_printoptions(threshold=np.inf)

In [None]:
# Re-seed all random number generators right before the data pipeline and the
# model are created, and ask TensorFlow for deterministic op implementations.
random.seed(0)
tf.random.set_seed(0)
np.random.seed(0)
# NOTE(review): set_random_seed presumably already covers the three calls
# above - confirm against the Keras documentation before removing them.
tf.keras.utils.set_random_seed(0)
tf.config.experimental.enable_op_determinism()

Load and prepare the [MNIST dataset](http://yann.lecun.com/exdb/mnist/).

In [None]:
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()


def _prepare_images(raw_images):
  """Divide the pixel values by 255 and append a trailing channels axis."""
  scaled = raw_images / 255.0
  # Add a channels dimension
  return scaled[..., tf.newaxis].astype("float32")


x_train = _prepare_images(x_train)
x_test = _prepare_images(x_test)

Use `tf.data` to batch and shuffle the dataset:

In [None]:
# Training pipeline: shuffle with a 10000-element buffer, then batch by 32.
train_ds = (
    tf.data.Dataset.from_tensor_slices((x_train, y_train))
    .shuffle(10000)
    .batch(32)
)

# Test pipeline: same batch size, original order (no shuffling).
test_ds = (
    tf.data.Dataset.from_tensor_slices((x_test, y_test))
    .batch(32)
)

In [None]:
# Sanity check: after the channels axis was appended the array is 4-D with a
# trailing dimension of size 1.
x_train.shape

Show some training images.

In [None]:
import matplotlib.pyplot as plt
from matplotlib import cm

# Preview the first nine images of one shuffled training batch on a 3x3 grid,
# each titled with its label.
fig, axes = plt.subplots(3, 3, figsize=(10, 10))
for images, labels in train_ds.take(1):
  for ax, image, label in zip(axes.flat, images, labels):
    # Images are stored in [0, 1]; scale back to 0..255 for display.
    pixels = (image * 255).numpy().astype("uint8").reshape(28, 28)
    ax.imshow(pixels, cmap='gray', vmin=0, vmax=255)
    ax.set_title(label.numpy())
    ax.axis("off")

Build the `tf.keras` model using the Keras [model subclassing API](https://www.tensorflow.org/guide/keras#model_subclassing):

In [None]:
class MyModel(Model):
  """Fully connected classifier: Flatten -> Dense(100) -> Dense(20) -> Dense(10)."""

  def __init__(self):
    super().__init__()
    self.flatten = Flatten()
    self.d1 = Dense(100, activation='relu')
    self.d2 = Dense(20, activation='relu')
    # No activation here: the model outputs raw scores and the loss function
    # applies the softmax.
    self.final = Dense(10)

  def call(self, x):
    """Return the 10 raw class scores for a batch of images."""
    hidden = self.d2(self.d1(self.flatten(x)))
    return self.final(hidden)

  def get_internal_activations(self, x):
    """Return the outputs of the two hidden layers as `(d1_out, d2_out)`."""
    d1_out = self.d1(self.flatten(x))
    d2_out = self.d2(d1_out)
    return d1_out, d2_out


# Create an instance of the model
model = MyModel()

Choose an optimizer and loss function for training:

In [None]:
@tf.function
def loss_object(target, pred):
  """Summed softmax cross-entropy between integer labels and raw scores.

  Args:
    target: integer class labels (one per sample), one-hot encoded over the
      10 classes before use.
    pred: raw (pre-softmax) class scores produced by the model.

  Returns:
    A scalar: the cross-entropy summed (not averaged) over the batch.
  """
  target = tf.one_hot(target, 10)
  # log_softmax computes log(softmax(pred)) in a numerically stable way.
  # Taking tf.math.log of a softmax output that underflowed to 0 yields -inf,
  # and 0 * -inf in the product below would turn the whole loss into NaN.
  log_conf = tf.nn.log_softmax(pred)
  cross_entropy = -tf.reduce_sum(target * log_conf)
  return cross_entropy

optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)

Select metrics to measure the loss and the accuracy of the model. These metrics accumulate the values over epochs and then print the overall result.

In [None]:
def _make_metrics(prefix):
  """Create the (mean loss, sparse categorical accuracy) metric pair for one split."""
  loss_metric = tf.keras.metrics.Mean(name=f'{prefix}_loss')
  accuracy_metric = tf.keras.metrics.SparseCategoricalAccuracy(name=f'{prefix}_accuracy')
  return loss_metric, accuracy_metric


train_loss, train_accuracy = _make_metrics('train')

test_loss, test_accuracy = _make_metrics('test')

Use `tf.GradientTape` to train the model:

In [None]:
@tf.function
def train_step(images, labels):
  """Run one optimizer step on a batch and record its loss and accuracy."""
  with tf.GradientTape() as tape:
    # training=True is only needed if there are layers with different
    # behavior during training versus inference (e.g. Dropout).
    logits = model(images, training=True)
    batch_loss = loss_object(labels, logits)

  weights = model.trainable_variables
  grads = tape.gradient(batch_loss, weights)
  optimizer.apply_gradients(zip(grads, weights))

  # Accumulate into the running training metrics.
  train_loss.update_state(batch_loss)
  train_accuracy.update_state(labels, logits)

Test the model:

In [None]:
@tf.function
def test_step(images, labels):
  """Evaluate the model on one batch and record its loss and accuracy."""
  # training=False is only needed if there are layers with different
  # behavior during training versus inference (e.g. Dropout).
  logits = model(images, training=False)
  batch_loss = loss_object(labels, logits)

  # Accumulate into the running test metrics.
  test_loss.update_state(batch_loss)
  test_accuracy.update_state(labels, logits)

In [None]:
EPOCHS = 20

for epoch in range(EPOCHS):
  # Reset the metrics at the start of each epoch so the printed values describe
  # this epoch only. `reset_state()` is the supported method name; the older
  # `reset_states()` spelling is deprecated and no longer exists in recent
  # Keras releases.
  for metric in (train_loss, train_accuracy, test_loss, test_accuracy):
    metric.reset_state()

  for images, labels in train_ds:
    train_step(images, labels)

  for test_images, test_labels in test_ds:
    test_step(test_images, test_labels)

  print(
    f'Epoch {epoch + 1}, '
    f'Loss: {train_loss.result()}, '
    f'Accuracy: {train_accuracy.result() * 100}, '
    f'Test Loss: {test_loss.result()}, '
    f'Test Accuracy: {test_accuracy.result() * 100}'
  )

  # Stop early once the model is accurate enough for the later parts.
  if test_accuracy.result() > .95:
    break

The image classifier is now trained to ~95% accuracy on this dataset. To learn more, read the [TensorFlow tutorials](https://www.tensorflow.org/tutorials).


# Part 1: Targeted Adversarial Attacks (50 points)

In this part, you need to find adversarial images that:

   - are perturbed from images that are labeled 5 and that MyModel() predicts as 5

   - MyModel(), after perturbation, predicts as 0


We will start with base images (with ground-truth label 5).
Specifically, you should implement the make_noise_pattern() function.

In [None]:
# These are example parameters (they worked with my code); you can modify them if you want.
#   EPSILON: factor the noise pattern is multiplied by in every perturbation step.
#   EPOCHS:  number of additional perturbation steps applied after the first one
#            (this rebinds the EPOCHS name that the training loop also uses).
#   BATCH:   number of images that are perturbed together in the attack loop.
EPSILON = .01
EPOCHS  = 10
BATCH   = 10

In [None]:
# These must not be changed
#   ORIGINAL_LABEL: ground-truth digit of the images the attack starts from.
#   TARGET_LABEL:   digit the model has to predict for the perturbed images.
ORIGINAL_LABEL = 5
TARGET_LABEL   = 0

# The attack loop stops as soon as at least this many adversarial samples exist.
NUM_SAMPLES    = 200

In [None]:
# Write this function. It should return the sign of a gradient taken with
# respect to the input images.
# Useful methods:
#   1. tape.gradient( xxx, xxx )
#   2. tf.sign()
#
# Please refer to https://www.tensorflow.org/tutorials/generative/adversarial_fgsm
# and see the create_adversarial_pattern() function.
#

def make_noise_pattern(x):
  """Return the element-wise sign of a loss gradient with respect to `x`.

  The attack loop adds `EPSILON * make_noise_pattern(...)` to the images, so
  the result must have the same shape as `x`.

  Args:
    x: batch of images. The attack loop passes a `tf.Variable` on the first
      call and a plain tensor on all later calls.

  Returns:
    `tf.sign` of the gradient of `loss` with respect to `x`.

  Raises:
    NotImplementedError: as long as `loss` has not been filled in.
  """
  loss = None
  with tf.GradientTape() as tape:
    # Plain tensors are only tracked by the tape when they are watched
    # explicitly; without this, tape.gradient() would return None for them.
    tape.watch(x)
    # Write your code here: assign the scalar to differentiate to `loss`.

  if loss is None:
    raise NotImplementedError(
        'make_noise_pattern() is not implemented yet: compute `loss` inside '
        'the tf.GradientTape block.')
  return tf.sign(tape.gradient(loss, x))

In [None]:
# Do not modify this cell!
#
# Iteratively perturbs batches of ORIGINAL_LABEL test images and collects the
# perturbed images the model classifies as TARGET_LABEL, together with the
# unperturbed images they were derived from.

# Empty accumulators with the image shape, so tf.concat can append to them.
adv_samples = tf.reshape((), (0, 28, 28, 1))
orig_samples = tf.reshape((), (0, 28, 28, 1))

# NOTE(review): `i` runs up to the size of the whole test set, not up to the
# number of ORIGINAL_LABEL images, so the slice below becomes empty once `i`
# passes the end of that subset.
for i in range(0, x_test.shape[0], BATCH):
  x_samples = tf.Variable(x_test[y_test == ORIGINAL_LABEL][(i):(i+BATCH)])

  # First perturbation step; pixel values are kept inside [0, 1].
  perturbed = x_samples + EPSILON * make_noise_pattern(x_samples)
  perturbed = tf.clip_by_value(perturbed, 0, 1.)

  # EPOCHS further steps, each starting from the previous perturbed images.
  for _ in range(EPOCHS):
    perturbed += EPSILON * make_noise_pattern(perturbed)
    perturbed = tf.clip_by_value(perturbed, 0, 1.)

  # Keep a sample only if the model predicts TARGET_LABEL for the perturbed
  # image and ORIGINAL_LABEL for the unperturbed one.
  sample_succ = tf.boolean_mask(perturbed,  (TARGET_LABEL   == tf.math.argmax(model(perturbed), axis=1)) & \
                                            (ORIGINAL_LABEL == tf.math.argmax(model(x_samples), axis=1)))

  # The matching unperturbed images, selected with the same mask.
  orig_succ   = tf.boolean_mask(x_samples,  (TARGET_LABEL   == tf.math.argmax(model(perturbed), axis=1)) & \
                                            (ORIGINAL_LABEL == tf.math.argmax(model(x_samples), axis=1)))

  print (f'perturbing images:{i}~{i+BATCH}, found {len(sample_succ)} samples')

  adv_samples = tf.concat([adv_samples, sample_succ], 0)
  orig_samples = tf.concat([orig_samples, orig_succ], 0)

  # Whole batches are appended, so the final count can exceed NUM_SAMPLES.
  if (NUM_SAMPLES <= len(adv_samples)):
    break

# **Task 1**: The number below should be greater than or equal to 100

In [None]:
# Number of adversarial samples collected by the attack loop.
len(adv_samples)

# **Task 2**: The images below must still clearly look like a 5

In [None]:
# Show the collected adversarial images on a 40 x 5 grid. The attack loop can
# finish with fewer samples than the grid has cells, so only the samples that
# actually exist are plotted instead of indexing past the end of `adv_samples`.
GRID_ROWS, GRID_COLS = 40, 5
num_shown = min(GRID_ROWS * GRID_COLS, len(adv_samples))

plt.figure(figsize=(10, 80))

for i in range(num_shown):
  plt.subplot(GRID_ROWS, GRID_COLS, i + 1)
  # Images are stored in [0, 1]; scale back to 0..255 for display.
  plt.imshow((adv_samples[i]*255).numpy().astype("uint8").reshape(28, 28), cmap='gray', vmin=0, vmax=255)
  plt.axis("off")

# Part 2: Attack Detection (50 points)

This part is to detect attacks.

First, observe images or internal cell values (adversarial samples vs original images).

For example, you may observe that some neurons show different values for adversarial samples. You can then use these values to build a simple classifier.

And then, propose a method to detect them.

It does not have to be state of the art. Comparative evaluations are not needed.

In the part below, the given code provides 200 adversarial images and 200 normal images.

You should use the first 100 images of each set for observation or any kind of training (if needed), and then use the remaining 100 images to test your method.


Here, we aim for an F1 score of 80% or higher.

# **Task 3**: propose your detection method


Answer for Task 3: Show your observations below

In [None]:
# Normal (unperturbed) reference images: up to 200 test images whose
# ground-truth label is TARGET_LABEL. This rebinds `x_samples`, which the
# attack loop used for its ORIGINAL_LABEL batches.
x_samples = tf.Variable(x_test[y_test == TARGET_LABEL][:200])

In [None]:
# Data for observations, don't go out of the boundary
# (only the first 100 samples of each set may be inspected here).
# NOTE(review): a cell only displays its last expression, so the first line
# below produces no visible output.
adv_samples[:100]
x_samples[:100]

In [None]:
# You may want to use internal values of the model (optional): the outputs of
# the two hidden layers for the adversarial and the normal observation images.
d1_adv, d2_adv = model.get_internal_activations(adv_samples[:100])
d1_normal, d2_normal = model.get_internal_activations(x_samples[:100])
# The original variable names were misspelled; they are kept as aliases so
# that code written against them keeps working.
d1_noarmal, d2_noarmal = d1_normal, d2_normal

Answer for Task 3: Propose your detection method here

In [None]:
# Data for a test: samples 100-199 of each set, disjoint from the first 100
# samples that are used for observation.
# NOTE(review): slicing does not fail on short inputs, so either test set is
# smaller than 100 if fewer than 200 samples are available.
adv_samples_test = adv_samples[100:200]
normal_samples_test = x_samples[100:200]

In [None]:
# Implement your detection method

# **Task 4**: Print your precision, recall, f1 score. F1 score should be higher than 80%

In [None]:
# Replace this template with real assignments of the four confusion-matrix
# counts measured on the test split. (A bare string literal would only be
# echoed as the cell output and define nothing.)
# Presumably "adversarial" is the positive class - adjust if you define it
# the other way round.
#
# tp = ...  # adversarial samples detected as adversarial
# fn = ...  # adversarial samples that were missed
# tn = ...  # normal samples accepted as normal
# fp = ...  # normal samples flagged as adversarial

'\ntp = \nfn =\ntn = \nfp =\n'

In [None]:
def _safe_ratio(numerator, denominator):
  """Return numerator / denominator, or 0.0 when the denominator is zero."""
  return numerator / denominator if denominator else 0.0


# A detector that never flags anything (tp + fp == 0), a test set without
# positives (tp + fn == 0) or tp == 0 would otherwise raise ZeroDivisionError;
# the affected score is reported as 0.0 instead.
pr = _safe_ratio(tp, tp + fp)
rc = _safe_ratio(tp, tp + fn)
f1 = _safe_ratio(2 * pr * rc, pr + rc)

print ('precision:', pr, 'recall:', rc, 'f-1', f1)

# **Task 5**: Describe how this project differs from traditional software analysis, and share your thoughts on those differences.

Answer for Task 5: