##### Copyright 2019 The TensorFlow Authors.




In [0]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/tensorflow/privacy/blob/master/tutorials/Classification_Privacy.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/tensorflow/privacy/blob/master/tutorials/Classification_Privacy.ipynb"><img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View source on GitHub</a>
  </td>
</table>

In [0]:
# Mount Google Drive at /content/drive; later cells list and write files
# under its 'My Drive/cs839004/' folder.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 1.x
except Exception:
  pass

import tensorflow as tf

import numpy as np

tf.compat.v1.logging.set_verbosity(tf.logging.ERROR)

In [0]:
# Install TensorFlow Privacy into the notebook runtime.
# NOTE(review): the package version is not pinned while an earlier cell
# requests TensorFlow 1.x — confirm the installed release is compatible.
!pip install tensorflow_privacy

# compute_dp_sgd_privacy is used below to compute epsilon for each DP run;
# DPGradientDescentGaussianOptimizer is the optimizer used for DP training.
from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy
from tensorflow_privacy.privacy.optimizers.dp_optimizer import DPGradientDescentGaussianOptimizer

In [0]:
# Load MNIST and unpack both splits into image and label arrays.
train, test = tf.keras.datasets.mnist.load_data()
(train_data, train_labels), (test_data, test_labels) = train, test

# Scale pixel values to [0, 1] as float32, then add a trailing channel axis
# so each image has shape (28, 28, 1) as expected by the Conv2D input layer.
train_data = np.asarray(train_data, dtype=np.float32) / 255
test_data = np.asarray(test_data, dtype=np.float32) / 255
train_data = train_data.reshape(len(train_data), 28, 28, 1)
test_data = test_data.reshape(len(test_data), 28, 28, 1)

# Convert integer class ids into one-hot rows with 10 columns.
train_labels = tf.keras.utils.to_categorical(
    np.asarray(train_labels, dtype=np.int32), num_classes=10)
test_labels = tf.keras.utils.to_categorical(
    np.asarray(test_labels, dtype=np.int32), num_classes=10)

# Sanity-check the scaling: both splits must span exactly [0, 1].
assert (train_data.min(), train_data.max()) == (0., 1.)
assert (test_data.min(), test_data.max()) == (0., 1.)

In [0]:
# Number of training epochs and minibatch size, shared by the non-private
# and the DP training loops below.
epochs = 15
batch_size = 250

In [0]:
# Values swept over in the DP training loop: every clipping norm is combined
# with every noise multiplier.
l2_norm_clips = [0.5, 1, 2, 4]
noise_multipliers = [0.5, 1, 2, 4]

# Fixed settings passed to the DP optimizer.
num_microbatches = 250
learning_rate = 0.25

# Each minibatch has to split evenly into microbatches.
if batch_size % num_microbatches:
  raise ValueError('Batch size should be an integer multiple of the number of microbatches')

# One row per experiment; entry i is the fraction of training samples with
# label i that is kept. Only the entry for label 8 varies between rows.
truncate_proportions_list = [
    [1] * 8 + [label_8_proportion, 1]
    for label_8_proportion in (1, 0.8, 0.6, 0.4, 0.2)
]

In [0]:
import os

In [0]:
# Train one non-private baseline model per truncation setting, export it in
# the text format expected by ERAN, and report per-label test accuracy.
#
# The same absolute directory is used both for counting finished runs and for
# writing results, so the resume logic does not depend on the working
# directory of the kernel.
non_dp_dir = '/content/drive/My Drive/cs839004/non_dp/'

# Every finished run leaves one file behind; skip that many configurations so
# an interrupted notebook resumes where it stopped.
start_at = len(os.listdir(non_dp_dir))

for truncate_proportions in truncate_proportions_list[start_at:]:
  num_labels = len(truncate_proportions)
  # The proportion kept for label 8 is used to name the run.
  run_tag = truncate_proportions[8]

  # For each label, shuffle the positions of its training samples and keep
  # only the requested fraction. np.flatnonzero always yields a 1-D array,
  # even when a label has a single sample.
  kept_per_label = []
  for label, proportion in enumerate(truncate_proportions):
    label_indices = np.random.permutation(np.flatnonzero(train_labels[:, label]))
    kept_per_label.append(label_indices[:int(len(label_indices) * proportion)])

  # Mix the labels together again.
  train_indices = np.random.permutation(np.concatenate(kept_per_label))

  # Truncate to integer multiple of batch size (due to https://github.com/tensorflow/privacy/issues/40)
  train_indices = train_indices[:(len(train_indices) // batch_size * batch_size)]

  # Filter the train_data and train_labels by those indices
  print(train_data.shape)
  truncated_train_data = train_data[train_indices]
  print(truncated_train_data.shape)
  truncated_train_labels = train_labels[train_indices]

  for label in range(num_labels):
    print('Number of training samples for label ' + str(label) + ': ' +
          str(np.count_nonzero(truncated_train_labels[:, label])))

  # Split test_data into a list of test data with the same label
  test_data_label = [
      test_data[np.flatnonzero(test_labels[:, label])]
      for label in range(num_labels)
  ]

  print('truncate_proportion_index: ' + str(run_tag))

  model = tf.keras.Sequential([
      tf.keras.layers.Conv2D(32, 3,
                            padding='same',
                            activation='relu',
                            input_shape=(28, 28, 1)),
      tf.keras.layers.MaxPool2D(2, 1),
      tf.keras.layers.Conv2D(64, 3,
                            padding='valid',
                            activation='relu'),
      tf.keras.layers.MaxPool2D(2, 1),
      tf.keras.layers.Flatten(),
      tf.keras.layers.Dense(64, activation='relu'),
      tf.keras.layers.Dense(10, activation='softmax')
  ])

  # The last layer already applies softmax, so the loss must treat the model
  # output as probabilities; from_logits=True would apply softmax twice.
  loss = tf.keras.losses.CategoricalCrossentropy(
      from_logits=False, reduction=tf.losses.Reduction.NONE)

  model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])

  model.fit(truncated_train_data, truncated_train_labels,
            epochs=epochs,
            validation_data=(test_data, test_labels),
            batch_size=batch_size)

  # Create the .tf file manually as expected by ERAN

  with open(os.path.join(non_dp_dir, f'mnist_{run_tag}.tf'), 'w') as f:
    for layer in model.layers:
      if isinstance(layer, tf.keras.layers.Conv2D):
        f.write('Conv2D\n')
        activation = tf.keras.activations.serialize(layer.activation)
        if activation == 'relu':
          f.write('ReLU')
        # Only ReLU is handled; any other activation writes no name here.
        f.write(', ')
        f.write('filters={0}, '.format(layer.filters))
        f.write('stride={0}, '.format(list(layer.strides)))
        f.write('kernel_size={0}, '.format(list(layer.kernel_size)))
        f.write('input_shape={0}, '.format([i for i in layer.input_shape if i is not None]))
        f.write('padding={0}\n'.format(1 if layer.padding == 'same' else 0))
        # One line per weight array returned by get_weights().
        for w in layer.get_weights():
          f.write(str(w.tolist()))
          f.write('\n')
      elif isinstance(layer, tf.keras.layers.MaxPool2D):
        f.write('MaxPooling2D\n')
        f.write('pool_size={0}, '.format(list(layer.pool_size)))
        f.write('stride={0}, '.format(list(layer.strides)))
        f.write('input_shape={0}\n'.format([i for i in layer.input_shape if i is not None]))
      elif isinstance(layer, tf.keras.layers.Flatten):
        pass # Flattened by Dense layer. See https://github.com/eth-sri/eran/blob/master/tf_verify/read_net_file.py#L117
      elif isinstance(layer, tf.keras.layers.Dense):
        activation = tf.keras.activations.serialize(layer.activation)
        if activation == 'relu':
          f.write('ReLU')
        elif activation == 'softmax':
          f.write('Affine') # TODO not strictly true
        f.write('\n')
        # Weight arrays are written transposed.
        for w in layer.get_weights():
          f.write(str(w.transpose().tolist()))
          f.write('\n')

  # Evaluate on each label separately; results[1] is the accuracy metric.
  for label, data in enumerate(test_data_label):
    results = model.evaluate(data, np.broadcast_to(tf.keras.utils.to_categorical(label, num_classes=10), (len(data), 10)), batch_size=1, verbose=0)
    print('Accuracy for label ' + str(label) + ': ' + str(results[1]) + '. Number of test samples: ' + str(len(data)))

In [0]:
# Train one DP-SGD model for every combination of truncation setting,
# clipping norm and noise multiplier, export it in the text format expected
# by ERAN, and report per-label test accuracy.
#
# The same absolute directory is used both for counting finished runs and for
# writing results, so the resume logic does not depend on the working
# directory of the kernel.
dp_dir = '/content/drive/My Drive/cs839004/dp/'

# Every finished run leaves one file behind; skip that many configurations so
# an interrupted notebook resumes where it stopped.
start_at = len(os.listdir(dp_dir))

# All configurations in a fixed order: truncation setting outermost, noise
# multiplier innermost.
run_configs = [
    (truncate_proportions, l2_norm_clip, noise_multiplier)
    for truncate_proportions in truncate_proportions_list
    for l2_norm_clip in l2_norm_clips
    for noise_multiplier in noise_multipliers
]

for truncate_proportions, l2_norm_clip, noise_multiplier in run_configs[start_at:]:
  num_labels = len(truncate_proportions)
  # The proportion kept for label 8 is used to name the run.
  run_tag = truncate_proportions[8]

  # For each label, shuffle the positions of its training samples and keep
  # only the requested fraction. np.flatnonzero always yields a 1-D array,
  # even when a label has a single sample.
  kept_per_label = []
  for label, proportion in enumerate(truncate_proportions):
    label_indices = np.random.permutation(np.flatnonzero(train_labels[:, label]))
    kept_per_label.append(label_indices[:int(len(label_indices) * proportion)])

  # Mix the labels together again.
  train_indices = np.random.permutation(np.concatenate(kept_per_label))

  # Truncate to integer multiple of batch size (due to https://github.com/tensorflow/privacy/issues/40)
  train_indices = train_indices[:(len(train_indices) // batch_size * batch_size)]

  # Filter the train_data and train_labels by those indices
  print(train_data.shape)
  truncated_train_data = train_data[train_indices]
  print(truncated_train_data.shape)
  truncated_train_labels = train_labels[train_indices]

  for label in range(num_labels):
    print('Number of training samples for label ' + str(label) + ': ' +
          str(np.count_nonzero(truncated_train_labels[:, label])))

  # Split test_data into a list of test data with the same label
  test_data_label = [
      test_data[np.flatnonzero(test_labels[:, label])]
      for label in range(num_labels)
  ]

  print('l2_norm_clip: ' + str(l2_norm_clip))
  print('noise_multiplier: ' + str(noise_multiplier))
  print('truncate_proportion_index: ' + str(run_tag))

  model = tf.keras.Sequential([
      tf.keras.layers.Conv2D(32, 3,
                            padding='same',
                            activation='relu',
                            input_shape=(28, 28, 1)),
      tf.keras.layers.MaxPool2D(2, 1),
      tf.keras.layers.Conv2D(64, 3,
                            padding='valid',
                            activation='relu'),
      tf.keras.layers.MaxPool2D(2, 1),
      tf.keras.layers.Flatten(),
      tf.keras.layers.Dense(64, activation='relu'),
      tf.keras.layers.Dense(10, activation='softmax')
  ])

  optimizer = DPGradientDescentGaussianOptimizer(
      l2_norm_clip=l2_norm_clip,
      noise_multiplier=noise_multiplier,
      num_microbatches=num_microbatches,
      learning_rate=learning_rate)

  # The last layer already applies softmax, so the loss must treat the model
  # output as probabilities; from_logits=True would apply softmax twice.
  # Reduction.NONE keeps the loss as a per-example vector.
  loss = tf.keras.losses.CategoricalCrossentropy(
      from_logits=False, reduction=tf.losses.Reduction.NONE)

  model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

  model.fit(truncated_train_data, truncated_train_labels,
            epochs=epochs,
            validation_data=(test_data, test_labels),
            batch_size=batch_size)

  # The privacy accounting is given the minibatch size actually used by
  # fit(), not the number of microbatches.
  epsilon, _ = compute_dp_sgd_privacy.compute_dp_sgd_privacy(
      n=len(truncated_train_data),
      batch_size=batch_size,
      noise_multiplier=noise_multiplier,
      epochs=epochs,
      delta=1e-5)

  # Create the .tf file manually as expected by ERAN

  with open(os.path.join(dp_dir, f'mnist_{run_tag}_{l2_norm_clip}_{noise_multiplier}_{epsilon}.tf'), 'w') as f:
    for layer in model.layers:
      if isinstance(layer, tf.keras.layers.Conv2D):
        f.write('Conv2D\n')
        activation = tf.keras.activations.serialize(layer.activation)
        if activation == 'relu':
          f.write('ReLU')
        # Only ReLU is handled; any other activation writes no name here.
        f.write(', ')
        f.write('filters={0}, '.format(layer.filters))
        f.write('stride={0}, '.format(list(layer.strides)))
        f.write('kernel_size={0}, '.format(list(layer.kernel_size)))
        f.write('input_shape={0}, '.format([i for i in layer.input_shape if i is not None]))
        f.write('padding={0}\n'.format(1 if layer.padding == 'same' else 0))
        # One line per weight array returned by get_weights().
        for w in layer.get_weights():
          f.write(str(w.tolist()))
          f.write('\n')
      elif isinstance(layer, tf.keras.layers.MaxPool2D):
        f.write('MaxPooling2D\n')
        f.write('pool_size={0}, '.format(list(layer.pool_size)))
        f.write('stride={0}, '.format(list(layer.strides)))
        f.write('input_shape={0}\n'.format([i for i in layer.input_shape if i is not None]))
      elif isinstance(layer, tf.keras.layers.Flatten):
        pass # Flattened by Dense layer. See https://github.com/eth-sri/eran/blob/master/tf_verify/read_net_file.py#L117
      elif isinstance(layer, tf.keras.layers.Dense):
        activation = tf.keras.activations.serialize(layer.activation)
        if activation == 'relu':
          f.write('ReLU')
        elif activation == 'softmax':
          f.write('Affine') # TODO not strictly true
        f.write('\n')
        # Weight arrays are written transposed.
        for w in layer.get_weights():
          f.write(str(w.transpose().tolist()))
          f.write('\n')

  # Evaluate on each label separately; results[1] is the accuracy metric.
  for label, data in enumerate(test_data_label):
    results = model.evaluate(data, np.broadcast_to(tf.keras.utils.to_categorical(label, num_classes=10), (len(data), 10)), batch_size=1, verbose=0)
    print('Accuracy for label ' + str(label) + ': ' + str(results[1]) + '. Number of test samples: ' + str(len(data)))