In [None]:
import sys
import os

from os.path import expanduser

# Extend the import search path with two extra locations:
#   * the directory two levels above the current working directory, and
#   * the `AI-SDC` directory located under the user's home directory.
home = expanduser("~")
ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath("")))
for search_path in (ROOT_DIR, os.path.abspath(home + "/AI-SDC")):
    sys.path.append(search_path)


In [None]:
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

# Scikit-learn utils
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification, make_moons
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc

# Tensorflow imports
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout
import tensorflow_privacy as tf_privacy
from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy

# Classifiers for attack models
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

# Safe Keras
from safemodel.classifiers import SafeKerasModel

## A Quick Start Guide to implementing Safer Keras Models
### Definition of the datasets
1. We draw data points from a distribution.
2. We split these data points into the target dataset and a shadow dataset drawn from the same distribution.
3. We also draw a dataset from a different distribution.

**NOTE**: ***We make datasets with few samples but many features to force the target model to overfit.***


**NOTE**: batch_size 25 so DP optimizer would run with same hyperparams

**NOTE**: The next cell determines which dataset is used.

In [None]:
# Dataset switch read by the two data-preparation cells that follow:
# False builds the synthetic make_classification data, True builds the small
# iris-based data instead.
simple_data_for_pytests = False

In [None]:
if not simple_data_for_pytests:
    n_classes = 2

    # Generator settings shared by both synthetic datasets; only the seed
    # differs between them.
    generator_kwargs = dict(
        n_samples=1000,
        n_classes=n_classes,
        n_features=300,
        n_informative=300,
        n_redundant=0,
        n_repeated=0,
    )

    # (X, y): the original distribution, with one-hot encoded labels.
    X, y = make_classification(random_state=15, **generator_kwargs)
    y = np.eye(n_classes)[y]

    # Halve (X, y): (Xt, yt) is the target dataset owned by the TRE and
    # (Xs, ys) is a shadow dataset drawn from the same distribution.
    Xt, Xs, yt, ys = train_test_split(X, y, test_size=0.50, random_state=15)

    # (Xd, yd): a shadow dataset generated with a different seed, i.e. drawn
    # from a different distribution.
    Xd, yd = make_classification(random_state=42, **generator_kwargs)
    yd = np.eye(n_classes)[yd]

    # Members (training rows) come first and non-members (held-out rows)
    # second. shuffle=False keeps the row order of Xt, so the membership
    # vector below lines up with Xt without re-stacking the two halves.
    Xt_member, Xt_nonmember, yt_member, yt_nonmember = train_test_split(
        Xt, yt, test_size=0.5, shuffle=False
    )

    # 1 marks a member row and 0 a non-member row (kept for future tests).
    Xt_membership = np.concatenate(
        (
            np.ones(Xt_member.shape[0], np.uint8),
            np.zeros(Xt_nonmember.shape[0], np.uint8),
        )
    )

    # Names consumed by the model cells: training and validation data.
    X, y = Xt_member, yt_member
    Xval, yval = Xt_nonmember, yt_nonmember

In [None]:
if simple_data_for_pytests:
    from sklearn import datasets

    def get_data():
        """Return the iris features and labels with one extra row appended.

        Both arrays are float64. The appended row has features
        (7, 2.0, 4.5, 1) and label 4.
        """
        iris = datasets.load_iris()
        features = np.vstack(
            [np.asarray(iris.data, dtype=np.float64), (7, 2.0, 4.5, 1)]
        )
        labels = np.append(np.asarray(iris.target, dtype=np.float64), 4)
        return features, labels

    xall, yall = get_data()
    n_classes = 4
    X, Xval, y, yval = train_test_split(
        xall, yall, test_size=0.2, shuffle=True, random_state=12345
    )

    # One-hot encode the training and validation labels.
    # NOTE(review): the appended label 4 is not below n_classes (4), and the
    # labels are float64 when handed to tf.one_hot -- confirm both are intended.
    y, yval = (tf.one_hot(split_labels, n_classes) for split_labels in (y, yval))

## Define the target model architecture

*Again, we use a rather big model (for the classification task) to favour overfitting.*

In [None]:
# Define target model
# Tensorflow model (MLP) (making it big to make it overfit)

# Make results repeatable: seed TensorFlow's global RNG and give every layer
# its own explicitly seeded initializer.
#
# The kernels must not start at zero. With all-zero kernels and Keras' default
# zero biases every ReLU unit outputs 0 and receives a zero gradient, so only
# the output-layer bias can change during training and the network cannot
# overfit, which is the purpose of this model.
SEED = 12345
tf.random.set_seed(SEED)

input_data = Input(shape=X[0].shape)
x = Dense(
    128,
    activation="relu",
    kernel_initializer=tf.keras.initializers.GlorotUniform(seed=SEED),
)(input_data)
x = Dense(
    128,
    activation="relu",
    kernel_initializer=tf.keras.initializers.GlorotUniform(seed=SEED + 1),
)(x)
x = Dense(
    64,
    activation="relu",
    kernel_initializer=tf.keras.initializers.GlorotUniform(seed=SEED + 2),
)(x)
output = Dense(
    n_classes,
    activation="softmax",
    kernel_initializer=tf.keras.initializers.GlorotUniform(seed=SEED + 3),
)(x)

### Define the SafeModel

In [None]:
# Build the SafeKerasModel from the input and output tensors of the network,
# passing the number of training rows and an epoch count alongside its name.
safeModel = SafeKerasModel(
    name="safekeras-test",
    inputs=input_data,
    outputs=output,
    epochs=10,
    num_samples=X.shape[0],
)

### Set loss and compile

In [None]:
# Categorical cross-entropy on probabilities (the output layer already applies
# softmax, hence from_logits=False). Reduction.NONE leaves the loss unreduced.
loss = tf.keras.losses.CategoricalCrossentropy(
    reduction=tf.losses.Reduction.NONE,
    from_logits=False,
)

# No optimizer is supplied here; optimizer=None is passed through to compile().
safeModel.compile(optimizer=None, loss=loss)

### Fit the model

In [None]:
# Training hyper-parameters for this run.
# NOTE(review): the introduction mentions a batch size of 25 and the model was
# constructed with epochs=10, whereas this cell uses 1 and 20 -- confirm which
# values are intended.
epochs = 20
batch_size = 1

r_DP = safeModel.fit(
    X,
    y,
    validation_data=(Xval, yval),
    epochs=epochs,
    batch_size=batch_size,
)

# fit() may hand back None instead of a training history; test identity with
# `is None` rather than equality.
if r_DP is None:
    print("You have chosen to exit. Reset relevant parameter values then re-run fit().")
else:
    # Training vs. validation accuracy per epoch.
    fig, ax = plt.subplots()
    ax.plot(r_DP.history["accuracy"], label="accuracy")
    ax.plot(r_DP.history["val_accuracy"], label="validation accuracy")
    ax.set_xlabel("epoch")
    ax.set_ylabel("accuracy")
    ax.legend()
    plt.show()

    # Use dedicated names so the `loss` object created in the compile cell is
    # not overwritten by a number.
    train_loss, train_acc = safeModel.evaluate(X, y)
    print(f"training loss {train_loss} accuracy {train_acc}")

### Compute privacy and check if requirements for Differential Privacy are met

In [None]:
# Inputs for the privacy calculation: the number of training rows, the batch
# size currently stored on the model, and the epoch count.
epochs = 20
num_samples = X.shape[0]
batch_size = safeModel.batch_size

dp_met, privacy = safeModel.dp_epsilon_met(
    num_examples=num_samples,
    batch_size=batch_size,
    epochs=epochs,
)

print(f"with these settings privacy = {privacy}")

In [None]:
# Call check_epsilon with the training-set size and the batch size / epochs
# stored on the model, then report the returned flag and message.
dp_met, msg = safeModel.check_epsilon(
    X.shape[0],
    safeModel.batch_size,
    safeModel.epochs,
)
print(f'Satisfies DP: {dp_met}', f'{msg}', sep="\n")

### Check model and request release

In [None]:
# Save the model, run the preliminary check, then request release of the same
# saved file.
model_path = "safe1.h5"
safeModel.save(model_path)
safeModel.preliminary_check()
safeModel.request_release(filename=model_path)

### Examine Checkfile


In [None]:
# Shell command: print a header line, then dump every JSON file in the working
# directory whose name matches the glob *_check*ile.json.
!echo "contents of checkfile are"; cat *_check*ile.json