# Chapter 14: Data Privacy

### NOTE: As of September 2020, tf-privacy relies on the updated Keras optimizer, which will be part of the TensorFlow 2.4 release.

Until a stable 2.4 version is released, this notebook requires TensorFlow's nightly builds. Because nightly builds are unstable, this notebook might fail intermittently.

In [None]:
!pip install tensorflow_privacy
!pip install tf-nightly

Collecting tf-nightly
[?25l  Downloading https://files.pythonhosted.org/packages/51/2f/410f5153862dc461c8c1d1bafc0be6e5942eafaffc1764e71ce284b4034e/tf_nightly-2.4.0.dev20200909-cp36-cp36m-manylinux2010_x86_64.whl (389.9MB)
[K     |████████████████████████████████| 389.9MB 46kB/s 
Collecting tb-nightly<3.0.0a0,>=2.4.0a0
[?25l  Downloading https://files.pythonhosted.org/packages/30/8f/8195d11bc8e6e1945fab68f85ced31f8ff60f88d856867dd310c31b34c22/tb_nightly-2.4.0a20200909-py3-none-any.whl (9.2MB)
[K     |████████████████████████████████| 9.2MB 51.0MB/s 
[?25hCollecting tf-estimator-nightly
[?25l  Downloading https://files.pythonhosted.org/packages/a2/4c/b8c1af2d1a2a8e0ca7b07287e2be948addf7e3884d022e62e37e72232dea/tf_estimator_nightly-2.4.0.dev2020090901-py2.py3-none-any.whl (460kB)
[K     |████████████████████████████████| 460kB 57.1MB/s 
[?25hCollecting flatbuffers>=1.12
  Downloading https://files.pythonhosted.org/packages/eb/26/712e578c5f14e26ae3314c39a1bdc4eb2ec2f4ddc89b708cf8e

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import pandas as pd
import numpy as np
import os

In [None]:

from pathlib import Path

# The data directory is looked up two levels above the current working directory.
DATA_FILE_RELATIVE = 'data/consumer_complaints_with_narrative.csv'

repo_dir = Path.cwd().parents[1]
data_file_path = os.path.join(repo_dir, DATA_FILE_RELATIVE)
print(data_file_path)

/content/data/reduced_consumer_complaints_with_narrative.csv


In [None]:
# Sanity check: list the contents of /content/ (absolute path).
# NOTE(review): presumably the Colab working directory -- adjust when running elsewhere.
!ls /content/

data  sample_data


## Feature engineering

In [None]:
# One-hot encoded features: feature name -> one-hot dimension.
# Every dimension starts out as None; make_one_hot() overwrites it with the
# encoded width once the data has been loaded.
ONE_HOT_FEATURES = dict.fromkeys(
    ["product", "sub_product", "company_response", "state", "issue"]
)

# Bucketized features: feature name -> bucket count.
BUCKET_FEATURES = dict(zip_code=10)

# Text features: feature name -> placeholder (the value is unused).
TEXT_FEATURES = dict.fromkeys(["consumer_complaint_narrative"])

In [None]:
# Columns to read from the complaints CSV; read_csv loads only these.
feature_names = [
    "product",
    "sub_product",
    "issue",
    "sub_issue",
    "consumer_complaint_narrative",
    "company",
    "state",
    "zip_code",
    "company_response",
    "timely_response",
    "consumer_disputed",
]
df = pd.read_csv(data_file_path, usecols=feature_names)

In [None]:
def make_one_hot(df):
    """One-hot encode every column listed in ONE_HOT_FEATURES.

    Side effect: stores each feature's one-hot width in the module-level
    ONE_HOT_FEATURES dict, which get_model() reads to size its inputs.

    Args:
        df: DataFrame whose ONE_HOT_FEATURES columns hold integer class
            indices (the caller converts them to category codes first).

    Returns:
        A list with one 2-D one-hot array per feature, in ONE_HOT_FEATURES
        key order.
    """
    one_hot_array = []
    for feature_name in ONE_HOT_FEATURES.keys():
        # Use numpy directly: the `pd.np` alias was deprecated in pandas 1.0
        # and removed in pandas 2.0.
        encoded = np.asarray(tf.keras.utils.to_categorical(df[feature_name].values))
        # Only existing keys are reassigned, so mutating during iteration is safe.
        ONE_HOT_FEATURES[feature_name] = encoded.shape[1]
        one_hot_array.append(encoded)

    return one_hot_array

In [None]:
# Replace the raw values of each one-hot feature with integer category codes,
# the input format make_one_hot() passes on to to_categorical.
for feature in ONE_HOT_FEATURES.keys():
    df[feature] = df[feature].astype("category").cat.codes

one_hot_x = make_one_hot(df)

# Use numpy directly: the `pd.np` alias was deprecated in pandas 1.0 and
# removed in pandas 2.0.
embedding_x = [np.asarray(df[feature_name].values).reshape(-1) for feature_name in TEXT_FEATURES.keys()]

# Zip-code clean-up: 'X' characters and the listed punctuation characters
# become '0', missing values become 0, then the column is cast to int32.
# NOTE: df['zip_code'] is overwritten in place, so re-running this cell on the
# already-converted (numeric) column fails at the `.str` accessor; reload df first.
df['zip_code'] = df['zip_code'].str.replace('X', '0', regex=True)
df['zip_code'] = df['zip_code'].str.replace(r'\[|\*|\+|\-|`|\.|\ |\$|\/|!|\(', '0', regex=True)
df['zip_code'] = df['zip_code'].fillna(0)
df['zip_code'] = df['zip_code'].astype('int32')
# one bucket per 10k
df['zip_code'] = df['zip_code'].apply(lambda x: x//10000)
numeric_x = [df['zip_code'].values]

# Input order must match get_model(): one-hot features, bucket features, text features.
X = one_hot_x + numeric_x + embedding_x
y = np.asarray(df["consumer_disputed"], dtype=np.uint8).reshape(-1)

  after removing the cwd from sys.path.
  


## Adding DP

In [None]:
# DP parameters

# Training schedule
EPOCHS = 1
BATCH_SIZE = 32

# Settings passed to the differentially private optimizer
LEARNING_RATE = 0.1
L2_NORM_CLIP = 1.0
NOISE_MULTIPLIER = 1.1
NUM_MICROBATCHES = 32

# Dataset size `n` used when computing epsilon
POPULATION_SIZE = 1000

In [None]:
from tensorflow_privacy.privacy.optimizers.dp_optimizer import DPGradientDescentGaussianOptimizer

# Differentially private SGD, configured with the clipping norm, noise
# multiplier, microbatch count and learning rate from the DP parameters cell.
optimizer = DPGradientDescentGaussianOptimizer(
    l2_norm_clip=L2_NORM_CLIP,
    noise_multiplier=NOISE_MULTIPLIER,
    num_microbatches=NUM_MICROBATCHES,
    learning_rate=LEARNING_RATE)

# from_logits=False: the model's output layer already applies a sigmoid, so
# the loss receives probabilities; from_logits=True would apply it a second time.
# reduction=NONE keeps one loss value per example instead of a batch mean
# (NOTE(review): the DP optimizer is expected to need the per-example loss
# for per-microbatch clipping -- see the TensorFlow Privacy docs).
loss = tf.keras.losses.BinaryCrossentropy(
        from_logits=False, reduction=tf.losses.Reduction.NONE)

The model is unchanged; we just pass in the differentially private optimizer and loss.

In [None]:
def transformed_name(key):
    """Return the name of the model input for feature `key` ('_xf' appended)."""
    suffix = '_xf'
    return key + suffix

def get_model(dp_optimizer, dp_loss, show_summary=True):
    """
    Build and compile the wide-and-deep Keras model.

    The "deep" branch embeds the text feature with the Universal Sentence
    Encoder and feeds it through three Dense layers; the "wide" branch is a
    single Dense layer over the concatenated one-hot and bucket inputs. Both
    branches are concatenated and fed to a one-unit sigmoid output.

    Args:
        dp_optimizer: optimizer passed to `compile`.
        dp_loss: loss passed to `compile`. The model outputs sigmoid
            probabilities, so a loss that expects logits does not fit.
        show_summary: if True, print the model summary.

    Returns:
        The compiled Keras model. Inputs are ordered as one-hot features,
        bucket features, then text features.

    Raises:
        ValueError: if a one-hot dimension in ONE_HOT_FEATURES is still None
            (i.e. make_one_hot() has not filled it in yet).
    """
    missing_dims = [key for key, dim in ONE_HOT_FEATURES.items() if dim is None]
    if missing_dims:
        raise ValueError(
            "One-hot dimensions are not set for {}; run make_one_hot() first.".format(missing_dims))

    # one-hot categorical features; shapes are explicit tuples -- `(dim)` is
    # just an int, which not every Keras version accepts as a shape
    input_features = []
    for key, dim in ONE_HOT_FEATURES.items():
        input_features.append(tf.keras.Input(shape=(dim,), name=transformed_name(key)))

    # adding bucketized features (one scalar per example; the bucket count is not needed here)
    for key in BUCKET_FEATURES.keys():
        input_features.append(tf.keras.Input(shape=(1,), name=transformed_name(key)))

    # adding text input features
    input_texts = []
    for key in TEXT_FEATURES.keys():
        input_texts.append(tf.keras.Input(shape=(1,), name=transformed_name(key), dtype=tf.string))

    # embed text features
    MODULE_URL = "https://tfhub.dev/google/universal-sentence-encoder/4"
    embed = hub.KerasLayer(MODULE_URL)
    # flatten the text input to 1-D before passing it to the encoder
    reshaped_narrative = tf.reshape(input_texts[0], [-1])
    embed_narrative = embed(reshaped_narrative)
    # `input_shape` is dropped: it has no effect on a layer called in the functional API
    deep_ff = tf.keras.layers.Reshape((512, ))(embed_narrative)

    deep = tf.keras.layers.Dense(256, activation='relu')(deep_ff)
    deep = tf.keras.layers.Dense(64, activation='relu')(deep)
    deep = tf.keras.layers.Dense(16, activation='relu')(deep)

    wide_ff = tf.keras.layers.concatenate(input_features)
    wide = tf.keras.layers.Dense(16, activation='relu')(wide_ff)

    both = tf.keras.layers.concatenate([deep, wide])

    # sigmoid output: the model emits probabilities, not logits
    output = tf.keras.layers.Dense(1, activation='sigmoid')(both)

    inputs = input_features + input_texts

    keras_model = tf.keras.models.Model(inputs, output)
    keras_model.compile(optimizer=dp_optimizer,
                        loss=dp_loss,
                        metrics=[
                            tf.keras.metrics.BinaryAccuracy(),
                            tf.keras.metrics.TruePositives()
                        ])
    if show_summary:
        keras_model.summary()

    return keras_model

In [None]:
# Build the model with the DP optimizer and loss; the summary printout is skipped.
model = get_model(
    dp_optimizer=optimizer,
    dp_loss=loss,
    show_summary=False,
)

In [None]:
# Train with the shared BATCH_SIZE constant (not a literal 32) so the batch size
# used here and the one fed to the epsilon computation cannot drift apart.
model.fit(x=X, y=y, batch_size=BATCH_SIZE, validation_split=0.1, epochs=EPOCHS)



<tensorflow.python.keras.callbacks.History at 0x7fbb85c9ec18>

### Calculate Epsilon

In [None]:
from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy


# Privacy accounting inputs for the training run above.
# NOTE(review): `n` presumably should equal the number of training examples
# actually used by model.fit -- confirm POPULATION_SIZE matches the dataset.
dp_accounting_kwargs = {
    "n": POPULATION_SIZE,
    "batch_size": BATCH_SIZE,
    "noise_multiplier": NOISE_MULTIPLIER,
    "epochs": EPOCHS,
    "delta": 1e-3,
}
compute_dp_sgd_privacy.compute_dp_sgd_privacy(**dp_accounting_kwargs)

DP-SGD with sampling rate = 3.2% and noise_multiplier = 1.1 iterated over 32 steps satisfies differential privacy with eps = 1.38 and delta = 0.001.
The optimal RDP order is 7.0.


(1.3845887532963042, 7.0)