## DP Credit Card Detection

### Library Imports

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from diffprivlib.models import LogisticRegression as dpLogisticRegression
import pandas as pd


### Dataset Setup

In [None]:
# Load the credit-card transactions CSV into a DataFrame.
# (pandas is imported again here; it is also imported in the first cell.)
import pandas as pd
# Raw-file URL of creditcard.csv; read_csv fetches it over the network.
url = "https://raw.githubusercontent.com/nsethi31/Kaggle-Data-Credit-Card-Fraud-Detection/master/creditcard.csv"
df = pd.read_csv(url)


In [None]:
# List the column labels of the loaded frame; the split cell relies on the
# 'Time' and 'Class' columns being present.
print(df.columns)

In [None]:
# Target vector: the 'Class' column as a NumPy array.
Y = df['Class'].values

# Feature matrix: every column except 'Time' and the target itself.
X = df.drop(columns=['Time', 'Class']).values

# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=123
)


### DP Logistic Regression

#### Base Model (no DP)

In [None]:
# Train a regular (non-private) logistic regression model as the baseline,
# using the estimator's default hyperparameters.
model = LogisticRegression()
model.fit(X_train, y_train)


In [None]:
# Evaluate the baseline model on the held-out test set.
# The score is scaled by 100 so it prints as a percentage.
score = model.score(X_test, y_test)
print("Test set accuracy for regular logistic regression: {:.2f}%".format(score*100))


#### Model with DP

In [None]:
# Train a differentially private logistic regression model (diffprivlib)
# with privacy budget epsilon = 1.0.
# NOTE(review): data_norm=10 is presumably the bound on each row's L2 norm
# that the privacy guarantee relies on. The features are not scaled in any
# visible cell, so confirm how rows exceeding this bound are handled.
dp_model = dpLogisticRegression(epsilon=1.0, data_norm=10)
dp_model.fit(X_train, y_train)


In [None]:
# Evaluate the differentially private model on the held-out test set.
# This overwrites `score` from the baseline evaluation.
score = dp_model.score(X_test, y_test)
print("Test set accuracy for differentially private logistic regression: {:.2f}%".format(score*100))


### DP Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from diffprivlib.models import RandomForestClassifier as dpRandomForestClassifier

# Train a regular (non-private) random forest as the baseline
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Evaluate the baseline forest on the test set
score = model.score(X_test, y_test)
print("Test set accuracy for regular RF: {:.2f}%".format(score*100))

# Train a differentially private random forest with privacy budget epsilon = 1.0.
# `data_norm` is not passed: it is a parameter of diffprivlib's linear models,
# not of its random forest, where it has no effect on training.
# NOTE(review): the DP forest appears to take per-feature `bounds` instead and
# to infer them from the data (with a privacy-leak warning) when omitted.
# Confirm against the installed diffprivlib version and supply bounds from
# domain knowledge if a strict guarantee is required.
dp_model = dpRandomForestClassifier(epsilon=1.0)
dp_model.fit(X_train, y_train)

# Evaluate the differentially private forest on the test set
score = dp_model.score(X_test, y_test)
print("Test set accuracy for differentially private RF: {:.2f}%".format(score*100))


### Examining the effect of ϵ

#### Logistic Regression

In [None]:
import numpy as np

# Privacy-budget grid for the sweep. np.arange excludes the stop value, so
# EPS_MAX itself is not part of the grid.
EPS_MIN = 0.1
EPS_MAX = 10
STEP_SIZE = 0.1

epsilons = np.arange(EPS_MIN, EPS_MAX, STEP_SIZE)

# Test accuracy (in percent) per epsilon, in the same order as `epsilons`.
scores = []

for eps in epsilons:
    # Fit a fresh differentially private logistic regression at this budget
    dp_model = dpLogisticRegression(epsilon=eps, data_norm=10)
    dp_model.fit(X_train, y_train)

    # Record its test-set accuracy as a percentage
    score = dp_model.score(X_test, y_test)
    scores.append(100.0 * score)


In [None]:
# Plot the recorded accuracy scores (y-axis) against the epsilon grid (x-axis).
import matplotlib.pyplot as plt 
plt.plot(epsilons, scores)


#### Random Forest

In [None]:
import numpy as np

# Privacy-budget grid for the sweep. np.arange excludes the stop value, so
# EPS_MAX itself is not part of the grid.
EPS_MIN = 0.1
EPS_MAX = 10
STEP_SIZE = 0.1

epsilons = np.arange(EPS_MIN, EPS_MAX, STEP_SIZE)

# Test accuracy (in percent) per epsilon, in the same order as `epsilons`.
scores = []

for eps in epsilons:
    # Fit a fresh differentially private random forest at this budget.
    # `data_norm` is not passed: it is a parameter of diffprivlib's linear
    # models, not of its random forest, where it has no effect on training.
    # NOTE(review): the DP forest appears to take per-feature `bounds`
    # instead; confirm against the installed diffprivlib version.
    dp_model = dpRandomForestClassifier(epsilon=eps)
    dp_model.fit(X_train, y_train)

    # Record its test-set accuracy as a percentage
    score = dp_model.score(X_test, y_test)
    scores.append(100.0 * score)


## Differentially Private Deep Learning

### Library Imports

In [None]:
import tensorflow as tf
import numpy as np
import tensorflow_privacy
from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy


### Dataset Setup

In [None]:
def load_and_process_MNIST_Data():
    """Load MNIST through Keras and prepare it for a convolutional network.

    Pixel arrays are converted to float32 and multiplied by 1/255, each image
    is reshaped to (28, 28, 1), and the labels are cast to int32 and one-hot
    encoded over 10 classes. A progress line is printed after every stage.

    Returns:
        Tuple ``(train_data, train_labels, test_data, test_labels)``.
    """
    # Define constants
    SCALE_FACTOR = 1/255
    NUM_CLASS = 10

    # Fetch the raw (images, labels) pairs for both splits
    (train_data, train_labels), (test_data, test_labels) = (
        tf.keras.datasets.mnist.load_data()
    )
    print("----- Loaded Train and Test Raw Data -----")

    # Convert pixels to float32 and rescale them
    train_data, test_data = (
        np.array(images, dtype=np.float32) * SCALE_FACTOR
        for images in (train_data, test_data)
    )
    print("----- Scaled Train and Test Data -----")

    # Add the trailing channel axis expected by the convolution layers
    train_data, test_data = (
        images.reshape(images.shape[0], 28, 28, 1)
        for images in (train_data, test_data)
    )
    print("----- Reshaped Train and Test Data -----")

    # Cast the labels to int32 arrays
    train_labels, test_labels = (
        np.array(labels, dtype=np.int32)
        for labels in (train_labels, test_labels)
    )
    print("----- Loaded Train and Test Labels -----")

    # One-hot encode the labels
    train_labels, test_labels = (
        tf.keras.utils.to_categorical(labels, num_classes=NUM_CLASS)
        for labels in (train_labels, test_labels)
    )
    print("----- Categorized Train and Test Labels -----")

    return train_data, train_labels, test_data, test_labels


### CNN Classification Model

In [None]:
def MNIST_CNN_Model(num_hidden=1):
    """Build the (uncompiled) Keras Sequential CNN used for MNIST.

    Layout: Conv2D(16, 8, strides=2, 'same') -> MaxPool2D(2, 1), then
    ``num_hidden`` repetitions of Conv2D(32, 4, strides=2, 'valid') ->
    MaxPool2D(2, 1), then Flatten -> Dense(32, relu) -> Dense(10).

    Args:
        num_hidden: Number of Conv2D + MaxPool2D blocks stacked after the
            input block.

    Returns:
        A ``tf.keras.Sequential`` model. The last Dense layer is given no
        activation argument.

    NOTE(review): with the 28x28 input declared here, stacking more than one
    hidden block looks like it shrinks the feature map below the pooling
    window -- confirm before calling with num_hidden > 1.
    """
    keras_layers = tf.keras.layers

    # Input block: strided convolution followed by pooling
    stack = [
        keras_layers.Conv2D(16, 8,
                            strides=2,
                            padding='same',
                            activation='relu',
                            input_shape=(28, 28, 1)),
        keras_layers.MaxPool2D(2, 1),
    ]

    # Hidden blocks: one convolution + pooling pair per requested block
    for _ in range(num_hidden):
        stack.extend([
            keras_layers.Conv2D(32, 4,
                                strides=2,
                                padding='valid',
                                activation='relu'),
            keras_layers.MaxPool2D(2, 1),
        ])

    # Classifier head: flatten, one hidden dense layer, 10-way output
    stack.extend([
        keras_layers.Flatten(),
        keras_layers.Dense(32, activation='relu'),
        keras_layers.Dense(10),
    ])

    return tf.keras.Sequential(stack)


### Model Training

In [None]:
# Load MNIST and unpack the preprocessed image and label arrays for both splits.
train_data, train_labels, test_data, test_labels = load_and_process_MNIST_Data()

In [None]:
# Hyperparameters shared by the DP optimizer, model.fit and the privacy accounting.
NUM_EPOCHS = 3          # passed as `epochs` to fit and to the privacy computation
BATCH_SIZE = 250        # passed as `batch_size` to fit and to the privacy computation
MICRO_BATCHES = 250     # passed as `num_microbatches`; equal to BATCH_SIZE here
L2_NORM_CLIP = 1.5      # passed as `l2_norm_clip` to the DP optimizer
NOISE_MULTIPLIER = 1.3  # passed as `noise_multiplier` to the optimizer and the accounting
LEARN_RATE = 0.2        # passed as `learning_rate` to the DP optimizer

# Build the CNN with its default single hidden block and print its layer summary.
model = MNIST_CNN_Model()
model.summary()


In [None]:
# Differentially private SGD optimizer, configured from the notebook's
# clipping, noise, microbatch and learning-rate constants.
optimizer = tensorflow_privacy.DPKerasSGDOptimizer(
    l2_norm_clip=L2_NORM_CLIP,
    noise_multiplier=NOISE_MULTIPLIER,
    num_microbatches=MICRO_BATCHES,
    learning_rate=LEARN_RATE,
)

# Categorical cross-entropy computed on raw logits and left unreduced
# (Reduction.NONE).
# NOTE(review): presumably the DP optimizer needs the unreduced, per-example
# loss in order to clip per microbatch -- confirm against the TF Privacy docs.
loss = tf.keras.losses.CategoricalCrossentropy(
    from_logits=True,
    reduction=tf.losses.Reduction.NONE,
)


In [None]:
# Attach the DP optimizer and the unreduced loss, tracking accuracy.
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

# Train for NUM_EPOCHS epochs in batches of BATCH_SIZE; the test split is
# used as the validation data reported after each epoch.
model.fit(
    train_data,
    train_labels,
    batch_size=BATCH_SIZE,
    epochs=NUM_EPOCHS,
    validation_data=(test_data, test_labels),
)


In [None]:
# Compute the DP-SGD privacy figures for the training configuration used in
# this notebook: training-set size, batch size, noise multiplier and epoch
# count, with delta fixed at 1e-5.
compute_dp_sgd_privacy.compute_dp_sgd_privacy(
    n=len(train_data),
    batch_size=BATCH_SIZE,
    noise_multiplier=NOISE_MULTIPLIER,
    epochs=NUM_EPOCHS,
    delta=1e-5,
)
