In [33]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
import tensorflow as tf
from tensorflow.keras import layers
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

RANDOM_SEED = 8

In [34]:
df = pd.read_csv('creditcard.csv')

# df.isnull().values.any()

transactions = df.drop('Class', axis=1)
labels = df['Class']

# transactions['Time'] = np.log(transactions['Time'])
# print(transactions.head())
transactions['Time'] = np.log1p(transactions['Time'])   # log1p(x) = log(1+x) is used to avoid -inf values, since the 'Time' feature can be 0 and log(0) is -inf
# (transactions['Amount'] == 0).any()
transactions['Amount'] = np.log1p(transactions['Amount'])

transactions = MinMaxScaler().fit_transform(transactions)

legitimate_transactions = transactions[labels == 0]
fraudulent_transactions = transactions[labels == 1]

X_train, X_test = train_test_split(legitimate_transactions, test_size=0.05, random_state=RANDOM_SEED)

y_train = np.zeros(X_train.shape[0])
y_test = np.zeros(X_test.shape[0])

In [35]:
class Autoencoder(Model):
  def __init__(self):
    super(Autoencoder, self).__init__()
    
    self.encoder = tf.keras.Sequential([
      layers.Dense(15, activation="relu"),
      layers.Dense(7, activation="relu")
    ])
    
    self.decoder = tf.keras.Sequential([
      layers.Dense(15, activation="relu"),
      layers.Dense(30, activation="relu")
    ])
    
  def call(self, x):
    encoded = self.encoder(x)
    decoded = self.decoder(encoded)
    return decoded
  
autoencoder = Autoencoder()

autoencoder.compile(optimizer='adam', loss='mae')

history = autoencoder.fit(X_train, X_train,
                          epochs=100,
                          batch_size=64,
                          validation_data=(X_test, X_test),
                          shuffle=True)

Epoch 1/100
[1m4221/4221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 2ms/step - loss: 0.1538 - val_loss: 0.1049
Epoch 2/100
[1m4221/4221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - loss: 0.1043 - val_loss: 0.1032
Epoch 3/100
[1m4221/4221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 988us/step - loss: 0.1031 - val_loss: 0.1025
Epoch 4/100
[1m4221/4221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 954us/step - loss: 0.1022 - val_loss: 0.1011
Epoch 5/100
[1m4221/4221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - loss: 0.1010 - val_loss: 0.1007
Epoch 6/100
[1m4221/4221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 925us/step - loss: 0.1005 - val_loss: 0.0995
Epoch 7/100
[1m4221/4221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - loss: 0.0994 - val_loss: 0.0991
Epoch 8/100
[1m4221/4221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - loss: 0.0990 - val_loss: 0.0987
Epoch 9/1

In [36]:
reconstructions_train = autoencoder.predict(X_train)
train_loss = tf.keras.losses.mae(reconstructions_train, X_train)

X_validation = np.concatenate((X_test, fraudulent_transactions), axis=0)
y_validation = np.concatenate((y_test, np.ones(fraudulent_transactions.shape[0])), axis=0)

reconstructions_validation = autoencoder.predict(X_validation)
validation_loss = tf.keras.losses.mae(reconstructions_validation, X_validation)

[1m8441/8441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 631us/step
[1m460/460[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 715us/step


In [37]:
# threshold = np.mean(train_loss)
threshold = np.mean(train_loss) + np.std(train_loss)
# threshold = np.percentile(train_loss, 90)
# threshold = np.percentile(train_loss, 95)
# threshold = np.percentile(train_loss, 99)

predictions = tf.math.greater(validation_loss, threshold)

print("True Positives = {}".format(np.sum(np.logical_and(predictions, y_validation))))
print("False Positives = {}".format(np.sum(np.logical_and(predictions, np.logical_not(y_validation)))))
print("True Negatives = {}".format(np.sum(np.logical_and(np.logical_not(predictions), np.logical_not(y_validation)))))
print("False Negatives = {}".format(np.sum(np.logical_and(np.logical_not(predictions), y_validation))))

print("Accuracy = {}".format(accuracy_score(y_validation, predictions)))
print("Precision = {}".format(precision_score(y_validation, predictions)))
print("Recall = {}".format(recall_score(y_validation, predictions)))
print("F1 Score = {}".format(f1_score(y_validation, predictions)))

True Positives = 435
False Positives = 1617
True Negatives = 12599
False Negatives = 57
Accuracy = 0.8861843894479196
Precision = 0.21198830409356725
Recall = 0.8841463414634146
F1 Score = 0.34198113207547176


In [38]:
class Sampling(layers.Layer):
  def call(self, inputs):
    mean, logvar = inputs
    eps = tf.random.normal(shape=tf.shape(mean))
    return eps * tf.exp(logvar * 0.5) + mean

class VAE(Model):
  def __init__(self):
    super(VAE, self).__init__()
    
    self.encoder = tf.keras.Sequential([
      layers.Dense(15, activation="relu"),
      layers.Dense(7, activation="relu"),
      layers.Dense(2 + 2)
    ])
    
    self.decoder = tf.keras.Sequential([
      layers.Dense(15, activation="relu"),
      layers.Dense(30, activation="relu")
    ])
    
  def encode(self, x):
    mean, logvar = tf.split(self.encoder(x), num_or_size_splits=2, axis=1)
    return mean, logvar
  
  def reparameterize(self, mean, logvar):
    return Sampling()([mean, logvar])
  
  def decode(self, z):
    decoded = self.decoder(z)
    return decoded

  def call(self, x):
    mean, logvar = self.encode(x)
    z = self.reparameterize(mean, logvar)
    decoded = self.decode(z)
    return decoded

vae = VAE()

vae.compile(optimizer='adam', loss='mae')

history = vae.fit(X_train, X_train,
                  epochs=100,
                  batch_size=64,
                  validation_data=(X_test, X_test),
                  shuffle=True)

Epoch 1/100
[1m4221/4221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 889us/step - loss: 0.0828 - val_loss: 0.0343
Epoch 2/100
[1m4221/4221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 859us/step - loss: 0.0342 - val_loss: 0.0340
Epoch 3/100
[1m4221/4221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 840us/step - loss: 0.0337 - val_loss: 0.0335
Epoch 4/100
[1m4221/4221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 792us/step - loss: 0.0335 - val_loss: 0.0336
Epoch 5/100
[1m4221/4221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 829us/step - loss: 0.0334 - val_loss: 0.0333
Epoch 6/100
[1m4221/4221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 808us/step - loss: 0.0332 - val_loss: 0.0331
Epoch 7/100
[1m4221/4221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 811us/step - loss: 0.0331 - val_loss: 0.0331
Epoch 8/100
[1m4221/4221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 826us/step - loss: 0.0329 - val_loss: 0.0326


In [39]:
reconstructions_train = vae.predict(X_train)
train_loss = tf.keras.losses.mae(reconstructions_train, X_train)

X_validation = np.concatenate((X_test, fraudulent_transactions), axis=0)
y_validation = np.concatenate((y_test, np.ones(fraudulent_transactions.shape[0])), axis=0)

reconstructions_validation = vae.predict(X_validation)
validation_loss = tf.keras.losses.mae(reconstructions_validation, X_validation)

[1m8441/8441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 653us/step


In [None]:
# threshold = np.mean(train_loss)
threshold = np.mean(train_loss) + np.std(train_loss)
# threshold = np.percentile(train_loss, 90)
# threshold = np.percentile(train_loss, 95)
# threshold = np.percentile(train_loss, 99)

predictions = tf.math.greater(validation_loss, threshold)

print("True Positives = {}".format(np.sum(np.logical_and(predictions, y_validation))))
print("False Positives = {}".format(np.sum(np.logical_and(predictions, np.logical_not(y_validation)))))
print("True Negatives = {}".format(np.sum(np.logical_and(np.logical_not(predictions), np.logical_not(y_validation)))))
print("False Negatives = {}".format(np.sum(np.logical_and(np.logical_not(predictions), y_validation))))

print("Accuracy = {}".format(accuracy_score(y_validation, predictions)))
print("Precision = {}".format(precision_score(y_validation, predictions)))
print("Recall = {}".format(recall_score(y_validation, predictions)))
print("F1 Score = {}".format(f1_score(y_validation, predictions)))

True Positives = 443
False Positives = 1872
True Negatives = 12344
False Negatives = 49
Accuracy = 0.8693908077236878
Precision = 0.1913606911447084
Recall = 0.9004065040650406
F1 Score = 0.3156394727467047
