In [13]:
#Fraud Detection Using Variational Autoencoder
#
# TEAM MEMBERS
# Deepesh Kumar Sahu cs23mtech11006
# Naveen Nayak cs23mtech11011
# Pramod Hembrom cs23mtech11015
# Sameer Atram cs23mtech11017


import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Lambda
from tensorflow.keras.models import Model
from tensorflow.keras import layers, regularizers
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from google.colab import drive
import matplotlib.pyplot as plt

In [14]:
# Define sampling function
def sampling(args):
    z_mean, z_log_var = args
    batch = tf.shape(z_mean)[0]
    dim = tf.shape(z_mean)[1]
    epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
    return z_mean + tf.exp(0.5 * z_log_var) * epsilon

class VariationalAutoencoder(tf.keras.Model):
    def __init__(self, input_dim, latent_dim):
        super(VariationalAutoencoder, self).__init__()
        self.encoder = tf.keras.Sequential([
            layers.Dense(20, activation='relu', activity_regularizer=regularizers.l1(10e-5)),
            layers.Dense(10, activation='relu')
        ])
        self.z_mean = layers.Dense(latent_dim, name="mean")
        self.z_log_var = layers.Dense(latent_dim, name="log-variance")
        self.decoder = tf.keras.Sequential([
            layers.Dense(10, activation='relu'),
            layers.Dense(20, activation='relu'),
            layers.Dense(input_dim, activation='sigmoid')
        ])

    def call(self, x):
        h = self.encoder(x)
        z_mean = self.z_mean(h)
        z_log_var = self.z_log_var(h)
        z = sampling([z_mean, z_log_var])
        decoded = self.decoder(z)
        return decoded

In [15]:
drive.mount('/content/drive')
file_path = "/content/drive/MyDrive/creditcard.csv"
data = pd.read_csv(file_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
# Prepare data
data["LogAmount"] = np.log1p(data["Amount"]+1)
data["LogTime"] = np.log1p(data["Time"]+1)
data.drop(columns=['Time', 'Amount'], inplace=True)
x = data.drop(["Class"], axis=1)
y = data["Class"].values
x_scale = MinMaxScaler().fit_transform(x.values)
x_norm = x_scale[y == 0]
x_fraud = x_scale[y == 1]

# Split fraud data into train and test sets
X_train_fraud, X_test_fraud = train_test_split(x_fraud, test_size=0.5, random_state=42)

In [17]:
# Define VAE architecture
latent_dim = 2  # Set the latent dimension
input_dim = x.shape[1]
vae = VariationalAutoencoder(input_dim, latent_dim)

In [18]:
# Compile VAE model
vae.compile(optimizer='adam', loss='mse')

In [19]:
# Train the VAE
history = vae.fit(x_norm, x_norm,
                   batch_size=256, epochs=5,
                   shuffle=True, validation_split=0.20)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [20]:
reconstructed_data = vae.predict(X_train_fraud)
reconstruction_error = np.mean(np.square(X_train_fraud - reconstructed_data), axis=1)
threshold = 0.001



In [21]:
# Classify instances as normal or fraud based on the threshold
predictions = np.where(reconstruction_error > threshold, 1, 0)

In [22]:
true_labels = np.ones(len(X_test_fraud))
# Evaluate the predictions
accuracy = accuracy_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions)

In [23]:
print("F1 Score:", f1)
print("Number of fraud predictions:", np.sum(predictions == 1))
print("Number of non-fraud predictions:", np.sum(predictions == 0))

vae.summary()

F1 Score: 0.9834710743801653
Number of fraud predictions: 238
Number of non-fraud predictions: 8
Model: "variational_autoencoder_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 sequential_2 (Sequential)   (None, 10)                830       
                                                                 
 mean (Dense)                multiple                  22        
                                                                 
 log-variance (Dense)        multiple                  22        
                                                                 
 sequential_3 (Sequential)   (None, 30)                880       
                                                                 
Total params: 1754 (6.85 KB)
Trainable params: 1754 (6.85 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
