In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler,  MinMaxScaler

import os # accessing directory structure
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [36]:
df=pd.read_csv('C:/Users/azegl/_ZHAWaufC/HSLU/prepared_creditcard.csv')


In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 7 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   V10     284807 non-null  float64
 1   V11     284807 non-null  float64
 2   V12     284807 non-null  float64
 3   V14     284807 non-null  float64
 4   V16     284807 non-null  float64
 5   V17     284807 non-null  float64
 6   Class   284807 non-null  int64  
dtypes: float64(6), int64(1)
memory usage: 15.2 MB


In [38]:
X = df.drop(columns=["Class"])
y = df["Class"]

print(X.shape)
print(y.shape)

(284807, 6)
(284807,)


In [39]:
import numpy as np
from keras.layers import Dense, Input
from keras.models import Sequential, Model
from keras.optimizers import Adam
import keras.backend as K

# Load the dataset (fraud and non-fraud data)

fraud_data = df[df['Class'] == 1].drop('Class', axis=1).values
non_fraud_data = df[df['Class'] == 0].drop('Class', axis=1).values


In [40]:
type(fraud_data)

numpy.ndarray

In [41]:
fraud_data.shape

(492, 6)

In [42]:
fraud_data.shape[1]

6

In [43]:
type(fraud_data.shape[1])

int

In [44]:
# Calculate the number of synthetic fraud samples to generate
num_real_fraud = len(fraud_data)
num_synthetic_samples = len(non_fraud_data) - num_real_fraud
print("# of non-fraud: ", len(non_fraud_data))
print("# of Real Fraud:", num_real_fraud)
print("# of Synthetic Fraud required:", num_synthetic_samples)

# of non-fraud:  284315
# of Real Fraud: 492
# of Synthetic Fraud required: 283823


In [45]:
# Define the generator network
def build_generator(latent_dim, output_dim):
    model = Sequential()
    model.add(Dense(64, input_shape=(latent_dim,)))
    model.add(Dense(128, activation='sigmoid'))
    model.add(Dense(output_dim, activation='sigmoid'))
    return model

# Define the discriminator network
def build_discriminator(input_dim):
    model = Sequential()
    model.add(Input(shape=(input_dim,)))
    model.add(Dense(128, activation='sigmoid'))
    model.add(Dense(1, activation='sigmoid'))
    return model

# Dimensionality of the input noise for the generator
latent_dim = 32

# Build generator and discriminator models
generator = build_generator(latent_dim, fraud_data.shape[1])
discriminator = build_discriminator(fraud_data.shape[1])

# Display model summaries
print("Generator Summary:")
generator.summary()
print("\nDiscriminator Summary:")
discriminator.summary()

Generator Summary:
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 64)                2112      
                                                                 
 dense_1 (Dense)             (None, 128)               8320      
                                                                 
 dense_2 (Dense)             (None, 6)                 774       
                                                                 
Total params: 11,206
Trainable params: 11,206
Non-trainable params: 0
_________________________________________________________________

Discriminator Summary:
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 128)               896       
                                                     

In [46]:
# Compile the discriminator model
from keras.metrics import Precision, Recall
discriminator.compile(optimizer=Adam(learning_rate=0.0002, beta_1=0.5), loss='binary_crossentropy',  metrics=[Precision(), Recall()])

In [47]:
import tensorflow as tf
from tensorflow.keras import backend as K

In [48]:
def generator_loss_log_d(y_true, y_pred):
    return -tf.reduce_mean(tf.math.log(y_pred + K.epsilon()))

# GAN model combining generator and discriminator
def build_gan(generator, discriminator):
    discriminator.trainable = False
    model = Sequential()
    model.add(generator)
    model.add(discriminator)
    model.compile(optimizer=Adam(learning_rate=0.0002, beta_1=0.5), loss=generator_loss_log_d)

    return model

# Build and compile the GAN model
gan = build_gan(generator, discriminator)

In [49]:
import gc


In [50]:
# Set hyperparameters
epochs = 10000
batch_size = 16

# Training loop for the GAN
for epoch in range(epochs):
    # Train discriminator (freeze generator)
    discriminator.trainable = True
    generator.trainable = False

    # Select random real fraud samples
    real_fraud_samples = fraud_data[np.random.randint(0, num_real_fraud, batch_size)]

    # Generate fake fraud samples using the generator
    noise = np.random.normal(0, 1, size=(batch_size, latent_dim))
    fake_fraud_samples = generator.predict(noise)

    # Create labels for real and fake fraud samples
    real_labels = np.ones((batch_size, 1))
    fake_labels = np.zeros((batch_size, 1))

    # Train the discriminator on real and fake fraud samples
    d_loss_real = discriminator.train_on_batch(real_fraud_samples, real_labels)
    d_loss_fake = discriminator.train_on_batch(fake_fraud_samples, fake_labels)
    d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

    # Train generator (freeze discriminator)
    discriminator.trainable = False
    generator.trainable = True

    # Generate fake fraud samples and create labels for training the generator
    noise = np.random.normal(0, 1, size=(batch_size, latent_dim))
    valid_labels = np.ones((batch_size, 1))

    # Train the generator to generate samples that "fool" the discriminator
    g_loss = gan.train_on_batch(noise, valid_labels)

    # Print the progress
    if epoch % 100 == 0:
        print(f"Epoch: {epoch} - D Loss: {d_loss} - G Loss: {g_loss}")

    if epoch % 500 == 0 and epoch != 0:
        tf.keras.backend.clear_session()
        gc.collect()
        print("Session cleared and garbage collected.")

# After training, use the generator to create synthetic fraud data
noise = np.random.normal(0, 1, size=(num_synthetic_samples, latent_dim))
synthetic_fraud_data = generator.predict(noise)

Epoch: 0 - D Loss: [1.01352763 0.5        0.09375   ] - G Loss: 0.379738450050354
Epoch: 100 - D Loss: [0.53378084 0.5        0.5       ] - G Loss: 0.6839779615402222
Epoch: 200 - D Loss: [0.39655939 0.5        0.375     ] - G Loss: 0.996909499168396
Epoch: 300 - D Loss: [0.29545917 0.5        0.4375    ] - G Loss: 1.2538460493087769
Epoch: 400 - D Loss: [0.24309953 0.5        0.4375    ] - G Loss: 1.4711008071899414
Epoch: 500 - D Loss: [0.24385314 0.5        0.4375    ] - G Loss: 1.6181843280792236
Session cleared and garbage collected.
Epoch: 600 - D Loss: [0.32225337 0.5        0.375     ] - G Loss: 1.7006959915161133
Epoch: 700 - D Loss: [0.21934114 0.5        0.4375    ] - G Loss: 1.792607069015503
Epoch: 800 - D Loss: [0.09796719 0.5        0.5       ] - G Loss: 1.874536156654358
Epoch: 900 - D Loss: [0.20045716 0.5        0.4375    ] - G Loss: 1.9512274265289307
Epoch: 1000 - D Loss: [0.12279323 0.5        0.46875   ] - G Loss: 1.9927988052368164
Session cleared and garbage col

In [51]:
print(type(synthetic_fraud_data))
print(synthetic_fraud_data.shape)

<class 'numpy.ndarray'>
(283823, 6)


In [52]:
features = df.columns[:-1]
features

Index(['V10', 'V11', 'V12', 'V14', 'V16', 'V17'], dtype='object')

In [53]:
fake_df = pd.DataFrame(synthetic_fraud_data, columns=features.to_list())
fake_df['Class'] = 1

In [54]:
# Now, combine the synthetic DataFrame with the origianl DataFrame to create the working dataframe
wdf = pd.concat([df, fake_df], axis=0)

# Resetting the index of the new DataFrame
wdf.reset_index(drop=True, inplace=True)

In [55]:
wdf.shape

(568630, 7)

In [56]:
wdf.isnull().sum()

V10      0
V11      0
V12      0
V14      0
V16      0
V17      0
Class    0
dtype: int64

In [57]:
wdf.to_csv('C:/Users/azegl/_ZHAWaufC/HSLU/final_creditcard.csv', index=False)

In [58]:
wdf.to_pickle('C:/Users/azegl/_ZHAWaufC/HSLU/final_creditcard.pkl')