In [2]:
# loading needed methods
import warnings
warnings.filterwarnings("ignore")


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.cm as cm

from scipy.fft import fft,dst
from random import seed,sample
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score, roc_curve, auc,\
precision_score
import tensorflow.keras as keras
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler



In [3]:
dataset = pd.read_csv('../input/Fraud/backup.csv') # unchanged dataset1

In [4]:
dataset = dataset.drop('Unnamed: 0', axis = 1)
dataset = pd.get_dummies(dataset,prefix=['type'])

In [5]:
len(dataset[dataset['isFraud']==1])

8213

In [6]:
X = dataset.drop("isFraud",1)
y = dataset.isFraud

In [7]:
X

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,errorBalanceOrg,errorBalanceDest,HourOfDay,type_CASH_OUT,type_TRANSFER
0,1,181.00,181.00,0.0,0.00,0.00,0.00,1.810000e+02,1,0,1
1,1,181.00,181.00,0.0,21182.00,0.00,0.00,2.136300e+04,1,1,0
2,1,229133.94,15325.00,0.0,5083.00,51513.44,213808.94,1.827035e+05,1,1,0
3,1,215310.30,705.00,0.0,22425.00,0.00,214605.30,2.377353e+05,1,0,1
4,1,311685.89,10835.00,0.0,6267.00,2719172.89,300850.89,-2.401220e+06,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...
2770404,743,339682.13,339682.13,0.0,0.00,339682.13,0.00,0.000000e+00,23,1,0
2770405,743,6311409.28,6311409.28,0.0,0.00,0.00,0.00,6.311409e+06,23,0,1
2770406,743,6311409.28,6311409.28,0.0,68488.84,6379898.11,0.00,1.000000e-02,23,1,0
2770407,743,850002.52,850002.52,0.0,0.00,0.00,0.00,8.500025e+05,23,0,1


In [8]:
yf = dst(X, type=2, norm='ortho')

In [9]:
# rad = yf@yf.T

In [10]:
yf=pd.DataFrame(np.abs(yf))
yf['isFraud']=y

In [11]:
# yf.to_csv('Xdst.csv', encoding='utf-8', index = 'false')

In [12]:
def preprocess_data(df):
    class0_df = df[df['isFraud'] == 0] ## majority class, will be used for autoencoder training
    class1_df = df[df['isFraud'] == 1]
    ##
    class0_arr = np.array(class0_df.drop('isFraud',axis = 1)) 
    class1_arr = np.array(class1_df.drop('isFraud',axis = 1))
    X = class0_arr
    X_train, X_test = train_test_split(X, test_size=0.3)
    X_train, X_val = train_test_split(X_train, test_size=0.1)
    
    print("====><>>>>>")
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    X_val = scaler.transform(X_val)
    return X_train, X_val, X_test, class1_arr

In [13]:
X_train, X_val, X_test, class1_arr = preprocess_data(yf)
n_features = X_train.shape[1]

====><>>>>>


In [14]:
n_features

11

In [15]:
print("Train Size: {}".format(len(X_train)))
print("Val Size: {}".format(len(X_val)))
print("Test Size: {}".format(len(X_test)))
print("No of Features: {}".format(n_features))

Train Size: 1740183
Val Size: 193354
Test Size: 828659
No of Features: 11


In [16]:
len(class1_arr)

8213

In [17]:
# model archticeture

def build_model():
    model = keras.Sequential([
        keras.Input(shape=(n_features,)),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.BatchNormalization(),
        
        keras.layers.Dense(32, activation='relu'),
        keras.layers.BatchNormalization(),
        
        keras.layers.Dense(16, activation='relu'),
        keras.layers.BatchNormalization(),
        
        keras.layers.Dense(32, activation='relu'),
        keras.layers.BatchNormalization(),
        
        keras.layers.Dense(64, activation='relu'),
        keras.layers.BatchNormalization(),
        
        keras.layers.Dense(n_features)
    ])
    
    return model

In [18]:
# model building
model = build_model()

In [19]:
# model compile
model.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-3),
              loss='mse', metrics=['accuracy'])

In [20]:
# callbacks defined

# learning rate schedule
def step_decay(epoch):
    initial_lrate = 0.001
    drop = 0.5
    epochs_drop = 5
    lrate = initial_lrate * (drop**((1 + epoch)/epochs_drop))
    return lrate

lrate_scheduler = LearningRateScheduler(step_decay)
early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
model_chkpoint = ModelCheckpoint('best_model_dst.h5', monitor='val_loss', mode='min', verbose=1, save_best_only=True)

# model fitting
model.fit(X_train, X_train, batch_size=512, epochs=10, validation_data=(X_val, X_val), callbacks=[early_stop, model_chkpoint, lrate_scheduler])

Train on 1740183 samples, validate on 193354 samples
Epoch 1/10
Epoch 00001: val_loss improved from inf to 0.02925, saving model to best_model_dst.h5
Epoch 2/10
Epoch 00002: val_loss improved from 0.02925 to 0.02657, saving model to best_model_dst.h5
Epoch 3/10
Epoch 00003: val_loss improved from 0.02657 to 0.02656, saving model to best_model_dst.h5
Epoch 4/10
Epoch 00004: val_loss improved from 0.02656 to 0.01322, saving model to best_model_dst.h5
Epoch 5/10
Epoch 00005: val_loss improved from 0.01322 to 0.01017, saving model to best_model_dst.h5
Epoch 6/10
Epoch 00006: val_loss did not improve from 0.01017
Epoch 7/10
Epoch 00007: val_loss did not improve from 0.01017
Epoch 8/10
Epoch 00008: val_loss did not improve from 0.01017
Epoch 9/10
Epoch 00009: val_loss did not improve from 0.01017
Epoch 10/10
Epoch 00010: val_loss did not improve from 0.01017
Epoch 00010: early stopping


<tensorflow.python.keras.callbacks.History at 0x7f5c163f1ed0>

In [21]:
def rel(y_test,y_pred):
    predictionsRFR=np.array(y_pred,dtype='int') 
    CM_RFR = confusion_matrix(y_test,predictionsRFR)
    CR_RFR = classification_report(y_test,predictionsRFR)
    fprRFR, recallRFR, thresholdsRFR = roc_curve(y_test, predictionsRFR)
    AUC_RFR = auc(fprRFR, recallRFR)
    print("=============<>==================\n")
    resultsRFR = {"Confusion Matrix":CM_RFR,"Classification Report":CR_RFR,"Area Under Curve":AUC_RFR}
    for measure in resultsRFR:
        print(measure,": \n",resultsRFR[measure])
    print("=============<>==================\n")

In [22]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, average_precision_score
from sklearn.metrics import precision_recall_curve, roc_curve


def reconstruction_error(actual, pred):
    return np.mean((actual - pred)**2, axis=1)

def evaluate(model, X, y):
    X = X.reshape(-1, n_features)
    out = reconstruction_error(X, model.predict(X))
    print("AUC score: {}".format(roc_auc_score(y, out)))
    print("PR score: {}".format(average_precision_score(y, out)))
    print("\n\n")
    for th in [0.5, 0.6, 0.7, 0.8, 0.9, 1., 1.1 , 1.2, 1.3, 1.4, 1.5, 1.75, 2, 2.5, 2.75, 3, 3.25, 3.5, 3.75, 4]:
        out_th = [1 if (o > th) else 0 for o in out]
        print("TH - {}".format(th))
        print("Precision: {}".format(precision_score(y, out_th)))
        print("Recall: {}".format(recall_score(y, out_th)))
        print("F1score: {}".format(f1_score(y, out_th)))
        rel(y,out_th)
        print("\n\n")

In [23]:
final_X_test = np.concatenate((X_test, class1_arr))
final_Y_test = np.concatenate(([0]*len(X_test), [1]*len(class1_arr)))

In [24]:
# Metrics
evaluate(model, final_X_test, final_Y_test)

AUC score: 1.0
PR score: 1.0



TH - 0.5
Precision: 0.7781883646010991
Recall: 1.0
F1score: 0.8752597644802046

Confusion Matrix : 
 [[826318   2341]
 [     0   8213]]
Classification Report : 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    828659
           1       0.78      1.00      0.88      8213

    accuracy                           1.00    836872
   macro avg       0.89      1.00      0.94    836872
weighted avg       1.00      1.00      1.00    836872

Area Under Curve : 
 0.9985874768752889




TH - 0.6
Precision: 0.8324548956010541
Recall: 1.0
F1score: 0.9085679517672438

Confusion Matrix : 
 [[827006   1653]
 [     0   8213]]
Classification Report : 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    828659
           1       0.83      1.00      0.91      8213

    accuracy                           1.00    836872
   macro avg       0.92      1.00      0.95    836872
w