In [75]:
import numpy as np
import pandas as pd
import shap
import tensorflow as tf
import keras
import lime

import keras
import tensorflow
from keras import layers
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Flatten, Embedding, SimpleRNN , concatenate, Lambda, Conv1D, MaxPooling1D
from keras.layers import LeakyReLU
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.optimizers import Adam, SGD, RMSprop, Adadelta, Adagrad, Adamax, Nadam, Ftrl
from keras import metrics
from keras.losses import BinaryCrossentropy
from keras.wrappers.scikit_learn import KerasClassifier


from imblearn.over_sampling import RandomOverSampler, ADASYN
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_score, recall_score, accuracy_score
from sklearn import linear_model, model_selection

## Loading Models

In [76]:
X_train = pd.read_csv('../datasets/word2vec/train_data_imputed_FINAL.csv')
X_test = pd.read_csv('../datasets/word2vec/test_data_imputed_FINAL.csv')
y_train = pd.read_csv('../datasets/word2vec/y_train_FINAL.csv')
y_test = pd.read_csv('../datasets/word2vec/y_test_FINAL.csv')

In [77]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(11516, 263)
(11516, 2)
(2880, 263)
(2880, 2)


In [78]:
y_train = y_train.drop(columns=['Unnamed: 0'])
y_test = y_test.drop(columns=['Unnamed: 0'])

## Neural Networks

### Base Model

In [79]:
### Model building
model = Sequential()
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy',metrics.Precision(), metrics.Recall(),metrics.AUC()])

### Random Oversampler

In [80]:
ros = RandomOverSampler(sampling_strategy = 1, random_state=42)
X_res, y_res = ros.fit_resample(X_train, y_train)

In [81]:
y_res.value_counts()

fraudulent
0             11293
1             11293
dtype: int64

In [82]:
model.fit(X_res, y_res, epochs=25)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x1c50a36a550>

In [83]:
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_20 (Dense)            (None, 256)               67584     
                                                                 
 dropout_16 (Dropout)        (None, 256)               0         
                                                                 
 dense_21 (Dense)            (None, 128)               32896     
                                                                 
 dropout_17 (Dropout)        (None, 128)               0         
                                                                 
 dense_22 (Dense)            (None, 64)                8256      
                                                                 
 dropout_18 (Dropout)        (None, 64)                0         
                                                                 
 dense_23 (Dense)            (None, 32)               

In [84]:
train_score = model.evaluate(X_res, y_res)
print(train_score)

[0.0037445640191435814, 0.9987159967422485, 0.9974386096000671, 1.0, 0.9999555945396423]


In [85]:
## Train Prediction
y_tr_pred = model.predict(X_train)
y_t_pred = np.where(y_tr_pred < 0.5, 0, 1)
print(f1_score(y_train, y_t_pred))

0.9389473684210526


In [86]:
print(accuracy_score(y_train, y_t_pred))
print(precision_score(y_train, y_t_pred))
print(recall_score(y_train, y_t_pred))
print(f1_score(y_train, y_t_pred))

0.997481764501563
0.8849206349206349
1.0
0.9389473684210526


In [87]:
## Test Prediction
pred = model.predict(X_test)
y_pred = np.where(pred < 0.5, 0, 1)



In [88]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00      2824
           1       0.75      0.88      0.81        56

    accuracy                           0.99      2880
   macro avg       0.88      0.93      0.90      2880
weighted avg       0.99      0.99      0.99      2880



In [89]:
print(accuracy_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(f1_score(y_test, y_pred))

0.9920138888888889
0.7538461538461538
0.875
0.8099173553719008


### SMOTE

In [90]:
sme = SMOTEENN(sampling_strategy=0.5, random_state=42, 
               smote=SMOTE(sampling_strategy=1, k_neighbors=7, random_state=42), 
               enn=EditedNearestNeighbours(n_neighbors=3))

In [91]:
X_res, y_res = sme.fit_resample(X_train, y_train)

In [92]:
y_res.value_counts()

fraudulent
0             11293
1             11284
dtype: int64

In [93]:
model.fit(X_res, y_res, epochs=25)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x1c51951c970>

In [94]:
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_20 (Dense)            (None, 256)               67584     
                                                                 
 dropout_16 (Dropout)        (None, 256)               0         
                                                                 
 dense_21 (Dense)            (None, 128)               32896     
                                                                 
 dropout_17 (Dropout)        (None, 128)               0         
                                                                 
 dense_22 (Dense)            (None, 64)                8256      
                                                                 
 dropout_18 (Dropout)        (None, 64)                0         
                                                                 
 dense_23 (Dense)            (None, 32)               

In [95]:
train_score = model.evaluate(X_res, y_res)
print(train_score)

[4.044349043397233e-05, 1.0, 1.0, 1.0, 1.0000001192092896]


In [96]:
## Test Prediction
pred = model.predict(X_test)
y_pred = np.where(pred < 0.5, 0, 1)



In [97]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      2824
           1       0.85      0.70      0.76        56

    accuracy                           0.99      2880
   macro avg       0.92      0.85      0.88      2880
weighted avg       0.99      0.99      0.99      2880



In [98]:
print(accuracy_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(f1_score(y_test, y_pred))

0.9916666666666667
0.8478260869565217
0.6964285714285714
0.7647058823529412


### ADASYN

In [99]:
ada = ADASYN(sampling_strategy = 1, random_state=42, n_neighbors = 2)
X_res, y_res = ada.fit_resample(X_train, y_train)

In [100]:
y_res.value_counts()

fraudulent
0             11293
1             11286
dtype: int64

In [101]:
model.fit(X_res, y_res, epochs=25)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x1c5195697c0>

In [102]:
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_20 (Dense)            (None, 256)               67584     
                                                                 
 dropout_16 (Dropout)        (None, 256)               0         
                                                                 
 dense_21 (Dense)            (None, 128)               32896     
                                                                 
 dropout_17 (Dropout)        (None, 128)               0         
                                                                 
 dense_22 (Dense)            (None, 64)                8256      
                                                                 
 dropout_18 (Dropout)        (None, 64)                0         
                                                                 
 dense_23 (Dense)            (None, 32)               

In [103]:
train_score = model.evaluate(X_res, y_res)
print(train_score)

[0.0001750803057802841, 0.9999557137489319, 1.0, 0.999911367893219, 1.0]


In [104]:
## Train Prediction
y_tr_pred = model.predict(X_train)
y_t_pred = np.where(y_tr_pred < 0.5, 0, 1)
print(f1_score(y_train, y_t_pred))

0.9977528089887641


In [105]:
## Test Prediction
pred = model.predict(X_test)
y_pred = np.where(pred < 0.5, 0, 1)



In [106]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      2824
           1       0.88      0.62      0.73        56

    accuracy                           0.99      2880
   macro avg       0.93      0.81      0.86      2880
weighted avg       0.99      0.99      0.99      2880



In [107]:
print(accuracy_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(f1_score(y_test, y_pred))

0.9909722222222223
0.875
0.625
0.7291666666666666


## CNN

### Base Model

In [108]:
cnn_model = Sequential()
cnn_model.add(Conv1D(32, 3, activation='relu', input_shape=(X_res.shape[1], 1)))
cnn_model.add(MaxPooling1D(pool_size=2, strides=3))
cnn_model.add(Flatten())
cnn_model.add(Dense(256, activation='relu'))
cnn_model.add(Dropout(0.2))
cnn_model.add(Dense(128, activation='relu'))
cnn_model.add(Dropout(0.2))
cnn_model.add(Dense(64, activation='relu'))
cnn_model.add(Dropout(0.2))
cnn_model.add(Dense(32, activation='relu'))
cnn_model.add(Dropout(0.2))
cnn_model.add(Dense(1, activation='sigmoid'))
cnn_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy',metrics.Precision(), metrics.Recall(),metrics.AUC()])

### Random OverSampler

In [109]:
ros = RandomOverSampler(sampling_strategy = 1, random_state=42)
X_res, y_res = ros.fit_resample(X_train, y_train)

In [110]:
y_res.value_counts()

fraudulent
0             11293
1             11293
dtype: int64

In [111]:
cnn_model.fit(X_res, y_res, epochs=25)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x1c5191913d0>

In [112]:
cnn_model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_2 (Conv1D)           (None, 261, 32)           128       
                                                                 
 max_pooling1d_2 (MaxPooling  (None, 87, 32)           0         
 1D)                                                             
                                                                 
 flatten_2 (Flatten)         (None, 2784)              0         
                                                                 
 dense_25 (Dense)            (None, 256)               712960    
                                                                 
 dropout_20 (Dropout)        (None, 256)               0         
                                                                 
 dense_26 (Dense)            (None, 128)               32896     
                                                      

In [113]:
train_score = cnn_model.evaluate(X_res, y_res)
print(train_score)

[0.0005574066890403628, 0.9999557137489319, 0.9999114871025085, 1.0, 1.0000001192092896]


In [114]:
pred = cnn_model.predict(X_test)
y_pred = np.where(pred < 0.5, 0, 1)



In [115]:
confusion_matrix(y_test, y_pred)

array([[2807,   17],
       [  11,   45]], dtype=int64)

In [116]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00      2824
           1       0.73      0.80      0.76        56

    accuracy                           0.99      2880
   macro avg       0.86      0.90      0.88      2880
weighted avg       0.99      0.99      0.99      2880



In [117]:
print(accuracy_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(f1_score(y_test, y_pred))

0.9902777777777778
0.7258064516129032
0.8035714285714286
0.7627118644067797


### SMOTE

In [118]:
sme = SMOTEENN(sampling_strategy=0.5, random_state=42, 
               smote=SMOTE(sampling_strategy=1, k_neighbors=7, random_state=42), 
               enn=EditedNearestNeighbours(n_neighbors=3))

In [119]:
X_res, y_res = sme.fit_resample(X_train, y_train)

In [120]:
y_res.value_counts()

fraudulent
0             11293
1             11284
dtype: int64

In [121]:
cnn_model.fit(X_res, y_res, epochs=25)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x1c51ce3ba00>

In [122]:
cnn_model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_2 (Conv1D)           (None, 261, 32)           128       
                                                                 
 max_pooling1d_2 (MaxPooling  (None, 87, 32)           0         
 1D)                                                             
                                                                 
 flatten_2 (Flatten)         (None, 2784)              0         
                                                                 
 dense_25 (Dense)            (None, 256)               712960    
                                                                 
 dropout_20 (Dropout)        (None, 256)               0         
                                                                 
 dense_26 (Dense)            (None, 128)               32896     
                                                      

In [123]:
train_score = cnn_model.evaluate(X_res, y_res)
print(train_score)

[9.681146184448153e-05, 1.0, 1.0, 1.0, 1.0]


In [124]:
pred = cnn_model.predict(X_test)
y_pred = np.where(pred < 0.5, 0, 1)



In [125]:
confusion_matrix(y_test, y_pred)

array([[2815,    9],
       [  21,   35]], dtype=int64)

In [126]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      2824
           1       0.80      0.62      0.70        56

    accuracy                           0.99      2880
   macro avg       0.89      0.81      0.85      2880
weighted avg       0.99      0.99      0.99      2880



In [127]:
print(accuracy_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(f1_score(y_test, y_pred))

0.9895833333333334
0.7954545454545454
0.625
0.7


### ADASYN

In [128]:
ada = ADASYN(sampling_strategy = 1, random_state=42, n_neighbors = 2)
X_res, y_res = ada.fit_resample(X_train, y_train)

In [129]:
y_res.value_counts()

fraudulent
0             11293
1             11286
dtype: int64

In [130]:
cnn_model.fit(X_res, y_res, epochs=25)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x1c51ce3be20>

In [131]:
cnn_model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_2 (Conv1D)           (None, 261, 32)           128       
                                                                 
 max_pooling1d_2 (MaxPooling  (None, 87, 32)           0         
 1D)                                                             
                                                                 
 flatten_2 (Flatten)         (None, 2784)              0         
                                                                 
 dense_25 (Dense)            (None, 256)               712960    
                                                                 
 dropout_20 (Dropout)        (None, 256)               0         
                                                                 
 dense_26 (Dense)            (None, 128)               32896     
                                                      

In [132]:
train_score = cnn_model.evaluate(X_res, y_res)
print(train_score)

[1.1963349606958218e-05, 1.0, 1.0, 1.0, 1.0]


In [133]:
pred = cnn_model.predict(X_test)
y_pred = np.where(pred < 0.5, 0, 1)



In [134]:
confusion_matrix(y_test, y_pred)

array([[2814,   10],
       [  15,   41]], dtype=int64)

In [135]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      2824
           1       0.80      0.73      0.77        56

    accuracy                           0.99      2880
   macro avg       0.90      0.86      0.88      2880
weighted avg       0.99      0.99      0.99      2880



In [136]:
print(accuracy_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(f1_score(y_test, y_pred))

0.9913194444444444
0.803921568627451
0.7321428571428571
0.766355140186916
