In [1]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
from pylab import rcParams
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from keras.models import Model, load_model
from keras.layers import Input, Dense
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras import regularizers

%matplotlib inline

sns.set(style='whitegrid', palette='muted', font_scale=1.5)

rcParams['figure.figsize'] = 14, 8

RANDOM_SEED = 42
LABELS = ["Normal", "Fraud"]

In [2]:
# Load the dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("creditcard_data.csv")

In [3]:
# Show (rows, columns) of the loaded frame.
df.shape

(284806, 31)

In [4]:
# True if any cell anywhere in the frame is missing.
df.isna().to_numpy().any()

False

In [5]:
# Split the rows by the value of the `Class` column (1 vs 0).
frauds = df.loc[df['Class'] == 1]
normal = df.loc[df['Class'] == 0]

In [6]:
from sklearn.preprocessing import StandardScaler

# `data` is `df` without the Time column and with Amount standardised.
# Note: this scaler is fitted on every row, i.e. before any train/test split.
data = df.drop(columns=['Time'])
data['Amount'] = StandardScaler().fit_transform(data['Amount'].to_numpy().reshape(-1, 1))

In [7]:
from sklearn.metrics import (confusion_matrix, precision_recall_curve, auc,
                             roc_curve, recall_score, classification_report, f1_score,
                             precision_recall_fscore_support, accuracy_score, precision_score, recall_score)

In [8]:
# Feature names are every column of `data` except the target.
columns = [name for name in data.columns if name != "Class"]

# X and y are selected from `df` (not `data`), so Amount is the original,
# unscaled column here; all features are standardised later by `sc`.
X = df[columns]
y = df['Class']

print('Shape of X: ', X.shape)
print('Shape of y: ', y.shape)

Shape of X:  (284806, 29)
Shape of y:  (284806,)


In [9]:
# Hold out 20% of the rows for testing; the split is seeded for repeatability.
# NOTE(review): no `stratify=` is passed, so the class ratio of each part is
# left to the random draw — consider stratifying on y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_SEED,
)

print('Shape of X_train: ', X_train.shape)
print('Shape of y_train: ', y_train.shape)
print('\nShape of X_test: ', X_test.shape)
print('Shape of y_test: ', y_test.shape)

Shape of X_train:  (227844, 29)
Shape of y_train:  (227844,)

Shape of X_test:  (56962, 29)
Shape of y_test:  (56962,)


In [10]:
# Learn the scaling statistics from the training features only, then apply them.
sc = StandardScaler().fit(X_train)
X_train_scaled = sc.transform(X_train)
print('Shape of X_train_scaled: ', X_train_scaled.shape)

Shape of X_train_scaled:  (227844, 29)


In [11]:
# Parallel lists: index i of every list describes the same model.
ML_Model = []
acc_test = []
prec = []
recall = []
f1 = []


def storeResults(model, a, b, c, d):
    """Record one model's test metrics in the module-level result lists.

    Each metric is rounded to 3 decimals before it is stored. If ``model`` is
    already present in ``ML_Model`` its existing entry is overwritten instead
    of appending a second row, so re-running a recording cell does not
    duplicate that model in the results.

    Parameters
    ----------
    model : name stored in ``ML_Model``.
    a : accuracy, stored in ``acc_test``.
    b : precision, stored in ``prec``.
    c : recall, stored in ``recall``.
    d : F1 score, stored in ``f1``.
    """
    stores = (ML_Model, acc_test, prec, recall, f1)
    values = (model, round(a, 3), round(b, 3), round(c, 3), round(d, 3))

    if model in ML_Model:
        # Same model recorded before: update its row in place.
        position = ML_Model.index(model)
        for store, value in zip(stores, values):
            store[position] = value
    else:
        for store, value in zip(stores, values):
            store.append(value)

In [12]:
# Scale the test features with the scaler already fitted on X_train (transform only, no refit).
X_test_scaled = sc.transform(X_test)

In [13]:
# Depth-limited random forest, trained on the scaled training set.
rf = RandomForestClassifier(max_depth=5, random_state=RANDOM_SEED).fit(
    X_train_scaled, y_train
)
y_pred_rf = rf.predict(X_test_scaled)

In [14]:
# Accuracy, precision, recall and F1 of the random forest on the test split.
acc_test_rf, prec_rf, recall_rf, f1_rf = (
    metric(y_test, y_pred_rf)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("RF : Accuracy on test Data: {:.3f}".format(acc_test_rf))
print("Classification Report :")
print(classification_report(y_test, y_pred_rf))

RF : Accuracy on test Data: 0.999
Classification Report :
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56867
           1       0.92      0.71      0.80        95

    accuracy                           1.00     56962
   macro avg       0.96      0.85      0.90     56962
weighted avg       1.00      1.00      1.00     56962



In [15]:
# Record the random forest's test accuracy, precision, recall and F1 for the comparison table.
storeResults('Random Forest', acc_test_rf,prec_rf,recall_rf,f1_rf)

In [16]:
# Logistic regression with C=100.0, seeded with the notebook-wide RANDOM_SEED
# (same value as the literal 42 used before).
lr = LogisticRegression(C=100.0, random_state=RANDOM_SEED).fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)

In [17]:
# Accuracy, precision, recall and F1 of logistic regression on the test split.
acc_test_lr, prec_lr, recall_lr, f1_lr = (
    metric(y_test, y_pred_lr)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("LR : Accuracy on test Data: {:.3f}".format(acc_test_lr))
print("Classification Report :")
print(classification_report(y_test, y_pred_lr))

LR : Accuracy on test Data: 0.999
Classification Report :
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56867
           1       0.84      0.61      0.71        95

    accuracy                           1.00     56962
   macro avg       0.92      0.81      0.85     56962
weighted avg       1.00      1.00      1.00     56962



In [18]:
# Record logistic regression's test accuracy, precision, recall and F1 for the comparison table.
storeResults('Logistic Regression', acc_test_lr,prec_lr,recall_lr,f1_lr)

In [19]:
# Rows of `data` by class, and the ratio of fraud rows to valid rows
# (a ratio between the two classes, not a share of all rows).
Fraud = data.loc[data['Class'] == 1]
Valid = data.loc[data['Class'] == 0]
outlier_fraction = len(Fraud) / len(Valid)

In [20]:
# Support vector classifier with C=10.0, trained on the scaled training set.
svc = SVC(C=10.0, random_state=RANDOM_SEED).fit(X_train_scaled, y_train)
y_pred_svc = svc.predict(X_test_scaled)

In [21]:
# Accuracy, precision, recall and F1 of the SVC on the test split.
acc_test_svc, prec_svc, recall_svc, f1_svc = (
    metric(y_test, y_pred_svc)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("SVC : Accuracy on test Data: {:.3f}".format(acc_test_svc))
print("Classification Report :")
print(classification_report(y_test, y_pred_svc))

SVC : Accuracy on test Data: 0.999
Classification Report :
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56867
           1       0.94      0.71      0.81        95

    accuracy                           1.00     56962
   macro avg       0.97      0.85      0.90     56962
weighted avg       1.00      1.00      1.00     56962



In [22]:
# Record the SVC's test accuracy, precision, recall and F1 for the comparison table.
storeResults('Support Vector Classification', acc_test_svc,prec_svc,recall_svc,f1_svc)

In [23]:
# Gradient-boosted trees with a binary logistic objective; no seed is passed here.
xgb_clf = XGBClassifier(
    max_depth=5,
    learning_rate=0.08,
    objective='binary:logistic',
).fit(X_train_scaled, y_train)
y_pred_xgb = xgb_clf.predict(X_test_scaled)

In [24]:
# Accuracy, precision, recall and F1 of XGBoost on the test split.
acc_test_xgb, prec_xgb, recall_xgb, f1_xgb = (
    metric(y_test, y_pred_xgb)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("XGBoost : Accuracy on test Data: {:.3f}".format(acc_test_xgb))
print("Classification Report :")
print(classification_report(y_test, y_pred_xgb))

XGBoost : Accuracy on test Data: 1.000
Classification Report :
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56867
           1       0.94      0.80      0.86        95

    accuracy                           1.00     56962
   macro avg       0.97      0.90      0.93     56962
weighted avg       1.00      1.00      1.00     56962



In [25]:
# Record XGBoost's test accuracy, precision, recall and F1 for the comparison table.
storeResults('XGBoost', acc_test_xgb,prec_xgb,recall_xgb,f1_xgb)

In [26]:
# Fresh (unfitted) copies of the three base estimators, as (name, estimator) pairs.
# NOTE(review): the forest uses a literal seed of 19 here, whereas the
# standalone forest above uses RANDOM_SEED — confirm this is intentional.
models = [
    ('rf', RandomForestClassifier(max_depth=5, random_state=19)),
    ('svc', SVC(C=10.0, random_state=RANDOM_SEED)),
    ('xgb', XGBClassifier(max_depth=5, learning_rate=0.08, objective='binary:logistic')),
]

# Hard voting: the ensemble predicts the majority class label of its members.
ensemble = VotingClassifier(estimators=models, voting='hard')


In [27]:
# Train the voting ensemble on the scaled training set, then label the test set.
y_pred_ens = ensemble.fit(X_train_scaled, y_train).predict(X_test_scaled)

In [28]:
# Accuracy, precision, recall and F1 of the voting ensemble on the test split.
acc_test_ens, prec_ens, recall_ens, f1_ens = (
    metric(y_test, y_pred_ens)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("Ensemble : Accuracy on test Data: {:.3f}".format(acc_test_ens))
print("Classification Report :")
print(classification_report(y_test, y_pred_ens))

Ensemble : Accuracy on test Data: 1.000
Classification Report :
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56867
           1       0.96      0.78      0.86        95

    accuracy                           1.00     56962
   macro avg       0.98      0.89      0.93     56962
weighted avg       1.00      1.00      1.00     56962



In [29]:
# Record the voting ensemble's test accuracy, precision, recall and F1 for the comparison table.
storeResults('Ensemble', acc_test_ens,prec_ens,recall_ens,f1_ens)

In [30]:
# Autoencoder scores are entered by hand: none of the cells shown here trains
# or evaluates an autoencoder, so these values are not reproduced by this code.
# NOTE(review): confirm they are measured like the other rows (fraud-class
# precision/recall/F1 on the same test split) before comparing — TODO verify.
acc_test_auto = 0.98
prec_auto = 1.00
recall_auto = 0.98
f1_auto = 0.99

# Record the hand-entered scores alongside the computed ones.
storeResults('Autoencoder', acc_test_auto,prec_auto,recall_auto,f1_auto)

In [31]:
# Accuracy is shown as a percentage; the other metrics are copied unchanged.
f_acc_test = [score * 100 for score in acc_test]
f_prec = list(prec)
f_recall = list(recall)
f_f1 = list(f1)

# One row per recorded model.
results = pd.DataFrame(
    {
        'ML Model': ML_Model,
        'Test Accuracy (in %)': f_acc_test,
        'Precision': f_prec,
        'Recall': f_recall,
        'F1': f_f1,
    }
)

# Best first: ordered by precision, with recall and then F1 as tie-breakers.
results.sort_values(by=['Precision', 'Recall', 'F1'], ascending=False)

Unnamed: 0,ML Model,Test Accuracy (in %),Precision,Recall,F1
5,Autoencoder,98.0,1.0,0.98,0.99
4,Ensemble,100.0,0.961,0.779,0.86
2,Support Vector Classification,99.9,0.944,0.705,0.807
3,XGBoost,100.0,0.938,0.8,0.864
0,Random Forest,99.9,0.918,0.705,0.798
1,Logistic Regression,99.9,0.841,0.611,0.707
