In [54]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebr        
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pickle
import matplotlib.pyplot as plt
from scipy import stats
import tensorflow as tf
import seaborn as sns
from pylab import rcParams
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE


from sklearn.metrics import (confusion_matrix, precision_recall_curve, auc,
                             roc_curve, recall_score, classification_report, f1_score,
                             precision_recall_fscore_support, accuracy_score)
from sklearn.linear_model import LogisticRegression

from keras.models import Model, load_model, Sequential
from keras.layers import Input, Dense
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras import regularizers

%matplotlib inline

sns.set(style='whitegrid', palette='muted', font_scale=1.5)

rcParams['figure.figsize'] = 12, 5
RANDOM_SEED = 42
LABELS = ["Normal", "Fraud"]

import warnings
warnings.filterwarnings('ignore')

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Reading Data:

In [3]:
# Load the credit-card transactions from the Kaggle input mount.
csv_path = '/kaggle/input/creditcardfraud/creditcard.csv'
df = pd.read_csv(csv_path)

# Exploring Data:

In [4]:
# Column names, dtypes and non-null counts of the loaded frame.
df.info()

In [5]:
# Peek at the first rows of the loaded frame.
df.head()

In [6]:
# Largest, average and smallest transaction amount per class.
# String aggregator names are used instead of np.max / np.mean / np.min:
# passing NumPy callables to .agg is deprecated in recent pandas releases.
df.groupby('Class')['Amount'].agg(['max', 'mean', 'min'])

In [7]:
# Class breakdown of the zero-amount transactions (open question: why do these exist?).
zero_amount = df['Amount'] == 0
df.loc[zero_amount, 'Class'].value_counts()

In [8]:
# Amount distribution per class. x / y are passed by keyword because
# seaborn >= 0.12 no longer accepts them positionally.
sns.boxplot(x='Class', y='Amount', data=df);

In [9]:
# Amount distribution per class, one panel each. Every panel is titled with
# its class label (LABELS[0] for Class == 0, LABELS[1] for Class == 1) so the
# figure can be read without looking at the code.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases.
fig, axes = plt.subplots(1, 2)
for class_id, (axis, label) in enumerate(zip(axes, LABELS)):
    sns.distplot(df.loc[df['Class'] == class_id, 'Amount'], ax=axis)
    axis.set_title(label)
fig.tight_layout(pad=3.0)
plt.show()

In [10]:
# Work on an independent copy so the loaded frame stays untouched.
dfc = df.copy(deep=True)

In [11]:
# Add log(1 + Amount) as a new column.
dfc = dfc.assign(Log_Amount=np.log1p(dfc['Amount']))

In [12]:
# Log_Amount distribution per class, one panel each. Every panel is titled
# with its class label (LABELS[0] for Class == 0, LABELS[1] for Class == 1).
# NOTE(review): sns.distplot is deprecated in newer seaborn releases.
fig, axes = plt.subplots(1, 2)
for class_id, (axis, label) in enumerate(zip(axes, LABELS)):
    sns.distplot(dfc.loc[dfc['Class'] == class_id, 'Log_Amount'], ax=axis)
    axis.set_title(label)
fig.tight_layout(pad=3.0)
plt.show()          #better

# Splitting Data:
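*Split the data into 3 samples: 80% of the normal transactions (to train the autoencoder), and the remaining rows (normal and fraud), split 80/20 with stratification into a classifier training set and a test set.*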

In [13]:
# Before splitting: remove the columns that are not used as model features
# (Amount is replaced by Log_Amount). Rebinding instead of inplace=True, plus
# errors='ignore', keeps this cell safe to re-run after the columns are gone.
dfc = dfc.drop(columns=['Time', 'Amount'], errors='ignore')

In [14]:
# Draw 80% of the Class == 0 rows with a fixed seed.
is_normal = dfc['Class'] == 0
normal = dfc.loc[is_normal].sample(frac=0.8, random_state=RANDOM_SEED)
# sanity check on the sample size
normal.shape

In [15]:
# Peek at the sampled normal rows.
normal.head()

In [16]:
# Everything that was not drawn into `normal`: the remaining rows of dfc.
held_out_mask = ~dfc.index.isin(normal.index)
rest = dfc.loc[held_out_mask]
# sanity check on the remainder size
rest.shape

In [17]:
# Renumber both frames from 0. This must run after `rest` has been built,
# because `rest` is selected using the original index labels of `normal`.
normal = normal.reset_index(drop=True)
rest = rest.reset_index(drop=True)

In [18]:
# Stratified 80/20 split of the held-out rows, preserving the class ratio.
train, test = train_test_split(
    rest,
    test_size=0.2,
    random_state=RANDOM_SEED,
    stratify=rest['Class'],
)

In [44]:
# Row / column counts of the two splits.
train.shape, test.shape

In [45]:
# Class proportions in the training split.
train.Class.value_counts(normalize= True)

In [46]:
# Class proportions in the test split (should match the training split, since the split is stratified).
test.Class.value_counts(normalize= True)

# Building AutoEncoder:

In [32]:
# Autoencoder training matrix: the sampled normal rows without the label column.
feature_columns = normal.columns != 'Class'
norm_enc = normal.loc[:, feature_columns]

In [35]:
# Number of feature columns fed to the autoencoder.
input_dim = len(norm_enc.columns)


In [36]:
# Width of the bottleneck. Shared by the encoder output and the decoder
# input so the two halves cannot drift apart.
encoding_dim = 30

# Encoder: input_dim -> 100 -> encoding_dim. The SELU layers use the
# LeCun-normal initialiser, which the Keras documentation pairs with this
# activation.
stacked_encoder = Sequential([
    Input(shape=(input_dim, )),
    Dense(100, activation='selu', kernel_initializer='lecun_normal'),
    Dense(encoding_dim, activation='selu', kernel_initializer='lecun_normal'),
])

# Decoder: encoding_dim -> 100 -> input_dim, with a linear output layer.
stacked_decoder = Sequential([
    Input(shape=(encoding_dim, )),
    Dense(100, activation='selu', kernel_initializer='lecun_normal'),
    Dense(input_dim),
])

# Full autoencoder: the encoder followed by the decoder.
stacked_ae = Sequential([stacked_encoder, stacked_decoder])

In [37]:
nb_epoch = 100
batch_size = 32

# Reconstruction is a regression problem, so only the MSE loss is tracked;
# classification accuracy has no meaning for real-valued targets.
stacked_ae.compile(optimizer='adam',
                   loss='mean_squared_error')

# Keeps on disk only the model from the epoch with the best monitored value
# (val_loss by default).
checkpointer = ModelCheckpoint(filepath="model.h5",
                               verbose=0,
                               save_best_only=True)
tensorboard = TensorBoard(log_dir='./logs',
                          histogram_freq=0,
                          write_graph=True,
                          write_images=True)

# Plain float array so that validation_split can slice it.
ae_inputs = norm_enc.to_numpy(dtype='float32')

# The autoencoder learns to reproduce its own input. A slice of the rows is
# held out for validation (validation_split takes the trailing rows before
# shuffling), so val_loss and the checkpoint choice reflect data the model
# was not fitted on instead of repeating the training loss.
history = stacked_ae.fit(ae_inputs, ae_inputs,
                         epochs=nb_epoch,
                         batch_size=batch_size,
                         shuffle=True,
                         validation_split=0.1,
                         verbose=1,
                         callbacks=[checkpointer, tensorboard]).history

In [38]:
# Training vs. validation loss per epoch. The second curve is val_loss from
# fit(), not a score on the `test` split, so it is labelled 'validation'.
plt.plot(history['loss'], label='train')
plt.plot(history['val_loss'], label='validation')
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(loc='upper right');

In [39]:
# Feature columns of the training split, separated by class.
train_features = train.drop(columns=['Class'])
norm_train = train_features[train['Class'] == 0]
fraud_train = train_features[train['Class'] == 1]


In [43]:
# Row / column count of the normal part of the training split.
norm_train.shape

In [40]:
# Run both class subsets through the trained autoencoder.
norm_pred, fraud_pred = (
    stacked_ae.predict(subset) for subset in (norm_train, fraud_train)
)

In [48]:
# Label vectors: 0 for every normal row, 1 for every fraud row.
norm_y = np.zeros(len(norm_pred))
fraud_y = np.ones(len(fraud_pred))

In [50]:
# check: each autoencoder output array should have as many rows as its label vector
print(norm_pred.shape, norm_y.shape)
print(fraud_pred.shape, fraud_y.shape)

In [52]:
# Stack normal rows first, fraud rows second; the labels follow the same order.
X = np.concatenate((norm_pred, fraud_pred), axis=0)
Y = np.concatenate((norm_y, fraud_y), axis=0)

In [56]:
# Fit the classifier on the autoencoder outputs assembled in X / Y.
clf = LogisticRegression(solver="lbfgs").fit(X, Y)

# The classifier was fitted on autoencoder outputs, so the test features must
# pass through the same model before scoring; the raw columns would be a
# different feature space from the one the classifier was trained on.
test_x = stacked_ae.predict(test.drop(columns=['Class']))
pred_y = clf.predict(test_x)

print ("")
print ("Classification Report: ")
print (classification_report(test['Class'], pred_y))

print ("")
print ("Accuracy Score: ", accuracy_score(test['Class'], pred_y))

# Try: fit and validate the classifier on all held-out rows

In [57]:
# Feature columns of every held-out row, separated by class.
rest_features = rest.drop(columns=['Class'])
norm_train_2 = rest_features[rest['Class'] == 0]
fraud_train_2 = rest_features[rest['Class'] == 1]

In [59]:
# Run both class subsets through the trained autoencoder.
norm_pred_2, fraud_pred_2 = (
    stacked_ae.predict(subset) for subset in (norm_train_2, fraud_train_2)
)

In [60]:
# Label vectors: 0 for every normal row, 1 for every fraud row.
norm_y_2 = np.zeros(len(norm_pred_2))
fraud_y_2 = np.ones(len(fraud_pred_2))

In [63]:
# check: each autoencoder output array should have as many rows as its label vector
print(norm_pred_2.shape, norm_y_2.shape)
print(fraud_pred_2.shape, fraud_y_2.shape)

In [65]:
# Stack normal rows first, fraud rows second; the labels follow the same order.
X_2 = np.concatenate((norm_pred_2, fraud_pred_2), axis=0)
Y_2 = np.concatenate((norm_y_2, fraud_y_2), axis=0)

In [68]:
# Hold out 25% of the autoencoder outputs for validation. The split is seeded
# so the report is reproducible, and stratified on the labels so the rare
# fraud class appears in both parts in the same proportion.
train_x, val_x, train_y, val_y = train_test_split(
    X_2, Y_2,
    test_size=0.25,
    random_state=RANDOM_SEED,
    stratify=Y_2,
)
clf = LogisticRegression(solver="lbfgs").fit(train_x, train_y)
pred_y = clf.predict(val_x)

print ("")
print ("Classification Report: ")
print (classification_report(val_y, pred_y))

print ("")
print ("Accuracy Score: ", accuracy_score(val_y, pred_y))