# Credit Card Fraud Detection 

In [1]:
%pylab inline
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding, LSTM

from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.utils import resample
from sklearn.model_selection import cross_val_score

from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, BatchNormalization
from keras.layers.advanced_activations import LeakyReLU
from keras.optimizers import SGD, RMSprop
from keras.regularizers import l2
from keras.backend.tensorflow_backend import set_session
from keras.callbacks import EarlyStopping
from keras.utils import np_utils
import keras

import tensorflow as tf
import pandas as pd
import numpy as np


from IPython.display import SVG, Image
from keras.utils.visualize_util import model_to_dot

np.random.seed(0)

Populating the interactive namespace from numpy and matplotlib


Using TensorFlow backend.


## Data

[Credit Card Fraud Detection - Kaggle](https://www.kaggle.com/dalpozz/creditcardfraud)에서 다운받을수 있습니다.

데이터는 2013년 유럽 카드회사의 이틀동안 일어난 transactions에 관한 것이며, <br>
492건의 frauds 가 284,807건의 transactions중에 일어 났습니다.

Class에서 1은 fraud를 뜻하며, 0은 아닌것을 말합니다.

Time데이터는 첫번재 Column으로부터 몇초 이후에 발생한 transaction이라는 뜻입니다. <br>
나머지 데이터들은 PCA의 규제에 의해서 어떤 데이터인지 밝히지 않습니다.

In [2]:
data = pd.read_csv('/dataset/credit-card-fraud-detection/creditcard.csv')

# Preprocessing Amount
amt_scale = MinMaxScaler()
data['NormAmount'] =  amt_scale.fit_transform(data['Amount'].values.reshape(-1, 1))

# Split Train and Test Data
X = data.drop(['Time', 'Amount', 'Class'], axis=1).as_matrix()
Y = data['Class'].as_matrix()

# Standardization
scale_x = MinMaxScaler()
X = scale_x.fit_transform(X)

train_x, test_x, train_y, test_y = train_test_split(X, Y, test_size=0.25, random_state=1)

fraud_test_y = test_y == 1
fraud_test_x = test_x[fraud_test_y]
fraud_test_y = test_y[fraud_test_y]

train_category_y = np_utils.to_categorical(train_y)
test_category_y = np_utils.to_categorical(test_y)

#### Checking the number of fraud transactions in training and test data

In [3]:
print('The number of Fraud transactions in Training Data:', train_y[train_y == 1].shape[0])
print('The number of Fraud transactions in Test Data:',  test_y[test_y == 1].shape[0])

The number of Fraud transactions in Training Data: 381
The number of Fraud transactions in Test Data: 111


#### Checking the target classes

fraud transactions이 492개밖에 되지 않기 때문에, 일반적인 classification algorithm으로 돌리면 물론 정확도는 매우 높게 나오지만.. 
실상은 1에 해당하는 fraud transactions에서는 대부분 틀릴 가능성이 매우 높습니다. 


In [4]:
pd.value_counts(data['Class'], sort=True)

0    284315
1       492
Name: Class, dtype: int64

## Resampling

resampling에는 여러가지 방법이 있습니다. 

1. Over Sampling: SMOTE (Synthetic Minority Over-Sampling Technique)
2. Under Sampling

아래의 resample function에서는 5:5의 비율로 under sampling을 해줍니다.<br>
resample을 하면서 시간관계가 어차피 깨지기 때문에 (사실 각각의 transactions들 사이에 상관관계가 있는지도 모르겠음)<br>
shuffle을 통해서 train되는 데이터를 augment해줍니다.

In [5]:
def resample(X, Y, ratio=1.):
    index = np.arange(Y.shape[0])
    fraud_indices = index[Y == 1]
    normal_indices = index[Y == 0]
    normal_n = int(len(fraud_indices) * ratio)
    
    random_normal_indices = np.random.permutation(normal_indices)[:normal_n]
    
    sample_indices = np.concatenate([fraud_indices, random_normal_indices])
    np.random.shuffle(sample_indices)
    sample_indices = np.array(sample_indices)
    
    sample_x = X[sample_indices]
    sample_y = Y[sample_indices]
    return sample_x, sample_y

resampled_train_x, resampled_train_y = resample(train_x, train_y)

print('resampled_train_x:', resampled_train_x.shape)
print('resampled_train_y:', resampled_train_y.shape)

resampled_train_x: (762, 29)
resampled_train_y: (762,)


# Logistic Regression

전체적으로 0.99% accuracy를 보이지만, 실제 fraud data만 test를 했을때는 0.57%로.. 실질적으로 못맞추는 수준입니다.<br>
사실 일반적인 알고리즘으로 학습시키기 위해서는 over sampling (SMOTE 같은) 또는 under sampling이 필요합니다.<br>
sampling을 통해서 skewed data를 보정하는 것입니다.

#### resample 없이 데이터 학습뒤 예측하면..

In [6]:
lg = LogisticRegression()
lg.fit(train_x, train_y)
predicted_y = lg.predict(test_x)
accuracy_score(test_y, predicted_y)

0.99908710429482317

In [7]:
predicted_y = lg.predict(fraud_test_x)
accuracy_score(fraud_test_y, predicted_y)

0.51351351351351349

#### resampled data로 학습뒤 예측하면...

In [8]:
lg = LogisticRegression()
lg.fit(*resample(train_x, train_y))

predicted_y = lg.predict(test_x)
accuracy_score(test_y, predicted_y)

0.9970506446448133

In [9]:
predicted_y = lg.predict(fraud_test_x)
accuracy_score(fraud_test_y, predicted_y)

0.77477477477477474

# Decision Tree

#### resample 없이 데이터 학습뒤 예측하면..

In [10]:
dtc = DecisionTreeClassifier(max_depth=10, criterion='entropy')
dtc.fit(train_x, train_y)
predicted_y = dtc.predict(test_x)
accuracy_score(test_y, predicted_y)

0.99925563888654811

In [11]:
predicted_y = dtc.predict(fraud_test_x)
accuracy_score(fraud_test_y, predicted_y)

0.72972972972972971

#### resampled data로 학습뒤 예측하면...

In [12]:
dtc = DecisionTreeClassifier(max_depth=10, criterion='entropy')
for i in range(3):
    dtc.fit(*resample(train_x, train_y))
    predicted_y = dtc.predict(test_x)
    print(accuracy_score(test_y, predicted_y))

0.899974719811
0.913345130755
0.911154181062


In [13]:
predicted_y = dtc.predict(fraud_test_x)
accuracy_score(fraud_test_y, predicted_y)

0.91891891891891897

# Deep Learning with Keras

하.. 드디어 딥러닝으로.. 해보면 어떤 결과가 나올 것인가.. Sampling VS UnSampling!

#### Model

In [14]:
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.1
set_session(tf.Session(config=config))

In [15]:
resampled_train_x.shape

(762, 29)

In [16]:
def generate_model():
    model = Sequential()
    model.add(Dense(762, input_dim=29, name='dense01'))
#     model.add(BatchNormalization())
    model.add(Activation('relu', name='activation01'))
    model.add(Dropout(0.5))
    
    model.add(Dense(512, name='dense02'))
#     model.add(BatchNormalization())
    model.add(Activation('relu', name='activation02'))
    model.add(Dropout(0.5))
    
    model.add(Dense(256, name='dense03'))
#     model.add(BatchNormalization())
    model.add(Activation('relu', name='activation03'))
    model.add(Dropout(0.4))
    
    model.add(Dense(128, name='dense04'))
#     model.add(BatchNormalization())
    model.add(Activation('relu', name='activation04'))
    model.add(Dropout(0.3))
    
    model.add(Dense(1, name='dense05'))
    model.add(Activation('relu', name='activation05'))

    model.compile(loss='binary_crossentropy', 
                  optimizer='adam', 
                  metrics=['accuracy'])
    return model


# # Visualization
model = generate_model()
model.summary()
# SVG(model_to_dot(model, show_shapes=True).create(prog='dot', format='svg'))


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense01 (Dense)              (None, 762)               22860     
_________________________________________________________________
activation01 (Activation)    (None, 762)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 762)               0         
_________________________________________________________________
dense02 (Dense)              (None, 512)               390656    
_________________________________________________________________
activation02 (Activation)    (None, 512)               0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense03 (Dense)              (None, 256)               131328    
__________

#### resample 없이 데이터 학습뒤 예측하면..

In [17]:
model = generate_model()
model.fit(train_x, train_y, verbose=2)

Epoch 1/10
34s - loss: 0.0287 - acc: 0.9982
Epoch 2/10
38s - loss: 0.0287 - acc: 0.9982
Epoch 3/10
41s - loss: 0.0287 - acc: 0.9982
Epoch 4/10
40s - loss: 0.0287 - acc: 0.9982
Epoch 5/10
42s - loss: 0.0287 - acc: 0.9982
Epoch 6/10
41s - loss: 0.0287 - acc: 0.9982
Epoch 7/10
41s - loss: 0.0287 - acc: 0.9982
Epoch 8/10
42s - loss: 0.0287 - acc: 0.9982
Epoch 9/10
41s - loss: 0.0287 - acc: 0.9982
Epoch 10/10
43s - loss: 0.0287 - acc: 0.9982


<keras.callbacks.History at 0x7f86656c1c50>

In [18]:
predicted_y = model.predict(test_x)
predicted_y = predicted_y.reshape(predicted_y.shape[0])
predicted_y = np.where(predicted_y >= 0.5, 1, 0)
print(accuracy_score(test_y, predicted_y))

0.998441055027


In [19]:
predicted_y = model.predict(fraud_test_x)
predicted_y = predicted_y.reshape(predicted_y.shape[0])
predicted_y = np.where(predicted_y >= 0.5, 1, 0)
accuracy_score(fraud_test_y, predicted_y)

0.0

#### resampled data로 학습뒤 예측하면...

In [42]:
def generate_model():
    np.random.seed(0)
    model = Sequential()
    model.add(Dense(384, input_dim=29))
    model.add(LeakyReLU())
    model.add(Dropout(0.4))
    
    model.add(Dense(256))
    model.add(LeakyReLU())
    model.add(Dropout(0.3))
    
    model.add(Dense(204))
    model.add(LeakyReLU())
    model.add(Dropout(0.2))
    
    model.add(Dense(157))
    model.add(LeakyReLU())
    model.add(Dropout(0.2))
    
    model.add(Dense(64))
    model.add(LeakyReLU())
    model.add(Dropout(0.1))
    
    model.add(Dense(1))
    model.add(Activation('sigmoid'))

    model.compile(loss='binary_crossentropy', 
                  optimizer='adam', 
                  metrics=['accuracy'])
    return model


# # Visualization
model = generate_model()
early_stopping = EarlyStopping(monitor='loss', patience=10)

for i in range(40):
    _train_data = resample(train_x, train_y)
        
    history = model.fit(*_train_data,
                        verbose=0, 
                        epochs=100,
                        callbacks=[early_stopping])
    loss = np.mean(history.history.get('loss'))
    acc = np.mean(history.history.get('acc'))
    epoch = len(history.epoch)
    print(f'[{i+1:2}] epoch:{epoch:<2} loss:{loss:<8.4} acc:{acc:<8.4}')

[ 1] epoch:17 loss:0.249    acc:0.9023  
[ 2] epoch:40 loss:0.1844   acc:0.9344  
[ 3] epoch:14 loss:0.1807   acc:0.9353  
[ 4] epoch:32 loss:0.1804   acc:0.9366  
[ 5] epoch:13 loss:0.1717   acc:0.9379  
[ 6] epoch:33 loss:0.1773   acc:0.9347  
[ 7] epoch:16 loss:0.176    acc:0.9364  
[ 8] epoch:19 loss:0.1666   acc:0.9417  
[ 9] epoch:24 loss:0.177    acc:0.9352  
[10] epoch:17 loss:0.1728   acc:0.938   
[11] epoch:24 loss:0.1601   acc:0.9424  
[12] epoch:25 loss:0.1685   acc:0.9371  
[13] epoch:17 loss:0.1716   acc:0.9372  
[14] epoch:14 loss:0.1743   acc:0.9364  
[15] epoch:16 loss:0.1645   acc:0.94    
[16] epoch:17 loss:0.1847   acc:0.9332  
[17] epoch:14 loss:0.1611   acc:0.9433  
[18] epoch:34 loss:0.1484   acc:0.9474  
[19] epoch:42 loss:0.1595   acc:0.9423  
[20] epoch:29 loss:0.1576   acc:0.9432  
[21] epoch:14 loss:0.1708   acc:0.9382  
[22] epoch:34 loss:0.1558   acc:0.9441  
[23] epoch:19 loss:0.1502   acc:0.9487  
[24] epoch:18 loss:0.1648   acc:0.9413  
[25] epoch:27 lo

In [40]:
predicted_y = model.predict(test_x)
predicted_y = predicted_y.reshape(predicted_y.shape[0])
predicted_y = np.where(predicted_y >= 0.5, 1, 0)
print(accuracy_score(test_y, predicted_y))

0.98391899104


In [41]:
predicted_y = model.predict(fraud_test_x)
predicted_y = predicted_y.reshape(predicted_y.shape[0])
predicted_y = np.where(predicted_y >= 0.5, 1, 0)
accuracy_score(fraud_test_y, predicted_y)

0.86486486486486491