# Credit Card Fraud Detection 

In [1]:
%pylab inline
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding, LSTM

from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.utils import resample
from sklearn.model_selection import cross_val_score

from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, BatchNormalization
from keras.layers.advanced_activations import LeakyReLU
from keras.optimizers import SGD, RMSprop
from keras.regularizers import l2
from keras.backend.tensorflow_backend import set_session
from keras.callbacks import EarlyStopping
from keras.utils import np_utils
import keras

import tensorflow as tf
import pandas as pd
import numpy as np


from IPython.display import SVG, Image
from keras.utils.visualize_util import model_to_dot

np.random.seed(0)

Populating the interactive namespace from numpy and matplotlib


Using TensorFlow backend.


## Data

[Credit Card Fraud Detection - Kaggle](https://www.kaggle.com/dalpozz/creditcardfraud)에서 다운받을 수 있습니다.

데이터는 2013년 유럽 카드회사에서 이틀 동안 일어난 transactions에 관한 것이며, <br>
총 284,807건의 transactions 중 492건이 fraud입니다.

Class에서 1은 fraud를 뜻하며, 0은 아닌것을 말합니다.

Time 데이터는 데이터셋의 첫 번째 transaction으로부터 몇 초 이후에 발생한 transaction인지를 뜻합니다. <br>
나머지 데이터들은 기밀 유지를 위해 PCA로 변환된 값이며, 원래 어떤 데이터인지는 공개되지 않습니다.

In [2]:
data = pd.read_csv('/dataset/credit-card-fraud-detection/creditcard.csv')

# Rescale the raw transaction amount with a min-max scaler and keep it as an
# extra feature column.
amt_scale = MinMaxScaler()
data['NormAmount'] = amt_scale.fit_transform(data['Amount'].values.reshape(-1, 1))

# Feature matrix (everything except Time, the raw Amount and the label) and
# the label vector. `.values` is used instead of `.as_matrix()`, which was
# deprecated and later removed from pandas.
X = data.drop(['Time', 'Amount', 'Class'], axis=1).values
Y = data['Class'].values

# Min-max scaling of every feature column (this is not z-score standardization).
# NOTE(review): both scalers are fitted on the full dataset before the
# train/test split, so test-set statistics leak into the training features.
# Left as-is to keep the recorded results comparable; consider fitting on
# train_x only.
scale_x = MinMaxScaler()
X = scale_x.fit_transform(X)

train_x, test_x, train_y, test_y = train_test_split(X, Y, test_size=0.25, random_state=1)

# Fraud-only view of the test split, used to measure how many frauds are caught.
fraud_test_mask = test_y == 1
fraud_test_x = test_x[fraud_test_mask]
fraud_test_y = test_y[fraud_test_mask]

# One-hot encoded copies of the labels.
train_category_y = np_utils.to_categorical(train_y)
test_category_y = np_utils.to_categorical(test_y)

#### Checking the number of fraud transactions in training and test data

In [3]:
# Count the positive (Class == 1) labels that ended up in each split.
print('The number of Fraud transactions in Training Data:', np.count_nonzero(train_y == 1))
print('The number of Fraud transactions in Test Data:', np.count_nonzero(test_y == 1))

The number of Fraud transactions in Training Data: 381
The number of Fraud transactions in Test Data: 111


#### Checking the target classes

fraud transactions이 492개밖에 되지 않기 때문에, 일반적인 classification algorithm으로 돌리면 물론 정확도는 매우 높게 나오지만.. 
실상은 1에 해당하는 fraud transactions에서는 대부분 틀릴 가능성이 매우 높습니다. 


In [4]:
# Class balance of the whole dataset. Uses the Series method because the
# top-level `pd.value_counts` function is deprecated in recent pandas.
data['Class'].value_counts(sort=True)

0    284315
1       492
Name: Class, dtype: int64

## Resampling

resampling에는 여러가지 방법이 있습니다. 

1. Over Sampling: SMOTE (Synthetic Minority Over-Sampling Technique)
2. Under Sampling

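아래의 resample function에서는 기본값(ratio=1)일 때 fraud와 normal을 5:5의 비율로 under sampling을 해줍니다.<br>
resample을 하면서 시간 순서는 어차피 깨지기 때문에 (사실 각각의 transactions들 사이에 상관관계가 있는지도 모르겠음)<br>
뽑힌 데이터는 shuffle해서 섞어 주며, 호출할 때마다 normal transactions가 새로 무작위로 뽑히므로 반복 호출하면 train되는 데이터를 augment하는 효과가 있습니다.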

In [5]:
def resample(X, Y, ratio=1.):
    """Under-sample the normal class so it is `ratio` times the fraud class.

    All rows labelled 1 are kept. From the rows labelled 0, a random subset of
    ``int(n_fraud * ratio)`` rows is drawn without replacement (or all of them
    if fewer are available). The combined rows are returned in shuffled order.

    Randomness comes from the global ``np.random`` state, so results are
    reproducible after ``np.random.seed(...)``.

    Parameters
    ----------
    X : numpy.ndarray
        Feature rows, indexed along axis 0 in the same order as ``Y``.
    Y : numpy.ndarray
        1-D label array where 1 marks fraud and 0 marks normal rows.
    ratio : float, optional
        Number of normal rows to keep per fraud row. Must be non-negative.

    Returns
    -------
    (sample_x, sample_y) : tuple of numpy.ndarray
        The selected rows of ``X`` and the matching labels from ``Y``.

    Raises
    ------
    ValueError
        If ``ratio`` is negative or ``X`` and ``Y`` differ in length.
    """
    # A negative ratio would make the slice below count from the end and
    # silently return almost every normal row instead of a small sample.
    if ratio < 0:
        raise ValueError('ratio must be non-negative, got %r' % (ratio,))
    if len(X) != len(Y):
        raise ValueError('X and Y must have the same number of rows '
                         '(%d != %d)' % (len(X), len(Y)))

    index = np.arange(Y.shape[0])
    fraud_indices = index[Y == 1]
    normal_indices = index[Y == 0]
    normal_n = int(len(fraud_indices) * ratio)

    # Keep the draw order (permutation first, shuffle second) so that seeded
    # runs reproduce the same samples as before.
    random_normal_indices = np.random.permutation(normal_indices)[:normal_n]

    sample_indices = np.concatenate([fraud_indices, random_normal_indices])
    np.random.shuffle(sample_indices)

    return X[sample_indices], Y[sample_indices]

# Draw one balanced sample from the training split and show its dimensions.
resampled_train_x, resampled_train_y = resample(train_x, train_y)

for label, sampled in (('resampled_train_x:', resampled_train_x),
                       ('resampled_train_y:', resampled_train_y)):
    print(label, sampled.shape)

resampled_train_x: (762, 29)
resampled_train_y: (762,)


# Logistic Regression

전체적으로 약 99.9%의 accuracy를 보이지만, 실제 fraud data만 test했을 때는 약 51%로.. 실질적으로 못 맞추는 수준입니다.<br>
사실 일반적인 알고리즘으로 학습시키기 위해서는 over sampling (SMOTE 같은) 또는 under sampling이 필요합니다.<br>
sampling을 통해서 skewed data를 보정하는 것입니다.

#### resample 없이 데이터 학습뒤 예측하면..

In [6]:
# Baseline: fit on the full training split without any resampling.
lg = LogisticRegression().fit(train_x, train_y)

# Overall accuracy on the whole test split.
predicted_y = lg.predict(test_x)
accuracy_score(y_true=test_y, y_pred=predicted_y)

0.99908710429482317

In [7]:
# Score only the fraud rows of the test split. Every true label here is 1, so
# this accuracy is the fraction of frauds the model actually catches.
predicted_y = lg.predict(fraud_test_x)
accuracy_score(y_true=fraud_test_y, y_pred=predicted_y)

0.51351351351351349

#### resampled data로 학습뒤 예측하면...

In [8]:
# Re-fit from scratch on an under-sampled (equal fraud / normal) training set.
balanced_x, balanced_y = resample(train_x, train_y)
lg = LogisticRegression().fit(balanced_x, balanced_y)

# Overall accuracy on the whole (still imbalanced) test split.
predicted_y = lg.predict(test_x)
accuracy_score(y_true=test_y, y_pred=predicted_y)

0.9970506446448133

In [9]:
# Score only the fraud rows of the test split. Every true label here is 1, so
# this accuracy is the fraction of frauds the model actually catches.
predicted_y = lg.predict(fraud_test_x)
accuracy_score(y_true=fraud_test_y, y_pred=predicted_y)

0.77477477477477474

# Decision Tree

#### resample 없이 데이터 학습뒤 예측하면..

In [10]:
# Baseline tree: fit on the full training split without any resampling.
dtc = DecisionTreeClassifier(criterion='entropy', max_depth=10).fit(train_x, train_y)

# Overall accuracy on the whole test split.
predicted_y = dtc.predict(test_x)
accuracy_score(y_true=test_y, y_pred=predicted_y)

0.99925563888654811

In [11]:
# Score only the fraud rows of the test split. Every true label here is 1, so
# this accuracy is the fraction of frauds the tree actually catches.
predicted_y = dtc.predict(fraud_test_x)
accuracy_score(y_true=fraud_test_y, y_pred=predicted_y)

0.72972972972972971

#### resampled data로 학습뒤 예측하면...

In [12]:
# Re-fit a fresh tree on an under-sampled (equal fraud / normal) training set.
balanced_x, balanced_y = resample(train_x, train_y)
dtc = DecisionTreeClassifier(criterion='entropy', max_depth=10).fit(balanced_x, balanced_y)

# Overall accuracy on the whole (still imbalanced) test split.
predicted_y = dtc.predict(test_x)
print(accuracy_score(y_true=test_y, y_pred=predicted_y))

0.899974719811


In [13]:
# Score only the fraud rows of the test split. Every true label here is 1, so
# this accuracy is the fraction of frauds the tree actually catches.
predicted_y = dtc.predict(fraud_test_x)
accuracy_score(y_true=fraud_test_y, y_pred=predicted_y)

0.95495495495495497

# Deep Learning with Keras

하.. 드디어 딥러닝으로.. 해보면 어떤 결과가 나올 것인가.. Sampling VS UnSampling!

## Model

In [14]:
# Limit this process to a 0.1 fraction of GPU memory and register the
# resulting session as the one Keras should use.
config = tf.ConfigProto(
    gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.1)
)
set_session(tf.InteractiveSession(config=config))

In [15]:
def generate_model():
    """Build and compile the fraud classifier network.

    The network takes 29 input features and stacks four hidden stages, each
    made of Dense -> BatchNormalization -> ReLU -> Dropout, with widths
    256/160/128/96 and dropout rates 0.5/0.4/0.3/0.2. A single sigmoid unit
    produces the output. The model is compiled with binary cross-entropy
    loss, the Adam optimizer and the accuracy metric.

    Side effect: the global numpy random state is reset to seed 0 on every call.

    Returns
    -------
    A compiled keras ``Sequential`` model.
    """
    np.random.seed(0)

    # (units, dropout rate) of each hidden stage, in stacking order.
    hidden_stages = ((256, 0.5), (160, 0.4), (128, 0.3), (96, 0.2))

    model = Sequential()
    for stage_no, (units, drop_rate) in enumerate(hidden_stages):
        # Only the first Dense layer declares the input width.
        dense = Dense(units, input_dim=29) if stage_no == 0 else Dense(units)
        for layer in (dense,
                      BatchNormalization(),
                      Activation('relu'),
                      Dropout(drop_rate)):
            model.add(layer)

    # Single-unit sigmoid head for the binary fraud / normal decision.
    for layer in (Dense(1), Activation('sigmoid')):
        model.add(layer)

    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model


# Visualization: build one model instance and print its layer-by-layer summary.
model = generate_model()
model.summary()
# Optional graph rendering of the same model, currently disabled:
# SVG(model_to_dot(model, show_shapes=True).create(prog='dot', format='svg'))


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 256)               7680      
_________________________________________________________________
batch_normalization_1 (Batch (None, 256)               1024      
_________________________________________________________________
activation_1 (Activation)    (None, 256)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 160)               41120     
_________________________________________________________________
batch_normalization_2 (Batch (None, 160)               640       
_________________________________________________________________
activation_2 (Activation)    (None, 160)               0         
__________

#### resample 없이 데이터 학습뒤 예측하면..

In [16]:
# Baseline network trained on the full training split without resampling.
# `epochs=10` is spelled out because the default number of epochs differs
# between Keras releases; 10 matches the run recorded below.
model = generate_model()
model.fit(train_x, train_y, epochs=10, verbose=2)

Epoch 1/10
27s - loss: 0.0100 - acc: 0.9980
Epoch 2/10
27s - loss: 0.0041 - acc: 0.9993
Epoch 3/10
27s - loss: 0.0038 - acc: 0.9993
Epoch 4/10
27s - loss: 0.0036 - acc: 0.9994
Epoch 5/10
27s - loss: 0.0034 - acc: 0.9993
Epoch 6/10
27s - loss: 0.0035 - acc: 0.9993
Epoch 7/10
27s - loss: 0.0034 - acc: 0.9994
Epoch 8/10
27s - loss: 0.0034 - acc: 0.9994
Epoch 9/10
27s - loss: 0.0034 - acc: 0.9994
Epoch 10/10
27s - loss: 0.0033 - acc: 0.9994


<keras.callbacks.History at 0x7f01d2528898>

In [17]:
# Flatten the sigmoid outputs and threshold them at 0.5 to get hard 0/1
# labels, then report accuracy on the whole test split.
predicted_y = model.predict(test_x).ravel()
predicted_y = (predicted_y >= 0.5).astype(int)
print(accuracy_score(y_true=test_y, y_pred=predicted_y))

0.999311817084


In [18]:
# Same 0.5 thresholding, but scored only on the fraud rows of the test split
# (every true label is 1, so this is the fraction of frauds caught).
predicted_y = model.predict(fraud_test_x).ravel()
predicted_y = (predicted_y >= 0.5).astype(int)
accuracy_score(y_true=fraud_test_y, y_pred=predicted_y)

0.74774774774774777

#### resampled data로 학습뒤 예측하면...

In [19]:
# Train a fresh network on repeated under-samples of the training split.
model = generate_model()
# Defined for optional use only: it is NOT passed to fit() below, so every
# round runs the full 50 epochs.
early_stopping = EarlyStopping(monitor='loss', patience=10)

for i in range(100):
    # Each round keeps every fraud row and draws a new random set of normal rows.
    _train_data = resample(train_x, train_y, ratio=1)

    history = model.fit(*_train_data,
                        verbose=0,
                        epochs=50)

    # The accuracy metric is logged under 'acc' by older Keras releases and
    # under 'accuracy' by newer ones; accept either so np.mean never gets None.
    acc_history = history.history.get('acc', history.history.get('accuracy'))

    # Report the per-round averages over the 50 epochs.
    loss = np.mean(history.history.get('loss'))
    acc = np.mean(acc_history)
    epoch = len(history.epoch)
    print(f'[{i+1:2}] epoch:{epoch:<2} loss:{loss:<8.4} acc:{acc:<8.4}')

[ 1] epoch:50 loss:0.1747   acc:0.9351  
[ 2] epoch:50 loss:0.1154   acc:0.957   
[ 3] epoch:50 loss:0.09408  acc:0.9642  
[ 4] epoch:50 loss:0.09055  acc:0.9646  
[ 5] epoch:50 loss:0.06973  acc:0.9732  
[ 6] epoch:50 loss:0.06866  acc:0.9734  
[ 7] epoch:50 loss:0.05985  acc:0.9764  
[ 8] epoch:50 loss:0.05556  acc:0.9785  
[ 9] epoch:50 loss:0.05241  acc:0.9789  
[10] epoch:50 loss:0.04598  acc:0.9827  
[11] epoch:50 loss:0.04265  acc:0.9832  
[12] epoch:50 loss:0.03687  acc:0.9853  
[13] epoch:50 loss:0.04159  acc:0.9841  
[14] epoch:50 loss:0.03852  acc:0.9851  
[15] epoch:50 loss:0.03585  acc:0.9861  
[16] epoch:50 loss:0.04974  acc:0.9817  
[17] epoch:50 loss:0.03046  acc:0.9885  
[18] epoch:50 loss:0.03561  acc:0.9875  
[19] epoch:50 loss:0.03836  acc:0.986   
[20] epoch:50 loss:0.02893  acc:0.9888  
[21] epoch:50 loss:0.0328   acc:0.9882  
[22] epoch:50 loss:0.02712  acc:0.9898  
[23] epoch:50 loss:0.03774  acc:0.9865  
[24] epoch:50 loss:0.02368  acc:0.991   
[25] epoch:50 lo

In [20]:
# Flatten the sigmoid outputs and threshold them at 0.5 to get hard 0/1
# labels, then report accuracy on the whole test split.
predicted_y = model.predict(test_x).ravel()
predicted_y = (predicted_y >= 0.5).astype(int)
print(accuracy_score(y_true=test_y, y_pred=predicted_y))

0.997457936575


In [21]:
# Same 0.5 thresholding, but scored only on the fraud rows of the test split
# (every true label is 1, so this is the fraction of frauds caught).
predicted_y = model.predict(fraud_test_x).ravel()
predicted_y = (predicted_y >= 0.5).astype(int)
accuracy_score(y_true=fraud_test_y, y_pred=predicted_y)

0.8288288288288288