# Resample Fraud=1 Class
- Train using Keras DNN

In [1]:
import gc
from collections import Counter
from multiprocessing import cpu_count

import numpy as np
import pandas as pd
import keras
import xgboost as xgb

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss

from keras.layers import Dense, Dropout, BatchNormalization

from sklearn.metrics import roc_auc_score
from sklearn.utils import resample, shuffle

!python -c 'import tensorflow as tf; print(tf.version)'
print(cpu_count())

Using TensorFlow backend.


<module 'tensorflow._api.v1.version' from '/home/ec2-user/anaconda3/lib/python3.7/site-packages/tensorflow/_api/v1/version/__init__.py'>
2


# Read Dataset

In [2]:
def get_string_features(df):
    """Return the names of the columns of ``df`` whose dtype is ``object``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.

    Returns
    -------
    list
        Column names, in the frame's column order, with ``object`` dtype.
    """
    object_dtype = np.dtype('object')
    return [name for name in df.columns if df[name].dtype == object_dtype]


def read_data(filename='./datasets/kfold/transaction_fold_0_0_0.csv', drop_string_features=True):
    """Load a CSV file into a DataFrame, optionally dropping object-dtype columns.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.
    drop_string_features : bool
        When true, every column reported by ``get_string_features`` is removed
        from the returned frame.

    Returns
    -------
    pandas.DataFrame
        The loaded (and possibly column-reduced) data.
    """
    frame = pd.read_csv(filename)

    # Nothing to strip: hand back the frame exactly as parsed.
    if not drop_string_features:
        return frame

    return frame.drop(columns=get_string_features(frame))

In [3]:
train = read_data(drop_string_features=True)

In [4]:
train.shape

(73816, 380)

In [5]:
train.isFraud.value_counts()

0    71376
1     2440
Name: isFraud, dtype: int64

# Undersample majority class, then oversample minority class

In [6]:
def preprocess(df):
    """Fill missing values with -999 and drop the transaction ID/time columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``TransactionID`` and ``TransactionDT`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified.
    """
    id_columns = ['TransactionID', 'TransactionDT']
    filled = df.fillna(-999)
    return filled.drop(columns=id_columns)


# Rebinds `train` to its preprocessed version. The cell is not idempotent:
# running it a second time raises a KeyError because the ID columns that
# `preprocess` drops are already gone.
train = preprocess(train)

In [7]:
train.shape

(73816, 378)

In [8]:
def split_features_labels(df):
    """Split ``df`` into a feature frame and the ``isFraud`` label column.

    Returns
    -------
    tuple
        ``(features, labels)`` where ``features`` is ``df`` without the
        ``isFraud`` column and ``labels`` is the ``isFraud`` Series.
    """
    features = df.drop(columns=['isFraud'])
    labels = df['isFraud']
    return features, labels

In [11]:
def undersample_then_oversample(df, random_state=27, sampling_strategy=0.5):
    """
    Rebalance ``df`` by undersampling the majority class and then
    oversampling the minority class (isFraud=1).

    Step 1 uses NearMiss (version 3) to shrink the majority class until the
    minority/majority ratio equals ``sampling_strategy``.
    Step 2 uses SMOTE with its default strategy to synthesize minority
    samples until both classes have the same size.

    The function does not shuffle the result itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame that includes the ``isFraud`` label column.
    random_state : int
        Seed passed to SMOTE. NearMiss is deterministic and takes no seed.
    sampling_strategy : float
        Target minority/majority ratio after the NearMiss step.

    Returns
    -------
    tuple
        ``(X, y)`` as returned by the resamplers, with ``y`` cast to int32.
    """
    X, y = split_features_labels(df)

    n_workers = cpu_count()

    # NearMiss picks majority samples by distance to minority samples, so it
    # has no randomness; newer imbalanced-learn releases reject `random_state`.
    under_sampler = NearMiss(version=3,
                             sampling_strategy=sampling_strategy,
                             n_jobs=n_workers)

    over_sampler = SMOTE(n_jobs=n_workers,
                         random_state=random_state)

    X, y = under_sampler.fit_resample(X, y)
    X, y = over_sampler.fit_resample(X, y)

    return X, y.astype('int32')


%time X, y = undersample_then_oversample(train)

CPU times: user 37.7 s, sys: 756 ms, total: 38.5 s
Wall time: 26.2 s


In [12]:
print(X.shape)
print(y.shape)

(9760, 377)
(9760,)


In [13]:
Counter(y)

Counter({0: 4880, 1: 4880})

# Build Model

In [21]:
def get_model(input_dim, lr=1e-3):
    """Build and compile the fully connected binary classifier.

    The network stacks six hidden stages of widths 256, 128, 64, 32, 16 and 8.
    Each stage is ``Dense(relu, uniform init) -> BatchNormalization ->
    Dropout(0.2)``, followed by a single sigmoid output unit.

    Parameters
    ----------
    input_dim : int
        Number of input features.
    lr : float
        Learning rate for the Adam optimizer.

    Returns
    -------
    keras.models.Sequential
        Model compiled with binary cross-entropy loss and accuracy metric.
    """
    hidden_units = (256, 128, 64, 32, 16, 8)

    model = keras.models.Sequential()
    for position, units in enumerate(hidden_units):
        # Only the first layer declares the input width.
        first_layer_kwargs = {'input_dim': input_dim} if position == 0 else {}
        model.add(Dense(units,
                        activation='relu',
                        kernel_initializer='uniform',
                        **first_layer_kwargs))
        model.add(BatchNormalization())
        model.add(Dropout(0.2))

    model.add(Dense(1, activation='sigmoid', kernel_initializer='uniform'))

    model.compile(loss='binary_crossentropy',
                  optimizer=keras.optimizers.Adam(lr=lr),
                  metrics=['accuracy'])

    return model


model = get_model(X.shape[1], lr=1e-3)

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_8 (Dense)              (None, 256)               96768     
_________________________________________________________________
batch_normalization_7 (Batch (None, 256)               1024      
_________________________________________________________________
dropout_7 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 128)               32896     
_________________________________________________________________
batch_normalization_8 (Batch (None, 128)               512       
_________________________________________________________________
dropout_8 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 64)                8256      
__________

# Train

In [22]:
# Training hyperparameters; `batch_size` is reused by the prediction cell below.
epochs = 50
batch_size = 256

# Fit the DNN on the full resampled training set. No validation data is
# passed, so the per-epoch metrics reflect the training data only.
model.fit(X, y,
         epochs=epochs,
         batch_size=batch_size)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f5cdcd6bc88>

In [23]:
# NOTE: X, y are still the resampled *training* data here, so this is a
# training-set ROC-AUC, not a held-out score.
# `predict` returns the sigmoid outputs directly; `Sequential.predict_proba`
# was only an alias for it and is absent from newer Keras/TensorFlow releases.
pred_prob = model.predict(X,
                          batch_size=batch_size)

score = roc_auc_score(y, pred_prob)

print('roc-auc score={}'.format(score))


roc-auc score=0.6664095883163128


# Train using XGB

In [12]:
seed = 27
model = xgb.XGBClassifier(objective='binary:logistic',
                            n_thread=cpu_count(),
                            seed=seed)

%time model.fit(X, y, verbose=True)

CPU times: user 7.07 s, sys: 7.03 ms, total: 7.08 s
Wall time: 7.1 s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              n_thread=2, nthread=None, objective='binary:logistic',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=27, silent=None, subsample=1, verbosity=1)

# Read Test Data

In [13]:
# Release the training frame before loading the test fold; `gc` is imported
# with the other modules in the notebook's import cell.
del train

gc.collect()

151

In [14]:
# Load the held-out fold (object-dtype columns dropped by default) and apply
# the same preprocessing used for the training fold.
test = read_data('./datasets/kfold/transaction_fold_0_0_1.csv')

test = preprocess(test)

In [15]:
# From here on `X, y` refer to the test fold, replacing the resampled
# training data bound to the same names earlier.
X, y = split_features_labels(test)

In [17]:
type(X)

pandas.core.frame.DataFrame

In [18]:
# Convert the feature frame to a plain ndarray before prediction.
# NOTE(review): presumably needed so the input matches the array type the
# classifier was fitted on — confirm.
X = X.values

In [19]:
type(X)

numpy.ndarray

In [20]:
# `model` is the XGBClassifier at this point (it was rebound after the Keras
# model). predict_proba returns one column per class; column 1 is the
# probability of isFraud=1, which is what ROC-AUC is scored on.
pred_prob = model.predict_proba(X)

score = roc_auc_score(y, pred_prob[:, 1])

# Held-out ROC-AUC on the test fold.
print('roc-auc score={}'.format(score))

roc-auc score=0.8543205875032993
