# Resample Fraud=1 Class
- Train using Keras DNN
- Memory Reduction Reference: https://www.kaggle.com/gemartin/load-data-reduce-memory-usage

In [1]:
import pandas as pd
import numpy as np
import keras

from sklearn.utils import resample, shuffle

from keras.layers import Dense, Dropout, BatchNormalization

from sklearn.metrics import roc_auc_score

from multiprocessing import cpu_count


import xgboost as xgb

!python -c 'import tensorflow as tf; print(tf.version)'

Using TensorFlow backend.


<module 'tensorflow._api.v1.version' from '/home/ec2-user/anaconda3/lib/python3.7/site-packages/tensorflow/_api/v1/version/__init__.py'>


# Read Dataset

In [2]:
def get_string_features(df):
    """Return the names of the columns of ``df`` whose dtype is ``object``.

    The result is a list, in the same order as ``df.columns``.
    """
    object_dtype = np.dtype('object')
    return [name for name in df.columns if df[name].dtype == object_dtype]


def read_data(filename='./datasets/train_transaction.csv', drop_string_features=True):
    """Load a CSV file into a DataFrame.

    Args:
        filename: Path of the CSV file passed to ``pd.read_csv``.
        drop_string_features: When true, the columns reported by
            ``get_string_features`` (object dtype) are dropped before returning.

    Returns:
        The loaded DataFrame, with or without its object-dtype columns.
    """
    frame = pd.read_csv(filename)

    # Nothing to remove: hand back the frame exactly as parsed.
    if not drop_string_features:
        return frame

    return frame.drop(columns=get_string_features(frame))

In [2]:
# Load the training CSV (read_data's default path) without its object-dtype
# columns; %time reports how long the load takes.
%time train = read_data(drop_string_features=True)

CPU times: user 22.4 s, sys: 3.46 s, total: 25.9 s
Wall time: 27.9 s


In [3]:
train.shape

(590540, 380)

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 590540 entries, 0 to 590539
Columns: 380 entries, TransactionID to V339
dtypes: float64(376), int64(4)
memory usage: 1.7 GB


In [5]:
# Replace every missing value with the sentinel -999.
train = train.fillna(-999)

In [6]:
train.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,0,86400,68.5,13926,-999.0,150.0,142.0,315.0,87.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
1,2987001,0,86401,29.0,2755,404.0,150.0,102.0,325.0,87.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
2,2987002,0,86469,59.0,4663,490.0,150.0,166.0,330.0,87.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
3,2987003,0,86499,50.0,18132,567.0,150.0,117.0,476.0,87.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
4,2987004,0,86506,50.0,4497,514.0,150.0,102.0,420.0,87.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Reduce Memory

In [7]:
def _narrowest_dtype(c_min, c_max, candidates, limits_of):
    """Return the first dtype in ``candidates`` whose range holds [c_min, c_max].

    ``limits_of`` is ``np.iinfo`` or ``np.finfo``. Returns ``None`` when no
    candidate fits; NaN or infinite bounds end up here because every range
    comparison against them is False.
    """
    for candidate in candidates:
        limits = limits_of(candidate)
        # Inclusive bounds: a value equal to the type's min/max still fits.
        if limits.min <= c_min and c_max <= limits.max:
            return candidate
    return None


def reduce_mem_usage(df, use_float16=True):
    """Downcast the columns of ``df`` in place to reduce memory usage.

    - Signed integer columns are cast to the narrowest of int8/int16/int32
      that holds the column's min and max.
    - Float columns are cast to the narrowest of float16/float32 whose range
      holds the column's min and max. Only the *range* is checked: float16
      keeps an 11-bit significand, so values may be rounded. Pass
      ``use_float16=False`` to stop at float32.
    - ``object`` columns are converted to ``category``.
    - Every other dtype (bool, unsigned int, datetime, categorical, pandas
      extension dtypes, ...) is left untouched.

    A column is never widened, and a column whose bounds are NaN/inf (e.g. an
    empty or all-NaN column) is left unchanged.

    Args:
        df: DataFrame to shrink. It is modified in place.
        use_float16: Allow float16 as a downcast target (default ``True``).

    Returns:
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # int64/float64 are omitted: they can never be narrower than the source.
    int_candidates = (np.int8, np.int16, np.int32)
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy signed-int and float columns are downcast.
        kind = col_type.kind if isinstance(col_type, np.dtype) else None
        if kind == 'i':
            candidates, limits_of = int_candidates, np.iinfo
        elif kind == 'f':
            candidates, limits_of = float_candidates, np.finfo
        else:
            continue

        # Never consider a target that is as wide as (or wider than) the source.
        narrower = [t for t in candidates
                    if np.dtype(t).itemsize < col_type.itemsize]
        if not narrower:
            continue

        target = _narrowest_dtype(df[col].min(), df[col].max(), narrower, limits_of)
        if target is not None:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the ratio: a frame that reports zero bytes would divide by zero.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df

In [7]:
# Downcast the training frame (timed), then print its dtype and memory summary.
%time train = reduce_mem_usage(train)

train.info()

Memory usage of dataframe is 1712.08 MB
Memory usage after optimization is: 479.27 MB
Decreased by 72.0%
CPU times: user 1min, sys: 2min 28s, total: 3min 28s
Wall time: 3min 28s
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 590540 entries, 0 to 590539
Columns: 380 entries, TransactionID to V339
dtypes: float16(332), float32(44), int16(1), int32(2), int8(1)
memory usage: 479.3 MB


# Resample minority class

In [8]:
def upsample_minority(df, random_state=27):
    """Balance ``df`` by oversampling the ``isFraud == 1`` rows.

    Rows with ``isFraud == 1`` are drawn with replacement until there are as
    many of them as rows with ``isFraud == 0``; both groups are concatenated
    and the result is shuffled.

    Args:
        df: DataFrame containing an ``isFraud`` column.
        random_state: Seed passed to both ``resample`` and ``shuffle``.

    Returns:
        The shuffled, class-balanced DataFrame.
    """
    fraud_rows = df[df['isFraud'] == 1]
    legit_rows = df[df['isFraud'] == 0]

    # Draw with replacement so the fraud group matches the non-fraud group size.
    fraud_oversampled = resample(
        fraud_rows,
        replace=True,
        n_samples=len(legit_rows),
        random_state=random_state,
    )

    balanced = pd.concat([legit_rows, fraud_oversampled])
    return shuffle(balanced, random_state=random_state)


# Build the class-balanced training frame (default random_state=27).
train_resampled = upsample_minority(train)

In [9]:
train_resampled.shape

(1139754, 380)

In [10]:
train_resampled.isFraud.value_counts()

1    569877
0    569877
Name: isFraud, dtype: int64

# Preprocess Data

In [10]:
def preprocess(df, is_test_dataset=False):
    """Drop the identifier columns and, for training data, split off the label.

    Args:
        df: DataFrame with ``TransactionID`` and ``TransactionDT`` columns
            (and ``isFraud`` when ``is_test_dataset`` is false).
        is_test_dataset: Set to true for unlabeled data.

    Returns:
        ``(features, labels)`` for training data, where ``labels`` is the
        ``isFraud`` column; only ``features`` for test data.
    """
    id_columns = ['TransactionID', 'TransactionDT']

    if not is_test_dataset:
        features = df.drop(columns=['isFraud'] + id_columns)
        return features, df['isFraud']

    return df.drop(columns=id_columns)

In [11]:
# Split the balanced frame into the feature matrix X and the isFraud labels y.
X, y = preprocess(train_resampled)

In [12]:
# NOTE(review): `train` was already filled with -999 before resampling, so this
# fillna looks redundant — confirm before removing.
X = X.fillna(-999)
# Cast the labels to int32.
y = y.astype('int32')

In [13]:
# Drop the references to the frames that are no longer needed (X and y were
# derived from them) and run a garbage-collection pass.
del train
del train_resampled

import gc
gc.collect()

116

# Build Model

In [14]:
def get_model(input_dim, lr=1e-3):
    """Build and compile the fully connected binary classifier.

    The network is six hidden stages of widths 256, 128, 64, 32, 16 and 8.
    Each stage is ``Dense(relu)`` -> ``BatchNormalization`` -> ``Dropout(0.2)``.
    A single sigmoid output unit follows. All Dense layers use the
    ``'uniform'`` kernel initializer.

    Args:
        input_dim: Number of input features.
        lr: Learning rate for the Adam optimizer.

    Returns:
        The compiled ``Sequential`` model (binary cross-entropy loss,
        accuracy metric).
    """
    hidden_widths = (256, 128, 64, 32, 16, 8)

    network = keras.models.Sequential()
    for position, width in enumerate(hidden_widths):
        # Only the first layer declares the input size.
        first_layer_args = {'input_dim': input_dim} if position == 0 else {}
        network.add(Dense(width,
                          activation='relu',
                          kernel_initializer='uniform',
                          **first_layer_args))
        network.add(BatchNormalization())
        network.add(Dropout(0.2))

    network.add(Dense(1, activation='sigmoid', kernel_initializer='uniform'))

    network.compile(loss='binary_crossentropy',
                    optimizer=keras.optimizers.Adam(lr=lr),
                    metrics=['accuracy'])

    return network


# Build the network for the number of feature columns in X and print its layers.
model = get_model(X.shape[1], lr=1e-3)

model.summary()

W0801 10:26:35.398051 140173097551552 deprecation_wrapper.py:119] From /home/ec2-user/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0801 10:26:35.420169 140173097551552 deprecation_wrapper.py:119] From /home/ec2-user/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0801 10:26:35.427523 140173097551552 deprecation_wrapper.py:119] From /home/ec2-user/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0801 10:26:35.507461 140173097551552 deprecation_wrapper.py:119] From /home/ec2-user/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.p

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 256)               96768     
_________________________________________________________________
batch_normalization_1 (Batch (None, 256)               1024      
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               32896     
_________________________________________________________________
batch_normalization_2 (Batch (None, 128)               512       
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 64)                8256      
__________

# Train

In [15]:
# Train for 100 epochs on all of X, y. No validation data or split is passed
# to fit, so no held-out metric is produced during training.
epochs = 100
batch_size = 256

model.fit(X, y,
         epochs=epochs,
         batch_size=batch_size)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100


Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x7f7c3c351978>

In [16]:
# ROC-AUC of the model on X, y.
# NOTE(review): X, y are the same resampled rows the model was just fitted on,
# so this is a training-set score, not a held-out estimate.
pred_prob = model.predict_proba(X,
                               batch_size=batch_size)

score = roc_auc_score(y, pred_prob)

print('roc-auc score={}'.format(score))


roc-auc score=0.9396283669312819


In [17]:
# Save the model as it stands after the first fit call.
model.save('./models/keras/dnn/transactions_all_100.h5')

In [18]:
# Call fit again on the same `model` object for another 100 epochs; training
# continues from the current weights rather than starting over.
epochs = 100
batch_size = 256

model.fit(X, y,
         epochs=epochs,
         batch_size=batch_size)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100


Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x7f7c39bac780>

In [19]:
# ROC-AUC after the second fit call.
# NOTE(review): still scored on the rows used for fitting (X, y).
pred_prob = model.predict_proba(X,
                               batch_size=batch_size)

score = roc_auc_score(y, pred_prob)

print('roc-auc score={}'.format(score))


roc-auc score=0.9498802193771105


In [20]:
# Save the model after the second fit call.
model.save('./models/keras/dnn/transactions_all_epochs_200.h5')

# Load Model and Test

In [21]:
# Reload the saved model from disk and score it on the same X, y, as a check
# that the saved file reproduces the in-memory model's score.
model_test = keras.models.load_model('./models/keras/dnn/transactions_all_epochs_200.h5')

pred_prob = model_test.predict_proba(X,
                               batch_size=batch_size)

score = roc_auc_score(y, pred_prob)

print('roc-auc score={}'.format(score))

roc-auc score=0.9498802193771105


# Read Test Data

In [4]:
# Load the test CSV the same way as the training data (object-dtype columns dropped).
%time test = read_data(filename='./datasets/test_transaction.csv', drop_string_features=True)

CPU times: user 20.2 s, sys: 3.13 s, total: 23.3 s
Wall time: 25.1 s


In [6]:
# Use the same -999 missing-value sentinel as the training data.
test = test.fillna(-999)

In [5]:
test.shape

(506691, 379)

In [8]:
# Downcast the test frame (timed), then print its dtype and memory summary.
%time test = reduce_mem_usage(test)

test.info()

Memory usage of dataframe is 1465.12 MB
Memory usage after optimization is: 418.47 MB
Decreased by 71.4%
CPU times: user 53.7 s, sys: 2min 4s, total: 2min 58s
Wall time: 2min 58s
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506691 entries, 0 to 506690
Columns: 379 entries, TransactionID to V339
dtypes: float16(324), float32(52), int16(1), int32(2)
memory usage: 418.5 MB


In [11]:
# Drop the identifier columns; test data has no label, so only features come back.
# This rebinds X, which previously held the training features.
X = preprocess(test, is_test_dataset=True)

In [13]:
# Load the saved model and predict a fraud probability for every test row.
batch_size = 256

model_test = keras.models.load_model('./models/keras/dnn/transactions_all_epochs_200.h5')

pred_prob = model_test.predict_proba(X,
                               batch_size=batch_size)


In [14]:
pred_prob.shape

(506691, 1)

In [15]:
# Build the result frame: TransactionID from the test set, plus the predicted
# probabilities stored in an isFraud column.
test_result_df = pd.DataFrame(test['TransactionID'], columns=['TransactionID'])
test_result_df['isFraud'] = pred_prob

In [16]:
test_result_df.head()

Unnamed: 0,TransactionID,isFraud
0,3663549,0.008168
1,3663550,0.008616
2,3663551,0.053845
3,3663552,0.172444
4,3663553,0.031337


In [17]:
# Write the predictions to CSV without the DataFrame index column.
test_result_df.to_csv('./prediction_test_dnn_epochs_200_2019_0801.csv', index=False)