# Resample Fraud=1 Class
- Train using XGBoost

In [1]:
import pandas as pd
import numpy as np
import keras

from collections import Counter

from sklearn.utils import resample, shuffle

from keras.layers import Dense, Dropout, BatchNormalization

from sklearn.metrics import roc_auc_score

from multiprocessing import cpu_count

from imblearn.under_sampling import NearMiss
from imblearn.over_sampling import SMOTE

import xgboost as xgb

!python -c 'import tensorflow as tf; print(tf.version)'
print(cpu_count())

Using TensorFlow backend.


<module 'tensorflow._api.v1.version' from '/home/ec2-user/anaconda3/lib/python3.7/site-packages/tensorflow/_api/v1/version/__init__.py'>
4


# Read Dataset

In [2]:
def get_string_features(df):
    """Return the names of every column in ``df`` whose dtype is ``object``.

    The names are returned as a list, in the same order as ``df.columns``.
    """
    object_dtype = np.dtype('object')
    return [name for name in df.columns if df[name].dtype == object_dtype]


def read_data(filename='./datasets/kfold/transaction_fold_0_0_0.csv', drop_string_features=True):
    """Read a CSV file into a DataFrame, optionally discarding object-dtype columns.

    Parameters
    ----------
    filename : str
        Location of the CSV file, passed straight to ``pd.read_csv``.
    drop_string_features : bool, default True
        When true, every column reported by ``get_string_features`` is removed.

    Returns
    -------
    pandas.DataFrame
        The loaded (and possibly column-filtered) frame.
    """
    frame = pd.read_csv(filename)

    if not drop_string_features:
        return frame

    return frame.drop(columns=get_string_features(frame))

In [3]:
# Load the full training set; object-dtype (string) columns are dropped on read.
filename = './datasets/train_transaction.csv'
train = read_data(filename, drop_string_features=True)

In [4]:
train.shape

(590540, 380)

In [5]:
train.isFraud.value_counts()

0    569877
1     20663
Name: isFraud, dtype: int64

# Preprocess and Oversample Minority Class

In [6]:
def preprocess(df, fill_value=-999):
    """Fill missing values with a sentinel and drop the transaction identifier columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is not modified; a new frame is returned.
    fill_value : scalar, default -999
        Value substituted for every missing entry.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with missing values replaced and the ``TransactionID`` /
        ``TransactionDT`` columns removed when they are present.
    """
    filled = df.fillna(fill_value)

    # errors='ignore' keeps the call idempotent: running it again on an
    # already-preprocessed frame no longer raises KeyError.
    return filled.drop(columns=['TransactionID', 'TransactionDT'], errors='ignore')


# Rebind `train` to the cleaned frame (missing values filled, ID/time columns dropped).
train = preprocess(train)

In [7]:
train.shape

(590540, 378)

In [8]:
def split_features_labels(df):
    """Separate ``df`` into its feature columns and its ``isFraud`` label column.

    Returns a ``(features, labels)`` tuple: ``features`` is ``df`` without the
    ``isFraud`` column and ``labels`` is the ``isFraud`` column itself.
    """
    label_column = 'isFraud'
    features = df.drop(columns=[label_column])
    labels = df[label_column]
    return features, labels

In [None]:
# def undersample_then_oversample(df, random_state=27):
#     """
#     Undersample the majority class with NearMiss, then oversample the minority class (isFraud=1) with SMOTE.
#     """
#     sampling_strategy = 0.5
# #     X = df.drop(columns=['isFraud'])
# #     y = df['isFraud']
#     X, y = split_features_labels(df)
    
#     # Declare the NearMiss (version 3) under-sampler
#     rus = NearMiss(version=3, 
#                    sampling_strategy=sampling_strategy,
#                    n_jobs=cpu_count(),
#                    random_state=random_state)
    
#     ros = SMOTE(n_jobs=cpu_count(),
#                random_state=random_state)
    
#     X, y = rus.fit_resample(X, y)
#     X, y = ros.fit_resample(X, y)
    
#     return X, y.astype('int32')


# %time X, y = undersample_then_oversample(train)

In [9]:
def oversample(df, random_state=27, sampling_strategy=0.15):
    """
    Oversample the minority class (isFraud=1) with SMOTE.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns and the ``isFraud`` label column.
    random_state : int, default 27
        Seed forwarded to SMOTE.
    sampling_strategy : float, default 0.15
        Forwarded to SMOTE. As a float it is the target ratio of minority to
        majority samples after resampling.

    Returns
    -------
    tuple
        ``(X, y)``: the resampled features and the resampled labels cast to int32.
    """
    X, y = split_features_labels(df)

    smote = SMOTE(sampling_strategy=sampling_strategy,
                  n_jobs=cpu_count(),
                  random_state=random_state)

    X, y = smote.fit_resample(X, y)

    return X, y.astype('int32')


# Rebalance the training data; %time reports how long the SMOTE resampling takes.
%time X, y = oversample(train)

CPU times: user 1min, sys: 4.48 s, total: 1min 5s
Wall time: 22.4 s


In [10]:
print(X.shape)
print(y.shape)

(655358, 377)
(655358,)


In [11]:
Counter(y)

Counter({0: 569877, 1: 85481})

# Train using XGB

In [12]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 590540 entries, 0 to 590539
Columns: 378 entries, isFraud to V339
dtypes: float64(376), int64(2)
memory usage: 1.7 GB


In [15]:
# The raw frame is no longer needed once X and y exist; drop it to free memory.
del train

import gc
gc.collect()

40

In [18]:
seed = 27
model = xgb.XGBClassifier(objective='binary:logistic',
                            n_jobs=cpu_count(),
                            seed=seed)

%time model.fit(X, y, verbose=True)

CPU times: user 13min 1s, sys: 4.3 s, total: 13min 5s
Wall time: 3min 20s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=4,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=27,
              silent=None, subsample=1, verbosity=1)

In [25]:
# NOTE(review): X and y are the same SMOTE-resampled data the model was fit on,
# so this is a training-set score, not a held-out estimate of performance.
pred_prob = model.predict_proba(X)

# Column 1 is used as the probability of the positive class (isFraud=1).
score = roc_auc_score(y, pred_prob[:, 1])

print('roc-auc score={}'.format(score))

roc-auc score=0.9258128610980769


# Save Model

In [27]:
import os
import pickle

filename_model = './models/xgboost/transaction_oversampling.pkl'

# Create the target directory if needed so open() does not fail when it is missing.
os.makedirs(os.path.dirname(filename_model), exist_ok=True)

# Serialize the fitted classifier to disk.
with open(filename_model, 'wb') as f:
    pickle.dump(model, f)

# Predict Test Data And Write to File

In [38]:
# Load the test set the same way as the training set (string columns dropped).
filename = './datasets/test_transaction.csv'
test = read_data(filename, drop_string_features=True)

In [39]:
# Keep the IDs now: preprocess() drops the TransactionID column, and they are
# needed again when writing the prediction file.
test_id = test['TransactionID']

In [30]:
test.shape

(506691, 379)

In [31]:
# Apply the same cleaning used for the training frame.
test = preprocess(test)

In [34]:
test.head()

Unnamed: 0,TransactionAmt,card1,card2,card3,card5,addr1,addr2,dist1,dist2,C1,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,31.95,10409,111.0,150.0,226.0,170.0,87.0,1.0,-999.0,6.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
1,49.0,4272,111.0,150.0,226.0,299.0,87.0,4.0,-999.0,3.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
2,171.0,4476,574.0,150.0,226.0,472.0,87.0,2635.0,-999.0,2.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
3,284.95,10989,360.0,150.0,166.0,205.0,87.0,17.0,-999.0,5.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
4,67.95,18018,452.0,150.0,117.0,264.0,87.0,6.0,-999.0,6.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0


In [35]:
# Release the resampled training features before scoring the test set.
del X
import gc
gc.collect()

1619

In [36]:
# Overwrites the earlier training-set `pred_prob`. The raw ndarray (test.values) is
# passed, so this assumes the test columns are in the same order as the training
# features -- TODO confirm.
%time pred_prob = model.predict_proba(test.values)

CPU times: user 5.82 s, sys: 2.39 s, total: 8.21 s
Wall time: 4.43 s


In [41]:
# Build the result table: one row per test transaction, with the probability
# taken from column 1 of `pred_prob` stored as isFraud.
test_result_df = pd.DataFrame({'TransactionID': test_id, 'isFraud': pred_prob[:, 1]})

# Write the table without the DataFrame index.
filename_prediction = './prediction_test_oversampling.csv'
test_result_df.to_csv(filename_prediction, index=False)