# Resample Fraud=1 Class
- Train using XGBoost (`XGBClassifier`); the Keras imports below are not used in this notebook

In [1]:
import pandas as pd
import numpy as np
import keras

from collections import Counter

from sklearn.utils import resample, shuffle

from keras.layers import Dense, Dropout, BatchNormalization

from sklearn.metrics import roc_auc_score

from multiprocessing import cpu_count

from imblearn.under_sampling import NearMiss
from imblearn.over_sampling import SMOTE

import xgboost as xgb

!python -c 'import tensorflow as tf; print(tf.version)'
print(cpu_count())

Using TensorFlow backend.


<module 'tensorflow._api.v1.version' from '/home/ec2-user/anaconda3/lib/python3.7/site-packages/tensorflow/_api/v1/version/__init__.py'>
4


# Read Dataset

In [2]:
def get_string_features(df):
    """Return the names of the columns in ``df`` whose dtype is ``object``.

    The names are returned as a list, in the same order as ``df.columns``.
    An empty list is returned when no column has ``object`` dtype.
    """
    object_dtype = np.dtype('object')
    return [name for name in df.columns if df[name].dtype == object_dtype]


def read_data(filename='./datasets/kfold/transaction_fold_0_0_0.csv',
              filename_identity=None,
              drop_string_features=True):
    """Load a transaction CSV, optionally joined with an identity CSV.

    Parameters
    ----------
    filename : path or file-like
        CSV passed to ``pd.read_csv`` for the transaction table.
    filename_identity : path or file-like, optional
        When given, this CSV is read as well and left-joined onto the
        transaction table on ``TransactionID``.
    drop_string_features : bool, default True
        When true, every ``object``-dtype column (as reported by
        ``get_string_features``) is removed from the result.

    Returns
    -------
    pandas.DataFrame
    """
    frame = pd.read_csv(filename)

    if filename_identity is not None:
        # Left join: every transaction row is kept, with NaN in the identity
        # columns where no identity row matches.
        frame = frame.merge(pd.read_csv(filename_identity),
                            on='TransactionID',
                            how='left')

    if not drop_string_features:
        return frame

    return frame.drop(columns=get_string_features(frame))

In [10]:
# Path of the training transaction table.
filename = './datasets/train_transaction.csv'
# The identity table is not merged in this run (kept for reference):
# filename_identity = './datasets/train_identity.csv'

# Disabled alternative: merge the identity table and drop all string columns.
# %time train = read_data(filename, filename_identity, drop_string_features=True)

# Keep the string columns; they are integer-encoded by encode() further down.
%time train = read_data(filename, drop_string_features=False)

CPU times: user 21.3 s, sys: 1.66 s, total: 23 s
Wall time: 22.3 s


In [4]:
train.shape

(590540, 394)

In [5]:
train.isFraud.value_counts()

0    569877
1     20663
Name: isFraud, dtype: int64

# Encode Labels

In [40]:
def encode(df, mappings=None):
    """Integer-encode the ``object``-dtype columns of ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is MUTATED: every column reported by
        ``get_string_features`` is overwritten with its mapped values.
    mappings : dict, optional
        ``{column: {value: code}}``. When ``None``, a new mapping is built
        per column from ``df[column].unique()`` (codes follow order of first
        appearance; NaN, if present, gets its own code). When given, the
        supplied mappings are applied as-is and not modified.

    Returns
    -------
    dict
        The mappings that were applied (the object passed in, or the newly
        built one).

    Notes
    -----
    When ``mappings`` is supplied, a string column of ``df`` that has no entry
    in it raises ``KeyError``. Values with no entry in a column's mapping are
    left to ``Series.map``, which yields NaN for them.
    """
    build_mappings = mappings is None
    if build_mappings:
        mappings = {}

    for column in get_string_features(df):
        if build_mappings:
            distinct_values = df[column].unique()
            mappings[column] = {value: code
                                for code, value in enumerate(distinct_values)}

        df[column] = df[column].map(mappings[column])

    return mappings


In [None]:
# Build the value -> integer mappings from the training frame. encode() also
# overwrites train's string columns in place, so they are numeric afterwards.
%time mappings = encode(train)

In [13]:
mappings

{'ProductCD': {'W': 0, 'H': 1, 'C': 2, 'S': 3, 'R': 4},
 'card4': {'discover': 0,
  'mastercard': 1,
  'visa': 2,
  'american express': 3,
  nan: 4},
 'card6': {'credit': 0,
  'debit': 1,
  nan: 2,
  'debit or credit': 3,
  'charge card': 4},
 'P_emaildomain': {nan: 0,
  'gmail.com': 1,
  'outlook.com': 2,
  'yahoo.com': 3,
  'mail.com': 4,
  'anonymous.com': 5,
  'hotmail.com': 6,
  'verizon.net': 7,
  'aol.com': 8,
  'me.com': 9,
  'comcast.net': 10,
  'optonline.net': 11,
  'cox.net': 12,
  'charter.net': 13,
  'rocketmail.com': 14,
  'prodigy.net.mx': 15,
  'embarqmail.com': 16,
  'icloud.com': 17,
  'live.com.mx': 18,
  'gmail': 19,
  'live.com': 20,
  'att.net': 21,
  'juno.com': 22,
  'ymail.com': 23,
  'sbcglobal.net': 24,
  'bellsouth.net': 25,
  'msn.com': 26,
  'q.com': 27,
  'yahoo.com.mx': 28,
  'centurylink.net': 29,
  'servicios-ta.com': 30,
  'earthlink.net': 31,
  'hotmail.es': 32,
  'cfl.rr.com': 33,
  'roadrunner.com': 34,
  'netzero.net': 35,
  'gmx.de': 36,
  'sudd

In [14]:
get_string_features(train)

[]

# Preprocess and oversample minority class

In [15]:
def preprocess(df):
    """Fill missing values with -999 and drop the ID / timestamp columns.

    Returns a new frame without ``TransactionID`` and ``TransactionDT``;
    the input frame itself is not modified.
    """
    non_feature_columns = ['TransactionID', 'TransactionDT']
    filled = df.fillna(-999)
    return filled.drop(columns=non_feature_columns)


# Rebinds `train` to the processed frame. Re-running this cell on the result
# fails because TransactionID / TransactionDT have already been dropped.
train = preprocess(train)

In [16]:
train.shape

(590540, 392)

In [17]:
def split_features_labels(df):
    """Split ``df`` into ``(features, labels)``.

    ``features`` is ``df`` without the ``isFraud`` column and ``labels`` is
    the ``isFraud`` column itself.
    """
    label_column = 'isFraud'
    features = df.drop(columns=[label_column])
    labels = df[label_column]
    return features, labels

In [18]:
# def undersample_then_oversample(df, random_state=27):
#     """
#     Upsample minority class (isFraud=1), combine with majority class, and then shuffle them.
#     """
#     sampling_strategy = 0.5
# #     X = df.drop(columns=['isFraud'])
# #     y = df['isFraud']
#     X, y = split_features_labels(df)
    
#     # Declare Random Under Sampler
#     rus = NearMiss(version=3, 
#                    sampling_strategy=sampling_strategy,
#                    n_jobs=cpu_count(),
#                    random_state=random_state)
    
#     ros = SMOTE(n_jobs=cpu_count(),
#                random_state=random_state)
    
#     X, y = rus.fit_resample(X, y)
#     X, y = ros.fit_resample(X, y)
    
#     return X, y.astype('int32')


# %time X, y = undersample_then_oversample(train)

In [19]:
def oversample(df, random_state=27, sampling_strategy=0.15):
    """
    Oversample the minority class of ``df`` with SMOTE.

    The frame is split into features and the ``isFraud`` label with
    ``split_features_labels``; SMOTE then adds synthetic minority-class rows.
    This function does no explicit shuffling of its own.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``isFraud`` column. Presumably already numeric
        with no missing values (the notebook runs ``encode`` and
        ``preprocess`` first) -- confirm before using it elsewhere.
    random_state : int, default 27
        Seed forwarded to SMOTE.
    sampling_strategy : float, default 0.15
        Forwarded to SMOTE. As a float it is the target ratio of
        minority-class to majority-class samples after resampling.

    Returns
    -------
    X, y
        The resampled features and labels as returned by
        ``SMOTE.fit_resample``, with the labels cast to ``int32``.
    """
    X, y = split_features_labels(df)

    smote = SMOTE(sampling_strategy=sampling_strategy,
                  n_jobs=cpu_count(),
                  random_state=random_state)

    X, y = smote.fit_resample(X, y)

    return X, y.astype('int32')


# Resample the training frame using oversample()'s default arguments.
%time X, y = oversample(train)

CPU times: user 1min, sys: 4.26 s, total: 1min 4s
Wall time: 22.2 s


In [20]:
print(X.shape)
print(y.shape)

(655358, 391)
(655358,)


In [21]:
Counter(y)

Counter({0: 569877, 1: 85481})

# Train using XGB

In [22]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 590540 entries, 0 to 590539
Columns: 392 entries, isFraud to V339
dtypes: float64(376), int64(16)
memory usage: 1.7 GB


In [23]:
# Training only needs the resampled X, y from here on, so drop the original
# frame (about 1.7 GB per train.info() above) and run a garbage collection.
del train

import gc
gc.collect()

54

In [24]:
# Same value as the default random_state of oversample().
seed = 27
# Only objective, n_jobs and seed are set; every other hyperparameter keeps
# the XGBClassifier default (see the repr printed below the cell).
model = xgb.XGBClassifier(objective='binary:logistic',
                            n_jobs=cpu_count(),
                            seed=seed)

# Fit on the SMOTE-resampled data; no rows are held out for validation.
%time model.fit(X, y, verbose=True)

CPU times: user 12min 58s, sys: 4.27 s, total: 13min 2s
Wall time: 3min 19s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=4,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=27,
              silent=None, subsample=1, verbosity=1)

In [25]:
# NOTE(review): X, y are exactly the (SMOTE-resampled) data the model was fit
# on, so this is a training-set ROC-AUC that includes synthetic rows. It is
# not an estimate of performance on unseen data.
pred_prob = model.predict_proba(X)

# Column 1 of predict_proba is the probability of the positive class (isFraud=1).
score = roc_auc_score(y, pred_prob[:, 1])

print('roc-auc score={}'.format(score))

roc-auc score=0.9359134334539352


# Save Model

In [26]:
import pickle

# Output path; open() does not create missing directories, so
# ./models/xgboost/ must already exist.
filename_model = './models/xgboost/encoded_transaction_oversampling.pkl'

# Serialise the fitted classifier. Only ever unpickle this file from a trusted
# location: pickle.load can execute arbitrary code.
with open(filename_model, 'wb') as f:
    pickle.dump(model, f)

# Predict Test Data And Write to File

In [28]:
# Rebinds `filename` (previously the training CSV) to the test transactions.
filename = './datasets/test_transaction.csv'
# Identity table is not merged, matching how the training data was loaded.
# filename_identity = './datasets/test_identity.csv'
filename_identity = None
# Keep string columns so they can be encoded with the training mappings.
%time test = read_data(filename, filename_identity, drop_string_features=False)

CPU times: user 18.6 s, sys: 1.48 s, total: 20.1 s
Wall time: 21.2 s


In [33]:
test.head()

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,3663549,18403224,31.95,W,10409,111.0,150.0,visa,226.0,debit,...,,,,,,,,,,
1,3663550,18403263,49.0,W,4272,111.0,150.0,visa,226.0,debit,...,,,,,,,,,,
2,3663551,18403310,171.0,W,4476,574.0,150.0,visa,226.0,debit,...,,,,,,,,,,
3,3663552,18403310,284.95,W,10989,360.0,150.0,visa,166.0,debit,...,,,,,,,,,,
4,3663553,18403317,67.95,W,18018,452.0,150.0,mastercard,117.0,debit,...,,,,,,,,,,


In [41]:
# Apply the mappings learned on train so the integer codes agree; encode()
# overwrites test's string columns in place. Values that have no entry in the
# training mappings come out of Series.map as NaN.
encode(test, mappings)

{'ProductCD': {'W': 0, 'H': 1, 'C': 2, 'S': 3, 'R': 4},
 'card4': {'discover': 0,
  'mastercard': 1,
  'visa': 2,
  'american express': 3,
  nan: 4},
 'card6': {'credit': 0,
  'debit': 1,
  nan: 2,
  'debit or credit': 3,
  'charge card': 4},
 'P_emaildomain': {nan: 0,
  'gmail.com': 1,
  'outlook.com': 2,
  'yahoo.com': 3,
  'mail.com': 4,
  'anonymous.com': 5,
  'hotmail.com': 6,
  'verizon.net': 7,
  'aol.com': 8,
  'me.com': 9,
  'comcast.net': 10,
  'optonline.net': 11,
  'cox.net': 12,
  'charter.net': 13,
  'rocketmail.com': 14,
  'prodigy.net.mx': 15,
  'embarqmail.com': 16,
  'icloud.com': 17,
  'live.com.mx': 18,
  'gmail': 19,
  'live.com': 20,
  'att.net': 21,
  'juno.com': 22,
  'ymail.com': 23,
  'sbcglobal.net': 24,
  'bellsouth.net': 25,
  'msn.com': 26,
  'q.com': 27,
  'yahoo.com.mx': 28,
  'centurylink.net': 29,
  'servicios-ta.com': 30,
  'earthlink.net': 31,
  'hotmail.es': 32,
  'cfl.rr.com': 33,
  'roadrunner.com': 34,
  'netzero.net': 35,
  'gmx.de': 36,
  'sudd

In [42]:
# Keep the IDs for the submission file before preprocess() drops the column.
test_id = test['TransactionID']

In [43]:
test.shape

(506691, 393)

In [46]:
# Same fill-and-drop step that was applied to the training frame.
%time test = preprocess(test)

In [47]:
test.head()

Unnamed: 0,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,31.95,0,10409,111.0,150.0,2,226.0,1,170.0,87.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
1,49.0,0,4272,111.0,150.0,2,226.0,1,299.0,87.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
2,171.0,0,4476,574.0,150.0,2,226.0,1,472.0,87.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
3,284.95,0,10989,360.0,150.0,2,166.0,1,205.0,87.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
4,67.95,0,18018,452.0,150.0,1,117.0,1,264.0,87.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0


In [48]:
# The training features are no longer needed once the model is fitted.
del X
import gc
gc.collect()

760

In [49]:
# Passes the raw array rather than the DataFrame -- presumably because the
# model was fit on the output of SMOTE.fit_resample; TODO confirm. Features are
# matched by position, so test's columns must be in the training order.
%time pred_prob = model.predict_proba(test.values)

CPU times: user 5.75 s, sys: 2.48 s, total: 8.23 s
Wall time: 4.26 s


In [51]:
# Two-column output: TransactionID and the predicted probability of isFraud=1
# (column 1 of predict_proba).
test_result_df = pd.DataFrame(test_id, columns=['TransactionID'])
test_result_df['isFraud'] = pred_prob[:, 1]

# Written to the notebook's working directory, without the index column.
filename_prediction = './prediction_test_oversampling_encoded_transaction_2019_0805.csv'
%time test_result_df.to_csv(filename_prediction, index=False)

CPU times: user 2.15 s, sys: 18.2 ms, total: 2.17 s
Wall time: 2.04 s
