# Resample Fraud=1 Class
- Train using Keras DNN

In [1]:
import gc
from collections import Counter
from multiprocessing import cpu_count

import keras
import numpy as np
import pandas as pd
import xgboost as xgb

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss

from keras.layers import Dense, Dropout, BatchNormalization

from sklearn.metrics import roc_auc_score
from sklearn.utils import resample, shuffle

!python -c 'import tensorflow as tf; print(tf.version)'
print(cpu_count())

Using TensorFlow backend.


<module 'tensorflow._api.v1.version' from '/home/ec2-user/anaconda3/lib/python3.7/site-packages/tensorflow/_api/v1/version/__init__.py'>
4


# Read Dataset

In [2]:
def get_string_features(df):
    """Return the names of the columns of ``df`` whose dtype is ``object``.

    The names are listed in the same order as ``df.columns``.
    """
    object_dtype = np.dtype('object')
    return [name for name in df.columns if df[name].dtype == object_dtype]


def read_data(filename='./datasets/kfold/transaction_fold_0_0_0.csv',
              filename_identity=None,
              drop_string_features=True):
    """Load a transaction CSV, optionally joined with an identity CSV.

    Parameters
    ----------
    filename : str
        Path of the transaction CSV.
    filename_identity : str or None
        Optional path of a second CSV that is left-joined onto the
        transactions on the ``TransactionID`` column.
    drop_string_features : bool
        When true, every column reported by ``get_string_features`` is
        removed from the result.

    Returns
    -------
    pandas.DataFrame
    """
    frame = pd.read_csv(filename)

    if filename_identity is not None:
        # Left join: every transaction row is kept, matched or not.
        frame = frame.merge(pd.read_csv(filename_identity),
                            on='TransactionID',
                            how='left')

    if not drop_string_features:
        return frame

    return frame.drop(columns=get_string_features(frame))

In [59]:
# Alternative (disabled): load the full training set joined with the identity table.
# filename = './datasets/train_transaction.csv'
# filename_identity = './datasets/train_identity.csv'

# %time train = read_data(filename, filename_identity, drop_string_features=True)

# Load the default k-fold CSV, keeping the string columns so they can be encoded later.
%time train = read_data(drop_string_features=False)

CPU times: user 2.49 s, sys: 132 ms, total: 2.62 s
Wall time: 2.62 s


In [60]:
# (rows, columns) of the freshly loaded frame.
train.shape

(73816, 394)

In [61]:
# Row count per value of the isFraud label.
train.isFraud.value_counts()

0    71376
1     2440
Name: isFraud, dtype: int64

In [62]:
# Object-dtype columns that still need to be encoded.
get_string_features(train)

['ProductCD',
 'card4',
 'card6',
 'P_emaildomain',
 'R_emaildomain',
 'M1',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7',
 'M8',
 'M9']

# Encode Labels

In [50]:
# Preview the value -> code mapping for one column (`card4`): each distinct
# value gets the index at which `unique()` returned it.
uniques = train['card4'].unique()
mapping = {value: code for code, value in enumerate(uniques)}
print(mapping)

{'mastercard': 0, 'visa': 1, 'american express': 2, 'discover': 3, nan: 4}


In [75]:
def encode(df, mappings=None):
    """Label-encode the object-dtype columns of ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. Every column reported by ``get_string_features`` is
        overwritten with the codes from its mapping.
    mappings : dict or None
        ``{column: {value: code}}``. When ``None`` (the default) a mapping is
        learned for each string column, numbering its unique values in the
        order ``Series.unique()`` returns them. When supplied, the given
        mappings are applied as they are and left unmodified.

    Returns
    -------
    dict
        The mappings that were applied (the newly learned dict, or the
        ``mappings`` argument itself).

    Raises
    ------
    KeyError
        If ``mappings`` is supplied but has no entry for a string column.
    """
    # Decide the mode once, up front. Previously the argument was overwritten
    # with ``{}`` so the "learn" branch could never run.
    learn = mappings is None
    if learn:
        mappings = {}

    for feature in get_string_features(df):
        if learn:
            uniques = df[feature].unique()
            mappings[feature] = dict(zip(uniques, range(len(uniques))))

        df[feature] = df[feature].map(mappings[feature])

    return mappings


# Encode the string columns of `train` in place and keep the returned mappings.
# NOTE(review): re-running this cell once `train` is already encoded finds no
# string columns, so `mappings` is overwritten with an empty dict.
%time mappings = encode(train)

CPU times: user 2 ms, sys: 0 ns, total: 2 ms
Wall time: 2.01 ms


In [64]:
# Inspect the per-column value -> code mappings.
mappings

{'ProductCD': {'W': 0, 'C': 1, 'H': 2, 'R': 3, 'S': 4},
 'card4': {'mastercard': 0,
  'visa': 1,
  'american express': 2,
  'discover': 3,
  nan: 4},
 'card6': {'credit': 0,
  'debit': 1,
  'debit or credit': 2,
  nan: 3,
  'charge card': 4},
 'P_emaildomain': {'gmail.com': 0,
  'mail.com': 1,
  'yahoo.com': 2,
  'me.com': 3,
  'hotmail.com': 4,
  'anonymous.com': 5,
  'comcast.net': 6,
  'prodigy.net.mx': 7,
  nan: 8,
  'aol.com': 9,
  'live.com': 10,
  'icloud.com': 11,
  'outlook.com': 12,
  'juno.com': 13,
  'msn.com': 14,
  'yahoo.com.mx': 15,
  'q.com': 16,
  'earthlink.net': 17,
  'hotmail.es': 18,
  'live.com.mx': 19,
  'roadrunner.com': 20,
  'charter.net': 21,
  'verizon.net': 22,
  'optonline.net': 23,
  'ymail.com': 24,
  'sbcglobal.net': 25,
  'bellsouth.net': 26,
  'att.net': 27,
  'rocketmail.com': 28,
  'cox.net': 29,
  'centurylink.net': 30,
  'suddenlink.net': 31,
  'gmail': 32,
  'outlook.es': 33,
  'aim.com': 34,
  'frontier.com': 35,
  'twc.com': 36,
  'windstream.

In [65]:
# After encoding, no object-dtype columns should remain (expect an empty list).
get_string_features(train)

[]

# Undersample majority class

In [66]:
def preprocess(df):
    """Fill missing values with the sentinel -999 and drop the ID/time columns.

    Returns a new frame without ``TransactionID`` and ``TransactionDT``;
    the frame passed in is not modified.
    """
    dropped_columns = ['TransactionID', 'TransactionDT']
    filled = df.fillna(-999)
    return filled.drop(columns=dropped_columns)


# Replace `train` with its preprocessed version.
# NOTE(review): this rebinds `train`, so a second run of this cell would try
# to drop columns that are already gone -- reload `train` before re-running.
train = preprocess(train)

In [67]:
# Shape after preprocessing (the two dropped columns are gone).
train.shape

(73816, 392)

In [68]:
def split_features_labels(df):
    """Split ``df`` into ``(features, labels)``.

    ``features`` is ``df`` without the ``isFraud`` column; ``labels`` is the
    ``isFraud`` column itself.
    """
    target = 'isFraud'
    features = df.drop(columns=[target])
    labels = df[target]
    return features, labels

In [69]:
# def undersample_then_oversample(df, random_state=27):
#     """
#     Upsample minority class (isFraud=1), combine with majority class, and then shuffle them.
#     """
#     sampling_strategy = 0.5
# #     X = df.drop(columns=['isFraud'])
# #     y = df['isFraud']
#     X, y = split_features_labels(df)
    
#     # Declare Random Under Sampler
#     rus = NearMiss(version=3, 
#                    sampling_strategy=sampling_strategy,
#                    n_jobs=cpu_count(),
#                    random_state=random_state)
    
#     ros = SMOTE(n_jobs=cpu_count(),
#                random_state=random_state)
    
#     X, y = rus.fit_resample(X, y)
#     X, y = ros.fit_resample(X, y)
    
#     return X, y.astype('int32')


# %time X, y = undersample_then_oversample(train)

CPU times: user 33 s, sys: 695 ms, total: 33.7 s
Wall time: 25.2 s


In [14]:
def oversample(df, random_state=27, sampling_strategy=0.15):
    """Resample ``df`` with SMOTE and return the resampled features and labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns plus the ``isFraud`` label column.
    random_state : int
        Seed passed to ``SMOTE``.
    sampling_strategy : float, str or dict
        Forwarded unchanged to ``SMOTE(sampling_strategy=...)``. Defaults to
        0.15, the value that used to be hard-coded here.

    Returns
    -------
    tuple
        ``(X, y)`` as returned by ``SMOTE.fit_resample``, with ``y`` cast to
        ``int32``. No extra shuffling is applied.
    """
    X, y = split_features_labels(df)

    smote = SMOTE(sampling_strategy=sampling_strategy,
                  n_jobs=cpu_count(),
                  random_state=random_state)

    X, y = smote.fit_resample(X, y)

    return X, y.astype('int32')


# Build the resampled training features `X` and labels `y` from `train`.
%time X, y = oversample(train)

CPU times: user 2.13 s, sys: 492 ms, total: 2.62 s
Wall time: 1.8 s


In [70]:
# Shapes of the resampled features and labels.
# NOTE(review): the saved output of this cell looks stale -- the execution
# counts are out of order and the recorded row count does not look like what
# oversample() would produce with its current settings. Re-run to confirm.
print(X.shape)
print(y.shape)

(9760, 391)
(9760,)


In [71]:
# Label counts after resampling.
Counter(y)

Counter({0: 4880, 1: 4880})

# Train using XGB

In [72]:
# Dtype summary and memory footprint of `train`.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73816 entries, 0 to 73815
Columns: 392 entries, isFraud to V339
dtypes: float64(376), int64(16)
memory usage: 220.8 MB


In [21]:
# `train` is not used by the cells after this one (training uses X and y);
# drop it and run a garbage-collection pass to release its memory.
# `gc` is imported with the other imports at the top of the notebook.
del train

gc.collect()

630

In [73]:
seed = 27
model = xgb.XGBClassifier(objective='binary:logistic',
                            n_jobs=cpu_count(),
                            seed=seed)

%time model.fit(X, y, verbose=True)

CPU times: user 10.9 s, sys: 46.7 ms, total: 11 s
Wall time: 5.52 s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=2,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=27,
              silent=None, subsample=1, verbosity=1)

In [74]:
# Score the model with ROC-AUC on the second column of predict_proba
# (presumably the isFraud=1 probability -- confirm against model.classes_).
# NOTE(review): X and y are the same resampled data the model was fit on, so
# this is a training-set score, not an estimate of held-out performance.
pred_prob = model.predict_proba(X)

score = roc_auc_score(y, pred_prob[:, 1])

print('roc-auc score={}'.format(score))

roc-auc score=0.9272683754367105
