# Resample Fraud=1 Class
- Train using XGBoost

In [1]:
import pandas as pd
import numpy as np
import keras

from collections import Counter

from sklearn.utils import resample, shuffle

from keras.layers import Dense, Dropout, BatchNormalization

from sklearn.metrics import roc_auc_score

from multiprocessing import cpu_count

from imblearn.under_sampling import NearMiss
from imblearn.over_sampling import SMOTE

import xgboost as xgb

from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm

!python -c 'import tensorflow as tf; print(tf.version)'
print(cpu_count())

Using TensorFlow backend.


<module 'tensorflow._api.v1.version' from '/home/ec2-user/anaconda3/lib/python3.7/site-packages/tensorflow/_api/v1/version/__init__.py'>
4


# Read Dataset

In [2]:
def get_string_features(df):
    """Return the names of the columns of ``df`` whose dtype is ``object``.

    The result is a list in the same order as ``df.columns``; it is empty
    when no column has the object dtype.
    """
    object_dtype = np.dtype('object')
    return [name for name in df.columns if df[name].dtype == object_dtype]


def read_data(filename='./datasets/kfold/transaction_fold_0_0_0.csv',
              filename_identity=None,
              drop_string_features=True):
    """Load a transaction CSV, optionally joined with an identity CSV.

    Args:
        filename: path of the transaction CSV.
        filename_identity: optional path of the identity CSV. When given, it
            is left-joined onto the transactions on ``TransactionID``, so
            every transaction row is kept.
        drop_string_features: when true, every object-dtype column is dropped
            from the result.

    Returns:
        The loaded (and possibly merged / reduced) DataFrame.
    """
    frame = pd.read_csv(filename)

    if filename_identity is not None:
        identity = pd.read_csv(filename_identity)
        frame = frame.merge(identity, on='TransactionID', how='left')

    if not drop_string_features:
        return frame

    return frame.drop(columns=get_string_features(frame))

In [3]:
# Load the training transactions and left-join the identity table on TransactionID.
# String columns are kept (drop_string_features=False) so that they can be integer-encoded later.
filename = './datasets/train_transaction.csv'
filename_identity = './datasets/train_identity.csv'

# %time train = read_data(filename, drop_string_features=False)

%time train = read_data(filename, filename_identity, drop_string_features=False)

# %time train = read_data(drop_string_features=False)

CPU times: user 25.4 s, sys: 7.74 s, total: 33.1 s
Wall time: 32.2 s


In [4]:
train.shape

(590540, 434)

In [5]:
train.isFraud.value_counts()

0    569877
1     20663
Name: isFraud, dtype: int64

# Label-Encode String Features

In [6]:
def encode(df, mappings=None):
    """Replace every object-dtype column of ``df`` with integer codes, in place.

    Args:
        df: DataFrame to encode; its object-dtype columns are overwritten.
        mappings: optional ``{column: {value: code}}`` dict. When omitted, a
            new mapping is built per column, numbering the unique values in
            order of first appearance. When given, it is applied as-is and a
            string column missing from it raises ``KeyError``.

    Returns:
        The mappings dict that was used (newly built or the one passed in).
    """
    fit_mode = mappings is None
    if fit_mode:
        mappings = {}

    for column in get_string_features(df):
        if fit_mode:
            # Codes follow the order in which values first occur in the column.
            mappings[column] = {
                value: code for code, value in enumerate(df[column].unique())
            }

        df[column] = df[column].map(mappings[column])

    return mappings


In [7]:
# Build the string -> integer mappings from the training frame.
# encode() also overwrites the string columns of `train` in place.
%time mappings = encode(train)

CPU times: user 8.79 s, sys: 212 ms, total: 9 s
Wall time: 4.51 s


In [8]:
mappings

{'ProductCD': {'W': 0, 'H': 1, 'C': 2, 'S': 3, 'R': 4},
 'card4': {'discover': 0,
  'mastercard': 1,
  'visa': 2,
  'american express': 3,
  nan: 4},
 'card6': {'credit': 0,
  'debit': 1,
  nan: 2,
  'debit or credit': 3,
  'charge card': 4},
 'P_emaildomain': {nan: 0,
  'gmail.com': 1,
  'outlook.com': 2,
  'yahoo.com': 3,
  'mail.com': 4,
  'anonymous.com': 5,
  'hotmail.com': 6,
  'verizon.net': 7,
  'aol.com': 8,
  'me.com': 9,
  'comcast.net': 10,
  'optonline.net': 11,
  'cox.net': 12,
  'charter.net': 13,
  'rocketmail.com': 14,
  'prodigy.net.mx': 15,
  'embarqmail.com': 16,
  'icloud.com': 17,
  'live.com.mx': 18,
  'gmail': 19,
  'live.com': 20,
  'att.net': 21,
  'juno.com': 22,
  'ymail.com': 23,
  'sbcglobal.net': 24,
  'bellsouth.net': 25,
  'msn.com': 26,
  'q.com': 27,
  'yahoo.com.mx': 28,
  'centurylink.net': 29,
  'servicios-ta.com': 30,
  'earthlink.net': 31,
  'hotmail.es': 32,
  'cfl.rr.com': 33,
  'roadrunner.com': 34,
  'netzero.net': 35,
  'gmx.de': 36,
  'sudd

In [9]:
get_string_features(train)

[]

# Reduce Memory Usage for KFold

In [10]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that holds their range.

    Columns are converted in place: ``df`` itself is modified and returned.
    Only columns whose dtype is one of int16/32/64 or float16/32/64 are
    touched. An integer column moves to the first of int8/16/32/64 whose
    limits strictly contain the column's min and max (it is left alone if
    none does). A float column moves to float16 or float32 on the same test
    and otherwise to float64.

    Note: the float test looks at the value range only, so values can lose
    precision once stored as float16 or float32.

    Args:
        df: pandas DataFrame to shrink.
        verbose: when true, print the resulting memory usage and the
            percentage saved; when false, print nothing.

    Returns:
        The same DataFrame object, with its columns downcast.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # No fallback: an integer column that fits no candidate keeps its dtype.
            target = next((t for t in int_candidates
                           if c_min > np.iinfo(t).min and c_max < np.iinfo(t).max),
                          None)
        else:
            # Columns that fit neither candidate (e.g. all-NaN) become float64.
            target = next((t for t in float_candidates
                           if c_min > np.finfo(t).min and c_max < np.finfo(t).max),
                          np.float64)

        if target is not None:
            df[col] = df[col].astype(target)

    # Honour `verbose`: the report is optional output, not part of the result.
    if verbose:
        end_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        # Skip the ratio when the frame reports zero bytes to avoid dividing by zero.
        if start_mem > 0:
            print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [11]:
# Downcast the numeric columns of `train`; the function modifies the frame in place and returns it.
%time train = reduce_mem_usage(train)

Memory usage after optimization is: 529.96 MB
Decreased by 73.0%
CPU times: user 1min 3s, sys: 2min 8s, total: 3min 11s
Wall time: 1min 35s


# Preprocess and oversample minority class (SMOTE)

In [12]:
def preprocess(df):
    """Return a copy of ``df`` with missing values set to -999 and the
    ``TransactionID`` / ``TransactionDT`` columns removed.

    The input frame is not modified.
    """
    non_feature_columns = ['TransactionID', 'TransactionDT']
    filled = df.fillna(-999)
    return filled.drop(columns=non_feature_columns)


# Rebinds `train` to the filled frame without TransactionID / TransactionDT.
# Not idempotent: re-running this cell fails because those columns are already gone.
train = preprocess(train)

In [13]:
train.shape

(590540, 432)

In [14]:
def split_features_labels(df):
    """Split ``df`` into ``(features, labels)``, using ``isFraud`` as the label column."""
    target = 'isFraud'
    features = df.drop(columns=[target])
    labels = df[target]
    return features, labels

In [15]:
# def undersample_then_oversample(df, random_state=27):
#     """
#     Upsample minority class (isFraud=1), combine with majority class, and then shuffle them.
#     """
#     sampling_strategy = 0.5
# #     X = df.drop(columns=['isFraud'])
# #     y = df['isFraud']
#     X, y = split_features_labels(df)
    
#     # Declare Random Under Sampler
#     rus = NearMiss(version=3, 
#                    sampling_strategy=sampling_strategy,
#                    n_jobs=cpu_count(),
#                    random_state=random_state)
    
#     ros = SMOTE(n_jobs=cpu_count(),
#                random_state=random_state)
    
#     X, y = rus.fit_resample(X, y)
#     X, y = ros.fit_resample(X, y)
    
#     return X, y.astype('int32')


# %time X, y = undersample_then_oversample(train)

In [16]:
def oversample(df, random_state=27, sampling_strategy=0.15):
    """Add synthetic minority-class (isFraud=1) rows to ``df`` using SMOTE.

    The frame is split into features and the ``isFraud`` label, SMOTE is
    fitted on them, and the resampled pair is returned. Nothing is shuffled
    here.

    NOTE(review): SMOTE interpolates between existing rows, so integer-encoded
    categorical columns and the -999 fill value may yield in-between values
    that do not correspond to a real category — confirm this is acceptable.

    Args:
        df: DataFrame containing the feature columns and ``isFraud``.
        random_state: seed passed to SMOTE.
        sampling_strategy: passed through to SMOTE; as a float it is the
            minority/majority ratio to reach after resampling.

    Returns:
        ``(X, y)``: the resampled features and the labels cast to int32.
    """
    X, y = split_features_labels(df)

    smote = SMOTE(sampling_strategy=sampling_strategy,
                  n_jobs=cpu_count(),
                  random_state=random_state)

    X, y = smote.fit_resample(X, y)

    return X, y.astype('int32')


# Oversample the whole training frame in one go.
# NOTE(review): this runs before any train/validation split, so synthetic rows can be
# derived from rows that later land in a validation fold — confirm the CV scores are trusted.
%time X, y = oversample(train)

CPU times: user 43.8 s, sys: 2.82 s, total: 46.6 s
Wall time: 16.1 s


In [17]:
print(X.shape)
print(y.shape)

(655358, 431)
(655358,)


In [18]:
Counter(y)

Counter({0: 569877, 1: 85481})

# Train using XGB

In [19]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 590540 entries, 0 to 590539
Columns: 432 entries, isFraud to DeviceInfo
dtypes: float16(354), float32(45), int16(4), int8(29)
memory usage: 525.4 MB


In [20]:
del train

import gc
gc.collect()

54

In [21]:
seed = 27

random_state = seed
# Despite its name, `epochs` is the number of cross-validation folds.
epochs = 5

kf = StratifiedKFold(n_splits=epochs,
                    random_state=random_state,
                    shuffle=True)

# `random_state` replaces the deprecated `seed` keyword of the scikit-learn wrapper.
model = xgb.XGBClassifier(objective='binary:logistic',
                            n_jobs=cpu_count(),
                            random_state=seed,
#                             n_estimators=500,
#                             max_depth=17,
#                             learning_rate=0.03,
                            subsample=0.9,
                            colsample_bytree=0.9,)

scores = []

# NOTE(review): X, y were oversampled before this split, so validation folds contain
# synthetic rows and rows that synthetic training rows were built from; the fold
# scores are likely optimistic. Resampling inside each fold would avoid this.
# tqdm wraps the split generator with an explicit total so the bar shows n/total and an ETA.
for index, (index_train, index_valid) in enumerate(tqdm(kf.split(X, y),
                                                        total=kf.get_n_splits())):
    X_train, y_train = X[index_train, :], y[index_train]

    # The same estimator object is fitted again on every fold's training split.
    model.fit(X_train, y_train)

    # Free the training slice before materialising the validation slice.
    del X_train
    del y_train
    gc.collect()

    X_valid, y_valid = X[index_valid, :], y[index_valid]
    y_pred_prob_valid = model.predict_proba(X_valid)
    score = roc_auc_score(y_valid, y_pred_prob_valid[:, 1])

    del X_valid
    del y_valid
    gc.collect()

    scores.append(score)
    print('Fold {}, roc-auc score={}'.format(index,
                                             score))

# After the loop, `model` holds the fit from the last fold only.
print('Average score={}'.format(np.mean(scores)))

1it [02:59, 179.05s/it]

Fold 0, roc-auc score=0.9597420454055278


2it [05:58, 179.31s/it]

Fold 1, roc-auc score=0.9603108516117753


3it [08:58, 179.50s/it]

Fold 2, roc-auc score=0.963254022059653


4it [11:57, 179.24s/it]

Fold 3, roc-auc score=0.961910065841882


5it [14:55, 178.88s/it]

Fold 4, roc-auc score=0.961236467526117
Average score=0.961290690488991





In [22]:
# Score the model left over from the last CV fold on the full resampled set.
# NOTE(review): most of X was used to fit this model, so this is not a held-out estimate.
pred_prob = model.predict_proba(X)

score = roc_auc_score(y, pred_prob[:, 1])

print('roc-auc score={}'.format(score))

roc-auc score=0.9616845932433111


In [None]:
print(model)

# Save Model

In [23]:
import pickle

# Persist the fitted classifier with pickle. open() does not create missing
# directories, so ./models/xgboost/ must already exist.
filename_model = './models/xgboost/kfold_encoded_transaction_identity_oversampling_2019_0805.pkl'

with open(filename_model, 'wb') as f:
    pickle.dump(model, f)

In [24]:
del X
del y

import gc
gc.collect()

80

# Predict Test Data And Write to File

In [36]:
# Load the test transactions left-joined with their identity rows, keeping string columns
# so the mappings fitted on the training data can be applied to them.
filename = './datasets/test_transaction.csv'
filename_identity = './datasets/test_identity.csv'
# filename_identity = None
%time test = read_data(filename, filename_identity, drop_string_features=False)

CPU times: user 21.8 s, sys: 5.76 s, total: 27.5 s
Wall time: 26.8 s


In [37]:
test.head()

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,3663549,18403224,31.95,W,10409,111.0,150.0,visa,226.0,debit,...,,,,,,,,,,
1,3663550,18403263,49.0,W,4272,111.0,150.0,visa,226.0,debit,...,,,,,,,,,,
2,3663551,18403310,171.0,W,4476,574.0,150.0,visa,226.0,debit,...,,,,,,,,,,
3,3663552,18403310,284.95,W,10989,360.0,150.0,visa,166.0,debit,...,,,,,,,,,,
4,3663553,18403317,67.95,W,18018,452.0,150.0,mastercard,117.0,debit,...,,,,,,,,,,


In [38]:
# Apply the mappings fitted on the training data; encode() overwrites the string columns of `test` in place.
# NOTE(review): categories absent from `mappings` presumably come out as NaN — confirm this is intended.
_ = encode(test, mappings)

In [39]:
# Keep the ids for the submission file; preprocess() drops this column from `test`.
test_id = test['TransactionID']

In [40]:
test.shape

(506691, 433)

In [41]:
# NOTE(review): `train` went through reduce_mem_usage() (float16/float32 downcast) before
# preprocess(), but `test` does not — confirm the resulting precision mismatch is acceptable.
%time test = preprocess(test)

CPU times: user 2.16 s, sys: 3.32 s, total: 5.48 s
Wall time: 3.28 s


In [44]:
test.head()

Unnamed: 0,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,31.95,0,10409,111.0,150.0,2,226.0,1,170.0,87.0,...,0.0,-999.0,0.0,0,0,0,0,0,0,0.0
1,49.0,0,4272,111.0,150.0,2,226.0,1,299.0,87.0,...,0.0,-999.0,0.0,0,0,0,0,0,0,0.0
2,171.0,0,4476,574.0,150.0,2,226.0,1,472.0,87.0,...,0.0,-999.0,0.0,0,0,0,0,0,0,0.0
3,284.95,0,10989,360.0,150.0,2,166.0,1,205.0,87.0,...,0.0,-999.0,0.0,0,0,0,0,0,0,0.0
4,67.95,0,18018,452.0,150.0,1,117.0,1,264.0,87.0,...,0.0,-999.0,0.0,0,0,0,0,0,0,0.0


In [31]:
# del X
# import gc
# gc.collect()

In [45]:
%time pred_prob = model.predict_proba(test.values)

CPU times: user 6.29 s, sys: 2.63 s, total: 8.92 s
Wall time: 4.68 s


In [46]:
test_result_df = pd.DataFrame(test_id, columns=['TransactionID'])
test_result_df['isFraud'] = pred_prob[:, 1]

filename_prediction = './prediction_test_kfold_oversampling_encoded_transaction_identity_2019_0805.csv'
%time test_result_df.to_csv(filename_prediction, index=False)

CPU times: user 2.15 s, sys: 309 µs, total: 2.15 s
Wall time: 2.08 s
