# Train using XGBoost

In [1]:
import pandas as pd
import numpy as np

from sklearn import preprocessing
from tqdm import tqdm

# Read Datasets

In [2]:
%time train = pd.read_csv('./datasets/train_transaction.csv')

CPU times: user 21.6 s, sys: 2.53 s, total: 24.1 s
Wall time: 23.3 s


In [3]:
%time train_identity = pd.read_csv('./datasets/train_identity.csv')

CPU times: user 597 ms, sys: 11.6 ms, total: 609 ms
Wall time: 599 ms


# Encode
- Special decoding of R_emaildomain and P_emaildomain
    - Combine these 2 features and fit them.

In [4]:
def get_string_features(df):
    """
    Get features with numpy.dtype of 'object'
    """
    string_features = []
    for feature in df.columns:
        if df[feature].dtype == np.dtype('object'):
            string_features.append(feature)
            
    return string_features


string_features = get_string_features(train)
print(string_features)

['ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain', 'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9']


# Encode String Features of Transaction File

In [5]:
import pdb

def gen_encoder(series, encode_nan=False):
    """
    Args:
        series (pandas.Series):
        
    Return:
        encoder (sklearn.preprocessing.LabelEncoder):
    """
    
    encoder = preprocessing.LabelEncoder()
    
    if encode_nan:
        encoder.fit(series)
    else:
        encoder.fit(series[series.notnull()])
    
    return encoder


def get_string_features(df):
    """
    Args:
        df (pandas.DataFrame):
    
    Return:
        string_features (list of str):
    """
    string_features = []
    
    for col in df.columns:
        if df[col].dtype is np.dtype('object'):
            string_features.append(col)
            
    return string_features
    

def generate_encoders(df, feature_names):
    encoder_dict = {}
    
    # fit 
    for feature in feature_names:
        
        # P_emaildomain and R_emaildomain needs to be encoded together.
        if feature is not 'P_emaildomain' and \
            feature is not 'R_emaildomain':
            
#             pdb.set_trace()
            
            # encode labels
            encoder = gen_encoder(df[feature], encode_nan=False)
            
            # record encoder
            encoder_dict[feature] = encoder
    
    
    # Combine P_emaildomain and R_emaildomain
    encoder = gen_encoder(pd.concat([df['P_emaildomain'], df['R_emaildomain']],ignore_index=True),
                         encode_nan=False)
    
    encoder_dict['P_emaildomain'] = encoder
    encoder_dict['R_emaildomain'] = encoder
    
    return encoder_dict
    

def encode_feature(series, encoder):
    """
    Args:
        series (pandas.Series):
        encoder (sklearn.preprocessing.LabelEncoder):
    
    Return:
        series_encoded (pandas.Series):
    """
    
    return pd.Series(
                    encoder.transform(series[series.notnull()]),
                    index=series[series.notnull()].index)


def encode_string_features(df, encoder_dict):
    """
    Args:
        df (pandas.DataFrame):
        encoder_dict (dict): encoders w.r.t. string features.
    """
    
    for feature, encoder in encoder_dict.items():
        # get encoder
        encoder = encoder_dict[feature]

        # Start encoding
        new_feature_name = '{}_encoded'.format(feature)
        new_index = df.columns.get_loc(feature) + 1
        
        df.insert(new_index, 
                  new_feature_name, 
                  encode_feature(df[feature], encoder))
            
            
encoder_dict = generate_encoders(train, string_features)

In [6]:
%time encode_string_features(train, encoder_dict)

CPU times: user 3.11 s, sys: 11.7 ms, total: 3.13 s
Wall time: 2.73 s


### Drop old string features before training

In [7]:
train = train.drop(columns=string_features)

# Encode String Features of Identity File

In [8]:
import re

def get_os(info):
    version = get_os_version(info)
    
    if version is np.nan or version is None:
        return info
    
    return info.strip(version).strip()


def get_os_version(info):
    
    results = re.findall('\d+\.\d\.*\d*', info)
    if len(results) > 0:
        return results[0]
    
    results = re.findall('\d+\_\d+\_*\d*', info)
    if len(results) > 0:
        return results[0]
    
    results = re.findall('\d+', info)
    if len(results) > 0:
        return results[0]
    
    return np.nan


def split_browser_info(info):
    
    if info is np.nan:
        return pd.Series([np.nan, np.nan, np.nan])
    
    known_developers = ['android browser',
                       'chrome',
                       'edge',
                        'firefox',
                        'google search application',
                        'google',
                        'Generic/Android',
                       'ie',
                        'line',
                        'Microsoft',
                        'opera',
                        'samsung',
                       'safari',
                        'ZTE/Blade',
                       'other']
    
    developer = np.nan
    version = np.nan
    
    # Grep version
    results = re.findall('\d+\.\d+', info)
    if len(results) > 0:
        str_version = results[0]
        version = float(str_version)
        info = info.replace(str_version, '')
        
    # Grep Developer
    for d in known_developers:
        if d in info:
            developer = d
            info = info.replace(d, '')
    
    # Remove 'for'
    if 'for' in info:
        info = info.replace('for', '')
    
    # remove consecutive whitespaces and keep only 1 whitespace.
    info = ' '.join(info.split())
    if len(info.strip()) == 0:
        info = np.nan
    
    if developer is np.nan:
        developer = info
        info = np.nan
        
    return pd.Series([developer, version, info])


def get_manufacture(device_info, return_nan=True):
    
    if device_info is None:
        return device_info
    
    starts_dict = {'0P': 'HTC',
                   '2P': 'HTC',
                   '40': 'ALCATEL',
                   '50': 'ALCATEL',
                   '60': 'ALCATEL',
                   '70': 'ALCATEL',
                   '80': 'ALCATEL',
                   '90': 'ALCATEL',
                   'Android': 'os-Android', #############
                   'ASUS': 'ASUS',
                   'Aquaris': 'Aquaris',
                   'ALCATEL': 'ALCATEL',
                   'AX': 'Bmobile', # Mexico
                   'B1-': 'Acer',
                   'B3-': 'Acer',
                   'BB': 'BlackBerry',
                   'BLADE': 'ZTE',
                   'BLN': 'HUAWEI',
                   'BLU': 'BLU', # American
                   'BND': 'HUAWEI',
                   'Build/': 'factory-image', ###############
                   'BV': 'Blackview',
                   'C1': 'sony',
                   'C2': 'sony',
                   'C6': 'sony',
                   'D2': 'sony',
                   'D5': 'sony',
                   'D6': 'sony',
                   'E2': 'sony',
                   'E501': 'Hyundai',
                   'E53': 'sony',
                   'E55': 'sony',
                   'E56': 'sony',
                   'E58': 'sony',
                   'E6': 'sony',
                   'E8': 'sony',
                   'en-': 'encoding',  ####################
                   'es-': 'encoding',  ####################
                   'F3': 'sony',
                   'F5': 'sony',
                   'F80': 'F2-mobile',
                   'F81': 'sony',
                   'F83': 'sony',
                   'H1': 'sony',
                   'H3': 'sony',
                   'H5': 'sony',
                   'IdeaTab': 'Lenovo',
                   'G255': 'Hyundai',
                   'G527': 'HUAWEI',
                   'G620': 'HUAWEI',
                   'G630': 'HUAWEI',
                   'G814': 'sony',
                   'G8341': 'sony',
                   'G3': 'sony',
                   'GT-': 'samsung',
                   'HTC': 'HTC',
                   'HUAWEI': 'HUAWEI',
                   'Hisense': 'Hisense',
                   'Ilium': 'Lanix', # Italian Company
                   'iOS': 'os-ios', ###################
                   'iPhone': 'iPhone', #######################
                   'iris': 'Lava',
                   'K1': 'koobee',
                   'K8': 'koobee',
                   'K9': 'koobee',
                   'Linux': 'os-linux', ######################
                   'KF': 'Amazon',  # Kindle Fire
                   'Lenovo': 'Lenovo',
                   'LG': 'LG',
                   'M431': 'Morphe',
                   'M4': 'M4',
                   'MacOS': 'MacOS',
                   'Mi ': 'xiaomi',
                   'Microsoft': 'Microsoft',  #############
                   'Moto': 'Moto',
                   'Nexus': 'google',
                   'ONE': 'ONEPLUS',
                   'P0': 'Asus',
                   'P4': 'Polaroid',
                   'P5': 'Polaroid',
                   'Pixel': 'google',
                   'Q': 'verizon',
                   'Redmi': 'xiaomi',
                   'RCT': 'RCA',
                   'rv:': 'firefox user-agent', ######################
                   'SAMSUNG': 'samsung',
                   'SCH-': 'samsung',
                   'SGH-': 'samsung',
                   'SGP': 'sony',
                   'SLA': 'HUAWEI',
                   'SPH-': 'samsung',
                   'SM-': 'samsung',
                   'STV100': 'BlackBerry',
                   'TA-': 'karbonn',
                   'verykool': 'verykool',
                   'VK': 'LG',
                   'VS': 'LG',
                   'Win': 'Windows',
                   'XT1': 'Motorola',
                   'Z410': 'Acer',
                   'Z5': 'ZTE',
                   'Z7': 'ZTE',
                   'Z8': 'ZTE',
                   'Z9 PLUS': 'QMobile',
                   'Z95': 'ZTE',
                   'Z96': 'ZTE',
                   'Z97': 'ZTE',
                   'Z98': 'ZTE',
                   'ZA': 'Zonda',
                   'ZTE': 'ZTE'}
    
    substring_dict = {'HUAWEI': 'HUAWEI',
                     'Build': 'substring-build'
                     }
    
    for prefix, manufacture in starts_dict.items():
        if device_info.lower().startswith(prefix.lower()):
            return manufacture
        
        
    for substring, manufacture in substring_dict.items():
        if substring.lower() in device_info.lower():
            return manufacture

    if return_nan:
        return
    else:
        return device_info
    
    
def insert_feature(feature_org, feature_new, extractor, df):
    """
    Args:
        feature_new (string):
        feature_org (string):
        df (pandas.DataFrame):
    """
    
    


extractors = [
              ['id_30', 'id_30_os', get_os],
              ['id_30', 'id_30_os_version', get_os_version],
              ['id_31', 'id_31_developer', lambda x: split_browser_info(x)[0]],
              ['id_31', 'id_31_version', lambda x: split_browser_info(x)[1]],
              ['id_31', 'id_31_annotation', lambda x: split_browser_info(x)[2]],
              ['id_33', 'id_33_width', lambda s: float(s.split('x')[0])],
              ['id_33', 'id_33_height', lambda s: float(s.split('x')[1])],
              ['DeviceInfo', 'DeviceInfo_map', lambda x: get_manufacture(x, return_nan=False)]
             ]

def insert_features(extractors, df):
    """
    Extract features of id_30, id_31, id_33, id_34.
    """
    feature_list = []
    
    for feature_org, feature_new, extractor in extractors:
        print('Extract features: {}'.format(feature_org))
        
        feature_list.append(feature_org)
        
        # get index of original feature
        index_feature_org = df.columns.get_loc(feature_org)

        # insert feature
        %time df.insert(index_feature_org+1, \
                    feature_new, \
                     df[feature_org].map(extractor, na_action='ignore'))
    
    
    return feature_list


feature_list = insert_features(extractors, train_identity)

Extract features: id_30
CPU times: user 364 ms, sys: 0 ns, total: 364 ms
Wall time: 364 ms
Extract features: id_30
CPU times: user 312 ms, sys: 0 ns, total: 312 ms
Wall time: 312 ms
Extract features: id_31
CPU times: user 18.1 s, sys: 197 ms, total: 18.3 s
Wall time: 17.9 s
Extract features: id_31
CPU times: user 17.7 s, sys: 145 ms, total: 17.8 s
Wall time: 17.7 s
Extract features: id_31
CPU times: user 18 s, sys: 211 ms, total: 18.2 s
Wall time: 17.9 s
Extract features: id_33
CPU times: user 62.6 ms, sys: 0 ns, total: 62.6 ms
Wall time: 62.4 ms
Extract features: id_33
CPU times: user 62.9 ms, sys: 0 ns, total: 62.9 ms
Wall time: 62.8 ms
Extract features: DeviceInfo
CPU times: user 3.2 s, sys: 0 ns, total: 3.2 s
Wall time: 3.2 s


### Drop Original Features

In [9]:
feature_list

['id_30', 'id_30', 'id_31', 'id_31', 'id_31', 'id_33', 'id_33', 'DeviceInfo']

In [10]:
train_identity = train_identity.drop(columns=feature_list)

# Check whether it has features with numpy object

In [11]:
for col in train_identity.columns:
    if train_identity[col].dtype == np.dtype('object'):
        print(col)

id_12
id_15
id_16
id_23
id_27
id_28
id_29
id_30_os_version
id_30_os
id_31_annotation
id_31_developer
id_34
id_35
id_36
id_37
id_38
DeviceType
DeviceInfo_map


### Encode Labels
- Encode All Features with Object Type

In [12]:
def transform_label(label, encoder):
    if label in encoder.classes_:
        return encoder.transform([label])[0]
    else:
        return -1
    

def encode_features(string_features, df):
    """
    Encode features and insert back to datafame.
    """
    encoder_dict = {}
    
    for feature in tqdm(string_features):
        print('Encode {}'.format(feature))
        # Get index for inserting encoded feature
        index_col = df.columns.get_loc(feature)
           
        # Encode Feature and insert
        encoder = preprocessing.LabelEncoder()
        
        encoder.fit(df[feature].dropna().unique())
        
        # Transform
        df.insert(index_col+1,
                 '{}_encoded'.format(feature),
                  df[feature].map(lambda x: transform_label(x, encoder),
                                                  na_action='ignore'))
        
        
        encoder_dict[feature] = encoder
        
        

string_features_identity = get_string_features(train_identity)
%time encoders = encode_features(string_features_identity, train_identity)

  0%|          | 0/18 [00:00<?, ?it/s]

Encode id_12


  6%|▌         | 1/18 [00:08<02:22,  8.41s/it]

Encode id_15


 11%|█         | 2/18 [00:17<02:16,  8.51s/it]

Encode id_16


 17%|█▋        | 3/18 [00:24<02:03,  8.24s/it]

Encode id_23


 22%|██▏       | 4/18 [00:25<01:22,  5.87s/it]

Encode id_27


 28%|██▊       | 5/18 [00:25<00:54,  4.21s/it]

Encode id_28


 33%|███▎      | 6/18 [00:33<01:04,  5.40s/it]

Encode id_29


 39%|███▉      | 7/18 [00:41<01:08,  6.22s/it]

Encode id_30_os_version


 44%|████▍     | 8/18 [01:01<01:41, 10.18s/it]

Encode id_30_os


 50%|█████     | 9/18 [01:07<01:21,  9.10s/it]

Encode id_31_annotation


 56%|█████▌    | 10/18 [01:14<01:07,  8.48s/it]

Encode id_31_developer


 61%|██████    | 11/18 [01:40<01:35, 13.71s/it]

Encode id_34


 67%|██████▋   | 12/18 [01:45<01:06, 11.11s/it]

Encode id_35


 72%|███████▏  | 13/18 [01:53<00:50, 10.18s/it]

Encode id_36


 78%|███████▊  | 14/18 [02:01<00:38,  9.53s/it]

Encode id_37


 83%|████████▎ | 15/18 [02:09<00:27,  9.10s/it]

Encode id_38


 89%|████████▉ | 16/18 [02:18<00:17,  8.81s/it]

Encode DeviceType


 94%|█████████▍| 17/18 [02:26<00:08,  8.63s/it]

Encode DeviceInfo_map


100%|██████████| 18/18 [03:59<00:00, 34.06s/it]

CPU times: user 4min, sys: 1.05 s, total: 4min 1s
Wall time: 3min 59s





### Drop Un-encoded Features

In [13]:
train_identity = train_identity.drop(columns=string_features_identity)

# Merge Transaction and Identity Files

In [14]:
print(train.shape)
print(train_identity.shape)

(590540, 394)
(144233, 45)


In [15]:
train = pd.merge(train, train_identity, on='TransactionID', how='left')

In [16]:
train.shape

(590540, 438)

# Prepare Features and Labels for training

In [17]:
y = train['isFraud']
train = train.drop(columns=['TransactionID', 'TransactionDT'])

print('{:,}'.format(train.memory_usage().sum()))

2,064,527,840


# Check whether it has string features

In [18]:
for col in train.columns:
    if train[col].dtype == np.dtype('object'):
        print(col)

# Train using XGBoost

In [19]:
import xgboost as xgb
import sklearn
from multiprocessing import cpu_count


print(cpu_count())

4


In [20]:
seed = 27
model = xgb.XGBClassifier(objective='binary:logistic',
                            n_thread=-1,
                            seed=seed)

%time model.fit(train, y, verbose=True)

CPU times: user 5min 32s, sys: 5.15 s, total: 5min 37s
Wall time: 5min 36s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, n_thread=-1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=27, silent=None, subsample=1,
       verbosity=1)

# Save Model

In [22]:
filename_model = './models/xgboost/transaction_identity.model'
model.save_model(filename_model)

# Predict

In [23]:
%time pred = model.predict(train)
%time pred_prob = model.predict_proba(train)

CPU times: user 3.12 s, sys: 3.57 s, total: 6.68 s
Wall time: 6.72 s
CPU times: user 3.07 s, sys: 3.59 s, total: 6.66 s
Wall time: 6.66 s


In [24]:
accuracy = sklearn.metrics.accuracy_score(y, pred)
print()
print('Accuracy: {}'.format(accuracy))

print(sklearn.metrics.classification_report(y, pred))
print(sklearn.metrics.roc_auc_score(y, pred))


Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    569877
           1       1.00      1.00      1.00     20663

   micro avg       1.00      1.00      1.00    590540
   macro avg       1.00      1.00      1.00    590540
weighted avg       1.00      1.00      1.00    590540

1.0


# Read Test Data

In [25]:
test = pd.read_csv('./datasets/test_transaction.csv')

In [26]:
test_identity = pd.read_csv('./datasets/test_identity.csv')

In [27]:
print(test.shape)
print(test_identity.shape)

(506691, 393)
(141907, 41)


# Encode Features of Test Transaction

In [32]:
string_features = get_string_features(test)
print(string_features)

['ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain', 'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9']


In [33]:
%time encode_string_features(test, encoder_dict)

CPU times: user 2.5 s, sys: 0 ns, total: 2.5 s
Wall time: 2.54 s


### Drop old string features before training

In [34]:
test = test.drop(columns=string_features)

# Extract Features in Identity File

In [35]:
feature_list = insert_features(extractors, test_identity)

Extract features: id_30
CPU times: user 342 ms, sys: 24 µs, total: 342 ms
Wall time: 356 ms
Extract features: id_30
CPU times: user 295 ms, sys: 0 ns, total: 295 ms
Wall time: 310 ms
Extract features: id_31
CPU times: user 17.8 s, sys: 261 ms, total: 18.1 s
Wall time: 17.8 s
Extract features: id_31
CPU times: user 17.2 s, sys: 101 ms, total: 17.3 s
Wall time: 17.1 s
Extract features: id_31
CPU times: user 17.4 s, sys: 288 ms, total: 17.7 s
Wall time: 17.3 s
Extract features: id_33
CPU times: user 61.2 ms, sys: 0 ns, total: 61.2 ms
Wall time: 61 ms
Extract features: id_33
CPU times: user 60.9 ms, sys: 0 ns, total: 60.9 ms
Wall time: 60.7 ms
Extract features: DeviceInfo
CPU times: user 3.1 s, sys: 0 ns, total: 3.1 s
Wall time: 3.09 s


# Drop Features

In [36]:
test_identity = test_identity.drop(columns=feature_list)

# Encode Identity Features

In [None]:
string_features_identity = get_string_features(train_identity)
%time encoders = encode_features(string_features_identity, train_identity, encode_dict)