# Train using XGBoost

In [1]:
import pandas as pd
import numpy as np

from sklearn import preprocessing
from tqdm import tqdm
import pickle

# Read Datasets

In [2]:
%time train = pd.read_csv('./datasets/train_transaction.csv')

CPU times: user 21.2 s, sys: 2.72 s, total: 23.9 s
Wall time: 23.2 s


In [3]:
%time train_identity = pd.read_csv('./datasets/train_identity.csv')

CPU times: user 598 ms, sys: 8.09 ms, total: 606 ms
Wall time: 598 ms


# Encode
- Special decoding of R_emaildomain and P_emaildomain
    - Combine these 2 features and fit them.

In [4]:
def get_string_features(df):
    """
    Get features with numpy.dtype of 'object'
    """
    string_features = []
    for feature in df.columns:
        if df[feature].dtype == np.dtype('object'):
            string_features.append(feature)
            
    return string_features


string_features = get_string_features(train)
print(string_features)

['ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain', 'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9']


# Encode String Features of Transaction File

In [5]:
import pdb

def gen_encoder(series, encode_nan=False):
    """
    Args:
        series (pandas.Series):
        
    Return:
        encoder (sklearn.preprocessing.LabelEncoder):
    """
    
    encoder = preprocessing.LabelEncoder()
    
    if encode_nan:
        encoder.fit(series)
    else:
        encoder.fit(series[series.notnull()])
    
    return encoder


def get_string_features(df):
    """
    Args:
        df (pandas.DataFrame):
    
    Return:
        string_features (list of str):
    """
    string_features = []
    
    for col in df.columns:
        if df[col].dtype is np.dtype('object'):
            string_features.append(col)
            
    return string_features
    

def generate_encoders(df, feature_names):
    encoder_dict = {}
    
    # fit 
    for feature in feature_names:
        
        # P_emaildomain and R_emaildomain needs to be encoded together.
        if feature is not 'P_emaildomain' and \
            feature is not 'R_emaildomain':
            
#             pdb.set_trace()
            
            # encode labels
            encoder = gen_encoder(df[feature], encode_nan=False)
            
            # record encoder
            encoder_dict[feature] = encoder
    
    
    # Combine P_emaildomain and R_emaildomain
    encoder = gen_encoder(pd.concat([df['P_emaildomain'], df['R_emaildomain']],ignore_index=True),
                         encode_nan=False)
    
    encoder_dict['P_emaildomain'] = encoder
    encoder_dict['R_emaildomain'] = encoder
    
    return encoder_dict
    

def encode_feature(series, encoder):
    """
    Args:
        series (pandas.Series):
        encoder (sklearn.preprocessing.LabelEncoder):
    
    Return:
        series_encoded (pandas.Series):
    """
    
    return pd.Series(
                    encoder.transform(series[series.notnull()]),
                    index=series[series.notnull()].index)


def encode_string_features(df, encoder_dict):
    """
    Args:
        df (pandas.DataFrame):
        encoder_dict (dict): encoders w.r.t. string features.
    """
    
    for feature, encoder in encoder_dict.items():
        # get encoder
        encoder = encoder_dict[feature]

        # Start encoding
        new_feature_name = '{}_encoded'.format(feature)
        new_index = df.columns.get_loc(feature) + 1
        
        df.insert(new_index, 
                  new_feature_name, 
                  encode_feature(df[feature], encoder))
            
            
encoder_dict_transaction = generate_encoders(train, string_features)

In [6]:
%time encode_string_features(train, encoder_dict_transaction)

CPU times: user 3.11 s, sys: 7.83 ms, total: 3.12 s
Wall time: 2.74 s


### Drop old string features before training

In [7]:
train = train.drop(columns=string_features)

# Encode String Features of Identity File

In [8]:
import re

def get_os(info):
    version = get_os_version(info)
    
    if version is np.nan or version is None:
        return info
    
    return info.strip(version).strip()


def get_os_version(info):
    
    results = re.findall('\d+\.\d\.*\d*', info)
    if len(results) > 0:
        return results[0]
    
    results = re.findall('\d+\_\d+\_*\d*', info)
    if len(results) > 0:
        return results[0]
    
    results = re.findall('\d+', info)
    if len(results) > 0:
        return results[0]
    
    return np.nan


def split_browser_info(info):
    
    if info is np.nan:
        return pd.Series([np.nan, np.nan, np.nan])
    
    known_developers = ['android browser',
                       'chrome',
                       'edge',
                        'firefox',
                        'google search application',
                        'google',
                        'Generic/Android',
                       'ie',
                        'line',
                        'Microsoft',
                        'opera',
                        'samsung',
                       'safari',
                        'ZTE/Blade',
                       'other']
    
    developer = np.nan
    version = np.nan
    
    # Grep version
    results = re.findall('\d+\.\d+', info)
    if len(results) > 0:
        str_version = results[0]
        version = float(str_version)
        info = info.replace(str_version, '')
        
    # Grep Developer
    for d in known_developers:
        if d in info:
            developer = d
            info = info.replace(d, '')
    
    # Remove 'for'
    if 'for' in info:
        info = info.replace('for', '')
    
    # remove consecutive whitespaces and keep only 1 whitespace.
    info = ' '.join(info.split())
    if len(info.strip()) == 0:
        info = np.nan
    
    if developer is np.nan:
        developer = info
        info = np.nan
        
    return pd.Series([developer, version, info])


def get_manufacture(device_info, return_nan=True):
    
    if device_info is None:
        return device_info
    
    starts_dict = {'0P': 'HTC',
                   '2P': 'HTC',
                   '40': 'ALCATEL',
                   '50': 'ALCATEL',
                   '60': 'ALCATEL',
                   '70': 'ALCATEL',
                   '80': 'ALCATEL',
                   '90': 'ALCATEL',
                   'Android': 'os-Android', #############
                   'ASUS': 'ASUS',
                   'Aquaris': 'Aquaris',
                   'ALCATEL': 'ALCATEL',
                   'AX': 'Bmobile', # Mexico
                   'B1-': 'Acer',
                   'B3-': 'Acer',
                   'BB': 'BlackBerry',
                   'BLADE': 'ZTE',
                   'BLN': 'HUAWEI',
                   'BLU': 'BLU', # American
                   'BND': 'HUAWEI',
                   'Build/': 'factory-image', ###############
                   'BV': 'Blackview',
                   'C1': 'sony',
                   'C2': 'sony',
                   'C6': 'sony',
                   'D2': 'sony',
                   'D5': 'sony',
                   'D6': 'sony',
                   'E2': 'sony',
                   'E501': 'Hyundai',
                   'E53': 'sony',
                   'E55': 'sony',
                   'E56': 'sony',
                   'E58': 'sony',
                   'E6': 'sony',
                   'E8': 'sony',
                   'en-': 'encoding',  ####################
                   'es-': 'encoding',  ####################
                   'F3': 'sony',
                   'F5': 'sony',
                   'F80': 'F2-mobile',
                   'F81': 'sony',
                   'F83': 'sony',
                   'H1': 'sony',
                   'H3': 'sony',
                   'H5': 'sony',
                   'IdeaTab': 'Lenovo',
                   'G255': 'Hyundai',
                   'G527': 'HUAWEI',
                   'G620': 'HUAWEI',
                   'G630': 'HUAWEI',
                   'G814': 'sony',
                   'G8341': 'sony',
                   'G3': 'sony',
                   'GT-': 'samsung',
                   'HTC': 'HTC',
                   'HUAWEI': 'HUAWEI',
                   'Hisense': 'Hisense',
                   'Ilium': 'Lanix', # Italian Company
                   'iOS': 'os-ios', ###################
                   'iPhone': 'iPhone', #######################
                   'iris': 'Lava',
                   'K1': 'koobee',
                   'K8': 'koobee',
                   'K9': 'koobee',
                   'Linux': 'os-linux', ######################
                   'KF': 'Amazon',  # Kindle Fire
                   'Lenovo': 'Lenovo',
                   'LG': 'LG',
                   'M431': 'Morphe',
                   'M4': 'M4',
                   'MacOS': 'MacOS',
                   'Mi ': 'xiaomi',
                   'Microsoft': 'Microsoft',  #############
                   'Moto': 'Moto',
                   'Nexus': 'google',
                   'ONE': 'ONEPLUS',
                   'P0': 'Asus',
                   'P4': 'Polaroid',
                   'P5': 'Polaroid',
                   'Pixel': 'google',
                   'Q': 'verizon',
                   'Redmi': 'xiaomi',
                   'RCT': 'RCA',
                   'rv:': 'firefox user-agent', ######################
                   'SAMSUNG': 'samsung',
                   'SCH-': 'samsung',
                   'SGH-': 'samsung',
                   'SGP': 'sony',
                   'SLA': 'HUAWEI',
                   'SPH-': 'samsung',
                   'SM-': 'samsung',
                   'STV100': 'BlackBerry',
                   'TA-': 'karbonn',
                   'verykool': 'verykool',
                   'VK': 'LG',
                   'VS': 'LG',
                   'Win': 'Windows',
                   'XT1': 'Motorola',
                   'Z410': 'Acer',
                   'Z5': 'ZTE',
                   'Z7': 'ZTE',
                   'Z8': 'ZTE',
                   'Z9 PLUS': 'QMobile',
                   'Z95': 'ZTE',
                   'Z96': 'ZTE',
                   'Z97': 'ZTE',
                   'Z98': 'ZTE',
                   'ZA': 'Zonda',
                   'ZTE': 'ZTE'}
    
    substring_dict = {'HUAWEI': 'HUAWEI',
                     'Build': 'substring-build'
                     }
    
    for prefix, manufacture in starts_dict.items():
        if device_info.lower().startswith(prefix.lower()):
            return manufacture
        
        
    for substring, manufacture in substring_dict.items():
        if substring.lower() in device_info.lower():
            return manufacture

    if return_nan:
        return
    else:
        return device_info
    
    
def insert_feature(feature_org, feature_new, extractor, df):
    """
    Args:
        feature_new (string):
        feature_org (string):
        df (pandas.DataFrame):
    """
    
    


extractors = [
              ['id_30', 'id_30_os', get_os],
              ['id_30', 'id_30_os_version', get_os_version],
              ['id_31', 'id_31_developer', lambda x: split_browser_info(x)[0]],
              ['id_31', 'id_31_version', lambda x: split_browser_info(x)[1]],
              ['id_31', 'id_31_annotation', lambda x: split_browser_info(x)[2]],
              ['id_33', 'id_33_width', lambda s: float(s.split('x')[0])],
              ['id_33', 'id_33_height', lambda s: float(s.split('x')[1])],
              ['DeviceInfo', 'DeviceInfo_map', lambda x: get_manufacture(x, return_nan=False)]
             ]

def insert_features(extractors, df):
    """
    Extract features of id_30, id_31, id_33, id_34.
    """
    feature_list = []
    
    for feature_org, feature_new, extractor in tqdm(extractors):
        print('Extract features: {}'.format(feature_org))
        
        feature_list.append(feature_org)
        
        # get index of original feature
        index_feature_org = df.columns.get_loc(feature_org)

        # insert feature
        %time df.insert(index_feature_org+1, \
                    feature_new, \
                     df[feature_org].map(extractor, na_action='ignore'))
    
    
    return feature_list


feature_list = insert_features(extractors, train_identity)

  0%|          | 0/8 [00:00<?, ?it/s]

Extract features: id_30


 12%|█▎        | 1/8 [00:00<00:02,  2.72it/s]

CPU times: user 366 ms, sys: 489 µs, total: 366 ms
Wall time: 366 ms
Extract features: id_30


 25%|██▌       | 2/8 [00:00<00:02,  2.85it/s]

CPU times: user 309 ms, sys: 1.93 ms, total: 311 ms
Wall time: 311 ms
Extract features: id_31


 38%|███▊      | 3/8 [00:18<00:27,  5.55s/it]

CPU times: user 17.7 s, sys: 298 ms, total: 18 s
Wall time: 17.7 s
Extract features: id_31


 50%|█████     | 4/8 [00:36<00:36,  9.21s/it]

CPU times: user 17.8 s, sys: 180 ms, total: 18 s
Wall time: 17.7 s
Extract features: id_31


 88%|████████▊ | 7/8 [00:53<00:08,  8.23s/it]

CPU times: user 17.7 s, sys: 101 ms, total: 17.8 s
Wall time: 17.6 s
Extract features: id_33
CPU times: user 63.2 ms, sys: 0 ns, total: 63.2 ms
Wall time: 62.8 ms
Extract features: id_33
CPU times: user 58.9 ms, sys: 3.91 ms, total: 62.8 ms
Wall time: 62.6 ms
Extract features: DeviceInfo


100%|██████████| 8/8 [00:57<00:00,  6.72s/it]

CPU times: user 3.21 s, sys: 0 ns, total: 3.21 s
Wall time: 3.21 s





### Drop Original Features

In [9]:
feature_list

['id_30', 'id_30', 'id_31', 'id_31', 'id_31', 'id_33', 'id_33', 'DeviceInfo']

In [10]:
train_identity = train_identity.drop(columns=feature_list)

# Check whether it has features with numpy object

In [11]:
for col in train_identity.columns:
    if train_identity[col].dtype == np.dtype('object'):
        print(col)

id_12
id_15
id_16
id_23
id_27
id_28
id_29
id_30_os_version
id_30_os
id_31_annotation
id_31_developer
id_34
id_35
id_36
id_37
id_38
DeviceType
DeviceInfo_map


### Encode Labels
- Encode All Features with Object Type

In [12]:
def transform_label(label, encoder):
    if label in encoder.classes_:
        return encoder.transform([label])[0]
    else:
        return -1
    

def encode_features(string_features, df, encoder_dict=None):
    """
    Encode features and insert back to datafame.
    """
    if encoder_dict is None:
        encoder_dict = {}
        create_encoders = True
    else:
        create_encoders = False
        
    
    for feature in tqdm(string_features):
        print('Encode {}'.format(feature))
        # Get index for inserting encoded feature
        index_col = df.columns.get_loc(feature)
           
        if create_encoders:
            # Encode Feature and insert
            encoder = preprocessing.LabelEncoder()
            encoder.fit(df[feature].dropna().unique())
            
            # add encoder to dictionary
            encoder_dict[feature] = encoder
        else:
            encoder = encoder_dict[feature]
            
        
        # Transform
        df.insert(index_col+1,
                 '{}_encoded'.format(feature),
                  df[feature].map(lambda x: transform_label(x, encoder),
                                                  na_action='ignore'))
        
        
    return encoder_dict
        
        

string_features_identity = get_string_features(train_identity)
%time encoder_dict_identity = encode_features(string_features_identity, train_identity)

  0%|          | 0/18 [00:00<?, ?it/s]

Encode id_12


  6%|▌         | 1/18 [00:08<02:25,  8.55s/it]

Encode id_15


 11%|█         | 2/18 [00:17<02:17,  8.62s/it]

Encode id_16


 17%|█▋        | 3/18 [00:24<02:04,  8.32s/it]

Encode id_23


 22%|██▏       | 4/18 [00:25<01:23,  5.93s/it]

Encode id_27


 28%|██▊       | 5/18 [00:25<00:55,  4.25s/it]

Encode id_28


 33%|███▎      | 6/18 [00:33<01:05,  5.47s/it]

Encode id_29


 39%|███▉      | 7/18 [00:42<01:09,  6.30s/it]

Encode id_30_os_version


 44%|████▍     | 8/18 [01:01<01:43, 10.35s/it]

Encode id_30_os


 50%|█████     | 9/18 [01:08<01:23,  9.26s/it]

Encode id_31_annotation


 56%|█████▌    | 10/18 [01:15<01:09,  8.64s/it]

Encode id_31_developer


 61%|██████    | 11/18 [01:41<01:36, 13.78s/it]

Encode id_34


 67%|██████▋   | 12/18 [01:46<01:06, 11.15s/it]

Encode id_35


 72%|███████▏  | 13/18 [01:54<00:51, 10.26s/it]

Encode id_36


 78%|███████▊  | 14/18 [02:02<00:38,  9.62s/it]

Encode id_37


 83%|████████▎ | 15/18 [02:11<00:27,  9.19s/it]

Encode id_38


 89%|████████▉ | 16/18 [02:19<00:17,  8.87s/it]

Encode DeviceType


 94%|█████████▍| 17/18 [02:27<00:08,  8.69s/it]

Encode DeviceInfo_map


100%|██████████| 18/18 [03:59<00:00, 33.74s/it]

CPU times: user 4min, sys: 1.13 s, total: 4min 1s
Wall time: 3min 59s





### Drop Un-encoded Features

In [13]:
train_identity = train_identity.drop(columns=string_features_identity)

# Merge Transaction and Identity Files

In [14]:
print(train.shape)
print(train_identity.shape)

(590540, 394)
(144233, 45)


In [15]:
train = pd.merge(train, train_identity, on='TransactionID', how='left')

In [16]:
train.shape

(590540, 438)

# Prepare Features and Labels for training

In [17]:
y = train['isFraud']
train = train.drop(columns=['TransactionID', 'TransactionDT', 'isFraud'])

print('{:,}'.format(train.memory_usage().sum()))

2,059,803,520


# Check whether it has string features

In [18]:
for col in train.columns:
    if train[col].dtype == np.dtype('object'):
        print(col)

# Train using XGBoost

In [19]:
import xgboost as xgb
import sklearn
from multiprocessing import cpu_count


print(cpu_count())

4


In [20]:
seed = 27
model = xgb.XGBClassifier(objective='binary:logistic',
                            n_thread=cpu_count()-1,
                            seed=seed)

%time model.fit(train, y, verbose=True)

CPU times: user 8min 3s, sys: 5 s, total: 8min 8s
Wall time: 8min 7s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, n_thread=3, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=27, silent=None, subsample=1,
       verbosity=1)

# Save Model

In [21]:
filename_encoder_identity = './models/encoders_identity.pkl'
filename_encoder_transaction = './models/encoders_transaction.pkl'

with open(filename_encoder_identity, 'wb') as f:
    pickle.dump(encoder_dict_identity, f)
    
with open(filename_encoder_transaction, 'wb') as f:
    pickle.dump(encoder_dict_transaction, f)
    



In [22]:
filename_model = './models/xgboost/transaction_identity.pkl'

with open(filename_model, 'wb') as f:
    pickle.dump(model, f)

# Predict

In [23]:
%time pred = model.predict(train)
%time pred_prob = model.predict_proba(train)

CPU times: user 3.94 s, sys: 3.71 s, total: 7.65 s
Wall time: 7.65 s
CPU times: user 4.06 s, sys: 3.6 s, total: 7.66 s
Wall time: 7.66 s


In [24]:
accuracy = sklearn.metrics.accuracy_score(y, pred)
print()
print('Accuracy: {}'.format(accuracy))

print(sklearn.metrics.classification_report(y, pred))
print(sklearn.metrics.roc_auc_score(y, pred))


Accuracy: 0.9740085345615876
              precision    recall  f1-score   support

           0       0.97      1.00      0.99    569877
           1       0.89      0.29      0.44     20663

   micro avg       0.97      0.97      0.97    590540
   macro avg       0.93      0.65      0.71    590540
weighted avg       0.97      0.97      0.97    590540

0.6460776744403206


# Delete Train Datasets

In [25]:
del train
del train_identity\

import gc
gc.collect()

2080

# Read Test Data

In [26]:
test = pd.read_csv('./datasets/test_transaction.csv')

In [27]:
test_identity = pd.read_csv('./datasets/test_identity.csv')

In [28]:
print(test.shape)
print(test_identity.shape)

(506691, 393)
(141907, 41)


In [None]:
del test_2
gc.collect()

In [55]:
test_2 = pd.read_csv('./datasets/test_transaction.csv')

In [57]:
test_transaction_id = test_2['TransactionID']

# Load Model

In [29]:
filename_encoder_identity = './models/encoders_identity.pkl'
filename_encoder_transaction = './models/encoders_transaction.pkl'
filename_model = './models/xgboost/transaction_identity.pkl'

with open(filename_encoder_identity, 'rb') as f:
    encoder_dict_identity = pickle.load(f)
    
with open(filename_encoder_transaction, 'rb') as f:
    encoder_dict_transaction = pickle.load(f)
    
    


In [30]:

with open(filename_model, 'rb') as f:
    model = pickle.load(f)

In [31]:
print(model)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=nan,
       n_estimators=100, n_jobs=1, n_thread=3, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=27, silent=None, subsample=1,
       verbosity=1)


# Encode test_transaction

In [32]:
%time encode_string_features(test, encoder_dict_transaction)

CPU times: user 2.49 s, sys: 0 ns, total: 2.49 s
Wall time: 2.5 s


### Drop old string features before training

In [33]:
string_features = get_string_features(test)
print(string_features)

['ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain', 'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9']


In [34]:
test = test.drop(columns=string_features)

# Extract Features in Identity File

In [35]:
feature_list = insert_features(extractors, test_identity)

  0%|          | 0/8 [00:00<?, ?it/s]

Extract features: id_30


 12%|█▎        | 1/8 [00:00<00:02,  2.97it/s]

CPU times: user 336 ms, sys: 0 ns, total: 336 ms
Wall time: 335 ms
Extract features: id_30


 25%|██▌       | 2/8 [00:00<00:01,  3.10it/s]

CPU times: user 288 ms, sys: 0 ns, total: 288 ms
Wall time: 288 ms
Extract features: id_31


 38%|███▊      | 3/8 [00:17<00:26,  5.39s/it]

CPU times: user 17.3 s, sys: 139 ms, total: 17.4 s
Wall time: 17.2 s
Extract features: id_31


 50%|█████     | 4/8 [00:35<00:36,  9.03s/it]

CPU times: user 17.6 s, sys: 238 ms, total: 17.8 s
Wall time: 17.5 s
Extract features: id_31


 88%|████████▊ | 7/8 [00:52<00:08,  8.03s/it]

CPU times: user 17.1 s, sys: 71.6 ms, total: 17.1 s
Wall time: 17.1 s
Extract features: id_33
CPU times: user 57.4 ms, sys: 3.9 ms, total: 61.3 ms
Wall time: 60.9 ms
Extract features: id_33
CPU times: user 61.2 ms, sys: 36 µs, total: 61.2 ms
Wall time: 61 ms
Extract features: DeviceInfo


100%|██████████| 8/8 [00:55<00:00,  6.55s/it]

CPU times: user 3.12 s, sys: 0 ns, total: 3.12 s
Wall time: 3.12 s





# Drop Features

In [36]:
test_identity = test_identity.drop(columns=feature_list)

# Encode Identity Features

In [37]:
string_features_identity = get_string_features(test_identity)
%time encode_features(string_features_identity, test_identity, encoder_dict_identity)

  0%|          | 0/18 [00:00<?, ?it/s]

Encode id_12


  6%|▌         | 1/18 [00:08<02:21,  8.33s/it]

Encode id_15


 11%|█         | 2/18 [00:16<02:14,  8.39s/it]

Encode id_16


 17%|█▋        | 3/18 [00:24<02:01,  8.11s/it]

Encode id_23


 22%|██▏       | 4/18 [00:24<01:20,  5.78s/it]

Encode id_27


 28%|██▊       | 5/18 [00:24<00:53,  4.14s/it]

Encode id_28


 33%|███▎      | 6/18 [00:33<01:04,  5.34s/it]

Encode id_29


 39%|███▉      | 7/18 [00:41<01:08,  6.18s/it]

Encode id_30_os_version


 44%|████▍     | 8/18 [00:55<01:25,  8.54s/it]

Encode id_30_os


 50%|█████     | 9/18 [01:01<01:10,  7.83s/it]

Encode id_31_annotation


 56%|█████▌    | 10/18 [01:07<00:59,  7.38s/it]

Encode id_31_developer


 61%|██████    | 11/18 [01:33<01:30, 12.99s/it]

Encode id_34


 67%|██████▋   | 12/18 [01:38<01:03, 10.52s/it]

Encode id_35


 72%|███████▏  | 13/18 [01:46<00:48,  9.77s/it]

Encode id_36


 78%|███████▊  | 14/18 [01:54<00:36,  9.23s/it]

Encode id_37


 83%|████████▎ | 15/18 [02:02<00:26,  8.85s/it]

Encode id_38


 89%|████████▉ | 16/18 [02:10<00:17,  8.59s/it]

Encode DeviceType


 94%|█████████▍| 17/18 [02:18<00:08,  8.45s/it]

Encode DeviceInfo_map


100%|██████████| 18/18 [03:54<00:00, 34.63s/it]

CPU times: user 3min 54s, sys: 820 ms, total: 3min 55s
Wall time: 3min 54s





{'id_12': LabelEncoder(),
 'id_15': LabelEncoder(),
 'id_16': LabelEncoder(),
 'id_23': LabelEncoder(),
 'id_27': LabelEncoder(),
 'id_28': LabelEncoder(),
 'id_29': LabelEncoder(),
 'id_30_os_version': LabelEncoder(),
 'id_30_os': LabelEncoder(),
 'id_31_annotation': LabelEncoder(),
 'id_31_developer': LabelEncoder(),
 'id_34': LabelEncoder(),
 'id_35': LabelEncoder(),
 'id_36': LabelEncoder(),
 'id_37': LabelEncoder(),
 'id_38': LabelEncoder(),
 'DeviceType': LabelEncoder(),
 'DeviceInfo_map': LabelEncoder()}

### Drop Un-encoded Features

In [38]:
test_identity = test_identity.drop(columns=string_features_identity)

# Merge Transaction and Identity Files

In [39]:
print(test.shape)
print(test_identity.shape)

(506691, 393)
(141907, 45)


In [40]:
test = pd.merge(test, test_identity, on='TransactionID', how='left')

In [41]:
test.shape

(506691, 437)

# Prepare Features and Labels for training

In [42]:
test = test.drop(columns=['TransactionID', 'TransactionDT'])

print('{:,}'.format(test.memory_usage().sum()))

1,767,338,208


# Check whether it has string features

In [43]:
for col in test.columns:
    if test[col].dtype == np.dtype('object'):
        print(col)

# Predict

In [44]:
%time pred_test = model.predict(test)
%time pred_prob_test = model.predict_proba(test)

CPU times: user 4.81 s, sys: 3.33 s, total: 8.14 s
Wall time: 8.14 s
CPU times: user 4.91 s, sys: 3.58 s, total: 8.49 s
Wall time: 8.49 s


In [46]:
pred_prob_test[:5, :]

array([[0.99471414, 0.00528584],
       [0.98792773, 0.01207224],
       [0.98220813, 0.01779185],
       [0.99529856, 0.00470144],
       [0.9920213 , 0.00797867]], dtype=float32)

In [48]:
pred_test[:5]

array([0, 0, 0, 0, 0])

# Save Data

In [60]:
test_result_df = pd.DataFrame(test_transaction_id, columns=['TransactionID'])

In [61]:
test_result_df['isFraud'] = pred_prob_test[:, 1]

In [62]:
test_result_df.head()

Unnamed: 0,TransactionID,isFraud
0,3663549,0.005286
1,3663550,0.012072
2,3663551,0.017792
3,3663552,0.004701
4,3663553,0.007979


In [64]:
filename_prediction = './prediction_test.csv'
test_result_df.to_csv(filename_prediction, index=False)