# Train using XGBoost

In [1]:
import pandas as pd
import numpy as np

from sklearn import preprocessing


# Read Datasets

In [2]:
%time train = pd.read_csv('./datasets/train_transaction.csv')

CPU times: user 21.1 s, sys: 2.59 s, total: 23.7 s
Wall time: 24.9 s


In [None]:
%time train_identity = pd.read_csv('./datasets/train_identity.csv')

# Encode
- Special decoding of R_emaildomain and P_emaildomain
    - Combine these 2 features and fit them.

In [3]:
def get_string_features(df):
    """
    Get features with numpy.dtype of 'object'
    """
    string_features = []
    for feature in df.columns:
        if df[feature].dtype == np.dtype('object'):
            string_features.append(feature)
            
    return string_features


string_features = get_string_features(train)

# Encode String Features of Transaction File

In [4]:
import pdb

def gen_encoder(series, encode_nan=False):
    """
    Args:
        series (pandas.Series):
        
    Return:
        encoder (sklearn.preprocessing.LabelEncoder):
    """
    
    encoder = preprocessing.LabelEncoder()
    
    if encode_nan:
        encoder.fit(series)
    else:
        encoder.fit(series[series.notnull()])
    
    return encoder


def get_string_features(df):
    """
    Args:
        df (pandas.DataFrame):
    
    Return:
        string_features (list of str):
    """
    string_features = []
    
    for col in df.columns:
        if df[col].dtype is np.dtype('object'):
            string_features.append(col)
            
    return string_features
    

def generate_encoders(df, feature_names):
    encoder_dict = {}
    
    # fit 
    for feature in feature_names:
        
        # P_emaildomain and R_emaildomain needs to be encoded together.
        if feature is not 'P_emaildomain' and \
            feature is not 'R_emaildomain':
            
#             pdb.set_trace()
            
            # encode labels
            encoder = gen_encoder(df[feature], encode_nan=False)
            
            # record encoder
            encoder_dict[feature] = encoder
    
    
    # Combine P_emaildomain and R_emaildomain
    encoder = gen_encoder(pd.concat([df['P_emaildomain'], df['R_emaildomain']],ignore_index=True),
                         encode_nan=False)
    
    encoder_dict['P_emaildomain'] = encoder
    encoder_dict['R_emaildomain'] = encoder
    
    return encoder_dict
    

def encode_feature(series, encoder):
    """
    Args:
        series (pandas.Series):
        encoder (sklearn.preprocessing.LabelEncoder):
    
    Return:
        series_encoded (pandas.Series):
    """
    
    return pd.Series(
                    encoder.transform(series[series.notnull()]),
                    index=series[series.notnull()].index)


def encode_string_features(df, encoder_dict):
    """
    Args:
        df (pandas.DataFrame):
        encoder_dict (dict): encoders w.r.t. string features.
    """
    
    for feature, encoder in encoder_dict.items():
        # get encoder
        encoder = encoder_dict[feature]

        # Start encoding
        new_feature_name = '{}_encoded'.format(feature)
        new_index = df.columns.get_loc(feature) + 1
        
        df.insert(new_index, 
                  new_feature_name, 
                  encode_feature(df[feature], encoder))
            
            
encoder_dict = generate_encoders(train, string_features)

In [5]:
%time encode_string_features(train, encoder_dict)

CPU times: user 3.14 s, sys: 20.2 ms, total: 3.16 s
Wall time: 2.77 s


### Drop old string features before training

In [6]:
train = train.drop(columns=string_features)

# Encode String Features of Identity File

In [None]:
import re

def get_os(info):
    version = get_os_version(info)
    
    if version is np.nan or version is None:
        return info
    
    return info.strip(version).strip()


def get_os_version(info):
    
    results = re.findall('\d+\.\d\.*\d*', info)
    if len(results) > 0:
        return results[0]
    
    results = re.findall('\d+\_\d+\_*\d*', info)
    if len(results) > 0:
        return results[0]
    
    results = re.findall('\d+', info)
    if len(results) > 0:
        return results[0]
    
    return np.nan


def split_browser_info(info):
    
    if info is np.nan:
        return pd.Series([np.nan, np.nan, np.nan])
    
    known_developers = ['android browser',
                       'chrome',
                       'edge',
                        'firefox',
                        'google search application',
                        'google',
                        'Generic/Android',
                       'ie',
                        'line',
                        'Microsoft',
                        'opera',
                        'samsung',
                       'safari',
                        'ZTE/Blade',
                       'other']
    
    developer = np.nan
    version = np.nan
    
    # Grep version
    results = re.findall('\d+\.\d+', info)
    if len(results) > 0:
        str_version = results[0]
        version = float(str_version)
        info = info.replace(str_version, '')
        
    # Grep Developer
    for d in known_developers:
        if d in info:
            developer = d
            info = info.replace(d, '')
    
    # Remove 'for'
    if 'for' in info:
        info = info.replace('for', '')
    
    # remove consecutive whitespaces and keep only 1 whitespace.
    info = ' '.join(info.split())
    if len(info.strip()) == 0:
        info = np.nan
    
    if developer is np.nan:
        developer = info
        info = np.nan
        
    return pd.Series([developer, version, info])


def get_manufacture(device_info, return_nan=True):
    
    if device_info is None:
        return device_info
    
    starts_dict = {'0P': 'HTC',
                   '2P': 'HTC',
                   '40': 'ALCATEL',
                   '50': 'ALCATEL',
                   '60': 'ALCATEL',
                   '70': 'ALCATEL',
                   '80': 'ALCATEL',
                   '90': 'ALCATEL',
                   'Android': 'os-Android', #############
                   'ASUS': 'ASUS',
                   'Aquaris': 'Aquaris',
                   'ALCATEL': 'ALCATEL',
                   'AX': 'Bmobile', # Mexico
                   'B1-': 'Acer',
                   'B3-': 'Acer',
                   'BB': 'BlackBerry',
                   'BLADE': 'ZTE',
                   'BLN': 'HUAWEI',
                   'BLU': 'BLU', # American
                   'BND': 'HUAWEI',
                   'Build/': 'factory-image', ###############
                   'BV': 'Blackview',
                   'C1': 'sony',
                   'C2': 'sony',
                   'C6': 'sony',
                   'D2': 'sony',
                   'D5': 'sony',
                   'D6': 'sony',
                   'E2': 'sony',
                   'E501': 'Hyundai',
                   'E53': 'sony',
                   'E55': 'sony',
                   'E56': 'sony',
                   'E58': 'sony',
                   'E6': 'sony',
                   'E8': 'sony',
                   'en-': 'encoding',  ####################
                   'es-': 'encoding',  ####################
                   'F3': 'sony',
                   'F5': 'sony',
                   'F80': 'F2-mobile',
                   'F81': 'sony',
                   'F83': 'sony',
                   'H1': 'sony',
                   'H3': 'sony',
                   'H5': 'sony',
                   'IdeaTab': 'Lenovo',
                   'G255': 'Hyundai',
                   'G527': 'HUAWEI',
                   'G620': 'HUAWEI',
                   'G630': 'HUAWEI',
                   'G814': 'sony',
                   'G8341': 'sony',
                   'G3': 'sony',
                   'GT-': 'samsung',
                   'HTC': 'HTC',
                   'HUAWEI': 'HUAWEI',
                   'Hisense': 'Hisense',
                   'Ilium': 'Lanix', # Italian Company
                   'iOS': 'os-ios', ###################
                   'iPhone': 'iPhone', #######################
                   'iris': 'Lava',
                   'K1': 'koobee',
                   'K8': 'koobee',
                   'K9': 'koobee',
                   'Linux': 'os-linux', ######################
                   'KF': 'Amazon',  # Kindle Fire
                   'Lenovo': 'Lenovo',
                   'LG': 'LG',
                   'M431': 'Morphe',
                   'M4': 'M4',
                   'MacOS': 'MacOS',
                   'Mi ': 'xiaomi',
                   'Microsoft': 'Microsoft',  #############
                   'Moto': 'Moto',
                   'Nexus': 'google',
                   'ONE': 'ONEPLUS',
                   'P0': 'Asus',
                   'P4': 'Polaroid',
                   'P5': 'Polaroid',
                   'Pixel': 'google',
                   'Q': 'verizon',
                   'Redmi': 'xiaomi',
                   'RCT': 'RCA',
                   'rv:': 'firefox user-agent', ######################
                   'SAMSUNG': 'samsung',
                   'SCH-': 'samsung',
                   'SGH-': 'samsung',
                   'SGP': 'sony',
                   'SLA': 'HUAWEI',
                   'SPH-': 'samsung',
                   'SM-': 'samsung',
                   'STV100': 'BlackBerry',
                   'TA-': 'karbonn',
                   'verykool': 'verykool',
                   'VK': 'LG',
                   'VS': 'LG',
                   'Win': 'Windows',
                   'XT1': 'Motorola',
                   'Z410': 'Acer',
                   'Z5': 'ZTE',
                   'Z7': 'ZTE',
                   'Z8': 'ZTE',
                   'Z9 PLUS': 'QMobile',
                   'Z95': 'ZTE',
                   'Z96': 'ZTE',
                   'Z97': 'ZTE',
                   'Z98': 'ZTE',
                   'ZA': 'Zonda',
                   'ZTE': 'ZTE'}
    
    substring_dict = {'HUAWEI': 'HUAWEI',
                     'Build': 'substring-build'
                     }
    
    for prefix, manufacture in starts_dict.items():
        if device_info.lower().startswith(prefix.lower()):
            return manufacture
        
        
    for substring, manufacture in substring_dict.items():
        if substring.lower() in device_info.lower():
            return manufacture

    if return_nan:
        return
    else:
        return device_info
    
    
def insert_feature(feature_org, feature_new, extractor, df):
    """
    Args:
        feature_new (string):
        feature_org (string):
        df (pandas.DataFrame):
    """
    
    


extractors = [
              ['id_30', 'id_30_os', get_os],
              ['id_30', 'id_30_os_version', get_os_version],
              ['id_31', 'id_31_developer', lambda x: split_browser_info(x)[0]],
              ['id_31', 'id_31_version', lambda x: split_browser_info(x)[1]],
              ['id_31', 'id_31_annotation', lambda x: split_browser_info(x)[2]],
              ['id_33', 'id_33_width', lambda s: float(s.split('x')[0])],
              ['id_33', 'id_33_height', lambda s: float(s.split('x')[1])],
              ['DeviceInfo', 'DeviceInfo_map', lambda x: get_manufacture(x, return_nan=False)]
             ]

def insert_features(extractors, df):
    """
    Extract features of id_30, id_31, id_33, id_34.
    """
    feature_list = []
    
    for feature_org, feature_new, extractor in extractors:
        print('Extract features: {}'.format(feature_org))
        
        feature_list.append(feature_org)
        
        # get index of original feature
        index_feature_org = df.columns.get_loc(feature_org)

        # insert feature
        %time df.insert(index_feature_org+1, \
                    feature_new, \
                     df[feature_org].map(extractor, na_action='ignore'))
    
    
    return feature_list


feature_list = insert_features(extractors, train_identity)

### Drop Original Features

In [None]:
train_identity = train_identity.drop(columns=feature_list)

### Encode Labels
- Encode All Features with Object Type

In [None]:
string_features = get_string_features(train_identity)

print(string_features)

In [None]:
def transform_label(label, encoder):
    if label in encoder.classes_:
        return encoder.transform([label])[0]
    else:
        return -1
    

def encode_features(string_features, df):
    """
    Encode features and insert back to datafame.
    """
    encoder_dict = {}
    
    for feature in tqdm(string_features):
        print('Encode {}'.format(feature))
        # Get index for inserting encoded feature
        index_col = df.columns.get_loc(feature)
           
        # Encode Feature and insert
        encoder = preprocessing.LabelEncoder()
        
        encoder.fit(df[feature].dropna().unique())
        
        # Transform
        df.insert(index_col+1,
                 '{}_encoded'.format(feature),
                  df[feature].map(lambda x: transform_label(x, encoder),
                                                  na_action='ignore'))
        
        
        encoder_dict[feature] = encoder
        

%time encoders = encode_features(string_features, train_identity)

### Drop Un-encoded Features

In [None]:
train_identity = train_identity.drop(columns=string_features)

# Merge Transaction and Identity Files

In [None]:
train = pd.merge(train, train_identity, on='TransactionID', how='left')

# Prepare Features and Labels for training

In [7]:
y = train['isFraud']
train = train.drop(columns=['TransactionID', 'TransactionDT'])

print('{:,}'.format(train.memory_usage().sum()))

1,851,933,520


# Check whether it has string features

In [14]:
for col in train.columns:
    if train[col].dtype == np.dtype('object'):
        print(col)

# Train using XGBoost

In [8]:
import xgboost as xgb
import sklearn
from multiprocessing import cpu_count


print(cpu_count())

4


In [9]:
seed = 27
model = xgb.XGBClassifier(objective='binary:logistic',
                            n_thread=-1,
                            seed=seed)

%time model.fit(train, y, verbose=True)

CPU times: user 5min 33s, sys: 4.73 s, total: 5min 38s
Wall time: 5min 37s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, n_thread=-1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=27, silent=None, subsample=1,
       verbosity=1)