# Experiment 07: HIGGS boson (GPU version)

This experiment uses the data from the [HIGGS dataset](https://archive.ics.uci.edu/ml/datasets/HIGGS) to predict the appearance of the Higgs boson. The dataset consists of 11 million observations. More information about the data can be found in [loaders.py](libs/loaders.py).  

For details of the virtual machine we used and the versions of LightGBM and XGBoost, please refer to [experiment 6](06_airline_GPU.ipynb).

In [20]:
import sys
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import xgboost as xgb
from xgboost import XGBClassifier
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion
import json
import seaborn
import matplotlib.pyplot as plt
import pkg_resources
import warnings
from libs.loaders import load_higgs
from libs.timer import Timer
from libs.metrics import classification_metrics_binary, classification_metrics_binary_prob, binarize_prediction

print("System version: {}".format(sys.version))
print("XGBoost version: {}".format(pkg_resources.get_distribution('xgboost').version))
print("LightGBM version: {}".format(pkg_resources.get_distribution('lightgbm').version))

warnings.filterwarnings("ignore", category=DeprecationWarning) 
% matplotlib inline
% load_ext autoreload
% autoreload 2

System version: 3.5.2 |Anaconda custom (64-bit)| (default, Jul  2 2016, 17:53:06) 
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]
XGBoost version: 0.6
LightGBM version: 0.2
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Data loading and management

In [3]:
%%time
# Load the HIGGS table through the project loader and show its dimensions.
df = load_higgs()
print(df.shape)

INFO:libs.loaders:MOUNT_POINT not found in environment. Defaulting to /fileshare


(11000000, 29)
CPU times: user 1min 13s, sys: 15.2 s, total: 1min 28s
Wall time: 5min 38s


In [4]:
df.head(5)

Unnamed: 0,boson,lepton_pT,lepton_eta,lepton_phi,missing_energy_magnitude,missing_energy_phi,jet_1_pt,jet_1_eta,jet_1_phi,jet_1_b-tag,...,jet_4_eta,jet_4_phi,jet_4_b-tag,m_jj,m_jjj,m_lv,m_jlv,m_bb,m_wbb,m_wwbb
0,1.0,0.869293,-0.635082,0.22569,0.32747,-0.689993,0.754202,-0.248573,-1.092064,0.0,...,-0.010455,-0.045767,3.101961,1.35376,0.979563,0.978076,0.920005,0.721657,0.988751,0.876678
1,1.0,0.907542,0.329147,0.359412,1.49797,-0.31301,1.095531,-0.557525,-1.58823,2.173076,...,-1.13893,-0.000819,0.0,0.30222,0.833048,0.9857,0.978098,0.779732,0.992356,0.798343
2,1.0,0.798835,1.470639,-1.635975,0.453773,0.425629,1.104875,1.282322,1.381664,0.0,...,1.128848,0.900461,0.0,0.909753,1.10833,0.985692,0.951331,0.803252,0.865924,0.780118
3,0.0,1.344385,-0.876626,0.935913,1.99205,0.882454,1.786066,-1.646778,-0.942383,0.0,...,-0.678379,-1.360356,0.0,0.946652,1.028704,0.998656,0.728281,0.8692,1.026736,0.957904
4,1.0,1.105009,0.321356,1.522401,0.882808,-1.205349,0.681466,-1.070464,-0.921871,0.0,...,-0.373566,0.113041,0.0,0.755856,1.361057,0.98661,0.838085,1.133295,0.872245,0.808487


Due to memory issues, we are going to take a subset of the dataset. 

In [5]:
# `DataFrame.sample` expects an integer row count; the float 1e6 ends up as
# the `size=n` argument of the underlying sampler and triggers a warning.
subset = int(1e6)
# Draw the random subset and renumber its rows from 0.
df_small = df.sample(n=subset).reset_index(drop=True)

  locs = rs.choice(axis_length, size=n, replace=replace, p=weights)


In [5]:
def generate_feables(df):
    """Split a frame into a feature table and the 'boson' label column.

    Args:
        df: DataFrame containing a 'boson' column plus the feature columns.

    Returns:
        A ``(X, y)`` tuple. ``X`` holds every column except 'boson', in the
        order produced by ``Index.difference``; ``y`` is the 'boson' column.
    """
    feature_names = df.columns.difference(['boson'])
    labels = df['boson']
    features = df[feature_names]
    return features, labels

In [6]:
%%time
# Features and target are built from the full frame here; the commented
# alternative below uses the sampled subset instead.
X, y = generate_feables(df)
#X, y = generate_feables(df_small)
print(X.shape)
print(y.shape)

(11000000, 28)
(11000000,)
CPU times: user 364 ms, sys: 500 ms, total: 864 ms
Wall time: 964 ms


In [7]:
# Hold out 30% of the rows for testing, stratified on the label, with a fixed
# seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=77, test_size=0.3)
print(X_train.shape)
print(X_test.shape)

(7700000, 28)
(3300000, 28)


Let's put the data in the XGBoost format.

In [8]:
# Wrap the train and test splits in DMatrix objects for xgb.train / predict.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)

Now, we'll do the same for LightGBM.

In [9]:
# Build LightGBM Datasets from the underlying NumPy arrays; the test Dataset
# is created with the training Dataset as its reference.
lgb_train = lgb.Dataset(X_train.values, y_train.values, free_raw_data=False)
lgb_test = lgb.Dataset(X_test.values, y_test.values, reference=lgb_train, free_raw_data=False)

### XGBoost
Let's start by running the standard version of XGBoost on a GPU.

In [10]:
# Number of boosting rounds used for every model trained below.
num_rounds = 100
# Timing and metric results, keyed by model name.
results_dict = {}

In [13]:
# Settings for the standard (exact, depth-limited) XGBoost run with the GPU
# updater. The regularisation key is spelt 'reg_lambda'; the earlier
# 'reg_lamda' spelling is not a parameter name XGBoost recognises.
params = {'max_depth':3, 
          'objective':'binary:logistic', 
          'min_child_weight':1, 
          'learning_rate':0.1, 
          'colsample_bytree':0.80, 
          'scale_pos_weight':2, 
          'gamma':0.1, 
          'reg_lambda':1, 
          'subsample':1,
          'tree_method':'exact', 
          'updater':'grow_gpu'
          }

In [14]:
# Time training and prediction separately; both intervals are stored in
# results_dict further down.
with Timer() as train_t:
    xgb_clf_pipeline = xgb.train(params, dtrain, num_boost_round=num_rounds)
    
with Timer() as test_t:
    y_prob_xgb = xgb_clf_pipeline.predict(dtest)

XGBoostError: b'[16:21:21] /home/hoaphumanoid/repos/xgboost/plugin/updater_gpu/src/updater_gpu.cc:40: GPU plugin exception: /home/hoaphumanoid/repos/xgboost/plugin/updater_gpu/src/device_helpers.cuh(363): out of memory\n\n\nStack trace returned 10 entries:\n[bt] (0) /anaconda/envs/strata/lib/python3.5/site-packages/xgboost-0.6-py3.5.egg/xgboost/libxgboost.so(_ZN4dmlc15LogMessageFatalD1Ev+0x3c) [0x7f380377b29c]\n[bt] (1) /anaconda/envs/strata/lib/python3.5/site-packages/xgboost-0.6-py3.5.egg/xgboost/libxgboost.so(_ZN7xgboost4tree8GPUMakerINS0_9GradStatsEE6UpdateERKSt6vectorINS_9bst_gpairESaIS5_EEPNS_7DMatrixERKS4_IPNS_7RegTreeESaISD_EE+0x411) [0x7f3803906af1]\n[bt] (2) /anaconda/envs/strata/lib/python3.5/site-packages/xgboost-0.6-py3.5.egg/xgboost/libxgboost.so(_ZN7xgboost3gbm6GBTree13BoostNewTreesERKSt6vectorINS_9bst_gpairESaIS3_EEPNS_7DMatrixEiPS2_ISt10unique_ptrINS_7RegTreeESt14default_deleteISB_EESaISE_EE+0x8c3) [0x7f38038105f3]\n[bt] (3) /anaconda/envs/strata/lib/python3.5/site-packages/xgboost-0.6-py3.5.egg/xgboost/libxgboost.so(_ZN7xgboost3gbm6GBTree7DoBoostEPNS_7DMatrixEPSt6vectorINS_9bst_gpairESaIS5_EEPNS_11ObjFunctionE+0x86d) [0x7f38038116bd]\n[bt] (4) /anaconda/envs/strata/lib/python3.5/site-packages/xgboost-0.6-py3.5.egg/xgboost/libxgboost.so(_ZN7xgboost11LearnerImpl13UpdateOneIterEiPNS_7DMatrixE+0x22b) [0x7f38038f876b]\n[bt] (5) /anaconda/envs/strata/lib/python3.5/site-packages/xgboost-0.6-py3.5.egg/xgboost/libxgboost.so(XGBoosterUpdateOneIter+0x27) [0x7f380376d2b7]\n[bt] (6) /anaconda/envs/strata/lib/python3.5/lib-dynload/_ctypes.so(ffi_call_unix64+0x4c) [0x7f386c113370]\n[bt] (7) /anaconda/envs/strata/lib/python3.5/lib-dynload/_ctypes.so(ffi_call+0x1f5) [0x7f386c112b15]\n[bt] (8) /anaconda/envs/strata/lib/python3.5/lib-dynload/_ctypes.so(_ctypes_callproc+0x3dc) [0x7f386c10a5dc]\n[bt] (9) /anaconda/envs/strata/lib/python3.5/lib-dynload/_ctypes.so(+0x9c43) [0x7f386c102c43]\n'

Once training and testing are finished, let's compute some metrics.

In [17]:
y_pred_xgb = binarize_prediction(y_prob_xgb)

In [18]:
# Compute metrics from the binarized predictions and from the raw
# probabilities, then merge them into a single report.
report_xgb = classification_metrics_binary(y_test, y_pred_xgb)
report2_xgb = classification_metrics_binary_prob(y_test, y_prob_xgb)
report_xgb.update(report2_xgb)

In [19]:
# Store timings and metrics for standard XGBoost under its own key.
results_dict['xgb']={
    'train_time': train_t.interval,
    'test_time': test_t.interval,
    'performance': report_xgb 
}

In [15]:
del xgb_clf_pipeline #clear GPU memory (1054Mb) 

NameError: name 'xgb_clf_pipeline' is not defined

Now let's try with XGBoost histogram.

In [16]:
# Settings for the histogram-based XGBoost run: loss-guided growth limited by
# max_leaves, with the GPU histogram updater. The regularisation key is spelt
# 'reg_lambda'; the earlier 'reg_lamda' spelling is not a parameter name
# XGBoost recognises.
params = {'max_depth':0, 
          'objective':'binary:logistic', 
          'min_child_weight':30, 
          'learning_rate':0.1, 
          'colsample_bytree':0.80, 
          'scale_pos_weight':2, 
          'gamma':0.1, 
          'reg_lambda':1, 
          'subsample':1,
          'tree_method':'hist', 
          'max_leaves':2**5, 
          'grow_policy':'lossguide', 
          'updater':'grow_gpu_hist'
         }

In [17]:
# Time training and prediction separately; both intervals are stored in
# results_dict further down.
with Timer() as t_train:
    xgb_hist_clf_pipeline = xgb.train(params, dtrain, num_boost_round=num_rounds)
    
with Timer() as t_test:
    y_prob_xgb_hist = xgb_hist_clf_pipeline.predict(dtest)

In [21]:
y_pred_xgb_hist = binarize_prediction(y_prob_xgb_hist)

In [22]:
# Compute metrics from the binarized predictions and from the raw
# probabilities, then merge them into a single report.
report_xgb_hist = classification_metrics_binary(y_test, y_pred_xgb_hist)
report2_xgb_hist = classification_metrics_binary_prob(y_test, y_prob_xgb_hist)
report_xgb_hist.update(report2_xgb_hist)

In [23]:
# Store timings and metrics for XGBoost hist under its own key.
results_dict['xgb_hist']={
    'train_time': t_train.interval,
    'test_time': t_test.interval,
    'performance': report_xgb_hist
}

In [24]:
del xgb_hist_clf_pipeline #clear GPU memory (214Mb)

### LightGBM
After the XGBoost version is finished, let's try LightGBM on the GPU. 

In [25]:
# Settings handed to lgb.train below; 'device' requests GPU training.
params = dict(
    num_leaves=2**5,
    learning_rate=0.1,
    scale_pos_weight=1,
    min_split_gain=0.1,
    min_child_weight=1,
    reg_lambda=1,
    subsample=1,
    objective='binary',
    device='gpu',
    task='train',
)

In [26]:
# Time training and prediction separately; both intervals are stored in
# results_dict further down. Prediction runs on the raw test array.
with Timer() as train_t:
    lgbm_clf_pipeline = lgb.train(params, lgb_train, num_boost_round=num_rounds)
    
with Timer() as test_t:
    y_prob_lgbm = lgbm_clf_pipeline.predict(X_test.values)

As we did before, let's obtain some performance metrics.

In [27]:
y_pred_lgbm = binarize_prediction(y_prob_lgbm)

In [28]:
# Compute metrics from the binarized predictions and from the raw
# probabilities, then merge them into a single report.
report_lgbm = classification_metrics_binary(y_test, y_pred_lgbm)
report2_lgbm = classification_metrics_binary_prob(y_test, y_prob_lgbm)
report_lgbm.update(report2_lgbm)

In [29]:
# Store timings and metrics for LightGBM under its own key.
results_dict['lgbm']={
    'train_time': train_t.interval,
    'test_time': test_t.interval,
    'performance': report_lgbm 
}

In [30]:
del lgbm_clf_pipeline #clear GPU memory (135Mb)

Finally, we show the results.

In [31]:
# Results
print(json.dumps(results_dict, indent=4, sort_keys=True))

{
    "lgbm": {
        "performance": {
            "AUC": 0.8123820215279185,
            "Accuracy": 0.7321569696969698,
            "F1": 0.7483299004061996,
            "Precision": 0.7452335894209192,
            "Recall": 0.751452047963759
        },
        "test_time": 2.48556369400103,
        "train_time": 36.84274147600081
    },
    "xgb_hist": {
        "performance": {
            "AUC": 0.8115216931113615,
            "Accuracy": 0.6943206060606061,
            "F1": 0.7609238804038742,
            "Precision": 0.6497603615644316,
            "Recall": 0.9179745153216292
        },
        "test_time": 2.412243650000164,
        "train_time": 53.07074312600162
    }
}


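We can observe that LightGBM is faster than XGBoost in both versions. Also, XGBoost with the leaf-wise implementation is faster than with the depth-wise implementation. The metric performance of LightGBM and XGBoost hist is similar; for standard XGBoost, however, it is lower. Note that in the run recorded above, the standard XGBoost training ran out of GPU memory on the full dataset, so only the LightGBM and XGBoost hist results appear in the output.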

Final advice: go leaf-wise :-)