# Experiment 06: HIGGS boson (GPU version)

This experiment uses the data from the [HIGGS dataset](https://archive.ics.uci.edu/ml/datasets/HIGGS) to predict the appearance of the Higgs boson. The dataset consists of 11 million of observations. More information about the data can be found in [loaders.py](libs/loaders.py).  

In [4]:
import sys
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import xgboost as xgb
from xgboost import XGBClassifier
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import roc_auc_score,accuracy_score, precision_score, recall_score, f1_score
import json
import seaborn
import matplotlib.pyplot as plt
import pkg_resources
import warnings
from libs.loaders import load_higgs
from libs.timer import Timer

print("System version: {}".format(sys.version))
print("XGBoost version: {}".format(pkg_resources.get_distribution('xgboost').version))
print("LightGBM version: {}".format(pkg_resources.get_distribution('lightgbm').version))

warnings.filterwarnings("ignore", category=DeprecationWarning) 
% matplotlib inline
% load_ext autoreload
% autoreload 2

System version: 3.5.2 |Anaconda custom (64-bit)| (default, Jul  2 2016, 17:53:06) 
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]
XGBoost version: 0.6
LightGBM version: 0.2
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Data loading and management

In [3]:
%%time
df = load_higgs()
print(df.shape)

INFO:libs.loaders:MOUNT_POINT not found in environment. Defaulting to /fileshare


(11000000, 29)
CPU times: user 1min 16s, sys: 10.8 s, total: 1min 26s
Wall time: 5min 29s


In [5]:
df.head(5)

Unnamed: 0,boson,lepton_pT,lepton_eta,lepton_phi,missing_energy_magnitude,missing_energy_phi,jet_1_pt,jet_1_eta,jet_1_phi,jet_1_b-tag,...,jet_4_eta,jet_4_phi,jet_4_b-tag,m_jj,m_jjj,m_lv,m_jlv,m_bb,m_wbb,m_wwbb
0,1.0,0.869293,-0.635082,0.22569,0.32747,-0.689993,0.754202,-0.248573,-1.092064,0.0,...,-0.010455,-0.045767,3.101961,1.35376,0.979563,0.978076,0.920005,0.721657,0.988751,0.876678
1,1.0,0.907542,0.329147,0.359412,1.49797,-0.31301,1.095531,-0.557525,-1.58823,2.173076,...,-1.13893,-0.000819,0.0,0.30222,0.833048,0.9857,0.978098,0.779732,0.992356,0.798343
2,1.0,0.798835,1.470639,-1.635975,0.453773,0.425629,1.104875,1.282322,1.381664,0.0,...,1.128848,0.900461,0.0,0.909753,1.10833,0.985692,0.951331,0.803252,0.865924,0.780118
3,0.0,1.344385,-0.876626,0.935913,1.99205,0.882454,1.786066,-1.646778,-0.942383,0.0,...,-0.678379,-1.360356,0.0,0.946652,1.028704,0.998656,0.728281,0.8692,1.026736,0.957904
4,1.0,1.105009,0.321356,1.522401,0.882808,-1.205349,0.681466,-1.070464,-0.921871,0.0,...,-0.373566,0.113041,0.0,0.755856,1.361057,0.98661,0.838085,1.133295,0.872245,0.808487


In [8]:
subset = 1e6
df_small = df.sample(n=subset).reset_index(drop=True)

  locs = rs.choice(axis_length, size=n, replace=replace, p=weights)


In [7]:
def generate_feables(df):
    X = df[df.columns.difference(['boson'])]
    y = df['boson']
    return X,y

In [9]:
%%time
#X, y = generate_feables(df)
X, y = generate_feables(df_small)
print(X.shape)
print(y.shape)

(1000000, 28)
(1000000,)
CPU times: user 48 ms, sys: 4 ms, total: 52 ms
Wall time: 52.6 ms


In [10]:
test_size = y.shape[0]/10
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=77, test_size=int(test_size))
print(X_train.shape)
print(X_test.shape)

(900000, 28)
(100000, 28)


In [11]:
%%time
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)

CPU times: user 592 ms, sys: 80 ms, total: 672 ms
Wall time: 671 ms


### XGBoost
Let's start y computing the standard version of XGBoost.

In [12]:
results_dict = dict()

In [13]:
params = {'max_depth':8, 'num_boost_round':200, 'min_child_weight':30, 'eta':0.1, 
          'colsample_bytree':0.80, 'scale_pos_weight':2, 'gamma':0.1, 'reg_lamda':1, 
          'subsample':1,'tree_method':'exact', 'updater':'grow_gpu'}

In [14]:
with Timer() as train_t:
    xgb_clf_pipeline = xgb.train(params, dtrain)
    
with Timer() as test_t:
    y_prob_xgb = xgb_clf_pipeline.predict(dtest)

In [15]:
#https://github.com/miguelgfierro/codebase/blob/master/python/machine_learning/metrics.py
def classification_metrics_binary(y_true, y_pred):
    m_acc = accuracy_score(y_true, y_pred)
    m_f1 = f1_score(y_true, y_pred)
    m_precision = precision_score(y_true, y_pred)
    m_recall = recall_score(y_true, y_pred)
    report = {'Accuracy':m_acc, 'Precision':m_precision, 'Recall':m_recall, 'F1':m_f1}
    return report

In [16]:
#https://github.com/miguelgfierro/codebase/blob/master/python/machine_learning/metrics.py
def classification_metrics_binary_prob(y_true, y_prob):
    m_auc = roc_auc_score(y_true, y_prob)
    report = {'AUC':m_auc}
    return report

In [17]:
def binarize_prediction(y, threshold=0.5):
    y_pred = np.where(y > threshold, 1, 0)
    return y_pred

In [18]:
y_pred_xgb = binarize_prediction(y_prob_xgb)

In [19]:
report_xgb = classification_metrics_binary(y_test, y_pred_xgb)
report2_xgb = classification_metrics_binary_prob(y_test, y_prob_xgb)
report_xgb.update(report2_xgb)

In [20]:
results_dict['xgb']={
    'train_time': train_t.interval,
    'test_time': test_t.interval,
    'performance': report_xgb 
}

In [21]:
del xgb_clf_pipeline #clear GPU memory (1054Mb) 

Now let's try with XGBoost histogram.

In [22]:
params = {'max_depth':0, 'num_boost_round':200, 'min_child_weight':30, 'eta':0.1, 
          'colsample_bytree':0.80, 'scale_pos_weight':2, 'gamma':0.1, 'reg_lamda':1, 
          'subsample':1,'tree_method':'hist', 'max_leaves':255, 'grow_policy':'lossguide', 
          'updater':'grow_gpu_hist'}

In [23]:
with Timer() as t_train:
    xgb_hist_clf_pipeline = xgb.train(params, dtrain)
    
with Timer() as t_test:
    y_prob_xgb_hist = xgb_hist_clf_pipeline.predict(dtest)

In [24]:
y_pred_xgb_hist = binarize_prediction(y_prob_xgb_hist)

In [25]:
report_xgb_hist = classification_metrics_binary(y_test, y_pred_xgb_hist)
report2_xgb_hist = classification_metrics_binary_prob(y_test, y_prob_xgb_hist)
report_xgb_hist.update(report2_xgb_hist)

In [26]:
results_dict['xgb_hist']={
    'train_time': t_train.interval,
    'test_time': t_test.interval,
    'performance': report_xgb_hist
}

In [27]:
del xgb_hist_clf_pipeline #clear GPU memory

### LightGBM

In [28]:
lgbm_clf_pipeline = LGBMClassifier(num_leaves=2**8, 
                                    learning_rate=0.1, 
                                    scale_pos_weight=1,
                                    n_estimators=200,
                                    min_split_gain=0.1,
                                    min_child_weight=30,
                                    reg_lambda=1,
                                    subsample=1,
                                    device='gpu'
                                    )

In [29]:
with Timer() as train_t:
    lgbm_clf_pipeline.fit(X_train, y_train)
    
with Timer() as test_t:
    y_prob_lgbm = lgbm_clf_pipeline.predict(X_test)

In [36]:
params = {
    'num_leaves': 2**8,
    'learning_rate': 0.1,
    'scale_pos_weight': 1,
    'min_split_gain': 0.1,
    'min_child_weight': 30,
    'reg_lambda': 1,
    'subsample': 1,
    'device': 'gpu',
    'task': 'train'
}

In [45]:
lgb_train = lgb.Dataset(X_train.values, y_train.values, free_raw_data=False)
lgb_test = lgb.Dataset(X_test.values, y_test.values, reference=lgb_train, free_raw_data=False)

In [52]:
lgbm_clf_pipeline.predict??

In [50]:
with Timer() as train_t:
    lgbm_clf_pipeline = lgb.train(params, lgb_train, num_boost_round=200)
    
with Timer() as test_t:
    y_prob_lgbm = lgbm_clf_pipeline.predict(X_test.values)

In [53]:
y_pred_lgbm = binarize_prediction(y_prob_lgbm)

In [54]:
report_lgbm = classification_metrics_binary(y_test, y_pred_lgbm)
report2_lgbm = classification_metrics_binary_prob(y_test, y_prob_lgbm)
report_lgbm.update(report2_lgbm)

In [55]:
results_dict['lgbm']={
    'train_time': train_t.interval,
    'test_time': test_t.interval,
    'performance': report_lgbm 
}

with sklean API:
    "lgbm": {
        "performance": {
            "AUC": 0.7492321373308166,
            "Accuracy": 0.75075,
            "F1": 0.7670451890275246,
            "Precision": 0.7593168276525665,
            "Recall": 0.7749324873000586
        },
        "test_time": 0.23797125497367233,
        "train_time": 27.024242482031696
    },
    

In [56]:
# Results
print(json.dumps(results_dict, indent=4, sort_keys=True))

{
    "lgbm": {
        "performance": {
            "AUC": 0.829210976809148,
            "Accuracy": 0.74694,
            "F1": 0.7638220032105125,
            "Precision": 0.7550696558723129,
            "Recall": 0.7727796347704569
        },
        "test_time": 0.14522848901106045,
        "train_time": 20.864964423992205
    },
    "xgb": {
        "performance": {
            "AUC": 0.7912223264207421,
            "Accuracy": 0.6538,
            "F1": 0.7424413760266635,
            "Precision": 0.6125234769159005,
            "Recall": 0.9423073291409363
        },
        "test_time": 0.02099366398761049,
        "train_time": 9.863963771029375
    },
    "xgb_hist": {
        "performance": {
            "AUC": 0.7981349735116925,
            "Accuracy": 0.66299,
            "F1": 0.7471811914389239,
            "Precision": 0.6198038532384129,
            "Recall": 0.9404566313523313
        },
        "test_time": 0.0629038589540869,
        "train_time": 4.395623447024263

In [34]:
del lgbm_clf_pipeline