# Experiment 06: HIGGS boson 

This experiment uses the data from the [HIGGS dataset](https://archive.ics.uci.edu/ml/datasets/HIGGS) to predict the appearance of the Higgs boson. The dataset consists of 11 million observations. More information about the data can be found in [loaders.py](libs/loaders.py).  

For details of the virtual machine we used and the versions of LightGBM and XGBoost, please refer to [experiment 1](01_airline.ipynb).

In [2]:
import json
import sys
import matplotlib.pyplot as plt
import pkg_resources
from libs.loaders import load_higgs
from libs.timer import Timer
from libs.utils import get_number_processors
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import warnings
# Silence every warning category for the remainder of the session.
warnings.filterwarnings('ignore')

# Record the interpreter and library versions that produced the timings below.
print("System version: {}".format(sys.version))
for library_label, distribution_name in (("XGBoost", "xgboost"), ("LightGBM", "lightgbm")):
    installed_version = pkg_resources.get_distribution(distribution_name).version
    print("{} version: {}".format(library_label, installed_version))

System version: 3.5.3 |Anaconda 4.4.0 (64-bit)| (default, Mar  6 2017, 11:58:13) 
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]
XGBoost version: 0.6
LightGBM version: 0.2


In [3]:
%%time
# Load the HIGGS table through the project loader and report its (rows, columns)
# shape; the %%time cell magic above reports how long the load took.
df = load_higgs()
print(df.shape)

INFO:libs.loaders:MOUNT_POINT not found in environment. Defaulting to /fileshare


(11000000, 29)
CPU times: user 1min 12s, sys: 6.31 s, total: 1min 18s
Wall time: 4min 15s


In [4]:
# Preview the first rows to check the column layout ('boson' is the target column).
df.head()

Unnamed: 0,boson,lepton_pT,lepton_eta,lepton_phi,missing_energy_magnitude,missing_energy_phi,jet_1_pt,jet_1_eta,jet_1_phi,jet_1_b-tag,...,jet_4_eta,jet_4_phi,jet_4_b-tag,m_jj,m_jjj,m_lv,m_jlv,m_bb,m_wbb,m_wwbb
0,1.0,0.869293,-0.635082,0.22569,0.32747,-0.689993,0.754202,-0.248573,-1.092064,0.0,...,-0.010455,-0.045767,3.101961,1.35376,0.979563,0.978076,0.920005,0.721657,0.988751,0.876678
1,1.0,0.907542,0.329147,0.359412,1.49797,-0.31301,1.095531,-0.557525,-1.58823,2.173076,...,-1.13893,-0.000819,0.0,0.30222,0.833048,0.9857,0.978098,0.779732,0.992356,0.798343
2,1.0,0.798835,1.470639,-1.635975,0.453773,0.425629,1.104875,1.282322,1.381664,0.0,...,1.128848,0.900461,0.0,0.909753,1.10833,0.985692,0.951331,0.803252,0.865924,0.780118
3,0.0,1.344385,-0.876626,0.935913,1.99205,0.882454,1.786066,-1.646778,-0.942383,0.0,...,-0.678379,-1.360356,0.0,0.946652,1.028704,0.998656,0.728281,0.8692,1.026736,0.957904
4,1.0,1.105009,0.321356,1.522401,0.882808,-1.205349,0.681466,-1.070464,-0.921871,0.0,...,-0.373566,0.113041,0.0,0.755856,1.361057,0.98661,0.838085,1.133295,0.872245,0.808487


In [5]:
# Number of boosting rounds; passed as n_estimators to all three classifiers.
num_rounds = 200
# Processor count reported by the project helper; passed as nthread to each classifier.
number_processors = get_number_processors()
print(number_processors)

24


In [6]:
# Baseline XGBoost classifier. tree_method is not set here, so the library
# default is used.
xgb_clf_pipeline = XGBClassifier(
    # boosting schedule
    n_estimators=num_rounds,
    learning_rate=0.1,
    # tree shape and split constraints
    max_depth=5,
    min_child_weight=1,
    gamma=0.1,
    # regularisation and row sampling
    reg_lambda=1,
    subsample=1,
    # class weighting and parallelism
    scale_pos_weight=2,
    nthread=number_processors,
)

In [7]:
# XGBoost with the histogram tree builder and loss-guided growth. Tree size is
# bounded by max_leaves (2**5); max_depth=0 removes the depth limit per the
# XGBoost parameter docs.
xgb_hist_clf_pipeline = XGBClassifier(
    # tree construction strategy
    tree_method='hist',
    grow_policy='lossguide',
    max_leaves=2**5,
    max_depth=0,
    # boosting schedule
    n_estimators=num_rounds,
    learning_rate=0.1,
    # split constraints
    min_child_weight=1,
    gamma=0.1,
    # regularisation and row sampling
    reg_lambda=1,
    subsample=1,
    # class weighting and parallelism
    scale_pos_weight=2,
    nthread=number_processors,
)

In [8]:
# LightGBM classifier. num_leaves uses the same 2**5 as max_leaves in the
# XGBoost histogram model, and min_split_gain is set to the same 0.1 that the
# XGBoost configurations use for gamma.
lgbm_clf_pipeline = LGBMClassifier(
    # boosting schedule
    n_estimators=num_rounds,
    learning_rate=0.1,
    # tree shape and split constraints
    num_leaves=2**5,
    min_child_weight=1,
    min_split_gain=0.1,
    # regularisation and row sampling
    reg_lambda=1,
    subsample=1,
    # class weighting and parallelism
    scale_pos_weight=2,
    nthread=number_processors,
)

In [9]:
# Scoring functions applied to every model, keyed by the label that appears
# in the final results report.
metrics_dict = dict(
    Accuracy=accuracy_score,
    Precision=precision_score,
    Recall=recall_score,
    AUC=roc_auc_score,
    F1=f1_score,
)

def classification_metrics(metrics, y_true, y_pred, y_score=None, score_metrics=('AUC',)):
    """Evaluate every metric in ``metrics`` and return the results keyed by name.

    Threshold metrics (accuracy, precision, recall, F1) need hard class labels,
    while ranking metrics such as ROC AUC are meant to be computed from
    continuous scores. Passing ``y_score`` lets both kinds be evaluated
    correctly in a single call; without it every metric receives ``y_pred``,
    which is the original behaviour.

    Parameters
    ----------
    metrics : dict
        Maps a metric name to a callable invoked as ``metric(y_true, y_hat)``.
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predictions handed to every metric by default.
    y_score : array-like, optional
        Continuous scores (for example positive-class probabilities). When
        given, the metrics named in ``score_metrics`` receive these instead
        of ``y_pred``.
    score_metrics : str or iterable of str, optional
        Names (keys of ``metrics``) that should be fed ``y_score``.
        Defaults to ``('AUC',)``.

    Returns
    -------
    dict
        ``{metric_name: metric_value}`` with one entry per key of ``metrics``.
    """
    # A bare string would otherwise be split into single characters.
    if isinstance(score_metrics, str):
        score_metrics = (score_metrics,)
    score_metric_names = frozenset(score_metrics)

    def _prediction_for(metric_name):
        # Route scores only to the metrics that asked for them.
        if y_score is not None and metric_name in score_metric_names:
            return y_score
        return y_pred

    return {metric_name: metric(y_true, _prediction_for(metric_name))
            for metric_name, metric in metrics.items()}

In [10]:
def generate_feables(df):
    """Split ``df`` into a feature frame and the ``'boson'`` target column.

    Returns a ``(X, y)`` tuple: ``X`` holds every column except ``'boson'``
    (in the order produced by ``Index.difference``) and ``y`` is the
    ``'boson'`` column itself.
    """
    target_column = 'boson'
    feature_columns = df.columns.difference([target_column])
    return df[feature_columns], df[target_column]

In [11]:
# Split the loaded frame into the feature matrix X and the 'boson' target y.
X, y = generate_feables(df)

In [12]:
# Stratified hold-out split. An integer test_size is an absolute row count, so
# 500,000 rows are held out for testing; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=77, test_size=500000)

In [13]:
# Collects train/test times and metrics per model; entries are stored under
# the keys 'xgb', 'xgb_hist' and 'lgbm'.
results_dict = dict()

### XGBoost

In [14]:
# Time training and prediction separately; the elapsed values are read back
# from train_t.interval / test_t.interval when the 'xgb' results are recorded.
with Timer() as train_t:
    xgb_clf_pipeline.fit(X_train,y_train)
    
with Timer() as test_t:
    # The predict() output stored in y_pred is what gets scored for this model.
    y_pred = xgb_clf_pipeline.predict(X_test)

In [15]:
# Record timings and scores for the baseline XGBoost model. y_pred holds the
# predict() output, so every metric (AUC included) is scored on those predictions.
results_dict['xgb'] = dict(
    train_time=train_t.interval,
    test_time=test_t.interval,
    performance=classification_metrics(metrics_dict, y_test, y_pred),
)

In [16]:
# Train the histogram-based XGBoost model under a timer; the elapsed time is
# read back later as t_train.interval.
with Timer() as t_train:
    xgb_hist_clf_pipeline.fit(X_train,y_train)

In [17]:
# Time prediction on the held-out set; y_pred is overwritten with this model's
# predict() output and the elapsed time is read back later as t_test.interval.
with Timer() as t_test:
    y_pred = xgb_hist_clf_pipeline.predict(X_test)

In [18]:
# Record timings and scores for the histogram-based XGBoost model. y_pred holds
# the predict() output, so every metric (AUC included) is scored on those predictions.
results_dict['xgb_hist'] = dict(
    train_time=t_train.interval,
    test_time=t_test.interval,
    performance=classification_metrics(metrics_dict, y_test, y_pred),
)

### LightGBM

In [19]:
# Time LightGBM training and prediction separately.
# NOTE: train_t, test_t and y_pred reuse the names from the XGBoost cell, so the
# 'lgbm' results must be recorded after this cell runs, and the 'xgb' results
# cell must not be re-run afterwards without re-running its own fit/predict cell.
with Timer() as train_t:
    lgbm_clf_pipeline.fit(X_train, y_train)
    
with Timer() as test_t:
    y_pred = lgbm_clf_pipeline.predict(X_test)

In [20]:
# Record timings and scores for the LightGBM model. y_pred holds the predict()
# output, so every metric (AUC included) is scored on those predictions.
results_dict['lgbm'] = dict(
    train_time=train_t.interval,
    test_time=test_t.interval,
    performance=classification_metrics(metrics_dict, y_test, y_pred),
)

In [21]:
# Results
# Dump the collected timings and metrics as indented JSON with keys sorted
# alphabetically, so the models appear in a stable order.
print(json.dumps(results_dict, indent=4, sort_keys=True))

{
    "lgbm": {
        "performance": {
            "AUC": 0.694682949690134,
            "Accuracy": 0.707758,
            "F1": 0.7680747894958216,
            "Precision": 0.6627597069095391,
            "Recall": 0.9131831219806763
        },
        "test_time": 0.7120589099995414,
        "train_time": 119.34003880199998
    },
    "xgb": {
        "performance": {
            "AUC": 0.6859901403358623,
            "Accuracy": 0.699694,
            "F1": 0.7635493812093622,
            "Precision": 0.6551156676187414,
            "Recall": 0.9149984903381643
        },
        "test_time": 0.55617916600022,
        "train_time": 2996.1667750769993
    },
    "xgb_hist": {
        "performance": {
            "AUC": 0.6941216899970567,
            "Accuracy": 0.70721,
            "F1": 0.767674555527519,
            "Precision": 0.6623426413523601,
            "Recall": 0.9128434480676328
        },
        "test_time": 0.6464068210007099,
        "train_time": 121.21175534400027