# Experiment 06: HIGGS boson

This experiment uses the data from the [HIGGS dataset](https://archive.ics.uci.edu/ml/datasets/HIGGS) to predict the appearance of the Higgs boson.  

In [1]:
import sys
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import roc_auc_score,accuracy_score, precision_score, recall_score, f1_score
import json
import seaborn
import matplotlib.pyplot as plt
import pkg_resources
import warnings
from libs.loaders import load_higgs
from libs.timer import Timer
from libs.utils import get_number_processors

print("System version: {}".format(sys.version))
print("XGBoost version: {}".format(pkg_resources.get_distribution('xgboost').version))
print("LightGBM version: {}".format(pkg_resources.get_distribution('lightgbm').version))

warnings.filterwarnings("ignore", category=DeprecationWarning) 
% matplotlib inline
% load_ext autoreload
% autoreload 2

System version: 3.6.1 |Anaconda 4.4.0 (64-bit)| (default, May 11 2017, 13:09:58) 
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]
XGBoost version: 0.6
LightGBM version: 0.2


### Data loading and management

In [2]:
%%time
df = load_higgs()

MOUNT_POINT not found in environment. Defaulting to /fileshare
CPU times: user 1min 20s, sys: 5.69 s, total: 1min 26s
Wall time: 5min 11s


In [3]:
print(df.shape)
df.head()

(11000000, 29)


Unnamed: 0,boson,lepton_pT,lepton_eta,lepton_phi,missing_energy_magnitude,missing_energy_phi,jet_1_pt,jet_1_eta,jet_1_phi,jet_1_b-tag,...,jet_4_eta,jet_4_phi,jet_4_b-tag,m_jj,m_jjj,m_lv,m_jlv,m_bb,m_wbb,m_wwbb
0,1.0,0.869293,-0.635082,0.22569,0.32747,-0.689993,0.754202,-0.248573,-1.092064,0.0,...,-0.010455,-0.045767,3.101961,1.35376,0.979563,0.978076,0.920005,0.721657,0.988751,0.876678
1,1.0,0.907542,0.329147,0.359412,1.49797,-0.31301,1.095531,-0.557525,-1.58823,2.173076,...,-1.13893,-0.000819,0.0,0.30222,0.833048,0.9857,0.978098,0.779732,0.992356,0.798343
2,1.0,0.798835,1.470639,-1.635975,0.453773,0.425629,1.104875,1.282322,1.381664,0.0,...,1.128848,0.900461,0.0,0.909753,1.10833,0.985692,0.951331,0.803252,0.865924,0.780118
3,0.0,1.344385,-0.876626,0.935913,1.99205,0.882454,1.786066,-1.646778,-0.942383,0.0,...,-0.678379,-1.360356,0.0,0.946652,1.028704,0.998656,0.728281,0.8692,1.026736,0.957904
4,1.0,1.105009,0.321356,1.522401,0.882808,-1.205349,0.681466,-1.070464,-0.921871,0.0,...,-0.373566,0.113041,0.0,0.755856,1.361057,0.98661,0.838085,1.133295,0.872245,0.808487


In [4]:
%%time
X = df[df.columns.difference(['boson'])]
y = df['boson']
print(X.shape)
print(y.shape)

(11000000, 28)
(11000000,)
CPU times: user 420 ms, sys: 632 ms, total: 1.05 s
Wall time: 1.05 s


In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=77, test_size=500000)

In [6]:
pipeline_steps = [('scale', StandardScaler())]
continuous_pipeline = Pipeline(steps=pipeline_steps)
featurisers = [('continuous', continuous_pipeline)]

### XGBoost

In [7]:
results_dict = dict()

In [8]:
number_processors = get_number_processors()
print(number_processors)

20


In [9]:
xgb_clf_pipeline = Pipeline(steps=[('features', FeatureUnion(featurisers)),
                                  ('clf', XGBClassifier(max_depth=8, 
                                                        learning_rate=0.1, 
                                                        scale_pos_weight=1,
                                                        n_estimators=100,
                                                        gamma=0.1,
                                                        min_child_weight=1,
                                                        reg_lambda=1,
                                                        subsample=1,
                                                        nthread=number_processors
                                                        ))])

In [10]:
with Timer() as train_t:
    xgb_clf_pipeline.fit(X_train,y_train)
    
with Timer() as test_t:
    y_pred = xgb_clf_pipeline.predict(X_test)

In [11]:
metrics_dict = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'AUC': roc_auc_score,
    'F1': f1_score,
}

def classification_metrics(metrics, y_true, y_pred):
    return {metric_name:metric(y_true, y_pred) for metric_name, metric in metrics.items()}

In [12]:
results_dict['xgb']={
    'train_time': train_t.interval,
    'test_time': test_t.interval,
    'performance': classification_metrics(metrics_dict, 
                                          y_test, 
                                          y_pred) 
}

In [13]:
xgb_hist_clf_pipeline = Pipeline(steps=[('features', FeatureUnion(featurisers)),
                                          ('clf', XGBClassifier(max_depth=0, 
                                                                learning_rate=0.1, 
                                                                scale_pos_weight=1,
                                                                n_estimators=100,
                                                                gamma=0.1,
                                                                min_child_weight=1,
                                                                reg_lambda=1,
                                                                subsample=1,
                                                                max_leaves=2**8,
                                                                grow_policy='lossguide',
                                                                tree_method='hist',
                                                                nthread=number_processors
                                                                ))])

In [14]:
with Timer() as t_train:
    xgb_hist_clf_pipeline.fit(X_train,y_train)

In [15]:
with Timer() as t_test:
    y_pred = xgb_hist_clf_pipeline.predict(X_test)

In [16]:
results_dict['xgb_hist']={
    'train_time': t_train.interval,
    'test_time': t_test.interval,
    'performance': classification_metrics(metrics_dict, 
                                          y_test, 
                                          y_pred) 
}

### LightGBM

In [17]:
lgbm_clf_pipeline = Pipeline(steps=[('features', FeatureUnion(featurisers)),
                                      ('clf', LGBMClassifier(num_leaves=2**8, 
                                                            learning_rate=0.1, 
                                                            scale_pos_weight=1,
                                                            n_estimators=100,
                                                            min_split_gain=0.1,
                                                            min_child_weight=1,
                                                            reg_lambda=1,
                                                            subsample=1,
                                                            nthread=number_processors
                                                            ))])

In [18]:
with Timer() as train_t:
    lgbm_clf_pipeline.fit(X_train, y_train)
    
with Timer() as test_t:
    y_pred = lgbm_clf_pipeline.predict(X_test)

In [19]:
results_dict['lgbm']={
    'train_time': train_t.interval,
    'test_time': test_t.interval,
    'performance': classification_metrics(metrics_dict, 
                                          y_test, 
                                          y_pred) 
}

In [20]:
# Results
print(json.dumps(results_dict, indent=4, sort_keys=True))

{
    "lgbm": {
        "performance": {
            "AUC": 0.7466549865702785,
            "Accuracy": 0.74803,
            "F1": 0.763997459860742,
            "Precision": 0.758443703067293,
            "Recall": 0.769633152173913
        },
        "test_time": 0.9962920039979508,
        "train_time": 156.9801655629999
    },
    "xgb": {
        "performance": {
            "AUC": 0.7399493145683087,
            "Accuracy": 0.741266,
            "F1": 0.7573493691162375,
            "Precision": 0.7528012797327178,
            "Recall": 0.761952747584541
        },
        "test_time": 0.6953976299992064,
        "train_time": 566.0032262919995
    },
    "xgb_hist": {
        "performance": {
            "AUC": 0.7470536454582302,
            "Accuracy": 0.748424,
            "F1": 0.7643545734186085,
            "Precision": 0.7588360449632126,
            "Recall": 0.7699539553140097
        },
        "test_time": 0.7701418910000939,
        "train_time": 154.91502592400502
 