# XGBoost example - Higgs boson 

In this notebook we will use [XGBoost](https://xgboost.readthedocs.io/en/latest/) to separate Higgs boson signal events from background events and compare the performance of this model against similar models. The dataset is taken from the [Kaggle Higgs Boson Challenge](https://www.kaggle.com/c/higgs-boson/) from the [HiggsML challenge](https://higgsml.lal.in2p3.fr/) and is a modified version of the [HIGGS dataset](https://archive.ics.uci.edu/ml/datasets/HIGGS) from the UCI repository. The dataset consists of 250,000 events.

This example also shows how fast and efficient the XGBoost model is: it can train on the full dataset in a few seconds.

In [1]:
# First of all lets import the libraries we need
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost.sklearn import XGBClassifier
import xgboost as xgb

In [None]:
# Read the training set directly from the zipped CSV archive
df = pd.read_csv(
    filepath_or_buffer='data/higgs-boson/training.zip',
    compression='zip',
)

# Preview the first rows of the loaded frame
df.head()

In [None]:
# Display the column names of the loaded frame
df.columns

Some details to get started:

- all variables are floating point, except PRI_jet_num which is integer
- variables prefixed with PRI (for PRImitives) are “raw” quantities about the bunch collision as measured by the detector.
- variables prefixed with DER (for DERived) are quantities computed from the primitive features, which were selected by the physicists of ATLAS
- it can happen that for some entries some variables are meaningless or cannot be computed; in this case, their value is −999.0, which is outside the normal range of all variables

In [None]:
# Dimensions of the frame as a (rows, columns) tuple
frame_shape = df.shape
print(frame_shape)

# Per-column dtype and non-null count summary
df.info()

In [None]:
# Values that are meaningless or cannot be computed are stored as the
# sentinel -999.0; convert them to NaN so pandas treats them as missing.
# One vectorised replace over the whole frame avoids calling a Python lambda
# on every cell of every column.
df = df.replace(-999.0, np.nan)

In [None]:
# Number of missing (NaN) entries in each column
df.isna().sum()

In [None]:
# Summary statistics for the numeric columns
df.describe()

In [None]:
# Encode the target as 1.0 for signal ('s') and 0.0 for everything else, then
# drop the first column.
# NOTE(review): the first column is presumably the event identifier - confirm.
#
# The guard makes the cell safe to re-run: once 'Label' is numeric the
# comparison with 's' would turn every label into 0.0 and the positional
# slice would drop one more column on each execution.
if not pd.api.types.is_numeric_dtype(df['Label']):
    df['Label'] = (df['Label'] == 's').astype(float)
    df = df.iloc[:, 1:]

In [None]:
# Pairwise correlation between all columns, drawn as a square heatmap
corr = df.corr()
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(corr, vmax=1, square=True, ax=ax)
plt.show()

In [None]:
# Class balance: absolute counts first, then relative frequencies
label_counts = df['Label'].value_counts()
label_shares = df['Label'].value_counts(normalize=True)

print(label_counts)
print('-' * 35)
print(label_shares)

In [None]:
# Correlation between the last two columns of the frame.
# NOTE(review): these are expected to be 'Weight' and 'Label' - confirm.
# The matrix is computed once and reused; the previous version computed it
# twice and discarded the first result.
weight_label_corr = df.iloc[:, -2:].corr()

plt.figure(figsize=(5, 4))
sns.heatmap(weight_label_corr, annot=True, fmt=".2f")
plt.show()

In [None]:
# Features: every column except the event weight and the target
X = df.drop(columns=['Weight', 'Label'])
# Target: the 'Label' column
y = df['Label']

In [None]:
# Hold out 20% of the events for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
)

## Dataset preparation

In [None]:
# Most models do not handle missing values, so build an imputed variant where
# NaNs are replaced by the column mean. The means are computed on the training
# split only and reused for the test split, so no test-set statistics leak
# into preprocessing and both splits are filled with the same values.
train_means = X_train.mean()
X_train_imputed = X_train.fillna(train_means)
X_test_imputed = X_test.fillna(train_means)

# Also keep a variant without imputation (NaNs preserved) for models that can
# handle missing values natively.
X_train_no_imputed = X_train.copy()
X_test_no_imputed = X_test.copy()

# Scale the imputed data: fit on train, apply to test.
scaler = StandardScaler()
X_train_imputed_scaled = scaler.fit_transform(X_train_imputed)
X_test_imputed_scaled = scaler.transform(X_test_imputed)

# Scale the non-imputed data with its own scaler. StandardScaler ignores NaNs
# when fitting and keeps them in the transformed output. The test matrix is
# built from the non-imputed test split (previously the imputed test split was
# used here by mistake, so the "not imputed" evaluation ran on imputed data).
scaler_no_imputed = StandardScaler()
X_train_no_imputed_scaled = scaler_no_imputed.fit_transform(X_train_no_imputed)
X_test_no_imputed_scaled = scaler_no_imputed.transform(X_test_no_imputed)

## Decision tree

In [None]:
# Baseline: a single shallow decision tree trained on the imputed, scaled data
dt_clf = DecisionTreeClassifier(max_depth=5, random_state=21)
dt_clf.fit(X_train_imputed_scaled, y_train)

# Hard class predictions for accuracy
y_pred = dt_clf.predict(X_test_imputed_scaled)
# Probability of the positive class (second column) for ROC AUC: the metric
# ranks events by score, so feeding it hard 0/1 labels understates it and is
# inconsistent with the probability-based AUC reported for the native
# XGBoost and LightGBM models.
y_score = dt_clf.predict_proba(X_test_imputed_scaled)[:, 1]

print("Accuracy score using Decision Tree Classifier: ", accuracy_score(y_test, y_pred))
print("ROC AUC score using Decision Tree Classifier: ", roc_auc_score(y_test, y_score))

## Random forest

In [None]:
# Random forest of 200 shallow trees trained on the imputed, scaled data
rf_clf = RandomForestClassifier(n_estimators=200, max_depth=5, random_state=21)
rf_clf.fit(X_train_imputed_scaled, y_train)

# Hard class predictions for accuracy
y_pred = rf_clf.predict(X_test_imputed_scaled)
# Positive-class probabilities for ROC AUC (a ranking metric; hard labels
# would collapse the curve to a single operating point).
y_score = rf_clf.predict_proba(X_test_imputed_scaled)[:, 1]

print("Accuracy score using Random Forest Classifier: ", accuracy_score(y_test, y_pred))
print("ROC AUC score using Random Forest Classifier: ", roc_auc_score(y_test, y_score))

## AdaBoost

In [None]:
# # importing adaboost classifier
# from sklearn.ensemble import AdaBoostClassifier
# # now lets use the AdaBoost Classifier
# ada_clf = AdaBoostClassifier(n_estimators=200, random_state=21)
# ada_clf.fit(X_train, y_train)
# y_pred = ada_clf.predict(X_test)
# print("Accuracy score using AdaBoost Classifier: ", accuracy_score(y_test, y_pred))
# print("ROC AUC score using AdaBoost Classifier: ", roc_auc_score(y_test, y_pred))

In [None]:
# AdaBoost with 200 boosting rounds trained on the imputed, scaled data
ada_clf = AdaBoostClassifier(n_estimators=200, random_state=21)
ada_clf.fit(X_train_imputed_scaled, y_train)

# Hard class predictions for accuracy
y_pred = ada_clf.predict(X_test_imputed_scaled)
# Positive-class probabilities for ROC AUC (a ranking metric; hard labels
# would collapse the curve to a single operating point).
y_score = ada_clf.predict_proba(X_test_imputed_scaled)[:, 1]

print("Accuracy score using AdaBoost Classifier: ", accuracy_score(y_test, y_pred))
print("ROC AUC score using AdaBoost Classifier: ", roc_auc_score(y_test, y_score))

## XGBoost

In [None]:
# # Using XGBoost Classifier
# xgb_clf = xgb.XGBClassifier(n_estimators=200, 
#                             max_depth=5,
#                             learning_rate=0.1,
#                             subsample=0.5,
#                             # colsample_bytree=0.5,
#                             gamma=5,
#                             n_jobs=-1, random_state=21)
# xgb_clf.fit(X_train, y_train)
# y_pred = xgb_clf.predict(X_test)
# print("Accuracy score using XGBoost Classifier: ", accuracy_score(y_test, y_pred))
# print("ROC AUC score using XGBoost Classifier: ", roc_auc_score(y_test, y_pred))

In [None]:
# XGBoost through the scikit-learn wrapper, trained once on the imputed data
# and once on the data that still contains NaNs.
# For both models accuracy uses hard class predictions, while ROC AUC uses the
# positive-class probability: it is a ranking metric and hard 0/1 labels would
# collapse the curve to a single operating point.

# --- imputed features ---
xgb_clf = XGBClassifier(n_estimators=200, max_depth=5, random_state=21)
xgb_clf.fit(X_train_imputed_scaled, y_train)
y_pred = xgb_clf.predict(X_test_imputed_scaled)
y_score = xgb_clf.predict_proba(X_test_imputed_scaled)[:, 1]
print("Imputed Accuracy score using XGBoost Classifier: ", accuracy_score(y_test, y_pred))
print("Imputed ROC AUC score using XGBoost Classifier: ", roc_auc_score(y_test, y_score))

# --- non-imputed features ---
xgb_clf2 = XGBClassifier(n_estimators=200, max_depth=5, random_state=21)
xgb_clf2.fit(X_train_no_imputed_scaled, y_train)
y_pred = xgb_clf2.predict(X_test_no_imputed_scaled)
y_score = xgb_clf2.predict_proba(X_test_no_imputed_scaled)[:, 1]
print("Not Imputed Accuracy score using XGBoost Classifier: ", accuracy_score(y_test, y_pred))
print("Not Imputed ROC AUC score using XGBoost Classifier: ", roc_auc_score(y_test, y_score))

In [None]:
# Same experiment through XGBoost's native training API (DMatrix + xgb.train)
params = {
    'objective': 'binary:logistic',
    'max_depth': 5,
    'random_state': 21,
}

# --- imputed features ---
xgb_dmatrix_imputed = xgb.DMatrix(data=X_train_imputed_scaled, label=y_train)
xgb_dmatrix_imputed_test = xgb.DMatrix(data=X_test_imputed_scaled, label=y_test)
xgb_clf3 = xgb.train(
    params=params,
    dtrain=xgb_dmatrix_imputed,
    num_boost_round=200,
)
xgb_clf3_pred = xgb_clf3.predict(xgb_dmatrix_imputed_test)
# Predictions are rounded to get class labels for accuracy; AUC uses them as-is
imputed_acc = accuracy_score(y_test, xgb_clf3_pred.round())
imputed_auc = roc_auc_score(y_test, xgb_clf3_pred)
print("Imputed Accuracy score using XGBoost Classifier: ", imputed_acc)
print("Imputed ROC AUC score using XGBoost Classifier: ", imputed_auc)

# --- non-imputed features ---
xgb_dmatrix_no_imputed = xgb.DMatrix(data=X_train_no_imputed_scaled, label=y_train)
xgb_dmatrix_no_imputed_test = xgb.DMatrix(data=X_test_no_imputed_scaled, label=y_test)
xgb_clf4 = xgb.train(
    params=params,
    dtrain=xgb_dmatrix_no_imputed,
    num_boost_round=200,
)
xgb_clf4_pred = xgb_clf4.predict(xgb_dmatrix_no_imputed_test)
no_imputed_acc = accuracy_score(y_test, xgb_clf4_pred.round())
no_imputed_auc = roc_auc_score(y_test, xgb_clf4_pred)
print("Not Imputed Accuracy score using XGBoost Classifier: ", no_imputed_acc)
print("Not Imputed ROC AUC score using XGBoost Classifier: ", no_imputed_auc)

## LightGBM

In [None]:
# LightGBM comparison on the imputed, scaled data
import lightgbm as lgb

# Wrap the train and test matrices in LightGBM datasets
# NOTE(review): lgb_test is built here but not passed to lgb.train in this cell
lgb_train = lgb.Dataset(data=X_train_imputed_scaled, label=y_train)
lgb_test = lgb.Dataset(data=X_test_imputed_scaled, label=y_test)

# Binary objective with AUC as the tracked metric
lgb_params = {
    'objective': 'binary',
    'random_state': 21,
    'metric': 'auc',
}

# Fit 200 boosting rounds
lgb_clf = lgb.train(
    params=lgb_params,
    train_set=lgb_train,
    num_boost_round=200,
)

# Score the held-out events
lgb_clf_pred = lgb_clf.predict(X_test_imputed_scaled)

# Predictions are rounded to get class labels for accuracy; AUC uses them as-is
lgb_acc = accuracy_score(y_test, lgb_clf_pred.round())
lgb_auc = roc_auc_score(y_test, lgb_clf_pred)
print("Accuracy score using LightGBM Classifier: ", lgb_acc)
print("ROC AUC score using LightGBM Classifier: ", lgb_auc)