# XGBoost - Extreme Gradient Boosting

XGBoost is a supervised learning algorithm that implements a process called boosting to yield accurate models.

In [None]:
# This automatically times every cell's execution
!pip install ipython-autotime
%load_ext autotime

In [None]:
import xgboost as xgb
import numpy as np
from sklearn.datasets import fetch_covtype
from sklearn.model_selection import train_test_split
import time

In [None]:
# Load the covtype dataset through scikit-learn and unpack features / labels
cov = fetch_covtype()
X, y = cov.data, cov.target

In [None]:
# Keep 75% of the rows for training and hold out 25% for testing;
# the fixed random_state makes the split repeatable between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    train_size=0.75,
    random_state=42,
)

In [None]:
# Convert input data from numpy to XGBoost format
# (each DMatrix bundles a feature matrix with its labels for xgb.train)
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Number of boosting rounds passed to every xgb.train call below; also reused
# as n_estimators for the scikit-learn wrapper
num_round = 10
# Reference tree depth: used directly as max_depth for depthwise growth and
# as 2**maxdepth for max_leaves in the lossguide cells
maxdepth = 6
# base parameters shared by the native-API benchmarks; each benchmark cell
# overrides tree_method / grow_policy / max_depth / max_leaves before training
# NOTE(review): 'n_estimators', 'missing', 'silent' and 'verbose' look like
# scikit-learn-wrapper settings; the round count actually passed to xgb.train
# is num_round -- confirm whether these keys are honoured or ignored
param = {'tree_method': 'gpu_hist',
         'grow_policy': 'depthwise',
         'max_depth': maxdepth,
         'random_state': 1234,
         'objective': 'multi:softmax', # Specify multiclass classification
         # NOTE(review): presumably 8 because the labels are not zero-based -- confirm
         'num_class': 8, # Number of possible output classes
         'base_score': 0.5,
         'booster': 'gbtree',
         'colsample_bylevel': 1,
         'colsample_bytree': 1,
         'gamma': 0,
         'learning_rate': 0.1, 
         'max_delta_step': 0,
         'min_child_weight': 1,
         'missing': None,
         'n_estimators': 3,
         'scale_pos_weight': 1,
         'silent': True,
         'subsample': 1,
         'verbose': True,
         'n_jobs': -1
         }

In [None]:
# Native API benchmark: GPU histogram algorithm with depth-wise tree growth
param.update({
    'tree_method': 'gpu_hist',
    'grow_policy': 'depthwise',
    'max_depth': maxdepth,
    'max_leaves': 0,
})
gpu_res = {}  # receives the evaluation history recorded during training
tmp = time.time()  # wall-clock start of the run
# Train the model, evaluating on the held-out DMatrix under the name 'test'
xgb.train(
    param,
    dtrain,
    num_boost_round=num_round,
    evals=[(dtest, 'test')],
    evals_result=gpu_res,
)
print("GPU Training Time: %s seconds" % (str(time.time() - tmp)))

In [None]:
# Native API benchmark: GPU histogram algorithm with loss-guided tree growth.
# max_depth is set to 0 and the tree size is bounded by 2**maxdepth leaves instead.
param.update({
    'tree_method': 'gpu_hist',
    'grow_policy': 'lossguide',
    'max_depth': 0,
    'max_leaves': np.power(2, maxdepth),
})
gpu_res = {}  # receives the evaluation history recorded during training
tmp = time.time()  # wall-clock start of the run
# Train the model, evaluating on the held-out DMatrix under the name 'test'
xgb.train(
    param,
    dtrain,
    num_boost_round=num_round,
    evals=[(dtest, 'test')],
    evals_result=gpu_res,
)
print("GPU Training Time: %s seconds" % (str(time.time() - tmp)))

In [None]:
# Native API benchmark: CPU histogram algorithm with depth-wise tree growth
param.update({
    'tree_method': 'hist',
    'grow_policy': 'depthwise',
    'max_depth': maxdepth,
    'max_leaves': 0,
})
cpu_res = {}  # receives the evaluation history recorded during training
tmp = time.time()  # wall-clock start of the run
# Train the model, evaluating on the held-out DMatrix under the name 'test'
xgb.train(
    param,
    dtrain,
    num_boost_round=num_round,
    evals=[(dtest, 'test')],
    evals_result=cpu_res,
)
print("CPU Training Time: %s seconds" % (str(time.time() - tmp)))

In [None]:
# Native API benchmark: CPU histogram algorithm with loss-guided tree growth.
# max_depth is set to 0 and the tree size is bounded by 2**maxdepth leaves instead.
param.update({
    'tree_method': 'hist',
    'grow_policy': 'lossguide',
    'max_depth': 0,
    'max_leaves': np.power(2, maxdepth),
})
cpu_res = {}  # receives the evaluation history recorded during training
tmp = time.time()  # wall-clock start of the run
# Train the model, evaluating on the held-out DMatrix under the name 'test'
xgb.train(
    param,
    dtrain,
    num_boost_round=num_round,
    evals=[(dtest, 'test')],
    evals_result=cpu_res,
)
print("CPU Training Time: %s seconds" % (str(time.time() - tmp)))

In [None]:
# SKLEARN API XGBOOST PARAMETERS
# Constructor arguments shared by every xgb.XGBClassifier built below; the
# individual cells override tree_method / grow_policy / max_depth / max_leaves.
# Each key appears exactly once: a dict literal silently keeps only the last
# value of a repeated key, which hides typos and conflicting settings.
kwargs = {'grow_policy': 'depthwise',
          'eval_metric': 'mlogloss',
          'num_class': 8,
          'objective': 'multi:softmax',
          'n_estimators': num_round,  # same number of rounds as the native-API runs
          'tree_method': "gpu_hist",
          'max_depth': maxdepth,
          'random_state': 1234,
          'n_jobs': -1,
          'silent': True,
          'debug_verbose': 2,
          'verbose': True}
# Metric name passed to model.fit(...) and read back from evals_result() when plotting
eval_metric = 'mlogloss'

In [None]:
# Scikit-learn wrapper: GPU histogram algorithm with depth-wise tree growth
kwargs.update({
    'tree_method': 'gpu_hist',
    'grow_policy': "depthwise",
    'max_depth': maxdepth,
    'max_leaves': 0,
})
model = xgb.XGBClassifier(**kwargs)
# Fit on the training split and evaluate on the test split while training
model.fit(
    X=X_train,
    y=y_train,
    verbose=True,
    eval_set=[(X_test, y_test)],
    eval_metric=eval_metric,
)
print(model.get_params())

In [None]:
# Drop the reference to the previous classifier before the next one is built
# (presumably so its memory can be reclaimed -- confirm)
del model

In [None]:
# Scikit-learn wrapper: GPU histogram algorithm with loss-guided tree growth.
# max_depth is set to 0 and the tree size is bounded by 2**maxdepth leaves instead.
kwargs.update({
    'tree_method': 'gpu_hist',
    'grow_policy': "lossguide",
    'max_depth': 0,
    'max_leaves': np.power(2, maxdepth),
})
model = xgb.XGBClassifier(**kwargs)
# Fit on the training split and evaluate on the test split while training
model.fit(
    X=X_train,
    y=y_train,
    verbose=True,
    eval_set=[(X_test, y_test)],
    eval_metric=eval_metric,
)
print(model.get_params())

In [None]:
# Scikit-learn wrapper: CPU histogram algorithm with depth-wise tree growth
kwargs.update({
    'tree_method': 'hist',
    'grow_policy': "depthwise",
    'max_depth': maxdepth,
    'max_leaves': 0,
})
model = xgb.XGBClassifier(**kwargs)
# Fit on the training split and evaluate on the test split while training
model.fit(
    X=X_train,
    y=y_train,
    verbose=True,
    eval_set=[(X_test, y_test)],
    eval_metric=eval_metric,
)
print(model.get_params())

In [None]:
# Scikit-learn wrapper: CPU histogram algorithm with loss-guided tree growth.
# max_depth is set to 0 and the tree size is bounded by 2**maxdepth leaves instead.
kwargs.update({
    'tree_method': 'hist',
    'grow_policy': "lossguide",
    'max_depth': 0,
    'max_leaves': np.power(2, maxdepth),
})
model = xgb.XGBClassifier(**kwargs)
# Fit on the training split and evaluate on the test split while training
model.fit(
    X=X_train,
    y=y_train,
    verbose=True,
    eval_set=[(X_test, y_test)],
    eval_metric=eval_metric,
)
print(model.get_params())

In [None]:
###############################################
#
# GPU DEMO of feature importance
#
###############################################
# NOTE(review): this cell scores whatever `model` currently holds; whether that
# is a GPU- or CPU-trained classifier depends on which fit cell ran last.
# Prediction
y_pred = model.predict(X_test)
# Round each prediction to the nearest integer before comparing with the labels
predictions = [round(value) for value in y_pred]
# evaluate predictions
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, predictions)
print("Test Set Accuracy: %.2f%%" % (accuracy * 100.0))

# Retrieve performance metrics
import matplotlib.pyplot as plt
results = model.evals_result()
# evals_result() holds one 'validation_<i>' entry per (X, y) pair that was
# passed as eval_set to fit(). The fit cells in this notebook pass either
# [(test)] or [(train), (test)], so pick the labels from the number of entries
# instead of assuming 'validation_1' exists (which raised KeyError after a
# single-set fit) or that 'validation_0' is always the training split.
curve_labels = ['Train', 'Test'] if len(results) > 1 else ['Test']
epochs = len(results['validation_0']['mlogloss'])
x_axis = range(0, epochs)
# plot log loss
fig, ax = plt.subplots(figsize=(14, 14))
for set_index, curve_label in enumerate(curve_labels):
    ax.plot(x_axis, results['validation_%d' % set_index]['mlogloss'], label=curve_label)
ax.legend()
plt.ylabel('Multi-Class LogLoss')
plt.title('XGBoost Multi-Class LogLoss')
plt.show()

# plot feature importance using built-in function
from xgboost import plot_importance
# plot feature importance
fig, ax = plt.subplots(figsize=(14, 14))
plot_importance(model, ax=ax)
plt.show()

In [None]:
# new version using xgbfi-C++
def calc_varimp(context, model, columns, how_many, n_trees, from_depth, to_depth, **kwargs):
    """Rank features and feature interactions of a fitted model by normalised gain.

    Args:
        context: Unused; kept so existing callers do not break.
        model: Fitted estimator exposing ``_Booster`` and ``get_params()``.
        columns: Feature names. They are assigned to the booster and used to
            find features that are absent from the depth-0 results.
        how_many: Maximum number of rows kept per interaction depth.
        n_trees: Forwarded to ``booster.get_feature_interactions``
            (presumably the number of trees to inspect -- confirm).
        from_depth: Smallest ``fi_depth`` kept (inclusive).
        to_depth: Largest ``fi_depth`` kept (inclusive); also forwarded to
            ``booster.get_feature_interactions``.
        **kwargs: Forwarded to ``try_calc_varimp_gblinear``.

    Returns:
        DataFrame with columns ``Interaction``, ``Depth`` and ``Gain``. Each
        gain is divided by the largest gain found at the same depth.
    """
    booster = model._Booster
    booster.feature_names = columns

    max_xgbfi_tree_depth = 7
    max_xgbfi_tree_deepening = -1

    # n_jobs may be None when the estimator was built without it, so format it
    # with %s; "%d" % None raises TypeError before any work is done.
    n_jobs = model.get_params()['n_jobs']
    print("n_jobs=%s" % n_jobs)
    df_imp = booster.get_feature_interactions(to_depth,
                                              max_xgbfi_tree_depth,
                                              max_xgbfi_tree_deepening,
                                              n_trees,
                                              nthread=n_jobs)
    # No tree interactions reported: try reading linear-booster weights instead.
    if df_imp.shape[0] == 0:
        df_imp = try_calc_varimp_gblinear(booster=booster, columns=columns, **kwargs)

    # Still nothing: fall back to listing every feature with equal gain.
    if df_imp.shape[0] == 0:
        df_imp['fi'] = columns
        df_imp['fi_depth'] = 0
        df_imp['gain'] = 1.0

    depth_from_to = (from_depth <= df_imp.fi_depth) & (df_imp.fi_depth <= to_depth)
    df_imp = df_imp[['fi', 'fi_depth', 'gain']].loc[depth_from_to].reset_index(drop=True)
    df_imp['fi_depth'] = df_imp['fi_depth'].astype(int)

    # Features that never appear at depth 0 are appended with zero gain so the
    # depth-0 listing covers every column (only when depth 0 was requested).
    df_imp_feats = df_imp.loc[df_imp.fi_depth == 0, 'fi'].values
    missing_feats = np.setdiff1d(columns, df_imp_feats)
    if len(missing_feats) > 0 and from_depth == 0:
        df_missing_feats = pd.DataFrame()
        df_missing_feats['fi'] = missing_feats
        df_missing_feats['fi_depth'] = 0
        df_missing_feats['gain'] = 0.0
        df_imp = pd.concat((df_imp, df_missing_feats)) \
            .sort_values(by=['fi_depth', 'gain'], ascending=[True, False]) \
            .reset_index(drop=True)

    # Normalise first, then group again for the per-depth cut. This avoids
    # relying on a groupby object that was created before the frame was mutated.
    df_imp['gain'] = df_imp.groupby('fi_depth')['gain'].transform(lambda x: (x / x.max()))
    df_imp = df_imp.groupby('fi_depth').head(how_many).reset_index(drop=True)
    df_imp.columns = ['Interaction', 'Depth', 'Gain']

    return df_imp


def try_calc_varimp_gblinear(booster, columns, **kwargs):
    """Derive per-feature importance from the weights in a linear-booster dump.

    The first entry of ``booster.get_dump()`` is parsed as text: the lines
    between the first line and ``weight:`` give the number of models, and the
    lines after ``weight:`` (except the last one) are the weights. The
    importance of a feature is the mean absolute weight across the models,
    with each weight rounded to six decimals first.

    Args:
        booster: Object whose ``get_dump()`` returns a list of text dumps.
        columns: Feature names, one per row of the reshaped weight table.
        **kwargs: Accepted for call compatibility; not used.

    Returns:
        DataFrame with columns ``fi``, ``fi_depth`` (always 0) and ``gain``,
        sorted by descending gain. The frame is empty when there is no dump or
        the dump has no ``weight:`` section.
    """
    df_imp = pd.DataFrame()
    dumps = booster.get_dump()
    # An empty dump list has nothing to parse; treat it like a non-linear dump.
    if not dumps or 'weight:' not in dumps[0]:
        return df_imp
    dump = dumps[0]
    n_models = len(dump.split('\nweight:')[0].split('\n')[1:])
    # Skip the header line, the n_models lines that follow it and the
    # 'weight:' line; the final element is dropped (for a dump ending in a
    # newline it is the empty string left by split).
    weight_lines = dump.split('\n')[2 + n_models:-1]
    # Use the builtin float: the np.float alias was removed from NumPy and
    # raises AttributeError on current releases.
    imp = [abs(float("{:.6f}".format(float(w)))) for w in weight_lines]
    imp = np.array(imp).reshape((-1, n_models)).mean(axis=1)
    df_imp['fi'] = columns
    df_imp['fi_depth'] = 0
    df_imp['gain'] = imp
    df_imp = df_imp.sort_values(by='gain', ascending=False).reset_index(drop=True)
    return df_imp

In [None]:
context = None  # calc_varimp accepts a context argument but does not use it
import pandas as pd
pd_X_train = pd.DataFrame(X_train)
booster = model._Booster
# Name the features "c<label>" after the column labels of the training frame
booster.feature_names = ["c"+str(x) for x in pd_X_train.columns.values]
# Extra keyword arguments forwarded to calc_varimp. They live under their own
# name so the shared `kwargs` dict of XGBClassifier settings, which later
# cells keep updating, is not overwritten with an empty dict.
varimp_kwargs = {}
# params: context, model, columns, N, n_trees, from_depth, to_depth, **kwargs)
Nimportance = 450
# best_ntree_limit can be missing when the model was fitted without early
# stopping; fall back to the configured number of estimators in that case.
Ntrees = getattr(model, 'best_ntree_limit', model.get_params()['n_estimators'])
from_depth = 0
to_depth = maxdepth
imp_features = calc_varimp(context, model, booster.feature_names, Nimportance, Ntrees, from_depth, to_depth, **varimp_kwargs)

In [None]:
# Rows at interaction depth 0, which calc_varimp treats as the individual features
imp_features[imp_features.Depth==0]

In [None]:
# Rows at interaction depth 1 (presumably pairwise feature interactions -- confirm)
imp_features[imp_features.Depth==1]

In [None]:
###############################################
#
# GPU DEMO of feature importance
#
###############################################
# Longer GPU run through the scikit-learn wrapper: loss-guided growth, up to
# num_round_more rounds, stopped early via early_stopping_rounds=20.
num_round_more = 200
maxdepth_more = 6
# SKLEARN GPU HIST LOSSGUIDE
kwargs.update({
    'tree_method': 'gpu_hist',
    'grow_policy': "lossguide",
    'max_depth': 0,
    'max_leaves': np.power(2, maxdepth_more),
    'n_estimators': num_round_more,
    'n_jobs': 1,
})
model = xgb.XGBClassifier(**kwargs)
# Two evaluation sets: the training split first, the test split second
eval_set = [(X_train, y_train), (X_test, y_test)]
model.fit(
    X=X_train,
    y=y_train,
    verbose=True,
    eval_set=eval_set,
    eval_metric=eval_metric,
    early_stopping_rounds=20,
)
print("Number of trees for best model: %d" % model.best_ntree_limit)
print("Model parameters: %s" % str(model.get_params()))


In [None]:
%%time
# Set to anything other than 1 to skip the CPU comparison run
dofinalcputest = 1
if dofinalcputest == 1:
    ###############################################
    #
    # CPU Model (compare time to GPU DEMO of feature importance) .
    #
    ###############################################
    # Same loss-guided configuration as the GPU run, but on the CPU 'hist'
    # algorithm.
    # NOTE(review): n_jobs is not set here, so it keeps whatever value an
    # earlier cell stored in kwargs -- confirm that is the intended CPU setup.
    # SKLEARN CPU HIST LOSSGUIDE
    kwargs.update({
        'tree_method': 'hist',
        'grow_policy': "lossguide",
        'max_depth': 0,
        'max_leaves': np.power(2, maxdepth_more),
        'n_estimators': num_round_more,
    })
    model = xgb.XGBClassifier(**kwargs)
    # Two evaluation sets: the training split first, the test split second
    eval_set = [(X_train, y_train), (X_test, y_test)]
    model.fit(
        X=X_train,
        y=y_train,
        verbose=True,
        eval_set=eval_set,
        eval_metric=eval_metric,
        early_stopping_rounds=20,
    )
    print("Number of trees for best model: %d" % model.best_ntree_limit)
    print("Model parameters: %s" % str(model.get_params()))