# Using Bayesian Optimization for Hyperparameter Tuning in XGBoost

In [21]:
# Dataset
from sklearn import datasets

# dependencies
import numpy as np
import pandas as pd

# Standardize data
from sklearn.preprocessing import StandardScaler

# Model and performance evaluation
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import precision_recall_fscore_support as score

# Hyperparameter tuning
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV, RandomizedSearchCV
from hyperopt import tpe, STATUS_OK, Trials, hp, fmin, space_eval


This notebook uses the Breast Cancer Wisconsin dataset that ships with scikit-learn (`sklearn.datasets`).

In [2]:
# Load the breast cancer dataset bundled with scikit-learn
data = datasets.load_breast_cancer()

# Build a DataFrame from the feature matrix and attach the labels as 'target'
df = pd.DataFrame(data.data, columns=data.feature_names).assign(target=data.target)

# Summarise column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

In the `target` column, 0 represents a malignant tumor (a positive breast cancer diagnosis) and 1 represents a benign tumor (a negative diagnosis).

In [3]:
# Class balance of the target, expressed as a fraction of all rows
df.loc[:, 'target'].value_counts(normalize=True)

1    0.627417
0    0.372583
Name: target, dtype: float64

So 62.7% of the samples here are benign (negative for breast cancer), and 37.3% are malignant (positive).

In [5]:
# Hold out 20% of the rows for testing (an 80/20 train/test split).
# The features are every column except 'target'.
X_train, X_test, y_train, y_test = train_test_split(
    df[df.columns.difference(['target'])],
    df['target'],
    test_size=0.2,
    random_state=42,
)

# Report how many rows landed in each partition
print(f'The number of records in training has {len(X_train)} records.')
print(f'The testing set has {len(X_test)} records.')


The number of records in training has 455 records.
The testing set has 114 records.


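Standardization rescales the features onto the same scale. The scaling statistics (mean and standard deviation) should be learned from the training set only, so that no information from the test set is used; the test set is then scaled with those same statistics. StandardScaler is used here.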

In [6]:
# Initiate the scaler
sc = StandardScaler()

# Learn the per-feature mean/std from the training set and standardize it
X_train_transformed = pd.DataFrame(sc.fit_transform(X_train),
                                   index=X_train.index,
                                   columns=X_train.columns)

# Standardize the test set with the statistics learned from the training set.
# Use transform(), not fit_transform(): re-fitting on the test data would
# scale it with different parameters than the ones the model is trained on
# and would let test-set information into the preprocessing step.
X_test_transformed = pd.DataFrame(sc.transform(X_test),
                                  index=X_test.index,
                                  columns=X_test.columns)

# Summary statistics after standardization
X_train_transformed.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
area error,455.0,1.5616320000000002e-17,1.001101,-0.705091,-0.464164,-0.325347,0.077435,10.641841
compactness error,455.0,-2.401498e-15,1.001101,-1.258102,-0.694353,-0.280607,0.358304,5.905671
concave points error,455.0,3.279428e-16,1.001101,-1.891775,-0.668493,-0.126279,0.437566,6.504667
concavity error,455.0,2.347329e-16,1.001101,-1.022218,-0.55134,-0.207836,0.303371,11.310294
fractal dimension error,455.0,5.050905e-16,1.001101,-1.050856,-0.573964,-0.218908,0.24534,9.34587
mean area,455.0,-2.627935e-16,1.001101,-1.365036,-0.660205,-0.289597,0.319339,5.208312
mean compactness,455.0,1.003349e-15,1.001101,-1.607228,-0.777087,-0.24134,0.528128,3.964311
mean concave points,455.0,5.78048e-16,1.001101,-1.26991,-0.734905,-0.391123,0.673757,4.022271
mean concavity,455.0,9.782163e-16,1.001101,-1.119899,-0.750539,-0.344646,0.547387,4.256736
mean fractal dimension,455.0,-3.347993e-15,1.001101,-1.776889,-0.709792,-0.177285,0.464223,4.815921


In [7]:
# Summary statistics of the raw (unscaled) training features, for comparison
X_train.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
area error,455.0,40.071299,47.236319,6.802,18.17,24.72,43.725,542.2
compactness error,455.0,0.025635,0.018607,0.002252,0.01273,0.02042,0.032295,0.1354
concave points error,455.0,0.011894,0.006294,0.0,0.007691,0.0111,0.014645,0.05279
concavity error,455.0,0.032824,0.032146,0.0,0.01512,0.02615,0.042565,0.396
fractal dimension error,455.0,0.00382,0.002787,0.000895,0.002222,0.003211,0.004504,0.02984
mean area,455.0,654.377582,354.943187,170.4,420.3,551.7,767.6,2501.0
mean compactness,455.0,0.103619,0.05247,0.01938,0.06289,0.09097,0.1313,0.3114
mean concave points,455.0,0.04828,0.03806,0.0,0.02034,0.03341,0.073895,0.2012
mean concavity,455.0,0.088898,0.079468,0.0,0.02932,0.06154,0.13235,0.4268
mean fractal dimension,455.0,0.062757,0.00721,0.04996,0.057645,0.06148,0.0661,0.09744


XGBoost Classifier with no hyperparameter tuning

XGBClassifier has the parameters listed below; usually only a subset of them is tuned:
- base_score
- booster
- colsample_bylevel
- colsample_bynode
- colsample_bytree
- gamma
- learning_rate
- max_delta_step
- max_depth
- min_child_weight
- missing
- n_estimators
- n_jobs
- nthread
- objective (e.g. 'binary:logistic')
- random_state
- reg_alpha
- reg_lambda
- scale_pos_weight
- seed
- silent
- subsample
- verbosity


In [14]:
# Build a classifier without overriding any hyperparameter
xgboost = XGBClassifier()

# Show the resulting parameter settings
xgboost.get_params(deep=True)

{'objective': 'binary:logistic',
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'device': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'gamma': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': None,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'multi_strategy': None,
 'n_estimators': None,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [16]:
# Train the baseline model on the standardized training features
xgboost = XGBClassifier(seed=0).fit(X_train_transformed, y_train)

# Predict class labels for the standardized test features
xgboost_predict = xgboost.predict(X_test_transformed)

# Predicted probability of class 1. Score the standardized test features:
# the model was fit on standardized data, so passing the unscaled X_test
# would feed it values on a different scale than it was trained on.
xgboost_predict_prob = xgboost.predict_proba(X_test_transformed)[:, 1]

In [22]:
# Per-class precision, recall, F-score and support for the baseline predictions
precision, recall, fscore, support = score(y_true=y_test, y_pred=xgboost_predict)

# Report the recall for class 1
print(f'The recall value for the baseline mode is {recall[1]:.4f}')

The recall value for the baseline mode is 0.9859


# Grid Search for XGBoost

In [23]:
# Search space for the grid search
param_grid = dict(
    # Fraction of columns randomly sampled for each tree
    colsample_bytree=[0.3, 0.5, 0.8],
    # L1 regularization on the weights; higher values give a more conservative model
    reg_alpha=[0, 0.5, 1, 5],
    # L2 regularization on the weights; higher values give a more conservative model
    reg_lambda=[0, 0.5, 1, 5],
)

# Metric(s) computed during cross-validation
scoring = ['recall']

# Stratified 3-fold cross-validation, shuffled with a fixed seed
kfold = StratifiedKFold(n_splits=3, random_state=0, shuffle=True)

In [24]:
# Define grid search: evaluate every combination in param_grid with the
# cross-validation splitter and refit the estimator on the full training set
# using the combination with the best recall.
grid_search = GridSearchCV(estimator=xgboost,
                           param_grid=param_grid,
                           scoring=scoring,
                           refit='recall',
                           n_jobs=-1,
                           cv=kfold,
                           verbose=0)
# Fit grid search
grid_result = grid_search.fit(X_train_transformed, y_train)
# Print the best score, its standard deviation across folds, and the
# corresponding hyperparameters
print(f'The best score is {grid_result.best_score_:.4f}')
print('The best score standard deviation is', round(
    grid_result.cv_results_['std_test_recall'][grid_result.best_index_], 4))
print(f'The best hyperparameters are {grid_result.best_params_}')


The best score is 0.9860
The best score standard deviation is 0.005
The best hyperparameters are {'colsample_bytree': 0.8, 'reg_alpha': 0, 'reg_lambda': 0}


In [27]:
# Class predictions on the test set from the refit best estimator
grid_predict = grid_search.predict(X_test_transformed)
# Predicted probability of class 1 from the same estimator
grid_predict_prob = grid_search.predict_proba(X_test_transformed)[:, 1]
# Per-class precision, recall, F-score and support
precision, recall, fscore, support = score(y_true=y_test, y_pred=grid_predict)
# Report the recall for class 1
print(f'The recall value for the xgboost grid search is {recall[1]:.4f}')


The recall value for the xgboost grid search is 0.9859


In [28]:
# Search space for the randomized search
param_grid = dict(
    # Learning rate shrinks the weights to make the boosting process more conservative
    learning_rate=[0.0001, 0.001, 0.01, 0.1, 1],
    # Maximum tree depth; increasing it increases model complexity
    max_depth=range(3, 21, 3),
    # Minimum loss reduction required to make a split (0.0 to 0.4)
    gamma=[tenths / 10.0 for tenths in range(5)],
    # Fraction of columns randomly sampled for each tree (0.3 to 0.9)
    colsample_bytree=[tenths / 10.0 for tenths in range(3, 10)],
    # L1 regularization on the weights; higher values give a more conservative model
    reg_alpha=[1e-5, 1e-2, 0.1, 1, 10, 100],
    # L2 regularization on the weights; higher values give a more conservative model
    reg_lambda=[1e-5, 1e-2, 0.1, 1, 10, 100],
)
# Metric(s) computed during cross-validation
scoring = ['recall']
# Stratified 3-fold cross-validation, shuffled with a fixed seed
kfold = StratifiedKFold(n_splits=3, random_state=0, shuffle=True)


In [29]:
# Define random search: sample n_iter combinations from param_grid.
# random_state fixes which combinations are sampled so that re-running the
# notebook reproduces the same search.
random_search = RandomizedSearchCV(estimator=xgboost,
                                   param_distributions=param_grid,
                                   n_iter=48,
                                   scoring=scoring,
                                   refit='recall',
                                   n_jobs=-1,
                                   cv=kfold,
                                   verbose=0,
                                   random_state=0)
# Fit random search
random_result = random_search.fit(X_train_transformed, y_train)
# Print the best score, its standard deviation across folds, and the
# corresponding hyperparameters
print(f'The best score is {random_result.best_score_:.4f}')
print('The best score standard deviation is', round(
    random_result.cv_results_['std_test_recall'][random_result.best_index_], 4))
print(f'The best hyperparameters are {random_result.best_params_}')


The best score is 1.0000
The best score standard deviation is 0.0
The best hyperparameters are {'reg_lambda': 1e-05, 'reg_alpha': 1e-05, 'max_depth': 15, 'learning_rate': 0.001, 'gamma': 0.1, 'colsample_bytree': 0.6}


# Bayesian Optimization for XGBoost

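This section uses the hyperopt library, which needs two things:

- a search space (`space`): the hyperparameter values to search over
- an objective function: the function whose return value hyperopt minimizes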


In [30]:
# Search space for hyperopt: each hyperparameter is a categorical choice
# over a fixed list of candidate values.
space = dict(
    learning_rate=hp.choice('learning_rate', [0.0001, 0.001, 0.01, 0.1, 1]),
    max_depth=hp.choice('max_depth', range(3, 21, 3)),
    gamma=hp.choice('gamma', [tenths / 10.0 for tenths in range(5)]),
    colsample_bytree=hp.choice('colsample_bytree',
                               [tenths / 10.0 for tenths in range(3, 10)]),
    reg_alpha=hp.choice('reg_alpha', [1e-5, 1e-2, 0.1, 1, 10, 100]),
    reg_lambda=hp.choice('reg_lambda', [1e-5, 1e-2, 0.1, 1, 10, 100]),
)


In [31]:
# Stratified 3-fold cross-validation, shuffled with a fixed seed
kfold = StratifiedKFold(n_splits=3, random_state=0, shuffle=True)


In [34]:
# Objective function
def objective(params):
    """Evaluate one hyperparameter combination for hyperopt.

    Builds an XGBClassifier from ``params``, cross-validates it on the
    standardized training data with the module-level ``kfold`` splitter,
    and returns the negated mean recall as the loss.

    Parameters
    ----------
    params : dict
        Keyword arguments passed to ``XGBClassifier``.

    Returns
    -------
    dict
        ``'loss'`` (negative mean recall), the ``'params'`` that were
        evaluated, and ``'status'`` set to ``STATUS_OK``.
    """
    model = XGBClassifier(seed=0, **params)
    fold_recalls = cross_val_score(estimator=model,
                                   X=X_train_transformed,
                                   y=y_train,
                                   cv=kfold,
                                   scoring='recall',
                                   n_jobs=-1)
    mean_recall = fold_recalls.mean()
    # The loss is the negated score, so a higher recall gives a lower loss
    return {'loss': -mean_recall, 'params': params, 'status': STATUS_OK}


In [35]:
# Optimize
# Keep a reference to the Trials object so the result of every evaluation
# (including the 'params' entry returned by the objective) can be inspected
# after the search.
trials = Trials()
# Seed the search so the sequence of sampled hyperparameters is reproducible.
# NOTE(review): recent hyperopt releases expect a numpy Generator here; older
# releases expect np.random.RandomState(0) instead - confirm against the
# installed version.
best = fmin(fn=objective, space=space, algo=tpe.suggest,
            max_evals=48, trials=trials,
            rstate=np.random.default_rng(0))


100%|██████████| 48/48 [00:04<00:00, 10.46trial/s, best loss: -1.0]


In [36]:
# Print the best trial as returned by fmin. Every dimension of the search
# space is an hp.choice, and the values shown are the indices into each list
# of options, not the option values themselves.
print(best)


{'colsample_bytree': 6, 'gamma': 3, 'learning_rate': 0, 'max_depth': 3, 'reg_alpha': 2, 'reg_lambda': 0}


In [37]:
# Print the values of the best parameters: space_eval maps the indices in
# `best` back to the corresponding entries of the search space
print(space_eval(space, best))


{'colsample_bytree': 0.9, 'gamma': 0.3, 'learning_rate': 0.0001, 'max_depth': 12, 'reg_alpha': 0.1, 'reg_lambda': 1e-05}


Now we can apply the hyperparameters to the XGBClassifier to make predictions

In [38]:
# Resolve the best trial into actual hyperparameter values once, instead of
# re-evaluating the search space for every argument
best_params = space_eval(space, best)

# Train model using the best parameters. The keys of the search space are
# XGBClassifier keyword arguments, so they can be unpacked directly (the
# objective function builds its classifier the same way).
xgboost_bo = XGBClassifier(seed=0, **best_params).fit(X_train_transformed, y_train)

In [39]:
# Class predictions on the test set from the tuned model
bayesian_opt_predict = xgboost_bo.predict(X_test_transformed)
# Predicted probability of class 1 from the same model
bayesian_opt_predict_prob = xgboost_bo.predict_proba(X_test_transformed)[:, 1]
# Per-class precision, recall, F-score and support
precision, recall, fscore, support = score(y_true=y_test, y_pred=bayesian_opt_predict)
# Report the recall for class 1
print(f'The recall value for the xgboost Bayesian optimization is {recall[1]:.4f}')


The recall value for the xgboost Bayesian optimization is 1.0000


  _warn_prf(average, modifier, msg_start, len(result))
