## Example XGBoost implementation using the Iris Data set

In [None]:
## Import necessary packages
import pandas as pd
import numpy as np
import xgboost
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import mean_squared_error

## For plotting
import matplotlib.pyplot as plt
from seaborn import set_style
## This sets the plot style
## to have a grid on a white background
set_style("whitegrid")

## Brief overview of XGBoost 

Inputs 
- `max_depth`: max depth of a tree; increasing this makes the model more complex (typically between 3-10)

- `n_estimators`: Number of trees to fit, higher values = improved performance when paried with lower learning rate (typically 100-1000+)
- `learning_rate`: Shrinks contribution of each tree; lower values means slower model (typically 0.01-0.3)
- `objective`: Specifies learning task; for us binary:logistic means binary classification with logistic regression
- `subsample`: Fraction of training data samples for each tree (prevents overfitting)
- `colsample_bytree`: Fraction of features randomly samples for each tree (prevents overfitting)
- `gamma`: minimum loss reduction required to make a further partition on a leaf node. Higher gamma = more conservative
- `min_child_weight`: Minimum sum of instance weights (hessian) needed in a child; regularization tool
- `early_stopping_rounds`: when given a validation set in the .fit method, stops training early once rmse stops improving after a given number of rounds

In [14]:
#load data
data = load_iris()

#test split
X_train, X_test, y_train, y_test = train_test_split(data['data'], data['target'], test_size=.2)

# test parameter values using optuna (this goes inside of an objective function)
# eg. Try out different values of max_depth from 2 to 10 (e.g., 2, 3, 4, ..., 10)
# max_depth        = trial.suggest_int("max_depth", 3, 10, step=1)
# learning_rate    = trial.suggest_float("learning_rate", 0.01, 0.3, log=True)
# n_estimators     = trial.suggest_int("n_estimators", 100, 1000, step=50)
# subsample        = trial.suggest_float("subsample", 0.5, 1.0)
# colsample_bytree = trial.suggest_float("colsample_bytree", 0.5, 1.0)
# gamma            = trial.suggest_float("gamma", 0.0, 10.0)
# min_child_weight = trial.suggest_int("min_child_weight", 1, 10)

# model = XGBClassifier(max_depth=max_depth,
#                                   learning_rate=learning_rate,
#                                   objective = 'binary:logistic',
#                                   n_estimators=n_estimators,
#                                   subsample=subsample,
#                                   colsample_bytree=colsample_bytree,
#                                   gamma=gamma,
#                                   min_child_weight=min_child_weight)
# create model instance
bst = XGBClassifier(n_estimators=2, max_depth=2, learning_rate=1, objective='binary:logistic')

# fit model
bst.fit(X_train, y_train)

# make predictions
preds = bst.predict(X_test)

#compute error
error = 1-mean_squared_error(y_test,preds)

error

0.9333333333333333

A nice feature of `xgboost`'s model is that it automatically records the performance at each training step on a validation set, provided we give the model the validation set.

In [15]:
## make a data set
np.random.seed(220)
X = np.linspace(-2,2,200)

y = X**2 + np.random.randn(200)   #perturbed quadratic

## Here I will generate a validation set because the data are randomly generated
## in practice you would need to split the data
X_val = np.linspace(-2,2,200)
y_val = X_val**2 + np.random.randn(200)

## make an XGBRegressor object
## n_estimators = 500, max_depth = 1, learning_rate = .1
xgb_reg = xgboost.XGBRegressor(n_estimators=500,
                          max_depth=1,
                          learning_rate=.1)

## fit the model, including an eval_set
xgb_reg.fit(X.reshape(-1,1), y, eval_set=[(X_val.reshape(-1,1), y_val)])

[0]	validation_0-rmse:1.54899
[1]	validation_0-rmse:1.52290
[2]	validation_0-rmse:1.50193
[3]	validation_0-rmse:1.47916
[4]	validation_0-rmse:1.45973
[5]	validation_0-rmse:1.44298
[6]	validation_0-rmse:1.42224
[7]	validation_0-rmse:1.40793
[8]	validation_0-rmse:1.38755
[9]	validation_0-rmse:1.37459
[10]	validation_0-rmse:1.35682
[11]	validation_0-rmse:1.34557
[12]	validation_0-rmse:1.33130
[13]	validation_0-rmse:1.32148
[14]	validation_0-rmse:1.30755
[15]	validation_0-rmse:1.29891
[16]	validation_0-rmse:1.28486
[17]	validation_0-rmse:1.27751
[18]	validation_0-rmse:1.26761
[19]	validation_0-rmse:1.26103
[20]	validation_0-rmse:1.24782
[21]	validation_0-rmse:1.24212
[22]	validation_0-rmse:1.23019
[23]	validation_0-rmse:1.22604
[24]	validation_0-rmse:1.21845
[25]	validation_0-rmse:1.21415
[26]	validation_0-rmse:1.20378
[27]	validation_0-rmse:1.19984
[28]	validation_0-rmse:1.19375
[29]	validation_0-rmse:1.18998
[30]	validation_0-rmse:1.18107
[31]	validation_0-rmse:1.17800
[32]	validation_0-

In [16]:
## demonstrate .evals_result()
xgb_reg.evals_result()

{'validation_0': OrderedDict([('rmse',
               [1.548992973005192,
                1.522898460043541,
                1.5019322572941625,
                1.4791558934893079,
                1.459727081610891,
                1.4429755160429023,
                1.422241926442193,
                1.4079251994662687,
                1.387546516325034,
                1.3745858729659668,
                1.3568164708290134,
                1.3455746704935363,
                1.331297495073885,
                1.3214799250800593,
                1.307554816763263,
                1.2989090118283606,
                1.2848600190832,
                1.2775103476131882,
                1.267609503957405,
                1.2610326527471278,
                1.2478243476133408,
                1.2421230021266132,
                1.2301866017071799,
                1.226041235796189,
                1.2184516736152855,
                1.2141506219616427,
                1.203777134355357,
  

In [17]:
## get the 'rmse'
xgb_reg.evals_result()['validation_0']['rmse']

[1.548992973005192,
 1.522898460043541,
 1.5019322572941625,
 1.4791558934893079,
 1.459727081610891,
 1.4429755160429023,
 1.422241926442193,
 1.4079251994662687,
 1.387546516325034,
 1.3745858729659668,
 1.3568164708290134,
 1.3455746704935363,
 1.331297495073885,
 1.3214799250800593,
 1.307554816763263,
 1.2989090118283606,
 1.2848600190832,
 1.2775103476131882,
 1.267609503957405,
 1.2610326527471278,
 1.2478243476133408,
 1.2421230021266132,
 1.2301866017071799,
 1.226041235796189,
 1.2184516736152855,
 1.2141506219616427,
 1.203777134355357,
 1.1998376997409927,
 1.1937490626417429,
 1.1899789685735782,
 1.181068435994735,
 1.1779978675417935,
 1.1717220205218764,
 1.1686770265656818,
 1.1639865204327748,
 1.1614914391530609,
 1.1558727688413208,
 1.1533699354275393,
 1.1476304026128281,
 1.146934120717663,
 1.141865057658279,
 1.1413456838955172,
 1.136666607985323,
 1.1349010839708962,
 1.1317052199500124,
 1.1313765625141952,
 1.1272020820922395,
 1.1270087591338531,
 1.123148

### SMOTE - Synthetic Minority Over-sampling Technique
Gives a bunch of extra of fake data made by linear interpolation which can beef up classes; reasonable if classes live in convex classes; depends on what the data looks like

Make sure synthetic data does not end up in training or test to stop data leakage.