# XGBoost

In [1]:
import xgboost as xgb

from sklearn.model_selection import train_test_split, RandomizedSearchCV

from sklearn.pipeline import Pipeline

from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import classification_report, accuracy_score, r2_score

import random

import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

In [2]:
min_range = 0
max_range = 500

In [3]:
rand_int_min = -50
rand_int_max = 50

## XGBoost Classifier

In [4]:
# 3 columns
x = [[r+random.randint(rand_int_min, rand_int_max), r+random.randint(rand_int_min, rand_int_max), r+random.randint(rand_int_min, rand_int_max)] for r in range(min_range,max_range)]
x

[[-17, -2, 36],
 [-40, 51, -16],
 [-21, 50, 10],
 [46, -31, -5],
 [40, -21, -13],
 [-4, -8, 48],
 [-13, 21, 12],
 [-16, 23, 21],
 [-10, 15, 24],
 [9, 16, 52],
 [32, 24, -31],
 [51, -5, 6],
 [-12, 7, -10],
 [53, -3, -25],
 [-36, 17, -5],
 [62, 5, -25],
 [-12, 31, 1],
 [60, 8, 19],
 [53, 9, 31],
 [-12, 66, -5],
 [60, 4, 41],
 [-1, 8, 50],
 [34, 5, -25],
 [-17, -6, 6],
 [27, -21, 10],
 [29, 32, 21],
 [1, 11, 14],
 [-16, 50, 62],
 [56, 33, 73],
 [55, 42, 78],
 [70, 51, -19],
 [74, 73, 0],
 [72, 13, -2],
 [0, 57, -12],
 [-11, 74, 35],
 [62, 36, 23],
 [-8, -12, 2],
 [24, 86, 19],
 [82, 62, 14],
 [52, 83, 88],
 [1, 57, 10],
 [17, 31, 0],
 [-6, 2, 41],
 [21, 55, 91],
 [45, 32, 9],
 [81, 52, 56],
 [-4, 82, 15],
 [15, 12, 20],
 [-2, 12, 88],
 [32, 35, 92],
 [44, 63, 18],
 [7, 94, 37],
 [36, 84, 4],
 [100, 65, 85],
 [57, 16, 28],
 [19, 44, 67],
 [36, 16, 105],
 [73, 85, 33],
 [87, 48, 91],
 [30, 16, 36],
 [39, 90, 101],
 [55, 101, 92],
 [39, 36, 57],
 [73, 89, 62],
 [96, 68, 88],
 [40, 78, 101],


In [5]:
y = [random.randint(0, 1) for r in range(min_range,max_range)]
y

[1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,


In [6]:
x_train_clf, x_test_clf, y_train_clf, y_test_clf = train_test_split(x, y, train_size=.7, test_size=.3)

In [7]:
xgb_clf = xgb.XGBClassifier()

In [8]:
xgb_clf.fit(x_train_clf, y_train_clf)

In [9]:
xgb_pred_class = xgb_clf.predict(x_test_clf)

In [10]:
mean_squared_error(y_test_clf, xgb_pred_class)

0.56

In [11]:
mean_absolute_error(y_test_clf, xgb_pred_class)

0.56

In [12]:
accuracy_score(y_test_clf, xgb_pred_class)

0.44

In [13]:
cr = classification_report(y_test_clf, xgb_pred_class)
print(cr)

              precision    recall  f1-score   support

           0       0.40      0.54      0.46        67
           1       0.49      0.36      0.42        83

    accuracy                           0.44       150
   macro avg       0.45      0.45      0.44       150
weighted avg       0.45      0.44      0.44       150



## XGBoost Regressor

In [14]:
# 3 columns
x = [[r+random.randint(rand_int_min, rand_int_max), r+random.randint(rand_int_min, rand_int_max), r+random.randint(rand_int_min, rand_int_max)] for r in range(min_range,max_range)]
x

[[-44, 23, 36],
 [-43, -45, -7],
 [-22, 47, 13],
 [10, 13, 0],
 [7, -12, -20],
 [51, -31, 4],
 [23, -33, -34],
 [5, 4, 57],
 [28, 31, 57],
 [52, -12, -41],
 [17, -26, 19],
 [-34, -16, 2],
 [25, -4, -31],
 [-6, 52, -13],
 [40, 3, -33],
 [25, 34, 21],
 [50, 44, 37],
 [13, 33, 45],
 [30, 45, -18],
 [45, -20, 24],
 [22, 6, -24],
 [20, 58, 38],
 [-13, 68, -15],
 [12, -25, 25],
 [0, 58, 10],
 [38, 31, 16],
 [-23, 43, -24],
 [67, 34, 32],
 [14, 72, 55],
 [13, 46, 71],
 [21, 67, 60],
 [42, 44, 59],
 [16, 68, 70],
 [21, 14, 56],
 [64, -7, 32],
 [-6, 3, 48],
 [-4, 60, 4],
 [87, 42, 37],
 [45, -4, 54],
 [8, 82, 49],
 [19, 83, 5],
 [-1, 9, 57],
 [33, 61, 46],
 [87, 39, 51],
 [33, 1, 42],
 [18, 91, 92],
 [39, 76, -4],
 [82, 12, 7],
 [95, 98, 91],
 [45, 95, 91],
 [19, 8, 96],
 [48, 26, 67],
 [52, 99, 53],
 [70, 102, 66],
 [88, 95, 36],
 [93, 76, 43],
 [41, 11, 77],
 [95, 17, 28],
 [85, 28, 105],
 [22, 24, 96],
 [52, 55, 35],
 [38, 69, 90],
 [91, 102, 75],
 [107, 68, 44],
 [57, 91, 45],
 [26, 102, 83

In [15]:
y = [r+random.randint(rand_int_min, rand_int_max) for r in range(min_range,max_range)]
y

[-8,
 5,
 -44,
 50,
 -24,
 -32,
 -33,
 -14,
 37,
 44,
 -35,
 53,
 -31,
 23,
 -21,
 -9,
 -18,
 -29,
 66,
 -6,
 5,
 -23,
 44,
 -7,
 20,
 31,
 34,
 72,
 -8,
 14,
 -1,
 -9,
 70,
 24,
 23,
 7,
 29,
 48,
 46,
 89,
 12,
 10,
 60,
 68,
 -2,
 87,
 54,
 53,
 10,
 33,
 27,
 68,
 3,
 64,
 15,
 28,
 31,
 52,
 91,
 93,
 68,
 18,
 71,
 74,
 94,
 51,
 58,
 35,
 20,
 82,
 62,
 39,
 108,
 29,
 95,
 114,
 50,
 64,
 47,
 48,
 109,
 60,
 108,
 45,
 90,
 113,
 134,
 97,
 107,
 69,
 59,
 88,
 133,
 138,
 128,
 142,
 58,
 120,
 48,
 57,
 82,
 62,
 93,
 63,
 115,
 57,
 102,
 64,
 126,
 133,
 147,
 138,
 128,
 151,
 151,
 120,
 108,
 151,
 82,
 83,
 104,
 75,
 100,
 119,
 112,
 92,
 139,
 156,
 132,
 101,
 129,
 134,
 134,
 101,
 178,
 173,
 154,
 136,
 89,
 100,
 185,
 158,
 99,
 145,
 158,
 124,
 158,
 101,
 104,
 158,
 169,
 134,
 132,
 103,
 135,
 111,
 124,
 112,
 163,
 123,
 165,
 170,
 131,
 114,
 188,
 214,
 207,
 141,
 200,
 148,
 204,
 137,
 188,
 198,
 221,
 188,
 135,
 160,
 175,
 206,
 158,
 219,
 

In [16]:
x_train_reg, x_test_reg, y_train_reg, y_test_reg = train_test_split(x, y, train_size=.7, test_size=.3)

In [17]:
xgb_reg = xgb.XGBRegressor()

In [18]:
xgb_reg.fit(x_train_reg, y_train_reg)

In [19]:
xgb_pred_reg = xgb_reg.predict(x_test_reg)

In [20]:
mean_squared_error(y_test_reg, xgb_pred_reg)

1831.3426435715473

In [21]:
mean_absolute_error(y_test_reg, xgb_pred_reg)

35.468424733479814

In [22]:
r2_score(y_test_reg, xgb_pred_reg)

0.9089201734319076

## XGBoost Pipeline

### XGBoost Classification Pipeline

In [23]:
xgb_clf_pipeline = Pipeline([
    ('xgb', xgb.XGBClassifier())
])

In [24]:
xgb_clf_pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'xgb', 'xgb__objective', 'xgb__base_score', 'xgb__booster', 'xgb__callbacks', 'xgb__colsample_bylevel', 'xgb__colsample_bynode', 'xgb__colsample_bytree', 'xgb__device', 'xgb__early_stopping_rounds', 'xgb__enable_categorical', 'xgb__eval_metric', 'xgb__feature_types', 'xgb__gamma', 'xgb__grow_policy', 'xgb__importance_type', 'xgb__interaction_constraints', 'xgb__learning_rate', 'xgb__max_bin', 'xgb__max_cat_threshold', 'xgb__max_cat_to_onehot', 'xgb__max_delta_step', 'xgb__max_depth', 'xgb__max_leaves', 'xgb__min_child_weight', 'xgb__missing', 'xgb__monotone_constraints', 'xgb__multi_strategy', 'xgb__n_estimators', 'xgb__n_jobs', 'xgb__num_parallel_tree', 'xgb__random_state', 'xgb__reg_alpha', 'xgb__reg_lambda', 'xgb__sampling_method', 'xgb__scale_pos_weight', 'xgb__subsample', 'xgb__tree_method', 'xgb__validate_parameters', 'xgb__verbosity'])

In [25]:
xgb_clf_param_grid = {
    'xgb__booster': ['gbtree', 'gblinear', 'dart'],
    'xgb__eval_metric': ['rmse', 'mae', 'mape', 'logloss', 'mlogloss', 'auc'],
    'xgb__learning_rate': [.3, .5, .7],
    'xgb__objective': ['reg:squarederror', 'reg:squaredlogerror', 'reg:logistic']
}

In [26]:
xgb_clf_random_search = RandomizedSearchCV(xgb_clf_pipeline, xgb_clf_param_grid)

In [27]:
xgb_clf_random_search.fit(x_train_clf, y_train_clf)

In [28]:
xgb_clf_rand_search_pred = xgb_clf_random_search.predict(x_test_clf)
xgb_clf_rand_search_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1])

In [29]:
xgb_clf_random_search.best_params_

{'xgb__objective': 'reg:squarederror',
 'xgb__learning_rate': 0.7,
 'xgb__eval_metric': 'auc',
 'xgb__booster': 'gblinear'}

In [30]:
mean_squared_error(y_test_clf, xgb_clf_rand_search_pred)

0.54

In [31]:
mean_absolute_error(y_test_clf, xgb_clf_rand_search_pred)

0.54

In [32]:
accuracy_score(y_test_clf, xgb_clf_rand_search_pred)

0.46

In [33]:
cr = classification_report(y_test_clf, xgb_clf_rand_search_pred)
print(cr)

              precision    recall  f1-score   support

           0       0.45      0.99      0.62        67
           1       0.75      0.04      0.07        83

    accuracy                           0.46       150
   macro avg       0.60      0.51      0.34       150
weighted avg       0.62      0.46      0.31       150



### XGBoost Regression Pipeline

In [34]:
# Regressor
xgb_reg_pipeline = Pipeline([
    ('xgb', xgb.XGBRegressor())
])

In [35]:
xgb_reg_pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'xgb', 'xgb__objective', 'xgb__base_score', 'xgb__booster', 'xgb__callbacks', 'xgb__colsample_bylevel', 'xgb__colsample_bynode', 'xgb__colsample_bytree', 'xgb__device', 'xgb__early_stopping_rounds', 'xgb__enable_categorical', 'xgb__eval_metric', 'xgb__feature_types', 'xgb__gamma', 'xgb__grow_policy', 'xgb__importance_type', 'xgb__interaction_constraints', 'xgb__learning_rate', 'xgb__max_bin', 'xgb__max_cat_threshold', 'xgb__max_cat_to_onehot', 'xgb__max_delta_step', 'xgb__max_depth', 'xgb__max_leaves', 'xgb__min_child_weight', 'xgb__missing', 'xgb__monotone_constraints', 'xgb__multi_strategy', 'xgb__n_estimators', 'xgb__n_jobs', 'xgb__num_parallel_tree', 'xgb__random_state', 'xgb__reg_alpha', 'xgb__reg_lambda', 'xgb__sampling_method', 'xgb__scale_pos_weight', 'xgb__subsample', 'xgb__tree_method', 'xgb__validate_parameters', 'xgb__verbosity'])

In [36]:
xgb_reg_param_grid = {
    'xgb__booster': ['gbtree', 'gblinear', 'dart'],
    'xgb__eval_metric': ['rmse', 'mae', 'mape', 'logloss', 'mlogloss', 'auc'],
    'xgb__learning_rate': [.3, .5, .7],
    'xgb__objective': ['reg:squarederror']
}

In [37]:
xgb_reg_random_search = RandomizedSearchCV(xgb_reg_pipeline, xgb_reg_param_grid)

In [38]:
xgb_reg_random_search.fit(x_train_reg, y_train_reg)

In [39]:
xgb_reg_rand_search_pred = xgb_reg_random_search.predict(x_test_clf)
xgb_reg_rand_search_pred

array([169.71283  , 397.48013  , 288.9757   , 285.88507  , 444.74982  ,
        64.83197  , 479.73395  , 170.38171  , 434.52682  ,  39.446503 ,
       148.05966  , 451.01434  , 242.95358  , 399.82065  ,   7.5192223,
        -6.9212036, 323.6119   , 186.42505  , 385.8202   , 352.05386  ,
       261.07715  , 279.03326  ,  11.642881 , 185.64493  , 194.9308   ,
         7.2727485,  74.30623  ,  85.94693  , 423.133    , 418.16324  ,
       311.42847  , 500.97958  , 142.86317  , 327.4142   , 184.00941  ,
       433.59558  , 204.3021   , 451.34195  , 333.01822  , 241.25764  ,
        84.2757   , 354.1009   , 338.0478   , 243.06029  , 351.43536  ,
       272.21115  , 372.1661   , 374.65546  , 418.05725  ,  27.11039  ,
        37.28652  ,  77.95873  , 401.15808  ,  10.761579 , 253.73666  ,
       138.50272  , 487.47345  , 128.80862  , 236.35721  ,  98.08833  ,
        63.926994 , 162.29413  ,  30.943253 , 327.17612  , 125.333664 ,
       285.69916  , 277.7818   , 215.29945  , 319.6936   , 208.9

In [40]:
xgb_reg_random_search.best_params_

{'xgb__objective': 'reg:squarederror',
 'xgb__learning_rate': 0.3,
 'xgb__eval_metric': 'mape',
 'xgb__booster': 'gblinear'}

In [41]:
mean_squared_error(y_test_reg, xgb_reg_rand_search_pred)

38806.031404964466

In [42]:
mean_absolute_error(y_test_reg, xgb_reg_rand_search_pred)

159.96138086636861

In [43]:
r2_score(y_test_reg, xgb_reg_rand_search_pred)

-0.9299755960834899