# Extreme Gradient Boosting (XGBoost)


### Initialization

In [6]:
from skopt import BayesSearchCV
from xgboost import XGBRegressor
from utility import start, train, display
import numpy as np
# Load data and split into train and test sets (df is not used in this cell)
X_train, X_test, y_train, y_test, df = start("OneHotEncodedData2.csv")

# Fixed seed for the Bayesian optimizer so that re-running this cell proposes
# the same sequence of candidate settings and the reported optimum is repeatable.
RANDOM_STATE = 42

# Parameter search space, given as (low, high) bounds per hyperparameter.
# skopt infers the dimension type from the bounds: integer bounds produce an
# integer dimension, float bounds a real-valued one.
param_space = {
    'n_estimators': (2800, 3400),
    'max_depth': (9, 11),
    'learning_rate': (0.01, 0.2),
    'gamma': (0.0, 0.5),
    'subsample': (0.5, 1.0),
    'colsample_bytree': (0.5, 1.0),
    'reg_alpha': (0.0, 1.0),  # Regularization parameter alpha
    'reg_lambda': (0.0, 1.0),  # Regularization parameter lambda
    'min_child_weight': (1, 10),  # Minimum sum of instance weight (Hessian) needed in a child
}

# Base estimator; every hyperparameter listed above is overridden by the search
xgb = XGBRegressor()

# Configure the Bayesian optimization search (no callback is used)
bayes_search = BayesSearchCV(
    xgb,
    param_space,
    n_iter=50,  # Number of parameter settings that are sampled
    cv=3,       # Cross-validation folds
    # scikit-learn scorers are maximized, so MAPE is exposed in negated form
    scoring='neg_mean_absolute_percentage_error',
    n_jobs=-1,
    random_state=RANDOM_STATE,  # Makes the sampled candidates reproducible
)

# Perform the Bayesian optimization search
bayes_search.fit(X_train, y_train)

# Display the best hyperparameters and MAPE.
# best_score_ holds the negated MAPE, so flip the sign back; the value is a
# fraction (e.g. 0.03 means 3 %), not a percentage.
print("Best hyperparameters found:", bayes_search.best_params_)
print("Best MAPE found:", -bayes_search.best_score_)


Best hyperparameters found: OrderedDict([('colsample_bytree', 0.8828571520137358), ('gamma', 0.001882060327464297), ('learning_rate', 0.018959255879334863), ('max_depth', 10), ('min_child_weight', 1), ('n_estimators', 3392), ('reg_alpha', 0.5866618490815292), ('reg_lambda', 0.2494616485401136), ('subsample', 0.8897907708048053)])
Best MAPE found: 0.02974146647749794


In [2]:
from xgboost import XGBRegressor
from utility import start, train, display
from collections import OrderedDict

# Reload the train/test split so this cell does not depend on the search cell
X_train, X_test, y_train, y_test, df = start("OneHotEncodedData2.csv")

# Best hyperparameters, copied from the output of the Bayesian search cell
best_params = {
    "n_estimators": 3392,
    "max_depth": 10,
    "learning_rate": 0.018959255879334863,
    "gamma": 0.001882060327464297,
    "subsample": 0.8897907708048053,
    "colsample_bytree": 0.8828571520137358,
    "reg_alpha": 0.5866618490815292,
    "reg_lambda": 0.2494616485401136,
    "min_child_weight": 1,
}
model = XGBRegressor(**best_params)

# Fit the tuned model and collect predictions; the train/test scores shown
# under this cell are presumably printed inside train().
# NOTE(review): train() also returns "*_original" values, which suggests the
# target may be transformed. If so, the cross-validated MAPE from the search
# is on a different scale than the test MAPE reported here - confirm in utility.
y_pred, y_pred_original, y_test_original = train(
    model, X_train, X_test, y_train, y_test
)


Training Set Scores:
Mean Absolute Error (MAE): 0.61 M
Mean Absolute Percentage Error (MAPE): 15.13 %
R-squared (R^2): 0.97
Mean Squared Error (MSE): 2836139.47 M

Test Set Scores:
Mean Absolute Error (MAE): 1.61 M
Mean Absolute Percentage Error (MAPE): 45.78 %
R-squared (R^2): 0.83
Mean Squared Error (MSE): 18799472.96 M
