Notebook containing a Gradient Boosting Regressor (GBR) model

In [17]:
# train default GBR model on dataset and get accuracy and error metrics

%store -r X y
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


# Create the default GBR pipeline with standardised scaling.
# Only random_state is set on the regressor, so that repeated runs give the
# same scores; every other hyperparameter keeps its library default.
default_gbr = Pipeline([
    ('scaler', StandardScaler()),
    ('default_gbr', GradientBoostingRegressor(random_state=42))
])

Use GridSearchCV to find the optimal values for the 'n_estimators' and 'learning_rate' hyperparameters

In [18]:
from sklearn.model_selection import KFold, cross_validate
import numpy as np

# Scorers collected on every fold. The two error scorers are the "neg_*"
# variants, so their values are negated below to recover positive errors.
scoring = dict(
    neg_mse="neg_mean_squared_error",
    neg_mae="neg_mean_absolute_error",
    r2="r2",
)

# Shuffled 10-fold splitter with a fixed seed so the folds are repeatable.
ten_fold_cv = KFold(n_splits=10, shuffle=True, random_state=42)

# Cross-validate the default pipeline, keeping train scores alongside the
# test scores so the two can be compared.
default_cv = cross_validate(
    default_gbr, X, y,
    scoring=scoring,
    cv=ten_fold_cv,
    return_train_score=True,
)

print(default_cv.keys())

# Evaluate the default GBR model's performance: mean of each metric over the
# 10 folds. RMSE is computed per fold first and then averaged.
train_mse = -default_cv["train_neg_mse"]
train_rmse = np.mean(np.sqrt(train_mse))
train_mae = -np.mean(default_cv["train_neg_mae"])
train_r2 = np.mean(default_cv["train_r2"])

test_mse = -default_cv["test_neg_mse"]
test_rmse = np.mean(np.sqrt(test_mse))
test_mae = -np.mean(default_cv["test_neg_mae"])
test_r2 = np.mean(default_cv["test_r2"])

print("Default GBR Model Performance over 10-Fold CV:\n")
print(
    f"Train RMSE: {train_rmse:.4f}",
    f"Train MAE:  {train_mae:.4f}",
    f"Train R2:   {train_r2:.4f}\n",
    sep="\n",
)
print(
    f"Test RMSE:  {test_rmse:.4f}",
    f"Test MAE:   {test_mae:.4f}",
    f"Test R2:    {test_r2:.4f}",
    sep="\n",
)

dict_keys(['fit_time', 'score_time', 'test_neg_mse', 'train_neg_mse', 'test_neg_mae', 'train_neg_mae', 'test_r2', 'train_r2'])
Default GBR Model Performance over 10-Fold CV:

Train RMSE: 16.9457
Train MAE:  12.6222
Train R2:   0.9653

Test RMSE:  27.5452
Test MAE:   20.0686
Test R2:    0.9017


In [19]:
from sklearn.model_selection import GridSearchCV

# Exhaustive grid search over 'n_estimators' and 'learning_rate' for the GBR
# step of a scaler + regressor pipeline.
grid_gbr_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    # Fixed seed so that re-running the search selects the same candidate.
    ('tuned_gbr', GradientBoostingRegressor(random_state=42))
])

# Keys use the '<step name>__<parameter>' form so they reach the regressor
# inside the pipeline.
param_grid = {
    'tuned_gbr__n_estimators': [100, 200, 300],
    'tuned_gbr__learning_rate': [0.01, 0.1, .25, 0.5, 0.75]
}

# An integer cv=10 splits the rows into unshuffled folds. Use a shuffled,
# seeded splitter with the same settings as the evaluation cells instead, so
# hyperparameters are selected under the same protocol they are scored with.
grid_cv = KFold(n_splits=10, shuffle=True, random_state=42)

gridsearch_gbr = GridSearchCV(
    grid_gbr_pipeline,
    param_grid,
    cv=grid_cv,
    scoring='neg_mean_squared_error',
)
gridsearch_gbr.fit(X, y)

best_params = gridsearch_gbr.best_params_
print(f"Best hyperparameters: {best_params}")

Best hyperparameters: {'tuned_gbr__learning_rate': 0.1, 'tuned_gbr__n_estimators': 300}


In [20]:

# Rebuild the pipeline with the hyperparameters selected by the grid search
# and score it with the same 10-fold setup used for the default model.
# NOTE(review): X, y were also used to select these hyperparameters, so the
# test scores below are likely optimistic; nested CV would avoid that.

# Take the winning values from the fitted search instead of hand-copying
# them, so this cell cannot drift from the search result. The grid keys carry
# the search pipeline's step prefix, which is stripped to get plain
# GradientBoostingRegressor keyword arguments.
tuned_prefix = 'tuned_gbr__'
tuned_gbr_params = {
    name[len(tuned_prefix):]: value
    for name, value in gridsearch_gbr.best_params_.items()
    if name.startswith(tuned_prefix)
}

improved_gbr = Pipeline([
    ('scaler', StandardScaler()),
    # Fixed seed for repeatable scores; tuned values override it only if the
    # grid itself ever searches over random_state.
    ('improved_gbr', GradientBoostingRegressor(**{'random_state': 42, **tuned_gbr_params}))
])

# Same shuffled, seeded 10-fold splitter as the default-model evaluation.
ten_fold_cv = KFold(n_splits=10, shuffle=True, random_state=42)

improved_cv = cross_validate(
    improved_gbr,
    X, y,
    cv=ten_fold_cv,
    scoring=scoring,
    return_train_score=True
)

# Mean of each metric over the 10 folds; the "neg_*" scores are negated to
# recover positive errors, and RMSE is computed per fold before averaging.
improved_train_mse = -improved_cv["train_neg_mse"]
improved_train_rmse = np.sqrt(improved_train_mse).mean()
improved_train_mae = -improved_cv["train_neg_mae"].mean()
improved_train_r2 = improved_cv["train_r2"].mean()

improved_test_mse = -improved_cv["test_neg_mse"]
improved_test_rmse = np.sqrt(improved_test_mse).mean()
improved_test_mae = -improved_cv["test_neg_mae"].mean()
improved_test_r2 = improved_cv["test_r2"].mean()

print(f"Improved GBR Model Performance over 10-Fold CV:\n")
print(f"Train RMSE: {improved_train_rmse:.4f}")
print(f"Train MAE:  {improved_train_mae:.4f}")
print(f"Train R2:   {improved_train_r2:.4f}\n")
print(f"Test RMSE:  {improved_test_rmse:.4f}")
print(f"Test MAE:   {improved_test_mae:.4f}")
print(f"Test R2:    {improved_test_r2:.4f}")


Improved GBR Model Performance over 10-Fold CV:

Train RMSE: 8.9433
Train MAE:  6.5464
Train R2:   0.9903

Test RMSE:  25.2479
Test MAE:   17.9178
Test R2:    0.9178
