# **Gradient Boosting Regressor**


In [142]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [143]:
from sklearn import datasets
from sklearn.ensemble import GradientBoostingRegressor

In [144]:
diabetes = datasets.load_diabetes(as_frame=True)

In [145]:
diabetes

{'data':           age       sex       bmi        bp        s1        s2        s3  \
 0    0.038076  0.050680  0.061696  0.021872 -0.044223 -0.034821 -0.043401   
 1   -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163  0.074412   
 2    0.085299  0.050680  0.044451 -0.005670 -0.045599 -0.034194 -0.032356   
 3   -0.089063 -0.044642 -0.011595 -0.036656  0.012191  0.024991 -0.036038   
 4    0.005383 -0.044642 -0.036385  0.021872  0.003935  0.015596  0.008142   
 ..        ...       ...       ...       ...       ...       ...       ...   
 437  0.041708  0.050680  0.019662  0.059744 -0.005697 -0.002566 -0.028674   
 438 -0.005515  0.050680 -0.015906 -0.067642  0.049341  0.079165 -0.028674   
 439  0.041708  0.050680 -0.015906  0.017293 -0.037344 -0.013840 -0.024993   
 440 -0.045472 -0.044642  0.039062  0.001215  0.016318  0.015283 -0.028674   
 441 -0.045472 -0.044642 -0.073030 -0.081413  0.083740  0.027809  0.173816   
 
            s4        s5        s6  
 0   -0.002592  0

In [146]:
diabetes.target

0      151.0
1       75.0
2      141.0
3      206.0
4      135.0
       ...  
437    178.0
438    104.0
439    132.0
440    220.0
441     57.0
Name: target, Length: 442, dtype: float64

In [147]:
X = diabetes.data
y = diabetes.target

In [148]:
X_train, X_test, y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [149]:
gbr = GradientBoostingRegressor()
gbr.fit(X_train, y_train)

In [150]:
y_pred = gbr.predict(X_test)

In [151]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, accuracy_score

print("GBR Score:", gbr.score(X_test, y_test))
print("GBR Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("GBR R2 Score:", r2_score(y_test, y_pred))
print("GBR Mean Absolute Error:", mean_absolute_error(y_test, y_pred))
print("GBR Accuracy Score:", accuracy_score(y_test, y_pred.round()))
print("GBR Coefficients:", gbr.feature_importances_)

GBR Score: 0.44962563125287147
GBR Mean Squared Error: 2915.9669241163197
GBR R2 Score: 0.44962563125287147
GBR Mean Absolute Error: 44.59551313508417
GBR Accuracy Score: 0.0
GBR Coefficients: [0.04913972 0.01248971 0.3934786  0.08295087 0.03862403 0.06063496
 0.03685171 0.02852313 0.24964032 0.04766697]


In [152]:
# Hyperparameter tuning
# min_samples_split: Minimum number of samples required to split an internal node.
# min_samples_leaf: Minimum number of samples required to be at a leaf node.
# n_estimators: Number of boosting stages to be run.
# learning_rate: Step size shrinkage used in update to prevent overfitting.
# max_depth: Maximum depth of the individual regression estimators.
# criterion: The function to measure the quality of a split.
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.2, 0.1, 0.01],
    'max_depth': [2, 4, 6, 8],
    'min_samples_split': [2, 4, 6],
    'min_samples_leaf': [1, 2, 4]
}

In [153]:
from sklearn.model_selection import GridSearchCV
# Grid search for hyperparameter tuning
# GridSearchCV performs an exhaustive search over specified parameter values for an estimator.

grid_search = GridSearchCV(estimator=gbr, param_grid=param_grid, cv=4, verbose=2, n_jobs=4, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

Fitting 4 folds for each of 324 candidates, totalling 1296 fits


In [154]:
grid_search.best_params_

{'learning_rate': 0.1,
 'max_depth': 2,
 'min_samples_leaf': 1,
 'min_samples_split': 6,
 'n_estimators': 100}

In [158]:
gbr_2 = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=2, min_samples_split=6, min_samples_leaf=1)
gbr_2.fit(X_train, y_train)

In [159]:
y_pred_2 = gbr_2.predict(X_test)
print(y_pred_2)

[163.78068724 183.96457365 167.44643978 292.18383608 111.06032423
 113.17612319 264.61664405 186.65837874 138.57863655 147.32453351
 102.92481204 190.73862295  98.32913937 230.8789565  117.55275149
 110.1601684  221.51320216 275.12953506 191.77363329 233.08451283
 194.91651369  81.74602667  68.98257139 198.83262605 154.43734287
 192.38156832 198.59932812 178.65716486  75.4595113  105.30247891
 170.23595251 122.44911095 146.68202523 186.98850279 148.78597901
 212.59684176 123.29793671 123.62993479 175.98927557  69.10997297
  65.46018434 102.2689789  178.7084644  185.20619102 184.62681287
  76.10535568  87.33197858 108.71156019  81.59366117 163.08942113
 138.72933833  95.77430563 144.50794632 105.18868704 188.9774076
 144.3404191   98.88672544 222.48784775  90.40651599  89.07725598
 176.52032683 194.24169906 123.09351248 104.63365645 136.35287933
 219.62749809 167.83014447 186.21893847 141.73289661 132.00514042
 161.7598642  195.28117974 222.71973233 105.77705405  84.95516159
 179.750938

In [160]:
print("GBR-2 Score:", grid_search.score(X_test, y_test))
print("GBR-2 Mean Squared Error:", mean_squared_error(y_test, y_pred_2))
print("GBR-2 R2 Score:", r2_score(y_test, y_pred_2))
print("GBR-2 Mean Absolute Error:", mean_absolute_error(y_test, y_pred_2))
print("GBR-2 Accuracy Score:", accuracy_score(y_test, y_pred_2.round()))

GBR-2 Score: -2782.994225946642
GBR-2 Mean Squared Error: 2798.5874922967464
GBR-2 R2 Score: 0.4717804198265354
GBR-2 Mean Absolute Error: 42.66214968975878
GBR-2 Accuracy Score: 0.011235955056179775
