### Contents
1. read data
2. identify response variable and predictors
3. split into training and testing datasets
4. define the model
5. choose hyperparameters to tune
6. identify the best hyperparameters using gridsearch
7. make predictions based on these hyperparameters
8. assess model performance

In [31]:
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV

In [32]:
base_dir = os.path.dirname(os.path.abspath("..\data"))
path = os.path.join(base_dir, r"data")
os.chdir(path)

In [33]:
data = pd.read_csv("data.csv")

In [34]:
# drop date columns
data = data.drop(columns=["DATE","MONTHDATE"], errors="ignore")
data.head()

Unnamed: 0,YEAR,MONTH,DAY,TOTALDEMAND,MAX,ACCMAX,RAIN,RAINPERIOD,SOLAR,RRP,FORECASTDEMAND,OUTPUT,QUANTITY,QUANTITYMONTHCUM,QUANTITYTOTALCUM
0,2010,1,1,7793.463681,29.6,1.0,0.2,1.0,14.6,20.364894,7747.906211,1.53,1.0,1.0,20.0
1,2010,1,1,7793.463681,29.6,1.0,0.2,1.0,14.6,20.364894,7747.906211,1.53,1.0,2.0,21.0
2,2010,1,2,8012.314097,29.5,1.0,0.0,1.0,18.4,20.478125,7773.818583,1.53,1.0,1.0,20.0
3,2010,1,2,8012.314097,29.5,1.0,0.0,1.0,18.4,20.478125,7773.818583,1.53,1.0,2.0,21.0
4,2010,1,3,7393.354514,21.0,1.0,15.2,1.0,7.3,20.277083,7462.384786,1.53,1.0,1.0,20.0


In [36]:
# identify response variable and predictors
X = data.drop(['TOTALDEMAND'], axis=1)
y = data['TOTALDEMAND']

In [37]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [38]:
# define model
model = GradientBoostingRegressor()

In [39]:
# hyperparameters
params = {
    'n_estimators': [100, 500, 1000],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.5]
}

In [40]:
# identify best hyperparameters using gridsearch
grid = GridSearchCV(model, params, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=GradientBoostingRegressor(alpha=0.9,
                                                 criterion='friedman_mse',
                                                 init=None, learning_rate=0.1,
                                                 loss='ls', max_depth=3,
                                                 max_features=None,
                                                 max_leaf_nodes=None,
                                                 min_impurity_decrease=0.0,
                                                 min_impurity_split=None,
                                                 min_samples_leaf=1,
                                                 min_samples_split=2,
                                                 min_weight_fraction_leaf=0.0,
                                                 n_estimators=100,
                                                 n_iter_no_change=None,
                

In [41]:
# best hyperparameters
print(grid.best_params_)

{'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 1000}


In [42]:
# train model using best hyperparameters
best_model = grid.best_estimator_
best_model.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=7,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=1000,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [43]:
# predictions on test data
y_pred = best_model.predict(X_test)

In [44]:
# print model performance
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error: {:.2f}".format(mae))
print("Mean Squared Error: {:.2f}".format(mse))
print("Root Mean Squared Error: {:.2f}".format(rmse))
print("R-squared: {:.2f}".format(r2))

Mean Absolute Error: 37.15
Mean Squared Error: 6761.75
Root Mean Squared Error: 82.23
R-squared: 0.99
