## Contents
* read data
* data normalisation
* identify response variable and predictors
* split into training and testing datasets
* define the model
* choose hyperparameters to tune
* identify the best hyperparameters using gridsearch
* make predictions based on these hyperparameters
* assess model performance

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler 

In [2]:
base_dir = os.path.dirname(os.path.abspath("..\data"))
path = os.path.join(base_dir, r"data")
os.chdir(path) 

In [3]:
df = pd.read_csv("data.csv")

In [4]:
# drop columns
df = df.drop(columns=[
    "DATE","MONTHDATE","FORECASTDEMAND","ACCMAX","RAINPERIOD","QUANTITY","QUANTITYMONTHCUM","QUANTITYTOTALCUM"
], errors="ignore")
df.head()

Unnamed: 0,YEAR,MONTH,DAY,TOTALDEMAND,MAX,RAIN,SOLAR,RRP,OUTPUT
0,2010,1,1,7793.463681,29.6,0.2,14.6,20.364894,1.53
1,2010,1,1,7793.463681,29.6,0.2,14.6,20.364894,1.53
2,2010,1,2,8012.314097,29.5,0.0,18.4,20.478125,1.53
3,2010,1,2,8012.314097,29.5,0.0,18.4,20.478125,1.53
4,2010,1,3,7393.354514,21.0,15.2,7.3,20.277083,1.53


In [5]:
# define scaler and transform data
scaler = MinMaxScaler()
data = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
data.head()

Unnamed: 0,YEAR,MONTH,DAY,TOTALDEMAND,MAX,RAIN,SOLAR,RRP,OUTPUT
0,0.0,0.0,0.0,0.319521,0.502857,0.001253,0.409639,0.001964,0.003338
1,0.0,0.0,0.0,0.319521,0.502857,0.001253,0.409639,0.001964,0.003338
2,0.0,0.0,0.033333,0.3593,0.5,0.0,0.524096,0.002038,0.003338
3,0.0,0.0,0.033333,0.3593,0.5,0.0,0.524096,0.002038,0.003338
4,0.0,0.0,0.066667,0.246796,0.257143,0.095238,0.189759,0.001906,0.003338


In [6]:
# identify response variable and predictors
X = data.drop(['TOTALDEMAND'], axis=1)
y = data['TOTALDEMAND'] 

In [7]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# define model
model = GradientBoostingRegressor()

In [9]:
# hyperparameters
params = {
    'n_estimators': [100, 500, 1000],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.5]
}

In [10]:
# identify best hyperparameters using gridsearch
grid = GridSearchCV(model, params, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=GradientBoostingRegressor(alpha=0.9,
                                                 criterion='friedman_mse',
                                                 init=None, learning_rate=0.1,
                                                 loss='ls', max_depth=3,
                                                 max_features=None,
                                                 max_leaf_nodes=None,
                                                 min_impurity_decrease=0.0,
                                                 min_impurity_split=None,
                                                 min_samples_leaf=1,
                                                 min_samples_split=2,
                                                 min_weight_fraction_leaf=0.0,
                                                 n_estimators=100,
                                                 n_iter_no_change=None,
                

In [11]:
# best hyperparameters
print(grid.best_params_)

{'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 1000}


In [12]:
# train model using best hyperparameters
best_model = grid.best_estimator_
best_model.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=7,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=1000,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [13]:
# predictions on test data
y_pred = best_model.predict(X_test)

In [14]:
# print model performance
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error: {:.2f}".format(mae))
print("Mean Squared Error: {:.2f}".format(mse))
print("Root Mean Squared Error: {:.2f}".format(rmse))
print("R-squared: {:.2f}".format(r2))

Mean Absolute Error: 0.02
Mean Squared Error: 0.00
Root Mean Squared Error: 0.04
R-squared: 0.94
