## Contents
* read data
* data normalisation
* identify response variable and predictors
* split into training and testing datasets
* define the model
* choose hyperparameters to tune
* identify the best hyperparameters using gridsearch
* make predictions based on these hyperparameters
* assess model performance

In [1]:
import os
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
import warnings
warnings.filterwarnings("ignore")

In [2]:
base_dir = os.path.dirname(os.path.abspath("..\data"))
path = os.path.join(base_dir, r"data")
os.chdir(path) 

In [3]:
data = pd.read_csv("Cleaned data\data.csv")

In [4]:
# drop columns
df = data.drop(columns=[
    "DATE","MONTHDATE","FORECASTDEMAND","MIN","ACCMIN","MAX","ACCMAX","RAINPERIOD","QUANTITY","QUANTITYMONTHCUM","QUANTITYTOTALCUM","HUMIDITYMIN","HUMIDITYMAX", "RAIN", "WEEKEND"
], errors="ignore")
df.head()

Unnamed: 0,YEAR,MONTH,DAY,HUMIDITY,WINDSPEED,TOTALDEMAND,HOLIDAY,SOLAR,RRP,OUTPUT,WEEKDAY,TEMPAVE
0,2016,1,1,0.656341,15.902439,6853.633437,1.0,32.2,38.472917,46.93,4,21.95
1,2016,1,2,0.656341,15.902439,6727.613958,0.0,21.7,36.907292,46.93,5,21.0
2,2016,1,3,0.688837,14.488372,6616.406076,0.0,10.3,31.997083,46.93,6,21.55
3,2016,1,4,0.679545,22.477273,7367.750278,0.0,6.4,33.424583,46.93,0,20.9
4,2016,1,5,0.768837,22.581395,7462.242014,0.0,4.4,33.053958,46.93,1,19.05


In [5]:
#one hot encoding weekday
def one_hot_encode(df, col, names):
    array = np.array(df[col])
    integer_encoded = array.reshape(len(array), 1)
    onehotencoder = OneHotEncoder()
    onehot_encoded = onehotencoder.fit_transform(integer_encoded).toarray()
    df_encoded = pd.DataFrame(onehot_encoded, columns=names)
    df = pd.concat([df, df_encoded], axis=1)
    df = df.drop(columns=[col], errors="ignore")
    return df

df = one_hot_encode(df, col="WEEKDAY", names=['MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN'])
df.head()

Unnamed: 0,YEAR,MONTH,DAY,HUMIDITY,WINDSPEED,TOTALDEMAND,HOLIDAY,SOLAR,RRP,OUTPUT,TEMPAVE,MON,TUE,WED,THU,FRI,SAT,SUN
0,2016,1,1,0.656341,15.902439,6853.633437,1.0,32.2,38.472917,46.93,21.95,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,2016,1,2,0.656341,15.902439,6727.613958,0.0,21.7,36.907292,46.93,21.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,2016,1,3,0.688837,14.488372,6616.406076,0.0,10.3,31.997083,46.93,21.55,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,2016,1,4,0.679545,22.477273,7367.750278,0.0,6.4,33.424583,46.93,20.9,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2016,1,5,0.768837,22.581395,7462.242014,0.0,4.4,33.053958,46.93,19.05,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# define scaler and transform data
scaler = MinMaxScaler()
data = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
data.head() 

Unnamed: 0,YEAR,MONTH,DAY,HUMIDITY,WINDSPEED,TOTALDEMAND,HOLIDAY,SOLAR,RRP,OUTPUT,TEMPAVE,MON,TUE,WED,THU,FRI,SAT,SUN
0,0.0,0.0,0.0,0.680083,0.33229,0.169856,1.0,1.0,0.00961,0.024987,0.638081,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.033333,0.680083,0.33229,0.14369,0.0,0.663462,0.008577,0.024987,0.610465,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.066667,0.713754,0.302742,0.1206,0.0,0.298077,0.005337,0.024987,0.626453,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.1,0.704126,0.469674,0.276604,0.0,0.173077,0.006279,0.024987,0.607558,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.133333,0.796647,0.47185,0.296224,0.0,0.108974,0.006035,0.024987,0.553779,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [7]:
def sliding_window(data, window_size, target):
    X = []
    y = []
    for i in range(len(data) - window_size):
        X.append(data.iloc[i:i+window_size].values.flatten())
        y.append(data.iloc[i+window_size][target])
    return X, y 

In [8]:
# identify response variable and predictors
days_ahead = 1
X, y = sliding_window(data=data, window_size=days_ahead, target="TOTALDEMAND") 

In [9]:
# train test split
train_size = 0.8
split = int(train_size * len(X))
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

In [10]:
# Define the XGBoost model
model = xgb.XGBRegressor()

In [11]:
# hyperparameters
params = {
    'max_depth': [3, 7],
    'learning_rate': [0.1, 0.001],
    'n_estimators': [100, 1000]
}

In [12]:
# identify best hyperparameters using gridsearch
grid = GridSearchCV(model, params, cv=5, n_jobs=-1)
grid.fit(X_train, y_train) 

GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    callbacks=None, colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None,
                                    early_stopping_rounds=None,
                                    enable_categorical=False, eval_metric=None,
                                    gamma=None, gpu_id=None, grow_policy=None,
                                    importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=None, max_bin=None,
                                    max_cat_to_onehot=None, max_delta_step=None,
                                    max_depth=None, max_leaves=None,
                                    min_child_weight=None, missing=nan,
                                    monotone_constraints=None, n_estimators=100,


In [13]:
# best hyperparameters and score
print("Best Hyperparameters: ", grid.best_params_)
print("Best Score: ", grid.best_score_) 

Best Hyperparameters:  {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
Best Score:  0.7784522477410093


In [14]:
# train model using best hyperparameters
best_model = grid.best_estimator_
best_model.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.1, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=3, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

In [15]:
# predictions on test data 
y_pred = best_model.predict(X_test)

In [16]:
# print model performance
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error: {:.2f}".format(mae))
print("Mean Squared Error: {:.2f}".format(mse))
print("Root Mean Squared Error: {:.2f}".format(rmse))
print("R-squared: {:.2f}".format(r2)) 

Mean Absolute Error: 0.05
Mean Squared Error: 0.01
Root Mean Squared Error: 0.07
R-squared: 0.84
