## Contents
* read data
* data normalisation
* identify response variable and predictors
* split into training and testing datasets
* define the model
* choose hyperparameters to tune
* identify the best hyperparameters using gridsearch
* make predictions based on these hyperparameters
* assess model performance

In [3]:
import os
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler 

In [4]:
# Switch the working directory to the "data" folder that sits next to the
# current working directory (i.e. <parent of cwd>/data), so that later cells
# can use paths relative to it.
# The path is built from os.pardir / os.path.join rather than a literal
# "..\data": the backslash form only works on Windows and "\d" is an invalid
# string escape.
# Note: the location is resolved relative to the current working directory, so
# re-running this cell without restarting the kernel resolves it relative to
# the directory it switched into the first time.
base_dir = os.path.abspath(os.pardir)
path = os.path.join(base_dir, "data")
os.chdir(path)

In [5]:
# Load the cleaned dataset, relative to the working directory set above.
# os.path.join keeps the path portable; the literal "Cleaned data\data.csv"
# relied on a Windows separator and contained the invalid escape "\d".
df = pd.read_csv(os.path.join("Cleaned data", "data.csv"))

In [6]:
# Columns removed before modelling. errors="ignore" means any name in this
# list that is not present in the frame is skipped instead of raising.
unused_columns = [
    "DATE",
    "MONTHDATE",
    "FORECASTDEMAND",
    "MIN",
    "ACCMIN",
    "MAX",
    "ACCMAX",
    "RAINPERIOD",
    "QUANTITY",
    "QUANTITYMONTHCUM",
    "QUANTITYTOTALCUM",
    "HUMIDITYMIN",
    "HUMIDITYMAX",
    "RAIN",
]
df = df.drop(columns=unused_columns, errors="ignore")
df.head()

Unnamed: 0,YEAR,MONTH,DAY,TOTALDEMAND,SOLAR,RRP,OUTPUT,HUMIDITYAVE,TEMPAVE
0,2010,1,1,7793.463681,14.6,20.364894,1.53,0.67587,24.95
1,2010,1,1,7793.463681,14.6,20.364894,1.53,0.67587,24.95
2,2010,1,1,7793.463681,14.6,20.364894,1.53,0.735918,24.95
3,2010,1,1,7793.463681,14.6,20.364894,1.53,0.832766,24.95
4,2010,1,1,7793.463681,14.6,20.364894,1.53,0.706042,24.95


In [7]:
# Min-max scale every column of df (the TOTALDEMAND target included) and
# rebuild a DataFrame with the original column names.
# NOTE(review): the scaler is fitted on the full frame, before the train/test
# split performed later in the notebook, so test rows contribute to the fitted
# minima/maxima. Because the target is scaled as well, the error metrics
# reported at the end are in scaled units, not in the original demand units.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df)
data = pd.DataFrame(scaled_values, columns=df.columns)
data.head()

Unnamed: 0,YEAR,MONTH,DAY,TOTALDEMAND,SOLAR,RRP,OUTPUT,HUMIDITYAVE,TEMPAVE
0,0.0,0.0,0.0,0.319521,0.409639,0.001964,0.003338,0.61303,0.657609
1,0.0,0.0,0.0,0.319521,0.409639,0.001964,0.003338,0.61303,0.657609
2,0.0,0.0,0.0,0.319521,0.409639,0.001964,0.003338,0.688724,0.657609
3,0.0,0.0,0.0,0.319521,0.409639,0.001964,0.003338,0.810804,0.657609
4,0.0,0.0,0.0,0.319521,0.409639,0.001964,0.003338,0.651063,0.657609


In [8]:
# Response variable: the TOTALDEMAND column; predictors: every other column.
target_column = 'TOTALDEMAND'
y = data[target_column]
X = data.drop(columns=[target_column])

In [9]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable across runs.
# NOTE(review): the head of the frame shown earlier contains several rows with
# the same date and the same TOTALDEMAND. A row-level split can place such
# rows on both sides of the split -- confirm whether a split by date/group is
# more appropriate for this data.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [14]:
# Base XGBoost regressor; max_depth, learning_rate and n_estimators are
# overridden by the grid search below.
# random_state is pinned (same seed as the train/test split) so that results
# stay reproducible if a sampling-based option is enabled later.
model = xgb.XGBRegressor(random_state=42)

In [15]:
# Search space for the grid search: 3 values for each of 3 hyperparameters,
# i.e. 27 candidate combinations.
params = dict(
    max_depth=[3, 5, 7],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[100, 500, 1000],
)

In [16]:
# Exhaustive search over `params` using 5-fold cross-validation on the
# training data, parallelised over all available cores (n_jobs=-1).
# No `scoring` argument is passed, so candidates are ranked by the estimator's
# own score method.
grid = GridSearchCV(
    estimator=model,
    param_grid=params,
    cv=5,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

In [None]:
# Report the winning hyperparameter combination and its mean cross-validated
# score from the grid search.
search_summary = (
    ("Best Hyperparameters: ", grid.best_params_),
    ("Best Score: ", grid.best_score_),
)
for label, value in search_summary:
    print(label, value)

{'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 1000}


In [None]:
# Model trained with the best hyperparameters.
# GridSearchCV was created with its default refit=True, so best_estimator_ has
# already been refitted on the whole training set with the winning
# hyperparameters. Calling fit() on it again would only repeat that (expensive)
# training run, so the estimator is used as-is.
best_model = grid.best_estimator_
best_model

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=7,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=1000,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [None]:
# predictions on test data
# The model was trained on the min-max scaled target, so these predictions are
# in the same scaled units as y_test.
y_pred = best_model.predict(X_test)

In [None]:
# Regression performance on the held-out test set.
# Only regression metrics are reported: accuracy / precision / recall / F1 are
# classification metrics and raise an error when given continuous targets, so
# they do not apply to this model.
# Both y_test and y_pred are in min-max scaled units, so MAE / MSE / RMSE are
# expressed on that scale rather than in original demand units.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error: {:.2f}".format(mae))
print("Mean Squared Error: {:.2f}".format(mse))
print("Root Mean Squared Error: {:.2f}".format(rmse))
print("R-squared: {:.2f}".format(r2))

NameError: name 'y_pred' is not defined