# XGBoost Regression Analysis

Author(s):
* Héctor Ochoa Ortiz

## Imports, setup

In [56]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

## Training

In [2]:
# Read the dataset
df = pd.read_csv('../../dataset/KAG_energydata_complete.csv')

In [3]:
df.dtypes

date            object
Appliances       int64
lights           int64
T1             float64
RH_1           float64
T2             float64
RH_2           float64
T3             float64
RH_3           float64
T4             float64
RH_4           float64
T5             float64
RH_5           float64
T6             float64
RH_6           float64
T7             float64
RH_7           float64
T8             float64
RH_8           float64
T9             float64
RH_9           float64
T_out          float64
Press_mm_hg    float64
RH_out         float64
Windspeed      float64
Visibility     float64
Tdewpoint      float64
rv1            float64
rv2            float64
dtype: object

In [4]:
# Extract hour and day of the week from the date, so we only keep numeric values
df["date"] = pd.to_datetime(df["date"])
df["hour"] = df["date"].dt.hour
df["dayofweek"] = df["date"].dt.dayofweek
df = df.drop(columns=["date"])

In [13]:
# Create train and test subsets
df_train: pd.DataFrame
df_test: pd.DataFrame
df_train, df_test = train_test_split(df, test_size=.2, random_state=hash("ResearchMethodology2024")%(2**32))
print(df_train.shape, df_test.shape)

(15788, 30) (3947, 30)


In [14]:
# Transform to DMatrix
train_matrix = xgb.DMatrix(df_train.drop(columns=["Appliances"]), label=df_train["Appliances"])
test_matrix = xgb.DMatrix(df_test.drop(columns=["Appliances"]), label=df_test["Appliances"])

In [53]:
# Set training parameters, https://xgboost.readthedocs.io/en/stable/parameter.html
training_parameters = {
    "booster": "gbtree",
    "max_depth": 6,
    "eta": 0.1,  # learning rate, range [0,1]
    "objective": "reg:squarederror",
    "nthread": 4,
    "eval_metric": ["mae", "rmse"] 
}
rounds = 10000
evaluations = [(test_matrix, "eval"), (train_matrix, "train")]

In [54]:
# Train the model
model = xgb.train(
    params=training_parameters,
    dtrain=train_matrix,
    num_boost_round=rounds,
    evals=evaluations)

[0]	eval-mae:58.89798	eval-rmse:100.47518	train-mae:57.92234	train-rmse:100.05601
[1]	eval-mae:57.10261	eval-rmse:98.44907	train-mae:56.04079	train-rmse:98.00217
[2]	eval-mae:55.70182	eval-rmse:96.84986	train-mae:54.50966	train-rmse:96.29375
[3]	eval-mae:54.40478	eval-rmse:95.43459	train-mae:53.14787	train-rmse:94.77311
[4]	eval-mae:53.40398	eval-rmse:94.36654	train-mae:52.01069	train-rmse:93.49882
[5]	eval-mae:52.45118	eval-rmse:93.26436	train-mae:50.98946	train-rmse:92.32266
[6]	eval-mae:51.37841	eval-rmse:91.89337	train-mae:49.82690	train-rmse:90.85858
[7]	eval-mae:50.50416	eval-rmse:90.82892	train-mae:48.85224	train-rmse:89.62067
[8]	eval-mae:49.88018	eval-rmse:90.07729	train-mae:48.14551	train-rmse:88.80498
[9]	eval-mae:49.21035	eval-rmse:89.24942	train-mae:47.47175	train-rmse:88.00228
[10]	eval-mae:48.52695	eval-rmse:88.35775	train-mae:46.72444	train-rmse:87.02012
[11]	eval-mae:47.90260	eval-rmse:87.58190	train-mae:46.09299	train-rmse:86.18849
[12]	eval-mae:47.46098	eval-rmse:87.

## Prediction, evaluation

In [58]:
prediction = model.predict(test_matrix)
truth = test_matrix.get_label()

In [59]:
mae = mean_absolute_error(truth, prediction)
mse = mean_squared_error(truth, prediction)
rmse = np.sqrt(mse)
r2 = r2_score(truth, prediction)

In [60]:
report = f"""
Regression Report:
------------------
Mean Absolute Error (MAE): {mae:.4f}
Mean Squared Error (MSE): {mse:.4f}
Root Mean Squared Error (RMSE): {rmse:.4f}
R-squared (R²): {r2:.4f}
"""

print(report)


Regression Report:
------------------
Mean Absolute Error (MAE): 31.8750
Mean Squared Error (MSE): 4151.0898
Root Mean Squared Error (RMSE): 64.4290
R-squared (R²): 0.6072

