# Linear Regression Analysis

Authors:
* Federico Maria Cruciani
* Héctor Ochoa Ortiz

## Imports, setup

In [3]:
import pandas as pd
import xgboost as xgb
from numpy import array
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from urllib.request import urlretrieve

## Training

In [30]:
# Read the dataset
df = pd.read_csv('../../dataset/KAG_energydata_complete.csv')

In [31]:
df.dtypes

date            object
Appliances       int64
lights           int64
T1             float64
RH_1           float64
T2             float64
RH_2           float64
T3             float64
RH_3           float64
T4             float64
RH_4           float64
T5             float64
RH_5           float64
T6             float64
RH_6           float64
T7             float64
RH_7           float64
T8             float64
RH_8           float64
T9             float64
RH_9           float64
T_out          float64
Press_mm_hg    float64
RH_out         float64
Windspeed      float64
Visibility     float64
Tdewpoint      float64
rv1            float64
rv2            float64
dtype: object

In [32]:
# Extract hour and day of the week from the date, so we only keep numeric values
df["date"] = pd.to_datetime(df["date"])
df["hour"] = df["date"].dt.hour
df["dayofweek"] = df["date"].dt.dayofweek
df = df.drop(columns=["date"])

In [33]:
# Create train and test subsets
df_train: pd.DataFrame
df_test: pd.DataFrame
df_train, df_test = train_test_split(df, test_size=.3, random_state=hash("ResearchMethodology2024")%(2**32))
print(df_train.shape, df_test.shape)

(13814, 30) (5921, 30)


In [34]:
# Transform to DMatrix
train_matrix = xgb.DMatrix(df_train.drop(columns=["Appliances"]), label=df_train["Appliances"])
test_matrix = xgb.DMatrix(df_test.drop(columns=["Appliances"]), label=df_test["Appliances"])

In [48]:
# Set training parameters
training_parameters = {
    "booster":"gbtree",
    "max_depth": 2,
    "eta": 0.3,
    "objective": "reg:squarederror",
    "nthread": 4
}
rounds = 100
evaluations = [(test_matrix, "eval"), (train_matrix, "train")]

In [49]:
# Train the model
model = xgb.train(
    params=training_parameters,
    dtrain=train_matrix,
    num_boost_round=rounds,
    evals=evaluations)

[0]	eval-rmse:101.71758	train-rmse:98.41002
[1]	eval-rmse:99.89485	train-rmse:96.76530
[2]	eval-rmse:98.77043	train-rmse:95.64862
[3]	eval-rmse:97.77838	train-rmse:94.78371
[4]	eval-rmse:97.16216	train-rmse:94.20067
[5]	eval-rmse:96.62754	train-rmse:93.70949
[6]	eval-rmse:96.24996	train-rmse:93.33637
[7]	eval-rmse:95.88921	train-rmse:92.96180
[8]	eval-rmse:95.56095	train-rmse:92.48555
[9]	eval-rmse:95.36567	train-rmse:92.26449
[10]	eval-rmse:94.85964	train-rmse:91.83256
[11]	eval-rmse:94.39723	train-rmse:91.43000
[12]	eval-rmse:94.16449	train-rmse:91.26605
[13]	eval-rmse:93.83890	train-rmse:90.98342
[14]	eval-rmse:93.53357	train-rmse:90.68957
[15]	eval-rmse:93.33094	train-rmse:90.54143
[16]	eval-rmse:93.13480	train-rmse:90.30334
[17]	eval-rmse:92.88991	train-rmse:89.99139
[18]	eval-rmse:92.72686	train-rmse:89.82293
[19]	eval-rmse:92.57460	train-rmse:89.57336
[20]	eval-rmse:92.34472	train-rmse:89.29889
[21]	eval-rmse:92.21273	train-rmse:89.10319
[22]	eval-rmse:91.92052	train-rmse:88.892