In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import (
    r2_score,
    mean_squared_error,
    mean_absolute_error
)

import joblib

In [2]:
df = pd.read_csv("../data/Energy_consumption.csv")
df.head()

Unnamed: 0,Timestamp,Temperature,Humidity,SquareFootage,Occupancy,HVACUsage,LightingUsage,RenewableEnergy,DayOfWeek,Holiday,EnergyConsumption
0,2022-01-01 00:00:00,25.139433,43.431581,1565.693999,5,On,Off,2.774699,Monday,No,75.364373
1,2022-01-01 01:00:00,27.731651,54.225919,1411.064918,1,On,On,21.831384,Saturday,No,83.401855
2,2022-01-01 02:00:00,28.704277,58.907658,1755.715009,2,Off,Off,6.764672,Sunday,No,78.270888
3,2022-01-01 03:00:00,20.080469,50.371637,1452.316318,1,Off,On,8.623447,Wednesday,No,56.51985
4,2022-01-01 04:00:00,23.097359,51.401421,1094.130359,9,On,Off,3.071969,Friday,No,70.811732


In [3]:
df.shape

(1000, 11)

In [4]:
df.columns

Index(['Timestamp', 'Temperature', 'Humidity', 'SquareFootage', 'Occupancy',
       'HVACUsage', 'LightingUsage', 'RenewableEnergy', 'DayOfWeek', 'Holiday',
       'EnergyConsumption'],
      dtype='object')

In [5]:
df.info

<bound method DataFrame.info of                Timestamp  Temperature   Humidity  SquareFootage  Occupancy  \
0    2022-01-01 00:00:00    25.139433  43.431581    1565.693999          5   
1    2022-01-01 01:00:00    27.731651  54.225919    1411.064918          1   
2    2022-01-01 02:00:00    28.704277  58.907658    1755.715009          2   
3    2022-01-01 03:00:00    20.080469  50.371637    1452.316318          1   
4    2022-01-01 04:00:00    23.097359  51.401421    1094.130359          9   
..                   ...          ...        ...            ...        ...   
995  2022-02-11 11:00:00    28.619382  48.850160    1080.087000          5   
996  2022-02-11 12:00:00    23.836647  47.256435    1705.235156          4   
997  2022-02-11 13:00:00    23.005340  48.720501    1320.285281          6   
998  2022-02-11 14:00:00    25.138365  31.306459    1309.079719          3   
999  2022-02-11 15:00:00    23.051165  42.615421    1018.140606          6   

    HVACUsage LightingUsage  Re

In [6]:
# Drop kolom waktu
df = df.drop(columns=["Timestamp"])

# Drop missing value
df = df.dropna()

In [7]:
from sklearn.preprocessing import LabelEncoder

# Encode categorical columns
for col in df.columns:
    if df[col].dtype == "object":
        df[col] = LabelEncoder().fit_transform(df[col])

In [8]:
df.dtypes

Temperature          float64
Humidity             float64
SquareFootage        float64
Occupancy              int64
HVACUsage              int64
LightingUsage          int64
RenewableEnergy      float64
DayOfWeek              int64
Holiday                int64
EnergyConsumption    float64
dtype: object

In [9]:
X = df.drop("EnergyConsumption", axis=1)
y = df["EnergyConsumption"]

X.head()

Unnamed: 0,Temperature,Humidity,SquareFootage,Occupancy,HVACUsage,LightingUsage,RenewableEnergy,DayOfWeek,Holiday
0,25.139433,43.431581,1565.693999,5,1,0,2.774699,1,0
1,27.731651,54.225919,1411.064918,1,1,1,21.831384,2,0
2,28.704277,58.907658,1755.715009,2,0,0,6.764672,3,0
3,20.080469,50.371637,1452.316318,1,0,1,8.623447,6,0
4,23.097359,51.401421,1094.130359,9,1,0,3.071969,0,0


In [10]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    random_state=42
)

In [14]:
print("Linear Regression")
print("R2 :", r2_score(y_test, y_pred_lr))

mse = mean_squared_error(y_test, y_pred_lr)
rmse = np.sqrt(mse)

print("RMSE:", rmse)
print("MAE :", mean_absolute_error(y_test, y_pred_lr))


Linear Regression
R2 : 0.6173375340134639
RMSE: 5.063064626573129
MAE : 4.074665519964768


In [15]:
rf = RandomForestRegressor(
    n_estimators=300,
    random_state=42,
    n_jobs=-1
)

rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)

print("Random Forest")
print("R2 :", r2_score(y_test, y_pred_rf))

mse = mean_squared_error(y_test, y_pred_rf)
rmse = np.sqrt(mse)

print("RMSE:", rmse)
print("MAE :", mean_absolute_error(y_test, y_pred_rf))


Random Forest
R2 : 0.5567175344287212
RMSE: 5.449363949399949
MAE : 4.372525568297179


In [17]:
joblib.dump(rf, "../models/energy_model.pkl")
joblib.dump(X.columns.tolist(), "../models/feature_columns.pkl")

['../models/feature_columns.pkl']

In [18]:
X.columns.tolist()

['Temperature',
 'Humidity',
 'SquareFootage',
 'Occupancy',
 'HVACUsage',
 'LightingUsage',
 'RenewableEnergy',
 'DayOfWeek',
 'Holiday']