In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
import warnings
warnings.filterwarnings('ignore')

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PowerTransformer, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold, cross_val_score


In [2]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
appliances_energy_prediction = fetch_ucirepo(id=374) 
  
# data (as pandas dataframes) 
X = appliances_energy_prediction.data.features 
y = appliances_energy_prediction.data.targets 
  
# metadata 
print(appliances_energy_prediction.metadata) 
  
# variable information 
print(appliances_energy_prediction.variables) 


{'uci_id': 374, 'name': 'Appliances Energy Prediction', 'repository_url': 'https://archive.ics.uci.edu/dataset/374/appliances+energy+prediction', 'data_url': 'https://archive.ics.uci.edu/static/public/374/data.csv', 'abstract': 'Experimental data used to create regression models of appliances energy use in a low energy building.', 'area': 'Computer Science', 'tasks': ['Regression'], 'characteristics': ['Multivariate', 'Time-Series'], 'num_instances': 19735, 'num_features': 28, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['Appliances'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2017, 'last_updated': 'Fri Mar 29 2024', 'dataset_doi': '10.24432/C5VC8G', 'creators': ['Luis Candanedo'], 'intro_paper': {'ID': 398, 'type': 'NATIVE', 'title': 'Data driven prediction models of energy use of appliances in a low-energy house', 'authors': 'L. Candanedo, V. Feldheim, Dominique Deramaix', 'venue': 'Energy and Buildings,

In [3]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
y_train = y_train.values.ravel()

In [4]:
date_format = '%Y-%m-%d%H:%M:%S'

X_train['date'] = pd.to_datetime(X_train['date'], format=date_format)
X_val['date'] = pd.to_datetime(X_val['date'], format=date_format)

In [5]:
X_train['dayofweek'] = X_train['date'].dt.dayofweek
X_train['hour'] = X_train['date'].dt.hour
X_train['month'] = X_train['date'].dt.month
X_train['year'] = X_train['date'].dt.year
X_train['day'] = X_train['date'].dt.day
X_train.drop(columns=['date'], inplace=True)

In [6]:
X_val['dayofweek'] = X_val['date'].dt.dayofweek
X_val['hour'] = X_val['date'].dt.hour
X_val['month'] = X_val['date'].dt.month
X_val['year'] = X_val['date'].dt.year
X_val['day'] = X_val['date'].dt.day
X_val.drop(columns=['date'], inplace=True)

In [7]:
X_train['day_sin'] = np.sin(2 * np.pi * X_train['day']/31)
X_train['day_cos'] = np.cos(2 * np.pi * X_train['day']/31)
X_train['month_sin'] = np.sin(2 * np.pi * X_train['month']/12)
X_train['month_cos'] = np.cos(2 * np.pi * X_train['month']/12)
X_train['hour_sin'] = np.sin(2 * np.pi * X_train['hour']/24)
X_train['hour_cos'] = np.cos(2 * np.pi * X_train['hour']/24)
X_train['dayofweek_sin'] = np.sin(2 * np.pi * X_train['dayofweek']/31)
X_train['dayofweek_cos'] = np.cos(2 * np.pi * X_train['dayofweek']/31)
X_train.drop(columns=['day', 'month', 'hour', 'dayofweek'], inplace=True)

In [8]:
X_val['day_sin'] = np.sin(2 * np.pi * X_val['day']/31)
X_val['day_cos'] = np.cos(2 * np.pi * X_val['day']/31)
X_val['month_sin'] = np.sin(2 * np.pi * X_val['month']/12)
X_val['month_cos'] = np.cos(2 * np.pi * X_val['month']/12)
X_val['hour_sin'] = np.sin(2 * np.pi * X_val['hour']/24)
X_val['hour_cos'] = np.cos(2 * np.pi * X_val['hour']/24)
X_val['dayofweek_sin'] = np.sin(2 * np.pi * X_val['dayofweek']/31)
X_val['dayofweek_cos'] = np.cos(2 * np.pi * X_val['dayofweek']/31)
X_val.drop(columns=['day', 'month', 'hour', 'dayofweek'], inplace=True)

In [9]:
num = [i for i in X_train.columns if i not in ['lights','year']]
cat = [i for i in X_train.columns if i not in num]
cat, num

(['lights', 'year'],
 ['T1',
  'RH_1',
  'T2',
  'RH_2',
  'T3',
  'RH_3',
  'T4',
  'RH_4',
  'T5',
  'RH_5',
  'T6',
  'RH_6',
  'T7',
  'RH_7',
  'T8',
  'RH_8',
  'T9',
  'RH_9',
  'T_out',
  'Press_mm_hg',
  'RH_out',
  'Windspeed',
  'Visibility',
  'Tdewpoint',
  'rv1',
  'rv2',
  'day_sin',
  'day_cos',
  'month_sin',
  'month_cos',
  'hour_sin',
  'hour_cos',
  'dayofweek_sin',
  'dayofweek_cos'])

In [10]:
def objective(trial):
    """
    Fungsi objective yang diperluas dengan lebih banyak model regresi.
    """
    num_pipe = Pipeline(steps=[
    # ('imputer', SimpleImputer(strategy='median')),
    ('poly', PolynomialFeatures(degree=trial.suggest_int('degree', 1, 2), include_bias=False)),
    ('power', PowerTransformer(method='yeo-johnson')),
    ('scaler', StandardScaler())
    ])
    cat_pipe = Pipeline(steps=[
        # ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    preprocessor = ColumnTransformer(transformers=[
        ('num', num_pipe, num),
        ('cat', cat_pipe, cat)
    ])
    
    params = {
    'objective': 'regression',
    'metric': 'rmse',
    'n_estimators': trial.suggest_int('lgbm_n_estimators', 100, 3000, step=100),
    'learning_rate': trial.suggest_float('lgbm_learning_rate', 0.005, 0.5, log=True),
    'num_leaves': trial.suggest_int('lgbm_num_leaves', 10, 400),
    'max_depth': trial.suggest_int('lgbm_max_depth', 5, 50),
    'subsample': trial.suggest_float('lgbm_subsample', 0.5, 1.0),
    'colsample_bytree': trial.suggest_float('lgbm_colsample_bytree', 0.5, 1.0),
    'reg_alpha': trial.suggest_float('lgbm_reg_alpha', 1e-8, 10.0, log=True),
    'reg_lambda': trial.suggest_float('lgbm_reg_lambda', 1e-8, 10.0, log=True),
    'random_state': 42,
    'verbose': -1
}
    regressor_obj = LGBMRegressor(**params)
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', regressor_obj)
    ])
    cv = KFold(n_splits=5, shuffle=True, random_state=42)
    score = cross_val_score(pipeline, X_train, y_train, cv=cv, scoring='r2', n_jobs=-1)

    return score.mean()

In [None]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100, n_jobs=-1, show_progress_bar=True)

[I 2025-08-28 23:26:42,010] A new study created in memory with name: no-name-87bffa00-a205-4f9c-b547-e9d05b0552ee


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-08-28 23:35:37,331] Trial 3 finished with value: 0.5166045253225174 and parameters: {'degree': 1, 'lgbm_n_estimators': 500, 'lgbm_learning_rate': 0.28755396421255475, 'lgbm_num_leaves': 345, 'lgbm_max_depth': 23, 'lgbm_subsample': 0.9252982727929896, 'lgbm_colsample_bytree': 0.8979327076018371, 'lgbm_reg_alpha': 0.0063464645155775455, 'lgbm_reg_lambda': 6.152597233107565}. Best is trial 3 with value: 0.5166045253225174.
[I 2025-08-28 23:35:56,743] Trial 1 finished with value: 0.5280931754146956 and parameters: {'degree': 1, 'lgbm_n_estimators': 2300, 'lgbm_learning_rate': 0.1456652115460614, 'lgbm_num_leaves': 215, 'lgbm_max_depth': 43, 'lgbm_subsample': 0.5769584890311276, 'lgbm_colsample_bytree': 0.9790062475732788, 'lgbm_reg_alpha': 1.567537550942898e-05, 'lgbm_reg_lambda': 6.853923869297134e-05}. Best is trial 1 with value: 0.5280931754146956.
[I 2025-08-28 23:38:19,680] Trial 6 finished with value: 0.45425037376475663 and parameters: {'degree': 1, 'lgbm_n_estimators': 2100