In [1]:
!pip install pandas numpy matplotlib seaborn scikit-learn
!pip install pytorch-lightning
!pip install torch
!pip install optuna

Collecting pytorch-lightning
  Downloading pytorch_lightning-2.6.1-py3-none-any.whl.metadata (21 kB)
Collecting torchmetrics>0.7.0 (from pytorch-lightning)
  Downloading torchmetrics-1.8.2-py3-none-any.whl.metadata (22 kB)
Collecting lightning-utilities>=0.10.0 (from pytorch-lightning)
  Downloading lightning_utilities-0.15.2-py3-none-any.whl.metadata (5.7 kB)
Downloading pytorch_lightning-2.6.1-py3-none-any.whl (857 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m857.3/857.3 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lightning_utilities-0.15.2-py3-none-any.whl (29 kB)
Downloading torchmetrics-1.8.2-py3-none-any.whl (983 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m983.2/983.2 kB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lightning-utilities, torchmetrics, pytorch-lightning
Successfully installed lightning-utilities-0.15.2 pytorch-lightning-2.6.1 torchmetrics-1.8.2
Collecting optuna
  Down

In [2]:
# =====================================================
# 1. IMPORT LIBRARIES
# =====================================================

import pandas as pd
import numpy as np
import optuna

from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [4]:
# =====================================================
# 2. LOAD & MERGE DATA
# =====================================================

historical = pd.read_csv("/content/drive/MyDrive/Skripsi/Dataset/Wind Power/wind farm historical data.csv")
nwp = pd.read_csv("/content/drive/MyDrive/Skripsi/Dataset/Wind Power/NWP.csv")

historical.columns = historical.columns.str.strip()
nwp.columns = nwp.columns.str.strip()

historical['Date'] = pd.to_datetime(historical['Date'], dayfirst=True)
nwp['time'] = pd.to_datetime(nwp['time'], format='mixed')

historical = historical.rename(columns={
    'Date':'timestamp',
    'Speed':'wind_speed',
    'Direction':'wind_direction',
    'Energy':'power'
})

nwp = nwp.rename(columns={
    'time':'timestamp',
    'mod':'wind_speed_nwp',
    'dir':'wind_dir_nwp',
    'temp':'temperature_nwp',
    'rh':'humidity_nwp',
    'mslp':'pressure_nwp'
})

data = pd.merge(historical, nwp, on='timestamp', how='inner')
data = data.sort_values('timestamp').reset_index(drop=True)
data = data.ffill().dropna()

print("Merged shape:", data.shape)

Merged shape: (8784, 9)


In [5]:
# =====================================================
# 3. FEATURE ENGINEERING
# =====================================================

data_fe = data.copy()

data_fe['hour'] = data_fe['timestamp'].dt.hour
data_fe['month'] = data_fe['timestamp'].dt.month

data_fe['hour_sin'] = np.sin(2*np.pi*data_fe['hour']/24)
data_fe['hour_cos'] = np.cos(2*np.pi*data_fe['hour']/24)
data_fe['month_sin'] = np.sin(2*np.pi*data_fe['month']/12)
data_fe['month_cos'] = np.cos(2*np.pi*data_fe['month']/12)

for lag in [1,2,3,6,12,24]:
    data_fe[f'power_lag_{lag}'] = data_fe['power'].shift(lag)

data_fe['rolling_mean_6'] = data_fe['power'].rolling(6).mean()
data_fe['rolling_std_6'] = data_fe['power'].rolling(6).std()

data_fe['wind_speed_cubed'] = data_fe['wind_speed']**3
data_fe['wind_speed_nwp_cubed'] = data_fe['wind_speed_nwp']**3

data_fe = data_fe.dropna().reset_index(drop=True)

print("After feature engineering:", data_fe.shape)

After feature engineering: (8760, 25)


In [6]:
# =====================================================
# 4. TIME-BASED SPLIT
# =====================================================

train_size = int(len(data_fe)*0.7)
val_size = int(len(data_fe)*0.15)

train = data_fe[:train_size]
val = data_fe[train_size:train_size+val_size]
test = data_fe[train_size+val_size:]

features = data_fe.drop(['timestamp','power'], axis=1).columns

X_train = train[features]
y_train = train['power']

X_val = val[features]
y_val = val['power']

X_test = test[features]
y_test = test['power']

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

In [7]:
# =====================================================
# 5. OPTUNA TUNING FOR XGBOOST
# =====================================================

def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 400, 1500),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5),
        'random_state': 42,
        'n_jobs': -1
    }

    model = XGBRegressor(**params)
    model.fit(X_train_scaled, y_train)

    pred_val = model.predict(X_val_scaled)
    rmse = np.sqrt(mean_squared_error(y_val, pred_val))
    return rmse

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

print("Best parameters:", study.best_params)

best_model = XGBRegressor(**study.best_params)
best_model.fit(X_train_scaled, y_train)

pred_xgb = best_model.predict(X_test_scaled)

[I 2026-02-18 10:09:50,047] A new study created in memory with name: no-name-cad7d239-bae7-46c3-88e1-00531bb0410a
[I 2026-02-18 10:10:02,084] Trial 0 finished with value: 97.55510504763825 and parameters: {'n_estimators': 738, 'max_depth': 5, 'learning_rate': 0.06605830645087217, 'subsample': 0.6155985118515807, 'colsample_bytree': 0.9634498687191366, 'gamma': 1.7118545629548647, 'min_child_weight': 9, 'reg_alpha': 3.2977801301523635, 'reg_lambda': 3.05763496577193}. Best is trial 0 with value: 97.55510504763825.
[I 2026-02-18 10:10:19,209] Trial 1 finished with value: 104.62869918351963 and parameters: {'n_estimators': 1166, 'max_depth': 10, 'learning_rate': 0.18159611942426837, 'subsample': 0.6645535799415109, 'colsample_bytree': 0.895918037706686, 'gamma': 1.0953904435289812, 'min_child_weight': 10, 'reg_alpha': 4.691409760945075, 'reg_lambda': 3.4903105070850127}. Best is trial 0 with value: 97.55510504763825.
[I 2026-02-18 10:10:21,195] Trial 2 finished with value: 106.89939938394

Best parameters: {'n_estimators': 516, 'max_depth': 7, 'learning_rate': 0.04420426202478934, 'subsample': 0.699158352118413, 'colsample_bytree': 0.9530714984391538, 'gamma': 2.984508562771155, 'min_child_weight': 5, 'reg_alpha': 3.241648283724681, 'reg_lambda': 1.7110979944562512}


In [8]:
# =====================================================
# 6. RANDOM FOREST
# =====================================================

rf_model = RandomForestRegressor(
    n_estimators=600,
    random_state=42,
    n_jobs=-1
)

rf_model.fit(X_train_scaled, y_train)
pred_rf = rf_model.predict(X_test_scaled)

In [9]:
# =====================================================
# 7. EXTRA TREES
# =====================================================

et_model = ExtraTreesRegressor(
    n_estimators=600,
    random_state=42,
    n_jobs=-1
)

et_model.fit(X_train_scaled, y_train)
pred_et = et_model.predict(X_test_scaled)

In [10]:
# =====================================================
# 8. WEIGHTED ENSEMBLE
# =====================================================

w1 = 0.5
w2 = 0.3
w3 = 0.2

pred_ensemble = (w1*pred_xgb) + (w2*pred_rf) + (w3*pred_et)

In [11]:
# =====================================================
# 9. FINAL EVALUATION
# =====================================================

rmse = np.sqrt(mean_squared_error(y_test, pred_ensemble))
mae = mean_absolute_error(y_test, pred_ensemble)
r2 = r2_score(y_test, pred_ensemble)

mean_power = np.mean(y_test)
nrmse = rmse / mean_power

print("\n===== FINAL ENSEMBLE RESULTS =====")
print("RMSE:", round(rmse,4))
print("MAE:", round(mae,4))
print("R2:", round(r2,4))
print("nRMSE:", round(nrmse,4))


===== FINAL ENSEMBLE RESULTS =====
RMSE: 154.112
MAE: 94.4987
R2: 0.9207
nRMSE: 0.3345
