In [1]:
!pip install pandas numpy matplotlib seaborn scikit-learn
!pip install pytorch-lightning
!pip install torch
!pip install optuna



In [2]:
# =====================================================
# 1. IMPORT LIBRARIES
# =====================================================

import pandas as pd
import numpy as np

from xgboost import XGBRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [3]:
# =====================================================
# 2. LOAD DATA
# =====================================================

historical = pd.read_csv("/content/drive/MyDrive/Skripsi/Dataset/Wind Power/wind farm historical data.csv")
nwp = pd.read_csv("/content/drive/MyDrive/Skripsi/Dataset/Wind Power/NWP.csv")

historical.columns = historical.columns.str.strip()
nwp.columns = nwp.columns.str.strip()

# Convert datetime
historical['Date'] = pd.to_datetime(historical['Date'], dayfirst=True)
nwp['time'] = pd.to_datetime(nwp['time'], format='mixed')

# Rename columns
historical = historical.rename(columns={
    'Date':'timestamp',
    'Speed':'wind_speed',
    'Direction':'wind_direction',
    'Energy':'power'
})

nwp = nwp.rename(columns={
    'time':'timestamp',
    'mod':'wind_speed_nwp',
    'dir':'wind_dir_nwp',
    'temp':'temperature_nwp',
    'rh':'humidity_nwp',
    'mslp':'pressure_nwp'
})

# Merge
data = pd.merge(historical, nwp, on='timestamp', how='inner')
data = data.sort_values('timestamp').reset_index(drop=True)
data = data.ffill().dropna()

print("Merged shape:", data.shape)

Merged shape: (8784, 9)


In [4]:
# =====================================================
# 3. FEATURE ENGINEERING
# =====================================================

data_fe = data.copy()

# Time features
data_fe['hour'] = data_fe['timestamp'].dt.hour
data_fe['month'] = data_fe['timestamp'].dt.month
data_fe['year'] = data_fe['timestamp'].dt.year

data_fe['hour_sin'] = np.sin(2*np.pi*data_fe['hour']/24)
data_fe['hour_cos'] = np.cos(2*np.pi*data_fe['hour']/24)
data_fe['month_sin'] = np.sin(2*np.pi*data_fe['month']/12)
data_fe['month_cos'] = np.cos(2*np.pi*data_fe['month']/12)

# Lag features
for lag in [1,2,3,6,12,24]:
    data_fe[f'power_lag_{lag}'] = data_fe['power'].shift(lag)

# Rolling features
data_fe['rolling_mean_6'] = data_fe['power'].rolling(6).mean()
data_fe['rolling_std_6'] = data_fe['power'].rolling(6).std()

# Physics features
data_fe['wind_speed_cubed'] = data_fe['wind_speed']**3
data_fe['wind_speed_nwp_cubed'] = data_fe['wind_speed_nwp']**3

data_fe = data_fe.dropna().reset_index(drop=True)

print("After feature engineering:", data_fe.shape)

After feature engineering: (8760, 26)


In [5]:
# =====================================================
# 4. ROLLING WALK-FORWARD VALIDATION
# =====================================================

test_months = [10, 11, 12]  # contoh 3 window akhir

results = []

features = data_fe.drop(
    ['timestamp','power','year','month'],
    axis=1
).columns

for month in test_months:

    # Train: semua bulan sebelum bulan test
    train_data = data_fe[data_fe['month'] < month]
    test_data = data_fe[data_fe['month'] == month]

    if len(train_data) == 0 or len(test_data) == 0:
        continue

    # Scaling (fit hanya train)
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(train_data[features])
    X_test = scaler.transform(test_data[features])

    y_train = train_data['power']
    y_test = test_data['power']

    # Model
    model = XGBRegressor(
        n_estimators=800,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    )

    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, preds))
    mae = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)
    nrmse = rmse / np.mean(y_test)

    results.append({
        'Month': month,
        'RMSE': rmse,
        'MAE': mae,
        'R2': r2,
        'nRMSE': nrmse
    })

    print(f"\n===== Test Month {month} =====")
    print("RMSE:", round(rmse,2))
    print("MAE:", round(mae,2))
    print("R2:", round(r2,4))
    print("nRMSE:", round(nrmse,4))


===== Test Month 10 =====
RMSE: 97.2
MAE: 60.68
R2: 0.9259
nRMSE: 0.3483

===== Test Month 11 =====
RMSE: 145.81
MAE: 92.62
R2: 0.9444
nRMSE: 0.2761

===== Test Month 12 =====
RMSE: 127.43
MAE: 78.86
R2: 0.9008
nRMSE: 0.3832


In [6]:
# =====================================================
# 5. SUMMARY STATISTICS
# =====================================================

results_df = pd.DataFrame(results)

print("\n========== ROLLING VALIDATION SUMMARY ==========")
print(results_df)

print("\nAverage RMSE:", round(results_df['RMSE'].mean(),2))
print("Std RMSE:", round(results_df['RMSE'].std(),2))
print("Average R2:", round(results_df['R2'].mean(),4))
print("Average nRMSE:", round(results_df['nRMSE'].mean(),4))


   Month        RMSE        MAE        R2     nRMSE
0     10   97.198668  60.678536  0.925948  0.348264
1     11  145.806032  92.624767  0.944367  0.276099
2     12  127.428677  78.860820  0.900800  0.383210

Average RMSE: 123.48
Std RMSE: 24.54
Average R2: 0.9237
Average nRMSE: 0.3359
