In [1]:
!pip install pandas numpy matplotlib seaborn scikit-learn
!pip install pytorch-lightning
!pip install torch
!pip install optuna



In [2]:
# ===============================
# IMPORT
# ===============================
import pandas as pd
import numpy as np
import numpy as np
import optuna
from xgboost import XGBRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# ===============================
# LOAD
# ===============================
historical = pd.read_csv("/content/drive/MyDrive/Skripsi/Dataset/Wind Power/wind farm historical data.csv")
nwp = pd.read_csv("/content/drive/MyDrive/Skripsi/Dataset/Wind Power/NWP.csv")

# ===============================
# CLEAN COLUMN NAMES
# ===============================
historical.columns = historical.columns.str.strip()
nwp.columns = nwp.columns.str.strip()

print("Historical columns:", historical.columns)
print("NWP columns:", nwp.columns)

# ===============================
# DATETIME CONVERSION (FIXED)
# ===============================

# Historical → format DD/MM/YYYY
historical['Date'] = pd.to_datetime(
    historical['Date'],
    dayfirst=True
)

# NWP → already ISO format (YYYY-MM-DD)
nwp['time'] = pd.to_datetime(
    nwp['time'],
    format='mixed'   # safest option
)

# ===============================
# RENAME COLUMNS
# ===============================
historical = historical.rename(columns={
    'Date': 'timestamp',
    'Speed': 'wind_speed',
    'Direction': 'wind_direction',
    'Energy': 'power'
})

nwp = nwp.rename(columns={
    'time': 'timestamp',
    'mod': 'wind_speed_nwp',
    'dir': 'wind_dir_nwp',
    'temp': 'temperature_nwp',
    'rh': 'humidity_nwp',
    'mslp': 'pressure_nwp'
})

# ===============================
# MERGE
# ===============================
data = pd.merge(historical, nwp, on='timestamp', how='inner')

# ===============================
# SORT & CLEAN
# ===============================
data = data.sort_values('timestamp').reset_index(drop=True)
data = data.ffill().dropna()

# ===============================
# CHECK
# ===============================
print("\nDataset Info:")
print(data.info())

print("\nFirst Rows:")
print(data.head())

print("\nMissing Values:")
print(data.isnull().sum())

Historical columns: Index(['Date', 'Speed', 'Direction', 'Energy'], dtype='object')
NWP columns: Index(['time', 'mod', 'dir', 'temp', 'rh', 'mslp'], dtype='object')

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8784 entries, 0 to 8783
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   timestamp        8784 non-null   datetime64[ns]
 1   wind_speed       8784 non-null   float64       
 2   wind_direction   8784 non-null   float64       
 3   power            8784 non-null   float64       
 4   wind_speed_nwp   8784 non-null   float64       
 5   wind_dir_nwp     8784 non-null   float64       
 6   temperature_nwp  8784 non-null   float64       
 7   humidity_nwp     8784 non-null   float64       
 8   pressure_nwp     8784 non-null   float64       
dtypes: datetime64[ns](1), float64(8)
memory usage: 617.8 KB
None

First Rows:
            timestamp  wind_speed  wind_direction   

In [3]:
# ==========================================
# FEATURE ENGINEERING
# ==========================================

data_fe = data.copy()

# ===============================
# 1. TIME FEATURES
# ===============================
data_fe['hour'] = data_fe['timestamp'].dt.hour
data_fe['day'] = data_fe['timestamp'].dt.day
data_fe['month'] = data_fe['timestamp'].dt.month
data_fe['weekday'] = data_fe['timestamp'].dt.weekday

# Cyclical encoding
data_fe['hour_sin'] = np.sin(2 * np.pi * data_fe['hour'] / 24)
data_fe['hour_cos'] = np.cos(2 * np.pi * data_fe['hour'] / 24)

data_fe['month_sin'] = np.sin(2 * np.pi * data_fe['month'] / 12)
data_fe['month_cos'] = np.cos(2 * np.pi * data_fe['month'] / 12)


# ===============================
# 2. LAG FEATURES (Power)
# ===============================
for lag in [1, 2, 3, 6, 12, 24]:
    data_fe[f'power_lag_{lag}'] = data_fe['power'].shift(lag)


# ===============================
# 3. ROLLING STATISTICS
# ===============================
data_fe['rolling_mean_6'] = data_fe['power'].rolling(window=6).mean()
data_fe['rolling_std_6'] = data_fe['power'].rolling(window=6).std()

data_fe['rolling_mean_12'] = data_fe['power'].rolling(window=12).mean()
data_fe['rolling_std_12'] = data_fe['power'].rolling(window=12).std()


# ===============================
# 4. PHYSICS-INSPIRED FEATURE
# ===============================
data_fe['wind_speed_cubed'] = data_fe['wind_speed'] ** 3
data_fe['wind_speed_nwp_cubed'] = data_fe['wind_speed_nwp'] ** 3


# ===============================
# 5. REGIME CLASSIFICATION
# ===============================
def classify_regime(ws):
    if ws < 3:
        return 0
    elif ws < 12:
        return 1
    else:
        return 2

data_fe['regime'] = data_fe['wind_speed'].apply(classify_regime)


# ===============================
# 6. DROP NA FROM LAGS
# ===============================
data_fe = data_fe.dropna().reset_index(drop=True)

print("Final shape after feature engineering:", data_fe.shape)

Final shape after feature engineering: (8760, 30)


In [4]:
# ==========================================
# TRAIN-VAL-TEST SPLIT (TIME-BASED)
# ==========================================
train_size = int(len(data_fe) * 0.7)
val_size = int(len(data_fe) * 0.15)

train = data_fe[:train_size]
val = data_fe[train_size:train_size + val_size]
test = data_fe[train_size + val_size:]

features = data_fe.drop(['timestamp','power'], axis=1).columns

X_train = train[features]
y_train = train['power']

X_val = val[features]
y_val = val['power']

X_test = test[features]
y_test = test['power']


# ==========================================
# SCALING (FIT ONLY TRAIN)
# ==========================================
scaler = MinMaxScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)


# ==========================================
# OPTUNA OBJECTIVE FUNCTION
# ==========================================
def objective(trial):

    params = {
        'n_estimators': trial.suggest_int('n_estimators', 400, 1500),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5),
        'random_state': 42,
        'n_jobs': -1
    }

    model = XGBRegressor(**params)
    model.fit(X_train_scaled, y_train)

    pred_val = model.predict(X_val_scaled)
    rmse = np.sqrt(mean_squared_error(y_val, pred_val))

    return rmse


# ==========================================
# RUN OPTUNA
# ==========================================
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

print("\nBest Parameters Found:")
print(study.best_params)


# ==========================================
# TRAIN BEST MODEL
# ==========================================
best_model = XGBRegressor(**study.best_params)
best_model.fit(X_train_scaled, y_train)

pred_test = best_model.predict(X_test_scaled)


# ==========================================
# FINAL EVALUATION
# ==========================================
rmse = np.sqrt(mean_squared_error(y_test, pred_test))
mae = mean_absolute_error(y_test, pred_test)
r2 = r2_score(y_test, pred_test)

# FIXED MAPE (avoid infinity)
mape = np.mean(
    np.abs((y_test - pred_test) / (y_test + 1e-6))
) * 100

print("\n===== OPTIMIZED MODEL RESULTS =====")
print("RMSE:", round(rmse, 4))
print("MAE:", round(mae, 4))
print("R2:", round(r2, 4))
print("MAPE:", round(mape, 4), "%")


[I 2026-02-18 09:32:53,536] A new study created in memory with name: no-name-0aaed04c-1228-4f0a-ac1c-93b95491ba90
[I 2026-02-18 09:33:03,161] Trial 0 finished with value: 101.21019592072858 and parameters: {'n_estimators': 982, 'max_depth': 8, 'learning_rate': 0.13526357984331822, 'subsample': 0.8825038604750608, 'colsample_bytree': 0.8017532402613458, 'gamma': 1.2585699302051, 'min_child_weight': 3, 'reg_alpha': 4.821049298639872, 'reg_lambda': 3.4082571838879163}. Best is trial 0 with value: 101.21019592072858.
[I 2026-02-18 09:33:20,945] Trial 1 finished with value: 100.56818997223601 and parameters: {'n_estimators': 925, 'max_depth': 9, 'learning_rate': 0.08347054969803262, 'subsample': 0.714471671725302, 'colsample_bytree': 0.9186321983317539, 'gamma': 3.669511330194588, 'min_child_weight': 3, 'reg_alpha': 3.8033818658297083, 'reg_lambda': 4.539059185623725}. Best is trial 1 with value: 100.56818997223601.
[I 2026-02-18 09:33:29,103] Trial 2 finished with value: 97.89230203386019 


Best Parameters Found:
{'n_estimators': 795, 'max_depth': 7, 'learning_rate': 0.031565753644205455, 'subsample': 0.7062412247254835, 'colsample_bytree': 0.8505956079877246, 'gamma': 3.956503363010931, 'min_child_weight': 5, 'reg_alpha': 3.6963417390675883, 'reg_lambda': 0.26852615968053095}

===== OPTIMIZED MODEL RESULTS =====
RMSE: 148.6898
MAE: 92.1915
R2: 0.9261
MAPE: 399248448.549 %
