In [1]:
# =====================================================
# 1. IMPORT LIBRARIES
# =====================================================

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

from xgboost import XGBRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit

from statsmodels.tsa.arima.model import ARIMA

In [2]:
# =====================================================
# 2. LOAD DATA
# =====================================================

historical = pd.read_csv("/content/drive/MyDrive/Skripsi/Dataset/Wind Power/wind farm historical data.csv")
nwp = pd.read_csv("/content/drive/MyDrive/Skripsi/Dataset/Wind Power/NWP.csv")

historical.columns = historical.columns.str.strip()
nwp.columns = nwp.columns.str.strip()

historical['Date'] = pd.to_datetime(historical['Date'], dayfirst=True)
nwp['time'] = pd.to_datetime(nwp['time'])

historical = historical.rename(columns={
    'Date':'timestamp',
    'Speed':'wind_speed',
    'Direction':'wind_direction',
    'Energy':'power'
})

nwp = nwp.rename(columns={
    'time':'timestamp',
    'mod':'wind_speed_nwp',
    'dir':'wind_dir_nwp',
    'temp':'temperature_nwp',
    'rh':'humidity_nwp',
    'mslp':'pressure_nwp'
})

data = pd.merge(historical, nwp, on='timestamp', how='inner')
data = data.sort_values('timestamp').reset_index(drop=True)
data = data.ffill().dropna()

print("Merged shape:", data.shape)

Merged shape: (8784, 9)


In [3]:
# =====================================================
# 3. FEATURE ENGINEERING
# =====================================================

data_fe = data.copy()

# Time features
data_fe['hour'] = data_fe['timestamp'].dt.hour
data_fe['month'] = data_fe['timestamp'].dt.month
data_fe['year'] = data_fe['timestamp'].dt.year

data_fe['hour_sin'] = np.sin(2*np.pi*data_fe['hour']/24)
data_fe['hour_cos'] = np.cos(2*np.pi*data_fe['hour']/24)

# Lag features
for lag in [1,2,3,6,12,24]:
    data_fe[f'power_lag_{lag}'] = data_fe['power'].shift(lag)

# Rolling features
data_fe['rolling_mean_6'] = data_fe['power'].rolling(6).mean()
data_fe['rolling_std_6'] = data_fe['power'].rolling(6).std()

# ----------------------------
# Physics-informed features
# ----------------------------

data_fe['temp_kelvin'] = data_fe['temperature_nwp'] + 273.15
data_fe['pressure_pascal'] = data_fe['pressure_nwp'] * 100

R_specific = 287.058
data_fe['air_density'] = data_fe['pressure_pascal'] / (R_specific * data_fe['temp_kelvin'])

data_fe['physics_theoretical_power'] = 0.5 * data_fe['air_density'] * (data_fe['wind_speed_nwp']**3)

data_fe = data_fe.dropna().reset_index(drop=True)

print("After feature engineering:", data_fe.shape)

After feature engineering: (8760, 26)


In [4]:
# =====================================================
# 4. XGBOOST MODEL
# =====================================================

test_months = [10,11,12]
results_xgb = []

features = data_fe.drop(['timestamp','power','year','month'], axis=1).columns

param_dist = {
    'n_estimators': [400,600,800],
    'max_depth': [3,4,5,6],
    'learning_rate': [0.01,0.03,0.05],
    'subsample': [0.7,0.8,0.9],
    'colsample_bytree': [0.7,0.8,0.9],
    'gamma': [0,0.1],
    'reg_alpha': [0,0.01],
    'reg_lambda': [1,1.5]
}

for month in test_months:

    train_data = data_fe[data_fe['month'] < month]
    test_data = data_fe[data_fe['month'] == month]

    if len(train_data)==0 or len(test_data)==0:
        continue

    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(train_data[features])
    X_test = scaler.transform(test_data[features])

    y_train = train_data['power']
    y_test = test_data['power']

    tscv = TimeSeriesSplit(n_splits=3)

    model = XGBRegressor(objective='reg:squarederror', random_state=42)

    random_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_dist,
        n_iter=20,
        cv=tscv,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1,
        random_state=42
    )

    random_search.fit(X_train, y_train)
    best_model = random_search.best_estimator_

    preds = best_model.predict(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, preds))
    mae = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)

    results_xgb.append({
        'Month': month,
        'Model':'XGBoost',
        'RMSE': rmse,
        'MAE': mae,
        'R2': r2
    })

xgb_df = pd.DataFrame(results_xgb)
print("\nXGBoost Summary")
print(xgb_df)


XGBoost Summary
   Month    Model        RMSE        MAE        R2
0     10  XGBoost   94.898020  61.236624  0.929412
1     11  XGBoost  145.313023  93.469178  0.944743
2     12  XGBoost  131.708492  84.469256  0.894025


In [5]:
# =====================================================
# 5. ARIMA BASELINE
# =====================================================

results_arima = []

for month in test_months:

    train_data = data_fe[data_fe['month'] < month]
    test_data = data_fe[data_fe['month'] == month]

    if len(train_data)==0 or len(test_data)==0:
        continue

    y_train = train_data['power']
    y_test = test_data['power']

    model = ARIMA(y_train, order=(2,1,2))
    model_fit = model.fit()

    preds = model_fit.forecast(steps=len(y_test))

    rmse = np.sqrt(mean_squared_error(y_test, preds))
    mae = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)

    results_arima.append({
        'Month': month,
        'Model':'ARIMA',
        'RMSE': rmse,
        'MAE': mae,
        'R2': r2
    })

arima_df = pd.DataFrame(results_arima)
print("\nARIMA Summary")
print(arima_df)

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)



ARIMA Summary
   Month  Model        RMSE         MAE        R2
0     10  ARIMA  437.143548  269.511017 -0.497829
1     11  ARIMA  745.747121  471.473560 -0.455331
2     12  ARIMA  405.331130  323.005754 -0.003681


  return get_prediction_index(


In [6]:
# =====================================================
# 6. FINAL COMPARISON
# =====================================================

final_results = pd.concat([xgb_df, arima_df])
print("\nFinal Comparison")
print(final_results)

print("\nAverage Performance")
print(final_results.groupby('Model')[['RMSE','MAE','R2']].mean())


Final Comparison
   Month    Model        RMSE         MAE        R2
0     10  XGBoost   94.898020   61.236624  0.929412
1     11  XGBoost  145.313023   93.469178  0.944743
2     12  XGBoost  131.708492   84.469256  0.894025
0     10    ARIMA  437.143548  269.511017 -0.497829
1     11    ARIMA  745.747121  471.473560 -0.455331
2     12    ARIMA  405.331130  323.005754 -0.003681

Average Performance
               RMSE         MAE        R2
Model                                    
ARIMA    529.407266  354.663444 -0.318947
XGBoost  123.973178   79.725019  0.922727
