In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('merge_train.csv',parse_dates=['date'])

In [3]:
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df['weekdays'] = df['date'].dt.dayofweek  # 1 for weekend, 0 for weekday
df['hour'] = df['date'].dt.hour
df['minute'] = df['date'].dt.minute
df['EMA_4h'] = df['Price'].ewm(span=48, adjust=False).mean()


df['EMA_1d'] = df['Price'].ewm(span=288, adjust=False).mean()
df['Forecast_difference_0.5 hour future'] = df['Forecast_Roof_0.5 hour future'] - df['Forecast_Demand_0.5 hour future']
df["Forecast_ratio_0.5 hour future"] = df['Forecast_Roof_0.5 hour future'] / df['Forecast_Demand_0.5 hour future']

In [4]:
Features = [feature for feature in df.columns if feature != 'Price' and feature != "Region" and feature != "Actual_Roof_OPOWER" and feature != "Actual_Roof_LASTCHANGED" and feature != "date"]
Features


['Demand',
 'Forecast_Demand_0.5 hour future',
 'Forecast_Demand_1 hour future',
 'Forecast_Demand_1.5 hours future',
 'Forecast_Demand_2 hours future',
 'Forecast_Demand_2.5 hours future',
 'Forecast_Demand_3 hours future',
 'Forecast_Demand_3.5 hours future',
 'Forecast_Demand_4 hours future',
 'Forecast_Demand_4.5 hours future',
 'Forecast_Demand_5 hours future',
 'Forecast_Demand_5.5 hours future',
 'Forecast_Demand_6 hours future',
 'Forecast_Demand_6.5 hours future',
 'Forecast_Demand_7 hours future',
 'Forecast_Demand_7.5 hours future',
 'Forecast_Demand_8 hours future',
 'Forecast_Demand_8.5 hours future',
 'Forecast_Demand_9 hours future',
 'Forecast_Demand_9.5 hours future',
 'Forecast_Roof_0.5 hour future',
 'Forecast_Roof_1 hour future',
 'Forecast_Roof_1.5 hours future',
 'Forecast_Roof_2 hours future',
 'Forecast_Roof_2.5 hours future',
 'Forecast_Roof_3 hours future',
 'Forecast_Roof_3.5 hours future',
 'Forecast_Roof_4 hours future',
 'Forecast_Roof_4.5 hours future',
 

In [5]:
for feature in Features:
    if feature != 'Price':  
        df[f'{feature}_shift_1'] = df[feature].shift(1)
        df[f'{feature}_shift_6'] = df[feature].shift(6)
df = df.dropna()

  df[f'{feature}_shift_1'] = df[feature].shift(1)
  df[f'{feature}_shift_6'] = df[feature].shift(6)
  df[f'{feature}_shift_1'] = df[feature].shift(1)
  df[f'{feature}_shift_6'] = df[feature].shift(6)
  df[f'{feature}_shift_1'] = df[feature].shift(1)
  df[f'{feature}_shift_6'] = df[feature].shift(6)
  df[f'{feature}_shift_1'] = df[feature].shift(1)
  df[f'{feature}_shift_6'] = df[feature].shift(6)
  df[f'{feature}_shift_1'] = df[feature].shift(1)
  df[f'{feature}_shift_6'] = df[feature].shift(6)
  df[f'{feature}_shift_1'] = df[feature].shift(1)
  df[f'{feature}_shift_6'] = df[feature].shift(6)


In [6]:
df.set_index('date', inplace=True)
X = df[Features + [f'{feature}_shift_1' for feature in Features] +
                  [f'{feature}_shift_6' for feature in Features]]
y = df['Price']

split_date = pd.to_datetime("2024-06-01 00:00:00")
X_train = X[X.index < split_date]
X_test = X[X.index >= split_date]
y_train = y[y.index < split_date]
y_test = y[y.index >= split_date]

In [7]:
def tolerance_success_rate(y_true, y_pred, tolerance=0.1):
    success = np.abs(y_pred - y_true) <= y_true * tolerance
    return np.mean(success)

def custom_eval_metric(y_pred, dtrain):
    y_true = dtrain.get_label()
    success_rate = tolerance_success_rate(y_true, y_pred)
    return 'tolerance_success_rate', success_rate

In [8]:
params = {
    'objective': 'reg:squarederror',
    'eval_metric': ['rmse', tolerance_success_rate],  # 使用多個評估指標
    'max_depth': 6,
    'learning_rate': 0.1,
    'n_estimators': 100,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
}

In [9]:
from xgboost import XGBRegressor
model = XGBRegressor(
    objective='reg:squarederror',
    max_depth=6,
    learning_rate=0.1,
    n_estimators=100,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric=['rmse', custom_eval_metric]
)

# 訓練模型
model.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    verbose=10  # 每10輪打印一次評估結果
)

# 進行預測
y_pred = model.predict(X_test)

XGBoostError: [12:31:04] C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0750514818a16474a-1\xgboost\xgboost-ci-windows\src\metric\metric.cc:49: Unknown metric function <function custom_eval_metric at 0x000001D383D4F2E0>

In [None]:
final_success_rate = tolerance_success_rate(y_test, y_pred)
print(f"Final 10% Tolerance Success Rate: {final_success_rate:.4f}")

# 計算 RMSE
rmse = np.sqrt(((y_test - y_pred) ** 2).mean())
print(f"RMSE: {rmse:.4f}")

# 顯示特徵重要性
importance = model.feature_importances_
feature_names = X.columns
feature_importance = sorted(zip(feature_names, importance), key=lambda x: x[1], reverse=True)
print("\nFeature Importance:")
for feature, score in feature_importance[:10]:  # 顯示前10個重要特徵
    print(f"{feature}: {score:.4f}")

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 6))
plt.plot(y_test.index, y_test.values, label='Actual')
plt.plot(y_test.index, y_pred, label='Predicted')
plt.title('XGBoost: Actual vs Predicted Prices')
plt.xlabel('Date')
plt.ylabel('Price')
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# 計算每小時的成功率
df_results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
df_results['Hour'] = df_results.index.hour
df_results['Is_Successful'] = np.abs(df_results['Predicted'] - df_results['Actual']) <= df_results['Actual'] * 0.1

hourly_success_rate = df_results.groupby('Hour')['Is_Successful'].mean().sort_values(ascending=False)

# 繪製每小時成功率的長條圖
plt.figure(figsize=(12, 6))
hourly_success_rate.plot(kind='bar')
plt.title('Prediction Success Rate by Hour (10% Tolerance)')
plt.xlabel('Hour of Day')
plt.ylabel('Success Rate')
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()