In [None]:
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor

In [None]:
# Load the preprocessed dataset; 'date_heure' is parsed as datetimes and used as the index.
df = pd.read_csv(
    "../data/df_preprocessed.csv",
    index_col='date_heure',
    parse_dates=['date_heure'],
)

# Quick sanity check: first and last timestamps, then a preview of the first rows.
print(df.index.min(), df.index.max())
df.head()


In [None]:
# Aggregate to one row per calendar day by summing the 'conso_elec_MW' readings.
# The readings are described as 30-minute steps, so the sum is a relative
# daily-energy proxy rather than a true MWh figure.
# min_count=1 keeps a day with no valid reading as NaN instead of a misleading 0;
# such days are then removed by the dropna() applied when the modeling table is built.
df_daily = (
    df['conso_elec_MW']
    .resample('D')
    .sum(min_count=1)
    .to_frame(name='conso_daily')
)

# Diagnostics: number of days, how many of them carry no data, and the daily series.
print("Daily length:", len(df_daily))
print("Days without data:", int(df_daily['conso_daily'].isna().sum()))
df_daily.plot(figsize=(12,4), title='Daily consumption (sum of 30min steps)')
plt.show()


In [None]:
H = 30  # forecast horizon, in days

# One target column per horizon h: the daily consumption observed h days later.
for h in range(1, H + 1):
    df_daily[f'target_j+{h}'] = df_daily['conso_daily'].shift(periods=-h)

# Preview: shape once the rows that have no target at all are dropped.
# Note: the actual row filtering happens later, after the lag features are created.
df_daily.dropna(
    subset=[f'target_j+{h}' for h in range(1, H + 1)],
    how='all',
).shape


In [None]:
# Daily lags used as features. The 365-day lag is only meaningful when the
# history spans several years.
lags_days = [1, 2, 3, 7, 14, 30, 365]
for lag in lags_days:
    df_daily[f'lag_{lag}'] = df_daily['conso_daily'].shift(periods=lag)

# Calendar features derived from the datetime index.
df_daily['dayofweek'] = df_daily.index.dayofweek
df_daily['month'] = df_daily.index.month
# Saturday (5) and Sunday (6) are flagged as weekend.
df_daily['is_weekend'] = (df_daily['dayofweek'] >= 5).astype(int)

# Keep only the rows where every lag and every target is available
# (the shifts leave NaN at both ends of the series).
cols_features = [f'lag_{lag}' for lag in lags_days] + ['dayofweek','month','is_weekend']
cols_target = [f'target_j+{h}' for h in range(1,H+1)]
df_model = df_daily.loc[:, cols_features + cols_target].dropna()

print("Final dataset for modeling:", df_model.shape)
df_model.head()


In [None]:
# Chronological 80/10/10 split (rows are taken in order, no shuffling).
# Alternative: split on calendar dates (e.g. train <= 2022, val = 2023, test >= 2024).
X, Y = df_model[cols_features], df_model[cols_target]

n = len(df_model)
train_end, val_end = int(n * 0.8), int(n * 0.9)

# Features for the three consecutive windows, then the matching targets.
X_train, X_val, X_test = X.iloc[:train_end], X.iloc[train_end:val_end], X.iloc[val_end:]
Y_train, Y_val, Y_test = Y.iloc[:train_end], Y.iloc[train_end:val_end], Y.iloc[val_end:]

print("Train,Val,Test sizes:", X_train.shape, X_val.shape, X_test.shape)


In [None]:
# Base estimator: a gradient-boosted regressor that the wrapper below clones per horizon.
# n_jobs=1 here because parallelism is handled by MultiOutputRegressor; requesting
# "all cores" at both levels makes every worker start its own full thread pool,
# and the resulting contention slows training down.
base = XGBRegressor(
    n_estimators=500,
    learning_rate=0.03,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=1,
    verbosity=0
)

# Multi-output wrapper: one regressor is fitted per target column, in parallel.
multi_xgb = MultiOutputRegressor(base, n_jobs=-1)

# Fit on the training window only (the final model could be refit on train + val).
multi_xgb.fit(X_train, Y_train)

# Persist the fitted model. The target directory is created first so that
# joblib.dump does not fail when "model/" does not exist yet.
model_path = "model/xgb_multi_30d.pkl"
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(multi_xgb, model_path)
print(f"Model saved: {model_path}")


In [None]:
# Predict every horizon for the test window; predict() returns an array of
# shape (n_test, H), re-wrapped with the same index/columns as Y_test.
Y_pred = pd.DataFrame(
    multi_xgb.predict(X_test),
    index=Y_test.index,
    columns=Y_test.columns,
)

# RMSE per horizon. The square root is taken explicitly because the `squared=False`
# argument of mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6;
# np.sqrt(MSE) gives the same value on every version.
rmse_per_horizon = {
    h_col: float(np.sqrt(mean_squared_error(Y_test[h_col], Y_pred[h_col])))
    for h_col in Y_test.columns
}

rmse_series = pd.Series(rmse_per_horizon)
print("RMSE per horizon (first few):")
print(rmse_series.head(10))

# Plot RMSE against the horizon (x axis: 1..H days ahead).
plt.figure(figsize=(9,4))
plt.plot(range(1,H+1), rmse_series.values, marker='o')
plt.xlabel("Horizon (days ahead)")
plt.ylabel("RMSE (daily consumption)")
plt.title("RMSE by horizon (1..30 days)")
plt.grid()
plt.show()


In [None]:
to_plot = [1, 7, 30]  # horizons (days ahead) to inspect

# One stacked panel per horizon: observed series vs. model forecast on the test window.
fig, axes = plt.subplots(len(to_plot), 1, figsize=(15, 4*len(to_plot)), squeeze=False)

for ax, h in zip(axes[:, 0], to_plot):
    col = f'target_j+{h}'
    ax.plot(Y_test.index, Y_test[col], label=f"True J+{h}", alpha=0.8)
    ax.plot(Y_pred.index, Y_pred[col], label=f"Pred J+{h}", alpha=0.8, linestyle='--')
    ax.set_title(f"True vs Predicted for horizon J+{h}")
    ax.legend()
    ax.grid()

fig.tight_layout()
plt.show()
