# Comparatif de Modèles - Prédiction Prix Électricité (DK1)
*Benchmark : Linear Regression vs Random Forest vs XGBoost vs LightGBM*

### 1. Préparation des Données (Standard)

In [2]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
import warnings

# Import des Modèles
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import lightgbm as lgb

warnings.filterwarnings('ignore')

# --- Chargement et Nettoyage (Idem Notebook précédent) ---
url = "https://data.open-power-system-data.org/time_series/latest/time_series_60min_singleindex.csv"
destination = "../data/raw/time_series_60min.csv"
# On suppose le fichier déjà là, sinon il faut le télécharger (voir notebook précédent)
df = pd.read_csv(destination, parse_dates=['utc_timestamp'], low_memory=False).set_index('utc_timestamp')

cols_mapping = {
    'DK_1_price_day_ahead': 'price',
    'DK_1_load_forecast_entsoe_transparency': 'load_forecast',
    'DK_1_wind_generation_actual': 'wind_generation', 
    'DK_1_solar_generation_actual': 'solar_generation'
}
df_dk = df[list(cols_mapping.keys())].rename(columns=cols_mapping)
df_dk = df_dk.loc['2018-01-01':'2019-12-31'].interpolate(method='linear').dropna()

# Feature Engineering
df_dk['hour'] = df_dk.index.hour
df_dk['day_of_week'] = df_dk.index.dayofweek
df_dk['month'] = df_dk.index.month
df_dk['price_lag_24h'] = df_dk['price'].shift(24)
df_dk['wind_lag_24h'] = df_dk['wind_generation'].shift(24)
df_dk = df_dk.dropna()

print("Données prêtes!")

Données prêtes!


### 2. Configuration du Benchmark

In [3]:
# Définition des Features et Target
features = ['hour', 'day_of_week', 'month', 'price_lag_24h', 'load_forecast', 'wind_lag_24h', 'solar_generation']
target = 'price'

# Split Train/Test (Test sur les 3 derniers mois)
split_date = '2019-10-01'
X_train = df_dk.loc[df_dk.index < split_date, features]
y_train = df_dk.loc[df_dk.index < split_date, target]
X_test = df_dk.loc[df_dk.index >= split_date, features]
y_test = df_dk.loc[df_dk.index >= split_date, target]

# Dictionnaire des modèles à tester
models = {
    "Linear Regression (Ridge)": Ridge(alpha=1.0),
    "Random Forest": RandomForestRegressor(n_estimators=100, max_depth=15, n_jobs=-1, random_state=42),
    "XGBoost": xgb.XGBRegressor(n_estimators=500, learning_rate=0.05, n_jobs=-1, random_state=42),
    "LightGBM": lgb.LGBMRegressor(n_estimators=500, learning_rate=0.05, n_jobs=-1, random_state=42, verbose=-1)
}

### 3. Entraînement et Évaluation

In [4]:
results = []
predictions = pd.DataFrame(index=X_test.index)
predictions['Réel'] = y_test

print("Début de l'entraînement...")
for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    predictions[name] = pred
    
    # Calcul Métriques
    rmse = np.sqrt(mean_squared_error(y_test, pred))
    mae = mean_absolute_error(y_test, pred)
    results.append({'Modèle': name, 'RMSE': rmse, 'MAE': mae})

df_results = pd.DataFrame(results).sort_values('RMSE')
print("\n--- Résultats ---")
display(df_results)

Début de l'entraînement...
Training Linear Regression (Ridge)...
Training Random Forest...
Training XGBoost...
Training LightGBM...

--- Résultats ---


Unnamed: 0,Modèle,RMSE,MAE
3,LightGBM,10.785322,8.066633
2,XGBoost,10.7975,8.123153
1,Random Forest,11.054627,8.267526
0,Linear Regression (Ridge),11.082908,8.468015


### 4. Visualisation Comparative

In [5]:
# 4.1 Comparaison des Erreurs (RMSE)
fig = px.bar(
    df_results, 
    x='RMSE', 
    y='Modèle', 
    orientation='h', 
    title="Benchmark : Qui est le meilleur ? (RMSE le plus bas)",
    color='RMSE',
    color_continuous_scale='Reds',
    text_auto='.2f',
    template="plotly_white"
)
fig.show()

# 4.2 Comparaison visuelle des prédictions (Zoom sur une semaine)
zoom_start = '2019-11-04'
zoom_end = '2019-11-11'
df_zoom = predictions.loc[zoom_start:zoom_end]

fig = px.line(
    df_zoom, 
    title=f"Comparaison des Prédictions : Semaine du {zoom_start}",
    labels={"value": "Prix (€/MWh)", "index": "Date"},
    template="plotly_white"
)
fig.show()

### 5. Analyse des Résidus (Erreurs)
*Un bon modèle doit avoir des erreurs centrées sur 0 (Distribution Normale)*

In [6]:
from plotly.subplots import make_subplots

fig = go.Figure()

for name in models.keys():
    residuals = predictions['Réel'] - predictions[name]
    fig.add_trace(go.Histogram(x=residuals, name=name, opacity=0.5, nbinsx=50))

fig.update_layout(barmode='overlay', title="Distribution des Erreurs (Résidus)", template="plotly_white")
fig.update_traces(opacity=0.60)
fig.show()