# ML pour la France

In [79]:
import urllib.request
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
from statsmodels.tsa.seasonal import STL
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, root_mean_squared_error,mean_squared_error, r2_score,mean_absolute_percentage_error
import optuna
import shap
import seaborn as sns
import skimpy as sk
import summarytools as st
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore', FutureWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=optuna.exceptions.ExperimentalWarning)
import plotly.io as pio
import calendar
pio.templates.default = "plotly_white"

print("Environnement configuré avec succès!")
print(f"Date d'exécution: {datetime.now().strftime('%Y-%m-%d %H:%M')}")

Environnement configuré avec succès!
Date d'exécution: 2025-12-09 12:13


## Analyse des Features Créées

In [8]:
df_featured = pd.read_csv('../data/processed/df_featured_france.csv',
                        parse_dates=['utc_timestamp'],
                        low_memory=False)
df_featured.head()

Unnamed: 0,utc_timestamp,load,load_forecast,solar,wind,price_day_ahead,temperature,cloud_cover,nuclear,wind_speed,...,price_rolling_std_6h,load_rolling_mean_6h,price_rolling_mean_24h,price_rolling_std_24h,load_rolling_mean_24h,renewable_generation,total_generation,price_delta,load_x_hour,temp_x_cloud
0,2015-01-06 00:00:00+00:00,67795.0,67900.0,0.0,739.0,45.21,3.257507,0.395513,58930.0,2.012984,...,4.663818,71753.333333,52.661667,10.753981,71452.291667,739.0,59669.0,-4.57,0.0,1.288386
1,2015-01-06 01:00:00+00:00,66811.0,67450.0,0.0,736.0,38.8,3.114288,0.422361,58885.0,1.987322,...,3.420566,70178.166667,52.91625,10.488004,71726.958333,736.0,59621.0,-6.41,668.11,1.315353
2,2015-01-06 02:00:00+00:00,64040.0,64850.0,0.0,753.0,35.0,2.995453,0.413952,58497.0,1.989497,...,4.798436,68875.166667,53.065833,10.247332,71998.0,753.0,59250.0,-3.8,1280.8,1.239975
3,2015-01-06 03:00:00+00:00,63275.0,63650.0,0.0,754.0,33.43,2.959351,0.433965,57890.0,1.973881,...,6.751242,67421.5,53.172917,10.033511,72264.0,754.0,58644.0,-1.57,1898.25,1.284255
4,2015-01-06 04:00:00+00:00,65977.0,65300.0,0.0,823.0,37.58,2.842896,0.459675,58227.0,2.030342,...,7.54015,66305.5,53.184583,10.009351,72520.416667,823.0,59050.0,4.15,2639.08,1.306807


In [28]:
# Calculer les corrélations
numeric_cols = df_featured.select_dtypes(include=[np.number]).columns.tolist()
feature_corr = df_featured[numeric_cols].corr()['price_day_ahead'].drop('price_day_ahead').sort_values(ascending=False)


In [34]:
# HEATMAP CORRÉLATIONS

corr_matrix = df_featured[numeric_cols].corr()

fig = go.Figure(data=go.Heatmap(
    z=corr_matrix.values,
    x=corr_matrix.columns,
    y=corr_matrix.columns,
    colorscale='RdBu',
    zmid=0,
    text=np.round(corr_matrix.values, 3),
    texttemplate='%{text:.2f}',
    textfont={"size": 9},
    colorbar=dict(title="Corrélation")
))

fig.update_layout(
    title="<b>Matrice de Corrélation - Top 15 Features + Target</b>",
    height=1500,
    width=1500
)
fig.show()


In [29]:
# Visualisation de TOUTES les corrélations
fig = go.Figure()

fig.add_trace(go.Bar(
    x=feature_corr.values,
    y=feature_corr.index,
    orientation='h',
    marker=dict(
        color=['green' if x > 0 else 'red' for x in feature_corr.values]
    ),
    text=[f"{x:.3f}" for x in feature_corr.values],
    textposition='auto',
    hovertemplate='<b>%{y}</b><br>Corrélation: %{x:.4f}<extra></extra>'
))

fig.add_vline(x=0, line_dash="dash", line_color="black")

fig.update_layout(
    title="<b>Toutes les Corrélations - Features vs Prix</b>",
    xaxis_title="Corrélation",
    template="plotly_white",
    height=800,
    margin=dict(l=300)
)
fig.show()


## PRÉPARATION DES DONNÉES POUR ML

In [66]:
# Séparer features et target
X = df_featured.drop('price_day_ahead', axis=1)
y = df_featured['price_day_ahead']


* Encodage des données

In [67]:
# Encodage de la saison
X_encoded = X.copy()
season_encoding = {'Winter': 0, 'Spring': 1, 'Summer': 2, 'Fall': 3}
X_encoded['season'] = X_encoded['season'].map(season_encoding)

# Encodage de la semaine
X_encoded['week'] = X_encoded['utc_timestamp'].dt.isocalendar().week

# Encodage du mois
X_encoded['month'] = X_encoded['utc_timestamp'].dt.month

# Encodage du jour de la semaine
X_encoded['dayofweek'] = X_encoded['utc_timestamp'].dt.dayofweek

# Encodage de l'heure
X_encoded['hour'] = X_encoded['utc_timestamp'].dt.hour

# Encodage de la date
X_encoded['date'] = X_encoded['utc_timestamp'].dt.date

* Split train/test

In [68]:

# Split temporel (80/20)
split_idx = int(len(X_encoded) * 0.8)
X_train = X_encoded[:split_idx]
X_test = X_encoded[split_idx:]
y_train = y[:split_idx]
y_test = y[split_idx:]

df_featured['utc_timestamp'] = pd.to_datetime(df_featured['utc_timestamp'])

print(f"Train set : {X_train.shape} observations ({len(X_train)/len(X_encoded)*100:.1f}%)")
print(f"Test set : {X_test.shape} observations ({len(X_test)/len(X_encoded)*100:.1f}%)")
print(f"Features : {X_train.shape[1]}")
print(f"Période train : {df_featured.index[0]} à {df_featured.index[split_idx-1]} -- {date_train.min()} à {date_train.max()}")
print(f"Période test : {df_featured.index[split_idx]} à {df_featured.index[-1]} -- {date_test.min()} à {date_test.max()}")


Train set : (20448, 41) observations (80.0%)
Test set : (5112, 41) observations (20.0%)
Features : 41
Période train : 0 à 20447 -- 2015-01-06 00:00:00+00:00 à 2017-05-06 23:00:00+00:00
Période test : 20448 à 25559 -- 2017-05-07 00:00:00+00:00 à 2017-12-05 23:00:00+00:00


* Normalisation des données

In [61]:
columns_to_drop = ['utc_timestamp', 'date']
X_train_numeric = X_train.drop(columns=[col for col in columns_to_drop if col in X_train.columns])
X_test_numeric = X_test.drop(columns=[col for col in columns_to_drop if col in X_test.columns])

# Normalisation
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_numeric)
X_test_scaled = scaler.transform(X_test_numeric)


## Modélisation

* **LightGBM**

In [105]:
# 3. Entraînement du modèle LightGBM
model = lgb.LGBMRegressor(random_state=42)
print("\nEntraînement du modèle LightGBM...")
model.fit(X_train_scaled, y_train)
print("Entraînement du modèle terminé.")


Entraînement du modèle LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004992 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7822
[LightGBM] [Info] Number of data points in the train set: 20448, number of used features: 39
[LightGBM] [Info] Start training from score 48.717121
Entraînement du modèle terminé.


In [106]:
# 4. Prédictions sur l'ensemble de test
y_pred = model.predict(X_test_scaled)

In [107]:
# 5. Evaluation de la performance
mae = mean_absolute_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
MAPE = mean_absolute_percentage_error(y_test, y_pred)


In [108]:
print("\n--- Performance du modele ---")
print(f"Mean Absolute Error (MAE): {mae:.2f} €/MWh")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f} €/MWh")
print(f"R2 Score: {r2:.2f}")
print(f"Mean Absolute Percentage Error (MAPE): {MAPE:.2f}%")


--- Performance du modele ---
Mean Absolute Error (MAE): 0.41 €/MWh
Root Mean Squared Error (RMSE): 1.21 €/MWh
R2 Score: 0.99
Mean Absolute Percentage Error (MAPE): 0.01%


* **visualisation**

In [109]:
# Récupérer les dates correspondant au test set
test_dates = datetime_cols_test['utc_timestamp'] if datetime_cols_test is not None else y_test.index

# Visualisation des prédictions vs valeurs réelles
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=test_dates,
        y=y_test,
        mode="lines",
        name="Actual Price",
        line=dict(color="royalblue", width=2),
        opacity=0.8,
    )
)
fig.add_trace(
    go.Scatter(
        x=test_dates,
        y=y_pred,
        mode="lines",
        name="Predicted Price",
        line=dict(color="firebrick", width=2, dash="dash"),
    )
)
fig.update_layout(
    title="Actual vs. Predicted Prices (Test Set)",
    xaxis_title="Date",
    yaxis_title="Price (€/MWh)",
    template="plotly_white",
    hovermode="x unified",
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
)

fig.show()

In [110]:

# Erreur d'approximation
errors = y_test.values - y_pred
fig = go.Figure()
fig.add_trace(go.Histogram(x=errors, nbinsx=50, name='Erreur'))
fig.update_layout(
    title="Distribution des Erreurs de Prédiction",
    xaxis_title="Erreur (€/MWh)",
    yaxis_title="Fréquence",
    template="plotly_white"
)
fig.show()


In [114]:
# Sauvegarder
import joblib
joblib.dump(model, 'model_lgbm_france.pkl')
joblib.dump(scaler, 'scaler_france.pkl')
print("\n✓ Modèles sauvegardés")


✓ Modèles sauvegardés


## Analyse SHAP

In [115]:
# Créer l'explainer
explainer = shap.TreeExplainer(model)


In [123]:

# Calculer les valeurs SHAP 
shap_values = explainer.shap_values(X_test_scaled)
print(f"SHAP values shape : {shap_values.shape}")
print(f"X_test shape : {X_test_scaled.shape}")


SHAP values shape : (5112, 39)
X_test shape : (5112, 39)


In [129]:
# --- Visualisation SHAP (barres) ---
if isinstance(shap_values, list):
    shap_array = shap_values[0]
else:
    shap_array = shap_values

# Récupérer les noms des features (colonnes de X_test_numeric)
feature_names = X_test_numeric.columns.tolist()

shap_importance = (
    pd.DataFrame({
        "feature": feature_names,
        "mean_abs_shap": np.abs(shap_array).mean(axis=0),
    })
    .sort_values("mean_abs_shap", ascending=True)
    .reset_index(drop=True)
)

fig = px.bar(
    shap_importance,
    x="mean_abs_shap",
    y="feature",
    orientation="h",
    title="Importance globale des caractéristiques (|SHAP| moyen)",
    labels={"mean_abs_shap": "|SHAP| moyen", "feature": "Caractéristique"},
    template="plotly_white",
)
fig.update_layout(margin=dict(l=120, r=40, t=60, b=40))
fig.show()

In [138]:
# Summary plot (beeswarm) avec Plotly
# Préparer les données
feature_names = X_test_numeric.columns.tolist()
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=feature_names)

# Calculer l'importance moyenne pour trier les features
mean_abs_shap = np.abs(shap_array).mean(axis=0)
feature_importance = pd.DataFrame({
    'feature': feature_names,
    'importance': mean_abs_shap
}).sort_values('importance', ascending=True)

# toutes les features
all_features = feature_importance['feature'].tolist()

# DataFrame pour Plotly
plot_data = []
for i, feat in enumerate(all_features):
    feat_idx = feature_names.index(feat)
    # Ajouter un jitter aléatoire sur l'axe y pour l'effet beeswarm
    n_points = len(shap_array)
    y_jitter = np.random.uniform(-0.3, 0.3, n_points)
    
    plot_data.append(pd.DataFrame({
        'feature': feat,
        'feature_num': i,
        'y_position': i + y_jitter,
        'shap_value': shap_array[:, feat_idx],
        'feature_value': X_test_scaled_df.iloc[:, feat_idx]
    }))

df_plot = pd.concat(plot_data, ignore_index=True)

# graphique
fig = px.scatter(
    df_plot,
    x='shap_value',
    y='y_position',
    color='feature_value',
    title='resume plot beeswarm ',
    labels={
        'shap_value': 'Valeur SHAP (impact sur la prédiction)',
        'feature_value': 'Valeur de la caractéristique'
    },
    color_continuous_scale='RdBu_r',
    height=600,
    width=1000
)

fig.update_traces(
    marker=dict(size=5, opacity=0.6, line=dict(width=0))
)

fig.update_layout(
    template='plotly_white',
    margin=dict(l=150, r=100, t=80, b=60),
    font=dict(size=11),
    yaxis=dict(
        tickmode='array',
        tickvals=list(range(len(all_features))),
        ticktext=all_features,
        title='Caractéristique'
    ),
    coloraxis_colorbar=dict(
        title="Valeur<br>normalisée",
        thickness=15,
        len=0.7
    ),
    showlegend=False
)

fig.show()

In [146]:
# Calculer les indices des top features
mean_abs_shap = np.abs(shap_array).mean(axis=0)
top_indices = np.argsort(mean_abs_shap)[-3:][::-1]  # Top 3 features (ordre décroissant)

sample_size = len(X_test_scaled)  # ou un nombre plus petit si nécessaire

# Scatter plot - Feature vs SHAP value
for idx in top_indices:
    feature = X_test_numeric.columns[idx]
    
    # Préparer les dates pour le hover (si disponibles)
    if hasattr(test_dates, 'strftime'):
        # test_dates est un DatetimeIndex
        hover_text = test_dates[:sample_size].strftime('%Y-%m-%d %H:%M')
    elif isinstance(test_dates, pd.Series) and pd.api.types.is_datetime64_any_dtype(test_dates):
        # test_dates est une Series datetime
        hover_text = test_dates.iloc[:sample_size].dt.strftime('%Y-%m-%d %H:%M')
    else:
        # Pas de dates disponibles, utiliser les indices
        hover_text = [f"Index {i}" for i in range(sample_size)]
    
    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=X_test_numeric.iloc[:sample_size, idx],
        y=shap_array[:sample_size, idx],
        mode='markers',
        marker=dict(
            size=6, 
            color=X_test_numeric.iloc[:sample_size, idx], 
            colorscale='Viridis', 
            showscale=True,
            colorbar=dict(title="Valeur<br>feature")
        ),
        text=hover_text,
        hovertemplate='<b>Date:</b> %{text}<br><b>Valeur:</b> %{x:.2f}<br><b>SHAP:</b> %{y:.2f}<extra></extra>'
    ))
    fig.update_layout(
        title=f'SHAP Dependence Plot - {feature}',
        xaxis_title=f'{feature}',
        yaxis_title='Valeur SHAP (impact sur la prédiction)',
        template="plotly_white",
        height=500,
        width=800
    )
    fig.show()

* AMIRA

In [147]:
# Split temporel 80/20
split_idx = int(len(X) * 0.8)
X_train = X[:split_idx]
X_test = X[split_idx:]
y_train = y[:split_idx]
y_test = y[split_idx:]

print(f"\n{'TRAIN/TEST SPLIT':^80}")
print("="*80)
print(f"Train : {len(X_train)} observations")
print(f"Test  : {len(X_test)} observations")
print(f"Ratio : {len(X_train)/len(X)*100:.1f}% / {len(X_test)/len(X)*100:.1f}%")



                                TRAIN/TEST SPLIT                                
Train : 20448 observations
Test  : 5112 observations
Ratio : 80.0% / 20.0%


In [148]:
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

print("\n" + "="*80)
print("MODÈLE 1 : ARIMA (AutoRegressive Integrated Moving Average)")
print("="*80)

# ARIMA sur la série temporelle du prix
# Paramètres ARIMA : (p, d, q)
# p = ordre AR (lags), d = différenciation, q = ordre MA

try:
    # Fit ARIMA(1, 1, 1) - basique
    print("\nFitting ARIMA(1, 1, 1)...")
    arima_model = ARIMA(y_train, order=(1, 1, 1))
    arima_fit = arima_model.fit()
    
    print(f"\nRésultats ARIMA :")
    print(f"  AIC : {arima_fit.aic:.2f}")
    print(f"  BIC : {arima_fit.bic:.2f}")
    
    # Prédictions sur test set
    y_pred_arima = arima_fit.forecast(steps=len(y_test))
    
    # Métriques
    rmse_arima = np.sqrt(mean_squared_error(y_test, y_pred_arima))
    mae_arima = mean_absolute_error(y_test, y_pred_arima)
    r2_arima = r2_score(y_test, y_pred_arima)
    mape_arima = np.mean(np.abs((y_test - y_pred_arima) / y_test)) * 100
    
    print(f"\n{'ARIMA Performance':^80}")
    print("-"*80)
    print(f"RMSE     : {rmse_arima:.4f} €/MWh")
    print(f"MAE      : {mae_arima:.4f} €/MWh")
    print(f"R² Score : {r2_arima:.4f}")
    print(f"MAPE     : {mape_arima:.2f}%")
    
except Exception as e:
    print(f"⚠️  ARIMA Error : {e}")
    print("Note: ARIMA fonctionne mieux sur séries univariées")
    arima_fit = None
    y_pred_arima = None



MODÈLE 1 : ARIMA (AutoRegressive Integrated Moving Average)

Fitting ARIMA(1, 1, 1)...

Résultats ARIMA :
  AIC : 125308.79
  BIC : 125332.57

                               ARIMA Performance                                
--------------------------------------------------------------------------------
RMSE     : 23.5001 €/MWh
MAE      : 17.5756 €/MWh
R² Score : -1.0796
MAPE     : 29.00%


## Comparaison des modèles

In [150]:
# Comparaison des Modèles : ARIMA vs LightGBM vs Hybrid
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# Charger le dataset
df_featured = pd.read_csv('../data/processed/df_featured_france.csv')
df_featured['utc_timestamp'] = pd.to_datetime(df_featured['utc_timestamp'])

print("="*80)
print("COMPARAISON DES MODÈLES - PRICE FORECASTING")
print("="*80)

# Préparer les données
numeric_cols = df_featured.select_dtypes(include=[np.number]).columns.tolist()
X = df_featured[numeric_cols].drop('price_day_ahead', axis=1)
y = df_featured['price_day_ahead']

print(f"\nDataset : {X.shape[0]} observations, {X.shape[1]} features")
print(f"Target (price_day_ahead) : Min {y.min():.2f}, Max {y.max():.2f}, Mean {y.mean():.2f}")


COMPARAISON DES MODÈLES - PRICE FORECASTING

Dataset : 25560 observations, 36 features
Target (price_day_ahead) : Min 5.00, Max 206.12, Mean 49.60


In [151]:
## PARTIE 2 : SPLIT TEMPOREL

# Split temporel 80/20
split_idx = int(len(X) * 0.8)
X_train = X[:split_idx]
X_test = X[split_idx:]
y_train = y[:split_idx]
y_test = y[split_idx:]

print(f"\n{'TRAIN/TEST SPLIT':^80}")
print("="*80)
print(f"Train : {len(X_train)} observations")
print(f"Test  : {len(X_test)} observations")
print(f"Ratio : {len(X_train)/len(X)*100:.1f}% / {len(X_test)/len(X)*100:.1f}%")



                                TRAIN/TEST SPLIT                                
Train : 20448 observations
Test  : 5112 observations
Ratio : 80.0% / 20.0%


In [152]:

## PARTIE 3 : ARIMA MODEL
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

print("\n" + "="*80)
print("MODÈLE 1 : ARIMA (AutoRegressive Integrated Moving Average)")
print("="*80)



MODÈLE 1 : ARIMA (AutoRegressive Integrated Moving Average)


In [153]:

# ARIMA sur la série temporelle du prix
# Paramètres ARIMA : (p, d, q)
# p = ordre AR (lags), d = différenciation, q = ordre MA

try:
    # Fit ARIMA(1, 1, 1) - basique
    print("\nFitting ARIMA(1, 1, 1)...")
    arima_model = ARIMA(y_train, order=(1, 1, 1))
    arima_fit = arima_model.fit()
    
    print(f"\nRésultats ARIMA :")
    print(f"  AIC : {arima_fit.aic:.2f}")
    print(f"  BIC : {arima_fit.bic:.2f}")
    
    # Prédictions sur test set
    y_pred_arima = arima_fit.forecast(steps=len(y_test))
    
    # Métriques
    rmse_arima = np.sqrt(mean_squared_error(y_test, y_pred_arima))
    mae_arima = mean_absolute_error(y_test, y_pred_arima)
    r2_arima = r2_score(y_test, y_pred_arima)
    mape_arima = np.mean(np.abs((y_test - y_pred_arima) / y_test)) * 100
    
    print(f"\n{'ARIMA Performance':^80}")
    print("-"*80)
    print(f"RMSE     : {rmse_arima:.4f} €/MWh")
    print(f"MAE      : {mae_arima:.4f} €/MWh")
    print(f"R² Score : {r2_arima:.4f}")
    print(f"MAPE     : {mape_arima:.2f}%")
    
except Exception as e:
    print(f"⚠️  ARIMA Error : {e}")
    print("Note: ARIMA fonctionne mieux sur séries univariées")
    arima_fit = None
    y_pred_arima = None



Fitting ARIMA(1, 1, 1)...

Résultats ARIMA :
  AIC : 125308.79
  BIC : 125332.57

                               ARIMA Performance                                
--------------------------------------------------------------------------------
RMSE     : 23.5001 €/MWh
MAE      : 17.5756 €/MWh
R² Score : -1.0796
MAPE     : 29.00%


In [154]:

## PARTIE 4 : LIGHTGBM MODEL (BASELINE)
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler

print("\n" + "="*80)
print("MODÈLE 2 : LightGBM (Gradient Boosting)")
print("="*80)



MODÈLE 2 : LightGBM (Gradient Boosting)


In [None]:

# Normalisation
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# LightGBM
print("\nFitting LightGBM...")
lgbm_model = lgb.LGBMRegressor(
    n_estimators=300,
    learning_rate=0.05,
    num_leaves=31,
    max_depth=7,
    random_state=42,
    verbose=-1,
    n_jobs=-1
)

lgbm_model.fit(
    X_train_scaled, y_train,
    eval_set=[(X_test_scaled, y_test)],
    callbacks=[
        lgb.early_stopping(50),
        lgb.log_evaluation(-1)
    ]
)

# Prédictions
y_pred_lgb = lgbm_model.predict(X_test_scaled)

# Métriques
rmse_lgb = np.sqrt(mean_squared_error(y_test, y_pred_lgb))
mae_lgb = mean_absolute_error(y_test, y_pred_lgb)
r2_lgb = r2_score(y_test, y_pred_lgb)
mape_lgb = np.mean(np.abs((y_test - y_pred_lgb) / y_test)) * 100

print(f"\n{'LightGBM Performance':^80}")
print("-"*80)
print(f"RMSE     : {rmse_lgb:.4f} €/MWh")
print(f"MAE      : {mae_lgb:.4f} €/MWh")
print(f"R² Score : {r2_lgb:.4f}")
print(f"MAPE     : {mape_lgb:.2f}%")




Fitting LightGBM...
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[294]	valid_0's l2: 1.36245

                              LightGBM Performance                              
--------------------------------------------------------------------------------
RMSE     : 1.1672 €/MWh
MAE      : 0.3755 €/MWh
R² Score : 0.9949
MAPE     : 0.59%

MODÈLE 3 : HYBRID (ARIMA Trend + LightGBM Residuals)

1. ARIMA pour la tendance...
2. LightGBM pour les résidus...

                               Hybrid Performance                               
--------------------------------------------------------------------------------
RMSE     : 23.7557 €/MWh
MAE      : 17.5871 €/MWh
R² Score : -1.1250
MAPE     : 29.22%


In [156]:

## PARTIE 5 : MODÈLE HYBRIDE (ARIMA + LightGBM)
print("\n" + "="*80)
print("MODÈLE 3 : HYBRID (ARIMA Trend + LightGBM Residuals)")
print("="*80)

try:
    # Étape 1 : ARIMA captures la tendance globale
    print("\n1. ARIMA pour la tendance...")
    arima_trend = arima_fit.forecast(steps=len(y_test))
    
    # Étape 2 : LightGBM capture les résidus (erreurs)
    print("2. LightGBM pour les résidus...")
    
    # Calculer les résidus sur test set
    residuals = y_test.values - arima_trend.values
    
    # LightGBM pour prédire les résidus
    lgbm_residual = lgb.LGBMRegressor(
        n_estimators=200,
        learning_rate=0.05,
        num_leaves=31,
        random_state=42,
        verbose=-1
    )
    
    lgbm_residual.fit(X_train_scaled, y_train - arima_fit.fittedvalues)
    
    # Prédictions des résidus
    pred_residuals = lgbm_residual.predict(X_test_scaled)
    
    # Combinaison : Tendance ARIMA + Résidus LightGBM
    y_pred_hybrid = arima_trend + pred_residuals
    
    # Métriques Hybrid
    rmse_hybrid = np.sqrt(mean_squared_error(y_test, y_pred_hybrid))
    mae_hybrid = mean_absolute_error(y_test, y_pred_hybrid)
    r2_hybrid = r2_score(y_test, y_pred_hybrid)
    mape_hybrid = np.mean(np.abs((y_test - y_pred_hybrid) / y_test)) * 100
    
    print(f"\n{'Hybrid Performance':^80}")
    print("-"*80)
    print(f"RMSE     : {rmse_hybrid:.4f} €/MWh")
    print(f"MAE      : {mae_hybrid:.4f} €/MWh")
    print(f"R² Score : {r2_hybrid:.4f}")
    print(f"MAPE     : {mape_hybrid:.2f}%")
    
except Exception as e:
    print(f"⚠️  Hybrid Error : {e}")
    y_pred_hybrid = None



MODÈLE 3 : HYBRID (ARIMA Trend + LightGBM Residuals)

1. ARIMA pour la tendance...
2. LightGBM pour les résidus...

                               Hybrid Performance                               
--------------------------------------------------------------------------------
RMSE     : 23.7557 €/MWh
MAE      : 17.5871 €/MWh
R² Score : -1.1250
MAPE     : 29.22%


In [157]:

## PARTIE 6 : COMPARAISON DES MODÈLES
print("\n" + "="*80)
print("COMPARAISON FINALE - RANKING")
print("="*80)

# Créer tableau de comparaison
comparison_data = {
    'Modèle': ['ARIMA', 'LightGBM', 'Hybrid'],
    'RMSE (€/MWh)': [rmse_arima, rmse_lgb, rmse_hybrid],
    'MAE (€/MWh)': [mae_arima, mae_lgb, mae_hybrid],
    'R² Score': [r2_arima, r2_lgb, r2_hybrid],
    'MAPE (%)': [mape_arima, mape_lgb, mape_hybrid]
}

comparison_df = pd.DataFrame(comparison_data)
print("\n" + comparison_df.to_string(index=False))

# Ranking
best_rmse_idx = comparison_df['RMSE (€/MWh)'].idxmin()
best_r2_idx = comparison_df['R² Score'].idxmax()

print(f"\n{'🏆 MEILLEUR MODÈLE':^80}")
print("="*80)
print(f"RMSE  : {comparison_df.loc[best_rmse_idx, 'Modèle']} ({comparison_df.loc[best_rmse_idx, 'RMSE (€/MWh)']:.4f})")
print(f"R²    : {comparison_df.loc[best_r2_idx, 'Modèle']} ({comparison_df.loc[best_r2_idx, 'R² Score']:.4f})")



COMPARAISON FINALE - RANKING

  Modèle  RMSE (€/MWh)  MAE (€/MWh)  R² Score  MAPE (%)
   ARIMA     23.500137    17.575574 -1.079568 29.001954
LightGBM      1.167242     0.375544  0.994870  0.585748
  Hybrid     23.755681    17.587099 -1.125041 29.215279

                               🏆 MEILLEUR MODÈLE                                
RMSE  : LightGBM (1.1672)
R²    : LightGBM (0.9949)


In [161]:
## PARTIE 7 : VISUALISATIONS
# Graphique 1 : Prédictions vs Réalité
fig1 = make_subplots(
    rows=1, cols=3,
    subplot_titles=('ARIMA', 'LightGBM', 'Hybrid'),
    specs=[[{"secondary_y": False}]*3]
)

# ARIMA
fig1.add_trace(go.Scatter(y=y_test.values, name='Réel', line=dict(color='blue', width=2)), row=1, col=1)
fig1.add_trace(go.Scatter(y=y_pred_arima, name='ARIMA', line=dict(color='red', dash='dash', width=2)), row=1, col=1)

# LightGBM
fig1.add_trace(go.Scatter(y=y_test.values, name='Réel', line=dict(color='blue', width=2), showlegend=False), row=1, col=2)
fig1.add_trace(go.Scatter(y=y_pred_lgb, name='LightGBM', line=dict(color='green', dash='dash', width=2)), row=1, col=2)

# Hybrid
fig1.add_trace(go.Scatter(y=y_test.values, name='Réel', line=dict(color='blue', width=2), showlegend=False), row=1, col=3)
fig1.add_trace(go.Scatter(y=y_pred_hybrid, name='Hybrid', line=dict(color='purple', dash='dash', width=2)), row=1, col=3)

fig1.update_layout(title_text="Prédictions vs Réalité - Comparaison 3 Modèles", height=500)
fig1.show()


In [162]:

# Graphique 2 : Métriques de performance
fig2 = go.Figure()

models = comparison_df['Modèle'].tolist()
x_pos = np.arange(len(models))

# RMSE
fig2.add_trace(go.Bar(
    x=models,
    y=comparison_df['RMSE (€/MWh)'],
    name='RMSE',
    marker_color='indianred'
))

fig2.update_layout(
    title="Performance Comparison - RMSE",
    xaxis_title="Modèle",
    yaxis_title="RMSE (€/MWh)",
    template="plotly_white",
    height=500
)
fig2.show()


In [163]:

# Graphique 3 : Erreurs
fig3 = make_subplots(
    rows=1, cols=3,
    subplot_titles=('ARIMA', 'LightGBM', 'Hybrid')
)

errors_arima = y_test.values - y_pred_arima
errors_lgb = y_test.values - y_pred_lgb
errors_hybrid = y_test.values - y_pred_hybrid

fig3.add_trace(go.Histogram(x=errors_arima, name='ARIMA', nbinsx=50, marker_color='red'), row=1, col=1)
fig3.add_trace(go.Histogram(x=errors_lgb, name='LightGBM', nbinsx=50, marker_color='green'), row=1, col=2)
fig3.add_trace(go.Histogram(x=errors_hybrid, name='Hybrid', nbinsx=50, marker_color='purple'), row=1, col=3)

fig3.update_layout(title_text="Distribution des Erreurs", height=500, showlegend=False)
fig3.show()
