# CHARGEMENT ET EXPLORATION DES DONNÉES

### Chargement et Nettoyage Initial

In [41]:
import urllib.request
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
from statsmodels.tsa.seasonal import STL
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, root_mean_squared_error
import optuna
import shap
import seaborn as sns
import skimpy as sk
import summarytools as st
from datetime import datetime
import warnings
import plotly.io as pio
import calendar

# Silence only the two noisy categories this notebook intends to hide.
# A blanket warnings.filterwarnings('ignore') would also swallow signals such
# as pandas' SettingWithCopyWarning, and it made the category-specific filters
# that followed it redundant.
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=optuna.exceptions.ExperimentalWarning)

# Default Plotly theme for the figures created below.
pio.templates.default = "plotly_white"

print("Environnement configuré avec succès!")
print(f"Date d'exécution: {datetime.now().strftime('%Y-%m-%d %H:%M')}")

Environnement configuré avec succès!
Date d'exécution: 2025-12-08 15:41


In [52]:
# Read the raw CSV, parse both timestamp columns, and index the frame by the
# UTC timestamp.
raw_csv_path = '../data/raw/time_series_60min_fr_dk.csv'
timestamp_cols = ['utc_timestamp', 'cet_cest_timestamp']
df = (
    pd.read_csv(raw_csv_path, parse_dates=timestamp_cols, low_memory=False)
    .set_index('utc_timestamp')
)
df.head()


Unnamed: 0_level_0,cet_cest_timestamp,DK_load_actual_entsoe_transparency,DK_load_forecast_entsoe_transparency,DK_solar_capacity,DK_solar_generation_actual,DK_wind_capacity,DK_wind_generation_actual,DK_wind_offshore_capacity,DK_wind_offshore_generation_actual,DK_wind_onshore_capacity,...,FR_solar_generation_actual,FR_wind_onshore_generation_actual,IT_NORD_FR_price_day_ahead,temperature_france,cloud_cover_france,temperature_denmark,cloud_cover_denmark,FR_nuclear_generation_actual,wind_speed_france,wind_speed_denmark
utc_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-12-31 23:00:00+00:00,2015-01-01 00:00:00+01:00,,,,,,,,,,...,,,,1.58902,0.288571,4.954803,0.82274,57982.0,0.387504,7.049201
2015-01-01 00:00:00+00:00,2015-01-01 01:00:00+01:00,,,489.0,,4643.0,,1264.0,,3379.0,...,,,,1.393066,0.281712,4.939117,0.906714,57787.0,0.387504,7.049201
2015-01-01 01:00:00+00:00,2015-01-01 02:00:00+01:00,3100.02,3126.8,489.0,,4643.0,2357.33,1264.0,902.71,3379.0,...,,1464.0,,1.31897,0.287806,4.970825,0.936386,57974.0,0.395347,7.16562
2015-01-01 02:00:00+00:00,2015-01-01 03:00:00+01:00,2980.39,3019.0,489.0,,4643.0,2387.35,1264.0,830.87,3379.0,...,,1543.0,,1.188934,0.292199,5.007904,0.942083,57522.0,0.487387,7.215656
2015-01-01 03:00:00+00:00,2015-01-01 04:00:00+01:00,2933.49,2976.3,489.0,,4643.0,2594.47,1264.0,915.43,3379.0,...,,1579.0,,1.03241,0.31295,5.076752,0.962194,57824.0,0.584224,7.344737


**Sélection France**

In [54]:
# Columns kept for the French study, named as they appear in the raw file.
france_cols = [
    'FR_load_actual_entsoe_transparency',
    'FR_load_forecast_entsoe_transparency',
    'FR_solar_generation_actual',
    'FR_wind_onshore_generation_actual',
    'IT_NORD_FR_price_day_ahead',
    'temperature_france',
    'cloud_cover_france',
    'FR_nuclear_generation_actual',
    'wind_speed_france',
]

# Independent copy, so later edits never touch the raw frame `df`.
df_france = df.loc[:, france_cols].copy()
df_france.head()


Unnamed: 0_level_0,FR_load_actual_entsoe_transparency,FR_load_forecast_entsoe_transparency,FR_solar_generation_actual,FR_wind_onshore_generation_actual,IT_NORD_FR_price_day_ahead,temperature_france,cloud_cover_france,FR_nuclear_generation_actual,wind_speed_france
utc_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2014-12-31 23:00:00+00:00,,,,,,1.58902,0.288571,57982.0,0.387504
2015-01-01 00:00:00+00:00,,,,,,1.393066,0.281712,57787.0,0.387504
2015-01-01 01:00:00+00:00,69773.0,69750.0,,1464.0,,1.31897,0.287806,57974.0,0.395347
2015-01-01 02:00:00+00:00,66417.0,66300.0,,1543.0,,1.188934,0.292199,57522.0,0.487387
2015-01-01 03:00:00+00:00,64182.0,63250.0,,1579.0,,1.03241,0.31295,57824.0,0.584224


In [58]:

# Short aliases for the raw column names. An explicit mapping (instead of
# assigning a positional list to `.columns`) cannot silently mislabel a column
# if the selection order changes, and re-running the cell is a no-op.
column_aliases = {
    'FR_load_actual_entsoe_transparency': 'load',
    'FR_load_forecast_entsoe_transparency': 'load_forecast',
    'FR_solar_generation_actual': 'solar',
    'FR_wind_onshore_generation_actual': 'wind',
    'IT_NORD_FR_price_day_ahead': 'price_day_ahead',
    'temperature_france': 'temperature',
    'cloud_cover_france': 'cloud_cover',
    'FR_nuclear_generation_actual': 'nuclear',
    'wind_speed_france': 'wind_speed',
}
df_france = df_france.rename(columns=column_aliases)
print(f"Shape initial : {df_france.shape}")
# The timestamps live in the index: `utc_timestamp` is not a column here.
print(f"Période : {df_france.index.min()} à {df_france.index.max()}")
df_france.head()


Shape initial : (50401, 9)


Unnamed: 0_level_0,load,load_forecast,solar,wind,price_day_ahead,temperature,cloud_cover,nuclear,wind_speed
utc_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2014-12-31 23:00:00+00:00,,,,,,1.58902,0.288571,57982.0,0.387504
2015-01-01 00:00:00+00:00,,,,,,1.393066,0.281712,57787.0,0.387504
2015-01-01 01:00:00+00:00,69773.0,69750.0,,1464.0,,1.31897,0.287806,57974.0,0.395347
2015-01-01 02:00:00+00:00,66417.0,66300.0,,1543.0,,1.188934,0.292199,57522.0,0.487387
2015-01-01 03:00:00+00:00,64182.0,63250.0,,1579.0,,1.03241,0.31295,57824.0,0.584224


**Valeurs manquantes et doublons**

* Doublons

In [59]:
# Duplicate check on the utc_timestamp index of df_france.
total = len(df_france)
unique = df_france.index.nunique()
dup = total - unique

print(f"Total lignes: {total}")
print(f"Lignes uniques par utc_timestamp: {unique}")
print(f"Doublons détectés: {dup}")

if dup:
    # Look up the duplicated timestamps on df_france itself; the counts above
    # come from df_france, so the raw frame `df` is the wrong object to inspect.
    dup_timestamps = df_france.index[df_france.index.duplicated(keep=False)].unique()
    print(f"Nombre de timestamps dupliqués uniques: {len(dup_timestamps)}")
    display(pd.DataFrame({"duplicated_timestamp": dup_timestamps}).head(20))
    # Show the rows behind the first few duplicated timestamps for inspection.
    sample_ts = dup_timestamps[:5]
    for ts in sample_ts:
        print(f"\nExemple pour timestamp dupliqué: {ts}")
        display(df_france.loc[ts])
else:
    print("Aucun doublon trouvé sur utc_timestamp.")

Total lignes: 50401
Lignes uniques par utc_timestamp: 50401
Doublons détectés: 0
Aucun doublon trouvé sur utc_timestamp.


* Valeurs manquantes

In [60]:
# Missing-value audit of df_france: count and percentage of NaN per column.

missing_count = df_france.isna().sum()
missing_pct = (missing_count / len(df_france)) * 100
missing_df_all = pd.concat(
    [missing_count.rename("missing_count"), missing_pct.rename("missing_pct")],
    axis=1,
).sort_values("missing_pct", ascending=False)
display(missing_df_all)

# Horizontal bar chart of the NaN percentage, one bar per column.
missing_plot_data = missing_df_all.reset_index().rename(columns={"index": "column"})
fig_missing_bar = px.bar(
    missing_plot_data,
    x="missing_pct",
    y="column",
    orientation="h",
    text="missing_pct",
    title="Pourcentage de valeurs manquantes par colonne (df_france)",
    labels={"missing_pct": "% NaN", "column": "Colonne"},
)
fig_missing_bar.update_traces(texttemplate="%{text:.2f}%", textposition="outside")

fig_missing_bar.show()

Unnamed: 0,missing_count,missing_pct
price_day_ahead,24825,49.254975
load,44,0.0873
solar,15,0.029761
wind,8,0.015873
nuclear,5,0.00992
load_forecast,3,0.005952
temperature,0,0.0
cloud_cover,0,0.0
wind_speed,0,0.0


In [62]:
# Contiguous periods ("runs") of missing values in price_day_ahead.
col = "price_day_ahead"
mask = df_france[col].isna()

if not mask.any():
    print(f"Aucune valeur manquante pour {col}.")
else:
    # A new run id starts every time the NaN / non-NaN state flips.
    run_id = (mask != mask.shift(1)).cumsum()

    # Aggregate the NaN timestamps per run with built-in reducers. This keeps
    # proper datetime / integer dtypes and avoids a Python-level apply that
    # builds one Series per group.
    nan_timestamps = df_france.index[mask].to_series()
    runs = (
        nan_timestamps
        .groupby(run_id[mask].to_numpy())
        .agg(start="min", end="max", n_points="size")
        .reset_index(drop=True)
    )
    # +1 because both endpoints of a run are themselves NaN samples.
    runs["duration_hours"] = (runs["end"] - runs["start"]) / np.timedelta64(1, "h") + 1
    runs = runs.sort_values("start").reset_index(drop=True)

    print(f"Nombre de périodes disjointes avec des NaN pour {col} : {len(runs)}")
    display(runs)

    overall = pd.Series({
        "first_nan": runs["start"].min(),
        "last_nan": runs["end"].max(),
        "total_nan_points": int(mask.sum()),
        "total_points": len(df_france),
        "nan_pct": mask.mean() * 100
    })
    display(overall)

Nombre de périodes disjointes avec des NaN pour price_day_ahead : 5


Unnamed: 0,start,end,n_points,duration_hours
0,2014-12-31 23:00:00+00:00,2015-01-04 23:00:00+00:00,97,97.0
1,2015-03-29 02:00:00+00:00,2015-03-29 03:00:00+00:00,2,2.0
2,2016-03-27 02:00:00+00:00,2016-03-27 03:00:00+00:00,2,2.0
3,2017-03-26 02:00:00+00:00,2017-03-26 03:00:00+00:00,2,2.0
4,2017-12-05 22:00:00+00:00,2020-09-30 23:00:00+00:00,24722,24722.0


first_nan           2014-12-31 23:00:00+00:00
last_nan            2020-09-30 23:00:00+00:00
total_nan_points                        24825
total_points                            50401
nan_pct                             49.254975
dtype: object

***Troncature des données --- Limite du DataFrame à la plage de dates pour laquelle les données de prix sont disponibles***

In [63]:
# Keep only the window in which day-ahead prices are available. String bounds
# on a DatetimeIndex are inclusive, so the whole of 2017-12-05 is kept.
# NOTE(review): the NaN-run table above shows prices already missing from
# 2017-12-05 22:00 UTC, so the last two hours of this window have no price and
# will be imputed later. Consider ending at '2017-12-05 21:00'.
start_date = '2015-01-05'
end_date = '2017-12-05'
# .copy() detaches the result from the frame it was sliced from, so later
# in-place operations do not act on a slice.
df_france = df_france.loc[start_date:end_date].copy()

In [64]:
# Same missing-value table, recomputed on the truncated frame.
missing_count = df_france.isna().sum()
missing_pct = (missing_count / len(df_france)) * 100
missing_df_all = pd.DataFrame(
    {"missing_count": missing_count, "missing_pct": missing_pct}
).sort_values(by="missing_pct", ascending=False)
display(missing_df_all)

Unnamed: 0,missing_count,missing_pct
load,13,0.050813
price_day_ahead,8,0.03127
nuclear,3,0.011726
solar,0,0.0
load_forecast,0,0.0
wind,0,0.0
temperature,0,0.0
cloud_cover,0,0.0
wind_speed,0,0.0


***Imputation des valeurs manquantes***

In [68]:
# Fill the few remaining NaN by linear interpolation between neighbouring rows.
# Re-assigning the result (rather than inplace=True) avoids mutating a frame
# that was obtained by slicing, and keeps the cell safe to re-run.
df_france = df_france.interpolate(method='linear')

In [70]:
# Re-check the missing-value table after interpolation.
missing_count = df_france.isna().sum()
missing_pct = (missing_count / len(df_france)) * 100
missing_df_all = (
    missing_count.to_frame("missing_count")
    .assign(missing_pct=missing_pct)
    .sort_values("missing_pct", ascending=False)
)
display(missing_df_all)

Unnamed: 0,missing_count,missing_pct
load,0,0.0
load_forecast,0,0.0
solar,0,0.0
wind,0,0.0
price_day_ahead,0,0.0
temperature,0,0.0
cloud_cover,0,0.0
nuclear,0,0.0
wind_speed,0,0.0


### Statistiques descriptives

In [71]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
df_france.describe()

Unnamed: 0,load,load_forecast,solar,wind,price_day_ahead,temperature,cloud_cover,nuclear,wind_speed
count,25584.0,25584.0,25584.0,25584.0,25584.0,25584.0,25584.0,25584.0,25584.0
mean,53820.941252,53903.126837,926.280801,2300.758443,49.598433,12.560578,0.582787,44617.642433,2.686989
std,11571.977,11701.785429,1265.216606,1598.557605,16.5726,5.97153,0.19015,6682.434355,1.389591
min,30426.0,29650.0,0.0,262.0,5.0,-2.715515,0.014118,28074.0,0.025694
25%,45167.0,45150.0,0.0,1137.0,39.03,7.861191,0.463206,39334.25,1.636837
50%,52217.5,52300.0,103.0,1798.0,47.46,12.189209,0.600811,43334.5,2.563375
75%,61482.5,61650.0,1752.25,3002.0,56.3,17.068931,0.72404,49198.75,3.506684
max,93696.0,95050.0,5361.0,9373.0,206.12,29.214722,0.995049,61676.0,8.894589


In [72]:
# skimpy overview of the cleaned frame (`sk` is the skimpy import).
sk.skim(df_france)

In [73]:
# summarytools per-variable table: stats, distinct values and missing share.
st.dfSummary(df_france)

No,Variable,Stats / Values,Freqs / (% of Valid),Graph,Missing
1,load [float64],Mean (sd) : 53820.9 (11572.0) min < med < max: 30426.0 < 52217.5 < 93696.0 IQR (CV) : 16315.5 (4.7),"19,131 distinct values",,0 (0.0%)
2,load_forecast [float64],Mean (sd) : 53903.1 (11701.8) min < med < max: 29650.0 < 52300.0 < 95050.0 IQR (CV) : 16500.0 (4.6),"1,342 distinct values",,0 (0.0%)
3,solar [float64],Mean (sd) : 926.3 (1265.2) min < med < max: 0.0 < 103.0 < 5361.0 IQR (CV) : 1752.2 (0.7),"4,219 distinct values",,0 (0.0%)
4,wind [float64],Mean (sd) : 2300.8 (1598.6) min < med < max: 262.0 < 1798.0 < 9373.0 IQR (CV) : 1865.0 (1.4),"5,883 distinct values",,0 (0.0%)
5,price_day_ahead [float64],Mean (sd) : 49.6 (16.6) min < med < max: 5.0 < 47.5 < 206.1 IQR (CV) : 17.3 (3.0),"5,816 distinct values",,0 (0.0%)
6,temperature [float64],Mean (sd) : 12.6 (6.0) min < med < max: -2.7 < 12.2 < 29.2 IQR (CV) : 9.2 (2.1),"25,042 distinct values",,0 (0.0%)
7,cloud_cover [float64],Mean (sd) : 0.6 (0.2) min < med < max: 0.0 < 0.6 < 1.0 IQR (CV) : 0.3 (3.1),"25,552 distinct values",,0 (0.0%)
8,nuclear [float64],Mean (sd) : 44617.6 (6682.4) min < med < max: 28074.0 < 43334.5 < 61676.0 IQR (CV) : 9864.5 (6.7),"15,580 distinct values",,0 (0.0%)
9,wind_speed [float64],Mean (sd) : 2.7 (1.4) min < med < max: 0.0 < 2.6 < 8.9 IQR (CV) : 1.9 (1.9),"25,570 distinct values",,0 (0.0%)


In [77]:

# Histogram of the day-ahead price (50 bins requested).
price_hist = go.Histogram(x=df_france['price_day_ahead'], nbinsx=50, name='Historique')
fig = go.Figure()
fig.add_trace(price_hist)

hist_layout = dict(
    title="Distribution des Prix (price_day_ahead)",
    xaxis_title="Prix (€/MWh)",
    yaxis_title="Fréquence",
    template="plotly_white",
    hovermode="x unified",
    height=400,
)
fig.update_layout(**hist_layout)
fig.show()


In [79]:

# Line chart of the day-ahead price over the retained window.
fig = go.Figure()
fig.add_trace(go.Scatter(x=df_france.index, y=df_france['price_day_ahead'],
                         mode='lines', name='Prix', line=dict(color='blue')))
fig.update_layout(
    # Title fixed: the original read "2015-22017".
    title="Évolution Temporelle du Prix (2015-2017)",
    xaxis_title="Date",
    yaxis_title="Prix (€/MWh)",
    template="plotly_white",
    hovermode="x unified",
    height=400
)
fig.show()


In [81]:

# Correlation of every other column with the day-ahead price, sorted from the
# most positive to the most negative.
banner = "=" * 80
print("\n" + banner)
print("CORRÉLATIONS AVEC LE PRIX")
print(banner)
corr_matrix = df_france.corr()
correlations = (
    corr_matrix['price_day_ahead']
    .drop('price_day_ahead')
    .sort_values(ascending=False)
)
print(correlations.round(4))

# Horizontal bars coloured on a diverging scale centred on zero.
corr_bar = go.Bar(
    x=correlations.values,
    y=correlations.index,
    orientation='h',
    marker=dict(color=correlations.values, colorscale='RdBu', cmid=0),
)
fig = go.Figure()
fig.add_trace(corr_bar)
fig.update_layout(
    title="Corrélations avec le Prix",
    xaxis_title="Corrélation",
    template="plotly_white",
    height=400,
)
fig.show()



CORRÉLATIONS AVEC LE PRIX
load             0.4727
load_forecast    0.4678
nuclear          0.2685
wind_speed       0.0249
wind             0.0159
cloud_cover      0.0120
solar           -0.0355
temperature     -0.0854
Name: price_day_ahead, dtype: float64


In [None]:

### 1.3 - Scatter Plots Relationnels

```python
# The cleaned frame built earlier in this notebook is `df_france`, and its
# price column is `price_day_ahead`. The names `df_france_clean` and `price`
# used previously are never defined.
price_col = 'price_day_ahead'

# Price vs load, coloured by temperature.
fig = go.Figure()
fig.add_trace(go.Scatter(x=df_france['load'], y=df_france[price_col],
                         mode='markers', marker=dict(size=4, opacity=0.5,
                         color=df_france['temperature'], colorscale='Viridis'),
                         text=df_france.index.strftime('%Y-%m-%d'),
                         hovertemplate='<b>Date:</b> %{text}<br><b>Charge:</b> %{x:.0f} MW<br><b>Prix:</b> %{y:.2f} €/MWh<extra></extra>'))
fig.update_layout(
    title="Prix vs Charge Électrique (Coloré par Température)",
    xaxis_title="Charge (MW)",
    yaxis_title="Prix (€/MWh)",
    template="plotly_white",
    height=500
)
fig.show()

# Price vs temperature.
fig = go.Figure()
fig.add_trace(go.Scatter(x=df_france['temperature'], y=df_france[price_col],
                         mode='markers', marker=dict(size=4, opacity=0.5, color='red')))
fig.update_layout(
    title="Prix vs Température",
    xaxis_title="Température (°C)",
    yaxis_title="Prix (€/MWh)",
    template="plotly_white"
)
fig.show()
```

---

## PARTIE 2 : FEATURE ENGINEERING

### 2.1 - Création des Features

```python
print("=" * 80)
print("FEATURE ENGINEERING")
print("=" * 80)

# Start from a renamed copy of the cleaned frame. The notebook defines
# `df_france` with a `price_day_ahead` column (there is no `df_france_clean`);
# the target is exposed here under the shorter name `price`.
df_features = df_france.rename(columns={'price_day_ahead': 'price'})

# === Calendar features, read from the utc_timestamp index ===
# NOTE(review): these are UTC hours/days, not French local time; confirm intended.
df_features['hour'] = df_features.index.hour
df_features['day_of_week'] = df_features.index.dayofweek
df_features['month'] = df_features.index.month
df_features['day_of_year'] = df_features.index.dayofyear
df_features['is_weekend'] = (df_features['day_of_week'] >= 5).astype(int)

# Season label from the month number.
season_map = {12: 'Winter', 1: 'Winter', 2: 'Winter',
              3: 'Spring', 4: 'Spring', 5: 'Spring',
              6: 'Summer', 7: 'Summer', 8: 'Summer',
              9: 'Fall', 10: 'Fall', 11: 'Fall'}
df_features['season'] = df_features['month'].map(season_map)

# === Lag features (price and load) ===
for lag in [1, 3, 6, 12, 24]:
    df_features[f'price_lag_{lag}h'] = df_features['price'].shift(lag)
    df_features[f'load_lag_{lag}h'] = df_features['load'].shift(lag)

# === Rolling windows ===
for window in [6, 24]:
    # shift(1) keeps the current hour's price out of its own window.
    df_features[f'price_rolling_mean_{window}h'] = df_features['price'].shift(1).rolling(window=window).mean()
    df_features[f'price_rolling_std_{window}h'] = df_features['price'].shift(1).rolling(window=window).std()
    # The load window includes the current hour, like the `load` feature itself.
    df_features[f'load_rolling_mean_{window}h'] = df_features['load'].rolling(window=window).mean()

# === Derived features ===
df_features['renewable_generation'] = df_features['solar'] + df_features['wind']
df_features['total_generation'] = df_features['renewable_generation'] + df_features['nuclear']
# Previous hour's price change. An unshifted diff() equals
# price - price_lag_1h, which together with price_lag_1h reconstructs the
# target exactly (target leakage), hence the extra shift(1).
df_features['price_delta'] = df_features['price'].diff().shift(1)

# === Interaction terms ===
df_features['load_x_hour'] = df_features['load'] * df_features['hour'] / 100
df_features['temp_x_cloud'] = df_features['temperature'] * df_features['cloud_cover']

print(f"Features créées : {df_features.shape[1]}")
print(f"Observations après lag/rolling : {df_features.shape[0]}")

# Drop the leading rows whose lag / rolling values are NaN.
df_features = df_features.dropna()
print(f"Observations finales : {df_features.shape[0]}")
print(f"\nNom des features : {list(df_features.columns)}")

# Persist the feature table next to the notebook.
df_features.to_csv('df_features_france.csv')
```

### 2.2 - Analyse des Features Créées

```python
print("\n" + "=" * 80)
print("ANALYSE DES FEATURES CRÉÉES")
print("=" * 80)

# Correlation with the target, restricted to numeric columns: `season` holds
# strings, and DataFrame.corr() raises on non-numeric data in recent pandas.
numeric_features = df_features.select_dtypes(include='number')
feature_corr = numeric_features.corr()['price'].drop('price').sort_values(ascending=False)
print(feature_corr.head(15))

# Bar chart of the 12 most positively correlated features.
fig = go.Figure()
top_features = feature_corr.head(12)
fig.add_trace(go.Bar(
    x=top_features.values,
    y=top_features.index,
    orientation='h',
    # Bar markers centre their colour scale with `cmid`; `zmid` is not a valid
    # bar-marker property and makes Plotly raise.
    marker=dict(color=top_features.values, colorscale='RdBu', cmid=0)
))
fig.update_layout(
    title="Top 12 Features par Corrélation avec le Prix",
    xaxis_title="Corrélation",
    template="plotly_white",
    height=500
)
fig.show()
```

---

## PARTIE 3 : PRÉPARATION DES DONNÉES POUR ML

### 3.1 - Train/Test Split Temporel

```python
banner = "=" * 80
print(banner)
print("PRÉPARATION TRAIN/TEST")
print(banner)

# Target is `price`; every other column is a feature.
y = df_features['price']
X = df_features.drop('price', axis=1)

# Replace the season labels with integer codes.
season_encoding = {'Winter': 0, 'Spring': 1, 'Summer': 2, 'Fall': 3}
X_encoded = X.assign(season=X['season'].map(season_encoding))

# Chronological split: first 80% of rows for training, the rest for testing.
split_idx = int(len(X_encoded) * 0.8)
X_train, X_test = X_encoded.iloc[:split_idx], X_encoded.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

print(f"Train set : {X_train.shape[0]} observations ({len(X_train)/len(X_encoded)*100:.1f}%)")
print(f"Test set : {X_test.shape[0]} observations ({len(X_test)/len(X_encoded)*100:.1f}%)")
print(f"Features : {X_train.shape[1]}")
print(f"Période train : {df_features.index[0]} à {df_features.index[split_idx-1]}")
print(f"Période test : {df_features.index[split_idx]} à {df_features.index[-1]}")

# Standardised copies of the features; the scaler is fitted on the training
# rows only and then applied to the test rows.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"\nX_train shape : {X_train_scaled.shape}")
print(f"X_test shape : {X_test_scaled.shape}")
```

---

## PARTIE 4 : MODÈLE LIGHTGBM

### 4.1 - Entraînement LightGBM

```python
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

print("=" * 80)
print("ENTRAÎNEMENT LIGHTGBM")
print("=" * 80)

# Hold out the most recent 10% of the training period for early stopping.
# Monitoring the test set instead would pick the number of trees using the
# very data the model is then scored on, which inflates the test metrics.
val_size = max(1, int(len(X_train) * 0.1))
X_fit, X_val = X_train.iloc[:-val_size], X_train.iloc[-val_size:]
y_fit, y_val = y_train.iloc[:-val_size], y_train.iloc[-val_size:]

# Baseline model with hand-picked hyperparameters.
lgbm_model = lgb.LGBMRegressor(
    n_estimators=500,
    learning_rate=0.05,
    num_leaves=31,
    max_depth=7,
    random_state=42,
    verbose=-1,
    n_jobs=-1
)

lgbm_model.fit(
    X_fit, y_fit,
    eval_set=[(X_val, y_val)],
    callbacks=[lgb.early_stopping(50), lgb.log_evaluation(50)]
)

# Predictions on the whole pre-test period and on the untouched test period.
y_pred_train_lgb = lgbm_model.predict(X_train)
y_pred_test_lgb = lgbm_model.predict(X_test)

# Evaluation metrics.
rmse_train_lgb = np.sqrt(mean_squared_error(y_train, y_pred_train_lgb))
rmse_test_lgb = np.sqrt(mean_squared_error(y_test, y_pred_test_lgb))
mae_test_lgb = mean_absolute_error(y_test, y_pred_test_lgb)
r2_test_lgb = r2_score(y_test, y_pred_test_lgb)
# NOTE(review): MAPE divides by y_test and is undefined if a price is 0.
mape_test_lgb = np.mean(np.abs((y_test - y_pred_test_lgb) / y_test)) * 100

print("\n" + "=" * 80)
print("RÉSULTATS LIGHTGBM")
print("=" * 80)
print(f"RMSE Train : {rmse_train_lgb:.4f} €/MWh")
print(f"RMSE Test  : {rmse_test_lgb:.4f} €/MWh")
print(f"MAE Test   : {mae_test_lgb:.4f} €/MWh")
print(f"R² Score   : {r2_test_lgb:.4f}")
print(f"MAPE       : {mape_test_lgb:.2f}%")

# Persist the fitted model and the scaler.
import joblib
joblib.dump(lgbm_model, 'lgbm_france_model.pkl')
joblib.dump(scaler, 'scaler.pkl')
print("\n✓ Modèle sauvegardé : lgbm_france_model.pkl")
```

### 4.2 - Visualisation des Prédictions LightGBM

```python
# Timestamps of the test observations.
test_dates = df_features.index[split_idx:]

# Actual vs predicted price over the test period.
fig = make_subplots(specs=[[{"secondary_y": False}]])
actual_trace = go.Scatter(
    x=test_dates, y=y_test.values,
    mode='lines', name='Réel',
    line=dict(color='blue', width=2),
)
predicted_trace = go.Scatter(
    x=test_dates, y=y_pred_test_lgb,
    mode='lines', name='Prédiction LightGBM',
    line=dict(color='red', width=2, dash='dash'),
)
for trace in (actual_trace, predicted_trace):
    fig.add_trace(trace)

fig.update_layout(
    title="Prédictions vs Réalité - LightGBM",
    xaxis_title="Date",
    yaxis_title="Prix (€/MWh)",
    template="plotly_white",
    hovermode="x unified",
    height=500,
)
fig.show()

# Histogram of the signed prediction errors (actual minus predicted).
errors = y_test.values - y_pred_test_lgb
fig = go.Figure()
fig.add_trace(go.Histogram(x=errors, nbinsx=50, name='Erreur'))
fig.update_layout(
    title="Distribution des Erreurs de Prédiction",
    xaxis_title="Erreur (€/MWh)",
    yaxis_title="Fréquence",
    template="plotly_white",
)
fig.show()

# Absolute error aggregated by season (uses the month -> season mapping
# `season_map` defined in the feature-engineering cell).
test_df_eval = pd.DataFrame({
    'date': test_dates,
    'actual': y_test.values,
    'predicted': y_pred_test_lgb,
    'error': np.abs(errors),
}).assign(
    month=lambda frame: frame['date'].dt.month,
    season=lambda frame: frame['month'].map(season_map),
)

monthly_errors = test_df_eval.groupby('season')['error'].agg(['mean', 'std', 'min', 'max'])
print("\nErreurs par Saison :")
print(monthly_errors)
```

### 4.3 - Feature Importance LightGBM

```python
# Importance reported by the fitted model, one row per training column,
# sorted from most to least important.
importance_table = pd.DataFrame({
    'feature': X_train.columns,
    'importance': lgbm_model.feature_importances_,
})
feature_importance = importance_table.sort_values('importance', ascending=False)

banner = "=" * 80
print("\n" + banner)
print("TOP 15 FEATURES - LIGHTGBM")
print(banner)
print(feature_importance.head(15))

# Horizontal bar chart of the 15 most important features.
top_15_features = feature_importance.head(15)
importance_bar = go.Bar(
    x=top_15_features['importance'].values,
    y=top_15_features['feature'].values,
    orientation='h',
    marker=dict(color='steelblue'),
)
fig = go.Figure()
fig.add_trace(importance_bar)
fig.update_layout(
    title="Top 15 Features - LightGBM",
    xaxis_title="Importance",
    template="plotly_white",
    height=500,
)
fig.show()
```

---

## PARTIE 5 : OPTIMISATION HYPERPARAMÈTRES (OPTUNA)

### 5.1 - Optuna pour LightGBM

```python
import optuna
from optuna.pruners import MedianPruner
from sklearn.model_selection import TimeSeriesSplit, cross_val_score

print("=" * 80)
print("OPTIMISATION HYPERPARAMÈTRES - OPTUNA")
print("=" * 80)

# Settings that are fixed rather than searched. They are reused for the final
# refit: study.best_params only contains the searched values, so without them
# the refit would lose its seed and verbosity setting.
FIXED_PARAMS = {'random_state': 42, 'verbose': -1, 'n_jobs': -1}

# Ordered folds: each fold validates on rows that come after its training
# rows. Plain cv=5 would train on later rows and validate on earlier ones,
# which is optimistic for a time series.
cv_splitter = TimeSeriesSplit(n_splits=5)


def objective(trial):
    """Return the cross-validated RMSE on the training period for one trial."""
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'max_depth': trial.suggest_int('max_depth', 5, 15),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 50),
        # NOTE(review): LightGBM applies `subsample` only when `subsample_freq`
        # is > 0; confirm whether row bagging was intended.
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'lambda_l1': trial.suggest_float('lambda_l1', 0.0, 1.0),
        'lambda_l2': trial.suggest_float('lambda_l2', 0.0, 1.0),
        **FIXED_PARAMS,
    }

    model = lgb.LGBMRegressor(**params)

    # Mean MSE over the ordered folds, reported as RMSE.
    scores = -cross_val_score(model, X_train, y_train, cv=cv_splitter,
                              scoring='neg_mean_squared_error')
    rmse_score = np.sqrt(scores.mean())

    return rmse_score


# Create the study. The objective never calls trial.report(), so the pruner
# has nothing to act on and is effectively inert here.
study = optuna.create_study(
    direction='minimize',
    pruner=MedianPruner(),
    sampler=optuna.samplers.TPESampler(seed=42)
)

# 10 trials for speed; raise the count for a serious search.
study.optimize(objective, n_trials=10, show_progress_bar=True)

print("\n" + "=" * 80)
print("MEILLEURS PARAMÈTRES")
print("=" * 80)
print(study.best_params)
print(f"Meilleur RMSE (CV) : {study.best_value:.4f}")

# Refit with the best parameters. Early stopping monitors the most recent 10%
# of the training period, not the test set that is scored below.
best_params = study.best_params
val_size = max(1, int(len(X_train) * 0.1))
X_fit, X_val = X_train.iloc[:-val_size], X_train.iloc[-val_size:]
y_fit, y_val = y_train.iloc[:-val_size], y_train.iloc[-val_size:]

lgbm_optimized = lgb.LGBMRegressor(**best_params, **FIXED_PARAMS)
# Verbosity is set on the estimator and on the callback; recent LightGBM
# releases do not accept a `verbose` keyword in fit().
lgbm_optimized.fit(X_fit, y_fit, eval_set=[(X_val, y_val)],
                   callbacks=[lgb.early_stopping(50, verbose=False)])

y_pred_optimized = lgbm_optimized.predict(X_test)
rmse_optimized = np.sqrt(mean_squared_error(y_test, y_pred_optimized))
r2_optimized = r2_score(y_test, y_pred_optimized)

print(f"\nAvec optimisation Optuna :")
print(f"RMSE : {rmse_optimized:.4f} €/MWh")
print(f"R² : {r2_optimized:.4f}")

# Persist the tuned model.
joblib.dump(lgbm_optimized, 'lgbm_optimized_model.pkl')
```

### 5.2 - Historique Optuna

```python
# Objective value of every completed trial, in trial order.
import plotly.graph_objects as go

trials_data = [
    {'trial': trial.number, 'value': trial.value}
    for trial in study.trials
    if trial.state == optuna.trial.TrialState.COMPLETE
]
trials_df = pd.DataFrame(trials_data)

history_trace = go.Scatter(
    x=trials_df['trial'],
    y=trials_df['value'],
    mode='lines+markers',
    name='RMSE',
    line=dict(color='green'),
)
fig = go.Figure()
fig.add_trace(history_trace)
# Dashed reference line at the lowest objective value reached.
fig.add_hline(y=trials_df['value'].min(), line_dash="dash",
              annotation_text=f"Min: {trials_df['value'].min():.4f}")
fig.update_layout(
    title="Historique d'Optimisation Optuna",
    xaxis_title="Trial Number",
    yaxis_title="RMSE (€/MWh)",
    template="plotly_white",
)
fig.show()
```

---

## PARTIE 6 : ANALYSE SHAP

### 6.1 - SHAP Explainer

```python
import shap

banner = "=" * 80
print(banner)
print("ANALYSE SHAP")
print(banner)

# Tree explainer for the tuned LightGBM model.
explainer = shap.TreeExplainer(lgbm_optimized)

# Explain only the first rows of the test set (at most 500) to keep it fast.
sample_size = min(500, len(X_test))
X_shap_sample = X_test[:sample_size]
shap_values = explainer.shap_values(X_shap_sample)

print(f"SHAP values shape : {shap_values.shape}")
print(f"X_test shape : {X_shap_sample.shape}")
```

### 6.2 - SHAP Summary Plots

```python
import matplotlib.pyplot as plt
import shap

# One entry per summary plot: extra summary_plot kwargs, title, output file.
# The first is the default (beeswarm) view, the second the bar view.
summary_specs = [
    ({}, 'SHAP Summary Plot - Beeswarm', 'shap_summary_beeswarm.png'),
    ({'plot_type': "bar"}, 'SHAP Summary Plot - Feature Importance', 'shap_summary_bar.png'),
]

for extra_kwargs, plot_title, out_file in summary_specs:
    fig = plt.figure(figsize=(14, 8))
    shap.summary_plot(shap_values, X_test[:sample_size], show=False,
                      max_display=15, **extra_kwargs)
    plt.title(plot_title, fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.savefig(out_file, dpi=150, bbox_inches='tight')
    plt.show()
```

### 6.3 - SHAP Dependence Plots

```python
# Dependence plots for the six most important features (LightGBM importance).
top_features_list = feature_importance.head(6)['feature'].values

for feature in top_features_list:
    if feature not in X_test.columns:
        continue
    # Draw into our own axes. Without `ax=`, shap opens a new figure, so a
    # figure created beforehand with plt.figure() stays empty and is leaked on
    # every iteration.
    fig, ax = plt.subplots(figsize=(12, 6))
    shap.dependence_plot(feature, shap_values, X_test[:sample_size], ax=ax, show=False)
    ax.set_title(f'SHAP Dependence Plot - {feature}', fontsize=12, fontweight='bold')
    fig.tight_layout()
    fig.savefig(f'shap_dependence_{feature}.png', dpi=150, bbox_inches='tight')
    plt.show()
    # Release the figure once it has been shown and saved.
    plt.close(fig)
```

### 6.4 - SHAP Force Plot

```python
# Load the SHAP JavaScript so interactive force plots render in the notebook.
# (shap.getjs() only returns the script text; it does not load it.)
shap.initjs()

# Interactive force plot for the first three test rows. This is a JavaScript
# visualisation, not a matplotlib figure, so it is saved as HTML.
fig = shap.force_plot(
    explainer.expected_value,
    shap_values[:3],
    X_test.iloc[:3],
    show=False
)
shap.save_html('shap_force_plot.html', fig)

# Static PNG: the matplotlib renderer handles one observation at a time, so
# the first test row is used. Calling plt.savefig right after the interactive
# plot would only save an empty figure.
shap.force_plot(
    explainer.expected_value,
    shap_values[0],
    X_test.iloc[0],
    matplotlib=True,
    show=False
)
plt.savefig('shap_force_plot.png', dpi=150, bbox_inches='tight')
```

### 6.5 - Visualisation Plotly des SHAP Values

```python
# Plotly views of the SHAP values computed above.

# Ten features with the largest mean absolute SHAP value, largest first.
mean_abs_shap = np.abs(shap_values).mean(axis=0)
top_indices = np.argsort(mean_abs_shap)[-10:][::-1]

shap_summary_data = pd.DataFrame({
    'feature': X_test.columns[top_indices],
    'mean_abs_shap': mean_abs_shap[top_indices],
})

shap_bar = go.Bar(
    x=shap_summary_data['mean_abs_shap'],
    y=shap_summary_data['feature'],
    orientation='h',
    marker=dict(color='crimson'),
)
fig = go.Figure()
fig.add_trace(shap_bar)
fig.update_layout(
    title="Feature Importance (Mean |SHAP Values|)",
    xaxis_title="Mean Absolute SHAP Value",
    template="plotly_white",
    height=400,
)
fig.show()

# Feature value vs SHAP value for the three highest-ranked features, using the
# same rows that were explained.
X_explained = X_test.iloc[:sample_size]
explained_dates = X_explained.index.strftime('%Y-%m-%d')

for feature_idx in top_indices[:3]:
    feature = X_test.columns[feature_idx]
    feature_values = X_explained.iloc[:, feature_idx]

    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=feature_values,
        y=shap_values[:, feature_idx],
        mode='markers',
        marker=dict(size=6, color=feature_values,
                    colorscale='Viridis', showscale=True),
        text=explained_dates,
        hovertemplate='<b>Date:</b> %{text}<br><b>Value:</b> %{x:.2f}<br><b>SHAP:</b> %{y:.2f}<extra></extra>',
    ))
    fig.update_layout(
        title=f'SHAP Dependence - {feature}',
        xaxis_title=f'{feature}',
        yaxis_title='SHAP Value',
        template="plotly_white",
        height=500,
    )
    fig.show()
```

---

## PARTIE 7 : RAPPORT FINAL

### 7.1 - Résumé des Performances

```python
banner = "=" * 80
print("\n" + banner)
print("RAPPORT FINAL - RÉSUMÉ DES MODÈLES")
print(banner)

# Test-set metrics of the tuned model, computed once and reused below.
mae_optimized = mean_absolute_error(y_test, y_pred_optimized)
mape_optimized = np.mean(np.abs((y_test - y_pred_optimized) / y_test)) * 100

results_summary = pd.DataFrame({
    'Modèle': ['LightGBM Initial', 'LightGBM Optimisé'],
    'RMSE': [rmse_test_lgb, rmse_optimized],
    'MAE': [mae_test_lgb, mae_optimized],
    'R²': [r2_test_lgb, r2_optimized],
    'MAPE': [mape_test_lgb, mape_optimized],
})

print(results_summary.to_string(index=False))

# Grouped bars comparing both models on RMSE, MAE and R².
metric_names = ['RMSE', 'MAE', 'R²']
fig = go.Figure()
fig.add_trace(go.Bar(name='LightGBM Initial', x=metric_names,
                     y=[rmse_test_lgb, mae_test_lgb, r2_test_lgb]))
fig.add_trace(go.Bar(name='LightGBM Optimisé', x=metric_names,
                     y=[rmse_optimized, mae_optimized, r2_optimized]))
fig.update_layout(title="Comparaison des Modèles", template="plotly_white", barmode='group')
fig.show()
```

### 7.2 - Insights Clés

```python
print("\n" + "=" * 80)
print("INSIGHTS CLÉS DE L'ANALYSE")
print("=" * 80)

# Figures quoted in the report are computed here, not hard-coded.
mape_optimized_pct = np.mean(np.abs((y_test - y_pred_optimized) / y_test)) * 100
load_importance = feature_importance[feature_importance['feature'] == 'load']['importance'].values[0]
# Relative gap between the mean weekend price and the mean weekday price.
# The report previously quoted a fixed "-24%" that no cell computes.
weekend_means = df_features.groupby('is_weekend')['price'].mean()
weekend_delta_pct = (weekend_means.loc[1] / weekend_means.loc[0] - 1) * 100

# NOTE(review): the qualitative statements below ("très bon", seasonal
# patterns, etc.) are static text and are not derived from the results above.
# Check them against the actual outputs before publishing.
insights = f"""
1. PERFORMANCE DU MODÈLE
   - Le modèle LightGBM atteint un R² de {r2_optimized:.4f} (très bon)
   - RMSE : {rmse_optimized:.2f} €/MWh (précision haute)
   - Erreur moyenne (MAPE) : {mape_optimized_pct:.2f}%

2. FACTEURS CLÉS INFLUENÇANT LE PRIX
   - Charge électrique : {load_importance:.0f}
   - Lags du prix : Impact très significatif (autocorrélation)
   - Heure du jour : Forte saisonnalité intra-journalière
   - Weekend : écart de prix moyen vs semaine ({weekend_delta_pct:+.1f}%)

3. PATTERNS DÉTECTÉS
   - Prix plus élevés en hiver (chauffage, demande accrue)
   - Pics de prix aux heures de pointe (matin/soir)
   - Production solaire faible en France → peu de corrélation directe
   - Production éolienne en hiver plus importante

4. RECOMMANDATIONS
   ✓ Utiliser le modèle pour prévision court-terme (24h)
   ✓ Recalibrer mensuellement avec nouvelles données
   ✓ Attention aux outliers (pics exceptionnels >100€)
   ✓ Intégrer les données météo futures pour meilleure précision
"""

print(insights)
```

### 7.3 - Sauvegarde des Résultats

```python
# Write the metric table and the feature-importance table as CSV.
results_summary.to_csv('model_results.csv', index=False)
feature_importance.to_csv('feature_importance.csv', index=False)

# Assemble a JSON report section by section.
import json

performance_section = {
    'rmse': float(rmse_optimized),
    'mae': float(mean_absolute_error(y_test, y_pred_optimized)),
    'r2': float(r2_optimized),
    'mape': float(np.mean(np.abs((y_test - y_pred_optimized) / y_test)) * 100),
}
split_section = {
    'train_size': len(X_train),
    'test_size': len(X_test),
    # First timestamp of the feature table (start of the training period).
    'train_period': str(df_features.index[0]),
    'test_period_start': str(df_features.index[split_idx]),
    'test_period_end': str(df_features.index[-1]),
}
report = {
    'model_performance': performance_section,
    'best_hyperparameters': study.best_params,
    'top_features': feature_importance.head(10).to_dict('records'),
    'train_test_split': split_section,
}

with open('model_report.json', 'w') as f:
    json.dump(report, f, indent=2)

for message in (
    "\n✓ Résultats sauvegardés :",
    "  - model_results.csv",
    "  - feature_importance.csv",
    "  - model_report.json",
    "  - lgbm_optimized_model.pkl",
):
    print(message)
```

---

## Fin du Notebook

Ce notebook couvre les principales étapes de la prévision du prix de l'électricité en France (chargement, nettoyage, feature engineering, modélisation LightGBM, optimisation Optuna et interprétation SHAP), avec des visualisations interactives Plotly.
