# EDA & Modélisation - Marché Électrique Danemark (DK1)
*Analyse approfondie : Prix Day-Ahead, Impact Éolien et Charge Résiduelle*

### 1. Configuration et Importation

In [1]:
import urllib.request
import os
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_squared_error
import shap
import warnings

warnings.filterwarnings('ignore')

# --- Téléchargement des données ---
os.makedirs('../data/raw', exist_ok=True)
url = "https://data.open-power-system-data.org/time_series/latest/time_series_60min_singleindex.csv"
destination = "../data/raw/time_series_60min.csv"

if not os.path.exists(destination):
    print("⏳ Téléchargement du dataset (ceci peut prendre un moment)...")
    urllib.request.urlretrieve(url, destination)
    print("Dataset téléchargé !")
else:
    print("Dataset déjà présent localement.")

# Chargement
df = pd.read_csv(destination, parse_dates=['utc_timestamp'], low_memory=False)
df = df.set_index('utc_timestamp')

  from .autonotebook import tqdm as notebook_tqdm


Dataset déjà présent localement.


### 2. Préparation des Données (Zone DK1)

In [2]:
# Mappage des colonnes DK1
cols_mapping = {
    'DK_1_price_day_ahead': 'price',
    'DK_1_load_actual_entsoe_transparency': 'load_actual',
    'DK_1_load_forecast_entsoe_transparency': 'load_forecast',
    'DK_1_solar_generation_actual': 'solar_generation',
    'DK_1_wind_generation_actual': 'wind_generation' 
}

df_dk = df[list(cols_mapping.keys())].rename(columns=cols_mapping)

# Focus sur 2018-2019 (Données complètes et récentes)
df_dk = df_dk.loc['2018-01-01':'2019-12-31']
df_dk = df_dk.interpolate(method='linear').dropna()

# Enrichissement Temporel
df_dk['hour'] = df_dk.index.hour
df_dk['day_of_week'] = df_dk.index.dayofweek
df_dk['weekday_name'] = df_dk.index.day_name()
df_dk['month'] = df_dk.index.month

print(f"Données prêtes : {df_dk.shape}")

Données prêtes : (17520, 9)


### 3. Visualisations Fondamentales

In [3]:
# 3.1 Heatmap Hebdomadaire (Patterns de consommation)
heatmap_data = df_dk.groupby(['weekday_name', 'hour'])['price'].mean().reset_index()
days_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
heatmap_data['weekday_name'] = pd.Categorical(heatmap_data['weekday_name'], categories=days_order, ordered=True)
heatmap_data = heatmap_data.sort_values(['weekday_name', 'hour'])
z_data = heatmap_data.pivot(index='weekday_name', columns='hour', values='price')

fig = px.imshow(z_data, title="Heatmap Hebdomadaire des Prix (DK1)", color_continuous_scale="RdYlGn_r", aspect="auto")
fig.show()

# 3.2 Price Duration Curve
sorted_price = df_dk['price'].sort_values(ascending=False).reset_index(drop=True)
sorted_price.index = (sorted_price.index / len(sorted_price)) * 100
fig = px.area(x=sorted_price.index, y=sorted_price.values, title="Price Duration Curve", labels={"x": "% Temps", "y": "Prix"})
fig.add_hline(y=0, line_dash="dash", line_color="red")
fig.show()

### 4. Visualisations Avancées (Market Dynamics)

In [4]:
# --- 4.1. Analyse de la Charge Résiduelle (Residual Load) ---
# La charge résiduelle = Demande - (Vent + Solaire). C'est ce que le marché doit fournir via le pilotable.

df_dk['residual_load'] = df_dk['load_actual'] - df_dk['wind_generation'] - df_dk['solar_generation']

fig = px.scatter(
    df_dk.sample(5000), 
    x="residual_load", 
    y="price", 
    color="wind_generation",
    title="Merit Order Empirique : Prix vs Charge Résiduelle",
    labels={"residual_load": "Charge Résiduelle (MW)", "price": "Prix (€/MWh)", "wind_generation": "Vent (MW)"},
    template="plotly_white",
    opacity=0.5,
    trendline="lowess",
    trendline_color_override="red"
)
fig.show()

# --- 4.2. Boxplot des Prix par Tranches de Vent ---
# Discrétisation du vent pour voir la distribution des prix
df_dk['wind_bins'] = pd.cut(df_dk['wind_generation'], bins=10)

# On convertit les intervalles en chaînes pour l'affichage
df_dk['wind_bins_str'] = df_dk['wind_bins'].astype(str)

# Tri pour l'axe X (basé sur la limite gauche de l'intervalle)
df_dk['wind_bins_sort'] = df_dk['wind_bins'].apply(lambda x: x.left)
df_dk = df_dk.sort_values('wind_bins_sort')

fig = px.box(
    df_dk, 
    x="wind_bins_str", 
    y="price", 
    title="Distribution des Prix par Tranche de Production Éolienne",
    labels={"wind_bins_str": "Tranches de Production Éolienne (MW)", "price": "Prix (€/MWh)"},
    template="plotly_white",
    color_discrete_sequence=['#1f77b4']
)
fig.update_xaxes(type='category') # Force l'ordre catégoriel
fig.show()

### 5. Modélisation Prédictive (LightGBM)

In [5]:
# Feature Engineering pour le modèle
df_model = df_dk.copy()
df_model['price_lag_24h'] = df_model['price'].shift(24)
df_model['wind_forecast_lag_24h'] = df_model['wind_generation'].shift(24) 
df_model = df_model.dropna()

features = ['hour', 'day_of_week', 'month', 'price_lag_24h', 'load_forecast', 'wind_forecast_lag_24h', 'solar_generation']
target = 'price'

# Split Train/Test (Test sur les 6 derniers mois de 2019)
split = '2019-06-01'
X_train = df_model.loc[df_model.index < split, features]
y_train = df_model.loc[df_model.index < split, target]
X_test = df_model.loc[df_model.index >= split, features]
y_test = df_model.loc[df_model.index >= split, target]

# Entraînement
model = lgb.LGBMRegressor(random_state=42, n_estimators=500)
model.fit(X_train, y_train)

# Prédiction et Métriques
y_pred = model.predict(X_test)
print(f"RMSE Test : {np.sqrt(mean_squared_error(y_test, y_pred)):.2f} €/MWh")

# SHAP
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

shap_sum = np.abs(shap_values).mean(axis=0)
importance_df = pd.DataFrame({'feature': features, 'importance': shap_sum}).sort_values('importance', ascending=True)

fig = px.bar(importance_df, x='importance', y='feature', orientation='h', title="Importance des Features (SHAP)")
fig.show()

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000545 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1064
[LightGBM] [Info] Number of data points in the train set: 12362, number of used features: 7
[LightGBM] [Info] Start training from score 43.041228
RMSE Test : 16.14 €/MWh
