# CHARGEMENT ET EXPLORATION DES DONNÉES

### Chargement et Nettoyage Initial

In [1]:
import urllib.request
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
from statsmodels.tsa.seasonal import STL
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, root_mean_squared_error
import optuna
import shap
import seaborn as sns
import skimpy as sk
import summarytools as st
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore', FutureWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=optuna.exceptions.ExperimentalWarning)
import plotly.io as pio
import calendar
pio.templates.default = "plotly_white"

print("Environnement configuré avec succès!")
print(f"Date d'exécution: {datetime.now().strftime('%Y-%m-%d %H:%M')}")

Environnement configuré avec succès!
Date d'exécution: 2025-12-09 17:20


In [2]:
# dataset
df = pd.read_csv('../data/raw/time_series_60min_fr_dk.csv', 
    parse_dates=['utc_timestamp', 'cet_cest_timestamp'],
    low_memory=False)
df = df.set_index('utc_timestamp')
df.head()


Unnamed: 0_level_0,cet_cest_timestamp,DK_load_actual_entsoe_transparency,DK_load_forecast_entsoe_transparency,DK_solar_capacity,DK_solar_generation_actual,DK_wind_capacity,DK_wind_generation_actual,DK_wind_offshore_capacity,DK_wind_offshore_generation_actual,DK_wind_onshore_capacity,...,FR_solar_generation_actual,FR_wind_onshore_generation_actual,IT_NORD_FR_price_day_ahead,temperature_france,cloud_cover_france,temperature_denmark,cloud_cover_denmark,FR_nuclear_generation_actual,wind_speed_france,wind_speed_denmark
utc_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-12-31 23:00:00+00:00,2015-01-01 00:00:00+01:00,,,,,,,,,,...,,,,1.58902,0.288571,4.954803,0.82274,57982.0,0.387504,7.049201
2015-01-01 00:00:00+00:00,2015-01-01 01:00:00+01:00,,,489.0,,4643.0,,1264.0,,3379.0,...,,,,1.393066,0.281712,4.939117,0.906714,57787.0,0.387504,7.049201
2015-01-01 01:00:00+00:00,2015-01-01 02:00:00+01:00,3100.02,3126.8,489.0,,4643.0,2357.33,1264.0,902.71,3379.0,...,,1464.0,,1.31897,0.287806,4.970825,0.936386,57974.0,0.395347,7.16562
2015-01-01 02:00:00+00:00,2015-01-01 03:00:00+01:00,2980.39,3019.0,489.0,,4643.0,2387.35,1264.0,830.87,3379.0,...,,1543.0,,1.188934,0.292199,5.007904,0.942083,57522.0,0.487387,7.215656
2015-01-01 03:00:00+00:00,2015-01-01 04:00:00+01:00,2933.49,2976.3,489.0,,4643.0,2594.47,1264.0,915.43,3379.0,...,,1579.0,,1.03241,0.31295,5.076752,0.962194,57824.0,0.584224,7.344737


**Selection France**

In [3]:
# Sélectionner les colonnes France pertinentes
france_cols = ['FR_load_actual_entsoe_transparency', 'FR_load_forecast_entsoe_transparency',
               'FR_solar_generation_actual', 'FR_wind_onshore_generation_actual',
               'IT_NORD_FR_price_day_ahead', 'temperature_france',
               'cloud_cover_france', 'FR_nuclear_generation_actual', 'wind_speed_france']

df_france = df[france_cols].copy()
df_france.head()


Unnamed: 0_level_0,FR_load_actual_entsoe_transparency,FR_load_forecast_entsoe_transparency,FR_solar_generation_actual,FR_wind_onshore_generation_actual,IT_NORD_FR_price_day_ahead,temperature_france,cloud_cover_france,FR_nuclear_generation_actual,wind_speed_france
utc_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2014-12-31 23:00:00+00:00,,,,,,1.58902,0.288571,57982.0,0.387504
2015-01-01 00:00:00+00:00,,,,,,1.393066,0.281712,57787.0,0.387504
2015-01-01 01:00:00+00:00,69773.0,69750.0,,1464.0,,1.31897,0.287806,57974.0,0.395347
2015-01-01 02:00:00+00:00,66417.0,66300.0,,1543.0,,1.188934,0.292199,57522.0,0.487387
2015-01-01 03:00:00+00:00,64182.0,63250.0,,1579.0,,1.03241,0.31295,57824.0,0.584224


In [4]:

# Renommer les colonnes pour faciliter l'utilisation
df_france.columns = ['load', 'load_forecast', 'solar', 'wind',
                     'price_day_ahead', 'temperature', 'cloud_cover', 'nuclear', 'wind_speed']
print(f"Shape initial : {df_france.shape}")
#print(f"Périodе : {df_france['utc_timestamp'].min()} à {df_france['utc_timestamp'].max()}")
df_france.head()


Shape initial : (50401, 9)


Unnamed: 0_level_0,load,load_forecast,solar,wind,price_day_ahead,temperature,cloud_cover,nuclear,wind_speed
utc_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2014-12-31 23:00:00+00:00,,,,,,1.58902,0.288571,57982.0,0.387504
2015-01-01 00:00:00+00:00,,,,,,1.393066,0.281712,57787.0,0.387504
2015-01-01 01:00:00+00:00,69773.0,69750.0,,1464.0,,1.31897,0.287806,57974.0,0.395347
2015-01-01 02:00:00+00:00,66417.0,66300.0,,1543.0,,1.188934,0.292199,57522.0,0.487387
2015-01-01 03:00:00+00:00,64182.0,63250.0,,1579.0,,1.03241,0.31295,57824.0,0.584224


**Valeurs manquantes et doublons**

* Doublons

In [5]:
# Vérification des doublons sur l'index utc_timestamp
total = len(df_france)
unique = df_france.index.nunique()
dup = total - unique

print(f"Total lignes: {total}")
print(f"Lignes uniques par utc_timestamp: {unique}")
print(f"Doublons détectés: {dup}")

if dup:
    dup_timestamps = df.index[df.index.duplicated(keep=False)].unique()
    print(f"Nombre de timestamps dupliqués uniques: {len(dup_timestamps)}")
    display(pd.DataFrame({"duplicated_timestamp": dup_timestamps}).head(20))
    # Afficher un échantillon des lignes dupliquées pour inspection
    sample_ts = dup_timestamps[:5]
    for ts in sample_ts:
        print(f"\nExemple pour timestamp dupliqué: {ts}")
        display(df_france.loc[ts])
else:
    print("Aucun doublon trouvé sur utc_timestamp.")

Total lignes: 50401
Lignes uniques par utc_timestamp: 50401
Doublons détectés: 0
Aucun doublon trouvé sur utc_timestamp.


* Valeurs manquantes

In [6]:
# Quantification et visualisation des valeurs manquantes (df_france)

missing_count = df_france.isna().sum()
missing_pct = (missing_count / len(df_france)) * 100
missing_df_all = (
    pd.DataFrame({"missing_count": missing_count, "missing_pct": missing_pct})
    .sort_values("missing_pct", ascending=False)
)
display(missing_df_all)

# Bar plot des pourcentages de valeurs manquantes
fig_missing_bar = px.bar(
    missing_df_all.reset_index().rename(columns={"index": "column"}),
    x="missing_pct",
    y="column",
    orientation="h",
    text="missing_pct",
    title="Pourcentage de valeurs manquantes par colonne (df_france)",
    labels={"missing_pct": "% NaN", "column": "Colonne"},
)
fig_missing_bar.update_traces(texttemplate="%{text:.2f}%", textposition="outside")

fig_missing_bar.show()

Unnamed: 0,missing_count,missing_pct
price_day_ahead,24825,49.254975
load,44,0.0873
solar,15,0.029761
wind,8,0.015873
nuclear,5,0.00992
load_forecast,3,0.005952
temperature,0,0.0
cloud_cover,0,0.0
wind_speed,0,0.0


In [7]:
# Périodes avec valeurs manquantes pour IT_NORD_FR_price_day_ahead
col = "price_day_ahead"
mask = df_france[col].isna()

if not mask.any():
    print(f"Aucune valeur manquante pour {col}.")
else:
    # numéroter les runs (changes de state)
    run_id = (mask != mask.shift(1)).cumsum()
    runs = (
        df_france[mask]
        .groupby(run_id[mask])
        .apply(lambda x: pd.Series({
            "start": x.index.min(),
            "end": x.index.max(),
            "n_points": len(x)
        }))
        .reset_index(drop=True)
    )
    runs["duration_hours"] = (runs["end"] - runs["start"]) / np.timedelta64(1, "h") + 1
    runs = runs.sort_values("start").reset_index(drop=True)

    print(f"Nombre de périodes disjointes avec des NaN pour {col} : {len(runs)}")
    display(runs)

    overall = pd.Series({
        "first_nan": runs["start"].min(),
        "last_nan": runs["end"].max(),
        "total_nan_points": int(mask.sum()),
        "total_points": len(df_france),
        "nan_pct": mask.mean() * 100
    })
    display(overall)

Nombre de périodes disjointes avec des NaN pour price_day_ahead : 5


Unnamed: 0,start,end,n_points,duration_hours
0,2014-12-31 23:00:00+00:00,2015-01-04 23:00:00+00:00,97,97.0
1,2015-03-29 02:00:00+00:00,2015-03-29 03:00:00+00:00,2,2.0
2,2016-03-27 02:00:00+00:00,2016-03-27 03:00:00+00:00,2,2.0
3,2017-03-26 02:00:00+00:00,2017-03-26 03:00:00+00:00,2,2.0
4,2017-12-05 22:00:00+00:00,2020-09-30 23:00:00+00:00,24722,24722.0


first_nan           2014-12-31 23:00:00+00:00
last_nan            2020-09-30 23:00:00+00:00
total_nan_points                        24825
total_points                            50401
nan_pct                             49.254975
dtype: object

***Troncature des données --- Limite du DataFrame à la plage de dates pour laquelle les données de prix sont disponibles***

In [8]:
start_date = '2015-01-05'
end_date = '2017-12-05'
df_france = df_france.loc[start_date:end_date]

In [9]:
missing_count = df_france.isna().sum()
missing_pct = (missing_count / len(df_france)) * 100
missing_df_all = (
    pd.DataFrame({"missing_count": missing_count, "missing_pct": missing_pct})
    .sort_values("missing_pct", ascending=False)
)
display(missing_df_all)

Unnamed: 0,missing_count,missing_pct
load,13,0.050813
price_day_ahead,8,0.03127
nuclear,3,0.011726
solar,0,0.0
load_forecast,0,0.0
wind,0,0.0
temperature,0,0.0
cloud_cover,0,0.0
wind_speed,0,0.0


***Imputation des valeurs manquantes***

In [10]:
# Utiliser l'interpolation linéaire pour les quelques NaN restants
df_france.interpolate(method='linear', inplace=True)

In [11]:
missing_count = df_france.isna().sum()
missing_pct = (missing_count / len(df_france)) * 100
missing_df_all = (
    pd.DataFrame({"missing_count": missing_count, "missing_pct": missing_pct})
    .sort_values("missing_pct", ascending=False)
)
display(missing_df_all)

Unnamed: 0,missing_count,missing_pct
load,0,0.0
load_forecast,0,0.0
solar,0,0.0
wind,0,0.0
price_day_ahead,0,0.0
temperature,0,0.0
cloud_cover,0,0.0
nuclear,0,0.0
wind_speed,0,0.0


### Statistiques descriptive

In [12]:
df_france.describe()

Unnamed: 0,load,load_forecast,solar,wind,price_day_ahead,temperature,cloud_cover,nuclear,wind_speed
count,25584.0,25584.0,25584.0,25584.0,25584.0,25584.0,25584.0,25584.0,25584.0
mean,53820.941252,53903.126837,926.280801,2300.758443,49.598433,12.560578,0.582787,44617.642433,2.686989
std,11571.977,11701.785429,1265.216606,1598.557605,16.5726,5.97153,0.19015,6682.434355,1.389591
min,30426.0,29650.0,0.0,262.0,5.0,-2.715515,0.014118,28074.0,0.025694
25%,45167.0,45150.0,0.0,1137.0,39.03,7.861191,0.463206,39334.25,1.636837
50%,52217.5,52300.0,103.0,1798.0,47.46,12.189209,0.600811,43334.5,2.563375
75%,61482.5,61650.0,1752.25,3002.0,56.3,17.068931,0.72404,49198.75,3.506684
max,93696.0,95050.0,5361.0,9373.0,206.12,29.214722,0.995049,61676.0,8.894589


In [13]:
sk.skim(df_france)

In [14]:
st.dfSummary(df_france)

No,Variable,Stats / Values,Freqs / (% of Valid),Graph,Missing
1,load [float64],Mean (sd) : 53820.9 (11572.0) min < med < max: 30426.0 < 52217.5 < 93696.0 IQR (CV) : 16315.5 (4.7),"19,131 distinct values",,0 (0.0%)
2,load_forecast [float64],Mean (sd) : 53903.1 (11701.8) min < med < max: 29650.0 < 52300.0 < 95050.0 IQR (CV) : 16500.0 (4.6),"1,342 distinct values",,0 (0.0%)
3,solar [float64],Mean (sd) : 926.3 (1265.2) min < med < max: 0.0 < 103.0 < 5361.0 IQR (CV) : 1752.2 (0.7),"4,219 distinct values",,0 (0.0%)
4,wind [float64],Mean (sd) : 2300.8 (1598.6) min < med < max: 262.0 < 1798.0 < 9373.0 IQR (CV) : 1865.0 (1.4),"5,883 distinct values",,0 (0.0%)
5,price_day_ahead [float64],Mean (sd) : 49.6 (16.6) min < med < max: 5.0 < 47.5 < 206.1 IQR (CV) : 17.3 (3.0),"5,816 distinct values",,0 (0.0%)
6,temperature [float64],Mean (sd) : 12.6 (6.0) min < med < max: -2.7 < 12.2 < 29.2 IQR (CV) : 9.2 (2.1),"25,042 distinct values",,0 (0.0%)
7,cloud_cover [float64],Mean (sd) : 0.6 (0.2) min < med < max: 0.0 < 0.6 < 1.0 IQR (CV) : 0.3 (3.1),"25,552 distinct values",,0 (0.0%)
8,nuclear [float64],Mean (sd) : 44617.6 (6682.4) min < med < max: 28074.0 < 43334.5 < 61676.0 IQR (CV) : 9864.5 (6.7),"15,580 distinct values",,0 (0.0%)
9,wind_speed [float64],Mean (sd) : 2.7 (1.4) min < med < max: 0.0 < 2.6 < 8.9 IQR (CV) : 1.9 (1.9),"25,570 distinct values",,0 (0.0%)


## EDA France

* **Analyse du "Price_day_ahead"**

In [15]:

# Distribution du prix
fig = go.Figure()
fig.add_trace(go.Histogram(x=df_france['price_day_ahead'], nbinsx=50, name='Historique'))
fig.update_layout(
    title="Distribution des Prix (price_day_ahead)",
    xaxis_title="Prix (€/MWh)",
    yaxis_title="Fréquence",
    template="plotly_white",
    hovermode="x unified",
    height=400
)
fig.show()


In [16]:

# Série temporelle du prix
fig = go.Figure()
fig.add_trace(go.Scatter(x=df_france.index, y=df_france['price_day_ahead'],
                         mode='lines', name='Prix', line=dict(color='blue')))
fig.update_layout(
    title="Évolution Temporelle du Prix (2015-22017)",
    xaxis_title="Date",
    yaxis_title="Prix (€/MWh)",
    template="plotly_white",
    hovermode="x unified",
    height=400
)
fig.show()


In [20]:
# Prix vs Charge
fig = go.Figure()
fig.add_trace(go.Scatter(x=df_france['load'], y=df_france['price_day_ahead'],
                         mode='markers', marker=dict(size=4, opacity=0.5, 
                         color=df_france['temperature'], colorscale='Viridis'),
                         text=df_france.index.strftime('%Y-%m-%d'),
                         hovertemplate='<b>Date:</b> %{text}<br><b>Charge:</b> %{x:.0f} MW<br><b>Prix:</b> %{y:.2f} €/MWh<extra></extra>'))
fig.update_layout(
    title="Prix vs Charge Électrique",
    xaxis_title="Charge (MW)",
    yaxis_title="Prix (€/MWh)",
    template="plotly_white",
    height=500
)
fig.show()


In [21]:

# Prix vs Température
fig = go.Figure()
fig.add_trace(go.Scatter(x=df_france['temperature'], y=df_france['price_day_ahead'],
                         mode='markers', marker=dict(size=4, opacity=0.5, color='red')))
fig.update_layout(
    title="Prix vs Température",
    xaxis_title="Température (°C)",
    yaxis_title="Prix (€/MWh)",
    template="plotly_white"
)
fig.show()


* **Saisonnalité**

In [22]:
#Analyse de la saisonnalité
df_seasonal = df_france.copy()
df_seasonal['month'] = df_seasonal.index.month_name()
df_seasonal['day_of_week'] = df_seasonal.index.day_name()
df_seasonal['hour'] = df_seasonal.index.hour

In [24]:
# Saisonnalité annuelle : distribution mensuelle des prix

fig = px.box(
    df_seasonal,
    x="month",
    y="price_day_ahead",
    points="outliers", 
    title="Saisonnalité annuelle : distribution mensuelle des prix",
    labels={"month": "Mois", "price": "Prix (€/MWh)"},
    template="plotly_white",
)
fig.update_layout(xaxis=dict(dtick=1))

fig.show()

In [25]:
# Saisonnalité quotidienne : distribution des prix par heure

fig = px.box(
    df_seasonal,
    x="hour",
    y="price_day_ahead",
    points="outliers",
    title="Saisonnalité quotidienne : distribution des prix par heure",
    labels={"hour": "Heure de la journée", "price": "Prix (€/MWh)"},
    template="plotly_white",
)
fig.update_xaxes(dtick=1)

fig.show()

In [26]:
# Semaine vs week-end
df_seasonal["week_period"] = np.where(
    df_seasonal["day_of_week"].isin(["Saturday", "Sunday"]),
    "Weekend",
    "Weekday",
)

fig_week = px.box(
    df_seasonal,
    x="week_period",
    y="price_day_ahead",
    color="week_period",
    points="outliers",
    title="Distribution des prix : Semaine vs Week-end",
    labels={"week_period": "", "price": "Prix (€/MWh)"},
    template="plotly_white",
)
fig_week.update_layout(showlegend=False)
fig_week.show()

In [27]:
# Saisons (été/hiver/printemps/automne)
SEASON_LABELS = {
    "December": "Winter", "January": "Winter", "February": "Winter",
    "March": "Spring", "April": "Spring", "May": "Spring",
    "June": "Summer", "July": "Summer", "August": "Summer",
    "September": "Autumn", "October": "Autumn", "November": "Autumn",
}
df_seasonal["season"] = df_seasonal["month"].map(SEASON_LABELS)
season_order = ["Winter", "Spring", "Summer", "Autumn"]

fig_season = px.box(
    df_seasonal,
    x="season",
    y="price_day_ahead",
    category_orders={"season": season_order},
    points="outliers",
    title="Distribution des prix : Saisons",
    labels={"season": "Saison", "price": "Prix (€/MWh)"},
    template="plotly_white",
)
fig_season.show()

In [28]:
# Saisonnalité hebdomadaire : distribution des prix par jour de la semaine

day_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

fig = px.box(
    df_seasonal,
    x="day_of_week",
    y="price_day_ahead",
    category_orders={"day_of_week": day_order},
    points="outliers",
    title="Saisonnalité hebdomadaire : distribution des prix par jour de la semaine",
    labels={"day_of_week": "Jour de la semaine", "price": "Prix (€/MWh)"},
    template="plotly_white",
)
fig.update_xaxes(tickangle=-45)

fig.show()

* **Analyse des correlations**

In [29]:
# Matrice de correlation 
corr_matrix = df_france.corr(method='pearson')

In [30]:
# Heatmap de la matrice de corrélation

fig = px.imshow(
    corr_matrix,
    text_auto=".2f",
    color_continuous_scale="RdBu",
    zmin=-1,
    zmax=1,
    title="Correlation Matrix of Variables",
    labels=dict(x="Variables", y="Variables", color="Corr"),
)
fig.update_layout(height=800, width=900, template="plotly_white")

fig.show() 

In [31]:

# Corrélations avec le prix
print("\n" + "=" * 80)
print("CORRÉLATIONS AVEC LE PRIX")
print("=" * 80)
correlations = df_france.corr()['price_day_ahead'].drop('price_day_ahead').sort_values(ascending=False)
print(correlations.round(4))

# Visualisation corrélations
fig = go.Figure()
fig.add_trace(go.Bar(x=correlations.values, y=correlations.index, 
                      orientation='h', marker=dict(color=correlations.values, 
                      colorscale='RdBu', cmid=0)))
fig.update_layout(
    title="Corrélations avec le Prix",
    xaxis_title="Corrélation",
    template="plotly_white",
    height=400
)
fig.show()



CORRÉLATIONS AVEC LE PRIX
load             0.4727
load_forecast    0.4678
nuclear          0.2685
wind_speed       0.0249
wind             0.0159
cloud_cover      0.0120
solar           -0.0355
temperature     -0.0854
Name: price_day_ahead, dtype: float64


## FEATURE ENGINEERING

* **Création des Features**

In [33]:
# Copier le dataframe
df_features = df_france.copy()


In [34]:
# === FEATURES TEMPORELLES ===
df_features['hour'] = df_features.index.hour
df_features['day_of_week'] = df_features.index.dayofweek # Monday=0, Sunday=6
df_features['day_of_year'] = df_features.index.dayofyear
df_features['month'] = df_features.index.month
df_features['year'] = df_features.index.year
df_features['is_weekend'] = (df_features['day_of_week'] >= 5).astype(int)


In [35]:
# Saison
season_map = {12: 'Winter', 1: 'Winter', 2: 'Winter',
              3: 'Spring', 4: 'Spring', 5: 'Spring',
              6: 'Summer', 7: 'Summer', 8: 'Summer',
              9: 'Fall', 10: 'Fall', 11: 'Fall'}
df_features['season'] = df_features['month'].map(season_map)


In [37]:
# === LAG FEATURES (Prix et Charge) ===
for lag in [1, 3, 6, 12, 24]:
    df_features[f'price_lag_{lag}h'] = df_features['price_day_ahead'].shift(lag)
    df_features[f'load_lag_{lag}h'] = df_features['load'].shift(lag)


In [38]:

# === ROLLING WINDOWS ===
for window in [6, 24]:
    df_features[f'price_rolling_mean_{window}h'] = df_features['price_day_ahead'].shift(1).rolling(window=window).mean()
    df_features[f'price_rolling_std_{window}h'] = df_features['price_day_ahead'].shift(1).rolling(window=window).std()
    df_features[f'load_rolling_mean_{window}h'] = df_features['load'].rolling(window=window).mean()


In [39]:
# === FEATURES DÉRIVÉES ===
df_features['renewable_generation'] = df_features['solar'] + df_features['wind']
df_features['total_generation'] = df_features['renewable_generation'] + df_features['nuclear']
df_features['price_delta'] = df_features['price_day_ahead'].diff()


In [40]:
# === INTERACTIVES ===
df_features['load_x_hour'] = df_features['load'] * df_features['hour'] / 100
df_features['temp_x_cloud'] = df_features['temperature'] * df_features['cloud_cover']


In [41]:

print(f"Features créées : {df_features.shape[1]}")
print(f"Observations après lag/rolling : {df_features.shape[0]}")


Features créées : 37
Observations après lag/rolling : 25584


In [42]:

# Nettoyer les NaN générés par les lags et rolling windows
df_features = df_features.dropna()
print(f"Observations finales : {df_features.shape[0]}")
print(f"\nNom des features : {list(df_features.columns)}")


Observations finales : 25560

Nom des features : ['load', 'load_forecast', 'solar', 'wind', 'price_day_ahead', 'temperature', 'cloud_cover', 'nuclear', 'wind_speed', 'hour', 'day_of_week', 'day_of_year', 'month', 'year', 'is_weekend', 'season', 'price_lag_1h', 'load_lag_1h', 'price_lag_3h', 'load_lag_3h', 'price_lag_6h', 'load_lag_6h', 'price_lag_12h', 'load_lag_12h', 'price_lag_24h', 'load_lag_24h', 'price_rolling_mean_6h', 'price_rolling_std_6h', 'load_rolling_mean_6h', 'price_rolling_mean_24h', 'price_rolling_std_24h', 'load_rolling_mean_24h', 'renewable_generation', 'total_generation', 'price_delta', 'load_x_hour', 'temp_x_cloud']


In [43]:

# Sauvegarder temporairement
df_features.to_csv('df_features_france.csv')
