### ============================================================================
### Imports et configuration
### ============================================================================

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns
from datetime import datetime
import warnings
# Silence every warning category for the remainder of the kernel session.
# NOTE(review): this blanket filter also hides deprecation and parsing
# warnings raised by the libraries used below — consider narrowing it to
# specific categories or modules.
warnings.filterwarnings('ignore')

# Visualization configuration: apply the matplotlib style sheet first, then
# set seaborn's default color palette. Both calls change global plotting
# state, so their relative order is kept as written.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# Confirm that setup completed and print the execution time (minute precision).
print("Environnement configuré avec succès!")
print(f"Date d'exécution: {datetime.now().strftime('%Y-%m-%d %H:%M')}")


Environnement configuré avec succès!
Date d'exécution: 2025-11-21 16:23


### ============================================================================
### Téléchargement dataset
### ============================================================================

In [2]:
import urllib.request
import os

# URL of the 60-minute dataset and the local path it is cached at
url = "https://data.open-power-system-data.org/time_series/2020-10-06/time_series_60min_singleindex.csv"
destination = "../data/raw/time_series_60min.csv"

# Create the target folder if it does not exist (derived from `destination`
# so the two can never drift apart)
os.makedirs(os.path.dirname(destination), exist_ok=True)

# Download only when no cached copy is present
if not os.path.exists(destination):
    print("⏳ Téléchargement du dataset (124 MB)... Patience!")
    # Fetch into a temporary sibling file and rename it once the transfer has
    # finished. Writing straight to `destination` would leave a truncated CSV
    # behind if the download is interrupted, and the existence check above
    # would then treat that partial file as a valid cache on every later run.
    partial = destination + ".part"
    try:
        urllib.request.urlretrieve(url, partial)
        os.replace(partial, destination)
    finally:
        # After a successful rename the temp file is gone and this is a no-op;
        # after a failure or interrupt it removes the incomplete download.
        if os.path.exists(partial):
            os.remove(partial)
    print("Dataset téléchargé!")
else:
    print(" Dataset déjà présent localement")


 Dataset déjà présent localement


### ============================================================================
### Chargement et inspection initiale
### ============================================================================

In [3]:
# Load the hourly CSV, parsing both timestamp columns, and index the frame
# by the UTC timestamp in a single chained expression.
df = pd.read_csv(
    '../data/raw/time_series_60min.csv',
    parse_dates=['utc_timestamp', 'cet_cest_timestamp'],
    low_memory=False,
).set_index('utc_timestamp')

# Figures reported in the summary below
first_ts, last_ts = df.index.min(), df.index.max()
mem_mb = df.memory_usage(deep=True).sum() / 1024**2

print(f" Shape du dataset: {df.shape}")
print(f" Période: {first_ts} → {last_ts}")
print(f" Mémoire: {mem_mb:.1f} MB")
print("\n Premières lignes:")
# Last expression of the cell: rich display of the first five rows
df.head()


 Shape du dataset: (50401, 299)
 Période: 2014-12-31 23:00:00+00:00 → 2020-09-30 23:00:00+00:00
 Mémoire: 121.1 MB

 Premières lignes:


Unnamed: 0_level_0,cet_cest_timestamp,AT_load_actual_entsoe_transparency,AT_load_forecast_entsoe_transparency,AT_price_day_ahead,AT_solar_generation_actual,AT_wind_onshore_generation_actual,BE_load_actual_entsoe_transparency,BE_load_forecast_entsoe_transparency,BE_solar_generation_actual,BE_wind_generation_actual,...,SI_load_actual_entsoe_transparency,SI_load_forecast_entsoe_transparency,SI_solar_generation_actual,SI_wind_onshore_generation_actual,SK_load_actual_entsoe_transparency,SK_load_forecast_entsoe_transparency,SK_solar_generation_actual,SK_wind_onshore_generation_actual,UA_load_actual_entsoe_transparency,UA_load_forecast_entsoe_transparency
utc_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-12-31 23:00:00+00:00,2015-01-01 00:00:00+01:00,,,,,,,,,,...,,,,,,,,,,
2015-01-01 00:00:00+00:00,2015-01-01 01:00:00+01:00,5946.0,6701.0,35.0,,69.0,9484.0,9897.0,,,...,,,,,,,,,,
2015-01-01 01:00:00+00:00,2015-01-01 02:00:00+01:00,5726.0,6593.0,45.0,,64.0,9152.0,9521.0,,734.81,...,1045.47,816.0,,1.17,2728.0,2860.0,3.8,,,
2015-01-01 02:00:00+00:00,2015-01-01 03:00:00+01:00,5347.0,6482.0,41.0,,65.0,8799.0,9135.0,,766.64,...,1004.79,805.0,,1.04,2626.0,2810.0,3.8,,,
2015-01-01 03:00:00+00:00,2015-01-01 04:00:00+01:00,5249.0,6454.0,38.0,,64.0,8567.0,8909.0,,733.13,...,983.79,803.0,,1.61,2618.0,2780.0,3.8,,,


### ============================================================================
### Sélection colonnes FRANCE
### ============================================================================

In [4]:
# Collect every column whose name starts with the France prefix
fr_cols = [name for name in df.columns if name.startswith('FR_')]

print(f"Colonnes disponibles pour la France: {len(fr_cols)}")
print("\n Liste des colonnes:")
for col in fr_cols:
    print(f"  • {col}")

# Independent copy restricted to the France columns
df_fr = df.loc[:, fr_cols].copy()

# Completeness report: percentage of missing values per column, highest
# first, listing only the columns that have at least one gap
banner = "=" * 80
print("\n" + banner)
print(" COMPLÉTUDE DES DONNÉES FRANCE")
print(banner)
missing_pct = (
    df_fr.isna().sum()
    .div(len(df_fr))
    .mul(100)
    .sort_values(ascending=False)
)
print(missing_pct.loc[missing_pct > 0].map("{:.2f}%".format))


Colonnes disponibles pour la France: 4

 Liste des colonnes:
  • FR_load_actual_entsoe_transparency
  • FR_load_forecast_entsoe_transparency
  • FR_solar_generation_actual
  • FR_wind_onshore_generation_actual

 COMPLÉTUDE DES DONNÉES FRANCE
FR_load_actual_entsoe_transparency      0.09%
FR_solar_generation_actual              0.03%
FR_wind_onshore_generation_actual       0.02%
FR_load_forecast_entsoe_transparency    0.01%
dtype: object
