## Import modules

In [1]:
# Import external modules
import os
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

pd.options.plotting.backend = "plotly"

## Import raw data

In [2]:
import gdown

url = "https://drive.google.com/drive/folders/10fxlNGVm3xIJQLB958CU56T6UTGp_md0?usp=drive_link"
gdown.download_folder(url, quiet=True, use_cookies=False)

In [2]:
# Define used paths & filenames
raw_data_path = "data/raw/"
raw_power_filename = os.path.join(
    raw_data_path, "consommation-quotidienne-brute-regionale.csv"
)
raw_weather_filename = os.path.join(raw_data_path, "donnees-synop-essentielles-omm.csv")

In [3]:
CONFIG_POWER = {
    "Date": "Date - Heure",
    "Region": "Région",
    "Power": "Consommation brute électricité (MW) - RTE",
    "Status": "Statut - RTE",
}

CONFIG_WEATHER = {
    "Date": "Date",
    "Region": "region (name)",
    "Temperature": "Température (°C)",
    "Nebulosite": "Nebulosité totale",
}

dict_diff = {
    x: CONFIG_WEATHER[x] for x in CONFIG_WEATHER.keys() if x not in CONFIG_POWER.keys()
}

DATACONFIG = {x: CONFIG_POWER[x] for x in CONFIG_POWER.keys() if x != "Status"}
DATACONFIG.update(dict_diff)

In [4]:
class Dataloader:
    def __init__(self) -> None:
        pass

    @classmethod
    def pre_treatment(
        cls, data, config, frequency: str = "h", resampling_mode: str = "sum"
    ):
        data = data[list(config.values())]
        data[config["Date"]] = pd.to_datetime(
            data[config["Date"]].str.replace("T", " ").str.replace("+", " +")
        )
        data = (
            data.groupby([config["Date"], config["Region"]])
            .first()
            .sort_index(level=config["Date"])
        )
        data = data.unstack(level=config["Region"]).asfreq("30T").interpolate()
        data = data.resample(frequency).agg(resampling_mode)
        data = data.stack(level=config["Region"])
        return data

    @classmethod
    def load_power(cls, path: str, frequency: str = "h", resampling_mode: str = "sum"):
        df = pd.read_csv(path, sep=";")
        df = df[df[CONFIG_POWER["Status"]] == "Définitif"].drop(
            CONFIG_POWER["Status"], axis=1
        )
        config = {x: CONFIG_POWER[x] for x in CONFIG_POWER.keys()}
        config.pop("Status")
        df = cls.pre_treatment(
            data=df,
            config=config,
            frequency=frequency,
            resampling_mode=resampling_mode,
        )
        return df

    @classmethod
    def load_weather(
        cls, path: str, frequency: str = "h", resampling_mode: str = "sum"
    ):
        df = pd.read_csv(path, sep=";")
        df = cls.pre_treatment(
            data=df,
            config=CONFIG_WEATHER,
            frequency=frequency,
            resampling_mode=resampling_mode,
        )
        return df

    @classmethod
    def load_data(
        cls,
        path_power: str,
        path_weather: str,
        frequency: str = "h",
        resampling_mode: str = "sum",
    ):
        df_power = cls.load_power(
            path_power, frequency=frequency, resampling_mode=resampling_mode
        )
        df_weather = cls.load_weather(
            path_weather, frequency=frequency, resampling_mode=resampling_mode
        )
        df_tot = (
            df_power.reset_index()
            .merge(
                df_weather,
                left_on=[CONFIG_POWER["Date"], CONFIG_POWER["Region"]],
                right_on=[CONFIG_WEATHER["Date"], CONFIG_WEATHER["Region"]],
            )
            .set_index([DATACONFIG["Date"], DATACONFIG["Region"]])
        )
        del df_power
        del df_weather
        return df_tot

In [5]:
df_tot = Dataloader.load_data(
    path_power=raw_power_filename, path_weather=raw_weather_filename
)

  data[config["Date"]] = pd.to_datetime(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[config["Date"]] = pd.to_datetime(
  data[config["Date"]] = pd.to_datetime(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[config["Date"]] = pd.to_datetime(


## Define feature eng functions

In [6]:
# Temporal Component Extraction
# Capture temporal patterns (time of day, day of week) influencing energy consumption

# Exemple of function


def extract_time_features(df):
    # Convert 'Date - Time' index to time column
    df.reset_index(inplace=True)
    df["Date - Heure"] = pd.to_datetime(df["Date - Heure"])

    # Time component extraction
    df["heure"] = df["Date - Heure"].dt.hour
    df["jour"] = df["Date - Heure"].dt.day
    df["mois"] = df["Date - Heure"].dt.month
    df["jour_de_la_semaine"] = df["Date - Heure"].dt.dayofweek
    df["est_weekend"] = df["jour_de_la_semaine"].isin([5, 6]).astype(int)

    # Reset 'Date - Time' and 'Region' as indexes if necessary
    df.set_index(["Date - Heure", "Région"], inplace=True)
    return df

In [7]:
# Standardization
# put temperature data on a common scale for fair comparison

# Exemple of function


def normalize_temperature(df):
    scaler = MinMaxScaler()
    temp_values = df["Température (°C)"].values.reshape(-1, 1)
    df["Température_norm"] = scaler.fit_transform(temp_values)
    return df

In [8]:
# Categorization
# Transform continuous data into categories to simplify impact of cloudiness on energy consumption analysis

# Exemple of functions


def categorize_cloudiness(df):
    bins = [0, 50, 100, 150, 200, 250]
    labels = ["Très Clair", "Clair", "Partiellement Nuageux", "Nuageux", "Très Nuageux"]
    df["Nebulosité_cat"] = pd.cut(df["Nebulosité totale"], bins=bins, labels=labels)
    # One-hot encoding for categorical variables (e.g. categorized cloud cover)
    df = pd.get_dummies(df, columns=["Nebulosité_cat"])
    return df


def high_consumption_indicator(df, threshold=5000):
    df["Haute_Consommation"] = (
        df["Consommation brute électricité (MW) - RTE"] > threshold
    ).astype(int)
    return df

In [9]:
# Lag feature => How past energy consumption can influence future forecasts
# Capturing Temporal Dynamics: Lag features enable the model to understand and integrate the impact of past conditions and behaviors on current energy consumption forecasts.


def create_extended_lag_features(df):
    df.reset_index(inplace=True)
    df.sort_values(by=["Date - Heure", "Région"], inplace=True)
    # Define the different lag intervals in hours
    lag_intervals = {
        "day_1": 24,  # 1 day
        "day_7": 24 * 7,  # 7 days
        "week_1": 24 * 7,  # 1 week
        "week_4": 24 * 28,  # 4 weeks
        "year_1": 24 * 365,  # 1 year
    }
    # Create lag features for each interval
    for lag_name, lag_hours in lag_intervals.items():
        df[f"Consumption_lag_{lag_name}"] = df.groupby("Région")[
            "Consommation brute électricité (MW) - RTE"
        ].shift(periods=lag_hours)
    df.set_index(["Date - Heure", "Région"], inplace=True)
    # Drop rows with any NaN values generated due to shifting
    df.dropna(inplace=True)

    return df

## Apply the feature eng fuctions

In [10]:
df_clean = df_tot.copy()
df_clean = extract_time_features(df_clean)
df_clean = normalize_temperature(df_clean)
df_clean = categorize_cloudiness(df_clean)
df_clean = high_consumption_indicator(df_clean)
df_clean = create_extended_lag_features(df_clean)

In [11]:
df_clean

Unnamed: 0_level_0,Unnamed: 1_level_0,Consommation brute électricité (MW) - RTE,Température (°C),Nebulosité totale,heure,jour,mois,jour_de_la_semaine,est_weekend,Température_norm,Nebulosité_cat_Très Clair,Nebulosité_cat_Clair,Nebulosité_cat_Partiellement Nuageux,Nebulosité_cat_Nuageux,Nebulosité_cat_Très Nuageux,Haute_Consommation,Consumption_lag_day_1,Consumption_lag_day_7,Consumption_lag_week_1,Consumption_lag_week_4,Consumption_lag_year_1
Date - Heure,Région,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-01-01 00:00:00+01:00,Auvergne-Rhône-Alpes,17172.0,16.900000,200.000000,0,1,1,2,0,0.404723,False,False,False,True,False,1,17841.0,14600.0,14600.0,19726.0,8173.0
2014-01-01 00:00:00+01:00,Bourgogne-Franche-Comté,5070.0,11.000000,200.000000,0,1,1,2,0,0.351816,False,False,False,True,False,1,5147.0,4418.0,4418.0,5994.0,2357.0
2014-01-01 00:00:00+01:00,Bretagne,6667.0,15.700000,150.000000,0,1,1,2,0,0.393962,False,False,True,False,False,1,6376.0,6317.0,6317.0,6749.0,3050.0
2014-01-01 00:00:00+01:00,Centre-Val de Loire,5135.0,12.650000,105.000000,0,1,1,2,0,0.366612,False,False,True,False,False,1,5274.0,4848.0,4848.0,6196.0,2476.0
2014-01-01 00:00:00+01:00,Grand Est,10604.0,12.350000,185.000000,0,1,1,2,0,0.363922,False,False,False,True,False,1,10726.0,9710.0,9710.0,13322.0,4943.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-31 23:00:00+01:00,Nouvelle-Aquitaine,12110.0,16.266667,200.000000,23,31,12,1,0,0.399043,False,False,False,True,False,1,13032.0,10095.0,10095.0,14323.0,13077.0
2019-12-31 23:00:00+01:00,Occitanie,10984.0,10.000000,200.000000,23,31,12,1,0,0.342849,False,False,False,True,False,1,11326.0,8857.0,8857.0,11970.0,12083.0
2019-12-31 23:00:00+01:00,Pays de la Loire,8133.0,15.050000,200.833333,23,31,12,1,0,0.388133,False,False,False,False,True,1,8777.0,6852.0,6852.0,9390.0,7366.0
2019-12-31 23:00:00+01:00,Provence-Alpes-Côte d'Azur,12127.0,8.900000,52.500000,23,31,12,1,0,0.332985,False,True,False,False,False,1,12073.0,10644.0,10644.0,11629.0,12459.0


In [12]:
df_clean.to_csv("clean-data-consolidated.csv")

In [13]:
df_clean

Unnamed: 0_level_0,Unnamed: 1_level_0,Consommation brute électricité (MW) - RTE,Température (°C),Nebulosité totale,heure,jour,mois,jour_de_la_semaine,est_weekend,Température_norm,Nebulosité_cat_Très Clair,Nebulosité_cat_Clair,Nebulosité_cat_Partiellement Nuageux,Nebulosité_cat_Nuageux,Nebulosité_cat_Très Nuageux,Haute_Consommation,Consumption_lag_day_1,Consumption_lag_day_7,Consumption_lag_week_1,Consumption_lag_week_4,Consumption_lag_year_1
Date - Heure,Région,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-01-01 00:00:00+01:00,Auvergne-Rhône-Alpes,17172.0,16.900000,200.000000,0,1,1,2,0,0.404723,False,False,False,True,False,1,17841.0,14600.0,14600.0,19726.0,8173.0
2014-01-01 00:00:00+01:00,Bourgogne-Franche-Comté,5070.0,11.000000,200.000000,0,1,1,2,0,0.351816,False,False,False,True,False,1,5147.0,4418.0,4418.0,5994.0,2357.0
2014-01-01 00:00:00+01:00,Bretagne,6667.0,15.700000,150.000000,0,1,1,2,0,0.393962,False,False,True,False,False,1,6376.0,6317.0,6317.0,6749.0,3050.0
2014-01-01 00:00:00+01:00,Centre-Val de Loire,5135.0,12.650000,105.000000,0,1,1,2,0,0.366612,False,False,True,False,False,1,5274.0,4848.0,4848.0,6196.0,2476.0
2014-01-01 00:00:00+01:00,Grand Est,10604.0,12.350000,185.000000,0,1,1,2,0,0.363922,False,False,False,True,False,1,10726.0,9710.0,9710.0,13322.0,4943.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-31 23:00:00+01:00,Nouvelle-Aquitaine,12110.0,16.266667,200.000000,23,31,12,1,0,0.399043,False,False,False,True,False,1,13032.0,10095.0,10095.0,14323.0,13077.0
2019-12-31 23:00:00+01:00,Occitanie,10984.0,10.000000,200.000000,23,31,12,1,0,0.342849,False,False,False,True,False,1,11326.0,8857.0,8857.0,11970.0,12083.0
2019-12-31 23:00:00+01:00,Pays de la Loire,8133.0,15.050000,200.833333,23,31,12,1,0,0.388133,False,False,False,False,True,1,8777.0,6852.0,6852.0,9390.0,7366.0
2019-12-31 23:00:00+01:00,Provence-Alpes-Côte d'Azur,12127.0,8.900000,52.500000,23,31,12,1,0,0.332985,False,True,False,False,False,1,12073.0,10644.0,10644.0,11629.0,12459.0
