In [1]:
# Imports
import pandas as pd

from re_forecast.data.get_data import get_rte_data
from re_forecast.preprocessing.datetime import construct_time_consistent_df
from re_forecast.preprocessing.plot import plot_time_serie

# Preprocessing step 2 bis : Clean values

Goals of this notebook :  
- Develop a function to normalize the data and its inverse  
- Develop a function to stationarize the data and its inverse  
- Develop a function to remove volatility and its inverse  
- Develop a function to remove the average seasonality and its inverse  
- Pack all these functions into a CleanValues object  

/!\ Option : Pack all the functions into a sklearn pipeline

Don't forget to find how to deal with nan values

## 0/ Load data

We load the dataset corresponding to the Fecamp generation unit :

In [2]:
# Set the parameters of the request (generation unit: Fecamp, identified by its EIC code)
# NOTE(review): the meaning of ressource_nb = 2 is defined by the project helper get_rte_data — confirm there
ressource_nb = 2
start_date = "2022-06-01 00:00:00" # Note: don't modify the start & end date for following requests
end_date = "2024-03-01 00:00:00"
eic_code = "17W0000014455708"
production_type = None
production_subtype = None

# Download generation data: don't forget to set the api_delay_bypass to true in the params in this case,
# and to reset it to false if you know that you will download fresh data from the api
gen_data = get_rte_data(ressource_nb = ressource_nb,
                        start_date = start_date,
                        end_date = end_date,
                        eic_code = eic_code,
                        production_type = production_type,
                        production_subtype = production_subtype)

# Construct the time consistent df from the two datetime columns of the raw data
# (the result, gen_data_dt, is the input of every following section)
dt_columns = ["start_date", "end_date"]
gen_data_dt = construct_time_consistent_df(gen_data, dt_columns)

## 0 bis/ Construct a function to format the dataframe

We want to construct a function that extracts a specific datetime column from a df, sets it as the index and drops all other columns apart from the value column.

In [3]:
# droped_columns = gen_data_dt.columns[~gen_data_dt.columns.isin(["value"])]
# gen_data_dt.drop(droped_columns, axis = 1)

In [4]:
def peel_time_serie_df(gen_df: pd.DataFrame,
                       keeped_columns = None
                       ) -> pd.DataFrame:
    """Keep only the chosen value column of a df and index it by the
    chosen datetime column.
    Arguments:
    - gen_df: df containing at least the datetime column and the value column.
    It is left untouched, a new df is returned
    - keeped_columns: dict with the keys "dt_column" and "value_column" giving
    the names of the columns to keep. Missing keys (or None) fall back to
    "start_date_complete" and "value"
    Returns:
    - A new df with the datetime column as index and the value column as
    only column
    Raises:
    - KeyError: if the datetime column or the value column is not in gen_df"""

    # Build the column names from the defaults, overridden by the user choices.
    # The defaults live here rather than in the signature to avoid sharing
    # a mutable default argument between calls
    default_columns = {"dt_column": "start_date_complete", "value_column": "value"}
    columns = {**default_columns, **(keeped_columns or {})}
    dt_column = columns["dt_column"]
    value_column = columns["value_column"]

    # Fail loudly instead of silently returning a df without any column
    missing_columns = [column for column in (dt_column, value_column)
                       if column not in gen_df.columns]
    if missing_columns:
        raise KeyError(f"Column(s) {missing_columns} not found in the dataframe")

    # set_index returns a new df, so the original gen_df is never modified;
    # the final copy guarantees the result is independent from gen_df
    return gen_df.set_index(dt_column, drop = True)[[value_column]].copy()

In [5]:
# Test the function with its default column names; the peeled df
# (datetime index + value column) is the input of the sections below
gen_data_dt_peeled = peel_time_serie_df(gen_data_dt)

## 1/ Normalization function

In [6]:
# Define a custom error NotFittedError
class NotFittedError(Exception):
    """Signal that a scaler object was asked to transform or
    inverse_transform data before its fit method was called."""

In [7]:
class NormalScaler:
    """Standardize a time serie df (subtract the mean, divide by the
    standard deviation) and revert the operation.

    The statistics are learned by fit and reused by transform and
    inverse_transform. A standard deviation exactly equal to zero (constant
    serie) is replaced by 1 when scaling, so that a constant serie is mapped
    to zeros instead of NaN and can still be inverted."""

    def __init__(self):
        """Initialize self.mean, self.std and the
        error message used in case of error handling."""

        # Integer zeros are the "not fitted yet" sentinel: fit overwrites them
        # with the statistics computed by pandas
        self.mean, self.std = (0, 0)

        # Create the error message
        self.error_message = "The fit method must be called before the transform and the inverse_transform method"

    def _check_is_fitted(self):
        """Raise NotFittedError while self.mean still holds the integer sentinel."""

        if isinstance(self.mean, int):
            raise NotFittedError(f"{self.error_message}")

    def _scale(self):
        """Return self.std with exact zeros replaced by 1, to avoid dividing by zero."""

        # Case df fitted: the std is a Series with one entry per column
        if isinstance(self.std, pd.Series):
            return self.std.mask(self.std == 0, 1.0)

        # Case scalar std
        return 1.0 if self.std == 0 else self.std

    def fit(self, gen_df: pd.DataFrame):
        """Extract the mean and the std of a time serie df
        Arguments:
        - gen_df: df with one column value and a datetime index
        Returns:
        - self, to allow chaining (scaler.fit(df).transform(df))"""

        # Extract the mean and the std of the df
        self.mean, self.std = gen_df.mean(), gen_df.std()

        return self

    def transform(self, gen_df: pd.DataFrame) -> pd.DataFrame:
        """Normalize the time serie.
        Arguments:
        - gen_df: df with one column value and a datetime index
        Raises:
        - NotFittedError: if called before fit"""

        # Raise the not fitted error if the transform method is called before fit
        self._check_is_fitted()

        # Normalize the df
        return (gen_df - self.mean) / self._scale()

    def fit_transform(self, gen_df: pd.DataFrame) -> pd.DataFrame:
        """Extract the mean and the std of a time serie df and
        normalize the time serie.
        Arguments:
        - gen_df: df with one column value and a datetime index"""

        # Fit, then transform: Normalization
        return self.fit(gen_df).transform(gen_df)

    def inverse_transform(self, gen_df_normalized: pd.DataFrame) -> pd.DataFrame:
        """Inverse normalize the time serie.
        Arguments:
        - gen_df_normalized: normalized df with one column value
        and a datetime index
        Raises:
        - NotFittedError: if called before fit"""

        # Raise the not fitted error if the inverse_transform method is called before fit
        self._check_is_fitted()

        # Invert the normalization with the same scale as the one used in transform
        return gen_df_normalized * self._scale() + self.mean

In [8]:
# Instantiate the NormalScaler
norm_scaler = NormalScaler()

# Fit the scaler on the peeled serie and normalize it in one step
gen_data_dt_norm = norm_scaler.fit_transform(gen_data_dt_peeled)

# Plot the normalized serie
# NOTE(review): "" and "value" are positional arguments of the project helper
# plot_time_serie; their meaning is not visible here — confirm against its signature
plot_time_serie(gen_data_dt_norm,
                "",
                "value",
                dt_index = True)

In [9]:
# Test the inverse transformation: applied to the normalized serie with the
# same fitted scaler, it is expected to give back the original values
gen_data_dt_inv_norm = norm_scaler.inverse_transform(gen_data_dt_norm)

# Plot the de-normalized serie, to compare visually with the original one
plot_time_serie(gen_data_dt_inv_norm,
                "",
                "value",
                dt_index = True)

## 2/ Stationarize

In [10]:
def stationarize_time_serie(gen_df: pd.DataFrame,
                            order: int,
                            ) -> pd.DataFrame:
    """Stationarize the time serie by differencing it `order` times.
    Arguments:
    - gen_df: df with one column value and a datetime index. It is left
    untouched, a new df is returned
    - order: number of successive differencings to apply. 0 returns an
    unchanged copy of gen_df
    Returns:
    - The differenced df. Each differencing turns one more leading row
    into NaN, as the first value has no predecessor
    Raises:
    - ValueError: if order is negative
    Note: the inverse operation (integration) is not implemented here."""

    # Copy the original df
    gen_df_diff = gen_df.copy(deep = True)

    # Case order == 0, just return the copy
    if not order:
        return gen_df_diff

    # A negative order has no meaning: fail instead of silently
    # returning the serie undifferenced
    if order < 0:
        raise ValueError(f"order must be a non negative integer, got {order}")

    # Differentiate sequentially the gen_df_diff, once per order
    for _ in range(order):
        gen_df_diff = gen_df_diff.diff()

    return gen_df_diff

In [49]:
# Test the function on the normalized serie
# Note: with an order of 0 no differencing is applied, the result is a plain copy of the input
gen_data_dt_st = stationarize_time_serie(gen_data_dt_norm, 0)

# Plot the resulting serie
plot_time_serie(gen_data_dt_st,
                "",
                "value",
                dt_index = True)

## 3/ Remove volatility

The volatility is the variation of the standard deviation over time. To remove the volatility, we compute the standard deviation over a rolling window (e.g. one day or one month) and divide the values by it.

In [13]:
class VolatilityRemover:
    """Remove the volatility (rolling standard deviation) of a time serie df
    by dividing the values by it, and revert the operation.

    The volatility is learned by fit and is indexed like the fitted df: the
    division and the multiplication are aligned by pandas on the index, so
    transform and inverse_transform are meant to be applied to data sharing
    the index of the fitted df. A volatility exactly equal to zero (flat
    window) is replaced by 1 when scaling, so that flat segments are left
    unchanged instead of producing inf or NaN."""

    def __init__(self):
        """Initialize self.volatility and the
        error message used in case of error handling."""

        # Integer zero is the "not fitted yet" sentinel: fit overwrites it
        # with the rolling standard deviation computed by pandas
        self.volatility = 0

        # Set the error message
        self.error_message = "The fit method must be called before the transform and the inverse_transform method"

    def _check_is_fitted(self):
        """Raise NotFittedError while self.volatility still holds the integer sentinel."""

        if isinstance(self.volatility, int):
            raise NotFittedError(f"{self.error_message}")

    def _scale(self):
        """Return self.volatility with exact zeros replaced by 1, to avoid dividing by zero."""

        return self.volatility.mask(self.volatility == 0, 1.0)

    def fit(self,
            gen_df: pd.DataFrame,
            window_size: int
            ):
        """Extract the rolling volatility of a time serie df
        Arguments:
        - gen_df: df with one column value and a datetime index
        - window_size: size of the window for the rolling standard deviation
        Returns:
        - self, to allow chaining (remover.fit(df, n).transform(df))"""

        # Compute the volatility. The first rows have an incomplete window and
        # are NaN: they are back-filled with the first available value
        self.volatility = gen_df.rolling(window_size).std().bfill()

        return self

    def transform(self, gen_df: pd.DataFrame) -> pd.DataFrame:
        """Remove the volatility of the time serie.
        Arguments:
        - gen_df: df with one column value and a datetime index
        Raises:
        - NotFittedError: if called before fit"""

        # Raise the not fitted error if the transform method is called before fit
        self._check_is_fitted()

        # Remove the volatility from the original dataframe
        return gen_df / self._scale()

    def fit_transform(self,
                      gen_df: pd.DataFrame,
                      window_size: int
                      ) -> pd.DataFrame:
        """Extract the rolling volatility of a time serie df and
        remove the volatility of the time serie.
        Arguments:
        - gen_df: df with one column value and a datetime index
        - window_size: size of the window for the rolling standard deviation"""

        # Fit, then transform: Remove volatility
        return self.fit(gen_df, window_size).transform(gen_df)

    def inverse_transform(self, gen_df: pd.DataFrame) -> pd.DataFrame:
        """Re-add the volatility to the time serie.
        Arguments:
        - gen_df: df without volatility with one column value
        and a datetime index
        Raises:
        - NotFittedError: if called before fit"""

        # Raise the not fitted error if the inverse_transform method is called before fit
        self._check_is_fitted()

        # Multiply by the same scale as the one used in transform
        return gen_df * self._scale()

## 4/ Remove the average seasonality

Just copy and modify the VolatilityRemover object

In [50]:
class AverageSeasonalityRemover:
    """Remove the average seasonality (rolling mean) of a time serie df
    by subtracting it from the values, and revert the operation.

    The average seasonality is learned by fit and is indexed like the fitted
    df: the subtraction and the addition are aligned by pandas on the index,
    so transform and inverse_transform are meant to be applied to data
    sharing the index of the fitted df."""

    def __init__(self):
        """Initialize self.avg_seasonality and the
        error message used in case of error handling."""

        # Integer zero is the "not fitted yet" sentinel: fit overwrites it
        # with the rolling mean computed by pandas
        self.avg_seasonality = 0

        # Set the error message
        self.error_message = "The fit method must be called before the transform and the inverse_transform method"

    def _check_is_fitted(self):
        """Raise NotFittedError while self.avg_seasonality still holds the integer sentinel."""

        if isinstance(self.avg_seasonality, int):
            raise NotFittedError(f"{self.error_message}")

    def fit(self,
            gen_df: pd.DataFrame,
            window_size: int
            ):
        """Extract the avg_seasonality of a time serie df
        Arguments:
        - gen_df: df with one column value and a datetime index
        - window_size: size of the window for the rolling mean
        Returns:
        - self, to allow chaining (remover.fit(df, n).transform(df))"""

        # Compute the avg_seasonality. The first rows have an incomplete window
        # and are NaN: they are back-filled with the first available value
        self.avg_seasonality = gen_df.rolling(window_size).mean().bfill()

        return self

    def transform(self, gen_df: pd.DataFrame) -> pd.DataFrame:
        """Remove the avg_seasonality of the time serie.
        Arguments:
        - gen_df: df with one column value and a datetime index
        Raises:
        - NotFittedError: if called before fit"""

        # Raise the not fitted error if the transform method is called before fit
        self._check_is_fitted()

        # Remove the avg_seasonality from the original dataframe
        return gen_df - self.avg_seasonality

    def fit_transform(self,
                      gen_df: pd.DataFrame,
                      window_size: int
                      ) -> pd.DataFrame:
        """Extract the avg_seasonality of a time serie df and
        remove the avg_seasonality of the time serie.
        Arguments:
        - gen_df: df with one column value and a datetime index
        - window_size: size of the window for the rolling mean"""

        # Fit, then transform: Remove avg_seasonality
        return self.fit(gen_df, window_size).transform(gen_df)

    def inverse_transform(self, gen_df: pd.DataFrame) -> pd.DataFrame:
        """Re-add the avg_seasonality to the time serie.
        Arguments:
        - gen_df: df without avg_seasonality with one column value
        and a datetime index
        Raises:
        - NotFittedError: if called before fit"""

        # Raise the not fitted error if the inverse_transform method is called before fit
        self._check_is_fitted()

        # Add back the avg_seasonality
        return gen_df + self.avg_seasonality

In [53]:
# Instantiate the AverageSeasonalityRemover
avg_seasonality_rmv = AverageSeasonalityRemover()

# Fit on the peeled serie and remove its average seasonality in one step
# (rolling window of 24 rows)
gen_data_dt_seasonality_rmv = avg_seasonality_rmv.fit_transform(gen_data_dt_peeled,
                                                                window_size = 24)

# Plot the serie without its average seasonality
plot_time_serie(gen_data_dt_seasonality_rmv,
                "",
                "value",
                dt_index = True)

## Option : Construct a sklearn pipeline including all the transformation functions