In [1]:
# Imports
import pandas as pd
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

from re_forecast.data.get_data import get_rte_data
from re_forecast.preprocessing.preprocess_data import preprocess_data
from re_forecast.exploration.plot import plot_time_serie

# Training with SARIMA

Objectives of this notebook :  
- Base steps :  
  - Import data
  - Impute missing values
  - Develop a basic train/test split function for time series
- Baseline :  
  - Compute a baseline score
- Stationarity :  
  - Test for stationarity with an Augmented Dickey-Fuller (ADF) test
  - Preprocess the data and iterate with the computation of the ADF
- ARIMA :  
  - Plot the ACF and the PACF graphs and determine the coefficient for the ARIMA (and later SARIMA) models
  - Test different configurations of ARIMA models

## Base steps

Steps :
- Import data
- Impute missing values
- Develop a basic train/test split function for time series

In [2]:
# Set the parameters
ressource_nb = 2
start_date = "2022-06-01 00:00:00" # Note: don't modify the start & end date for following requests
end_date = "2024-03-01 00:00:00"
eic_code = "17W0000014455708"
production_type = None
production_subtype = None

# Download generation data: don't forget to set the api_delay_bypass to true in the params in this case,
# and to reset it to false if you know that you will download fresh data from the api
gen_data = get_rte_data(ressource_nb = ressource_nb,
                        start_date = start_date,
                        end_date = end_date,
                        eic_code = eic_code,
                        production_type = production_type,
                        production_subtype = production_subtype)

# Preprocess the data with the preprocess pipeline: check data quality,
# construct a time-consistent df, constrain min and max values and impute missing values
gen_data_dt_preprocessed = preprocess_data(gen_data)

# Plot
plot_time_serie(gen_data_dt_preprocessed,
                "",
                "value",
                dt_index = True)

In [3]:
def check_split_values(split_values_dict: dict,
                       split_values_names: tuple = ("train_split", "test_split"),
                       split_values_bounds: tuple = (0, 1)
                       ) -> bool:
    """Check function for the train_test_split_time_serie.
    Check:
    - if there is only one split value
    - if the split value name is amoung the valid split value names
    - if the split value is between 0 and 1 excluded
    Arguments:
    - split_value_dict: dict matching split value names and values
    Parameters:
    - split_values_names: tuple containing the names of the split values arguments
    - split_values_bounds: bounds to respect for the split values"""

    # Check if the dict is lenght 1
    if len(split_values_dict) != 1:
        return False

    # Check if the name of the split value is among the accepted names
    if not all(key in split_values_names for key in split_values_dict.keys()):
        return False

    # Check if the split value is between 0 and 1
    if not all(split_values_bounds[0] < value < split_values_bounds[1] for value in split_values_dict.values()):
        return False

    return True


def compute_split_value(split_value_dict: dict,
                        split_values_names: tuple = ("train_split", "test_split")
                        ) -> float:
    """Compute the split value depending on the split type,
    train or test.
    Arguments:
    - split_value_dict: dict matching split value names and values
    Parameters:
    - split_values_names: tuple containing the names of the split values arguments"""

    # Extract the name of the split value
    split_value_key = list(split_value_dict.keys())[0]

    # If the split value is equal to "train_split",
    # just return the split value corresponding
    if split_value_key == split_values_names[0]:
        return split_value_dict[split_value_key]

    # Else return 1 - the split value given
    return 1 - split_value_dict[split_value_key]


# Develop a basic train/test split function for time series
def train_test_split_time_serie(gen_df: pd.DataFrame,
                                **split_value: float | None
                                ) -> tuple:
    """Train test split a time serie. You can enter a train
    proportion or a test proportion.
    Arguments:
    - gen_df: the time serie df to train test split
    - **split_value: the split proportion, train or test"""

    # Verify the split values
    if not check_split_values(split_value):
        raise ValueError("Enter one valid split value between 0 and 1 excluded")

    # Compute the split value corresponding
    # to the split type, train or test
    split_value = compute_split_value(split_value)

    # Compute the index corresponding to the split value
    split_index = int(split_value*len(gen_df))

    return gen_df.iloc[0:split_index, :], gen_df.iloc[split_index:, :]

In [10]:
# Test the train test split function
gen_data_train, gen_data_test = train_test_split_time_serie(gen_data_dt_preprocessed,
                                                            train_split = 0.7)

(                     value
 start_date_complete       
 2023-11-02 00:00:00  158.0
 2023-11-02 01:00:00  135.0
 2023-11-02 02:00:00  139.0
 2023-11-02 03:00:00  159.0
 2023-11-02 04:00:00  144.0
 ...                    ...
 2024-01-24 18:00:00    0.8
 2024-01-24 19:00:00    0.8
 2024-01-24 20:00:00    2.2
 2024-01-24 21:00:00    7.2
 2024-01-24 22:00:00    1.0
 
 [2015 rows x 1 columns],
                      value
 start_date_complete       
 2024-01-24 23:00:00  103.2
 2024-01-25 00:00:00   57.0
 2024-01-25 01:00:00   32.0
 2024-01-25 02:00:00   21.0
 2024-01-25 03:00:00   14.0
 ...                    ...
 2024-02-29 19:00:00  106.0
 2024-02-29 20:00:00  179.0
 2024-02-29 21:00:00  141.0
 2024-02-29 22:00:00   82.0
 2024-02-29 23:00:00   29.0
 
 [865 rows x 1 columns])

## Baseline

Compute a baseline score.