# Timeseries Forecasting (ETS model)

## 1. Installing dependencies

In [None]:
# !pip install datasetsforecast

In [None]:
# !pip install sktime

In [None]:
# !pip install statsforecast

In [None]:
# Basics
import pandas as pd
import numpy as np

# Some functions for plotting and stuff
import utils as ts_utils
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation, PillowWriter
from IPython.display import Image
import ast

# Statistical models
from statsforecast import StatsForecast
from statsforecast.models import MSTL, AutoETS

# Retrieving the parameters
from statsforecast.arima import arima_string

## 2. Data Preparation

In [None]:
# Select which export to load: the size variant and the date tag of the file name
data_size = 'norm'
data_date = '2110'  # date tag used in the file name, e.g. '1806' = 18th of June

# Read the data (takes around 2 minutes)
dataset_path = f"~/Thesis/data/eod_balances_{data_date}_{data_size}.csv"
dataset = pd.read_csv(dataset_path)

dataset

In [None]:
# Create the timer (Timer comes from the project-local `utils` module, imported as ts_utils)
# NOTE(review): `timer` is not referenced by any other cell visible in this notebook —
# confirm whether it is still needed.
timer = ts_utils.Timer()

### 2.1 In-sample and Out-sample split

In [None]:
# Calculate total amount of timeseries (every column except the leading date column)
num_timeseries = len(dataset.columns) - 1

# Fraction of the series that goes into the in-sample set
train_test_split = 0.8

# Number of series in the in-sample set
num_in_sample = int(train_test_split * num_timeseries)

# Backward-compatible alias: despite its name, this has always held the
# in-sample count (it is used as the in-sample column boundary below)
num_out_of_sample = num_in_sample

# Create in-sample dataframe: date column + the first `num_in_sample` series
in_sample_data = dataset.iloc[:, : num_in_sample + 1] # Training and testing

# Create out-sample dataframe: date column + the remaining `n` series.
# A positive slice start is used on purpose: the previous `columns[-n:]` returns
# ALL columns when n == 0, which duplicated the date column and leaked every
# series into the out-sample set.
n = num_timeseries - num_in_sample
columns_to_keep = dataset.columns[[0]].tolist() + dataset.columns[num_in_sample + 1:].tolist()
out_sample_data = dataset[columns_to_keep]

## 3. In-sample Analysis

### 3.1 Train/Test splitting and plotting

In [None]:
# Reshape from wide (one column per series) to long format, and use 'ds'
# as the name of the date column
Y_df = (
    in_sample_data
    .melt(id_vars=['date'], var_name='unique_id', value_name='y')
    .rename(columns={'date': 'ds'})
)

# Parse the date column into datetime values
Y_df['ds'] = pd.to_datetime(Y_df['ds'])

In [None]:
# Evaluation span: 12 "months" of 30 days each
fh = 30
horizon = fh * 12

# Chronologically ordered list of the distinct dates in the dataset
unique_dates = sorted(Y_df['ds'].unique())

# Cutoff date: the date sitting `horizon` positions before the last date
cutoff_date = unique_dates[-(horizon + 1)]

# Training data: every row dated on or before the cutoff
Y_train_df = Y_df.loc[Y_df['ds'] <= cutoff_date]

In [None]:
# Containers for the 6 (input, test) pairs
input_dfs, test_dfs = [], []

# Build one input set and one test set per evaluation period
for i in range(6):
    # Position of the test-period start, counted backwards from the last date;
    # consecutive periods are 2 * fh dates apart
    start_offset = horizon - i * 2 * fh
    test_start_date = unique_dates[-start_offset]
    test_end_date = unique_dates[-(start_offset - fh)]

    # Input data: everything up to and including the start date
    up_to_start = Y_df['ds'] <= test_start_date
    input_df = Y_df[up_to_start]
    input_dfs.append(input_df)

    # Test data: the fh dates that follow the start date
    in_test_window = (Y_df['ds'] > test_start_date) & (Y_df['ds'] <= test_end_date)
    test_df = Y_df[in_test_window]
    test_dfs.append(test_df)

# Expose the 6 input periods under individual names
(Y_input_df_0, Y_input_df_1, Y_input_df_2,
 Y_input_df_3, Y_input_df_4, Y_input_df_5) = input_dfs

# Expose the 6 test periods under individual names
(Y_test_df_0, Y_test_df_1, Y_test_df_2,
 Y_test_df_3, Y_test_df_4, Y_test_df_5) = test_dfs

In [None]:
# Function to plot the series and create the gif
def create_timeseries_gif(unique_id, Y_df):
    """
    Animate the rolling evaluation windows of one series and save them as a GIF.

    Every one of the 6 frames plots the complete series and highlights the
    training region (up to the major cutoff), the input region (up to the start
    of that frame's test period) and the test period itself.

    The window positions are derived from the notebook-level variables
    `horizon` and `fh`, which must be defined before this function is called.

    Parameters:
    - unique_id: Identifier of the series to plot; compared against Y_df['unique_id'].
    - Y_df (pd.DataFrame): Long-format data containing 'unique_id', 'ds', 'y' columns.

    Returns:
    - IPython.display.Image: The GIF written to 'figures/{unique_id}_timeseries.gif'.

    Raises:
    - ValueError: If the series has fewer than `horizon + 1` distinct dates
      (this includes an unknown `unique_id`, which yields no rows at all).
    """
    import os  # local import: only needed to make sure the output folder exists

    # Keep only the requested series, in chronological order
    ts_data = Y_df[Y_df['unique_id'] == unique_id].sort_values('ds')

    # Determine the unique dates of this series
    unique_dates = sorted(ts_data['ds'].unique())

    # Fail early with a readable message instead of an IndexError further down
    if len(unique_dates) < horizon + 1:
        raise ValueError(
            f"Series '{unique_id}' has {len(unique_dates)} distinct dates, but at least "
            f"{horizon + 1} are needed to place the evaluation windows."
        )

    # Define the major cutoff date
    cutoff_date = unique_dates[-(horizon + 1)]

    # Vertical extent of the shaded regions; identical for every frame
    y_min, y_max = ts_data['y'].min(), ts_data['y'].max()

    # Initialize figure with a larger size for higher quality
    fig, ax = plt.subplots(figsize=(18, 5))

    # Define colors for the different regions
    colors = {
        'train': '#a6bddb',
        'input': '#fd8d3c',
        'test': '#feb24c',
    }

    # Function to update the plot for each frame
    def update(frame):
        ax.clear()
        ax.plot(ts_data['ds'], ts_data['y'], color='blue', linewidth=0.75)

        # Boundaries of this frame's test period, counted backwards from the last date
        test_start_date = unique_dates[-(horizon - frame * 2 * fh)]
        test_end_date = unique_dates[-(horizon - (frame * 2 * fh) - fh)]

        # Train data fill (diagonal lines)
        ax.fill_between(ts_data['ds'], y_min, y_max,
                        where=(ts_data['ds'] <= cutoff_date),
                        facecolor='none', edgecolor=colors['train'], hatch='//', linewidth=0, label='Train Data')

        # Input data fill (from the start of the timeseries to the start of the test period)
        ax.fill_between(ts_data['ds'], y_min, y_max,
                        where=(ts_data['ds'] <= test_start_date),
                        facecolor=colors['input'], alpha=0.15, label='Input Data')

        # Test data fill
        ax.fill_between(ts_data['ds'], y_min, y_max,
                        where=((ts_data['ds'] > test_start_date) & (ts_data['ds'] <= test_end_date)),
                        facecolor=colors['test'], alpha=0.5, label='Test Data')

        # Add the major cutoff vertical line
        ax.axvline(cutoff_date, color='black', linestyle='dashdot', linewidth=1)

        # Add gray vertical dashed lines on each side of the test period
        ax.axvline(test_start_date, color='gray', linestyle='--', linewidth=0.75)
        ax.axvline(test_end_date, color='gray', linestyle='--', linewidth=0.75)

        # Set plot title and labels
        ax.set_title(f'Evaluation Visualisation - Test Period {frame+1}')
        ax.set_xlabel('Date')
        ax.set_ylabel('Value')
        ax.legend(loc='upper left')

    # Create an animation with one frame per test period
    ani = FuncAnimation(fig, update, frames=6, repeat=False)

    # Save the animation as a gif; fps=1 keeps every frame on screen for a second
    gif_path = f'figures/{unique_id}_timeseries.gif'
    os.makedirs(os.path.dirname(gif_path), exist_ok=True)
    try:
        ani.save(gif_path, writer=PillowWriter(fps=1))
    finally:
        # Release the figure even when saving fails
        plt.close(fig)

    # Display the GIF in the notebook
    return Image(gif_path)

# Render the evaluation-window animation for one example series
unique_id = '6'
create_timeseries_gif(unique_id=unique_id, Y_df=Y_df)

### 3.2 Training models using correct seasonality

In [None]:
def forecast_mstl_autoets(Y_input_df, seasonalities_df, h=30, levels=(60, 70, 80, 90), freq='D', n_jobs=5):
    """
    Forecast time series data using MSTL + AutoETS model with specified seasonalities.

    Series are grouped by their list of season lengths and each group is fitted
    in one StatsForecast run. Groups with a non-empty list use MSTL with an
    AutoETS trend forecaster; groups whose list is empty use AutoETS directly.
    Series without an entry in `seasonalities_df` are reported and skipped.

    Parameters:
    - Y_input_df (pd.DataFrame): Input DataFrame containing 'unique_id', 'ds', 'y' columns.
    - seasonalities_df (pd.DataFrame): DataFrame containing 'unique_id' and 'best_fit' columns;
      'best_fit' holds the string form of a Python list of season lengths (e.g. "[7, 30]" or "[]").
    - h (int): Forecast horizon.
    - levels (sequence of int): Confidence levels for prediction intervals.
    - freq (str): Frequency of the time series data.
    - n_jobs (int): Number of parallel jobs to run.

    Returns:
    - pd.DataFrame: Forecasts with prediction intervals, containing 'unique_id', 'ds', 'ETS',
      and interval columns. For a level L the lower bound is named 'ETS-lo-{100 - L}'
      and the upper bound 'ETS-hi-{L}'.

    Raises:
    - ValueError: If no series is left to forecast (empty input, or no series
      has seasonality information).
    """
    # Work on a private list so the caller's object is never shared with the models
    levels = list(levels)

    # Step 1: Process 'best_fit' column to get 'season_length_list'
    seasonalities_df = seasonalities_df.copy()
    seasonalities_df['season_length_list'] = seasonalities_df['best_fit'].apply(ast.literal_eval)
    seasonalities_df['unique_id'] = seasonalities_df['unique_id'].astype(str)

    # Step 2: Merge seasonality info with input data (ids compared as strings)
    Y_input_df = Y_input_df.copy()
    Y_input_df['unique_id'] = Y_input_df['unique_id'].astype(str)
    Y_input_df = Y_input_df.merge(
        seasonalities_df[['unique_id', 'season_length_list']],
        on='unique_id',
        how='left'
    )

    # Check for any missing seasonality information
    missing_seasonalities = Y_input_df[Y_input_df['season_length_list'].isnull()]['unique_id'].unique()
    if len(missing_seasonalities) > 0:
        print(f"Warning: Missing seasonality information for unique_ids: {missing_seasonalities}")
        # Drop these time series
        Y_input_df = Y_input_df.dropna(subset=['season_length_list'])

    # Step 3: Group data by season_length_tuple (lists are not hashable, tuples are)
    Y_input_df['season_length_tuple'] = Y_input_df['season_length_list'].apply(tuple)
    grouped = Y_input_df.groupby('season_length_tuple')

    forecast_dfs = []

    # Iterate over each group and perform forecasting
    for season_length_tuple, group_df in grouped:
        season_length_list = list(season_length_tuple)
        num_series = group_df['unique_id'].nunique()

        print(f"Forecasting {num_series} series with seasonality {season_length_list}...")

        # Extract necessary columns
        data = group_df[['unique_id', 'ds', 'y']].copy()

        # Pick the model; `model_col` is the column prefix used for it in the
        # forecast output
        if season_length_list:
            model = MSTL(
                season_length=season_length_list,
                trend_forecaster=AutoETS(model=["Z", "Z", "N"])
            )
            model_col = 'MSTL'
        else:
            print("Note: We found seasonality of [], we skip the MSTL model here.")
            model = AutoETS(model=["Z", "Z", "N"])
            model_col = 'AutoETS'

        # Fit and forecast the whole group with the specified confidence levels
        stats_forecast = StatsForecast(models=[model], freq=freq, n_jobs=n_jobs)
        preds = stats_forecast.forecast(df=data, h=h, level=levels)

        # Bring both model variants to the same 'ETS' column names
        forecast_dfs.append(_rename_to_ets_columns(preds, model_col, levels))

    if not forecast_dfs:
        raise ValueError(
            "No series left to forecast: the input is empty or none of its "
            "unique_ids has seasonality information."
        )

    # Combine all forecasts into a single DataFrame.
    # NOTE(review): reset_index is presumably there for statsforecast versions that
    # return 'unique_id' as the index; when it is already a column, the extra
    # 'index' column is discarded by the selection below — confirm against the
    # installed version.
    ets_model_insample_preds = pd.concat(forecast_dfs).reset_index(drop=False)

    # Keep only necessary columns
    cols_to_keep = ['unique_id', 'ds', 'ETS'] + [
        col for col in ets_model_insample_preds.columns
        if col.startswith('ETS-lo-') or col.startswith('ETS-hi-')
    ]
    ets_model_insample_preds = ets_model_insample_preds[cols_to_keep]

    print("Forecasting completed.")

    return ets_model_insample_preds


def _rename_to_ets_columns(preds, model_col, levels):
    """Rename the '<model_col>', '<model_col>-lo-L' and '<model_col>-hi-L' columns to the 'ETS' naming."""
    mapping = {model_col: 'ETS'}
    for level in levels:
        # Lower bounds are labelled with the tail percentage (e.g. level=90 -> 'ETS-lo-10'),
        # upper bounds keep the level itself (e.g. 'ETS-hi-90')
        tail = 100 - level
        mapping[f'{model_col}-lo-{level}'] = f'ETS-lo-{tail}'
        mapping[f'{model_col}-hi-{level}'] = f'ETS-hi-{level}'
    return preds.rename(columns=mapping)

## 4. In-sample predictions

In [None]:
# Forecasting horizon and confidence levels for the prediction intervals
h, levels = 30, [60, 70, 80, 90]

# Load the seasonalities that we got from seasonality_detection.ipynb
seasonalities_path = f"~/Thesis/models/best_fits_{data_date}_full.csv"
seasonalities = pd.read_csv(seasonalities_path)

In [None]:
# Predict the first period; the configured horizon and confidence levels are
# passed explicitly so that changing `h` / `levels` actually takes effect
ets_model_insample_preds_0 = forecast_mstl_autoets(
    Y_input_df=Y_input_df_0,
    seasonalities_df=seasonalities,
    h=h,
    levels=levels,
)

# Save the prediction
ets_model_insample_preds_0.to_csv(f'~/Thesis/predictions/ETS/insample/period01/model_preds_{data_date}_{data_size}.csv', index=False)

In [None]:
# Predict the second period; the configured horizon and confidence levels are
# passed explicitly so that changing `h` / `levels` actually takes effect
ets_model_insample_preds_1 = forecast_mstl_autoets(
    Y_input_df=Y_input_df_1,
    seasonalities_df=seasonalities,
    h=h,
    levels=levels,
)

# Save the prediction
ets_model_insample_preds_1.to_csv(f'~/Thesis/predictions/ETS/insample/period02/model_preds_{data_date}_{data_size}.csv', index=False)

In [None]:
# Predict the third period; the configured horizon and confidence levels are
# passed explicitly so that changing `h` / `levels` actually takes effect
ets_model_insample_preds_2 = forecast_mstl_autoets(
    Y_input_df=Y_input_df_2,
    seasonalities_df=seasonalities,
    h=h,
    levels=levels,
)

# Save the prediction
ets_model_insample_preds_2.to_csv(f'~/Thesis/predictions/ETS/insample/period03/model_preds_{data_date}_{data_size}.csv', index=False)

In [None]:
# Predict the fourth period; the configured horizon and confidence levels are
# passed explicitly so that changing `h` / `levels` actually takes effect
ets_model_insample_preds_3 = forecast_mstl_autoets(
    Y_input_df=Y_input_df_3,
    seasonalities_df=seasonalities,
    h=h,
    levels=levels,
)

# Save the prediction
ets_model_insample_preds_3.to_csv(f'~/Thesis/predictions/ETS/insample/period04/model_preds_{data_date}_{data_size}.csv', index=False)

In [None]:
# Predict the fifth period; the configured horizon and confidence levels are
# passed explicitly so that changing `h` / `levels` actually takes effect
ets_model_insample_preds_4 = forecast_mstl_autoets(
    Y_input_df=Y_input_df_4,
    seasonalities_df=seasonalities,
    h=h,
    levels=levels,
)

# Save the prediction
ets_model_insample_preds_4.to_csv(f'~/Thesis/predictions/ETS/insample/period05/model_preds_{data_date}_{data_size}.csv', index=False)

In [None]:
# Predict the sixth period; the configured horizon and confidence levels are
# passed explicitly so that changing `h` / `levels` actually takes effect
ets_model_insample_preds_5 = forecast_mstl_autoets(
    Y_input_df=Y_input_df_5,
    seasonalities_df=seasonalities,
    h=h,
    levels=levels,
)

# Save the prediction
ets_model_insample_preds_5.to_csv(f'~/Thesis/predictions/ETS/insample/period06/model_preds_{data_date}_{data_size}.csv', index=False)

## 5. Out-sample predictions

### 5.1 Data handling

In [None]:
# Reshape the out-sample data from wide to long format and use 'ds' as the
# name of the date column.
# Note: this rebinds `Y_df`, which held the in-sample data until this cell.
Y_df = (
    out_sample_data
    .melt(id_vars=['date'], var_name='unique_id', value_name='y')
    .rename(columns={'date': 'ds'})
)

# Parse the date column into datetime values
Y_df['ds'] = pd.to_datetime(Y_df['ds'])

In [None]:
# Evaluation span: 12 "months" of 30 days each
fh = 30
horizon = fh * 12

# Chronologically ordered list of the distinct dates in the out-sample data
unique_dates = sorted(Y_df['ds'].unique())

# Cutoff date: the date sitting `horizon` positions before the last date
cutoff_date = unique_dates[-(horizon + 1)]

# Training data: every row dated on or before the cutoff
Y_train_df = Y_df.loc[Y_df['ds'] <= cutoff_date]

In [None]:
# Containers for the 6 (input, test) pairs of the out-sample data
input_dfs, test_dfs = [], []

# Build one input set and one test set per evaluation period
for i in range(6):
    # Position of the test-period start, counted backwards from the last date;
    # consecutive periods are 2 * fh dates apart
    start_offset = horizon - i * 2 * fh
    test_start_date = unique_dates[-start_offset]
    test_end_date = unique_dates[-(start_offset - fh)]

    # Input data: everything up to and including the start date
    up_to_start = Y_df['ds'] <= test_start_date
    input_df = Y_df[up_to_start]
    input_dfs.append(input_df)

    # Test data: the fh dates that follow the start date
    in_test_window = (Y_df['ds'] > test_start_date) & (Y_df['ds'] <= test_end_date)
    test_df = Y_df[in_test_window]
    test_dfs.append(test_df)

# Expose the 6 input periods under individual names
# (these names previously referred to the in-sample periods)
(Y_input_df_0, Y_input_df_1, Y_input_df_2,
 Y_input_df_3, Y_input_df_4, Y_input_df_5) = input_dfs

# Expose the 6 test periods under individual names
(Y_test_df_0, Y_test_df_1, Y_test_df_2,
 Y_test_df_3, Y_test_df_4, Y_test_df_5) = test_dfs

### 5.2 Predictions

In [None]:
# Forecasting horizon and confidence levels for the prediction intervals
h, levels = 30, [60, 70, 80, 90]

# Load the seasonalities that we got from seasonality_detection.ipynb
seasonalities_path = f"~/Thesis/models/best_fits_{data_date}_full.csv"
seasonalities = pd.read_csv(seasonalities_path)

In [None]:
# Predict the first period; the configured horizon and confidence levels are
# passed explicitly so that changing `h` / `levels` actually takes effect
ets_model_outsample_preds_0 = forecast_mstl_autoets(
    Y_input_df=Y_input_df_0,
    seasonalities_df=seasonalities,
    h=h,
    levels=levels,
)

# Save the prediction
ets_model_outsample_preds_0.to_csv(f'~/Thesis/predictions/ETS/outsample/period01/model_preds_{data_date}_{data_size}.csv', index=False)

In [None]:
# Predict the second period; the configured horizon and confidence levels are
# passed explicitly so that changing `h` / `levels` actually takes effect
ets_model_outsample_preds_1 = forecast_mstl_autoets(
    Y_input_df=Y_input_df_1,
    seasonalities_df=seasonalities,
    h=h,
    levels=levels,
)

# Save the prediction
ets_model_outsample_preds_1.to_csv(f'~/Thesis/predictions/ETS/outsample/period02/model_preds_{data_date}_{data_size}.csv', index=False)

In [None]:
# Predict the third period; the configured horizon and confidence levels are
# passed explicitly so that changing `h` / `levels` actually takes effect
ets_model_outsample_preds_2 = forecast_mstl_autoets(
    Y_input_df=Y_input_df_2,
    seasonalities_df=seasonalities,
    h=h,
    levels=levels,
)

# Save the prediction
ets_model_outsample_preds_2.to_csv(f'~/Thesis/predictions/ETS/outsample/period03/model_preds_{data_date}_{data_size}.csv', index=False)

In [None]:
# Predict the fourth period; the configured horizon and confidence levels are
# passed explicitly so that changing `h` / `levels` actually takes effect
ets_model_outsample_preds_3 = forecast_mstl_autoets(
    Y_input_df=Y_input_df_3,
    seasonalities_df=seasonalities,
    h=h,
    levels=levels,
)

# Save the prediction
ets_model_outsample_preds_3.to_csv(f'~/Thesis/predictions/ETS/outsample/period04/model_preds_{data_date}_{data_size}.csv', index=False)

In [None]:
# Predict the fifth period; the configured horizon and confidence levels are
# passed explicitly so that changing `h` / `levels` actually takes effect
ets_model_outsample_preds_4 = forecast_mstl_autoets(
    Y_input_df=Y_input_df_4,
    seasonalities_df=seasonalities,
    h=h,
    levels=levels,
)

# Save the prediction
ets_model_outsample_preds_4.to_csv(f'~/Thesis/predictions/ETS/outsample/period05/model_preds_{data_date}_{data_size}.csv', index=False)

In [None]:
# Predict the sixth period; the configured horizon and confidence levels are
# passed explicitly so that changing `h` / `levels` actually takes effect
ets_model_outsample_preds_5 = forecast_mstl_autoets(
    Y_input_df=Y_input_df_5,
    seasonalities_df=seasonalities,
    h=h,
    levels=levels,
)

# Save the prediction
ets_model_outsample_preds_5.to_csv(f'~/Thesis/predictions/ETS/outsample/period06/model_preds_{data_date}_{data_size}.csv', index=False)