In [None]:
# !pip install datasetsforecast
# !pip install statsforecast

In [None]:
# Basics
import pandas as pd
import numpy as np

# Some functions for plotting and stuff
import utils as ts_utils
import matplotlib.pyplot as plt

# Statistical models
from statsforecast import StatsForecast
from statsforecast.models import MSTL, AutoARIMA
from statsforecast.arima import arima_string

## 2. Data Preparation

In [None]:
# Size tag of the data file to read (it is part of the CSV file name below)
data_size = 'full'

# Date tag of the data file to read (also part of the CSV file name)
data_date = '2110' # presumably DDMM, e.g. '1806' = 18th of June -- TODO confirm

# Read the data (takes around 2 minutes)
# NOTE(review): the path is hard-coded to a home-directory location.
dataset = pd.read_csv(f"~/Thesis/data/eod_balances_{data_date}_{data_size}.csv")

# Show the loaded frame as the cell output
dataset

### 2.1 Pivot Data

In [None]:
# Reshape to long format in one chain: every non-date column becomes a
# series identified by `unique_id` with its values in `y`, the date column is
# renamed to `ds`, and `ds` is parsed to datetime.
Y_df = (
    dataset
    .melt(id_vars=['date'], var_name='unique_id', value_name='y')
    .rename(columns={'date': 'ds'})
    .assign(ds=lambda frame: pd.to_datetime(frame['ds']))
)

### 2.2 Train/Test splitting

In [None]:
# Forecast horizon in daily observations: 12 periods of fh = 30 days each
fh = 30
horizon = 12 * fh

# All distinct dates, in chronological order
unique_dates = sorted(Y_df['ds'].unique())

# At least one date must remain in front of the held-out window
if len(unique_dates) <= horizon:
    raise ValueError(
        f"Need more than {horizon} distinct dates to hold out a test window, "
        f"got {len(unique_dates)}"
    )

# Last date that still belongs to the training data; the final `horizon`
# dates after it form the test window
cutoff_date = unique_dates[-(horizon + 1)]

# Training data: all data up to and including the cutoff date
Y_train_df = Y_df[Y_df['ds'] <= cutoff_date]

# Test / 'ground truth' data: everything after the same cutoff. Reusing
# cutoff_date (instead of indexing the unsorted result of unique()) keeps the
# two sets exact complements even when the input rows are not in date order.
Y_test_df = Y_df[Y_df['ds'] > cutoff_date]

Y_train_df

## 3. Seasonality detection

In [None]:
# The mapping between fits and unique_id is not 1-to-1, find out the correct mapping
def match_fitted_models_to_timeseries(Y_train_df, fitted_models):
    """Map each fitted model to the ``unique_id`` of the series it was fitted on.

    A model is matched to a series when the model's ``'data'`` values have the
    same length as the series' ``y`` values (ordered by ``ds``) and are
    element-wise close to them. Each ``unique_id`` is assigned to at most one
    model, so models fitted on identical series receive distinct ids instead
    of all collapsing onto the first matching one.

    Parameters
    ----------
    Y_train_df : pandas.DataFrame
        Long-format frame with the columns ``unique_id``, ``ds`` and ``y``.
    fitted_models : iterable
        Objects for which ``model['data'].values`` yields the fitted data.

    Returns
    -------
    pandas.DataFrame
        One row per matched model, indexed by the model's position in
        ``fitted_models`` (index name ``model_index``), with a single
        ``unique_id`` column. Models without a match are left out.
    """
    # Sort and extract every series once, instead of once per model
    series_by_id = [
        (unique_id, group.sort_values('ds')['y'].values)
        for unique_id, group in Y_train_df.groupby('unique_id')
    ]

    model_to_unique_id = {}
    claimed_ids = set()

    for i, model in enumerate(fitted_models):
        model_data = model['data'].values

        for unique_id, group_data in series_by_id:
            # An id that already belongs to another model is not reused
            if unique_id in claimed_ids:
                continue

            # Arrays of different length cannot be the same series
            if len(model_data) != len(group_data):
                continue

            if np.allclose(model_data, group_data, rtol=1e-5, atol=1e-8):
                model_to_unique_id[i] = unique_id
                claimed_ids.add(unique_id)
                break

    # Create a DataFrame from the dictionary
    result_df = pd.DataFrame.from_dict(model_to_unique_id, orient='index', columns=['unique_id'])
    result_df.index.name = 'model_index'

    return result_df

def add_median_remainder(result_df, fitted_models, name):
    """Append a column holding each model's median ``remainder`` value.

    Parameters
    ----------
    result_df : pandas.DataFrame
        Frame that can be merged on ``model_index`` (in this notebook that is
        the name of its index).
    fitted_models : iterable of pandas.DataFrame
        Fitted model frames; a model's position in the iterable is its
        ``model_index``. A frame without a ``remainder`` column gets ``None``.
    name : str
        Name of the column to add.

    Returns
    -------
    pandas.DataFrame
        Left merge of ``result_df`` with the new median column.
    """
    per_model_median = {
        position: (fitted['remainder'].median() if 'remainder' in fitted.columns else None)
        for position, fitted in enumerate(fitted_models)
    }

    medians = pd.DataFrame.from_dict(per_model_median, orient='index', columns=[name])
    medians.index.name = 'model_index'

    return result_df.merge(medians, on='model_index', how='left')

### 3.1.1 Seasonality = []

In [None]:
# Initialize a model
# NOTE: despite the `mstl_` prefix this list holds a plain AutoARIMA with
# season_length=1 (no MSTL decomposition); it is the "Seasonality = []" case.
mstl_0 = [AutoARIMA(season_length=1)]

# Prepare the fit (daily frequency, 5 parallel jobs)
sf_0 = StatsForecast(models=mstl_0, freq='D', n_jobs=5)

# Fit the model on the training data; sf_0.fitted_ is read by the next cell
sf_0.fit(df=Y_train_df)

In [None]:
# Number of distinct series in the training data (later cells reuse `n`)
n = len(set(Y_train_df['unique_id']))

# Median of the residuals stored on each fitted no-seasonality model, keyed by
# the model's position in sf_0.fitted_
median_residuals = {
    position: np.median(sf_0.fitted_[position, 0].model_['residuals'])
    for position in range(n)
}

# One-column frame ('[]') indexed by model position
residuals_df = (
    pd.DataFrame.from_dict(median_residuals, orient='index', columns=['[]'])
    .rename_axis('model_index')
)

### 3.1.2 Seasonality = [7]

In [None]:
# Initialize a model: MSTL with a single seasonal period of 7, whose trend
# forecaster is an AutoARIMA created with max_p=2 and max_q=2
mstl_7 = [MSTL(season_length=[7], # seasonalities of the time series
    trend_forecaster=AutoARIMA(max_p=2, max_q=2)
)]

# Prepare the fit (daily frequency, 5 parallel jobs)
sf_7 = StatsForecast(models=mstl_7, freq='D', n_jobs=5)

# Fit on the training data; sf_7.fitted_ is read by the following cells
sf_7.fit(df=Y_train_df)

In [None]:
# Retrieve the mapping from model position to unique_id, using the data stored
# on the fitted [7] models. This (re)creates `seasonality_remainders`; the
# following cells add one column per seasonality setting to it.
# Requires `n` from the cell that computes the median residuals.
seasonality_remainders = match_fitted_models_to_timeseries(Y_train_df, [sf_7.fitted_[i,0].model_ for i in range(n)])

In [None]:
# Add the remainders from seasonality = []
# A '[]' column left by an earlier run of this cell is dropped first: joining
# a column that already exists raises a ValueError about overlapping columns,
# so without the drop this cell could only be run once per kernel session.
seasonality_remainders = (
    seasonality_remainders
    .drop(columns=['[]'], errors='ignore')
    .join(residuals_df['[]'], how='left')
)

In [None]:
# Add the remainders from seasonality = [7]
fitted_models = [sf_7.fitted_[i, 0].model_ for i in range(n)]

# Drop a '[7]' column left by an earlier run of this cell so that re-running
# replaces it instead of merging in a second copy ('[7]_x' / '[7]_y').
seasonality_remainders = add_median_remainder(
    seasonality_remainders.drop(columns=['[7]'], errors='ignore'),
    fitted_models,
    '[7]',
)

### 3.1.3 Seasonality = [30]

In [None]:
# MSTL with a single seasonal period of 30; the trend forecaster is an
# AutoARIMA created with max_p=2 and max_q=2
mstl_30 = [MSTL(
    season_length=[30],
    trend_forecaster=AutoARIMA(max_p=2, max_q=2),
)]

sf_30 = StatsForecast(models=mstl_30, freq='D', n_jobs=5)

sf_30.fit(df=Y_train_df)

# Fitted model of every series, in the order of sf_30.fitted_
fitted_models = [sf_30.fitted_[i, 0].model_ for i in range(n)]

# Add the remainders. A '[30]' column left by an earlier run of this cell is
# dropped first so that re-running replaces it instead of merging in a second
# copy ('[30]_x' / '[30]_y').
seasonality_remainders = add_median_remainder(
    seasonality_remainders.drop(columns=['[30]'], errors='ignore'),
    fitted_models,
    '[30]',
)

### 3.1.4 Seasonality = [365]

In [None]:
# MSTL with a single seasonal period of 365; the trend forecaster is an
# AutoARIMA created with max_p=2 and max_q=2
mstl_365 = [MSTL(
    season_length=[365],
    trend_forecaster=AutoARIMA(max_p=2, max_q=2),
)]

sf_365 = StatsForecast(models=mstl_365, freq='D', n_jobs=5)

sf_365.fit(df=Y_train_df)

# Fitted model of every series, in the order of sf_365.fitted_
fitted_models = [sf_365.fitted_[i, 0].model_ for i in range(n)]

# Add the remainders. A '[365]' column left by an earlier run of this cell is
# dropped first so that re-running replaces it instead of merging in a second
# copy ('[365]_x' / '[365]_y').
seasonality_remainders = add_median_remainder(
    seasonality_remainders.drop(columns=['[365]'], errors='ignore'),
    fitted_models,
    '[365]',
)

### 3.1.5 Seasonality = [7, 30]

In [None]:
# MSTL with seasonal periods 7 and 30; the trend forecaster is an AutoARIMA
# created with max_p=2 and max_q=2
mstl_7_30 = [MSTL(
    season_length=[7, 30],
    trend_forecaster=AutoARIMA(max_p=2, max_q=2),
)]

sf_7_30 = StatsForecast(models=mstl_7_30, freq='D', n_jobs=5)

sf_7_30.fit(df=Y_train_df)

# Fitted model of every series, in the order of sf_7_30.fitted_
fitted_models = [sf_7_30.fitted_[i, 0].model_ for i in range(n)]

# Add the remainders. A '[7, 30]' column left by an earlier run of this cell
# is dropped first so that re-running replaces it instead of merging in a
# second copy ('[7, 30]_x' / '[7, 30]_y').
seasonality_remainders = add_median_remainder(
    seasonality_remainders.drop(columns=['[7, 30]'], errors='ignore'),
    fitted_models,
    '[7, 30]',
)

### 3.1.6 Seasonality = [7, 365]

In [None]:
# MSTL with seasonal periods 7 and 365; the trend forecaster is an AutoARIMA
# created with max_p=2 and max_q=2
mstl_7_365 = [MSTL(
    season_length=[7, 365],
    trend_forecaster=AutoARIMA(max_p=2, max_q=2),
)]

sf_7_365 = StatsForecast(models=mstl_7_365, freq='D', n_jobs=5)

sf_7_365.fit(df=Y_train_df)

# Fitted model of every series, in the order of sf_7_365.fitted_
fitted_models = [sf_7_365.fitted_[i, 0].model_ for i in range(n)]

# Add the remainders. A '[7, 365]' column left by an earlier run of this cell
# is dropped first so that re-running replaces it instead of merging in a
# second copy ('[7, 365]_x' / '[7, 365]_y').
seasonality_remainders = add_median_remainder(
    seasonality_remainders.drop(columns=['[7, 365]'], errors='ignore'),
    fitted_models,
    '[7, 365]',
)

### 3.1.7 Seasonality = [30, 365]

In [None]:
# MSTL with seasonal periods 30 and 365; the trend forecaster is an AutoARIMA
# created with max_p=2 and max_q=2
mstl_30_365 = [MSTL(
    season_length=[30, 365],
    trend_forecaster=AutoARIMA(max_p=2, max_q=2),
)]

sf_30_365 = StatsForecast(models=mstl_30_365, freq='D', n_jobs=5)

sf_30_365.fit(df=Y_train_df)

# Fitted model of every series, in the order of sf_30_365.fitted_
fitted_models = [sf_30_365.fitted_[i, 0].model_ for i in range(n)]

# Add the remainders. A '[30, 365]' column left by an earlier run of this cell
# is dropped first so that re-running replaces it instead of merging in a
# second copy ('[30, 365]_x' / '[30, 365]_y').
seasonality_remainders = add_median_remainder(
    seasonality_remainders.drop(columns=['[30, 365]'], errors='ignore'),
    fitted_models,
    '[30, 365]',
)

### 3.1.8 Seasonality = [7, 30, 365]

In [None]:
# MSTL with seasonal periods 7, 30 and 365; the trend forecaster is an
# AutoARIMA created with max_p=2 and max_q=2
mstl_7_30_365 = [MSTL(
    season_length=[7, 30, 365],
    trend_forecaster=AutoARIMA(max_p=2, max_q=2),
)]

sf_7_30_365 = StatsForecast(models=mstl_7_30_365, freq='D', n_jobs=5)

sf_7_30_365.fit(df=Y_train_df)

# Fitted model of every series, in the order of sf_7_30_365.fitted_
fitted_models = [sf_7_30_365.fitted_[i, 0].model_ for i in range(n)]

# Add the remainders. A '[7, 30, 365]' column left by an earlier run of this
# cell is dropped first so that re-running replaces it instead of merging in a
# second copy ('[7, 30, 365]_x' / '[7, 30, 365]_y').
seasonality_remainders = add_median_remainder(
    seasonality_remainders.drop(columns=['[7, 30, 365]'], errors='ignore'),
    fitted_models,
    '[7, 30, 365]',
)

### 3.2 Find the best fits (lowest remainders)

In [None]:
# Absolute value of every score column. 'unique_id' is not a score, and a
# 'best_fit' column left by an earlier run of this cell holds strings, so both
# are excluded; without that, re-running the cell fails inside .abs().
# NOTE(review): the score is |median(remainder)|, which can be close to zero
# for large but symmetric remainders -- confirm this is the intended criterion
# (as opposed to e.g. the median of the absolute remainders).
df_abs = seasonality_remainders.drop(columns=['unique_id', 'best_fit'], errors='ignore').abs()

# Find the column name with the minimum absolute value for each row
best_fit = df_abs.idxmin(axis=1)

# Add (or overwrite) the 'best_fit' column on the original dataframe
seasonality_remainders['best_fit'] = best_fit

seasonality_remainders

### 3.3 Save the best fits

In [None]:
# Keep only the chosen seasonality per series, keyed by unique_id
best_fits_df = seasonality_remainders.set_index('unique_id')[['best_fit']]

# Cast the ids to integers and order the rows by id
best_fits_df.index = best_fits_df.index.astype(int)
best_fits_df = best_fits_df.sort_index()

# Write the result to a CSV file named after the data tags
best_fits_df.to_csv(f'best_fits_{data_date}_{data_size}.csv')

In [None]:
# Show the saved table (best seasonality setting per unique_id) as the cell output
best_fits_df