# Timeseries Forecasting (Naive model)

## 1. Installing dependencies

In [None]:
# !pip install datasetsforecast

In [None]:
# !pip install sktime

In [None]:
# !pip install statsforecast

In [None]:
# Basics
import pandas as pd
import numpy as np

# Some functions for plotting and stuff
import utils as ts_utils

# Statistical models
from statsforecast import StatsForecast
from statsforecast.models import Naive

## 2. Data Preparation

In [None]:
# Tags selecting which export to load: a size tag and a date tag
# (date tag example: '1806' = 18th of June)
data_size = 'norm'
data_date = '2110'

# Location of the eod_balances CSV for the selected tags
data_path = f"~/Thesis/data/eod_balances_{data_date}_{data_size}.csv"

# Load the data (takes around 2 minutes)
dataset = pd.read_csv(data_path)

# Show the loaded dataframe as the cell output
dataset

In [None]:
# Create the timer
# NOTE(review): Timer comes from the project-local `utils` module and is not
# used anywhere in the visible part of this notebook -- presumably intended
# for timing the fit/predict steps; confirm.
timer = ts_utils.Timer()

### 2.1 In-sample and Out-of-sample split

In [None]:
# Calculate total amount of timeseries: every column except the first one,
# which is kept as the shared id column for both splits
# (presumably the 'date' column that is melted on later)
num_timeseries = len(dataset.columns) - 1

# Fraction of the timeseries that goes into the in-sample set
train_test_split = 0.8

# NOTE: despite its name, this is the number of IN-sample timeseries
# (train_test_split * total). The name is kept unchanged so that any code
# relying on it keeps working.
num_out_of_sample = int(train_test_split * num_timeseries)

# Create in-sample dataframe: first column plus the first
# `num_out_of_sample` timeseries columns (used for training and testing)
in_sample_data = dataset.iloc[:, : num_out_of_sample + 1]

# Number of timeseries left over for the out-of-sample set
n = num_timeseries - num_out_of_sample

# Create out-sample dataframe: first column plus all remaining timeseries.
# The remaining columns are sliced from the front rather than with
# `columns[-n:]`, because `[-0:]` would select EVERY column when n == 0
# (e.g. train_test_split = 1.0). For n > 0 both slices are identical.
columns_to_keep = (
    dataset.columns[[0]].tolist()
    + dataset.columns[num_out_of_sample + 1:].tolist()
)
out_sample_data = dataset[columns_to_keep]

## 3. In-sample Analysis

### 3.1 Train/Test splitting and plotting

In [None]:
# Reshape the wide in-sample table (one column per timeseries) into the long
# layout with columns ds / unique_id / y, and parse the dates to datetime
Y_df = (
    in_sample_data
    .melt(id_vars=['date'], var_name='unique_id', value_name='y')
    .rename(columns={'date': 'ds'})
    .assign(ds=lambda frame: pd.to_datetime(frame['ds']))
)

In [None]:
# Length of one forecast period (fh) and the total hold-out span
# (12 months of 30 days each)
fh = 30
horizon = fh * 12

# Distinct dates of the dataset in chronological order
unique_dates = sorted(Y_df['ds'].unique())

# Cutoff date: the date that has exactly `horizon` later dates after it
cutoff_date = unique_dates[-horizon - 1]

# Training data: every row dated on or before the cutoff date
is_train = Y_df['ds'] <= cutoff_date
Y_train_df = Y_df[is_train]

In [None]:
# Containers for the input (fit) frame and the test frame of each period
input_dfs = []
test_dfs = []

# Build the 6 input/test pairs; consecutive periods start 2 * fh dates apart
for i in range(6):
    # Positions of the period boundaries, counted back from the last date
    start_offset = horizon - 2 * fh * i
    end_offset = start_offset - fh

    # Boundary dates of the current test period
    test_start_date = unique_dates[-start_offset]
    test_end_date = unique_dates[-end_offset]

    # Input data: every row dated on or before the test start date
    up_to_start = Y_df['ds'] <= test_start_date
    input_df = Y_df[up_to_start]
    input_dfs.append(input_df)

    # Test data: rows after the start date, up to and including the end date
    after_start = Y_df['ds'] > test_start_date
    up_to_end = Y_df['ds'] <= test_end_date
    test_df = Y_df[after_start & up_to_end]
    test_dfs.append(test_df)

# Expose the 6 input periods under individual names
(Y_input_df_0, Y_input_df_1, Y_input_df_2,
 Y_input_df_3, Y_input_df_4, Y_input_df_5) = input_dfs

# Expose the 6 test periods under individual names
(Y_test_df_0, Y_test_df_1, Y_test_df_2,
 Y_test_df_3, Y_test_df_4, Y_test_df_5) = test_dfs

In [None]:
# Time series to plot, identified by its unique_id (the name of its column
# in the wide dataset)
unique_id = '6'

# Plot the first period's input and test dataframes for this series
# (the plotting helper lives in the project-local `utils` module)
ts_utils.plot_train_test_split(Y_input_df_0, Y_test_df_0, unique_id)

### 3.2 Training and predicting

In [None]:
# Initialize the model: a StatsForecast object holding a single Naive model,
# configured with freq='D' (daily data)
# NOTE(review): this one instance is re-fitted for every in-sample period below.
naive_model_insample = StatsForecast(models=[Naive()], freq='D')

In [None]:
# Fit the shared Naive model on the input data of the first period
# NOTE(review): StatsForecast.fit appears to return the model object itself,
# so naive_model_insample_0 is presumably an alias of naive_model_insample
# that later fits overwrite -- confirm before reusing it after this cell.
naive_model_insample_0 = naive_model_insample.fit(df=Y_input_df_0)

# Predict the first period: fh steps ahead, using fh instead of a hard-coded
# 30 so the forecast length stays in sync with the test windows
naive_model_insample_pred_0 = naive_model_insample_0.predict(h=fh)

# Save the first period predictions
naive_model_insample_pred_0.to_csv(f'~/Thesis/predictions/Naive/insample/period01/model_preds_{data_date}_{data_size}.csv', index=True)

In [None]:
# Fit the shared Naive model on the input data of the second period
# NOTE(review): StatsForecast.fit appears to return the model object itself,
# so naive_model_insample_1 is presumably an alias of naive_model_insample
# that later fits overwrite -- confirm before reusing it after this cell.
naive_model_insample_1 = naive_model_insample.fit(df=Y_input_df_1)

# Predict the second period: fh steps ahead, using fh instead of a hard-coded
# 30 so the forecast length stays in sync with the test windows
naive_model_insample_pred_1 = naive_model_insample_1.predict(h=fh)

# Save the second period predictions
naive_model_insample_pred_1.to_csv(f'~/Thesis/predictions/Naive/insample/period02/model_preds_{data_date}_{data_size}.csv', index=True)

In [None]:
# Fit the shared Naive model on the input data of the third period
# NOTE(review): StatsForecast.fit appears to return the model object itself,
# so naive_model_insample_2 is presumably an alias of naive_model_insample
# that later fits overwrite -- confirm before reusing it after this cell.
naive_model_insample_2 = naive_model_insample.fit(df=Y_input_df_2)

# Predict the third period: fh steps ahead, using fh instead of a hard-coded
# 30 so the forecast length stays in sync with the test windows
naive_model_insample_pred_2 = naive_model_insample_2.predict(h=fh)

# Save the third period predictions
naive_model_insample_pred_2.to_csv(f'~/Thesis/predictions/Naive/insample/period03/model_preds_{data_date}_{data_size}.csv', index=True)

In [None]:
# Fit the shared Naive model on the input data of the fourth period
# NOTE(review): StatsForecast.fit appears to return the model object itself,
# so naive_model_insample_3 is presumably an alias of naive_model_insample
# that later fits overwrite -- confirm before reusing it after this cell.
naive_model_insample_3 = naive_model_insample.fit(df=Y_input_df_3)

# Predict the fourth period: fh steps ahead, using fh instead of a hard-coded
# 30 so the forecast length stays in sync with the test windows
naive_model_insample_pred_3 = naive_model_insample_3.predict(h=fh)

# Save the fourth period predictions
naive_model_insample_pred_3.to_csv(f'~/Thesis/predictions/Naive/insample/period04/model_preds_{data_date}_{data_size}.csv', index=True)

In [None]:
# Fit the shared Naive model on the input data of the fifth period
# NOTE(review): StatsForecast.fit appears to return the model object itself,
# so naive_model_insample_4 is presumably an alias of naive_model_insample
# that later fits overwrite -- confirm before reusing it after this cell.
naive_model_insample_4 = naive_model_insample.fit(df=Y_input_df_4)

# Predict the fifth period: fh steps ahead, using fh instead of a hard-coded
# 30 so the forecast length stays in sync with the test windows
naive_model_insample_pred_4 = naive_model_insample_4.predict(h=fh)

# Save the fifth period predictions
naive_model_insample_pred_4.to_csv(f'~/Thesis/predictions/Naive/insample/period05/model_preds_{data_date}_{data_size}.csv', index=True)

In [None]:
# Fit the shared Naive model on the input data of the sixth period
# NOTE(review): StatsForecast.fit appears to return the model object itself,
# so naive_model_insample_5 is presumably an alias of naive_model_insample
# -- confirm before relying on it as an independent fitted model.
naive_model_insample_5 = naive_model_insample.fit(df=Y_input_df_5)

# Predict the sixth period: fh steps ahead, using fh instead of a hard-coded
# 30 so the forecast length stays in sync with the test windows
naive_model_insample_pred_5 = naive_model_insample_5.predict(h=fh)

# Save the sixth period predictions
naive_model_insample_pred_5.to_csv(f'~/Thesis/predictions/Naive/insample/period06/model_preds_{data_date}_{data_size}.csv', index=True)

## 4. Out-of-sample Analysis

### 4.1 Data Handling

In [None]:
# Reshape the wide out-of-sample table (one column per timeseries) into the
# long layout with columns ds / unique_id / y, and parse the dates to datetime
# (this rebinds Y_df, replacing the in-sample long frame)
Y_df = (
    out_sample_data
    .melt(id_vars=['date'], var_name='unique_id', value_name='y')
    .rename(columns={'date': 'ds'})
    .assign(ds=lambda frame: pd.to_datetime(frame['ds']))
)

In [None]:
# Length of one forecast period (fh) and the total hold-out span
# (12 months of 30 days each)
fh = 30
horizon = fh * 12

# Distinct dates of the out-of-sample data in chronological order
unique_dates = sorted(Y_df['ds'].unique())

# Cutoff date: the date that has exactly `horizon` later dates after it
cutoff_date = unique_dates[-horizon - 1]

# Training data: every row dated on or before the cutoff date
is_train = Y_df['ds'] <= cutoff_date
Y_train_df = Y_df[is_train]

In [None]:
# Containers for the input (fit) frame and the test frame of each period
input_dfs = []
test_dfs = []

# Build the 6 input/test pairs; consecutive periods start 2 * fh dates apart
for i in range(6):
    # Positions of the period boundaries, counted back from the last date
    start_offset = horizon - 2 * fh * i
    end_offset = start_offset - fh

    # Boundary dates of the current test period
    test_start_date = unique_dates[-start_offset]
    test_end_date = unique_dates[-end_offset]

    # Input data: every row dated on or before the test start date
    up_to_start = Y_df['ds'] <= test_start_date
    input_df = Y_df[up_to_start]
    input_dfs.append(input_df)

    # Test data: rows after the start date, up to and including the end date
    after_start = Y_df['ds'] > test_start_date
    up_to_end = Y_df['ds'] <= test_end_date
    test_df = Y_df[after_start & up_to_end]
    test_dfs.append(test_df)

# Expose the 6 input periods under individual names
(Y_input_df_0, Y_input_df_1, Y_input_df_2,
 Y_input_df_3, Y_input_df_4, Y_input_df_5) = input_dfs

# Expose the 6 test periods under individual names
(Y_test_df_0, Y_test_df_1, Y_test_df_2,
 Y_test_df_3, Y_test_df_4, Y_test_df_5) = test_dfs

In [None]:
# Time series to plot: the unique_id found in the first row of the training
# data. Positional access (.iloc) is used because Y_train_df is a filtered
# frame, so a row labelled 0 is not guaranteed to exist in its index.
unique_id = Y_train_df['unique_id'].iloc[0]

# Plot the train and test dataframes
# NOTE(review): the in-sample section plots Y_input_df_0 at this point rather
# than Y_train_df; Y_train_df ends one date earlier than Y_input_df_0 --
# confirm which one is intended here.
ts_utils.plot_train_test_split(Y_train_df, Y_test_df_0, unique_id)

### 4.2 Retrieve Predictions

In [None]:
# Initialize the model: a StatsForecast object holding a single Naive model,
# configured with freq='D' (daily data)
# NOTE(review): this one instance is re-fitted for every out-of-sample period below.
naive_model_outsample = StatsForecast(models=[Naive()], freq='D')

In [None]:
# Fit the shared Naive model on the input data of the first period
# NOTE(review): StatsForecast.fit appears to return the model object itself,
# so naive_model_outsample_0 is presumably an alias of naive_model_outsample
# that later fits overwrite -- confirm before reusing it after this cell.
naive_model_outsample_0 = naive_model_outsample.fit(df=Y_input_df_0)

# Predict the first period: fh steps ahead, using fh instead of a hard-coded
# 30 so the forecast length stays in sync with the test windows
naive_model_outsample_pred_0 = naive_model_outsample_0.predict(h=fh)

# Save the first period predictions
naive_model_outsample_pred_0.to_csv(f'~/Thesis/predictions/Naive/outsample/period01/model_preds_{data_date}_{data_size}.csv', index=True)

In [None]:
# Fit the shared Naive model on the input data of the second period
# NOTE(review): StatsForecast.fit appears to return the model object itself,
# so naive_model_outsample_1 is presumably an alias of naive_model_outsample
# that later fits overwrite -- confirm before reusing it after this cell.
naive_model_outsample_1 = naive_model_outsample.fit(df=Y_input_df_1)

# Predict the second period: fh steps ahead, using fh instead of a hard-coded
# 30 so the forecast length stays in sync with the test windows
naive_model_outsample_pred_1 = naive_model_outsample_1.predict(h=fh)

# Save the second period predictions
naive_model_outsample_pred_1.to_csv(f'~/Thesis/predictions/Naive/outsample/period02/model_preds_{data_date}_{data_size}.csv', index=True)

In [None]:
# Fit the shared Naive model on the input data of the third period
# NOTE(review): StatsForecast.fit appears to return the model object itself,
# so naive_model_outsample_2 is presumably an alias of naive_model_outsample
# that later fits overwrite -- confirm before reusing it after this cell.
naive_model_outsample_2 = naive_model_outsample.fit(df=Y_input_df_2)

# Predict the third period: fh steps ahead, using fh instead of a hard-coded
# 30 so the forecast length stays in sync with the test windows
naive_model_outsample_pred_2 = naive_model_outsample_2.predict(h=fh)

# Save the third period predictions
naive_model_outsample_pred_2.to_csv(f'~/Thesis/predictions/Naive/outsample/period03/model_preds_{data_date}_{data_size}.csv', index=True)

In [None]:
# Fit the shared Naive model on the input data of the fourth period
# NOTE(review): StatsForecast.fit appears to return the model object itself,
# so naive_model_outsample_3 is presumably an alias of naive_model_outsample
# that later fits overwrite -- confirm before reusing it after this cell.
naive_model_outsample_3 = naive_model_outsample.fit(df=Y_input_df_3)

# Predict the fourth period: fh steps ahead, using fh instead of a hard-coded
# 30 so the forecast length stays in sync with the test windows
naive_model_outsample_pred_3 = naive_model_outsample_3.predict(h=fh)

# Save the fourth period predictions
naive_model_outsample_pred_3.to_csv(f'~/Thesis/predictions/Naive/outsample/period04/model_preds_{data_date}_{data_size}.csv', index=True)

In [None]:
# Fit the shared Naive model on the input data of the fifth period
# NOTE(review): StatsForecast.fit appears to return the model object itself,
# so naive_model_outsample_4 is presumably an alias of naive_model_outsample
# that later fits overwrite -- confirm before reusing it after this cell.
naive_model_outsample_4 = naive_model_outsample.fit(df=Y_input_df_4)

# Predict the fifth period: fh steps ahead, using fh instead of a hard-coded
# 30 so the forecast length stays in sync with the test windows
naive_model_outsample_pred_4 = naive_model_outsample_4.predict(h=fh)

# Save the fifth period predictions
naive_model_outsample_pred_4.to_csv(f'~/Thesis/predictions/Naive/outsample/period05/model_preds_{data_date}_{data_size}.csv', index=True)

In [None]:
# Fit the shared Naive model on the input data of the sixth period
# NOTE(review): StatsForecast.fit appears to return the model object itself,
# so naive_model_outsample_5 is presumably an alias of naive_model_outsample
# -- confirm before relying on it as an independent fitted model.
naive_model_outsample_5 = naive_model_outsample.fit(df=Y_input_df_5)

# Predict the sixth period: fh steps ahead, using fh instead of a hard-coded
# 30 so the forecast length stays in sync with the test windows
naive_model_outsample_pred_5 = naive_model_outsample_5.predict(h=fh)

# Save the sixth period predictions
naive_model_outsample_pred_5.to_csv(f'~/Thesis/predictions/Naive/outsample/period06/model_preds_{data_date}_{data_size}.csv', index=True)