In [None]:
import sys
# Put the parent directory on the import path so modules that live one level
# above this notebook can be imported (presumably where `common` lives — confirm).
sys.path.append('../')

### SET UP ENVIRONMENT

In [None]:
!pip install -r ../../../dev-requirements.txt
!pip install python-dotenv

from dotenv import load_dotenv
load_dotenv('../.env')

In [None]:
from common import get_clustered_dataframes

# Load the clustered data once. Later cells iterate it with `.items()` and
# treat every value as a DataFrame that carries a `Date` column.
clusters_data = get_clustered_dataframes()

### MODELS SECTION

#### Multivariate Time Series - Vector Autoregression (VAR)

[Source](https://www.analyticsvidhya.com/blog/2018/09/multivariate-time-series-guide-forecasting-modeling-python-codes/)

In [None]:
from math import sqrt

from sklearn.metrics import mean_squared_error
from statsmodels.tsa.vector_ar.var_model import VAR

import pandas as pd
import matplotlib.pyplot as plt

def find_best_model(train_data,
                    max_lag,
                    trends=('c', 'ct', 'ctt', 'n')):
    """Grid-search VAR lag order and trend, keeping the fit with the lowest AIC.

    Every combination of lag order ``1..max_lag`` and trend in ``trends`` is
    fitted on ``train_data``; the fit whose ``aic`` attribute is smallest wins.
    On ties the first combination tried is kept (lowest lag first, then the
    order of ``trends``). A fit whose AIC is NaN is never selected because the
    ``<`` comparison is False for NaN.

    Parameters
    ----------
    train_data
        Training sample handed unchanged to ``VAR(...)``.
    max_lag : int
        Largest lag order to try (inclusive).
    trends : iterable of str
        Trend specifications to try for every lag order.

    Returns
    -------
    tuple
        ``(best_model, best_order, best_trend, best_aic)`` where ``best_model``
        is the fitted results object. If no combination was fitted or selected
        (for example ``max_lag < 1``) the tuple is
        ``(None, None, None, float('inf'))``.
    """
    # Materialise once so a one-shot iterable is still available for every lag.
    trends = tuple(trends)

    best_order = None
    best_aic = float('inf')  # any finite AIC beats this sentinel
    best_model = None
    best_trend = None

    for p in range(1, max_lag + 1):
        for trend in trends:
            model = VAR(train_data)
            # Pass by keyword: the second positional parameter of VAR.fit is
            # `method`, not `trend`, so a positional call would silently fit
            # every candidate with the default trend.
            model_fit = model.fit(maxlags=p, trend=trend)

            current_aic = model_fit.aic

            # Strict comparison keeps the earliest candidate on ties.
            if current_aic < best_aic:
                best_aic = current_aic
                best_order = p
                best_model = model_fit
                best_trend = trend

    print('Best lag order:', best_order)
    print('Best trend:', best_trend)
    print('Best AIC:', best_aic)

    return best_model, best_order, best_trend, best_aic


def data_preparation(cluster_data,
                     apply_differencing=False):
    """Turn one cluster frame into date-indexed train / validation / test splits.

    The ``Date`` column is removed from the values and used (parsed as
    ``%Y-%m-%d``) as the index. With ``apply_differencing`` the values are
    first-differenced, rows containing NaN are dropped, and the dates from the
    second row onwards are used as the index. The frame is then put on a daily
    frequency and cut chronologically: the first 70% of rows form the training
    set and the remainder is halved into validation and test.

    Returns
    -------
    tuple of DataFrame
        ``(train, valid, test)``; their lengths always add up to the length of
        the prepared frame.
    """
    dates = cluster_data['Date']
    features = cluster_data.drop(columns=['Date'])

    if apply_differencing:
        # The first row has no predecessor, so its date is skipped to match
        # the differenced values.
        features = features.diff().dropna()
        dates = dates.iloc[1:]

    features.index = pd.to_datetime(dates, format='%Y-%m-%d')  # Modify the format as needed
    features = features.asfreq('D')  # daily frequency

    # Missing values are expected to have been handled before this point.

    # Chronological split boundaries.
    n_rows = len(features)
    train_end = int(n_rows * 0.7)
    half_holdout = (n_rows - train_end) / 2
    valid_end = int(n_rows - half_holdout)
    test_start = int(train_end + half_holdout)

    train = features.iloc[:train_end]
    valid = features.iloc[train_end:valid_end]
    test = features.iloc[test_start:]

    # Every row must land in exactly one split.
    assert n_rows == len(train) + len(valid) + len(test)

    return train, valid, test

In [None]:
from math import sqrt
import os

from sklearn.metrics import mean_squared_error
from statsmodels.tsa.vector_ar.var_model import VAR

import mlflow
import pandas as pd
import matplotlib.pyplot as plt

def mts_var_training(cluster_data,
                     apply_differencing: bool = False,
                     max_lag: int = 10):
    """Fit a VAR model on one cluster, score it on the validation split and log the run to MLflow.

    The cluster frame is split with ``data_preparation``, the best lag/trend
    combination is chosen with ``find_best_model`` on the training split, and a
    forecast as long as the validation split is compared with it column by
    column. For every column the RMSE is printed and logged as a metric, and a
    prediction-vs-actual plot is shown and logged as an artifact. Everything
    is recorded in one MLflow run of the experiment
    "Training Binance 1D - 3 Cluster".

    Parameters
    ----------
    cluster_data : DataFrame
        One cluster; must contain a ``Date`` column, every other column is
        modelled as a series.
    apply_differencing : bool
        Forwarded to ``data_preparation``. When True both the forecast and the
        validation data are on the differenced scale.
    max_lag : int
        Largest lag order tried by ``find_best_model``.

    Returns
    -------
    tuple of DataFrame
        ``(pred, valid)``: the forecast (one row per forecast step on a
        0-based integer index, one column per series) and the validation
        split it was scored against.
    """
    experiment_name = "Training Binance 1D - 3 Cluster"
    mlflow.set_experiment(experiment_name)

    # The test split is not used here.
    train, valid, _ = data_preparation(cluster_data, apply_differencing)
    cols = cluster_data.columns.drop('Date')

    with mlflow.start_run():

        # Log parameters to MLflow
        mlflow.log_params({
            "Training_Model": "VAR",
            "Diff": apply_differencing,
            "Max_Lag": max_lag,
            "Criptos": cols
        })

        model_fit, order, trend, aic = find_best_model(train, max_lag)

        # Forecast from the fitted sample for as many steps as there are
        # validation rows.
        prediction = model_fit.forecast(model_fit.endog, steps=len(valid))

        # Wrap the forecast array directly. Passing `columns=[cols]` would
        # build a one-level MultiIndex and an object-dtype frame.
        pred = pd.DataFrame(prediction, columns=cols)

        for cripto in cols:
            preds = pred[cripto]
            valids = valid[cripto]

            # Forecast and actual values side by side on the validation dates.
            comparison = pd.DataFrame(
                {'pred': preds.to_numpy(), 'valid': valids.to_numpy()},
                index=valids.index,
            )

            rmse = sqrt(mean_squared_error(valids, preds))
            print('RMSE value for', cripto, 'is : ', rmse)

            # The metric is named after the second '_'-separated token of the
            # column name; fall back to the whole name when there is none.
            name_parts = cripto.split('_')
            metric_label = name_parts[1] if len(name_parts) > 1 else cripto
            mlflow.log_metric(f"RMSE_{metric_label}", rmse)

            plot_path = f"{cripto}_prediction_plot.png"
            fig, ax = plt.subplots(figsize=(10, 6))
            ax.plot(comparison['pred'], label='Predictions', color='blue')
            ax.plot(comparison['valid'], label='Actual', color='red')

            ax.set_xlabel('Time')
            ax.set_ylabel('Value')
            ax.set_title('Two Time Series')
            ax.legend(loc='upper right')
            ax.annotate(f"RMSE: {rmse}", xy=(0.05, 0.85), xycoords='axes fraction', fontsize=10)

            try:
                fig.savefig(plot_path)
                plt.show()
                mlflow.log_artifact(plot_path)
            finally:
                # Release the figure and the temporary image even when saving
                # or logging fails.
                plt.close(fig)
                if os.path.exists(plot_path):
                    os.remove(plot_path)

        mlflow.log_params({
            "Order": order,
            "Trend": trend,
            "AIC": aic
        })

    return pred, valid

In [None]:
# Baseline run: one VAR training per cluster on the raw (non-differenced) series.
for cluster, cripto in clusters_data.items():
    # List the modelled series by excluding `Date` by name, the same way
    # mts_var_training does, instead of assuming it is the last column.
    print(f'Cluster {cluster}: {cripto.columns.drop("Date")}\n')

    pred, valid = mts_var_training(cripto,
                                   apply_differencing=False,
                                   max_lag=2)

    print("\n---------------------------------\n")

In [None]:
# Second run: the same training per cluster, this time on first-differenced series.
for cluster, cripto in clusters_data.items():
    # List the modelled series by excluding `Date` by name, the same way
    # mts_var_training does, instead of assuming it is the last column.
    print(f'Cluster {cluster}: {cripto.columns.drop("Date")}\n')

    pred, valid = mts_var_training(cripto,
                                   apply_differencing=True,
                                   max_lag=2)

    print("\n---------------------------------\n")