# Transformer LSTM models

# Modules

In [3]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import acf
from matplotlib.dates import DateFormatter, AutoDateLocator
import matplotlib.dates as mdates

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_squared_log_error
from keras.optimizers import Adam



from sklearn.metrics import mean_absolute_percentage_error
import re
from matplotlib.dates import DateFormatter
import matplotlib.dates as mdates
from scipy import stats
from scipy.stats import ttest_ind


from datetime import datetime, timedelta

from tqdm import tqdm
import numpy as np
import random
from scipy.stats import pearsonr
from statsmodels.tsa.stattools import coint
import statsmodels.api as sm

import seaborn as sns

import warnings
warnings.filterwarnings('ignore')


import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from torch.optim.lr_scheduler import ReduceLROnPlateau
np.set_printoptions(suppress=True)

#importing required libraries for Forecasting
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM

from google.colab import files
import time

import glob

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

In [4]:
try:
    import google.colab
    COLAB = True
    print("Note: using Google CoLab")
except:
    print("Note: not using Google CoLab")
    COLAB = False

# Make use of a TPU
import torch
has_mps = torch.backends.mps.is_built()
device = "mps" if has_mps else "cuda" if torch.cuda.is_available() else "tpu"
print(f"Using device: {device}")

Note: using Google CoLab
Using device: cuda


In [5]:
from google.colab import drive
import os
import shutil

# Mount Google Drive
drive.mount('/content/drive')


Mounted at /content/drive


In [47]:
cd /content/drive/MyDrive/MSC_YORK/PROJECT/

/content/drive/MyDrive/MSC_YORK/PROJECT


In [48]:
pwd

'/content/drive/MyDrive/MSC_YORK/PROJECT'

# List of coins

In [8]:
cointegrated_pairs = pd.read_csv("MOST_COINTEGRATED_PAIRS/confirmed_cointegrated_pairs.csv")

In [9]:
unique_coins = pd.concat([
    cointegrated_pairs[['coin1', 'in_sample_coin1_lookback']].rename(columns={'coin1': 'coin', 'in_sample_coin1_lookback': 'lookback'}),
    cointegrated_pairs[['coin2', 'in_sample_coin2_lookback']].rename(columns={'coin2': 'coin', 'in_sample_coin2_lookback': 'lookback'})
], axis=0).drop_duplicates()

unique_coins = unique_coins.groupby('coin').agg({'lookback': 'min'}).reset_index()
unique_coins


Unnamed: 0,coin,lookback
0,ADAUSDT,2448
1,ATOMUSDT,2449
2,BTCUSDT,2423
3,DOGEUSDT,2461
4,DOTUSDT,2337
5,ETHUSDT,2443
6,JASMYUSDT,2161
7,LINKUSDT,2438
8,LTCUSDT,2313
9,LUNCUSDT,2152


# Build of Models

In [None]:
# Define the Positional Encoding class for the Transformer model
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

# Define the Transformer model for regression
class TransformerModel(nn.Module):
    def __init__(self, input_dim=1, d_model=64, nhead=8, num_layers=2, dropout=0.2):
        super(TransformerModel, self).__init__()
        self.encoder = nn.Linear(input_dim, d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = nn.TransformerEncoderLayer(d_model, nhead)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)
        self.decoder = nn.Linear(d_model, 1)

    def forward(self, x):
        x = self.encoder(x)
        x = self.pos_encoder(x)
        x = self.transformer_encoder(x)
        x = self.decoder(x[:, -1, :])
        return x

# Function to create sequences for training and testing
def create_sequences(dataset, lookback_window, lookforward_window):
    X, y, close_times = [], [], []
    for i in range(len(dataset) - lookback_window - lookforward_window + 1):
        X.append(dataset[i:i + lookback_window])
        y.append(dataset[i + lookback_window + lookforward_window - 1])
        close_times.append(i + lookback_window + lookforward_window - 1)
    return np.array(X), np.array(y), close_times



In [None]:
coins = []
lookback_windows = []
models = []
training_times = []
rmses = []
mses = []
maes = []
r2s = []
mapes = []

for index, row in unique_coins.iterrows():
    print(f"coin: {row['coin']}############################################################")
    coin = row['coin']
    lookback_window = row['lookback']
    in_sample_start_date = '2023-01-01'
    in_sample_end_date = '2023-01-19'
    out_of_sample_end_date = '2023-01-23'

    lookforward_window = 1
    dropout = 0.2
    learning_rate = 0.001

    input_dim = 1
    d_model = 64
    nhead = 8
    num_layers = 2
    early_stopping_var = 10

    print(f"coin: {row['coin']} / transformer - train and test dataset preparation************************")
    # # Load and preprocess the data
    train_df = pd.read_csv(f"PRICES/ACTUAL/training_set_{coin}_{in_sample_start_date}_{in_sample_end_date}.csv").sort_index(ascending=True)
    train_close_time = train_df['close_time']
    columns = [f'{coin}_mid']
    train_df = train_df[columns]

    test_df = pd.read_csv(f"PRICES/ACTUAL/test_set_{coin}_{in_sample_end_date}_{out_of_sample_end_date}.csv").sort_index(ascending=True)
    test_close_time = test_df['close_time']
    test_df = test_df[columns]

    # Standardizarion
    scaler = StandardScaler()
    train_scaled = train_df.to_numpy().reshape(-1, 1)
    test_scaled = test_df.to_numpy().reshape(-1, 1)
    train_scaled = scaler.fit_transform(train_scaled).flatten().tolist()
    test_scaled = scaler.transform(test_scaled).flatten().tolist()


    # Function to create sequences for training and testing
    def create_sequences(dataset, lookback_window, lookforward_window):
        X, y, close_times = [], [], []
        for i in range(len(dataset) - lookback_window - lookforward_window +1):
            X.append(dataset[i:i + lookback_window])
            y.append(dataset[i + lookback_window + lookforward_window-1])
            close_times.append(i + lookback_window + lookforward_window-1)
        return torch.tensor(X, dtype=torch.float32).view(-1, lookback_window, 1), torch.tensor(y, dtype=torch.float32).view(-1, 1), close_times

    # Create training and testing sequences
    x_train, y_train, train_close_times = create_sequences(train_scaled, lookback_window, lookforward_window)
    x_test, y_test, test_close_times = create_sequences(test_scaled, lookback_window, lookforward_window)

    # Setup data loaders for batch processing
    train_dataset = TensorDataset(x_train, y_train)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

    test_dataset = TensorDataset(x_test, y_test)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    # Map indices to close_time values
    train_close_time = train_close_time.iloc[train_close_times].reset_index(drop=True)
    test_close_time = test_close_time.iloc[test_close_times].reset_index(drop=True)

    # Display shapes of the datasets to confirm
    print(f"X_train shape: {x_train.shape}, y_train shape: {y_train.shape}")
    print(f"X_test shape: {x_test.shape}, y_test shape: {y_test.shape}")
    #TRANSFORMER
    print(f"coin: {row['coin']} / transformer - build************************")
    # Setup device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Initialize model, loss function, optimizer, and learning rate scheduler
    model = TransformerModel(input_dim,d_model,nhead,num_layers,dropout).to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = ReduceLROnPlateau(optimizer, 'min', factor=0.5, patience=3, verbose=True)


    # Training and validation loss lists
    train_losses = []
    val_losses = []

    # Train the model
    epochs = 100
    early_stop_count = 0
    min_val_loss = float('inf')

    total_start_time = time.time()

    for epoch in range(epochs):
        model.train()
        batch_train_losses = []
        for batch in train_loader:
            x_batch, y_batch = batch
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)

            optimizer.zero_grad()
            outputs = model(x_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            batch_train_losses.append(loss.item())

        train_loss = np.mean(batch_train_losses)
        train_losses.append(train_loss)

        # Validation
        model.eval()
        batch_val_losses = []
        with torch.no_grad():
            for batch in test_loader:
                x_batch, y_batch = batch
                x_batch, y_batch = x_batch.to(device), y_batch.to(device)
                outputs = model(x_batch)
                loss = criterion(outputs, y_batch)
                batch_val_losses.append(loss.item())

        val_loss = np.mean(batch_val_losses)
        val_losses.append(val_loss)
        scheduler.step(val_loss)

        if val_loss < min_val_loss:
            min_val_loss = val_loss
            early_stop_count = 0
        else:
            early_stop_count += 1

        if early_stop_count >= early_stopping_var:
            print("Early stopping!")
            break
        print(f"Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss:.4f}")

    total_end_time = time.time()
    training_time = total_end_time - total_start_time
    print(f"Total training time: {training_time:.2f} seconds")

    print(f"coin: {row['coin']} / transformer - plot training_and_validation_loss************************")
    # Plot training and validation loss
    plt.figure(figsize=(12, 6))
    plt.plot(train_losses, label='Training Loss')
    plt.plot(val_losses, label='Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training and Validation Loss')
    plt.legend()
    plt.savefig(f'PRICES/PREDICTED/transformer_training_and_validation_loss_{coin}_{in_sample_end_date}_{out_of_sample_end_date}.png', format='png')
    # plt.show()

    # Evaluation
    model.eval()
    predictions = []
    with torch.no_grad():
        for batch in test_loader:
            x_batch, y_batch = batch
            x_batch = x_batch.to(device)
            outputs = model(x_batch)

            # Check if the output is a single float and handle it accordingly
            if isinstance(outputs.squeeze().tolist(), float):
                predictions.append(outputs.squeeze().tolist())
            else:
                predictions.extend(outputs.squeeze().tolist())


    predictions_inv = scaler.inverse_transform(np.array(predictions).reshape(-1, 1))
    y_test_inv = scaler.inverse_transform(y_test.numpy().reshape(-1, 1))

    print(f"coin: {row['coin']} / transformer - evaluate error metrics************************")
    # Calculate metrics
    rmse = np.sqrt(np.mean((predictions_inv - y_test_inv) ** 2))
    mse = np.mean((predictions_inv - y_test_inv) ** 2)
    mae = mean_absolute_error(y_test_inv, predictions_inv)
    r2 = r2_score(y_test_inv, predictions_inv)*100
    mape = np.mean(np.abs((predictions_inv - y_test_inv) / y_test_inv)) * 100

    print(f"Score (RMSE): {rmse:.10f}")
    print(f"Score (MSE): {mse:.10f}")
    print(f"Score (MAE): {mae:.10f}")
    print(f"Score (R-squared): {r2:.10f}")
    print(f"Score (MAPE): {mape:.10f}%")


    # Combine predictions with corresponding timestamps
    predictions_df = pd.DataFrame({
        'close_time': test_close_time,
        'predictions': predictions_inv.flatten(),
        'actual': y_test_inv.flatten()
    })

    # Display the DataFrame
    predictions_df.head()

    predictions_df.sort_values(by = ['close_time'], ascending = True).to_csv(f"PRICES/PREDICTED/transformers_predictions_{coin}_{in_sample_end_date}_{out_of_sample_end_date}.csv", index = False)

    predictions_df.sort_values(by = ['close_time'], ascending = True).to_csv(f"PRICES/PREDICTED/transformers_predictions_{coin}_{in_sample_end_date}_{out_of_sample_end_date}.csv", index = False)

    print(f"coin: {row['coin']} / transformer - save model************************")
    # Save the model
    torch.save(model.state_dict(), f'MODELS/transformer_model_{coin}_{in_sample_start_date}_{in_sample_end_date}.pth')

    test_close_time = pd.to_datetime(test_close_time)

    # Plot predictions and actuals against close_time with rotated x-axis labels
    plt.figure(figsize=(14, 7))
    plt.plot(test_close_time[:], y_test_inv[:], label='Actual Price', color='blue')
    plt.plot(test_close_time[:], predictions_inv[:], label='Predicted Price', color='red', linestyle='dashed')
    plt.xlabel('Close Time')
    plt.ylabel('Price')
    plt.title(f'Actual vs Transformer-Predicted Price {coin}')
    plt.legend()

    # Rotate x-axis labels
    plt.xticks(rotation=45)

    # Adjust x-axis ticks frequency
    locator = AutoDateLocator()
    plt.gca().xaxis.set_major_locator(locator)
    plt.gca().xaxis.set_major_formatter(DateFormatter('%Y-%m-%d %H:%M:%S'))

    plt.tight_layout()

    plt.savefig(f'PRICES/PREDICTED/transformers_predictions_{coin}_{in_sample_end_date}_{out_of_sample_end_date}.png', format='png')

    coins.append(coin)
    lookback_windows.append(lookback_window)
    models.append('transformer')
    training_times.append(training_time)
    rmses.append(rmse)
    mses.append(mse)
    maes.append(mae)
    r2s.append(r2)
    mapes.append(mape)

    performance_resuls = {
    'coin' : coins,
    'lookback' : lookback_windows,
    'model' : models,
    'training_time' : training_times,
    'rmse' : rmses,
    'mse' : mses,
    'mae' : maes,
    'r2' : r2s,
    'mape' : mapes
    }

    performance_resuls_df = pd.DataFrame(performance_resuls)

    performance_resuls_df.to_csv(f"PRICES/PREDICTED/models_performance.csv", index = False)
    #####################################################################################################
    #LSTM
    print(f"coin: {row['coin']} / lstm - train and test dataset preparation************************")
    # Create training and testing sequences
    x_train, y_train, train_close_times = create_sequences(train_scaled, lookback_window, lookforward_window)
    x_test, y_test, test_close_times = create_sequences(test_scaled, lookback_window, lookforward_window)

    # Reshape input to be [samples, time steps, features]
    x_train = x_train.reshape((x_train.shape[0], lookback_window, 1))
    x_test = x_test.reshape((x_test.shape[0], lookback_window, 1))
    y_train = y_train.reshape(-1, 1)
    y_test = y_test.reshape(-1, 1)

    print(f"coin: {row['coin']} / lstm - build************************")
    # Define the LSTM model
    lstm_model = Sequential()
    lstm_model.add(LSTM(units=50, return_sequences=True, input_shape=(lookback_window, x_train.shape[2])))
    lstm_model.add(Dropout(dropout))
    lstm_model.add(LSTM(units=3))
    lstm_model.add(Dropout(dropout))
    lstm_model.add(Dense(1))

    # Compile the model
    optimizer = Adam(learning_rate=learning_rate )
    lstm_model.compile(optimizer=optimizer, loss='mean_squared_error')

    # Measure training time
    start_time = time.time()

    # Define EarlyStopping callback
    from keras.callbacks import EarlyStopping
    early_stopping = EarlyStopping(monitor='val_loss', patience=early_stopping_var, mode='min', verbose=1)

    # Train the model
    epochs = 100
    batch_size = 32
    history = lstm_model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.2, verbose=2, callbacks=[early_stopping])

    end_time = time.time()
    training_time = end_time - start_time

    print(f"Total training time: {training_time:.2f} seconds")

    print(f"coin: {row['coin']} / lstm - plot training_and_validation_loss************************")
    # Plot training & validation loss values
    plt.figure(figsize=(12, 6))
    plt.plot(history.history['loss'], label='Training Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title('Model Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.savefig(f'PRICES/PREDICTED/lstm_training_and_validation_loss_{coin}_{in_sample_end_date}_{out_of_sample_end_date}.png', format='png')

    # plt.show()

    # Predict and inverse scale
    predictions = lstm_model.predict(x_test)
    predictions_inv = scaler.inverse_transform(predictions)
    y_test_inv = scaler.inverse_transform(y_test)

    print(f"coin: {row['coin']} / lstm - evaluate error metrics************************")
    # Calculate metrics
    rmse = np.sqrt(np.mean((predictions_inv - y_test_inv) ** 2))
    mse = np.mean((predictions_inv - y_test_inv) ** 2)
    mae = mean_absolute_error(y_test_inv, predictions_inv)
    r2 = r2_score(y_test_inv, predictions_inv)*100
    mape = np.mean(np.abs((predictions_inv - y_test_inv) / y_test_inv)) * 100

    print(f"Score (RMSE): {rmse:.10f}")
    print(f"Score (MSE): {mse:.10f}")
    print(f"Score (MAE): {mae:.10f}")
    print(f"Score (R-squared): {r2:.10f}")
    print(f"Score (MAPE): {mape:.10f}%")

    # Combine predictions with corresponding timestamps
    predictions_df = pd.DataFrame({
        'close_time': test_close_time,
        'predictions': predictions_inv.flatten(),
        'actual': y_test_inv.flatten()
    })

    # Display the DataFrame
    predictions_df.head()

    predictions_df.sort_values(by = ['close_time'], ascending = True).to_csv(f"PRICES/PREDICTED/lstm_predictions_{coin}_{in_sample_end_date}_{out_of_sample_end_date}.csv", index = False)

    # Save the model
    lstm_model.save(f"MODELS/lstm_model_{coin}_{in_sample_start_date}_{in_sample_end_date}.h5")

    # Plot predictions and actuals
    plt.figure(figsize=(14, 7))
    plt.plot(test_close_time[:], y_test_inv[:], label='Actual Price', color='blue')
    plt.plot(test_close_time[:], predictions_inv[:], label='Predicted Price', color='red', linestyle='dashed')
    plt.xlabel('Close Time')
    plt.ylabel('Price')
    plt.title(f'Actual vs LSTM-Predicted Price {coin}')
    plt.legend()

    # Rotate x-axis labels
    plt.xticks(rotation=45)

    # Adjust x-axis ticks frequency
    from matplotlib.dates import DateFormatter, AutoDateLocator
    locator = AutoDateLocator()
    plt.gca().xaxis.set_major_locator(locator)
    plt.gca().xaxis.set_major_formatter(DateFormatter('%Y-%m-%d %H:%M:%S'))

    # plt.show()

    plt.tight_layout()

    plt.savefig(f"PRICES/PREDICTED/lstm_predictions_{coin}_{in_sample_end_date}_{in_sample_end_date}.png", format='png')

    coins.append(coin)
    lookback_windows.append(lookback_window)
    models.append('lstm')
    training_times.append(training_time)
    rmses.append(rmse)
    mses.append(mse)
    maes.append(mae)
    r2s.append(r2)
    mapes.append(mape)

    performance_resuls = {
    'coin' : coins,
    'lookback' : lookback_windows,
    'model' : models,
    'training_time' : training_times,
    'rmse' : rmses,
    'mse' : mses,
    'mae' : maes,
    'r2' : r2s,
    'mape' : mapes
    }

    performance_resuls_df = pd.DataFrame(performance_resuls)

    performance_resuls_df.to_csv(f"PRICES/PREDICTED/models_performance.csv", index = False)


Output hidden; open in https://colab.research.google.com to view.

# Analysis of performance

## Functions

In [10]:
def describe(df, model_1, model_2):
  """
    Generate descriptive statistics for AI and standard model's metrics.

    Parameters:
    ----------
    df : pandas.DataFrame
        The DataFrame containing the metrics.
    model_1 : str
        The name of the model 1.
    model_2 : str
        The name of the model 2.

    Returns:
    -------
    pandas.DataFrame
        A DataFrame with descriptive statistics.
  """
  desc_ai = df[model_1].describe().to_frame().transpose()
  desc_std = df[model_2].describe().to_frame().transpose()

  comparison_table = pd.concat([desc_ai, desc_std], axis=0)
  comparison_table.index = [model_1, model_2]

  return comparison_table


def find_max_rejection_threshold(df, col,pace=0.01):
    """
    Find the maximum threshold where the null hypothesis (H0) can be rejected.

    This function starts with a threshold of 0 and increases it iteratively until the null hypothesis
    can no longer be rejected. It returns the last threshold where H0 was rejected.

    Parameters:
    ----------
    df : pandas.DataFrame
        The DataFrame containing the data.
    col : str
        The name of the column to test.

    Returns:
    -------
    max_threshold : float
        The maximum threshold where the null hypothesis can still be rejected.
    """
    import scipy.stats as stats

    # Significance level
    alpha = 0.05

    # Initial threshold
    threshold = 0
    max_threshold = 0

    # Check normality of the data
    _, p_value_col1 = stats.shapiro(df[col])
    print(f'P-value for normality test on {col}: {p_value_col1}')

    normally_distributed = p_value_col1 > alpha

    # Continue increasing the threshold until the null hypothesis is not rejected
    while True:
        if normally_distributed:
            # Perform one-sample t-test for normally distributed data
            result = stats.ttest_1samp(a=df[col], popmean=threshold, alternative='greater')
            p_value = result.pvalue
        else:
            # Perform Wilcoxon signed-rank test for non-normally distributed data
            _, p_value = stats.wilcoxon(df[col] - threshold, alternative='greater')

        if p_value < alpha:
            max_threshold = threshold
        else:
            break

        # Increase the threshold slightly
        threshold += pace

    print(f"The maximum threshold where the null hypothesis is rejected: {max_threshold}")
    return max_threshold



def t_test(df,col,threshold):
  """
    Perform a one-sample t-test to compare the mean of a column to a threshold.

    Parameters:
    ----------
    df : pandas.DataFrame
        The DataFrame containing the data.
    col : str
        The name of the column to test.
    threshold : float
        The threshold to compare against.
  """
  import scipy.stats as stats
  alpha = 0.05

  result_df = pd.DataFrame([])
  result_df['col'] = [col]
  result_df['threshold'] = [threshold]
  result_df['normality_test_p_value_col'] = [None]
  result_df['is_normal'] = [None]
  result_df['t_stat'] = [None]
  result_df['p_value_ttest'] = [None]
  result_df['wilcoxon_stat'] = [None]
  result_df['p_value_wilcoxon']  = [None]

  # Check normality
  _, p_value_col = stats.shapiro(df[col])
  print(f'P-value for normality test on {col}: {p_value_col}')
  result_df['normality_test_p_value_col'] = [p_value_col]


  if p_value_col > alpha:
    print('The distribution is normally distributed.')
    normal = 'yes'

    # Perform one-sample t-test
    result = stats.ttest_1samp(a=df[col], popmean=threshold, alternative='greater')

    # Print the results
    print(f"t-statistic: {result.statistic:.2f}")
    print(f"p-value: {result.pvalue:.3f}")

    result_df['t_stat'] = [result.statistic]
    result_df['p_value_ttest'] = [result.pvalue]

    # Interpretation
    alpha = 0.05  # Significance level
    if result.pvalue < alpha:
      print(f'Reject the null hypothesis: The median is significantly greater than {threshold}.')
      decision = 'Reject H0'
    else:
      print(f'Fail to reject the null hypothesis: The median is not significantly greater than {threshold}.')
      decision = 'Fail to reject H0'
  else:
    print('The series is not normally distributed.')
    normal = 'no'

    # Perform the Wilcoxon signed-rank test
    wilcoxon_stat, p_value_wilcoxon = stats.wilcoxon(df[col] - threshold, alternative='greater')

    print(f'Wilcoxon statistic: {wilcoxon_stat}')
    print(f'P-value for the Wilcoxon signed-rank test: {p_value_wilcoxon}')
    result_df['wilcoxon_stat'] = [wilcoxon_stat]
    result_df['p_value_wilcoxon'] = [p_value_wilcoxon]

    # Interpretation
    if p_value_wilcoxon < alpha:
      print(f'Reject the null hypothesis: The median is significantly greater than {threshold}.')
      decision = 'Reject H0'
    else:
      print(f'Fail to reject the null hypothesis: The median is not significantly greater than {threshold}.')
      decision = 'Fail to reject H0'

  result_df['is_normal'] = [normal]
  result_df['decision'] = [decision]
  return result_df



def two_sample_t_test(df, col1, col2):
    """
    Perform a two-sample t-test to compare AI and standard model's metrics.

    Parameters:
    ----------
    df : pandas.DataFrame
        The DataFrame containing the metrics.
    ai_col : str
        The name of the column with AI model's metrics.
    standard_col : str
        The name of the column with standard model's metrics.
    """
    from scipy import stats
    alpha = 0.05
    result_df = pd.DataFrame([])
    result_df['col1'] = [col1]
    result_df['col2'] = [col2]
    result_df['normality_test_p_value_col1'] = [None]
    result_df['normality_test_p_value_col2'] = [None]
    result_df['is_normal'] = [None]
    result_df['p_value_equal_variance_levene_test'] = [None]
    result_df['t_stat'] = [None]
    result_df['p_value_ttest'] = [None]
    result_df['u_stat'] = [None]
    result_df['p_value_mannwhitney'] = [None]


    # Check normality
    _, p_value_col1 = stats.shapiro(df[col1])
    _, p_value_col2 = stats.shapiro(df[col2])

    print(f'P-value for normality test on {col1}: {p_value_col1}')
    print(f'P-value for normality test on {col2}: {p_value_col2}')

    result_df['normality_test_p_value_col1'] = [p_value_col1]
    result_df['normality_test_p_value_col2'] = [p_value_col2]

    if p_value_col1 > alpha and p_value_col2 > alpha:
        print('Both groups are normally distributed.')
        normal = 'yes'

        # Perform Levene's test for equal variances
        _, p_value_var = stats.levene(df[col1], df[col2])
        print(f'P-value for equal variance test: {p_value_var}')
        result_df['p_value_equal_variance_levene_test'] = [p_value_var]

        # Perform the one-tailed two-sample t-test
        if p_value_var > alpha:  # Variances are equal
            t_stat, p_value_ttest = stats.ttest_ind(df[col1], df[col2], equal_var=True, alternative='greater')
        else:  # Variances are not equal
            t_stat, p_value_ttest = stats.ttest_ind(df[col1], df[col2], equal_var=False, alternative='greater')

        print(f'T-statistic: {t_stat}')
        print(f'P-value for the t-test: {p_value_ttest}')
        result_df['t_stat'] = [t_stat]
        result_df['p_value_ttest'] = [p_value_ttest]

        # Interpretation
        if p_value_ttest < alpha:
            print(f'Reject the null hypothesis: The mean of {col1} is significantly greater than the mean of {col2}.')
            decision = 'Reject H0'
        else:
            print(f'Fail to reject the null hypothesis: The mean of {col1} is not significantly greater than the mean of {col2}.')
            decision = 'Fail to reject H0'

    else:
        print('At least one of the groups is not normally distributed.')
        normal = 'no'

        # Perform the Mann-Whitney U test
        u_stat, p_value_mannwhitney = stats.mannwhitneyu(df[col1], df[col2], alternative='greater')

        print(f'U-statistic: {u_stat}')
        print(f'P-value for the Mann-Whitney U test: {p_value_mannwhitney}')
        result_df['u_stat'] = [u_stat]
        result_df['p_value_mannwhitney'] = [p_value_mannwhitney]

        # Interpretation
        if p_value_mannwhitney < alpha:
            print(f'Reject the null hypothesis: The median of {col1} is significantly greater than the median of {col2}.')
            decision = 'Reject H0'
        else:
            print(f'Fail to reject the null hypothesis: The median of {col1} is not significantly greater than the median of {col2}.')
            decision = 'Fail to reject H0'


    result_df['is_normal'] = [normal]
    result_df['decision'] = [decision]
    return result_df

In [11]:
models_performance = pd.read_csv("PRICES/PREDICTED/models_performance.csv")

In [12]:
models_performance.head(2)

Unnamed: 0,coin,lookback,model,training_time,rmse,mse,mae,r2,mape
0,ADAUSDT,2448,transformer,429.254808,0.007493,5.6e-05,0.005194,37.332957,1.373872
1,ADAUSDT,2448,lstm,632.634742,0.0103,0.000106,0.008963,-18.402169,2.393418


In [13]:
all_stats = pd.DataFrame()
two_samples_result_df = pd.DataFrame()
one_sample_result_df = pd.DataFrame()

## training_time

In [14]:
training_time = models_performance.pivot_table(index = 'coin', columns = 'model', values = 'training_time')
training_time.head()

model,lstm,transformer
coin,Unnamed: 1_level_1,Unnamed: 2_level_1
ADAUSDT,1337.592635,400.550937
ATOMUSDT,1179.941003,2248.831492
BTCUSDT,1636.799454,366.683014
DOGEUSDT,948.650335,742.358289
DOTUSDT,1435.342892,917.811633


In [15]:
stats = describe(training_time,'lstm', 'transformer')
stats['metric'] = 'training_time'
stats
all_stats = pd.concat([stats,all_stats], axis = 0)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,metric
lstm,17.0,1307.291089,528.410521,825.348425,948.650335,1176.546313,1435.342892,2932.920434,training_time
transformer,17.0,840.331243,467.775341,338.059718,570.936229,742.358289,1018.384896,2248.831492,training_time


In [16]:
two_samples_result_tmp_df =   two_sample_t_test(training_time, 'lstm', 'transformer')
two_samples_result_tmp_df['metric'] = 'training_time'
two_samples_result_df = pd.concat([two_samples_result_tmp_df,two_samples_result_df], axis = 0)
two_samples_result_tmp_df =  two_sample_t_test(training_time, 'transformer', 'lstm')
two_samples_result_tmp_df['metric'] = 'training_time'
two_samples_result_df = pd.concat([two_samples_result_tmp_df,two_samples_result_df], axis = 0)

P-value for normality test on lstm: 0.0017783649021727856
P-value for normality test on transformer: 0.00961292139326078
At least one of the groups is not normally distributed.
U-statistic: 227.0
P-value for the Mann-Whitney U test: 0.0023686858750804583
Reject the null hypothesis: The median of lstm is significantly greater than the median of transformer.
P-value for normality test on transformer: 0.00961292139326078
P-value for normality test on lstm: 0.0017783649021727856
At least one of the groups is not normally distributed.
U-statistic: 62.0
P-value for the Mann-Whitney U test: 0.9978738541692309
Fail to reject the null hypothesis: The median of transformer is not significantly greater than the median of lstm.


In [17]:
threshold = find_max_rejection_threshold(training_time, 'lstm')
one_sample_result_tmp_df = t_test(training_time,'lstm',threshold)
one_sample_result_tmp_df['metric'] = 'training_time'
one_sample_result_df = pd.concat([one_sample_result_tmp_df,one_sample_result_df], axis = 0)
print("####################################")
threshold = find_max_rejection_threshold(training_time, 'transformer')
one_sample_result_tmp_df = t_test(training_time,'transformer',threshold)
one_sample_result_tmp_df['metric'] = 'training_time'
one_sample_result_df = pd.concat([one_sample_result_tmp_df,one_sample_result_df], axis = 0)
one_sample_result_df

P-value for normality test on lstm: 0.0017783649021727856
The maximum threshold where the null hypothesis is rejected: 1048.1999999991917
P-value for normality test on lstm: 0.0017783649021727856
The series is not normally distributed.
Wilcoxon statistic: 112.0
P-value for the Wilcoxon signed-rank test: 0.04918670654296875
Reject the null hypothesis: The median is significantly greater than 1048.1999999991917.
####################################
P-value for normality test on transformer: 0.00961292139326078
The maximum threshold where the null hypothesis is rejected: 642.239999999561
P-value for normality test on transformer: 0.00961292139326078
The series is not normally distributed.
Wilcoxon statistic: 112.0
P-value for the Wilcoxon signed-rank test: 0.04918670654296875
Reject the null hypothesis: The median is significantly greater than 642.239999999561.


Unnamed: 0,col,threshold,normality_test_p_value_col,is_normal,t_stat,p_value_ttest,wilcoxon_stat,p_value_wilcoxon,decision,metric
0,transformer,642.24,0.009613,no,,,112.0,0.049187,Reject H0,training_time
0,lstm,1048.2,0.001778,no,,,112.0,0.049187,Reject H0,training_time


## rmse

In [18]:
rmse = models_performance.pivot_table(index = 'coin', columns = 'model', values = 'rmse')
rmse.applymap(lambda x: f"{x:.6f}")

model,lstm,transformer
coin,Unnamed: 1_level_1,Unnamed: 2_level_1
ADAUSDT,0.01073,0.007176
ATOMUSDT,0.272416,0.045757
BTCUSDT,1766.512865,1361.80167
DOGEUSDT,0.001089,0.000559
DOTUSDT,0.109957,0.041912
ETHUSDT,83.075561,43.011916
JASMYUSDT,0.000135,1.8e-05
LINKUSDT,0.105747,0.030971
LTCUSDT,0.710305,0.425353
LUNCUSDT,2e-06,0.0


In [19]:
stats = describe(rmse,'lstm', 'transformer')
stats['metric'] = 'rmse'
stats
all_stats = pd.concat([stats,all_stats], axis = 0)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,metric
lstm,17.0,108.963447,427.611833,9.515986e-07,0.001089,0.095359,0.272416,1766.512865,rmse
transformer,17.0,82.705035,329.779489,5.456231e-08,0.000559,0.030971,0.052014,1361.80167,rmse


In [20]:
two_samples_result_tmp_df =   two_sample_t_test(rmse, 'lstm', 'transformer')
two_samples_result_tmp_df['metric'] = 'rmse'
two_samples_result_df = pd.concat([two_samples_result_tmp_df,two_samples_result_df], axis = 0)
two_samples_result_tmp_df =  two_sample_t_test(rmse, 'transformer', 'lstm')
two_samples_result_tmp_df['metric'] = 'rmse'
two_samples_result_df = pd.concat([two_samples_result_tmp_df,two_samples_result_df], axis = 0)

P-value for normality test on lstm: 2.8561600612044246e-08
P-value for normality test on transformer: 2.607813544168431e-08
At least one of the groups is not normally distributed.
U-statistic: 168.0
P-value for the Mann-Whitney U test: 0.21412167409791755
Fail to reject the null hypothesis: The median of lstm is not significantly greater than the median of transformer.
P-value for normality test on transformer: 2.607813544168431e-08
P-value for normality test on lstm: 2.8561600612044246e-08
At least one of the groups is not normally distributed.
U-statistic: 121.0
P-value for the Mann-Whitney U test: 0.7957807823442873
Fail to reject the null hypothesis: The median of transformer is not significantly greater than the median of lstm.


In [21]:
threshold = find_max_rejection_threshold(rmse, 'lstm')
one_sample_result_tmp_df = t_test(rmse,'lstm',threshold)
one_sample_result_tmp_df['metric'] = 'rmse'
one_sample_result_df = pd.concat([one_sample_result_tmp_df,one_sample_result_df], axis = 0)
print("####################################")
threshold = find_max_rejection_threshold(rmse, 'transformer')
one_sample_result_tmp_df = t_test(rmse,'transformer',threshold)
one_sample_result_tmp_df['metric'] = 'rmse'
one_sample_result_df = pd.concat([one_sample_result_tmp_df,one_sample_result_df], axis = 0)
one_sample_result_df

P-value for normality test on lstm: 2.8561600612044246e-08
The maximum threshold where the null hypothesis is rejected: 0.05
P-value for normality test on lstm: 2.8561600612044246e-08
The series is not normally distributed.
Wilcoxon statistic: 112.0
P-value for the Wilcoxon signed-rank test: 0.04918670654296875
Reject the null hypothesis: The median is significantly greater than 0.05.
####################################
P-value for normality test on transformer: 2.607813544168431e-08
The maximum threshold where the null hypothesis is rejected: 0.01
P-value for normality test on transformer: 2.607813544168431e-08
The series is not normally distributed.
Wilcoxon statistic: 117.0
P-value for the Wilcoxon signed-rank test: 0.0284423828125
Reject the null hypothesis: The median is significantly greater than 0.01.


Unnamed: 0,col,threshold,normality_test_p_value_col,is_normal,t_stat,p_value_ttest,wilcoxon_stat,p_value_wilcoxon,decision,metric
0,transformer,0.01,2.607814e-08,no,,,117.0,0.028442,Reject H0,rmse
0,lstm,0.05,2.85616e-08,no,,,112.0,0.049187,Reject H0,rmse
0,transformer,642.24,0.009612921,no,,,112.0,0.049187,Reject H0,training_time
0,lstm,1048.2,0.001778365,no,,,112.0,0.049187,Reject H0,training_time


## mse

In [22]:
mse = models_performance.pivot_table(index = 'coin', columns = 'model', values = 'mse')
mse.applymap(lambda x: f"{x:.6f}")

model,lstm,transformer
coin,Unnamed: 1_level_1,Unnamed: 2_level_1
ADAUSDT,0.000115,5.2e-05
ATOMUSDT,0.07421,0.002094
BTCUSDT,3120567.703039,1854503.789242
DOGEUSDT,1e-06,0.0
DOTUSDT,0.012091,0.001757
ETHUSDT,6901.548759,1850.024879
JASMYUSDT,0.0,0.0
LINKUSDT,0.011182,0.000959
LTCUSDT,0.504534,0.180925
LUNCUSDT,0.0,0.0


In [23]:
stats = describe(mse,'lstm', 'transformer')
stats['metric'] = 'mse'
stats
all_stats = pd.concat([stats,all_stats], axis = 0)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,metric
lstm,17.0,183968.919887,756746.029119,9.0554e-13,1.185542e-06,0.009093,0.07421,3120568.0,mse
transformer,17.0,109197.310058,449755.408111,2.977046e-15,3.120483e-07,0.000959,0.002705,1854504.0,mse


In [24]:
two_samples_result_tmp_df =   two_sample_t_test(mse, 'lstm', 'transformer')
two_samples_result_tmp_df['metric'] = 'mse'
two_samples_result_df = pd.concat([two_samples_result_tmp_df,two_samples_result_df], axis = 0)
two_samples_result_tmp_df =  two_sample_t_test(mse, 'transformer', 'lstm')
two_samples_result_tmp_df['metric'] = 'mse'
two_samples_result_df = pd.concat([two_samples_result_tmp_df,two_samples_result_df], axis = 0)

P-value for normality test on lstm: 2.1948035376488037e-08
P-value for normality test on transformer: 2.179447939738929e-08
At least one of the groups is not normally distributed.
U-statistic: 168.0
P-value for the Mann-Whitney U test: 0.21412167409791755
Fail to reject the null hypothesis: The median of lstm is not significantly greater than the median of transformer.
P-value for normality test on transformer: 2.179447939738929e-08
P-value for normality test on lstm: 2.1948035376488037e-08
At least one of the groups is not normally distributed.
U-statistic: 121.0
P-value for the Mann-Whitney U test: 0.7957807823442873
Fail to reject the null hypothesis: The median of transformer is not significantly greater than the median of lstm.


In [25]:
threshold = find_max_rejection_threshold(mse, 'lstm')
one_sample_result_tmp_df = t_test(mse,'lstm',threshold)
one_sample_result_tmp_df['metric'] = 'mse'
one_sample_result_df = pd.concat([one_sample_result_tmp_df,one_sample_result_df], axis = 0)
print("####################################")
threshold = find_max_rejection_threshold(mse, 'transformer')
one_sample_result_tmp_df = t_test(mse,'transformer',threshold)
one_sample_result_tmp_df['metric'] = 'mse'
one_sample_result_df = pd.concat([one_sample_result_tmp_df,one_sample_result_df], axis = 0)
one_sample_result_df

P-value for normality test on lstm: 2.1948035376488037e-08
The maximum threshold where the null hypothesis is rejected: 0
P-value for normality test on lstm: 2.1948035376488037e-08
The series is not normally distributed.
Wilcoxon statistic: 153.0
P-value for the Wilcoxon signed-rank test: 7.62939453125e-06
Reject the null hypothesis: The median is significantly greater than 0.
####################################
P-value for normality test on transformer: 2.179447939738929e-08
The maximum threshold where the null hypothesis is rejected: 0
P-value for normality test on transformer: 2.179447939738929e-08
The series is not normally distributed.
Wilcoxon statistic: 153.0
P-value for the Wilcoxon signed-rank test: 7.62939453125e-06
Reject the null hypothesis: The median is significantly greater than 0.


Unnamed: 0,col,threshold,normality_test_p_value_col,is_normal,t_stat,p_value_ttest,wilcoxon_stat,p_value_wilcoxon,decision,metric
0,transformer,0.0,2.179448e-08,no,,,153.0,8e-06,Reject H0,mse
0,lstm,0.0,2.194804e-08,no,,,153.0,8e-06,Reject H0,mse
0,transformer,0.01,2.607814e-08,no,,,117.0,0.028442,Reject H0,rmse
0,lstm,0.05,2.85616e-08,no,,,112.0,0.049187,Reject H0,rmse
0,transformer,642.24,0.009612921,no,,,112.0,0.049187,Reject H0,training_time
0,lstm,1048.2,0.001778365,no,,,112.0,0.049187,Reject H0,training_time


## mae

In [26]:
mae = models_performance.pivot_table(index = 'coin', columns = 'model', values = 'mae')
mae.applymap(lambda x: f"{x:.6f}")

model,lstm,transformer
coin,Unnamed: 1_level_1,Unnamed: 2_level_1
ADAUSDT,0.009797,0.00504
ATOMUSDT,0.260962,0.035109
BTCUSDT,1726.287548,1303.288081
DOGEUSDT,0.000979,0.000514
DOTUSDT,0.103177,0.038612
ETHUSDT,81.678358,38.899232
JASMYUSDT,0.00013,1.4e-05
LINKUSDT,0.102145,0.0255
LTCUSDT,0.652035,0.354218
LUNCUSDT,1e-06,0.0


In [27]:
stats = describe(mae,'lstm', 'transformer')
stats['metric'] = 'mae'
stats
all_stats = pd.concat([stats,all_stats], axis = 0)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,metric
lstm,17.0,106.505498,417.874384,9.366834e-07,0.000979,0.092773,0.260962,1726.287548,mae
transformer,17.0,79.009476,315.629415,4.333646e-08,0.000514,0.0255,0.047272,1303.288081,mae


In [28]:
two_samples_result_tmp_df =   two_sample_t_test(mae, 'lstm', 'transformer')
two_samples_result_tmp_df['metric'] = 'mae'
two_samples_result_df = pd.concat([two_samples_result_tmp_df,two_samples_result_df], axis = 0)
two_samples_result_tmp_df =  two_sample_t_test(mae, 'transformer', 'lstm')
two_samples_result_tmp_df['metric'] = 'mae'
two_samples_result_df = pd.concat([two_samples_result_tmp_df,two_samples_result_df], axis = 0)

P-value for normality test on lstm: 2.860175171684062e-08
P-value for normality test on transformer: 2.5808728702976624e-08
At least one of the groups is not normally distributed.
U-statistic: 169.0
P-value for the Mann-Whitney U test: 0.20421921765571271
Fail to reject the null hypothesis: The median of lstm is not significantly greater than the median of transformer.
P-value for normality test on transformer: 2.5808728702976624e-08
P-value for normality test on lstm: 2.860175171684062e-08
At least one of the groups is not normally distributed.
U-statistic: 120.0
P-value for the Mann-Whitney U test: 0.8054052936078888
Fail to reject the null hypothesis: The median of transformer is not significantly greater than the median of lstm.


In [29]:
threshold = find_max_rejection_threshold(mae, 'lstm')
one_sample_result_tmp_df = t_test(mae,'lstm',threshold)
one_sample_result_tmp_df['metric'] = 'mae'
one_sample_result_df = pd.concat([one_sample_result_tmp_df,one_sample_result_df], axis = 0)
print("####################################")
threshold = find_max_rejection_threshold(mae, 'transformer')
one_sample_result_tmp_df = t_test(mae,'transformer',threshold)
one_sample_result_tmp_df['metric'] = 'mae'
one_sample_result_df = pd.concat([one_sample_result_tmp_df,one_sample_result_df], axis = 0)
one_sample_result_df

P-value for normality test on lstm: 2.860175171684062e-08
The maximum threshold where the null hypothesis is rejected: 0.04
P-value for normality test on lstm: 2.860175171684062e-08
The series is not normally distributed.
Wilcoxon statistic: 117.0
P-value for the Wilcoxon signed-rank test: 0.0284423828125
Reject the null hypothesis: The median is significantly greater than 0.04.
####################################
P-value for normality test on transformer: 2.5808728702976624e-08
The maximum threshold where the null hypothesis is rejected: 0.01
P-value for normality test on transformer: 2.5808728702976624e-08
The series is not normally distributed.
Wilcoxon statistic: 117.0
P-value for the Wilcoxon signed-rank test: 0.0284423828125
Reject the null hypothesis: The median is significantly greater than 0.01.


Unnamed: 0,col,threshold,normality_test_p_value_col,is_normal,t_stat,p_value_ttest,wilcoxon_stat,p_value_wilcoxon,decision,metric
0,transformer,0.01,2.580873e-08,no,,,117.0,0.028442,Reject H0,mae
0,lstm,0.04,2.860175e-08,no,,,117.0,0.028442,Reject H0,mae
0,transformer,0.0,2.179448e-08,no,,,153.0,8e-06,Reject H0,mse
0,lstm,0.0,2.194804e-08,no,,,153.0,8e-06,Reject H0,mse
0,transformer,0.01,2.607814e-08,no,,,117.0,0.028442,Reject H0,rmse
0,lstm,0.05,2.85616e-08,no,,,112.0,0.049187,Reject H0,rmse
0,transformer,642.24,0.009612921,no,,,112.0,0.049187,Reject H0,training_time
0,lstm,1048.2,0.001778365,no,,,112.0,0.049187,Reject H0,training_time


## r2

In [30]:
r2 = models_performance.pivot_table(index = 'coin', columns = 'model', values = 'r2')
r2.applymap(lambda x: f"{x:.6f}")

model,lstm,transformer
coin,Unnamed: 1_level_1,Unnamed: 2_level_1
ADAUSDT,-28.707829,42.424403
ATOMUSDT,-83.730764,94.816507
BTCUSDT,-1597.468862,-908.778616
DOGEUSDT,61.392977,89.838184
DOTUSDT,18.297673,88.129582
ETHUSDT,-2002.793406,-463.673456
JASMYUSDT,49.77615,99.110512
LINKUSDT,11.941318,92.446591
LTCUSDT,68.294425,88.630398
LUNCUSDT,84.331541,98.365484


In [31]:
stats = describe(r2,'lstm', 'transformer')
stats['metric'] = 'r2'
stats
all_stats = pd.concat([stats,all_stats], axis = 0)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,metric
lstm,17.0,-386.830966,736.192115,-2002.793406,-121.527046,-28.707829,18.297673,84.331541,r2
transformer,17.0,-27.33385,270.639074,-908.778616,42.424403,88.129582,93.356778,99.110512,r2


In [32]:
two_samples_result_tmp_df =   two_sample_t_test(r2, 'lstm', 'transformer')
two_samples_result_tmp_df['metric'] = 'r2'
two_samples_result_df = pd.concat([two_samples_result_tmp_df,two_samples_result_df], axis = 0)
two_samples_result_tmp_df =  two_sample_t_test(r2, 'transformer', 'lstm')
two_samples_result_tmp_df['metric'] = 'r2'
two_samples_result_df = pd.concat([two_samples_result_tmp_df,two_samples_result_df], axis = 0)

P-value for normality test on lstm: 2.2958183259778164e-05
P-value for normality test on transformer: 2.619035739187603e-06
At least one of the groups is not normally distributed.
U-statistic: 52.0
P-value for the Mann-Whitney U test: 0.9993205665846983
Fail to reject the null hypothesis: The median of lstm is not significantly greater than the median of transformer.
P-value for normality test on transformer: 2.619035739187603e-06
P-value for normality test on lstm: 2.2958183259778164e-05
At least one of the groups is not normally distributed.
U-statistic: 237.0
P-value for the Mann-Whitney U test: 0.0007653354184001752
Reject the null hypothesis: The median of transformer is significantly greater than the median of lstm.


In [33]:
threshold = find_max_rejection_threshold(r2, 'lstm')
one_sample_result_tmp_df = t_test(r2,'lstm',threshold)
one_sample_result_tmp_df['metric'] = 'r2'
one_sample_result_df = pd.concat([one_sample_result_tmp_df,one_sample_result_df], axis = 0)
print("####################################")
threshold = find_max_rejection_threshold(r2, 'transformer')
one_sample_result_tmp_df = t_test(r2,'transformer',threshold)
one_sample_result_tmp_df['metric'] = 'r2'
one_sample_result_df = pd.concat([one_sample_result_tmp_df,one_sample_result_df], axis = 0)
one_sample_result_df

P-value for normality test on lstm: 2.2958183259778164e-05
The maximum threshold where the null hypothesis is rejected: 0
P-value for normality test on lstm: 2.2958183259778164e-05
The series is not normally distributed.
Wilcoxon statistic: 40.0
P-value for the Wilcoxon signed-rank test: 0.9601593017578125
Fail to reject the null hypothesis: The median is not significantly greater than 0.
####################################
P-value for normality test on transformer: 2.619035739187603e-06
The maximum threshold where the null hypothesis is rejected: 0
P-value for normality test on transformer: 2.619035739187603e-06
The series is not normally distributed.
Wilcoxon statistic: 105.0
P-value for the Wilcoxon signed-rank test: 0.0950469970703125
Fail to reject the null hypothesis: The median is not significantly greater than 0.


Unnamed: 0,col,threshold,normality_test_p_value_col,is_normal,t_stat,p_value_ttest,wilcoxon_stat,p_value_wilcoxon,decision,metric
0,transformer,0.0,2.619036e-06,no,,,105.0,0.095047,Fail to reject H0,r2
0,lstm,0.0,2.295818e-05,no,,,40.0,0.960159,Fail to reject H0,r2
0,transformer,0.01,2.580873e-08,no,,,117.0,0.028442,Reject H0,mae
0,lstm,0.04,2.860175e-08,no,,,117.0,0.028442,Reject H0,mae
0,transformer,0.0,2.179448e-08,no,,,153.0,8e-06,Reject H0,mse
0,lstm,0.0,2.194804e-08,no,,,153.0,8e-06,Reject H0,mse
0,transformer,0.01,2.607814e-08,no,,,117.0,0.028442,Reject H0,rmse
0,lstm,0.05,2.85616e-08,no,,,112.0,0.049187,Reject H0,rmse
0,transformer,642.24,0.009612921,no,,,112.0,0.049187,Reject H0,training_time
0,lstm,1048.2,0.001778365,no,,,112.0,0.049187,Reject H0,training_time


## mape

In [34]:
mape = models_performance.pivot_table(index = 'coin', columns = 'model', values = 'mape')
mape.applymap(lambda x: f"{x:.6f}")

model,lstm,transformer
coin,Unnamed: 1_level_1,Unnamed: 2_level_1
ADAUSDT,2.623996,1.334242
ATOMUSDT,1.97306,0.265467
BTCUSDT,7.571858,5.706866
DOGEUSDT,1.123217,0.592011
DOTUSDT,1.645508,0.621677
ETHUSDT,4.975386,2.362227
JASMYUSDT,2.521168,0.266356
LINKUSDT,1.466422,0.364892
LTCUSDT,0.730097,0.397183
LUNCUSDT,0.803174,0.233824


In [35]:
stats = describe(mape,'lstm', 'transformer')
stats['metric'] = 'mape'
stats
all_stats = pd.concat([stats,all_stats], axis = 0)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,metric
lstm,17.0,3.465049,3.124528,0.730097,1.310618,1.97306,5.030924,11.64304,mape
transformer,17.0,1.33963,1.782404,0.184272,0.361321,0.592011,1.334242,5.887486,mape


In [36]:
two_samples_result_tmp_df =   two_sample_t_test(mape, 'lstm', 'transformer')
two_samples_result_tmp_df['metric'] = 'mape'
two_samples_result_df = pd.concat([two_samples_result_tmp_df,two_samples_result_df], axis = 0)
two_samples_result_tmp_df =  two_sample_t_test(mape, 'transformer', 'lstm')
two_samples_result_tmp_df['metric'] = 'mape'
two_samples_result_df = pd.concat([two_samples_result_tmp_df,two_samples_result_df], axis = 0)

P-value for normality test on lstm: 0.00325242685560506
P-value for normality test on transformer: 3.0207304457014745e-05
At least one of the groups is not normally distributed.
U-statistic: 234.0
P-value for the Mann-Whitney U test: 0.0010866250999942055
Reject the null hypothesis: The median of lstm is significantly greater than the median of transformer.
P-value for normality test on transformer: 3.0207304457014745e-05
P-value for normality test on lstm: 0.00325242685560506
At least one of the groups is not normally distributed.
U-statistic: 55.0
P-value for the Mann-Whitney U test: 0.9990321284774569
Fail to reject the null hypothesis: The median of transformer is not significantly greater than the median of lstm.


In [37]:
threshold = find_max_rejection_threshold(mape, 'lstm')
one_sample_result_tmp_df = t_test(mape,'lstm',threshold)
one_sample_result_tmp_df['metric'] = 'mape'
one_sample_result_df = pd.concat([one_sample_result_tmp_df,one_sample_result_df], axis = 0)
print("####################################")
threshold = find_max_rejection_threshold(mape, 'transformer')
one_sample_result_tmp_df = t_test(mape,'transformer',threshold)
one_sample_result_tmp_df['metric'] = 'mape'
one_sample_result_df = pd.concat([one_sample_result_tmp_df,one_sample_result_df], axis = 0)
one_sample_result_df

P-value for normality test on lstm: 0.00325242685560506
The maximum threshold where the null hypothesis is rejected: 1.6400000000000012
P-value for normality test on lstm: 0.00325242685560506
The series is not normally distributed.
Wilcoxon statistic: 113.0
P-value for the Wilcoxon signed-rank test: 0.0443267822265625
Reject the null hypothesis: The median is significantly greater than 1.6400000000000012.
####################################
P-value for normality test on transformer: 3.0207304457014745e-05
The maximum threshold where the null hypothesis is rejected: 0.4200000000000002
P-value for normality test on transformer: 3.0207304457014745e-05
The series is not normally distributed.
Wilcoxon statistic: 115.0
P-value for the Wilcoxon signed-rank test: 0.03570556640625
Reject the null hypothesis: The median is significantly greater than 0.4200000000000002.


Unnamed: 0,col,threshold,normality_test_p_value_col,is_normal,t_stat,p_value_ttest,wilcoxon_stat,p_value_wilcoxon,decision,metric
0,transformer,0.42,3.02073e-05,no,,,115.0,0.035706,Reject H0,mape
0,lstm,1.64,0.003252427,no,,,113.0,0.044327,Reject H0,mape
0,transformer,0.0,2.619036e-06,no,,,105.0,0.095047,Fail to reject H0,r2
0,lstm,0.0,2.295818e-05,no,,,40.0,0.960159,Fail to reject H0,r2
0,transformer,0.01,2.580873e-08,no,,,117.0,0.028442,Reject H0,mae
0,lstm,0.04,2.860175e-08,no,,,117.0,0.028442,Reject H0,mae
0,transformer,0.0,2.179448e-08,no,,,153.0,8e-06,Reject H0,mse
0,lstm,0.0,2.194804e-08,no,,,153.0,8e-06,Reject H0,mse
0,transformer,0.01,2.607814e-08,no,,,117.0,0.028442,Reject H0,rmse
0,lstm,0.05,2.85616e-08,no,,,112.0,0.049187,Reject H0,rmse


# aggregated output

In [38]:
two_samples_result_df.head(10)

Unnamed: 0,col1,col2,normality_test_p_value_col1,normality_test_p_value_col2,is_normal,p_value_equal_variance_levene_test,t_stat,p_value_ttest,u_stat,p_value_mannwhitney,decision,metric
0,transformer,lstm,3.02073e-05,0.003252427,no,,,,55.0,0.999032,Fail to reject H0,mape
0,lstm,transformer,0.003252427,3.02073e-05,no,,,,234.0,0.001087,Reject H0,mape
0,transformer,lstm,2.619036e-06,2.295818e-05,no,,,,237.0,0.000765,Reject H0,r2
0,lstm,transformer,2.295818e-05,2.619036e-06,no,,,,52.0,0.999321,Fail to reject H0,r2
0,transformer,lstm,2.580873e-08,2.860175e-08,no,,,,120.0,0.805405,Fail to reject H0,mae
0,lstm,transformer,2.860175e-08,2.580873e-08,no,,,,169.0,0.204219,Fail to reject H0,mae
0,transformer,lstm,2.179448e-08,2.194804e-08,no,,,,121.0,0.795781,Fail to reject H0,mse
0,lstm,transformer,2.194804e-08,2.179448e-08,no,,,,168.0,0.214122,Fail to reject H0,mse
0,transformer,lstm,2.607814e-08,2.85616e-08,no,,,,121.0,0.795781,Fail to reject H0,rmse
0,lstm,transformer,2.85616e-08,2.607814e-08,no,,,,168.0,0.214122,Fail to reject H0,rmse


In [39]:
two_samples_result_df_2 = two_samples_result_df.copy()

In [40]:
two_samples_result_df_2['H0'] = two_samples_result_df_2.apply(lambda x : f"{x['col2']}>={x['col1']}", axis = 1)
two_samples_result_df_2['Ha'] = two_samples_result_df_2.apply(lambda x : f"{x['col1']}>{x['col2']}", axis = 1)
columns = [ 'metric',  'H0', 'Ha','col1', 'col2','normality_test_p_value_col1',
       'normality_test_p_value_col2', 'is_normal',
       'p_value_equal_variance_levene_test', 't_stat', 'p_value_ttest',
       'u_stat', 'p_value_mannwhitney', 'decision', ]
two_samples_result_df_2 = two_samples_result_df_2[columns].sort_values(by = ['metric','H0'], ascending = [True,True])

In [41]:
two_samples_result_df_2

Unnamed: 0,metric,H0,Ha,col1,col2,normality_test_p_value_col1,normality_test_p_value_col2,is_normal,p_value_equal_variance_levene_test,t_stat,p_value_ttest,u_stat,p_value_mannwhitney,decision
0,mae,lstm>=transformer,transformer>lstm,transformer,lstm,2.580873e-08,2.860175e-08,no,,,,120.0,0.805405,Fail to reject H0
0,mae,transformer>=lstm,lstm>transformer,lstm,transformer,2.860175e-08,2.580873e-08,no,,,,169.0,0.204219,Fail to reject H0
0,mape,lstm>=transformer,transformer>lstm,transformer,lstm,3.02073e-05,0.003252427,no,,,,55.0,0.999032,Fail to reject H0
0,mape,transformer>=lstm,lstm>transformer,lstm,transformer,0.003252427,3.02073e-05,no,,,,234.0,0.001087,Reject H0
0,mse,lstm>=transformer,transformer>lstm,transformer,lstm,2.179448e-08,2.194804e-08,no,,,,121.0,0.795781,Fail to reject H0
0,mse,transformer>=lstm,lstm>transformer,lstm,transformer,2.194804e-08,2.179448e-08,no,,,,168.0,0.214122,Fail to reject H0
0,r2,lstm>=transformer,transformer>lstm,transformer,lstm,2.619036e-06,2.295818e-05,no,,,,237.0,0.000765,Reject H0
0,r2,transformer>=lstm,lstm>transformer,lstm,transformer,2.295818e-05,2.619036e-06,no,,,,52.0,0.999321,Fail to reject H0
0,rmse,lstm>=transformer,transformer>lstm,transformer,lstm,2.607814e-08,2.85616e-08,no,,,,121.0,0.795781,Fail to reject H0
0,rmse,transformer>=lstm,lstm>transformer,lstm,transformer,2.85616e-08,2.607814e-08,no,,,,168.0,0.214122,Fail to reject H0


In [42]:
one_sample_result_df

Unnamed: 0,col,threshold,normality_test_p_value_col,is_normal,t_stat,p_value_ttest,wilcoxon_stat,p_value_wilcoxon,decision,metric
0,transformer,0.42,3.02073e-05,no,,,115.0,0.035706,Reject H0,mape
0,lstm,1.64,0.003252427,no,,,113.0,0.044327,Reject H0,mape
0,transformer,0.0,2.619036e-06,no,,,105.0,0.095047,Fail to reject H0,r2
0,lstm,0.0,2.295818e-05,no,,,40.0,0.960159,Fail to reject H0,r2
0,transformer,0.01,2.580873e-08,no,,,117.0,0.028442,Reject H0,mae
0,lstm,0.04,2.860175e-08,no,,,117.0,0.028442,Reject H0,mae
0,transformer,0.0,2.179448e-08,no,,,153.0,8e-06,Reject H0,mse
0,lstm,0.0,2.194804e-08,no,,,153.0,8e-06,Reject H0,mse
0,transformer,0.01,2.607814e-08,no,,,117.0,0.028442,Reject H0,rmse
0,lstm,0.05,2.85616e-08,no,,,112.0,0.049187,Reject H0,rmse


In [43]:
one_sample_result_d_2 = one_sample_result_df.copy()
one_sample_result_d_2['H0'] = one_sample_result_d_2.apply(lambda x : f"{x['col']}<={x['threshold']}", axis = 1)
one_sample_result_d_2['Ha'] = one_sample_result_d_2.apply(lambda x : f"{x['col']}>{x['threshold']}", axis = 1)
columns = [ 'metric', 'threshold', 'H0', 'Ha', 'normality_test_p_value_col', 'is_normal','t_stat',
       'p_value_ttest', 'wilcoxon_stat', 'p_value_wilcoxon', 'decision']
one_sample_result_d_2 = one_sample_result_d_2[columns].sort_values(by = ['metric','H0'], ascending = [True,True])

In [44]:
all_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,metric
lstm,17.0,3.465049,3.124528,0.7300973,1.310618,1.97306,5.030924,11.64304,mape
transformer,17.0,1.33963,1.782404,0.1842723,0.3613209,0.592011,1.334242,5.887486,mape
lstm,17.0,-386.830966,736.192115,-2002.793,-121.527,-28.707829,18.297673,84.33154,r2
transformer,17.0,-27.33385,270.639074,-908.7786,42.4244,88.129582,93.356778,99.11051,r2
lstm,17.0,106.505498,417.874384,9.366834e-07,0.0009793475,0.092773,0.260962,1726.288,mae
transformer,17.0,79.009476,315.629415,4.333646e-08,0.0005137227,0.0255,0.047272,1303.288,mae
lstm,17.0,183968.919887,756746.029119,9.0554e-13,1.185542e-06,0.009093,0.07421,3120568.0,mse
transformer,17.0,109197.310058,449755.408111,2.977046e-15,3.120483e-07,0.000959,0.002705,1854504.0,mse
lstm,17.0,108.963447,427.611833,9.515986e-07,0.001088826,0.095359,0.272416,1766.513,rmse
transformer,17.0,82.705035,329.779489,5.456231e-08,0.0005586128,0.030971,0.052014,1361.802,rmse


In [45]:
all_stats_2 = all_stats.copy()
all_stats_2 = all_stats_2.reset_index()
all_stats_2['model'] = all_stats_2.apply(lambda x : x['index'], axis = 1)
columns = ['metric', 'model', 'count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']
all_stats_2 = all_stats_2[columns]

In [46]:
all_stats_2.to_csv("EXPLORATIVE_DATA_ANALYSIS/all_stats_models.csv", index = False)
two_samples_result_df_2.to_csv("EXPLORATIVE_DATA_ANALYSIS/two_samples_result_models.csv", index = False)
one_sample_result_d_2.to_csv("EXPLORATIVE_DATA_ANALYSIS/one_sample_result_models.csv", index = False)