### Load the data

In [6]:
import os
import mlflow
import numpy as np

os.environ["MLFLOW_TRACKING_URI"] = "http://localhost:5000"
mlflow.set_experiment('Training_LSTM')

<Experiment: artifact_location='mlflow-artifacts:/976886767624453903', creation_time=1698104820917, experiment_id='976886767624453903', last_update_time=1698104820917, lifecycle_stage='active', name='Training_LSTM', tags={}>

In [7]:
# Create sequences of data to be used for training
def create_sequences(data, sequence_length):
    sequences = []
    target = []
    for i in range(len(data) - sequence_length):
        sequences.append(data[i:i+sequence_length])
        target.append(data[i+sequence_length])
    return np.array(sequences), np.array(target)

### Create the Optuna Objective

In [28]:
import keras.backend as K

def mean_absolute_percentage_error(y_true, y_pred):
    diff = K.abs((y_true - y_pred) / K.clip(K.abs(y_true), K.epsilon(), None))
    return 100.0 * K.mean(diff, axis=-1)

In [31]:
import optuna
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
import mlflow.keras
from sklearn.preprocessing import MinMaxScaler

def objective(trial, data):
    with mlflow.start_run() as run:
        mlflow.keras.autolog(log_models=False)

        # Define the search space for hyperparameters
        num_layers = trial.suggest_int('num_layers', 1, 4)
        units_per_layer = [trial.suggest_int(f'units_layer_{i}', 32, 256, 32) for i in range(num_layers)]
        sequence_length = trial.suggest_int('sequence_length', 5, 20)
        learning_rate = trial.suggest_float('learning_rate', 1e-6, 1e-3)
        dropout_rate = trial.suggest_float('dropout_rate', 0.0, 0.5)
        min_max_scaling = trial.suggest_int('min_max_scaling', 0, 1)

        mlflow.log_params({
            'num_layers': num_layers,
            'units_per_layer': units_per_layer,
            'sequence_length': sequence_length,
            'learning_rate': learning_rate,
            'dropout_rate': dropout_rate,
            'min_max_scaling': min_max_scaling
        })

        if min_max_scaling == 1:
            scaler = MinMaxScaler()
            data = scaler.fit_transform(np.array(data))

        X, y = create_sequences(data, sequence_length)

        # Build and compile the LSTM model
        model = keras.Sequential()
        for units in units_per_layer:
            model.add(keras.layers.LSTM(units, activation='relu', return_sequences=True, input_shape=(sequence_length, 1)))
            model.add(keras.layers.Dropout(dropout_rate))
        model.add(keras.layers.Dense(1))

        optimizer = keras.optimizers.Adam(learning_rate=learning_rate, clipvalue=1.0)
        model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=[mean_absolute_percentage_error])
        # Split the data into training and validation sets
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=False)

        # Define an EarlyStopping callback for preemptive pruning
        early_stopping = EarlyStopping(monitor='val_mean_absolute_percentage_error', patience=10)  # You can adjust the patience value

        # Train the model with early stopping
        history = model.fit(
            X_train,
            y_train,
            epochs=100,
            batch_size=32,
            validation_data=(X_val, y_val), 
            verbose=1,
            callbacks=[early_stopping]
        )

        # Evaluate the model on the validation set
        loss = model.evaluate(X_val, y_val)

        return loss

### Run the optimization

In [33]:
import numpy as np
from common import get_dataframe

# Normalize the data to the range [0, 1] to help the LSTM model converge faster

df = get_dataframe()

for coin in df.iloc[:, 1:]:
    data = np.array(df[coin]).reshape(-1, 1)

    # Create an Optuna study
    study = optuna.create_study(study_name=coin, direction='minimize')

    # Start the optimization process
    study.optimize(lambda trial: objective(trial, data), n_trials=100)

    # Get the best hyperparameters
    best_params = study.best_params
    print("Best Hyperparameters:", best_params)

[I 2023-10-24 01:56:13,214] A new study created in memory with name: close_NULSUSDT
<html lang=en>
<title>404 Not Found</title>
<h1>Not Found</h1>
<p>The requested URL was not found on the server. If you entered the URL manually please check your spelling and try again.</p>
'


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100


[W 2023-10-24 01:56:49,251] Trial 0 failed with parameters: {'num_layers': 1, 'units_layer_0': 192, 'sequence_length': 16, 'learning_rate': 0.0005126656520215088, 'dropout_rate': 0.0767748130030077, 'min_max_scaling': 1} because of the following error: The number of the values 2 did not match the number of the objectives 1.
[W 2023-10-24 01:56:49,251] Trial 0 failed with value [0.0005226924549788237, 29.448570251464844].
<html lang=en>
<title>404 Not Found</title>
<h1>Not Found</h1>
<p>The requested URL was not found on the server. If you entered the URL manually please check your spelling and try again.</p>
'


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

[W 2023-10-24 01:57:45,171] Trial 1 failed with parameters: {'num_layers': 1, 'units_layer_0': 256, 'sequence_length': 8, 'learning_rate': 0.0004220188479990986, 'dropout_rate': 0.3407961854115491, 'min_max_scaling': 0} because of the following error: The number of the values 2 did not match the number of the objectives 1.
[W 2023-10-24 01:57:45,172] Trial 1 failed with value [0.00031987534021027386, 6.3033246994018555].
<html lang=en>
<title>404 Not Found</title>
<h1>Not Found</h1>
<p>The requested URL was not found on the server. If you entered the URL manually please check your spelling and try again.</p>
'


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
 7/44 [===>..........................] - ETA: 2s - loss: 0.0095 - mean_absolute_percentage_error: 36.3655

### Load the best model configuration and train it

In [32]:

# Train the final model with the best hyperparameters
# Define the search space for hyperparameters
num_layers = best_params['num_layers']
units_per_layer = [best_params[f'units_layer_{i}'] for i in range(num_layers)]
sequence_length = best_params['sequence_length']
learning_rate = best_params['learning_rate']
dropout_rate = best_params['dropout_rate']

X, y = create_sequences(data, sequence_length)
_X_train, X_val, _y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=False)
X_train, X_test, y_train, y_test = train_test_split(_X_train, _y_train, test_size=0.2, shuffle=False)

# Build and compile the LSTM model
best_model = keras.Sequential()
for units in units_per_layer:
    best_model.add(keras.layers.LSTM(units, activation='relu', return_sequences=True, input_shape=(sequence_length, 1)))
    best_model.add(keras.layers.Dropout(dropout_rate))
best_model.add(keras.layers.LSTM(units_per_layer[-1], activation='relu'))
best_model.add(keras.layers.Dropout(dropout_rate))
best_model.add(keras.layers.Dense(1))

optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
best_model.compile(optimizer=optimizer, loss='mean_squared_error')

best_model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_val, y_val))

NameError: name 'best_params' is not defined

In [None]:
from common import register_training_experiment
preds = best_model.predict([X_test])
register_training_experiment(y_test, preds)

#### Multivariate Time Series Forecasting with Deep Learning

[Source 1](https://towardsdatascience.com/multivariate-time-series-forecasting-with-deep-learning-3e7b3e2d2bcf) \
[Source 2](https://machinelearningmastery.com/multivariate-time-series-forecasting-lstms-keras/)

In [None]:
from common import get_clustered_dataframes

clusters_data = get_clustered_dataframes()

In [None]:
clusters_data.head()

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

def get_train_test_data(cluster_data, n_steps=10, test_size=0.2, shuffle=False):

    data = cluster_data.copy() 

    data['Date'] = pd.to_datetime(data['Date'])  # Convert the 'date' column to datetime

    # Sort the data by date
    data = data.sort_values(by='Date')

    # Create input data by shifting prices to create sequences
    X = data.drop(columns=['Date']).values
    X_seq = [X[i:i + n_steps] for i in range(len(X) - n_steps)]

    # Shift the closing price to predict the next closing price
    y = data.drop(columns=['Date']).shift(-n_steps).values

    # Split the data into training and testing sets
    _X_train, X_test, _y_train, y_test = train_test_split(X_seq, y[:-n_steps], test_size=0.2, shuffle=shuffle)

    # split the training set into training and validation sets
    X_train, X_valid, y_train, y_valid = train_test_split(_X_train, _y_train, test_size=0.2, shuffle=shuffle)

    # Reshape the data to 3D for LSTM
    X_train = np.array(X_train)
    X_test = np.array(X_test)
    X_valid = np.array(X_valid)
    y_valid = np.array(y_valid)
    y_train = np.array(y_train)
    y_test = np.array(y_test)

    print('X_train shape:', X_train.shape)
    print('y_train shape:', y_train.shape)
    print('X_valid shape:', X_valid.shape)
    print('y_valid shape:', y_valid.shape)
    print('X_test shape:', X_test.shape)
    print('y_test shape:', y_test.shape)
    print("\n\n")

    return X_train, X_valid, X_test, y_train, y_valid, y_test

In [None]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
import mlflow


def calculate_rmse(y_true, y_pred):
    return sqrt(mean_squared_error(y_true, y_pred))


def build_model(n_steps, n_features):
    return tf.keras.Sequential([
        tf.keras.layers.LSTM(64, activation='relu', input_shape=(n_steps, n_features), return_sequences=True),
        tf.keras.layers.LSTM(64, activation='relu'),
        tf.keras.layers.Dense(n_features),
        tf.keras.layers.Dense(n_features)
    ])


def lstm_training(cluster_data, n_steps=10, test_size=0.2, shuffle=False):

    data = cluster_data.copy()
    X_train, X_valid, X_test, y_train, y_valid, y_test = get_train_test_data(data, n_steps, test_size, shuffle)
    
    model = build_model(n_steps, X_train.shape[2])
    model.compile(optimizer='adam', loss='mse')
    model.fit(X_train, y_train, epochs=250, batch_size=32, validation_data=(X_valid, y_valid))

    # Make predictions
    y_pred = model.predict(X_test)

    # Calculate and print RMSE
    for i, cripto in enumerate(data.columns[:-1]):
        rmse = calculate_rmse(y_test[:, i], y_pred[:, i])
        print(f'Root Mean Squared Error (RMSE) for {cripto}: {rmse:.4f}')

In [None]:
for cluster, cripto in clusters_data.items():
    print(f'Cluster {cluster}: {cripto.columns[:-1]}\n')

    lstm_training(cripto, n_steps=2)

    print("\n---------------------------------\n")