## **Import all necessary packages, and mount drive**

In [186]:
from google.colab import drive
drive.mount('/content/drive')

import sys
sys.path.append('/content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction')

import os
import joblib
import random
import import_ipynb

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from tabulate import tabulate

# Bokeh Visualization
from bokeh.plotting import figure, show, output_notebook
from bokeh.layouts import column, row
from bokeh.palettes import Category10
from bokeh.models import ColumnDataSource, HoverTool, Legend

# TensorFlow and Keras
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import (
    LSTM,
    Dense,
    Dropout,
    Input,
    BatchNormalization,
    Bidirectional
)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import (
    EarlyStopping,
    ModelCheckpoint,
    ReduceLROnPlateau
)
from tensorflow.keras.regularizers import L1L2, l2
from tensorflow.keras.metrics import (
    MeanAbsolutePercentageError,
    RootMeanSquaredError,
    MeanAbsoluteError,
    MeanSquaredError
)

# Scikit-learn Metrics
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    mean_absolute_percentage_error,
    mean_squared_error,
    root_mean_squared_error
)

# Project-specific Configuration
from config import *

# Enable Bokeh output in notebooks
output_notebook()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [187]:
# Global seed shared by every random number generator used in this notebook.
seed = 0
os.environ['PYTHONHASHSEED'] = str(seed)
# TF_DETERMINISTIC_OPS is an on/off switch, not a seed: it has to be "1" to
# request deterministic GPU kernels. Writing str(seed) here produced "0",
# which leaves determinism switched off.
os.environ["TF_DETERMINISTIC_OPS"] = "1"
np.random.seed(seed)
random.seed(seed)
tf.random.set_seed(seed)

In [188]:
def load_datasets(data_path, dataset_name):
    """Load the train/val/test CSV splits and their fitted scalers from disk.

    Args:
        data_path: Directory that contains the processed files.
        dataset_name: File-name prefix. The function reads
            ``{dataset_name}_train.csv``, ``{dataset_name}_val.csv``,
            ``{dataset_name}_test.csv``, ``{dataset_name}_features_scaler.joblib``
            and ``{dataset_name}_target_scaler.joblib``.

    Returns:
        Tuple ``(train_data, val_data, test_data, features_scaler,
        target_scaler)``. Each frame uses the first CSV column as its index;
        a non-numeric index that parses as dates is converted to a UTC
        ``DatetimeIndex``, anything else is left as read.
    """
    frames = []
    for split in ("train", "val", "test"):
        frame = pd.read_csv(f"{data_path}/{dataset_name}_{split}.csv", index_col=0)

        # Numeric indexes are left alone: pd.to_datetime would silently
        # reinterpret plain row numbers as epoch offsets.
        if not pd.api.types.is_numeric_dtype(frame.index):
            try:
                frame.index = pd.to_datetime(frame.index, utc=True)
            except (ValueError, TypeError):
                # Best effort only: keep the raw labels when they are not dates.
                pass

        frames.append(frame)

    train_data, val_data, test_data = frames

    features_scaler = joblib.load(f"{data_path}/{dataset_name}_features_scaler.joblib")
    target_scaler = joblib.load(f"{data_path}/{dataset_name}_target_scaler.joblib")

    return train_data, val_data, test_data, features_scaler, target_scaler

In [None]:
# Normal data read from the minmax_split_first directory
train_data, val_data, test_data, features_scaler, target_scaler = load_datasets(
    f"{PROCESSED_DATA_PATH}/minmax_split_first", "normal_data"
)

# Normal data read from the minmax_split_last directory
train_data_last, val_data_last, test_data_last, features_scaler_last, target_scaler_last = load_datasets(
    f"{PROCESSED_DATA_PATH}/minmax_split_last", "normal_data"
)

# Log-transformed data read from the minmax_split_first directory
train_log_data, val_log_data, test_log_data, features_log_scaler, target_log_scaler = load_datasets(
    f"{PROCESSED_DATA_PATH}/minmax_split_first", "log_data"
)

# Log-transformed data read from the minmax_split_last directory.
# The doubled "_last_last" suffix is kept because the sequence-building cell
# reads these exact variable names.
train_log_data_last_last, val_log_data_last_last, test_log_data_last_last, features_log_scaler_last, target_log_scaler_last = load_datasets(
    f"{PROCESSED_DATA_PATH}/minmax_split_last", "log_data"
)

In [250]:
# Custom-normalized data read from the custom_split_first directory
train_data_custom, val_data_custom, test_data_custom, custom_features_scaler, custom_target_scaler = load_datasets(
    f"{PROCESSED_DATA_PATH}/custom_split_first", "normal_data"
)

# Custom-normalized data read from the custom_split_last directory.
# NOTE(review): this is the only call that uses the prefix "normal_data_last";
# every other directory is read with "normal_data". Confirm the files in
# custom_split_last really carry that name.
train_data_custom_last, val_data_custom_last, test_data_custom_last, custom_features_scaler_last, custom_target_scaler_last = load_datasets(
    f"{PROCESSED_DATA_PATH}/custom_split_last", "normal_data_last"
)

# **Build the train, val, and test sequences using create_sequences()**

In [199]:
def create_sequences(data, feature_cols, target_cols, sequence_length, save_path=None, save_name=None):
    """Slice a frame into fixed-length feature windows with one target row each.

    For every start offset ``i`` in ``range(len(data) - sequence_length)``:
    ``X[i]`` holds the feature rows ``i .. i + sequence_length - 1``, ``y[i]``
    is the target row at ``i + sequence_length - 1`` (the last row of the
    window) and ``dates[i]`` is the index label at ``i + sequence_length``
    (the row right after the window).
    NOTE(review): target and date come from adjacent rows; presumably the
    target column already stores the following step's value - confirm.

    Args:
        data: DataFrame containing the feature and target columns.
        feature_cols: List of column names used as model inputs.
        target_cols: List of column names used as model outputs.
        sequence_length: Number of consecutive rows per window.
        save_path: Optional directory for the ``.npy`` files.
        save_name: Optional file-name prefix; arrays are written only when
            both ``save_path`` and ``save_name`` are truthy.

    Returns:
        ``(X, y, dates)`` with shapes ``(n, sequence_length, n_features)``,
        ``(n, n_targets)`` and ``(n,)`` (object dtype).
    """
    feature_matrix = data[feature_cols].values
    target_matrix = data[target_cols].values
    window_count = len(data) - sequence_length

    X = np.zeros((window_count, sequence_length, len(feature_cols)))
    y = np.zeros((window_count, len(target_cols)))
    dates = np.empty(window_count, dtype=object)

    for start in range(window_count):
        stop = start + sequence_length
        X[start] = feature_matrix[start:stop]
        y[start] = target_matrix[stop - 1]
        dates[start] = data.index[stop]

    if save_path and save_name:
        os.makedirs(save_path, exist_ok=True)
        for suffix, array in (("X", X), ("y", y), ("dates", dates)):
            np.save(f"{save_path}/{save_name}_{suffix}.npy", array)

    return X, y, dates

def load_sequences(path, name):
    """Read back the three arrays written by ``create_sequences``.

    Args:
        path: Directory holding the ``.npy`` files.
        name: File-name prefix; ``{name}_X.npy``, ``{name}_y.npy`` and
            ``{name}_dates.npy`` are loaded.

    Returns:
        ``(X, y, dates)``. ``dates`` is loaded with ``allow_pickle=True``,
        so only load files from a trusted location.
    """
    prefix = f"{path}/{name}"
    return (
        np.load(f"{prefix}_X.npy"),
        np.load(f"{prefix}_y.npy"),
        np.load(f"{prefix}_dates.npy", allow_pickle=True),
    )

In [251]:
# Window parameters shared by all four min-max datasets below
feature_cols = ['Close']
target_cols = ['Target']
sequence_length = 21

# 1. minmax_split_first, normal data
X_train, y_train, train_dates = create_sequences(train_data, feature_cols, target_cols, sequence_length)
X_val, y_val, val_dates = create_sequences(val_data, feature_cols, target_cols, sequence_length)
X_test, y_test, test_dates = create_sequences(test_data, feature_cols, target_cols, sequence_length)

# 2. minmax_split_last, normal data
X_train_last, y_train_last, train_dates_last = create_sequences(train_data_last, feature_cols, target_cols, sequence_length)
X_val_last, y_val_last, val_dates_last = create_sequences(val_data_last, feature_cols, target_cols, sequence_length)
X_test_last, y_test_last, test_dates_last = create_sequences(test_data_last, feature_cols, target_cols, sequence_length)

# 3. minmax_split_first, log data
X_train_log, y_train_log, train_dates_log = create_sequences(train_log_data, feature_cols, target_cols, sequence_length)
X_val_log, y_val_log, val_dates_log = create_sequences(val_log_data, feature_cols, target_cols, sequence_length)
X_test_log, y_test_log, test_dates_log = create_sequences(test_log_data, feature_cols, target_cols, sequence_length)

# 4. minmax_split_last, log data
X_train_log_last, y_train_log_last, train_dates_log_last = create_sequences(train_log_data_last_last, feature_cols, target_cols, sequence_length)
X_val_log_last, y_val_log_last, val_dates_log_last = create_sequences(val_log_data_last_last, feature_cols, target_cols, sequence_length)
X_test_log_last, y_test_log_last, test_dates_log_last = create_sequences(test_log_data_last_last, feature_cols, target_cols, sequence_length)

# Report the resulting array shapes as a quick sanity check
print("Normal Data, Split First:")
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_val shape: {X_val.shape}, y_val shape: {y_val.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

print("\nNormal Data, Split Last:")
print(f"X_train_last shape: {X_train_last.shape}, y_train_last shape: {y_train_last.shape}")
print(f"X_val_last shape: {X_val_last.shape}, y_val_last shape: {y_val_last.shape}")
print(f"X_test_last shape: {X_test_last.shape}, y_test_last shape: {y_test_last.shape}")

print("\nLog Data, Split First:")
print(f"X_train_log shape: {X_train_log.shape}, y_train_log shape: {y_train_log.shape}")
print(f"X_val_log shape: {X_val_log.shape}, y_val_log shape: {y_val_log.shape}")
print(f"X_test_log shape: {X_test_log.shape}, y_test_log shape: {y_test_log.shape}")

print("\nLog Data, Split Last:")
print(f"X_train_log_last shape: {X_train_log_last.shape}, y_train_log_last shape: {y_train_log_last.shape}")
print(f"X_val_log_last shape: {X_val_log_last.shape}, y_val_log_last shape: {y_val_log_last.shape}")
print(f"X_test_log_last shape: {X_test_log_last.shape}, y_test_log_last shape: {y_test_log_last.shape}")

Normal Data, Split First:
X_train shape: (17034, 21, 1), y_train shape: (17034, 1)
X_val shape: (3634, 21, 1), y_val shape: (3634, 1)
X_test shape: (3634, 21, 1), y_test shape: (3634, 1)

Normal Data, Split Last:
X_train_last shape: (17034, 21, 1), y_train_last shape: (17034, 1)
X_val_last shape: (3634, 21, 1), y_val_last shape: (3634, 1)
X_test_last shape: (3634, 21, 1), y_test_last shape: (3634, 1)

Log Data, Split First:
X_train_log shape: (17034, 21, 1), y_train_log shape: (17034, 1)
X_val_log shape: (3634, 21, 1), y_val_log shape: (3634, 1)
X_test_log shape: (3634, 21, 1), y_test_log shape: (3634, 1)

Log Data, Split Last:
X_train_log_last shape: (17034, 21, 1), y_train_log_last shape: (17034, 1)
X_val_log_last shape: (3634, 21, 1), y_val_log_last shape: (3634, 1)
X_test_log_last shape: (3634, 21, 1), y_test_log_last shape: (3634, 1)


In [252]:
# Columns used for the custom-normalized datasets
feature_cols_custom = ['Custom_Normalized']
target_cols_custom = ['Target']

# 1. Create sequences for custom_split_first normal data
X_train_custom, y_train_custom, train_dates_custom = create_sequences(
    data=train_data_custom,
    feature_cols=feature_cols_custom,
    target_cols=target_cols_custom,
    sequence_length=sequence_length,
)

X_val_custom, y_val_custom, val_dates_custom = create_sequences(
    data=val_data_custom,
    feature_cols=feature_cols_custom,
    target_cols=target_cols_custom,
    sequence_length=sequence_length,
)

X_test_custom, y_test_custom, test_dates_custom = create_sequences(
    data=test_data_custom,
    feature_cols=feature_cols_custom,
    target_cols=target_cols_custom,
    sequence_length=sequence_length,
)

# 2. Create sequences for custom_split_last normal data
X_train_custom_last, y_train_custom_last, train_dates_custom_last = create_sequences(
    data=train_data_custom_last,
    feature_cols=feature_cols_custom,
    target_cols=target_cols_custom,
    sequence_length=sequence_length,
)

# The date arrays get their own "_last" names so they no longer overwrite
# val_dates_custom / test_dates_custom produced from the split-first data.
X_val_custom_last, y_val_custom_last, val_dates_custom_last = create_sequences(
    data=val_data_custom_last,
    feature_cols=feature_cols_custom,
    target_cols=target_cols_custom,
    sequence_length=sequence_length,
)

X_test_custom_last, y_test_custom_last, test_dates_custom_last = create_sequences(
    data=test_data_custom_last,
    feature_cols=feature_cols_custom,
    target_cols=target_cols_custom,
    sequence_length=sequence_length,
)

# **Create the LSTM model and callbacks (EarlyStop, Reduce Learning Rate, Save best model)**

In [205]:
def create_lstm_model(input_shape, units, dropout_rate=0.01):
    """Build and compile a single-layer LSTM regressor.

    The network is ``Input -> LSTM(units, relu) -> Dense(1)``, compiled with
    a default-configured Adam optimizer and mean-squared-error loss.

    Args:
        input_shape: Shape of one sample, passed to ``Input(shape=...)``.
        units: Number of LSTM units.
        dropout_rate: Accepted for call compatibility; the body does not add
            any dropout, so this value currently has no effect.

    Returns:
        The compiled ``Sequential`` model.
    """
    network = Sequential()
    network.add(Input(shape=input_shape))
    network.add(LSTM(units, activation='relu'))
    network.add(Dense(1))

    network.compile(optimizer=Adam(), loss='mse')
    return network

In [206]:
def get_callbacks(model_name, base_path=CHECKPOINTS_PATH):
    """Assemble the training callbacks, all monitoring ``val_loss``.

    Args:
        model_name: Checkpoint file stem; the model is written to
            ``{base_path}/{model_name}.keras``.
        base_path: Directory for the checkpoint file.

    Returns:
        List ``[ModelCheckpoint, EarlyStopping, ReduceLROnPlateau]``.
    """
    # Keep only the best full model (not just weights) on disk
    best_model_saver = ModelCheckpoint(
        filepath=os.path.join(base_path, f'{model_name}.keras'),
        monitor='val_loss',
        mode='min',
        save_best_only=True,
        save_weights_only=False,
        verbose=1
    )

    # Stop after 7 epochs without improvement and roll back to the best weights
    early_stopper = EarlyStopping(
        monitor='val_loss',
        patience=7,
        restore_best_weights=True,
        verbose=1
    )

    # Halve the learning rate after 5 stagnant epochs, never below 1e-7
    lr_reducer = ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=5,
        min_lr=1e-7,
        verbose=1,
        mode='min'
    )

    return [best_model_saver, early_stopper, lr_reducer]

## **Create model**

In [207]:
# Training hyperparameters shared by the model-building and fit cells below
units = 128  # LSTM width passed to create_lstm_model; also part of checkpoint file names
batch_size = 32  # mini-batch size passed to model.fit
epochs = 100  # upper bound on epochs passed to model.fit
dropout_rate = 0.01  # forwarded to create_lstm_model

In [None]:
# Get input shape from training data
input_shape = X_train.shape[1:]
# Sequence length used in the checkpoint file names. It is defined here
# because the model-name f-string below reads it; previously its only active
# assignment lived in a later cell, so this cell failed on a fresh kernel.
SEQ_LENGTH = input_shape[0]
print(f"Using input shape: {input_shape}")

# 1. Train model on normal data, split first
# model_normal_first = create_lstm_model(input_shape, units, dropout_rate)
# NORM_TYPE = "minmax_split_first"
# SAVED_MODEL_NAME = f"{units}_{NORM_TYPE}_sequence{SEQ_LENGTH}"
# callbacks = get_callbacks(SAVED_MODEL_NAME)

# history_normal_first = model_normal_first.fit(
#     X_train, y_train,
#     validation_data=(X_val, y_val),
#     epochs=epochs,
#     batch_size=batch_size,
#     callbacks=callbacks,
#     verbose=0
# )

# 2. Train model on normal data, split last
# model_normal_last = create_lstm_model(input_shape, units, dropout_rate)
# NORM_TYPE = "minmax_split_last"
# SAVED_MODEL_NAME = f"{units}_{NORM_TYPE}_sequence{SEQ_LENGTH}"
# callbacks = get_callbacks(SAVED_MODEL_NAME)

# history_normal_last = model_normal_last.fit(
#     X_train_last, y_train_last,
#     validation_data=(X_val_last, y_val_last),
#     epochs=epochs,
#     batch_size=batch_size,
#     callbacks=callbacks,
#     verbose=0
# )

# 3. Train model on log data, split first
# Take the sample shape straight from the log data (the earlier conditional
# expression always evaluated to exactly this value).
log_input_shape = X_train_log.shape[1:]
model_log_first = create_lstm_model(log_input_shape, units, dropout_rate)
NORM_TYPE = "log_minmax_split_first"
SAVED_MODEL_NAME = f"{units}_{NORM_TYPE}_sequence{SEQ_LENGTH}_v4"
callbacks = get_callbacks(SAVED_MODEL_NAME)

history_log_first = model_log_first.fit(
    X_train_log, y_train_log,
    validation_data=(X_val_log, y_val_log),
    epochs=epochs,
    batch_size=batch_size,
    callbacks=callbacks,
    verbose=0
)

# 4. Train model on log data, split last
# log_last_input_shape = X_train_log_last.shape[1:]
# model_log_last = create_lstm_model(log_last_input_shape, units, dropout_rate)
# NORM_TYPE = "log_minmax_split_last"
# SAVED_MODEL_NAME = f"{units}_{NORM_TYPE}_sequence{SEQ_LENGTH}"
# callbacks = get_callbacks(SAVED_MODEL_NAME)

# history_log_last = model_log_last.fit(
#     X_train_log_last, y_train_log_last,
#     validation_data=(X_val_log_last, y_val_log_last),
#     epochs=epochs,
#     batch_size=batch_size,
#     callbacks=callbacks,
#     verbose=0
# )

# Store histories in a dictionary for comparison
histories = {
    # 'normal_first': history_normal_first.history,
    # 'normal_last': history_normal_last.history,
    'log_first': history_log_first.history,
    # 'log_last': history_log_last.history
}

print("All models trained and histories saved.")

Using input shape: (21, 1)

Epoch 1: val_loss improved from inf to 0.00034, saving model to /content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction/models/lstm/models/checkpoints/128_log_minmax_split_first_sequence21_v4.keras

Epoch 2: val_loss did not improve from 0.00034

Epoch 3: val_loss improved from 0.00034 to 0.00027, saving model to /content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction/models/lstm/models/checkpoints/128_log_minmax_split_first_sequence21_v4.keras

Epoch 4: val_loss improved from 0.00027 to 0.00014, saving model to /content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction/models/lstm/models/checkpoints/128_log_minmax_split_first_sequence21_v4.keras

Epoch 5: val_loss improved from 0.00014 to 0.00009, saving model to /content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction/models/lstm/models/checkpoints/128_log_minmax_split_first_sequence21_v4.keras

Epoch 6: val_loss improved from 0.00009 to 0.00006, saving model to /content/drive/MyDri

In [208]:
# 1. Train model on custom_split_first normal data
# input_shape is the value computed from X_train in the earlier training cell.
model_custom_first = create_lstm_model(input_shape, units, dropout_rate)
NORM_TYPE = "custom_split_first"
SEQ_LENGTH = input_shape[0]
# Checkpoint file stem: "<units>_<norm type>_sequence<length>"
SAVED_MODEL_NAME = f"{units}_{NORM_TYPE}_sequence{SEQ_LENGTH}"
callbacks = get_callbacks(SAVED_MODEL_NAME)

history_custom_first = model_custom_first.fit(
    X_train_custom, y_train_custom,
    validation_data=(X_val_custom, y_val_custom),
    epochs=epochs,
    batch_size=batch_size,
    callbacks=callbacks,
    verbose=0
)

# 2. Train model on custom_split_last normal data
model_custom_last = create_lstm_model(input_shape, units, dropout_rate)
NORM_TYPE = "custom_split_last"
SEQ_LENGTH = input_shape[0]
SAVED_MODEL_NAME = f"{units}_{NORM_TYPE}_sequence{SEQ_LENGTH}"
# Fresh callbacks so the checkpoint goes to the split-last file name
callbacks = get_callbacks(SAVED_MODEL_NAME)

history_custom_last = model_custom_last.fit(
    X_train_custom_last, y_train_custom_last,
    validation_data=(X_val_custom_last, y_val_custom_last),
    epochs=epochs,
    batch_size=batch_size,
    callbacks=callbacks,
    verbose=0
)

# NOTE: this rebinds `histories`, replacing any dict assigned by an earlier cell.
histories = {
    'custom_first': history_custom_first.history,
    'custom_last': history_custom_last.history
}


Epoch 1: val_loss improved from inf to 0.00011, saving model to /content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction/models/lstm/models/checkpoints/128_custom_split_first_sequence21.keras

Epoch 2: val_loss improved from 0.00011 to 0.00010, saving model to /content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction/models/lstm/models/checkpoints/128_custom_split_first_sequence21.keras

Epoch 3: val_loss improved from 0.00010 to 0.00010, saving model to /content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction/models/lstm/models/checkpoints/128_custom_split_first_sequence21.keras

Epoch 4: val_loss improved from 0.00010 to 0.00010, saving model to /content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction/models/lstm/models/checkpoints/128_custom_split_first_sequence21.keras

Epoch 5: val_loss did not improve from 0.00010

Epoch 6: val_loss did not improve from 0.00010

Epoch 6: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.

Epoch 7: val_loss 

## **Make predictions**

In [None]:
# Reload the four min-max models from their checkpoint files. The file names
# are built from `units` and `SEQ_LENGTH`, so both must already be defined.
model_minmax_first = load_model(f"{CHECKPOINTS_PATH}/{units}_minmax_split_first_sequence{SEQ_LENGTH}.keras")
model_minmax_last = load_model(f"{CHECKPOINTS_PATH}/{units}_minmax_split_last_sequence{SEQ_LENGTH}.keras")
# Only the log/split-first checkpoint carries the "_v4" suffix
model_minmax_log_first = load_model(f"{CHECKPOINTS_PATH}/{units}_log_minmax_split_first_sequence{SEQ_LENGTH}_v4.keras")
model_minmax_log_last = load_model(f"{CHECKPOINTS_PATH}/{units}_log_minmax_split_last_sequence{SEQ_LENGTH}.keras")

In [None]:
# Predict on each model's own test sequences (outputs are still in scaled units)
y_pred_minmax_first = model_minmax_first.predict(X_test)
y_pred_minmax_last = model_minmax_last.predict(X_test_last)
y_pred_minmax_log_first = model_minmax_log_first.predict(X_test_log)
y_pred_minmax_log_last = model_minmax_log_last.predict(X_test_log_last)


[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step


In [None]:
# Undo the target scaling for predictions and ground truth (split-first scaler)
y_pred_minmax_first_real = inverse_transform_simple(y_pred_minmax_first, target_scaler)
y_test_minmax_first_real = inverse_transform_simple(y_test, target_scaler)

metrics = evaluate_predictions(y_test_minmax_first_real, y_pred_minmax_first_real,should_print=True)

# The training run that creates history_normal_first is commented out in the
# training cell, so the History object only exists when that run was executed
# in this session. Skip the loss plot instead of raising NameError.
if "history_normal_first" in globals():
    plot_training_history(history_normal_first)
plot_predictions_bokeh(y_test_minmax_first_real, y_pred_minmax_first_real)


Model Performance Metrics:
--------------------------------------------------
MAPE: 2.4202%
RMSE: 125.07848129
MSE: 15644.62648202
MAE: 85.57895917
MPD (Maximum Percentage Deviation): 17.3505%

Point of Maximum Deviation (Index 2428):
--------------------------------------------------
True Value: 2386.129883
Predicted Value: 2800.135010
Absolute Difference: 414.005127
Percentage Deviation: 17.35%


In [None]:
# Undo the target scaling for predictions and ground truth (split-last scaler)
y_pred_minmax_last_real = inverse_transform_simple(y_pred_minmax_last, target_scaler_last)
y_test_minmax_last_real = inverse_transform_simple(y_test_last, target_scaler_last)

metrics = evaluate_predictions(y_test_minmax_last_real, y_pred_minmax_last_real,should_print=True)

# The training run that creates history_normal_last is commented out in the
# training cell, so the History object only exists when that run was executed
# in this session. Skip the loss plot instead of raising NameError.
if "history_normal_last" in globals():
    plot_training_history(history_normal_last)
plot_predictions_bokeh(y_test_minmax_last_real, y_pred_minmax_last_real)


Model Performance Metrics:
--------------------------------------------------
MAPE: 1.4784%
RMSE: 50.88033784
MSE: 2588.80877903
MAE: 40.01706351
MPD (Maximum Percentage Deviation): 13.2621%

Point of Maximum Deviation (Index 2426):
--------------------------------------------------
True Value: 2480.639893
Predicted Value: 2809.625488
Absolute Difference: 328.985596
Percentage Deviation: 13.26%


In [None]:
# Undo the target scaling, then exponentiate (log_scaled=True) for the
# log/split-first model's predictions and ground truth
y_pred_minmax_log_first_real = inverse_transform_simple(y_pred_minmax_log_first, target_log_scaler,log_scaled=True)
y_test_minmax_log_first_real = inverse_transform_simple(y_test_log, target_log_scaler,log_scaled=True)

# Print the error metrics and keep them in `metrics`
metrics = evaluate_predictions(y_test_minmax_log_first_real, y_pred_minmax_log_first_real,should_print=True)

# history_log_first comes from the fit call in the training cell
plot_training_history(history_log_first)
plot_predictions_bokeh(y_test_minmax_log_first_real, y_pred_minmax_log_first_real)


Model Performance Metrics:
--------------------------------------------------
MAPE: 0.7566%
RMSE: 33.97877636
MSE: 1154.55724273
MAE: 21.33683482
MPD (Maximum Percentage Deviation): 12.1454%

Point of Maximum Deviation (Index 2426):
--------------------------------------------------
True Value: 2480.639893
Predicted Value: 2781.924316
Absolute Difference: 301.284424
Percentage Deviation: 12.15%


In [None]:
# Undo the target scaling, then exponentiate (log_scaled=True) for the
# log/split-last model's predictions and ground truth
y_pred_minmax_log_last_real = inverse_transform_simple(y_pred_minmax_log_last, target_log_scaler_last,log_scaled=True)
y_test_minmax_log_last_real = inverse_transform_simple(y_test_log_last, target_log_scaler_last,log_scaled=True)

metrics = evaluate_predictions(y_test_minmax_log_last_real, y_pred_minmax_log_last_real,should_print=True)

# The training run that creates history_log_last is commented out in the
# training cell, so the History object only exists when that run was executed
# in this session. Skip the loss plot instead of raising NameError.
if "history_log_last" in globals():
    plot_training_history(history_log_last)
plot_predictions_bokeh(y_test_minmax_log_last_real, y_pred_minmax_log_last_real)


Model Performance Metrics:
--------------------------------------------------
MAPE: 1.1502%
RMSE: 50.01840917
MSE: 2501.84125608
MAE: 32.64425953
MPD (Maximum Percentage Deviation): 15.9030%

Point of Maximum Deviation (Index 2426):
--------------------------------------------------
True Value: 2480.639893
Predicted Value: 2875.137207
Absolute Difference: 394.497314
Percentage Deviation: 15.90%


In [209]:
# Reload the best checkpoints written while training the two custom models;
# this rebinds model_custom_first / model_custom_last from the training cell.
model_custom_first = load_model(f"{CHECKPOINTS_PATH}/{units}_custom_split_first_sequence{SEQ_LENGTH}.keras")
model_custom_last = load_model(f"{CHECKPOINTS_PATH}/{units}_custom_split_last_sequence{SEQ_LENGTH}.keras")

In [210]:
# Predict on each custom model's own test sequences (still in scaled units)
y_pred_custom_first = model_custom_first.predict(X_test_custom)
y_pred_custom_last = model_custom_last.predict(X_test_custom_last)

[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step


In [301]:
# Raw ^GSPC price history, indexed by the parsed 'Date' column
raw_data = pd.read_csv(f"{RAW_DATA_PATH}/^GSPC_data.csv", index_col='Date', parse_dates=True)

# Reference values later passed to inverse_transform_custom
first_value = raw_data['Close'].iloc[0]
first_index = 0
# NOTE: despite its name, last_value is the last Close minus the first Close,
# not the last Close itself.
last_value = raw_data['Close'].iloc[-1] - first_value
last_index = len(raw_data)-1

print(f"First Value: {first_value}\nFirst Index: {first_index}")
print(f"Last Value: {last_value}\nLast Index: {last_index}")
print(raw_data.tail())

First Value: 17.65999984741211
First Index: 0
Last Value: 5889.279941558838
Last Index: 24365
                                  Open         High          Low        Close  \
Date                                                                            
2024-12-23 00:00:00-05:00  5940.250000  5978.250000  5902.569824  5974.069824   
2024-12-24 00:00:00-05:00  5984.629883  6040.100098  5981.439941  6040.040039   
2024-12-26 00:00:00-05:00  6024.970215  6049.750000  6007.370117  6037.589844   
2024-12-27 00:00:00-05:00  6006.169922  6006.169922  5932.950195  5970.839844   
2024-12-30 00:00:00-05:00  5920.669922  5940.790039  5869.160156  5906.939941   

                               Volume  Dividends  Stock Splits  
Date                                                            
2024-12-23 00:00:00-05:00  3593280000        0.0           0.0  
2024-12-24 00:00:00-05:00  1757720000        0.0           0.0  
2024-12-26 00:00:00-05:00  2904530000        0.0           0.0  
2024-12-27 00

In [319]:
# Sanity check: total number of rows across the three custom splits
len(train_data_custom) + len(val_data_custom) + len(test_data_custom)

24365

In [338]:
# Invert the custom normalization for the split-first test targets and
# predictions; each call returns a frame with an 'Inverse_Real_Values' column.
y_test_custom_real = inverse_transform_custom(y_test_custom,custom_target_scaler,last_index,first_value,last_value)
y_pred_custom_real = inverse_transform_custom(y_pred_custom_first,custom_target_scaler,last_index,first_value,last_value)

# Metrics are computed on plain NumPy arrays
metrics = evaluate_predictions(y_test_custom_real['Inverse_Real_Values'].to_numpy(), y_pred_custom_real['Inverse_Real_Values'].to_numpy(),should_print=True)

plot_training_history(history_custom_first)
plot_predictions_bokeh(y_test_custom_real['Inverse_Real_Values'], y_pred_custom_real['Inverse_Real_Values'])

# Show the last 15 reconstructed target values for a manual check
print(y_test_custom_real['Inverse_Real_Values'].tail(15))
# print(test_dates_custom)

      Custom_Normalized
3624         142.464408
3625         164.576171
3626         141.527973
3627         -32.161640
3628         -37.334465
3629          24.415604
3630          66.190610
3631         130.079288
3632         127.462732
3633          62.346206
[6033.429843902588, 6056.420078277588, 6032.949863433839, 5854.500156402589, 5849.420078277589, 5913.190097808838, 5956.409824371339, 6022.380039215088, 6019.929843902587, 5953.179843902588]
      Custom_Normalized
3624          59.854752
3625          63.200600
3626          66.375557
3627          72.067162
3628          77.067619
3629          64.433876
3630          52.782131
3631          49.461956
3632          48.800701
3633          52.072048
[5948.44124635105, 5952.1251567223635, 5955.633255079003, 5961.730473609969, 5967.11664207578, 5954.360790803242, 5942.615216951074, 5939.44114010108, 5939.002553462009, 5942.609817361457]

Model Performance Metrics:
--------------------------------------------------
MAPE: 2.0351%

24329    6072.610020
24330    6035.190098
24331    6017.250156
24332    6066.529942
24333    6033.590000
24334    6033.429844
24335    6056.420078
24336    6032.949863
24337    5854.500156
24338    5849.420078
24339    5913.190098
24340    5956.409824
24341    6022.380039
24342    6019.929844
24343    5953.179844
Name: Inverse_Real_Values, dtype: float64


## **Functions to inverse-transform data scaled with MinMaxScaler**

In [307]:
def inverse_transform_simple(df, scaler, log_scaled=False):
    """Map scaled values back through ``scaler`` and optionally undo a log.

    Args:
        df: Scaled values; must provide ``.copy()`` (the notebook passes
            NumPy arrays).
        scaler: Fitted object exposing ``inverse_transform``.
        log_scaled: When True, ``np.exp`` is applied after the inverse
            scaling.

    Returns:
        Whatever ``scaler.inverse_transform`` returns, exponentiated if
        ``log_scaled`` is set.
    """
    # The scaler receives a copy so the caller's data stays untouched
    restored = scaler.inverse_transform(df.copy())
    return np.exp(restored) if log_scaled else restored

In [337]:
def inverse_custom_normalize(normalized_value, last_value, index, n):
    """Invert the custom normalization for one point.

    Computes ``last_value * index / n`` plus ``normalized_value`` scaled by
    ``sqrt(index**2 + (last_value * index / n)**2) / index``.

    Args:
        normalized_value: Value in the custom-normalized scale.
        last_value: Reference end value used for the ``index / n`` term.
        index: Position of the point; ``0`` short-circuits to ``0``.
        n: Divisor applied to ``index``.

    Returns:
        The reconstructed value, or ``0`` when ``index`` is ``0``.
    """
    if index != 0:
        baseline = last_value * (index / n)
        hypotenuse = np.sqrt(index**2 + ((last_value * index) / n)**2)
        offset = normalized_value * (hypotenuse / index)
        return baseline + offset
    # Position 0 would divide by zero above
    return 0

def add_first_value(data, first_value):
    """Return ``data`` shifted by ``first_value`` using plain ``+``."""
    shifted = data + first_value
    return shifted

def inverse_transform_custom(arr, scaler, n, first_value, last_value,
                             column_name='Custom_Normalized',
                             sequence_length=21, start_index=None):
    """Undo scaling and the custom normalization for a block of values.

    The values are first passed through ``scaler.inverse_transform``, then
    each row is fed to ``inverse_custom_normalize`` together with its
    position in the full series.

    Args:
        arr: Scaled values, either a NumPy array or a DataFrame.
        scaler: Fitted object exposing ``inverse_transform``.
        n: Divisor forwarded to ``inverse_custom_normalize``.
        first_value: Currently unused; the ``add_first_value`` step below is
            commented out. NOTE(review): confirm whether the offset should be
            added back before computing metrics.
        last_value: Reference end value forwarded to
            ``inverse_custom_normalize``.
        column_name: Name of the single value column.
        sequence_length: Window length added to every row position
            (previously hard-coded as 21).
        start_index: Position of the first row in the full series. When
            omitted, it falls back to ``len(train_data_custom) +
            len(val_data_custom)`` read from the notebook globals.

    Returns:
        DataFrame indexed by series position with ``column_name`` (values
        after inverse scaling) and ``'Inverse_Real_Values'``.
    """
    if isinstance(arr, np.ndarray):
        df = pd.DataFrame(arr, columns=[column_name])
    else:
        df = arr.copy()

    df_real = pd.DataFrame(
        scaler.inverse_transform(df),
        columns=[column_name],
        index=df.index
    )

    # Debug output kept from the original cell
    print(df_real.tail(10))

    if start_index is None:
        # Backward-compatible default: rows start right after train + val
        start_index = len(train_data_custom) + len(val_data_custom)
    df_real.index = np.arange(start_index, start_index + len(arr))

    real_values = [
        inverse_custom_normalize(df_real.iloc[row, 0], last_value, position + sequence_length, n)
        for row, position in enumerate(df_real.index)
    ]

    # real_values = add_first_value(np.array(real_values), first_value)
    print(real_values[-10:])
    df_real['Inverse_Real_Values'] = real_values
    return df_real

In [None]:
def calculate_mpd(y_true, y_pred):
    """Find the maximum percentage deviation between truth and prediction.

    Both inputs are flattened to 1-D. The deviation of each point is
    ``abs((y_true - y_pred) / (y_true + 1e-7)) * 100``.

    Args:
        y_true: Ground-truth values (array-like).
        y_pred: Predicted values (array-like).

    Returns:
        Dict with ``'mpd'`` (largest deviation), ``'index'`` (its position),
        ``'true_value'`` and ``'pred_value'`` at that position, and
        ``'all_deviations'`` (the full deviation array).
    """
    actual = np.array(y_true).flatten()
    predicted = np.array(y_pred).flatten()

    # Small constant added to the denominator to avoid division by zero
    guard = 1e-7
    deviations = np.abs((actual - predicted) / (actual + guard)) * 100

    largest = np.max(deviations)
    position = np.argmax(deviations)

    return dict(
        mpd=largest,
        index=position,
        true_value=actual[position],
        pred_value=predicted[position],
        all_deviations=deviations,
    )

def evaluate_predictions(y_true, y_pred, n_samples=None, should_print=False):
    """Compute regression error metrics and optionally print a report.

    Args:
        y_true: Ground-truth array (must support ``.copy().flatten()``).
        y_pred: Predicted array (must support ``.copy().flatten()``).
        n_samples: When truthy and ``should_print`` is set, also print a
            table of the first ``n_samples`` predictions.
        should_print: Print the metric summary and the point of maximum
            deviation.

    Returns:
        Dict with ``'mape'`` (in percent), ``'mse'``, ``'rmse'``, ``'mae'``,
        ``'mpd'``, ``'mpd_index'`` and ``'percentage_deviations'``.
    """
    # Work on flattened copies of both inputs
    predicted = y_pred.copy().flatten()
    actual = y_true.copy().flatten()

    mape = mean_absolute_percentage_error(actual, predicted) * 100
    rmse = root_mean_squared_error(actual, predicted)
    mse = mean_squared_error(actual, predicted)
    mae = mean_absolute_error(actual, predicted)

    # Maximum percentage deviation and where it occurs
    worst = calculate_mpd(actual, predicted)
    mpd = worst['mpd']
    mpd_index = worst['index']
    deviations = worst['all_deviations']

    summary = {
        'mape': mape,
        'mse': mse,
        'rmse': rmse,
        'mae': mae,
        'mpd': mpd,
        'mpd_index': mpd_index,
        'percentage_deviations': deviations}

    if not should_print:
        return summary

    rule = "-" * 50

    # Metric summary
    print("\nModel Performance Metrics:")
    print(rule)
    for line in (
        f"MAPE: {mape:.4f}%",
        f"RMSE: {rmse:.8f}",
        f"MSE: {mse:.8f}",
        f"MAE: {mae:.8f}",
        f"MPD (Maximum Percentage Deviation): {mpd:.4f}%",
    ):
        print(line)

    # Details of the single worst prediction
    true_at_worst = actual[mpd_index]
    pred_at_worst = predicted[mpd_index]
    print(f"\nPoint of Maximum Deviation (Index {mpd_index}):")
    print(rule)
    print(f"True Value: {true_at_worst:.6f}")
    print(f"Predicted Value: {pred_at_worst:.6f}")
    print(f"Absolute Difference: {abs(true_at_worst - pred_at_worst):.6f}")
    print(f"Percentage Deviation: {deviations[mpd_index]:.2f}%")

    # Optional table of the leading predictions
    if n_samples:
        print(f"\nFirst {n_samples} Predictions:")
        print(rule)
        print("Index    True Value    Predicted    Difference    % Deviation")
        print("-" * 65)
        for i in range(min(n_samples, len(actual))):
            diff = actual[i] - predicted[i]
            print(f"{i:<8d} {actual[i]:11.6f}  {predicted[i]:11.6f}  {diff:11.6f}  {deviations[i]:11.2f}%")

    return summary

In [None]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.layouts import row
from bokeh.palettes import Category10
from bokeh.models.scales import LogScale

def plot_training_history(history):
    """Plot per-epoch training loss (and validation loss, if recorded) on a log-scale Bokeh chart.

    Args:
        history: Object whose ``history`` attribute is a dict of per-epoch
            metric lists. ``'loss'`` is required; ``'val_loss'`` is optional
            and is only plotted when present. Presumably the ``History``
            object returned by Keras ``model.fit`` -- confirm against callers.

    Returns:
        None. The figure is rendered with ``show``.
    """
    output_notebook()

    metrics = history.history

    # A log axis cannot place zero or negative values, so clamp to a tiny floor.
    epsilon = 1e-10
    train_loss = [max(val, epsilon) for val in metrics['loss']]

    data = {
        'epoch': list(range(1, len(train_loss) + 1)),
        'train_loss': train_loss,
    }
    tooltips = [
        ('Epoch', '@epoch'),
        ('Training Loss', '@train_loss{0.000}'),
    ]

    # 'val_loss' is missing when no validation loss was recorded; indexing it
    # unconditionally would raise KeyError before anything is drawn, so the
    # validation column, tooltip and line are all optional.
    has_val_loss = 'val_loss' in metrics
    if has_val_loss:
        data['val_loss'] = [max(val, epsilon) for val in metrics['val_loss']]
        tooltips.append(('Validation Loss', '@val_loss{0.000}'))

    source_loss = ColumnDataSource(data=data)

    p1 = figure(title='Model Loss Over Time (Log Scale)',
                x_axis_label='Epoch',
                y_axis_label='Loss (log)',
                width=600, height=400,
                y_axis_type="log")

    # Hover tool shows the values of every plotted series at the hovered epoch.
    p1.add_tools(HoverTool(tooltips=tooltips))

    # Plot loss lines
    p1.line('epoch', 'train_loss', line_color=Category10[3][0],
            line_width=2, source=source_loss, legend_label='Training Loss')
    if has_val_loss:
        p1.line('epoch', 'val_loss', line_color=Category10[3][1],
                line_width=2, source=source_loss, legend_label='Validation Loss')

    # Clicking a legend entry toggles the corresponding line.
    p1.legend.click_policy = "hide"
    p1.legend.location = "top_right"
    p1.grid.grid_line_alpha = 0.3

    show(p1)

In [None]:
def plot_predictions_bokeh(y_test, y_pred, n_samples=None):
    """Draw actual and predicted values as two lines on one interactive Bokeh chart.

    Args:
        y_test: Ground-truth values (array-like). Flattened to 1-D before use.
        y_pred: Predicted values (array-like). Flattened to 1-D before use, so
            a column vector of shape ``(n, 1)`` is accepted.
        n_samples: Number of leading samples to plot. ``None`` plots all of
            ``y_test``; larger values are capped at ``len(y_test)``.

    Returns:
        None. The figure is rendered with ``show``.

    Note:
        Flattening assumes single-output predictions (one value per sample)
        -- TODO confirm for multi-output models.
    """
    output_notebook()

    # Flatten both inputs: subtracting an (n,) array from an (n, 1) array would
    # broadcast to an (n, n) matrix and hand 2-D columns to ColumnDataSource,
    # and plain Python lists do not support `-` at all.
    actual = np.asarray(y_test).reshape(-1)
    predicted = np.asarray(y_pred).reshape(-1)

    if n_samples is None:
        n_samples = len(actual)
    else:
        n_samples = min(n_samples, len(actual))

    actual = actual[:n_samples]
    predicted = predicted[:n_samples]

    # Prepare data
    source = ColumnDataSource(data={
        'index': list(range(n_samples)),
        'actual': actual,
        'predicted': predicted,
        'error': actual - predicted
    })

    # Create time series plot
    p1 = figure(title='Actual vs Predicted Values',
                x_axis_label='Sample Index',
                y_axis_label='Value',
                width=800, height=400)

    # Hover tool exposes the per-sample values and the signed error.
    p1.add_tools(HoverTool(tooltips=[
        ('Index', '@index'),
        ('Actual', '@actual{0.000}'),
        ('Predicted', '@predicted{0.000}'),
        ('Error', '@error{0.000}')
    ]))

    # Plot lines
    p1.line('index', 'actual', line_color=Category10[3][0],
            line_width=2, source=source, legend_label='Actual')
    p1.line('index', 'predicted', line_color=Category10[3][1],
            line_width=2, source=source, legend_label='Predicted')

    # Show plots
    show(row(p1))



In [None]:
def evaluate_all_models(checkpoint_dir, X_test, y_test, scaler=None):
    """Evaluate every ``.keras`` model under ``checkpoint_dir`` and print a MAPE ranking.

    Each model file found (recursively) is loaded with ``load_model``, used to
    predict on ``X_test``, and scored with ``evaluate_predictions``. A table
    of the scores, sorted by ascending MAPE, is printed with ``tabulate``.

    Args:
        checkpoint_dir: Directory searched recursively for ``*.keras`` files.
        X_test: Model inputs passed unchanged to ``model.predict``.
        y_test: Ground-truth targets passed to ``evaluate_predictions``.
        scaler: Currently unused; accepted only so existing callers keep
            working.

    Returns:
        dict: ``{model_name: {'metrics': metrics_dict}}`` ordered by ascending
        MAPE, where ``model_name`` is the file name without the ``.keras``
        suffix. Files with the same name in different sub-directories share a
        key, so the one evaluated last wins.
    """
    suffix = '.keras'

    # Sorting removes the dependence on filesystem listing order, so repeated
    # runs evaluate models (and break MAPE ties) in the same order.
    model_files = sorted(
        os.path.join(root, file)
        for root, _dirs, files in os.walk(checkpoint_dir)
        for file in files
        if file.endswith(suffix)
    )

    print(f"\nFound {len(model_files)} models to evaluate")
    print("=" * 80)

    # Store results
    results = {}

    for model_path in model_files:
        # Strip only the trailing suffix; str.replace would also remove any
        # '.keras' occurring in the middle of the file name.
        model_name = os.path.basename(model_path)[:-len(suffix)]

        # Load model
        model = load_model(model_path)

        # Make predictions
        y_pred = model.predict(X_test, verbose=0)

        # Get metrics
        metrics = evaluate_predictions(y_test, y_pred)
        results[model_name] = {'metrics': metrics}

    sorted_results = sorted(results.items(), key=lambda item: item[1]['metrics']['mape'])

    # One table row per model; the column order here must match `headers`.
    table_data = [
        [
            model_name,
            data['metrics']['mape'],
            data['metrics']['rmse'],
            data['metrics']['mse'],
            data['metrics']['mae'],
            data['metrics']['mpd'],
        ]
        for model_name, data in sorted_results
    ]

    # Headers follow the row layout above (RMSE before MSE); the previous
    # order labelled the RMSE column "MSE" and vice versa.
    headers = ['Model Name', 'MAPE (%)', 'RMSE', 'MSE', 'MAE', 'MPD (%)']

    # Print the table
    print("\nModel Performance Metrics (Sorted by MAPE Ascending):")
    print(tabulate(table_data, headers, tablefmt="fancy_grid", floatfmt=".4f"))

    # Return the ranking as a dict; insertion order preserves the MAPE sort.
    return dict(sorted_results)

In [None]:
# Evaluate every saved model found under CHECKPOINTS_PATH against
# X_test / y_test; the call prints a comparison table and the per-model
# metrics are kept in `results` for later cells.
results = evaluate_all_models(CHECKPOINTS_PATH, X_test, y_test)


Found 13 models to evaluate

Model Performance Metrics (Sorted by MAPE Ascending):
╒════════════════════════════════════════════╤════════════╤════════╤════════╤════════╤═══════════╕
│ Model Name                                 │   MAPE (%) │    MSE │   RMSE │    MAE │   MPD (%) │
╞════════════════════════════════════════════╪════════════╪════════╪════════╪════════╪═══════════╡
│ best_128_minmax_split_first_log_sequence20 │     1.0980 │ 0.0361 │ 0.0013 │ 0.0245 │   16.7150 │
├────────────────────────────────────────────┼────────────┼────────┼────────┼────────┼───────────┤
│ best_128_custom_split_first_sequence20     │     1.7054 │ 0.0547 │ 0.0030 │ 0.0413 │   15.8269 │
├────────────────────────────────────────────┼────────────┼────────┼────────┼────────┼───────────┤
│ best_128_minmax_split_first_sequence20     │     3.5209 │ 0.2532 │ 0.0641 │ 0.1066 │   42.3228 │
├────────────────────────────────────────────┼────────────┼────────┼────────┼────────┼───────────┤
│ best_128_minmax_split_l