## **Imports, mount drive**

In [10]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

import sys
sys.path.append('/content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction')

import os
import joblib
import random
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from tabulate import tabulate

from bokeh.plotting import figure, show, output_notebook
from bokeh.layouts import column, row
from bokeh.palettes import Category10
from bokeh.models import ColumnDataSource, HoverTool, Legend

import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import (
    LSTM,
    Dense,
    Dropout,
    Input,
    BatchNormalization,
    Bidirectional
)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers import AdamW
from tensorflow.keras.callbacks import (
    EarlyStopping,
    ModelCheckpoint,
    ReduceLROnPlateau
)
from tensorflow.keras.regularizers import L1L2, l2
from tensorflow.keras.metrics import (
    MeanAbsolutePercentageError,
    RootMeanSquaredError,
    MeanAbsoluteError,
    MeanSquaredError
)

from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    mean_absolute_percentage_error,
    mean_squared_error,
    root_mean_squared_error
)

from config import *

output_notebook()

Mounted at /content/drive


# **Seed initialization for reproducibility across all libraries**

In [2]:
seed=0
os.environ['PYTHONHASHSEED'] = str(seed)
# For working on GPUs from "TensorFlow Determinism"
os.environ["TF_DETERMINISTIC_OPS"] = str(seed)
np.random.seed(seed)
random.seed(seed)
tf.random.set_seed(seed)

# **Function to load training, validation, and test datasets along with feature and target scalers.**

In [3]:
def load_datasets(data_path, dataset_name):
    train_data = pd.read_csv(f"{data_path}/{dataset_name}_train.csv", index_col=0)
    val_data = pd.read_csv(f"{data_path}/{dataset_name}_val.csv", index_col=0)
    test_data = pd.read_csv(f"{data_path}/{dataset_name}_test.csv", index_col=0)

    try:
        train_data.index = pd.to_datetime(train_data.index, utc=True),
        val_data.index = pd.to_datetime(val_data.index, utc=True),
        test_data.index = pd.to_datetime(test_data.index, utc=True)
    except:
        pass

    scaler = joblib.load(f"{data_path}/{dataset_name}_scaler.joblib")
    return train_data, val_data, test_data, scaler

# **Load datasets with different preprocessing methods and splits**

In [4]:
train_data, val_data, test_data, minmax_scaler = load_datasets(
    data_path=f"{PROCESSED_DATA_PATH}/minmax_split_first",
    dataset_name="normal_data"
)

train_data_last, val_data_last, test_data_last, minmax_last_scaler = load_datasets(
    data_path=f"{PROCESSED_DATA_PATH}/minmax_split_last",
    dataset_name="normal_data"
)

train_log_data, val_log_data, test_log_data, minmax_log_scaler = load_datasets(
    data_path=f"{PROCESSED_DATA_PATH}/minmax_split_first",
    dataset_name="log_data"
)

train_log_data_last_last, val_log_data_last_last, test_log_data_last_last, minmax_last_log_scaler = load_datasets(
    data_path=f"{PROCESSED_DATA_PATH}/minmax_split_last",
    dataset_name="log_data"
)

In [5]:
train_data_custom, val_data_custom, test_data_custom, custom_scaler = load_datasets(
    data_path=f"{PROCESSED_DATA_PATH}/custom_split_first",
    dataset_name="normal_data"
)

train_data_custom_last, val_data_custom_last, test_data_custom_last, custom_last_scaler = load_datasets(
    data_path=f"{PROCESSED_DATA_PATH}/custom_split_last",
    dataset_name="normal_data_last"
)

train_log_data_custom, val_log_data_custom, test_log_data_custom, custom_log_scaler = load_datasets(
    data_path=f"{PROCESSED_DATA_PATH}/custom_split_first",
    dataset_name="log_data"
)

train_log_data_custom_last, val_log_data_custom_last, test_log_data_custom_last, custom_last_log_scaler = load_datasets(
    data_path=f"{PROCESSED_DATA_PATH}/custom_split_last",
    dataset_name="log_data_last"
)

# **Creates sequences of features and targets for time series modeling.**

In [6]:
def create_sequences(data, feature_cols, target_cols, sequence_length):
    features = data[feature_cols].values
    targets = data[target_cols].values

    n_samples = len(data) - sequence_length
    n_features = len(feature_cols)
    n_targets = len(target_cols)

    X = np.zeros((n_samples, sequence_length, n_features))
    y = np.zeros((n_samples, n_targets))

    for i in range(n_samples):
        X[i] = features[i:i+sequence_length]
        y[i] = targets[i+sequence_length-1]

    return X, y

In [7]:
def create_all_sequences(train,val,test, feature_cols, target_cols, sequence_length):
    X_train, y_train = create_sequences(
        data=train,
        feature_cols=feature_cols,
        target_cols=target_cols,
        sequence_length=sequence_length,
    )
    X_val, y_val = create_sequences(
        data=val,
        feature_cols=feature_cols,
        target_cols=target_cols,
        sequence_length=sequence_length,
    )
    X_test, y_test = create_sequences(
        data=test,
        feature_cols=feature_cols,
        target_cols=target_cols,
        sequence_length=sequence_length,
    )
    return X_train, y_train, X_val, y_val, X_test, y_test

# **Generate input and target sequences for all datasets using a specified sequence length.**
### This section creates sequences for four dataset variations:
### 1. Normal data with "minmax_split_first" strategy
### 2. Normal data with "minmax_split_last" strategy
### 3. Log-transformed data with "minmax_split_first" strategy
### 4. Log-transformed data with "minmax_split_last" strategy
### The create_sequences function is used to transform raw data into sequences suitable for time series modeling.
### Finally, the shapes of all generated datasets are printed to verify the correctness of the data preparation process.

In [8]:
feature_cols = ['Close']
target_cols = ['Target']
sequence_length = 21

X_train, y_train, X_val, y_val, X_test, y_test = create_all_sequences(
    train=train_data,
    val=val_data,
    test=test_data,
    feature_cols=feature_cols,
    target_cols=target_cols,
    sequence_length=sequence_length,
)

X_train_last, y_train_last, X_val_last, y_val_last, X_test_last, y_test_last = create_all_sequences(
    train=train_data_last,
    val=val_data_last,
    test=test_data_last,
    feature_cols=feature_cols,
    target_cols=target_cols,
    sequence_length=sequence_length,
)

X_train_log, y_train_log, X_val_log, y_val_log, X_test_log, y_test_log = create_all_sequences(
    train=train_log_data,
    val=val_log_data,
    test=test_log_data,
    feature_cols=feature_cols,
    target_cols=target_cols,
    sequence_length=sequence_length,
)

X_train_log_last, y_train_log_last, X_val_log_last, y_val_log_last, X_test_log_last, y_test_log_last = create_all_sequences(
    train=train_log_data_last_last,
    val=val_log_data_last_last,
    test=test_log_data_last_last,
    feature_cols=feature_cols,
    target_cols=target_cols,
    sequence_length=sequence_length,
)

print("Normal Data, Split First:")
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_val shape: {X_val.shape}, y_val shape: {y_val.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

print("\nNormal Data, Split Last:")
print(f"X_train_last shape: {X_train_last.shape}, y_train_last shape: {y_train_last.shape}")
print(f"X_val_last shape: {X_val_last.shape}, y_val_last shape: {y_val_last.shape}")
print(f"X_test_last shape: {X_test_last.shape}, y_test_last shape: {y_test_last.shape}")

print("\nLog Data, Split First:")
print(f"X_train_log shape: {X_train_log.shape}, y_train_log shape: {y_train_log.shape}")
print(f"X_val_log shape: {X_val_log.shape}, y_val_log shape: {y_val_log.shape}")
print(f"X_test_log shape: {X_test_log.shape}, y_test_log shape: {y_test_log.shape}")

print("\nLog Data, Split Last:")
print(f"X_train_log_last shape: {X_train_log_last.shape}, y_train_log_last shape: {y_train_log_last.shape}")
print(f"X_val_log_last shape: {X_val_log_last.shape}, y_val_log_last shape: {y_val_log_last.shape}")
print(f"X_test_log_last shape: {X_test_log_last.shape}, y_test_log_last shape: {y_test_log_last.shape}")

Normal Data, Split First:
X_train shape: (1915, 21, 1), y_train shape: (1915, 1)
X_val shape: (394, 21, 1), y_val shape: (394, 1)
X_test shape: (394, 21, 1), y_test shape: (394, 1)

Normal Data, Split Last:
X_train_last shape: (1915, 21, 1), y_train_last shape: (1915, 1)
X_val_last shape: (394, 21, 1), y_val_last shape: (394, 1)
X_test_last shape: (394, 21, 1), y_test_last shape: (394, 1)

Log Data, Split First:
X_train_log shape: (1915, 21, 1), y_train_log shape: (1915, 1)
X_val_log shape: (394, 21, 1), y_val_log shape: (394, 1)
X_test_log shape: (394, 21, 1), y_test_log shape: (394, 1)

Log Data, Split Last:
X_train_log_last shape: (1915, 21, 1), y_train_log_last shape: (1915, 1)
X_val_log_last shape: (394, 21, 1), y_val_log_last shape: (394, 1)
X_test_log_last shape: (394, 21, 1), y_test_log_last shape: (394, 1)


# **Generate input and target sequences for all custom datasets using the specified sequence length.**
  ### This section covers four scenarios:
  ### 1. Custom normal data with "custom_split_first" strategy
  ### 2. Custom normal data with "custom_split_last" strategy
  ### 3. Log-transformed custom data with "custom_split_first" strategy
  ### 4. Log-transformed custom data with "custom_split_last" strategy
  ### Each scenario uses the create_sequences function to transform raw data into time series sequences.
  ### The feature and target columns are customized as 'Custom_Normalized' and 'Target', respectively.

In [9]:
feature_cols_custom = ['Custom_Normalized']
target_cols_custom = ['Target']

# 1.Create sequnces for custom custom_split_first normal data
X_train_custom, y_train_custom, X_val_custom, y_val_custom, X_test_custom, y_test_custom = create_all_sequences(
    train=train_data_custom,
    val=val_data_custom,
    test=test_data_custom,
    feature_cols=feature_cols_custom,
    target_cols=target_cols_custom,
    sequence_length=sequence_length,
)

# 2. Create sequences for custom_split_last normal data
X_train_custom_last, y_train_custom_last, X_val_custom_last, y_val_custom_last, X_test_custom_last, y_test_custom_last = create_all_sequences(
    train=train_data_custom_last,
    val=val_data_custom_last,
    test=test_data_custom_last,
    feature_cols=feature_cols_custom,
    target_cols=target_cols_custom,
    sequence_length=sequence_length,
)


# 3. Create sequences for custom_split_first log data
X_train_log_custom, y_train_log_custom, X_val_log_custom, y_val_log_custom, X_test_log_custom, y_test_log_custom = create_all_sequences(
    train=train_log_data_custom,
    val=val_log_data_custom,
    test=test_log_data_custom,
    feature_cols=feature_cols_custom,
    target_cols=target_cols_custom,
    sequence_length=sequence_length,
)

# 4. Create sequences for custom_split_last log data
X_train_log_custom_last, y_train_log_custom_last, X_val_log_custom_last, y_val_log_custom_last, X_test_log_custom_last, y_test_log_custom_last = create_all_sequences(
    train=train_log_data_custom_last,
    val=val_log_data_custom_last,
    test=test_log_data_custom_last,
    feature_cols=feature_cols_custom,
    target_cols=target_cols_custom,
    sequence_length=sequence_length,
)

# **Create the LSTM model**

In [11]:
def create_lstm_model(input_shape, units, dropout_rate=0.01):
    model = Sequential([
        Input(shape=input_shape),
        LSTM(units, activation='relu'),
        Dense(sequence_length+1, activation='relu'),
        Dense(1)
    ])

    optimizer = AdamW()

    model.compile(
        optimizer=optimizer,
        loss='mae',
    )

    return model

# **Returns callbacks (EarlyStop, ReduceLROnPlateau, ModelCheckpoint)**

In [12]:
def get_callbacks(model_name, base_path=f"{CHECKPOINTS_PATH}/LSTM"):
    checkpoint_path = os.path.join(base_path, f'{model_name}.keras')

    callbacks = [
        ModelCheckpoint(
            filepath=checkpoint_path,
            monitor='val_loss',
            mode='min',
            save_best_only=True,
            save_weights_only=False,
            verbose=1
        ),
        EarlyStopping(
            monitor='val_loss',
            patience=6,
            restore_best_weights=True,
            verbose=0
        ),
        ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.5,
            patience=5,
            min_lr=1e-7,
            verbose=1,
            mode='min'
        )
    ]

    return callbacks

In [13]:
def inverse_transform_simple(df, scaler, log_scaled=False):
    df = df.copy()
    inverse_scaled = scaler.inverse_transform(df)

    if log_scaled:
        inverse_scaled = np.exp(inverse_scaled)

    return inverse_scaled

# **Define hyperparameters for the model training process**

In [14]:
units = 121
batch_size = 32
epochs = 100
dropout_rate = 0.01

# **Create models**

In [15]:
NORM_TYPE = "minmax_split_first"

model_normal_first = create_lstm_model(X_train.shape[1:], units, dropout_rate)

SAVED_MODEL_NAME_minmax_split_first = f"{units}_{NORM_TYPE}_AdamW_mae_sequence{sequence_length}"

callbacks = get_callbacks(SAVED_MODEL_NAME_minmax_split_first)

# history_normal_first = model_normal_first.fit(
#     X_train, y_train,
#     validation_data=(X_val, y_val),
#     epochs=epochs,
#     batch_size=batch_size,
#     callbacks=callbacks,
#     verbose=0
# )

In [20]:
model_minmax_first = load_model(f"{CHECKPOINTS_PATH}/LSTM/{SAVED_MODEL_NAME_minmax_split_first}.keras")

y_pred_minmax_first = model_minmax_first.predict(X_test)
y_pred_minmax_first_extended = np.hstack([y_pred_minmax_first, np.zeros_like(y_pred_minmax_first)])

y_pred_minmax_first_real = inverse_transform_simple(y_pred_minmax_first_extended, minmax_scaler)[:,0].reshape(-1,1)

u_test_minmax_first_extended = np.hstack([y_test, np.zeros_like(y_test)])
y_test_minmax_first_real = inverse_transform_simple(u_test_minmax_first_extended, minmax_scaler)[:,0].reshape(-1,1)

metrics = evaluate_predictions(SAVED_MODEL_NAME_minmax_split_first,y_test_minmax_first_real,
y_pred_minmax_first_real, should_print=True)

print(y_test.shape)
print(y_test_minmax_first_real[-5:])

plot_training_history(history_normal_first)
plot_predictions_bokeh(y_test_minmax_first_real, y_pred_minmax_first_real)

[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step

Model Performance Metrics:
--------------------------------------------------
MAPE: 0.6469%
RMSE: 42.99768877
MSE: 1848.80123940
MAE: 32.49787957
MPD (Maximum Percentage Deviation): 4.1349%

Point of Maximum Deviation (Index 292):
--------------------------------------------------
True Value: 5186.330078
Predicted Value: 5400.780762
Absolute Difference: 214.450684
Percentage Deviation: 4.13%
Metrics saved to /content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction/results/metrics/model_metrics.csv
(394, 1)
[[5930.85009766]
 [5974.06982422]
 [6040.04003906]
 [6037.58984375]
 [5970.83984375]]


NameError: name 'plot_training_history' is not defined

In [21]:
NORM_TYPE = "minmax_split_last"

model_normal_last = create_lstm_model(X_train_last.shape[1:], units, dropout_rate)

SAVED_MODEL_NAME_minmax_split_last = f"{units}_{NORM_TYPE}_AdamW_mae_sequence{sequence_length}"

callbacks = get_callbacks(SAVED_MODEL_NAME_minmax_split_last)

# history_normal_last = model_normal_last.fit(
#     X_train_last, y_train_last,
#     validation_data=(X_val_last, y_val_last),
#     epochs=epochs,
#     batch_size=batch_size,
#     callbacks=callbacks,
#     verbose=0
# )

In [22]:
model_minmax_last = load_model(f"{CHECKPOINTS_PATH}/LSTM/{SAVED_MODEL_NAME_minmax_split_last}.keras")

y_pred_minmax_last = model_minmax_last.predict(X_test_last)

y_pred_minmax_last_extended = np.hstack([y_pred_minmax_last, np.zeros_like(y_pred_minmax_last)])

y_pred_minmax_last_real = inverse_transform_simple(y_pred_minmax_last_extended, minmax_last_scaler)[:,0].reshape(-1,1)

y_test_minmax_last_extended = np.hstack([y_test_last, np.zeros_like(y_test_last)])

y_test_minmax_last_real = inverse_transform_simple(y_test_minmax_last_extended, minmax_last_scaler)[:,0].reshape(-1,1)

metrics = evaluate_predictions(SAVED_MODEL_NAME_minmax_split_last,y_test_minmax_last_real, y_pred_minmax_last_real,should_print=True)

print(y_test_minmax_last_real[-5:])

plot_training_history(history_normal_last)
plot_predictions_bokeh(y_test_minmax_last_real, y_pred_minmax_last_real)

[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step

Model Performance Metrics:
--------------------------------------------------
MAPE: 0.7346%
RMSE: 49.36210478
MSE: 2436.61738815
MAE: 36.92418256
MPD (Maximum Percentage Deviation): 4.8620%

Point of Maximum Deviation (Index 292):
--------------------------------------------------
True Value: 5186.330078
Predicted Value: 5438.491699
Absolute Difference: 252.161621
Percentage Deviation: 4.86%
Metrics saved to /content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction/results/metrics/model_metrics.csv
[[5930.85009766]
 [5974.06982422]
 [6040.04003906]
 [6037.58984375]
 [5970.83984375]]


NameError: name 'plot_training_history' is not defined

In [23]:
NORM_TYPE = "log_minmax_split_first"

log_input_shape = X_train_log.shape[1:]

model_log_first = create_lstm_model(log_input_shape, units, dropout_rate)

SAVED_MODEL_NAME_minmax_split_first_log = f"{units}_{NORM_TYPE}_AdamW_mae_sequence{sequence_length}"

callbacks = get_callbacks(SAVED_MODEL_NAME_minmax_split_first_log)

# history_log_first = model_log_first.fit(
#     X_train_log, y_train_log,
#     validation_data=(X_val_log, y_val_log),
#     epochs=epochs,
#     batch_size=batch_size,
#     callbacks=callbacks,
#     verbose=0
# )

In [24]:
model_minmax_log_first = load_model(f"{CHECKPOINTS_PATH}/LSTM/{SAVED_MODEL_NAME_minmax_split_first_log}.keras")

y_pred_minmax_log_first = model_minmax_log_first.predict(X_test_log)

y_pred_minmax_log_first_extended = np.hstack([y_pred_minmax_log_first, np.zeros_like(y_pred_minmax_log_first)])

y_pred_minmax_log_first_real = inverse_transform_simple(y_pred_minmax_log_first_extended, minmax_log_scaler, log_scaled=True)[:,0].reshape(-1,1)

y_test_minmax_log_first_extended = np.hstack([y_test_log, np.zeros_like(y_test_log)])

y_test_minmax_log_first_real = inverse_transform_simple(y_test_minmax_log_first_extended, minmax_log_scaler, log_scaled=True)[:,0].reshape(-1,1)

metrics = evaluate_predictions(SAVED_MODEL_NAME_minmax_split_first_log,y_test_minmax_log_first_real, y_pred_minmax_log_first_real, should_print=True)

print(y_test_minmax_log_first_real[-5:])

plot_training_history(history_log_first)
plot_predictions_bokeh(y_test_minmax_log_first_real, y_pred_minmax_log_first_real)

[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step

Model Performance Metrics:
--------------------------------------------------
MAPE: 0.7779%
RMSE: 49.66943246
MSE: 2467.05252125
MAE: 39.59314399
MPD (Maximum Percentage Deviation): 4.1990%

Point of Maximum Deviation (Index 292):
--------------------------------------------------
True Value: 5186.330078
Predicted Value: 5404.106445
Absolute Difference: 217.776367
Percentage Deviation: 4.20%
Metrics saved to /content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction/results/metrics/model_metrics.csv
[[5930.85009766]
 [5974.06982422]
 [6040.04003906]
 [6037.58984375]
 [5970.83984375]]


NameError: name 'plot_training_history' is not defined

In [25]:
NORM_TYPE = "log_minmax_split_last"

log_last_input_shape = X_train_log_last.shape[1:]
model_log_last = create_lstm_model(log_last_input_shape, units, dropout_rate)

SAVED_MODEL_NAME_minmax_split_last_log = f"{units}_{NORM_TYPE}_AdamW_mae_sequence{sequence_length}"

callbacks = get_callbacks(SAVED_MODEL_NAME_minmax_split_last_log)

# history_log_last = model_log_last.fit(
#     X_train_log_last, y_train_log_last,
#     validation_data=(X_val_log_last, y_val_log_last),
#     epochs=epochs,
#     batch_size=batch_size,
#     callbacks=callbacks,
#     verbose=0
# )

In [26]:
model_minmax_log_last = load_model(f"{CHECKPOINTS_PATH}/LSTM/{SAVED_MODEL_NAME_minmax_split_last_log}.keras")

y_pred_minmax_log_last = model_minmax_log_last.predict(X_test_log_last)

y_pred_minmax_log_last_extended = np.hstack([y_pred_minmax_log_last, np.zeros_like(y_pred_minmax_log_last)])

y_pred_minmax_log_last_real = inverse_transform_simple(y_pred_minmax_log_last_extended, minmax_last_log_scaler, log_scaled=True)[:,0].reshape(-1,1)

y_test_minmax_log_last_extended = np.hstack([y_test_log_last, np.zeros_like(y_test_log_last)])

y_test_minmax_log_last_real = inverse_transform_simple(y_test_minmax_log_last_extended, minmax_last_log_scaler, log_scaled=True)[:,0].reshape(-1,1)

metrics = evaluate_predictions(SAVED_MODEL_NAME_minmax_split_last_log,y_test_minmax_log_last_real, y_pred_minmax_log_last_real,should_print=True)

print(y_test_minmax_log_last_real[-5:])

plot_training_history(history_log_last)
plot_predictions_bokeh(y_test_minmax_log_last_real, y_pred_minmax_log_last_real)

[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step

Model Performance Metrics:
--------------------------------------------------
MAPE: 0.7609%
RMSE: 52.38225037
MSE: 2743.90015361
MAE: 38.63934109
MPD (Maximum Percentage Deviation): 5.1099%

Point of Maximum Deviation (Index 292):
--------------------------------------------------
True Value: 5186.330078
Predicted Value: 5451.345703
Absolute Difference: 265.015625
Percentage Deviation: 5.11%
Metrics saved to /content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction/results/metrics/model_metrics.csv
[[5930.85009766]
 [5974.06982422]
 [6040.04003906]
 [6037.58984375]
 [5970.83984375]]


NameError: name 'plot_training_history' is not defined

In [28]:
def inverse_custom_normalize(normalized_value, last_value, index, n):
    if index == 0:
        return 0
    part1 = last_value * (index / n)
    sqrt_part = np.sqrt(index**2 + ((last_value * index) / n)**2)
    part2 = normalized_value * (sqrt_part / index)
    return part1 + part2

def add_first_value(data, first_value):
    return data + first_value

def inverse_transform_custom(arr, scaler, n, first_value, last_value, train_data,val_data,column_name='Custom_Normalized', log_scaled=False):
    start_index = len(train_data) + len(val_data)
    original_indices = np.arange(start_index, start_index + len(arr))

    inverse_minmax = scaler.inverse_transform(arr)[:,0]

    df_real = pd.DataFrame(inverse_minmax, columns=[column_name],
    index=original_indices)

    real_values = []

    for i, index in enumerate(df_real.index):
        real_value = inverse_custom_normalize(df_real.iloc[i, 0], last_value, index+sequence_length, n)
        real_values.append(real_value)

    real_values = add_first_value(np.array(real_values), first_value)

    if log_scaled:
        real_values = np.exp(real_values)

    return real_values

In [29]:
NORM_TYPE = "custom_split_first"

model_custom_first = create_lstm_model(X_train_custom.shape[1:], units, dropout_rate)

SAVED_MODEL_NAME_custom_split_first = f"{units}_{NORM_TYPE}_AdamW_mae_sequence{sequence_length}"

callbacks = get_callbacks(SAVED_MODEL_NAME_custom_split_first)

# history_custom_first = model_custom_first.fit(
#     X_train_custom, y_train_custom,
#     validation_data=(X_val_custom, y_val_custom),
#     epochs=epochs,
#     batch_size=batch_size,
#     callbacks=callbacks,
#     verbose=0
# )

In [30]:
reference_values = pd.read_csv(f"{PROCESSED_DATA_PATH}/custom_split_first/normal_data_custom_scaler.csv")

first_value = reference_values['first_value'].iloc[0]
last_value = reference_values['last_value'].iloc[0]
last_index = reference_values['last_index'].iloc[0]

print(f"First Value: {first_value}\nLast Value: {last_value}\nLast Index: {last_index}")

model_custom_first = load_model(f"{CHECKPOINTS_PATH}/LSTM/{SAVED_MODEL_NAME_custom_split_first}.keras")

y_pred_custom_first = model_custom_first.predict(X_test_custom)

y_pred_custom_extended = np.hstack([y_pred_custom_first, np.zeros_like(y_pred_custom_first)])

y_pred_custom_real = inverse_transform_custom(y_pred_custom_extended,custom_scaler, last_index ,first_value ,last_value, train_data_custom,val_data_custom)

y_test_custom_extended = np.hstack([y_test_custom, np.zeros_like(y_test_custom)])

y_test_custom_real = inverse_transform_custom(y_test_custom_extended, custom_scaler, last_index, first_value, last_value, train_data_custom,val_data_custom)


metrics = evaluate_predictions(SAVED_MODEL_NAME_custom_split_first,y_test_custom_real, y_pred_custom_real,should_print=True)

print(y_test_custom_real[-5:])

plot_training_history(history_custom_first)
plot_predictions_bokeh(y_test_custom_real, y_pred_custom_real)

First Value: 1831.97998046875
Last Value: 2661.2998046875
Last Index: 1935
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step

Model Performance Metrics:
--------------------------------------------------
MAPE: 0.6161%
RMSE: 41.42692501
MSE: 1716.19011548
MAE: 30.99057575
MPD (Maximum Percentage Deviation): 3.7645%

Point of Maximum Deviation (Index 293):
--------------------------------------------------
True Value: 5186.330078
Predicted Value: 5381.571470
Absolute Difference: 195.241392
Percentage Deviation: 3.76%
Metrics saved to /content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction/results/metrics/model_metrics.csv
[5867.08007812 5930.85009766 5974.06982422 6040.04003906 6037.58984375]


NameError: name 'plot_training_history' is not defined

In [31]:
model_custom_last = create_lstm_model(X_train_custom_last.shape[1:], units, dropout_rate)
NORM_TYPE = "custom_split_last"
SAVED_MODEL_NAME_custom_split_last = f"{units}_{NORM_TYPE}_AdamW_mae_sequence{sequence_length}"
callbacks = get_callbacks(SAVED_MODEL_NAME_custom_split_last)

# history_custom_last = model_custom_last.fit(
#     X_train_custom_last, y_train_custom_last,
#     validation_data=(X_val_custom_last, y_val_custom_last),
#     epochs=epochs,
#     batch_size=batch_size,
#     callbacks=callbacks,
#     verbose=0
# )

In [32]:
reference_values = pd.read_csv(f"{PROCESSED_DATA_PATH}/custom_split_last/normal_data_last_custom_scaler.csv")

first_value = reference_values['first_value'].iloc[0]
last_value = reference_values['last_value'].iloc[0]
last_index = reference_values['last_index'].iloc[0]

print(f"First Value: {first_value}\nLast Value: {last_value}\nLast Index: {last_index}")

model_custom_last = load_model(f"{CHECKPOINTS_PATH}/LSTM/{SAVED_MODEL_NAME_custom_split_last}.keras")

y_pred_custom_last = model_custom_last.predict(X_test_custom_last)

y_pred_custom_last_extended = np.hstack([y_pred_custom_last, np.zeros_like(y_pred_custom_last)])

y_pred_custom_last_real = inverse_transform_custom(y_pred_custom_last_extended,custom_last_scaler,last_index,first_value,last_value, train_data_custom_last, val_data_custom_last)

y_test_custom_last_extended = np.hstack([y_test_custom_last, np.zeros_like(y_test_custom_last)])

y_test_custom_last_real = inverse_transform_custom(y_test_custom_last_extended, custom_last_scaler, last_index, first_value, last_value, train_data_custom_last, val_data_custom_last)

metrics = evaluate_predictions(SAVED_MODEL_NAME_custom_split_last,y_test_custom_last_real, y_pred_custom_last_real, should_print=True)

print(y_test_custom_last_real[-5:])

plot_training_history(history_custom_last)
plot_predictions_bokeh(y_test_custom_last_real, y_pred_custom_last_real)

First Value: 1831.97998046875
Last Value: 4138.85986328125
Last Index: 2765
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step

Model Performance Metrics:
--------------------------------------------------
MAPE: 0.6631%
RMSE: 43.70595626
MSE: 1910.21061254
MAE: 33.47495258
MPD (Maximum Percentage Deviation): 3.7416%

Point of Maximum Deviation (Index 293):
--------------------------------------------------
True Value: 5186.330078
Predicted Value: 5380.384184
Absolute Difference: 194.054106
Percentage Deviation: 3.74%
Metrics saved to /content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction/results/metrics/model_metrics.csv
[5867.08007812 5930.85009766 5974.06982422 6040.04003906 6037.58984375]


NameError: name 'plot_training_history' is not defined

In [33]:
NORM_TYPE = "custom_log_split_first"

model_custom_log_first = create_lstm_model(X_train_log_custom.shape[1:], units, dropout_rate)

SAVED_MODEL_NAME_custom_log_split_first = f"{units}_{NORM_TYPE}_AdamW_mae_sequence{sequence_length}"
callbacks = get_callbacks(SAVED_MODEL_NAME_custom_log_split_first)

# history_custom_log_first = model_custom_log_first.fit(
#     X_train_log_custom, y_train_log_custom,
#     validation_data=(X_val_log_custom, y_val_log_custom),
#     epochs=epochs,
#     batch_size=batch_size,
#     callbacks=callbacks,
#     verbose=1
# )

In [34]:
log_data_path = f"{PROCESSED_DATA_PATH}/custom_split_first/log_data_custom_scaler.csv"
raw_data_log = pd.read_csv(log_data_path)

first_value_log = raw_data_log['first_value'].iloc[0]
first_index_log = 0
last_value_log = raw_data_log['last_value'].iloc[0]
last_index_log = raw_data_log['last_index'].iloc[0]

print(f"First Value: {first_value_log}\nFirst Index: {first_index_log}")
print(f"Last Value: {last_value_log}\nLast Index: {last_index_log}")

First Value: 7.513152617482637
First Index: 0
Last Value: 0.8971855609944477
Last Index: 1935


In [35]:
model_custom_log_first = load_model(f"{CHECKPOINTS_PATH}/LSTM/{SAVED_MODEL_NAME_custom_log_split_first}.keras")

y_pred_custom_log_first = model_custom_log_first.predict(X_test_log_custom)

y_pred_custom_log_first_extended = np.hstack([y_pred_custom_log_first, np.zeros_like(y_pred_custom_log_first)])

y_pred_custom_log_real = inverse_transform_custom(y_pred_custom_log_first_extended, custom_log_scaler, last_index_log,first_value_log, last_value_log, train_log_data_custom, val_log_data_custom,log_scaled=True)

y_test_custom_log_first_extended = np.hstack([y_test_log_custom, np.zeros_like(y_test_log_custom)])

y_test_custom_log_real = inverse_transform_custom(y_test_custom_log_first_extended, custom_log_scaler, last_index_log,first_value_log, last_value_log, train_log_data_custom, val_log_data_custom,log_scaled=True)

metrics = evaluate_predictions(SAVED_MODEL_NAME_custom_log_split_first,y_test_custom_log_real, y_pred_custom_log_real, should_print=True)

print(y_test_custom_log_real[-5:])


plot_training_history(history_custom_log_first)
plot_predictions_bokeh(y_test_custom_log_real, y_pred_custom_log_real)

[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 41ms/step

Model Performance Metrics:
--------------------------------------------------
MAPE: 0.6605%
RMSE: 44.07974487
MSE: 1943.02390813
MAE: 33.24341572
MPD (Maximum Percentage Deviation): 4.1759%

Point of Maximum Deviation (Index 293):
--------------------------------------------------
True Value: 5186.330078
Predicted Value: 5402.906751
Absolute Difference: 216.576672
Percentage Deviation: 4.18%
Metrics saved to /content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction/results/metrics/model_metrics.csv
[5867.08007812 5930.85009766 5974.06982422 6040.04003906 6037.58984375]


NameError: name 'plot_training_history' is not defined

In [36]:
NORM_TYPE = "custom_log_split_last"

model_custom_log_last = create_lstm_model(X_train_log_custom_last.shape[1:], units, dropout_rate)

SAVED_MODEL_NAME_custom_log_last = f"{units}_{NORM_TYPE}_AdamW_mae_sequence{sequence_length}"
callbacks = get_callbacks(SAVED_MODEL_NAME_custom_log_last)

# history_custom_log_last = model_custom_log_last.fit(
#     X_train_log_custom_last, y_train_log_custom_last,
#     validation_data=(X_val_log_custom_last, y_val_log_custom_last),
#     epochs=epochs,
#     batch_size=batch_size,
#     callbacks=callbacks,
#     verbose=0
# )

In [37]:
log_data_path = f"{PROCESSED_DATA_PATH}/custom_split_last/log_data_last_custom_scaler.csv"
raw_data_log = pd.read_csv(log_data_path)

first_value_log = raw_data_log['first_value'].iloc[0]
first_index_log = 0
last_value_log = raw_data_log['last_value'].iloc[0]
last_index_log = raw_data_log['last_index'].iloc[0]

print(f"First Value: {first_value_log}\nFirst Index: {first_index_log}")
print(f"Last Value: {last_value_log}\nLast Index: {last_index_log}")

First Value: 7.513152617482637
First Index: 0
Last Value: 1.1814902563549392
Last Index: 2765


In [38]:
model_custom_log_last = load_model(f"{CHECKPOINTS_PATH}/LSTM/{SAVED_MODEL_NAME_custom_log_last}.keras")

y_pred_custom_log_last = model_custom_log_last.predict(X_test_log_custom_last)

y_pred_custom_log_last_extended = np.hstack([y_pred_custom_log_last, np.zeros_like(y_pred_custom_log_last)])

y_pred_custom_log_last_real = inverse_transform_custom(y_pred_custom_log_last_extended,
custom_last_log_scaler, last_index_log, first_value_log, last_value_log, train_log_data_custom_last, val_log_data_custom_last, log_scaled=True)

y_test_custom_log_last_extended = np.hstack([y_test_log_custom_last, np.zeros_like(y_test_log_custom_last)])

y_test_custom_log_last_real = inverse_transform_custom(y_test_custom_log_last_extended, custom_last_log_scaler, last_index_log, first_value_log, last_value_log, train_log_data_custom_last,val_log_data_custom_last, log_scaled=True)

metrics = evaluate_predictions(SAVED_MODEL_NAME_custom_log_last,y_test_custom_log_last_real, y_pred_custom_log_last_real,should_print=True)

print(y_test_custom_log_last_real[-5:])

plot_training_history(history_custom_log_last)
plot_predictions_bokeh(y_test_custom_log_last_real, y_pred_custom_log_last_real)

[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step

Model Performance Metrics:
--------------------------------------------------
MAPE: 0.6317%
RMSE: 42.19003192
MSE: 1779.99879340
MAE: 31.82607155
MPD (Maximum Percentage Deviation): 3.9317%

Point of Maximum Deviation (Index 293):
--------------------------------------------------
True Value: 5186.330078
Predicted Value: 5390.238857
Absolute Difference: 203.908779
Percentage Deviation: 3.93%
Metrics saved to /content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction/results/metrics/model_metrics.csv
[5867.08007812 5930.85009766 5974.06982422 6040.04003906 6037.58984375]


NameError: name 'plot_training_history' is not defined

## **Function to inverz transform DataFrame scaled with MinMaxScaler**

In [19]:
def calculate_mpd(y_true, y_pred):
    # Convert inputs to numpy arrays if they aren't already
    y_true = np.array(y_true).flatten()
    y_pred = np.array(y_pred).flatten()

    # Calculate percentage deviations
    epsilon = 1e-7  # Avoid division by zero
    percentage_deviations = np.abs((y_true - y_pred) / (y_true + epsilon)) * 100

    # Find maximum deviation and its index
    max_deviation = np.max(percentage_deviations)
    max_deviation_idx = np.argmax(percentage_deviations)

    return {
        'mpd': max_deviation,
        'index': max_deviation_idx,
        'true_value': y_true[max_deviation_idx],
        'pred_value': y_pred[max_deviation_idx],
        'all_deviations': percentage_deviations
    }

In [18]:
def save_model_metrics(model_name, metrics, save_path):
    # Define column names
    columns = ['model_name', 'mape', 'mse', 'rmse', 'mae', 'mpd', 'mpd_index']

    # Convert metrics to DataFrame
    new_data = pd.DataFrame([{**{'model_name': model_name}, **metrics}])[columns]

    # Check if file exists
    if os.path.exists(save_path):
        df = pd.read_csv(save_path)

        # If model exists, update the row
        if model_name in df['model_name'].values:
            df.loc[df['model_name'] == model_name, columns[1:]] = new_data.iloc[:, 1:].values
        else:
            df = pd.concat([df, new_data], ignore_index=True)
    else:
        df = new_data

    # Save the updated DataFrame
    df.to_csv(save_path, index=False)
    print(f"Metrics saved to {save_path}")

In [17]:
def evaluate_predictions(model_name, y_true, y_pred, n_samples=None, should_print=False):
    # Flatten arrays if needed
    y_pred = y_pred.copy().flatten()
    y_true = y_true.copy().flatten()

    mape = mean_absolute_percentage_error(y_true, y_pred) * 100
    rmse = root_mean_squared_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)

    # Calculate MPD
    mpd_results = calculate_mpd(y_true, y_pred)
    mpd = mpd_results['mpd']
    mpd_index = mpd_results['index']
    true_value_at_mpd = mpd_results['true_value']
    pred_value_at_mpd = mpd_results['pred_value']
    percentage_deviations = mpd_results['all_deviations']

    if should_print:
        # Print metrics
        print("\nModel Performance Metrics:")
        print("-" * 50)
        print(f"MAPE: {mape:.4f}%")
        print(f"RMSE: {rmse:.8f}")
        print(f"MSE: {mse:.8f}")
        print(f"MAE: {mae:.8f}")
        print(f"MPD (Maximum Percentage Deviation): {mpd:.4f}%")

         # Print point of maximum deviation
        print(f"\nPoint of Maximum Deviation (Index {mpd_index}):")
        print("-" * 50)
        print(f"True Value: {y_true[mpd_index]:.6f}")
        print(f"Predicted Value: {y_pred[mpd_index]:.6f}")
        print(f"Absolute Difference: {abs(y_true[mpd_index] - y_pred[mpd_index]):.6f}")
        print(f"Percentage Deviation: {percentage_deviations[mpd_index]:.2f}%")

        # Print sample predictions
        if n_samples:
          print(f"\nFirst {n_samples} Predictions:")
          print("-" * 50)
          print("Index    True Value    Predicted    Difference    % Deviation")
          print("-" * 65)
          for i in range(min(n_samples, len(y_true))):
              diff = y_true[i] - y_pred[i]
              dev = percentage_deviations[i]
              print(f"{i:<8d} {y_true[i]:11.6f}  {y_pred[i]:11.6f}  {diff:11.6f}  {dev:11.2f}%")

    save_model_metrics(model_name, {
        'mape': mape,
        'mse': mse,
        'rmse': rmse,
        'mae': mae,
        'mpd': mpd,
        'mpd_index': mpd_index,
    }, f"{METRICS_PATH}/model_metrics.csv")

    return {
        'mape': mape,
        'mse': mse,
        'rmse': rmse,
        'mae': mae,
        'mpd': mpd,
        'mpd_index': mpd_index,
        'percentage_deviations': percentage_deviations}

In [18]:
def plot_training_history(history):
    output_notebook()

    # Create data sources
    epochs = list(range(1, len(history.history['loss']) + 1))

    # Ensure values are positive for log scale (add small epsilon if needed)
    epsilon = 1e-10
    train_loss = [max(val, epsilon) for val in history.history['loss']]
    val_loss = [max(val, epsilon) for val in history.history['val_loss']]

    source_loss = ColumnDataSource(data={
        'epoch': epochs,
        'train_loss': train_loss,
        'val_loss': val_loss
    })

    p1 = figure(title='Model Loss Over Time (Log Scale)',
               x_axis_label='Epoch',
               y_axis_label='Loss (log)',
               width=600, height=400,
               y_axis_type="log")

    # Add hover tool
    hover_loss = HoverTool(tooltips=[
        ('Epoch', '@epoch'),
        ('Training Loss', '@train_loss{0.000}'),
        ('Validation Loss', '@val_loss{0.000}')
    ])
    p1.add_tools(hover_loss)

    # Plot loss lines
    l1 = p1.line('epoch', 'train_loss', line_color=Category10[3][0],
                 line_width=2, source=source_loss, legend_label='Training Loss')
    l2 = p1.line('epoch', 'val_loss', line_color=Category10[3][1],
                 line_width=2, source=source_loss, legend_label='Validation Loss')

    # Configure legends
    for p in [p1]:
        p.legend.click_policy = "hide"
        p.legend.location = "top_right"
        p.grid.grid_line_alpha = 0.3

    # Show plots
    show(p1)

In [17]:
def plot_predictions_bokeh(y_test, y_pred, n_samples=None):
    output_notebook()

    if n_samples is None:
        n_samples = len(y_test)
    else:
        n_samples = min(n_samples, len(y_test))

    # Prepare data
    x_range = list(range(n_samples))
    source = ColumnDataSource(data={
        'index': x_range,
        'actual': y_test[:n_samples],
        'predicted': y_pred[:n_samples],
        'error': y_test[:n_samples] - y_pred[:n_samples]
    })

    # Create time series plot
    p1 = figure(title='Actual vs Predicted Values',
                x_axis_label='Sample Index',
                y_axis_label='Value',
                width=800, height=400)

    # Add hover tool
    hover = HoverTool(tooltips=[
        ('Index', '@index'),
        ('Actual', '@actual{0.000}'),
        ('Predicted', '@predicted{0.000}'),
        ('Error', '@error{0.000}')
    ])
    p1.add_tools(hover)

    # Plot lines
    l1 = p1.line('index', 'actual', line_color=Category10[3][0],
                 line_width=2, source=source, legend_label='Actual')
    l2 = p1.line('index', 'predicted', line_color=Category10[3][1],
                 line_width=2, source=source, legend_label='Predicted')

    # Show plots
    show(row(p1))

