# **Package to import from another .ipynb file**

In [5]:
!pip install import-ipynb

Collecting import-ipynb
  Downloading import_ipynb-0.2-py3-none-any.whl.metadata (2.3 kB)
Collecting jedi>=0.16 (from IPython->import-ipynb)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading import_ipynb-0.2-py3-none-any.whl (4.0 kB)
Downloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jedi, import-ipynb
Successfully installed import-ipynb-0.2 jedi-0.19.2


## **Import all necessary packages, and mount drive**

In [6]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Add project directory to system path
import sys
sys.path.append('/content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction')

# Import necessary modules
import os
import numpy as np
import pandas as pd
import import_ipynb
import matplotlib.pyplot as plt
from config import *

# TensorFlow and Keras imports
import tensorflow as tf
from keras.layers import Bidirectional
from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.regularizers import L1L2
from tensorflow.keras.metrics import (
    MeanAbsolutePercentageError,
    RootMeanSquaredError,
    MeanAbsoluteError,
    MeanSquaredError
)

# Bokeh visualization imports
from bokeh.plotting import figure, show, output_notebook
from bokeh.layouts import column, row
from bokeh.palettes import Category10
from bokeh.models import ColumnDataSource, HoverTool, Legend

# Enable Bokeh output in notebooks
output_notebook()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Get the train, val, test datasets using load_and_prepare_data()**

In [7]:
%cd {TRAINING_PATH}
import prepare_data

SEQ_LENGTH = 20

(X_train, y_train), (X_val, y_val), (X_test, y_test) = prepare_data.load_and_prepare_data(processed_path=PROCESSED_DATA_PATH, seq_length=SEQ_LENGTH,
    train_file='train_custom_splitlast.csv',
    val_file='val_custom_splitlast.csv',
    test_file='test_custom_splitlast.csv',
    feature_col='Scaled',
    target_col='Target',
)

print(f'Training set shape: {X_train.shape}')
print(f'Validation set shape: {X_val.shape}')
print(f'Test set shape: {X_test.shape}')

/content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction/models/lstm/models/training
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).




Training set shape: (17035, 20, 1)
Validation set shape: (3635, 20, 1)
Test set shape: (3635, 20, 1)




# **Create the LSTM model and callbacks (EarlyStop, Reduce Learning Rate, Save best model)**

In [8]:
def create_lstm_model(input_shape,units, dropout_rate=0.1):
    model = Sequential([
        LSTM(units, activation='relu', input_shape=input_shape),
        BatchNormalization(),
        Dense(1)
    ])

    optimizer = Adam()

    metrics = [
        MeanAbsolutePercentageError(name="MAPE"),
        RootMeanSquaredError(name="RMSE"),
        MeanSquaredError(name="MSE"),
        MeanAbsoluteError(name="MAE"),
    ]

    model.compile(
        optimizer=optimizer,
        loss='mse',
        metrics=metrics
    )

    return model

def get_callbacks(model_name, base_path=CHECKPOINTS_PATH):
    # Define file paths
    checkpoint_path = os.path.join(base_path, f'best_{model_name}.keras')

    # Define callbacks
    callbacks = [
        ModelCheckpoint(
            filepath=checkpoint_path,
            monitor='val_loss',
            mode='min',
            save_best_only=True,
            save_weights_only=False,
            verbose=1
        ),
        EarlyStopping(
            monitor='val_loss',
            patience=10,
            restore_best_weights=True,
            verbose=1
        ),
        ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=10,
            min_lr=1e-7,
            verbose=1,
            mode='min'
        )
    ]

    return callbacks


## **Create model**

In [9]:
seed=0
import os
os.environ['PYTHONHASHSEED'] = str(seed)
# For working on GPUs from "TensorFlow Determinism"
os.environ["TF_DETERMINISTIC_OPS"] = str(seed)
import numpy as np
np.random.seed(seed)
import random
random.seed(seed)
import tensorflow as tf
tf.random.set_seed(seed)


units = 32;
input_shape = (X_train.shape[1], X_train.shape[2])
model = create_lstm_model(input_shape, units)
model.summary()


  super().__init__(**kwargs)


## **Get callbacks and train the model**

In [None]:
callbacks = get_callbacks(f"{units}_custom_split_last_relu_sequence{SEQ_LENGTH}")

history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=32,
    callbacks=callbacks,
    verbose=1
)

Epoch 1/100
[1m529/533[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 10ms/step - MAE: 0.0402 - MAPE: 1858.0687 - MSE: 0.0094 - RMSE: 0.0812 - loss: 0.0094
Epoch 1: val_loss improved from inf to 0.13289, saving model to /content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction/models/lstm/models/checkpoints/best_32_custom_split_last_relu_sequence20.keras
[1m533/533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 12ms/step - MAE: 0.0400 - MAPE: 1852.6428 - MSE: 0.0094 - RMSE: 0.0809 - loss: 0.0094 - val_MAE: 0.3496 - val_MAPE: 52.7555 - val_MSE: 0.1329 - val_RMSE: 0.3645 - val_loss: 0.1329 - learning_rate: 0.0010
Epoch 2/100
[1m528/533[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 13ms/step - MAE: 0.0163 - MAPE: 2242.2070 - MSE: 4.2239e-04 - RMSE: 0.0205 - loss: 4.2239e-04
Epoch 2: val_loss improved from 0.13289 to 0.01274, saving model to /content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction/models/lstm/models/checkpoints/best_32_custom_split_last

## **Make predictions**

In [10]:
y_pred = model.predict(X_test)
print(f"\nPrediction shape: {y_pred.shape}")
y_pred[-5:]

[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step

Prediction shape: (3635, 1)


array([[-0.00375519],
       [-0.00369359],
       [-0.00363277],
       [-0.0035725 ],
       [-0.00351205]], dtype=float32)

## **Evaluate the model**

In [11]:
metrics = evaluate_predictions(y_test, y_pred, n_samples=5, scaler=None)
plot_training_history(history)
plot_predictions_bokeh(y_test, y_pred)

Shapes - Predictions: (3635,), True Values: (3635,)

Model Performance Metrics:
--------------------------------------------------
MAPE: 110.3097%
RMSE: 0.48376000
MSE: 0.23402373
MAE: 0.46238175
MPD (Maximum Percentage Deviation): 112.0767%

Point of Maximum Deviation (Index 2426):
--------------------------------------------------
True Value: 0.337064
Predicted Value: -0.040706
Absolute Difference: 0.377770
Percentage Deviation: 112.08%

First 5 Predictions:
--------------------------------------------------
Index    True Value    Predicted    Difference    % Deviation
-----------------------------------------------------------------
0           0.434640    -0.044123     0.478763       110.15%
1           0.437887    -0.044058     0.481945       110.06%
2           0.442387    -0.044183     0.486570       109.99%
3           0.441837    -0.044369     0.486206       110.04%
4           0.438847    -0.044619     0.483466       110.17%


NameError: name 'plot_training_history' is not defined

In [3]:
def calculate_mpd(y_true, y_pred):
    # Convert inputs to numpy arrays if they aren't already
    y_true = np.array(y_true).flatten()
    y_pred = np.array(y_pred).flatten()

    # Calculate percentage deviations
    epsilon = 1e-7  # Avoid division by zero
    percentage_deviations = np.abs((y_true - y_pred) / (y_true + epsilon)) * 100

    # Find maximum deviation and its index
    max_deviation = np.max(percentage_deviations)
    max_deviation_idx = np.argmax(percentage_deviations)

    return {
        'mpd': max_deviation,
        'index': max_deviation_idx,
        'true_value': y_true[max_deviation_idx],
        'pred_value': y_pred[max_deviation_idx],
        'all_deviations': percentage_deviations
    }

def evaluate_predictions(y_test, y_pred, n_samples=5, scaler=None):
    # Flatten arrays if needed
    y_pred = y_pred.flatten()
    y_test = y_test.flatten()

    print(f"Shapes - Predictions: {y_pred.shape}, True Values: {y_test.shape}")

    # Initialize Keras metrics
    mape_metric = MeanAbsolutePercentageError()
    mse_metric = MeanSquaredError()
    mae_metric = MeanAbsoluteError()

    # Update metrics
    mape_metric.update_state(y_test, y_pred)
    mse_metric.update_state(y_test, y_pred)
    mae_metric.update_state(y_test, y_pred)

    # Get metric values
    mape = float(mape_metric.result())
    mse = float(mse_metric.result())
    rmse = np.sqrt(mse)
    mae = float(mae_metric.result())

    # Calculate MPD
    mpd_results = calculate_mpd(y_test, y_pred)
    mpd = mpd_results['mpd']
    mpd_index = mpd_results['index']
    true_value_at_mpd = mpd_results['true_value']
    pred_value_at_mpd = mpd_results['pred_value']
    percentage_deviations = mpd_results['all_deviations']

    # Print metrics
    print("\nModel Performance Metrics:")
    print("-" * 50)
    print(f"MAPE: {mape:.4f}%")
    print(f"RMSE: {rmse:.8f}")
    print(f"MSE: {mse:.8f}")
    print(f"MAE: {mae:.8f}")
    print(f"MPD (Maximum Percentage Deviation): {mpd:.4f}%")

    # Print point of maximum deviation
    print(f"\nPoint of Maximum Deviation (Index {mpd_index}):")
    print("-" * 50)
    print(f"True Value: {y_test[mpd_index]:.6f}")
    print(f"Predicted Value: {y_pred[mpd_index]:.6f}")
    print(f"Absolute Difference: {abs(y_test[mpd_index] - y_pred[mpd_index]):.6f}")
    print(f"Percentage Deviation: {percentage_deviations[mpd_index]:.2f}%")

     # Print sample predictions
    print(f"\nFirst {n_samples} Predictions:")
    print("-" * 50)
    print("Index    True Value    Predicted    Difference    % Deviation")
    print("-" * 65)
    for i in range(min(n_samples, len(y_test))):
        diff = y_test[i] - y_pred[i]
        dev = percentage_deviations[i]
        print(f"{i:<8d} {y_test[i]:11.6f}  {y_pred[i]:11.6f}  {diff:11.6f}  {dev:11.2f}%")

    return {
        'mape': mape,
        'mse': mse,
        'rmse': rmse,
        'mae': mae,
        'mpd': mpd,
        'mpd_index': mpd_index,
        'percentage_deviations': percentage_deviations
    }

In [None]:
def plot_training_history(history):
    output_notebook()

    # Create data sources
    epochs = list(range(1, len(history.history['loss']) + 1))
    source_loss = ColumnDataSource(data={
        'epoch': epochs,
        'train_loss': history.history['loss'],
        'val_loss': history.history['val_loss']
    })

    source_mape = ColumnDataSource(data={
        'epoch': epochs,
        'train_mape': history.history['MAPE'],
        'val_mape': history.history['val_MAPE']
    })

    # Create loss plot
    p1 = figure(title='Model Loss Over Time',
               x_axis_label='Epoch',
               y_axis_label='Loss',
               width=600, height=400)

    # Add hover tool
    hover_loss = HoverTool(tooltips=[
        ('Epoch', '@epoch'),
        ('Training Loss', '@train_loss{0.000}'),
        ('Validation Loss', '@val_loss{0.000}')
    ])
    p1.add_tools(hover_loss)

    # Plot loss lines
    l1 = p1.line('epoch', 'train_loss', line_color=Category10[3][0],
                 line_width=2, source=source_loss, legend_label='Training Loss')
    l2 = p1.line('epoch', 'val_loss', line_color=Category10[3][1],
                 line_width=2, source=source_loss, legend_label='Validation Loss')

    # Create MAPE plot
    p2 = figure(title='MAPE Over Time',
                x_axis_label='Epoch',
                y_axis_label='MAPE (%)',
                width=600, height=400)

    # Add hover tool
    hover_mape = HoverTool(tooltips=[
        ('Epoch', '@epoch'),
        ('Training MAPE', '@train_mape{0.00}%'),
        ('Validation MAPE', '@val_mape{0.00}%')
    ])
    p2.add_tools(hover_mape)

    # Plot MAPE lines
    l3 = p2.line('epoch', 'train_mape', line_color=Category10[3][0],
                 line_width=2, source=source_mape, legend_label='Training MAPE')
    l4 = p2.line('epoch', 'val_mape', line_color=Category10[3][1],
                 line_width=2, source=source_mape, legend_label='Validation MAPE')

    # Configure legends
    for p in [p1, p2]:
        p.legend.click_policy = "hide"
        p.legend.location = "top_right"
        p.grid.grid_line_alpha = 0.3

    # Show plots
    show(row(p1, p2))



In [1]:
def plot_predictions_bokeh(y_test, y_pred, n_samples=None):
    output_notebook()

    if n_samples is None:
        n_samples = len(y_test)
    else:
        n_samples = min(n_samples, len(y_test))

    # Prepare data
    x_range = list(range(n_samples))
    source = ColumnDataSource(data={
        'index': x_range,
        'actual': y_test[:n_samples],
        'predicted': y_pred[:n_samples],
        'error': y_test[:n_samples] - y_pred[:n_samples]
    })

    # Create time series plot
    p1 = figure(title='Actual vs Predicted Values',
                x_axis_label='Sample Index',
                y_axis_label='Value',
                width=800, height=400)

    # Add hover tool
    hover = HoverTool(tooltips=[
        ('Index', '@index'),
        ('Actual', '@actual{0.000}'),
        ('Predicted', '@predicted{0.000}'),
        ('Error', '@error{0.000}')
    ])
    p1.add_tools(hover)

    # Plot lines
    l1 = p1.line('index', 'actual', line_color=Category10[3][0],
                 line_width=2, source=source, legend_label='Actual')
    l2 = p1.line('index', 'predicted', line_color=Category10[3][1],
                 line_width=2, source=source, legend_label='Predicted')

    # Show plots
    show(row(p1))



In [133]:
def evaluate_all_models(checkpoint_dir, X_test, y_test, scaler=None):
    # Find all .keras files
    model_files = []
    for root, dirs, files in os.walk(checkpoint_dir):
        for file in files:
            if file.endswith('.keras'):
                model_files.append(os.path.join(root, file))

    print(f"\nFound {len(model_files)} models to evaluate")
    print("=" * 80)

    # Store results
    results = {}

    for model_path in model_files:
        model_name = os.path.basename(model_path).replace('.keras', '')
        print(f"\nEvaluating model: {model_name}")
        print("-" * 80)

        try:
            # Load model
            model = load_model(model_path)

            # Make predictions
            y_pred = model.predict(X_test, verbose=0)

            # Get metrics
            metrics = evaluate_predictions(y_test, y_pred, n_samples=5, scaler=scaler)

            # Store results
            results[model_name] = {
                'metrics': metrics,
                'model': model
            }

        except Exception as e:
            print(f"Error evaluating {model_name}: {str(e)}")

    # Create comparison DataFrame
    metrics_df = pd.DataFrame({
        model_name: {
            'MAPE (%)': results[model_name]['metrics']['mape'],
            'RMSE': results[model_name]['metrics']['rmse'],
            'MSE': results[model_name]['metrics']['mse'],
            'MAE': results[model_name]['metrics']['mae'],
            'MPD (%)': results[model_name]['metrics']['mpd']
        }
        for model_name in results.keys()
    }).T

    metrics_df = metrics_df.sort_values(by='MAPE (%)')

    print("\nModel Comparison Summary (Sorted by MAPE %):")
    print("=" * 80)
    print(metrics_df.to_string())
    print("=" * 80)

    return results, metrics_df

In [137]:
# Evaluate all models
results, metrics_df = evaluate_all_models(CHECKPOINTS_PATH, X_test, y_test)


Found 50 models to evaluate

Evaluating model: best_256_simple_split_first
--------------------------------------------------------------------------------

Evaluating model: best_150_simple_split_first
--------------------------------------------------------------------------------

Evaluating model: best_100_simple_split_first
--------------------------------------------------------------------------------

Evaluating model: best_64_simple_split_first
--------------------------------------------------------------------------------

Evaluating model: best_115_simple_split_first
--------------------------------------------------------------------------------

Evaluating model: best_512_simple_split_first
--------------------------------------------------------------------------------

Evaluating model: best_50_simple_split_first
--------------------------------------------------------------------------------

Evaluating model: best_128_simple_split_first
------------------------------