# **Imports**

In [21]:
import sys
import os
sys.path.insert(0, '/content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction')
from config import *

import yfinance as yf
import pandas as pd
import numpy as np
import joblib

from sklearn.preprocessing import MinMaxScaler, StandardScaler

from bokeh.plotting import figure, show, output_notebook, reset_output, output_file
from bokeh.io import export_png
from bokeh.models import ColumnDataSource, HoverTool, DatetimeTickFormatter, Div, PanTool, BoxZoomTool, ResetTool, SaveTool
from bokeh.palettes import Category10
from bokeh.layouts import row, column

import matplotlib.pyplot as plt

output_notebook()

### **Get historical data for S&P 500**

In [62]:
def fetch_stock_data(symbol, end, save_path):
    """Download the full available price history of ``symbol`` and store it as CSV.

    Parameters
    ----------
    symbol : str
        Ticker passed to ``yf.Ticker`` (e.g. ``'^GSPC'``).
    end : str
        Value forwarded as ``end`` to ``Ticker.history`` together with
        ``period='max'``.
    save_path : str
        Directory that receives ``<symbol>_data.csv``; created if missing.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``Ticker.history``.

    Raises
    ------
    ValueError
        If the download returns no rows. Without this guard an empty CSV would
        be written and the date-range report below would fail on ``index[0]``.
    """
    df = yf.Ticker(symbol).history(period='max', end=end)
    if df.empty:
        raise ValueError(f"No data returned for symbol {symbol!r} (end={end!r})")

    # Make sure the destination exists before writing.
    os.makedirs(save_path, exist_ok=True)
    csv_path = f"{save_path}/{symbol}_data.csv"
    df.to_csv(csv_path)

    print(f"Data saved to {csv_path}")
    print(f"Date range: {df.index[0]} to {df.index[-1]}")
    print(f"Shape: {df.shape}")
    return df

In [63]:
# Ticker symbol and the end date handed to the download helper.
SYMBOL_NAME = '^GSPC'
END_DATE = '2024-12-31'

# Download the history and write it under RAW_DATA_PATH.
raw_data = fetch_stock_data(
    symbol=SYMBOL_NAME,
    end=END_DATE,
    save_path=RAW_DATA_PATH,
)

Data saved to /content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction/models/lstm/data/raw/^GSPC_data.csv
Date range: 1927-12-30 00:00:00-05:00 to 2024-12-30 00:00:00-05:00
Shape: (24366, 7)


# **Function to plot time series data**

In [24]:
def plot_timeseries(df, title, y_col):
    """Show an interactive Bokeh line chart of one column against the frame's index.

    The index is converted to a UTC ``DatetimeIndex`` for plotting only; the
    frame passed in is not modified, so running this cell does not change what
    later cells see or save.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index holds the dates (anything ``pd.to_datetime`` accepts).
    title : str
        Figure title.
    y_col : str
        Name of the column to plot; also used as the y-axis label.

    Returns
    -------
    bokeh figure
        The figure that was shown.
    """
    # Convert a detached copy of the index instead of assigning back to df.index.
    dates = pd.to_datetime(df.index, utc=True)

    source = ColumnDataSource(data={
        'date': dates,
        'value': df[y_col],
        # Pre-formatted day strings for the hover tooltip.
        'date_str': dates.strftime('%Y-%m-%d').tolist()
    })

    p = figure(
        width=800, height=400,
        x_axis_type='datetime',
        title=title
    )

    p.line('date', 'value', source=source, line_width=2)

    hover = HoverTool(
        tooltips=[
            ('Date', '@date_str'),
            ('Value', '@value{0,0.00}')
        ]
    )
    p.add_tools(hover)

    p.xaxis.axis_label = 'Date'
    p.yaxis.axis_label = y_col

    show(p)

    return p

### **Load saved raw data**

In [64]:
# Read the raw CSV back from disk, using the 'Date' column as the index.
file_path = f"{RAW_DATA_PATH}/{SYMBOL_NAME}_data.csv"
raw_data = pd.read_csv(
    file_path,
    parse_dates=True,
    index_col='Date',
)

# Report shape and covered date range, then show the last rows.
print(f"Loaded data shape: {raw_data.shape}\nDate range: {raw_data.index[0]} to {raw_data.index[-1]}")
print(raw_data.tail().to_markdown())

Loaded data shape: (24366, 7)
Date range: 1927-12-30 00:00:00-05:00 to 2024-12-30 00:00:00-05:00
| Date                      |    Open |    High |     Low |   Close |      Volume |   Dividends |   Stock Splits |
|:--------------------------|--------:|--------:|--------:|--------:|------------:|------------:|---------------:|
| 2024-12-23 00:00:00-05:00 | 5940.25 | 5978.25 | 5902.57 | 5974.07 | 3.59328e+09 |           0 |              0 |
| 2024-12-24 00:00:00-05:00 | 5984.63 | 6040.1  | 5981.44 | 6040.04 | 1.75772e+09 |           0 |              0 |
| 2024-12-26 00:00:00-05:00 | 6024.97 | 6049.75 | 6007.37 | 6037.59 | 2.90453e+09 |           0 |              0 |
| 2024-12-27 00:00:00-05:00 | 6006.17 | 6006.17 | 5932.95 | 5970.84 | 3.15961e+09 |           0 |              0 |
| 2024-12-30 00:00:00-05:00 | 5920.67 | 5940.79 | 5869.16 | 5906.94 | 3.43325e+09 |           0 |              0 |


## **Check data types**

In [65]:
# Display the dtype of every column in the loaded frame.
raw_data.dtypes

Unnamed: 0,0
Open,float64
High,float64
Low,float64
Close,float64
Volume,int64
Dividends,float64
Stock Splits,float64


# **Select feature and target columns**

In [67]:
# Feature is the day's close; the target is the next row's close.
# The last row has no following close, so it is dropped.
df = (
    raw_data[['Close']]
    .assign(Target=lambda frame: frame['Close'].shift(-1))
    .dropna()
)

print(df.tail().to_markdown())

| Date                      |   Close |   Target |
|:--------------------------|--------:|---------:|
| 2024-12-20 00:00:00-05:00 | 5930.85 |  5974.07 |
| 2024-12-23 00:00:00-05:00 | 5974.07 |  6040.04 |
| 2024-12-24 00:00:00-05:00 | 6040.04 |  6037.59 |
| 2024-12-26 00:00:00-05:00 | 6037.59 |  5970.84 |
| 2024-12-27 00:00:00-05:00 | 5970.84 |  5906.94 |


## **S&P500 data chart (1928-2024)**

In [28]:
# Chart the raw closing prices.
plot_timeseries(df, title='S&P 500 Historical Close Price', y_col='Close')

## **Apply log scale on DataFrame**

In [29]:
def apply_log_scale(df):
    """Return a new frame holding the natural log of every column of ``df``.

    Zeros are replaced by NaN before the logarithm is taken, so they come out
    as NaN. The input frame is left unchanged.
    """
    def _log_without_zeros(column):
        # Swap exact zeros for NaN, then take the natural log.
        return np.log(column.replace(0, np.nan))

    return df.apply(_log_without_zeros)

# **Apply log scale on dataframe and save it**

In [30]:
# Log-transform both columns and keep a CSV copy next to the raw data.
df_log = apply_log_scale(df)
df_log.to_csv(
    f"{RAW_DATA_PATH}/{SYMBOL_NAME}_log_data.csv"
)

## **S&P500 log scale data chart (1928-2024)**

In [31]:
# Chart the log-transformed closing prices.
plot_timeseries(df_log, title='S&P 500 Historical Prices (Log Scale)', y_col='Close')

## **Function to split data into train, validation and test**

In [32]:
def split_data(df, train_size=0.8, val_size=0.10):
    """Split ``df`` into consecutive train / validation / test slices.

    Rows keep their original order (no shuffling): the first ``train_size``
    fraction is the training slice, the next ``val_size`` fraction is the
    validation slice, and whatever remains is the test slice.

    Parameters
    ----------
    df : sliceable sequence (e.g. pandas.DataFrame)
        Data to split; only ``len`` and slicing are used.
    train_size : float, default 0.8
        Fraction of rows for training, in ``[0, 1]``.
    val_size : float, default 0.10
        Fraction of rows for validation, in ``[0, 1]``.

    Returns
    -------
    tuple
        ``(train, val, test)``.

    Raises
    ------
    ValueError
        If a fraction lies outside ``[0, 1]`` or the two fractions sum to more
        than 1 (which would silently leave the test slice empty).
    """
    if not (0 <= train_size <= 1 and 0 <= val_size <= 1):
        raise ValueError(
            f"train_size and val_size must lie in [0, 1], got {train_size} and {val_size}"
        )
    # Small tolerance so sums such as 0.7 + 0.3 are not rejected by rounding.
    if train_size + val_size > 1 + 1e-9:
        raise ValueError(
            f"train_size + val_size must not exceed 1, got {train_size + val_size}"
        )

    n = len(df)
    train_end = int(n * train_size)
    val_end = int(n * (train_size + val_size))

    train = df[:train_end]
    val = df[train_end:val_end]
    test = df[val_end:]

    return train, val, test

## **Simple MinMaxScaler: split the data, fit scaler on train**

In [33]:
def minmax_split_first(df, feature_cols, target_col, save_path, file_name):
    """Split the data first, then min-max scale it with scalers fitted on train only.

    The frame is split with ``split_data`` (default fractions). One
    ``MinMaxScaler`` is fitted on the training features and one on the
    training target; both are then applied to all three splits, so validation
    and test values may fall outside ``[0, 1]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data containing the feature and target columns.
    feature_cols : list of str or str
        Feature column name(s).
    target_col : list of str or str
        Target column name(s).
    save_path : str
        Base directory; files go to ``<save_path>/minmax_split_first``.
    file_name : str
        Prefix of the written CSV and joblib files.

    Returns
    -------
    tuple
        ``(train_scaled, val_scaled, test_scaled, features_scaler, target_scaler)``
        where each scaled frame holds the feature columns followed by the
        target column(s) on the original index.
    """
    # A bare column name would select a 1-D Series, which the scaler rejects.
    feature_cols = [feature_cols] if isinstance(feature_cols, str) else list(feature_cols)
    target_col = [target_col] if isinstance(target_col, str) else list(target_col)

    train_data, val_data, test_data = split_data(df)

    features_scaler = MinMaxScaler()
    target_scaler = MinMaxScaler()

    # Fit scalers on training data only
    features_scaler.fit(train_data[feature_cols])
    target_scaler.fit(train_data[target_col])

    def _scale(split):
        """Scale one split and return it as a frame on the split's own index."""
        # transform() returns plain arrays; wrap them to keep labels and index.
        scaled_features = pd.DataFrame(
            features_scaler.transform(split[feature_cols]),
            index=split.index,
            columns=feature_cols,
        )
        scaled_target = pd.DataFrame(
            target_scaler.transform(split[target_col]),
            index=split.index,
            columns=target_col,
        )
        return pd.concat([scaled_features, scaled_target], axis=1)

    train_scaled = _scale(train_data)
    val_scaled = _scale(val_data)
    test_scaled = _scale(test_data)

    out_dir = f"{save_path}/minmax_split_first"
    os.makedirs(out_dir, exist_ok=True)

    train_scaled.to_csv(f"{out_dir}/{file_name}_train.csv")
    val_scaled.to_csv(f"{out_dir}/{file_name}_val.csv")
    test_scaled.to_csv(f"{out_dir}/{file_name}_test.csv")

    # Save scalers using joblib
    joblib.dump(features_scaler, f"{out_dir}/{file_name}_features_scaler.joblib")
    joblib.dump(target_scaler, f"{out_dir}/{file_name}_target_scaler.joblib")

    print(f"Data successfully scaled and saved to {out_dir}/")
    # Report the directory the scalers were actually written to.
    print(f"Scalers saved to {out_dir}/")

    return train_scaled, val_scaled, test_scaled, features_scaler, target_scaler

# **Fit MinMaxScaler on the entire dataframe, then split**

In [34]:
def minmax_split_last(df, feature_cols, target_col, save_path, file_name):
    """Min-max scale the whole dataset first, then split it.

    Both scalers are fitted on every row of ``df``, i.e. including the rows
    that later end up in the validation and test splits. The scaled frame is
    then split with ``split_data`` (default fractions).

    Parameters
    ----------
    df : pandas.DataFrame
        Source data containing the feature and target columns.
    feature_cols : list of str or str
        Feature column name(s).
    target_col : list of str or str
        Target column name(s).
    save_path : str
        Base directory; files go to ``<save_path>/minmax_split_last``.
    file_name : str
        Prefix of the written CSV and joblib files.

    Returns
    -------
    tuple
        ``(train_data, val_data, test_data, features_scaler, target_scaler)``.
    """
    # A bare column name would select a 1-D Series, which the scaler rejects.
    feature_cols = [feature_cols] if isinstance(feature_cols, str) else list(feature_cols)
    target_col = [target_col] if isinstance(target_col, str) else list(target_col)

    features_scaler = MinMaxScaler()
    target_scaler = MinMaxScaler()

    # Fit scalers on entire dataset
    features_scaler.fit(df[feature_cols])
    target_scaler.fit(df[target_col])

    # transform() returns plain arrays; wrap them to keep labels and index.
    df_scaled_features = pd.DataFrame(
        features_scaler.transform(df[feature_cols]),
        index=df.index,
        columns=feature_cols,
    )
    df_scaled_target = pd.DataFrame(
        target_scaler.transform(df[target_col]),
        index=df.index,
        columns=target_col,
    )

    # Combine features and target
    df_scaled = pd.concat([df_scaled_features, df_scaled_target], axis=1)

    # Split data after scaling
    train_data, val_data, test_data = split_data(df_scaled)

    out_dir = f"{save_path}/minmax_split_last"
    os.makedirs(out_dir, exist_ok=True)

    train_data.to_csv(f"{out_dir}/{file_name}_train.csv")
    val_data.to_csv(f"{out_dir}/{file_name}_val.csv")
    test_data.to_csv(f"{out_dir}/{file_name}_test.csv")

    # Save scalers using joblib
    joblib.dump(features_scaler, f"{out_dir}/{file_name}_features_scaler.joblib")
    joblib.dump(target_scaler, f"{out_dir}/{file_name}_target_scaler.joblib")

    print(f"Data successfully scaled and saved to {out_dir}/")
    # Report the directory the scalers were actually written to.
    print(f"Scalers saved to {out_dir}/")

    return train_data, val_data, test_data, features_scaler, target_scaler

# **Normalize datasets**

In [35]:
# Raw prices: split first, scalers fitted on the training split only.
(
    train_minmax_first,
    val_minmax_first,
    test_minmax_first,
    features_scaler,
    target_scaler,
) = minmax_split_first(
    df,
    feature_cols=['Close'],
    target_col=['Target'],
    save_path=PROCESSED_DATA_PATH,
    file_name='normal_data',
)

Data successfully scaled and saved to /content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction/models/lstm/data/processed/minmax_split_first/
Scalers saved to /content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction/models/lstm/data/processed/


In [36]:
# Raw prices: scalers fitted on the entire dataset, split afterwards.
(
    train_minmax_last,
    val_minmax_last,
    test_minmax_last,
    features_scaler_last,
    target_scaler_last,
) = minmax_split_last(
    df,
    feature_cols=['Close'],
    target_col=['Target'],
    save_path=PROCESSED_DATA_PATH,
    file_name='normal_data',
)

Data successfully scaled and saved to /content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction/models/lstm/data/processed/minmax_split_last/
Scalers saved to /content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction/models/lstm/data/processed/


# **Compare the two normalization methods (fitting MinMaxScaler on the training split vs. on the entire dataset)**

In [37]:
# Side-by-side view of the 'Close' column under both scaling strategies.
# NOTE(review): plot_comparison_datetime_bokeh is defined in a later cell of this
# notebook; on a fresh kernel that cell must be run first, otherwise this call
# raises NameError.
plot_comparison_datetime_bokeh(
    split_first_data=(train_minmax_first, val_minmax_first, test_minmax_first),
    split_last_data=(train_minmax_last, val_minmax_last, test_minmax_last),
    save_path=f"{FIGURES_PATH}/data",
    file_name="minmax_first_vs_last.png",
    title="Comparison of fitting MinMaxScaler on train vs on the entire dataset"
)

Output hidden; open in https://colab.research.google.com to view.

# **Normalize log scaled data**

In [38]:
# Log prices: split first, scalers fitted on the training split only.
(
    train_log_minmax_first,
    val_log_minmax_first,
    test_log_minmax_first,
    features_scaler_log_first,
    target_scaler_log_first,
) = minmax_split_first(
    df_log,
    feature_cols=['Close'],
    target_col=['Target'],
    save_path=PROCESSED_DATA_PATH,
    file_name='log_data',
)

Data successfully scaled and saved to /content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction/models/lstm/data/processed/minmax_split_first/
Scalers saved to /content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction/models/lstm/data/processed/


In [39]:
# Log prices: scalers fitted on the entire dataset, split afterwards.
(
    train_log_minmax_last,
    val_log_minmax_last,
    test_log_minmax_last,
    features_scaler_log_last,
    target_scaler_log_last,
) = minmax_split_last(
    df_log,
    feature_cols=['Close'],
    target_col=['Target'],
    save_path=PROCESSED_DATA_PATH,
    file_name='log_data',
)

Data successfully scaled and saved to /content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction/models/lstm/data/processed/minmax_split_last/
Scalers saved to /content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction/models/lstm/data/processed/


In [40]:
# Same comparison as for the raw prices, here on the log-transformed data.
# NOTE(review): plot_comparison_datetime_bokeh is defined in a later cell of this
# notebook; on a fresh kernel that cell must be run first, otherwise this call
# raises NameError.
plot_comparison_datetime_bokeh(
    split_first_data=(train_log_minmax_first, val_log_minmax_first, test_log_minmax_first),
    split_last_data=(train_log_minmax_last, val_log_minmax_last, test_log_minmax_last),
    save_path=f"{FIGURES_PATH}/data",
    file_name="minmax_first_vs_last_log.png",
    title="Comparison of fitting MinMaxScaler on train vs on the entire dataset in case of log scaled data"
)

Output hidden; open in https://colab.research.google.com to view.

# **Custom data normalization method combined with MinMaxScaler**

In [41]:
def subtract_first_value(df, column_name):
    """Return a copy of ``df`` in which ``column_name`` is shifted to start at zero.

    The value in the first row of the column is subtracted from every row of
    that column. Other columns and the input frame itself are left unchanged.
    """
    shifted = df.copy()
    column = shifted[column_name]
    baseline = column.iloc[0]
    shifted[column_name] = column.sub(baseline)
    return shifted

def custom_normalize(data, last_value, index, n):
    """Detrend one value against the straight line from the origin to ``(n, last_value)``.

    With ``slope = last_value / n`` the result is::

        (data - slope * index) * index / sqrt(index**2 + (slope * index)**2)

    For ``index > 0`` this equals ``(data - slope * index) / sqrt(1 + slope**2)``,
    i.e. the signed distance of the point ``(index, data)`` from that line.

    Parameters
    ----------
    data : float
        Value at position ``index``.
    last_value : float
        Value at the last position ``n``.
    index : int
        Position of ``data`` in the series.
    n : int
        Position of the last element.

    Returns
    -------
    float or int
        The detrended value; ``0`` when the denominator is zero (``index == 0``)
        or when ``n == 0`` (a one-element series has no trend line).
    """
    # Guard first: the slope below divides by n.
    if n == 0:
        return 0

    trend = (last_value / n) * index
    # Open question left by the original author: whether to take abs() here.
    numerator = (data - trend) * index
    denominator = np.sqrt(index**2 + trend**2)
    return 0 if denominator == 0 else numerator / denominator

def normalize(df, column_name):
    """Add a ``Custom_Normalized`` column computed by ``custom_normalize``.

    The frame is copied and re-indexed 0..n, so the returned frame no longer
    carries the original index. Each row's value in ``column_name`` is passed
    to ``custom_normalize`` together with its position, the last position
    ``n`` and the value found in the last row.
    """
    frame = df.copy().reset_index(drop=True)
    n = len(frame) - 1
    last_value = frame[column_name].iloc[n]

    # After the reset the enumeration position is the same as the index label.
    normalized = []
    for position, value in enumerate(frame[column_name].to_numpy()):
        normalized.append(custom_normalize(value, last_value, position, n))

    frame['Custom_Normalized'] = normalized
    return frame

In [75]:
def custom_split_first(df, feature_columns, save_dir, file_prefix):
    """Custom-normalize 'Close', split, then min-max scale with train-fitted scalers.

    Steps, in order:
      1. Store the first and last 'Close' values and the last row position in
         ``<file_prefix>_reference_values.npy``.
      2. Shift 'Close' to start at zero and add the ``Custom_Normalized``
         column (computed on the whole frame, before splitting).
      3. Set 'Target' to the next row's ``Custom_Normalized`` and drop rows
         containing NaN.
      4. Split with ``split_data``, fit one ``MinMaxScaler`` on the training
         features and one on the training target, and transform all splits.
      5. Write the three CSVs and both scalers to
         ``<save_dir>/custom_split_first``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Close' column.
    feature_columns : list of str
        Feature columns to scale (concatenated with ``['Target']``).
    save_dir : str
        Base output directory.
    file_prefix : str
        Prefix of every written file.

    Returns
    -------
    tuple
        ``(train_scaled, val_scaled, test_scaled, features_scaler, target_scaler)``.
    """
    save_path = f"{save_dir}/custom_split_first"
    os.makedirs(save_path, exist_ok=True)

    # Reference values needed to undo the custom normalization later.
    close = df['Close']
    reference_values = {
        'first_value': close.iloc[0],
        'last_value': close.iloc[-1],
        'last_index': len(df) - 1,
    }
    print(reference_values)

    # The dict is stored as a single object inside the NPY file.
    np.save(f"{save_path}/{file_prefix}_reference_values.npy", reference_values)

    df_norm = normalize(subtract_first_value(df, 'Close'), 'Close')
    print(df_norm.tail().to_markdown())

    # Target is the next row's normalized value; the last row has none.
    df_norm['Target'] = df_norm['Custom_Normalized'].shift(-1)
    df_norm = df_norm.dropna()

    train, val, test = split_data(df_norm)

    features_scaler = MinMaxScaler(feature_range=(0,1))
    target_scaler = MinMaxScaler(feature_range=(0,1))
    features_scaler.fit(train[feature_columns])
    target_scaler.fit(train[['Target']])

    output_columns = feature_columns + ['Target']

    def _scaled(split):
        """Scale one split and rebuild a frame on the split's own index."""
        values = np.concatenate(
            [
                features_scaler.transform(split[feature_columns]),
                target_scaler.transform(split[['Target']]),
            ],
            axis=1,
        )
        return pd.DataFrame(values, index=split.index, columns=output_columns)

    train_scaled = _scaled(train)
    val_scaled = _scaled(val)
    test_scaled = _scaled(test)

    for suffix, frame in (('train', train_scaled), ('val', val_scaled), ('test', test_scaled)):
        frame.to_csv(f"{save_path}/{file_prefix}_{suffix}.csv")

    joblib.dump(features_scaler, f"{save_path}/{file_prefix}_features_scaler.joblib")
    joblib.dump(target_scaler, f"{save_path}/{file_prefix}_target_scaler.joblib")

    return train_scaled, val_scaled, test_scaled, features_scaler, target_scaler

In [43]:
def custom_split_last(df, feature_columns, save_dir, file_prefix):
    """Custom-normalize 'Close', min-max scale the whole frame, then split.

    'Close' is shifted to start at zero and the ``Custom_Normalized`` column is
    added. 'Target' is set to the next row's normalized value and rows
    containing NaN are dropped. Both scalers are fitted on all remaining rows
    before the frame is split with ``split_data``. Scalers and CSVs are
    written to ``<save_dir>/custom_split_last``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Close' column.
    feature_columns : list of str
        Feature columns to scale (concatenated with ``['Target']``).
    save_dir : str
        Base output directory.
    file_prefix : str
        Prefix of every written file.

    Returns
    -------
    tuple
        ``(train_scaled, val_scaled, test_scaled, features_scaler, target_scaler)``.
    """
    save_path = f"{save_dir}/custom_split_last"
    os.makedirs(save_path, exist_ok=True)

    df_norm = normalize(subtract_first_value(df, 'Close'), 'Close')

    # Target is the next row's normalized value; the last row has none.
    df_norm['Target'] = df_norm['Custom_Normalized'].shift(-1)
    df_norm = df_norm.dropna()

    features_scaler = MinMaxScaler(feature_range=(0, 1))
    target_scaler = MinMaxScaler(feature_range=(0, 1))

    # Both scalers see every row, including those that end up in val/test.
    features_scaler.fit(df_norm[feature_columns])
    target_scaler.fit(df_norm[['Target']])

    scaled_values = np.concatenate(
        [
            features_scaler.transform(df_norm[feature_columns]),
            target_scaler.transform(df_norm[['Target']]),
        ],
        axis=1,
    )
    df_scaled = pd.DataFrame(
        scaled_values,
        index=df_norm.index,
        columns=feature_columns + ['Target'],
    )

    train_scaled, val_scaled, test_scaled = split_data(df_scaled)

    joblib.dump(features_scaler, f"{save_path}/{file_prefix}_features_scaler.joblib")
    joblib.dump(target_scaler, f"{save_path}/{file_prefix}_target_scaler.joblib")

    for suffix, frame in (('train', train_scaled), ('val', val_scaled), ('test', test_scaled)):
        frame.to_csv(f"{save_path}/{file_prefix}_{suffix}.csv")

    return train_scaled, val_scaled, test_scaled, features_scaler, target_scaler

In [76]:
# Raw prices: custom normalization, split, scalers fitted on the training split.
(
    train_custom_first,
    val_custom_first,
    test_custom_first,
    features_scaler_custom_first,
    target_scaler_custom_first,
) = custom_split_first(
    df,
    feature_columns=['Custom_Normalized'],
    save_dir=PROCESSED_DATA_PATH,
    file_prefix='normal_data',
)

{'first_value': 17.65999984741211, 'last_value': 5970.83984375, 'last_index': 24364}
|       |   Close |   Target |   Custom_Normalized |
|------:|--------:|---------:|--------------------:|
| 24360 | 5913.19 |  5974.07 |           -37.8975  |
| 24361 | 5956.41 |  6040.04 |             3.84975 |
| 24362 | 6022.38 |  6037.59 |            67.6973  |
| 24363 | 6019.93 |  5970.84 |            65.0798  |
| 24364 | 5953.18 |  5906.94 |             0       |


In [45]:
# Raw prices: custom normalization, scalers fitted on all rows, split afterwards.
(
    train_custom_last,
    val_custom_last,
    test_custom_last,
    features_scaler_custom_last,
    target_scaler_custom_last,
) = custom_split_last(
    df,
    feature_columns=['Custom_Normalized'],
    save_dir=PROCESSED_DATA_PATH,
    file_prefix='normal_data_last',
)

In [58]:
# Number of rows in the test split.
test_custom_first.shape[0]

2437

In [47]:
# Side-by-side view of the 'Custom_Normalized' column under both strategies.
# NOTE(review): plot_comparison_bokeh is defined in a later cell of this notebook;
# on a fresh kernel that cell must be run first, otherwise this call raises
# NameError.
plot_comparison_bokeh(
    split_first_data=(train_custom_first, val_custom_first, test_custom_first),
    split_last_data=(train_custom_last, val_custom_last, test_custom_last),
    save_path=f"{FIGURES_PATH}/data",
    file_name="custom_normalized_first_vs_last.png",
    column_name="Custom_Normalized",
    title="Comparison of split First vs split Last Custom Normalization"
)

Plot saved as PNG: custom_normalized_first_vs_last.png to /content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction/models/lstm/results/figures/data


In [48]:
# Log prices: custom normalization, split, scalers fitted on the training split.
(
    train_custom_log_first,
    val_custom_log_first,
    test_custom_log_first,
    features_scaler_custom_log_first,
    target_scaler_custom_log_first,
) = custom_split_first(
    df_log,
    feature_columns=['Custom_Normalized'],
    save_dir=PROCESSED_DATA_PATH,
    file_prefix='log_data',
)

In [49]:
# Log prices: custom normalization, scalers fitted on all rows, split afterwards.
(
    train_custom_log_last,
    val_custom_log_last,
    test_custom_log_last,
    features_scaler_custom_log_last,
    target_scaler_custom_log_last,
) = custom_split_last(
    df_log,
    feature_columns=['Custom_Normalized'],
    save_dir=PROCESSED_DATA_PATH,
    file_prefix='log_data_last',
)

In [50]:
# Same comparison as above, here for the log-transformed data.
# NOTE(review): plot_comparison_bokeh is defined in a later cell of this notebook;
# on a fresh kernel that cell must be run first, otherwise this call raises
# NameError.
plot_comparison_bokeh(
    split_first_data=(train_custom_log_first, val_custom_log_first, test_custom_log_first),
    split_last_data=(train_custom_log_last, val_custom_log_last, test_custom_log_last),
    save_path=f"{FIGURES_PATH}/data",
    file_name="comparison_custom_log_scaled_data.png",
    column_name="Custom_Normalized",
    title="Comparison of Split First vs Split Last Custom Log Normalization"
)

Plot saved as PNG: comparison_custom_log_scaled_data.png to /content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction/models/lstm/results/figures/data


In [51]:
def plot_comparison_datetime_bokeh(split_first_data, split_last_data, save_path, file_name, col_name="Close",
                                  title="Comparison of Split First vs Split Last"):
    """Plot train/val/test of two splitting strategies side by side.

    An interactive two-panel Bokeh layout is shown in the notebook and a
    static Matplotlib version of the same two panels is saved as an image.
    The x axis is a running row counter (``plot_index``) that continues from
    train through validation to test.

    Parameters
    ----------
    split_first_data, split_last_data : tuple of three pandas.DataFrame
        ``(train, val, test)`` for the left and right panel. The frames' index
        is expected to hold the dates shown in the hover tooltip.
    save_path : str
        Directory for the saved image; created if missing.
    file_name : str
        File name of the saved image.
    col_name : str, default "Close"
        Column drawn on the y axis.
    title : str
        Heading of the layout and of the saved figure.
    """
    reset_output()
    output_notebook()

    split_labels = ("Train", "Validation", "Test")
    colors = Category10[3]

    def _prepare(splits):
        """Move each split's index into a column and add a running ``plot_index``."""
        train, val, test = splits  # exactly three splits are expected
        frames, offset = [], 0
        for split in (train, val, test):
            frame = split.copy().reset_index()
            frame['plot_index'] = range(offset, offset + len(frame))
            offset += len(frame)
            frames.append(frame)
        return frames

    def _bokeh_panel(frames, panel_title):
        """Build one interactive panel with a line per split."""
        # reset_index() puts the former index first; it is named after the
        # index (e.g. 'Date') and only falls back to 'index' when unnamed, so
        # the hover field has to be looked up instead of hard-coded.
        date_field = f"@{{{frames[0].columns[0]}}}"
        value_field = f"@{{{col_name}}}"

        panel = figure(width=700, height=500, title=panel_title)
        panel.title.text_font_size = '12pt'

        for frame, label, color in zip(frames, split_labels, colors):
            panel.line('plot_index', col_name, line_color=color, line_width=2,
                       source=ColumnDataSource(frame), legend_label=label)

        panel.xaxis.axis_label = 'Index'
        panel.yaxis.axis_label = col_name
        panel.xaxis.axis_label_text_font_size = '10pt'
        panel.yaxis.axis_label_text_font_size = '10pt'

        hover = HoverTool(tooltips=[('Index', '@plot_index'),
                                    ('Date', f'{date_field}{{%F}}'),
                                    (col_name, f'{value_field}{{0.0000}}')],
                          formatters={date_field: 'datetime'})
        panel.add_tools(hover)
        panel.add_tools(PanTool(), BoxZoomTool(), ResetTool(), SaveTool())

        panel.legend.click_policy = "hide"
        panel.legend.location = "top_left"
        panel.legend.label_text_font_size = '8pt'
        return panel

    def _matplotlib_panel(ax, frames, panel_title):
        """Draw the static counterpart of one panel onto ``ax``."""
        for frame, label in zip(frames, split_labels):
            ax.plot(frame['plot_index'], frame[col_name], label=label, linewidth=2)
        ax.set_title(panel_title, fontsize=12)
        ax.set_xlabel("Index", fontsize=10)
        ax.set_ylabel(col_name, fontsize=10)
        ax.legend(fontsize=8)
        ax.grid(True, alpha=0.3)

    first_frames = _prepare(split_first_data)
    last_frames = _prepare(split_last_data)

    # Create layout with title
    header = Div(text=f"<h2>{title}</h2>", width=1400)
    layout = column(header, row(_bokeh_panel(first_frames, "Split First"),
                                _bokeh_panel(last_frames, "Split Last")))

    # Save as PNG using Matplotlib
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
    _matplotlib_panel(ax1, first_frames, "Split First")
    _matplotlib_panel(ax2, last_frames, "Split Last")
    fig.suptitle(title, fontsize=14)

    # Create directory if it doesn't exist
    os.makedirs(save_path, exist_ok=True)
    file_path = os.path.join(save_path, file_name)

    fig.tight_layout()
    fig.savefig(file_path, dpi=300, bbox_inches="tight")
    # Close this specific figure so it is not rendered again inline.
    plt.close(fig)

    print(f"Plot saved as PNG: {file_name} to {save_path}")

    show(layout)

In [52]:
def plot_comparison_bokeh(split_first_data, split_last_data, save_path, file_name, column_name,
                         title="Comparison of Split First vs Split Last"):
    """Plot train/val/test of two splitting strategies side by side by row position.

    Each split's own index is discarded and replaced by a running ``Index``
    column that continues from train through validation to test. An
    interactive two-panel Bokeh layout is shown in the notebook and a static
    Matplotlib version of the same panels is saved as an image.

    Parameters
    ----------
    split_first_data, split_last_data : tuple of three pandas.DataFrame
        ``(train, val, test)`` for the left and right panel.
    save_path : str
        Directory for the saved image; created if missing.
    file_name : str
        File name of the saved image.
    column_name : str
        Column drawn on the y axis.
    title : str
        Heading of the layout and of the saved figure.
    """
    reset_output()
    output_notebook()

    palette = Category10[3]
    split_labels = ('Train', 'Validation', 'Test')

    def _prepare(splits):
        """Drop each split's index and add the running ``Index`` column."""
        train, val, test = splits
        frames, offset = [], 0
        for split in (train, val, test):
            frame = split.copy().reset_index(drop=True)
            frame["Index"] = frame.index + offset
            offset += len(frame)
            frames.append(frame)
        return frames

    def _bokeh_panel(frames, panel_title):
        """Build one interactive panel with a line per split."""
        panel = figure(width=700, height=400, title=panel_title)
        panel.title.text_font_size = '12pt'

        sources = [ColumnDataSource(frame) for frame in frames]
        for source, label, color in zip(sources, split_labels, palette):
            panel.line('Index', column_name, line_color=color, line_width=2,
                       source=source, legend_label=label)

        panel.xaxis.axis_label = 'Index'
        panel.yaxis.axis_label = column_name
        panel.xaxis.axis_label_text_font_size = '10pt'
        panel.yaxis.axis_label_text_font_size = '10pt'

        panel.add_tools(
            HoverTool(tooltips=[('Index', '@Index'), (column_name, f'@{column_name}{{0.0000}}')])
        )

        panel.legend.click_policy = "hide"
        panel.legend.location = "top_left"
        panel.legend.label_text_font_size = '8pt'
        return panel

    def _matplotlib_panel(ax, frames, panel_title):
        """Draw the static counterpart of one panel onto ``ax``."""
        for frame, label in zip(frames, ("Train", "Validation", "Test")):
            ax.plot(frame["Index"], frame[column_name], label=label, linewidth=2)
        ax.set_title(panel_title, fontsize=12)
        ax.set_xlabel("Index", fontsize=10)
        ax.set_ylabel(column_name, fontsize=10)
        ax.legend(fontsize=8)
        ax.grid(True, alpha=0.3)

    first_frames = _prepare(split_first_data)
    last_frames = _prepare(split_last_data)

    # Interactive layout: heading above the two panels.
    p1 = _bokeh_panel(first_frames, "Split First")
    p2 = _bokeh_panel(last_frames, "Split Last")
    header = Div(text=f"<h2>{title}</h2>", width=1400)
    layout = column(header, row(p1, p2))

    # Static copy of the same panels, written with Matplotlib.
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
    _matplotlib_panel(ax1, first_frames, "Split First")
    _matplotlib_panel(ax2, last_frames, "Split Last")
    fig.suptitle(title, fontsize=14)

    # Make sure the output directory exists before saving.
    os.makedirs(save_path, exist_ok=True)
    file_path = os.path.join(save_path, file_name)

    plt.tight_layout()
    plt.savefig(file_path, dpi=300, bbox_inches="tight")
    plt.close()

    print(f"Plot saved as PNG: {file_name} to {save_path}")

    show(layout)