In [3]:
# System and Configuration
import sys
import os
sys.path.insert(0, '/content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction')
from config import *

# Data Handling and Processing
import pandas as pd
import numpy as np
import yfinance as yf
import joblib

# Data Preprocessing
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Visualization with Bokeh
from bokeh.plotting import figure, show, output_notebook, reset_output, output_file
from bokeh.io import export_png
from bokeh.models import ColumnDataSource, HoverTool, DatetimeTickFormatter, Div
from bokeh.palettes import Category10
from bokeh.layouts import row, column
from bokeh.models import PanTool, BoxZoomTool, ResetTool, SaveTool

# Visualization with Matplotlib
import matplotlib.pyplot as plt

# Enable Bokeh output in notebooks
output_notebook()

### **Get historical data for S&P 500**

In [4]:
def fetch_stock_data(symbol,end, save_path):
   df = yf.Ticker(symbol).history(period='max', end=end)
   df.to_csv(f"{save_path}/{symbol}_data.csv")
   print(f"Data saved to {save_path}/{symbol}_data.csv")
   print(f"Date range: {df.index[0]} to {df.index[-1]}")
   print(f"Shape: {df.shape}")
   return df

In [6]:
SYMBOL_NAME = '^GSPC'
END_DATE = '2024-12-31'
raw_data = fetch_stock_data(SYMBOL_NAME,END_DATE,RAW_DATA_PATH)

Data saved to /content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction/models/lstm/data/raw/^GSPC_data.csv
Date range: 1927-12-30 00:00:00-05:00 to 2024-12-30 00:00:00-05:00
Shape: (24366, 7)


### **Load saved raw data**

In [7]:
def plot_timeseries(df, title='S&P 500 Historical Prices', x_col='Date', y_col='Close'):
    # Convert index to datetime if not already
    if not isinstance(df.index, pd.DatetimeIndex):
        df.index = pd.to_datetime(df.index)

    # Create ColumnDataSource with formatted date for tooltip
    source = ColumnDataSource(data={
        'date': df.index,
        'value': df[y_col],
        'formatted_date': df.index.strftime('%Y-%m-%d')
    })

    # Create figure
    p = figure(
        width=800, height=400,
        x_axis_type='datetime',
        title=title,
        tools="pan,wheel_zoom,box_zoom,reset,save"
    )

    # Add line
    p.line('date', 'value', source=source, line_width=2, color=Category10[3][0])

    # Configure hover tool with date and price
    hover = HoverTool(
        tooltips=[
            ('Date', '@formatted_date'),
            (y_col, '@value{0,0.00}')  # Format with 2 decimal places
        ],
        mode='vline'  # Show tooltip for all points along a vertical line
    )
    p.add_tools(hover)

    # Format axes
    p.xaxis.formatter = DatetimeTickFormatter(
        days=["%d %b %Y"],
        months=["%b %Y"],
        years=["%Y"]
    )

    # Labels and grid
    p.xaxis.axis_label = x_col
    p.yaxis.axis_label = y_col
    p.grid.grid_line_alpha = 0.3

    show(p)

    return p  # Return the plot for potential further customization

In [65]:
raw_data = pd.read_csv(f"{RAW_DATA_PATH}/{SYMBOL_NAME}_data.csv", index_col='Date', parse_dates=True)
print(f"Loaded data shape: {raw_data.shape}\nDate range: {raw_data.index[0]} to {raw_data.index[-1]}")
print(raw_data.tail().to_markdown())

df = raw_data[['Close']].copy()
df['Target'] = df['Close'].shift(-1)
df.dropna(inplace=True)

Loaded data shape: (24366, 7)
Date range: 1927-12-30 00:00:00-05:00 to 2024-12-30 00:00:00-05:00
| Date                      |    Open |    High |     Low |   Close |      Volume |   Dividends |   Stock Splits |
|:--------------------------|--------:|--------:|--------:|--------:|------------:|------------:|---------------:|
| 2024-12-23 00:00:00-05:00 | 5940.25 | 5978.25 | 5902.57 | 5974.07 | 3.59328e+09 |           0 |              0 |
| 2024-12-24 00:00:00-05:00 | 5984.63 | 6040.1  | 5981.44 | 6040.04 | 1.75772e+09 |           0 |              0 |
| 2024-12-26 00:00:00-05:00 | 6024.97 | 6049.75 | 6007.37 | 6037.59 | 2.90453e+09 |           0 |              0 |
| 2024-12-27 00:00:00-05:00 | 6006.17 | 6006.17 | 5932.95 | 5970.84 | 3.15961e+09 |           0 |              0 |
| 2024-12-30 00:00:00-05:00 | 5920.67 | 5940.79 | 5869.16 | 5906.94 | 3.43325e+09 |           0 |              0 |


## **Check data types**

In [66]:
raw_data.dtypes

Unnamed: 0,0
Open,float64
High,float64
Low,float64
Close,float64
Volume,int64
Dividends,float64
Stock Splits,float64


##**S&P500 data chart (1928-2024)**

In [67]:
plot_timeseries(df)

## **Apply log scale on DataDrame**

In [69]:
def apply_log_scale(df):
    df_copy = df.copy()
    for col in df.columns:
        if col in df_copy.columns:
            df_copy[col] = np.log(df_copy[col].replace(0, np.nan))
    return df_copy

In [70]:
df_log = apply_log_scale(df)

## **S&P500 log scale data chart (1928-2024)**

In [71]:
plot_timeseries(df_log, title='S&P 500 Historical Prices (Log Scale)')

##**Function to split data into train,validation and test**

In [15]:
def split_data(df, train_size=0.7, val_size=0.15):
    n = len(df)
    train_end = int(n * train_size)
    val_end = int(n * (train_size + val_size))

    train = df[:train_end]
    val = df[train_end:val_end]
    test = df[val_end:]

    return train, val, test

## **Simple MinMaxScaler: first split the data then scale**

In [107]:
def minmax_split_first(df, feature_cols, target_col, save_path, file_name):
    # Split data into train, validation, and test sets
    train_data, val_data, test_data = split_data(df)

    # Create scalers
    features_scaler = MinMaxScaler()
    target_scaler = MinMaxScaler()

    # Fit scalers on training data only
    features_scaler.fit(train_data[feature_cols])
    target_scaler.fit(train_data[target_col])

    # Scale features - transform returns numpy arrays
    train_scaled_features_array = features_scaler.transform(train_data[feature_cols])
    val_scaled_features_array = features_scaler.transform(val_data[feature_cols])
    test_scaled_features_array = features_scaler.transform(test_data[feature_cols])

    # Scale targets - transform returns numpy arrays
    train_scaled_target_array = target_scaler.transform(train_data[target_col])
    val_scaled_target_array = target_scaler.transform(val_data[target_col])
    test_scaled_target_array = target_scaler.transform(test_data[target_col])

    # Convert arrays back to DataFrames to preserve column names and indexes
    train_scaled_features = pd.DataFrame(
        train_scaled_features_array,
        index=train_data.index,
        columns=feature_cols
    )
    val_scaled_features = pd.DataFrame(
        val_scaled_features_array,
        index=val_data.index,
        columns=feature_cols
    )
    test_scaled_features = pd.DataFrame(
        test_scaled_features_array,
        index=test_data.index,
        columns=feature_cols
    )

    # Convert target arrays back to DataFrames
    train_scaled_target = pd.DataFrame(
        train_scaled_target_array,
        index=train_data.index,
        columns=target_col
    )
    val_scaled_target = pd.DataFrame(
        val_scaled_target_array,
        index=val_data.index,
        columns=target_col
    )
    test_scaled_target = pd.DataFrame(
        test_scaled_target_array,
        index=test_data.index,
        columns=target_col,
    )

    # Create directories if they don't exist
    os.makedirs(f"{save_path}/minmax_split_first", exist_ok=True)

    train_scaled = pd.concat([train_scaled_features, train_scaled_target], axis=1)
    val_scaled = pd.concat([val_scaled_features, val_scaled_target], axis=1)
    test_scaled = pd.concat([test_scaled_features, test_scaled_target], axis=1)

    train_scaled.to_csv(f"{save_path}/minmax_split_first/{file_name}_train.csv")
    val_scaled.to_csv(f"{save_path}/minmax_split_first/{file_name}_val.csv")
    test_scaled.to_csv(f"{save_path}/minmax_split_first/{file_name}_test.csv")

    # Save scalers using joblib
    joblib.dump(features_scaler, f"{save_path}/minmax_split_first/{file_name}_features_scaler.joblib")
    joblib.dump(target_scaler, f"{save_path}/minmax_split_first/{file_name}_target_scaler.joblib")

    print(f"Data successfully scaled and saved to {save_path}/minmax_split_first/")
    print(f"Scalers saved to {save_path}/")

    return train_scaled, val_scaled, test_scaled, features_scaler, target_scaler

In [108]:
def minmax_split_last(df, feature_cols, target_col, save_path,file_name):
    df = df.copy()

    # Create directories if they don't exist
    os.makedirs(f"{save_path}/minmax_split_last", exist_ok=True)

    # Create scalers
    features_scaler = MinMaxScaler()
    target_scaler = MinMaxScaler()

    # Fit scalers on entire dataset
    features_scaler.fit(df[feature_cols])
    target_scaler.fit(df[target_col])

    # Scale features
    scaled_features = features_scaler.transform(df[feature_cols])
    df_scaled_features = pd.DataFrame(scaled_features, index=df.index, columns=feature_cols)

    # Scale target
    scaled_target = target_scaler.transform(df[target_col])
    df_scaled_target = pd.DataFrame(scaled_target, index=df.index, columns=target_col)

    # Combine features and target
    df_scaled = pd.concat([df_scaled_features, df_scaled_target], axis=1)

    # Split data after scaling
    train_data, val_data, test_data = split_data(df_scaled)

    train_data.to_csv(f"{save_path}/minmax_split_last/{file_name}_train.csv")
    val_data.to_csv(f"{save_path}/minmax_split_last/{file_name}_val.csv")
    test_data.to_csv(f"{save_path}/minmax_split_last/{file_name}_test.csv")

    # Save scalers using joblib
    joblib.dump(features_scaler, f"{save_path}/minmax_split_last/{file_name}_features_scaler.joblib")
    joblib.dump(target_scaler, f"{save_path}/minmax_split_last/{file_name}_target_scaler.joblib")

    print(f"Data successfully scaled and saved to {save_path}/minmax_split_last/")
    print(f"Scalers saved to {save_path}/")

    return train_data, val_data, test_data, features_scaler, target_scaler


In [111]:
train_minmax_first, val_minmax_first, test_minmax_first, features_scaler, target_scaler = minmax_split_first(df,['Close'],['Target'],PROCESSED_DATA_PATH,'normal_data')

Data successfully scaled and saved to /content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction/models/lstm/data/processed/minmax_split_first/
Scalers saved to /content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction/models/lstm/data/processed/


In [110]:
train_minmax_last, val_minmax_last, test_minmax_last, features_scaler_last, target_scaler_last = minmax_split_last(df, ['Close'], ['Target'], PROCESSED_DATA_PATH,'normal_data')

Data successfully scaled and saved to /content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction/models/lstm/data/processed/minmax_split_last/
Scalers saved to /content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction/models/lstm/data/processed/


In [112]:
plot_comparison_datetime_bokeh(
    split_first_data=(train_minmax_first, val_minmax_first, test_minmax_first),
    split_last_data=(train_minmax_last, val_minmax_last, test_minmax_last),
    save_path=f"{FIGURES_PATH}/data",
    file_name="comparison_minmax_scaled_data.png",
    title="Comparison of fitting MinMaxScaler on train vs on the entire dataset"
)

Output hidden; open in https://colab.research.google.com to view.

In [113]:
train_log_minmax_first, val_log_minmax_first, test_log_minmax_first, features_scaler_log_first, target_scaler_log_first = minmax_split_first(df_log, ['Close'], ['Target'], PROCESSED_DATA_PATH,'log_data')

Data successfully scaled and saved to /content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction/models/lstm/data/processed/minmax_split_first/
Scalers saved to /content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction/models/lstm/data/processed/


In [114]:
train_log_minmax_last, val_log_minmax_last, test_log_minmax_last, features_scaler_log_last, target_scaler_log_last = minmax_split_last(df_log, ['Close'], ['Target'], PROCESSED_DATA_PATH,'log_data')

Data successfully scaled and saved to /content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction/models/lstm/data/processed/minmax_split_last/
Scalers saved to /content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction/models/lstm/data/processed/


In [115]:
plot_comparison_datetime_bokeh(
    split_first_data=(train_log_minmax_first, val_log_minmax_first, test_log_minmax_first),
    split_last_data=(train_log_minmax_last, val_log_minmax_last, test_log_minmax_last),
    save_path=f"{FIGURES_PATH}/data",
    file_name="comparison_log_minmax_scaled_data.png",
    title="Comparison of Split First vs Split Last Normalization using MinMaxScaler on Log Scaled Data"
)

Output hidden; open in https://colab.research.google.com to view.

In [None]:
def custom_split_first(df, split_fn, feature_columns, scaler_path, save_path, file_prefix, feature_range=(0, 1)):
    # Subtract the first value to normalize the trend
    df = subtract_first_value(df)

    # Normalize the data
    df_norm = normalize(df)

    df_norm['Target'] = df_norm['Custom_Normalized'].shift(-1)

    df_norm.dropna(inplace=True)


    # Split the data into training, validation, and test sets
    train, val, test = split_fn(df_norm)



    #     # Create sequences for each dataset
    (X_train, y_train), (X_val, y_val), (X_test, y_test) = map(
    lambda split: create_sequences(split, seq_length=21, feature_cols=feature_columns, target_col='Target'),
    [train, val, test]
    )

    print(X_test[:5])


    # # print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
    # # print(f"X_val shape: {X_val.shape}, y_val shape: {y_val.shape}")
    # # print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")


    # # Initialize scalers
    # scaler_X = MinMaxScaler(feature_range=feature_range)
    # scaler_y = MinMaxScaler(feature_range=feature_range)

    # # Reshape the input for the scaler
    # X_train_reshaped = X_train.reshape(-1, X_train.shape[-1])  # Reshape to (samples, features)

    # # Fit the scalers on training data only to avoid data leakage
    # scaler_X.fit(X_train_reshaped)
    # scaler_y.fit(y_train.reshape(-1, 1))

    # # Scale the data
    # X_train_scaled = scaler_X.transform(X_train_reshaped).reshape(X_train.shape)
    # X_val_scaled = scaler_X.transform(X_val.reshape(-1, X_val.shape[-1])).reshape(X_val.shape)
    # X_test_scaled = scaler_X.transform(X_test.reshape(-1, X_test.shape[-1])).reshape(X_test.shape)

    # y_train_scaled = scaler_y.transform(y_train.reshape(-1, 1)).flatten()
    # y_val_scaled = scaler_y.transform(y_val.reshape(-1, 1)).flatten()
    # y_test_scaled = scaler_y.transform(y_test.reshape(-1, 1)).flatten()

    # print(f"X_train_scaled shape: {X_train_scaled.shape}, y_train_scaled shape: {y_train_scaled.shape}")


    #  # Save the scaled data
    # os.makedirs(save_path, exist_ok=True)
    # np.save(f"{save_path}/{file_prefix}_X_train.npy", X_train_scaled)
    # np.save(f"{save_path}/{file_prefix}_y_train.npy", y_train_scaled)
    # np.save(f"{save_path}/{file_prefix}_X_val.npy", X_val_scaled)
    # np.save(f"{save_path}/{file_prefix}_y_val.npy", y_val_scaled)
    # np.save(f"{save_path}/{file_prefix}_X_test.npy", X_test_scaled)
    # np.save(f"{save_path}/{file_prefix}_y_test.npy", y_test_scaled)

    # # Save the scalers for later use
    # os.makedirs(scaler_path, exist_ok=True)
    # joblib.dump(scaler_X, f"{scaler_path}/{file_prefix}_scaler_X.joblib")
    # joblib.dump(scaler_y, f"{scaler_path}/{file_prefix}_scaler_y.joblib")

    # # Return the scaled datasets and scalers
    # return (X_train_scaled, y_train_scaled), (X_val_scaled, y_val_scaled), (X_test_scaled, y_test_scaled), (scaler_X, scaler_y)


def custom_split_last(df, split_fn, feature_columns, scaler_path, save_path, file_prefix, feature_range=(0, 1)):
    df = subtract_first_value(df)
    df_norm = normalize(df)

    scaler = MinMaxScaler(feature_range=feature_range)
    scaler.fit(df_norm[['Custom_Normalized']])

    df_norm['Scaled'] = scaler.transform(df_norm[['Custom_Normalized']])
    df_norm['Target'] = df_norm['Scaled'].shift(-1)
    df_norm.dropna(inplace=True)

    train_norm, val_norm, test_norm = split_fn(df_norm)

    os.makedirs(save_path, exist_ok=True)
    train_norm.to_csv(f"{save_path}/train/{file_prefix}_train.csv")
    val_norm.to_csv(f"{save_path}/val/{file_prefix}_val.csv")
    test_norm.to_csv(f"{save_path}/test/{file_prefix}_test.csv")
    joblib.dump(scaler, f"{scaler_path}/{file_prefix}_scaler.joblib")

    return train_norm, val_norm, test_norm, scaler

def subtract_first_value(df, column_name='Close'):
    df = df.copy()
    first_value = df[column_name].iloc[0]
    df[column_name] = df[column_name] - first_value
    return df

def custom_normalize(data,last_value, index, n):
    A_prime = data
    numerator = ((A_prime - (last_value / n) * index) * index)  # use abs ?
    denominator = np.sqrt(index**2 + ((last_value / n) * index)**2)
    return 0 if denominator == 0 else numerator / denominator

def normalize(df, column_name='Close'):
    df = df.copy().reset_index(drop=True)
    n = len(df) - 1
    last_value = df[column_name].iloc[n]

    df['Custom_Normalized'] = [
        custom_normalize(row[column_name],last_value, i, n)
        for i, row in df.iterrows()
    ]
    return df

In [None]:
train_custom_first, val_custom_first, test_custom_first, scaler_custom_first = custom_split_first(
    df=df,
    split_fn=split_data,
    feature_columns=['Close','Custom_Normalized'],  # Corrected to match the 'Close' column used in create_sequences
    scaler_path=SCALERS_PATH,
    save_path=PROCESSED_DATA_PATH,
    file_prefix='custom_split_first',
    feature_range=(0, 1)  # Optional, ensures scaling between 0 and 1
)

[[[ 2100.03   -3110.6233]
  [ 2091.26   -3119.3828]
  [ 2097.1    -3113.9412]
  [ 2089.1902 -3121.8645]
  [ 2067.85   -3142.8423]
  [ 2090.6301 -3120.9348]
  [ 2096.83   -3115.1433]
  [ 2071.8    -3139.7078]
  [ 2062.49   -3148.9922]
  [ 2070.34   -3141.5967]
  [ 2098.4402 -3114.518 ]
  [ 2087.6702 -3125.2217]
  [ 2081.4602 -3131.4927]
  [ 2080.82   -3132.3499]
  [ 2103.4402 -3110.598 ]
  [ 2105.07   -3109.2485]
  [ 2111.54   -3103.1946]
  [ 2110.1702 -3104.7612]
  [ 2108.1902 -3106.9207]
  [ 2113.1602 -3102.3247]
  [ 2108.4001 -3107.1865]]

 [[ 2091.26   -3119.3828]
  [ 2097.1    -3113.9412]
  [ 2089.1902 -3121.8645]
  [ 2067.85   -3142.8423]
  [ 2090.6301 -3120.9348]
  [ 2096.83   -3115.1433]
  [ 2071.8    -3139.7078]
  [ 2062.49   -3148.9922]
  [ 2070.34   -3141.5967]
  [ 2098.4402 -3114.518 ]
  [ 2087.6702 -3125.2217]
  [ 2081.4602 -3131.4927]
  [ 2080.82   -3132.3499]
  [ 2103.4402 -3110.598 ]
  [ 2105.07   -3109.2485]
  [ 2111.54   -3103.1946]
  [ 2110.1702 -3104.7612]
  [ 2108.1

TypeError: cannot unpack non-iterable NoneType object

In [None]:
train_custom_last, val_custom_last, test_custom_last,scaler_custom_last = custom_split_last(df, split_data, ['Scaled'], SCALERS_PATH, PROCESSED_DATA_PATH, 'custom_split_last')

In [None]:
plot_comparison_bokeh(
    split_first_data=(train_custom_first, val_custom_first, test_custom_first),
    split_last_data=(train_custom_last, val_custom_last, test_custom_last),
    save_path=f"{FIGURES_PATH}/data",
    file_name="comparison_custom_scaled_data.png",
    column_name="Scaled",
    title="Comparison of Split First vs Split Last Custom Normalization"
)

Output hidden; open in https://colab.research.google.com to view.

In [None]:
train_custom_first.tail()

Unnamed: 0,Close,Custom_Normalized,Scaled,Target
19486,1211.470005,242.293012,0.64058,0.644805
19487,1220.150059,243.974673,0.644805,0.640961
19488,1212.730015,242.444966,0.640961,0.64258
19489,1216.209995,243.089092,0.64258,0.635177
19490,1201.679966,240.142742,0.635177,0.635504


In [None]:
train_custom_log_first, val_custom_log_first, test_custom_log_first, scaler_custom_log_first = custom_split_first(
    df_log, split_data, ['Scaled'], SCALERS_PATH, PROCESSED_DATA_PATH, 'custom_split_first_log'
)

In [None]:
train_custom_log_last, val_custom_log_last, test_custom_log_last, scaler_custom_log_last = custom_split_last(
    df_log, split_data, ['Scaled'], SCALERS_PATH, PROCESSED_DATA_PATH, 'custom_split_last_log'
)

In [None]:
plot_comparison_bokeh(
    split_first_data=(train_custom_log_first, val_custom_log_first, test_custom_log_first),
    split_last_data=(train_custom_log_last, val_custom_log_last, test_custom_log_last),
    save_path=f"{FIGURES_PATH}/data",
    file_name="comparison_custom_log_scaled_data.png",
    column_name="Scaled",
    title="Comparison of Split First vs Split Last Custom Log Normalization"
)

Output hidden; open in https://colab.research.google.com to view.

In [62]:
def plot_comparison_datetime_bokeh(split_first_data, split_last_data, save_path, file_name, col_name="Close",
                                  title="Comparison of Split First vs Split Last"):
    reset_output()
    output_notebook()

    # Unpack the data tuples
    train_first, val_first, test_first = split_first_data
    train_last, val_last, test_last = split_last_data

    # Make continuous indices for split_first
    train_first = train_first.copy().reset_index()
    val_first = val_first.copy().reset_index()
    test_first = test_first.copy().reset_index()

    train_first['plot_index'] = range(len(train_first))
    val_first['plot_index'] = range(len(train_first), len(train_first) + len(val_first))
    test_first['plot_index'] = range(len(train_first) + len(val_first),
                                    len(train_first) + len(val_first) + len(test_first))

    # Make continuous indices for split_last
    train_last = train_last.copy().reset_index()
    val_last = val_last.copy().reset_index()
    test_last = test_last.copy().reset_index()

    train_last['plot_index'] = range(len(train_last))
    val_last['plot_index'] = range(len(train_last), len(train_last) + len(val_last))
    test_last['plot_index'] = range(len(train_last) + len(val_last),
                                   len(train_last) + len(val_last) + len(test_last))

    # Create sources for split first
    train_first_source = ColumnDataSource(train_first)
    val_first_source = ColumnDataSource(val_first)
    test_first_source = ColumnDataSource(test_first)

    # Create sources for split last
    train_last_source = ColumnDataSource(train_last)
    val_last_source = ColumnDataSource(val_last)
    test_last_source = ColumnDataSource(test_last)

    # Create first figure (Split First)
    p1 = figure(width=700, height=500, title="Split First")
    p1.title.text_font_size = '12pt'

    colors = Category10[3]
    p1.line('plot_index', col_name, line_color=colors[0], line_width=2, source=train_first_source, legend_label='Train')
    p1.line('plot_index', col_name, line_color=colors[1], line_width=2, source=val_first_source, legend_label='Validation')
    p1.line('plot_index', col_name, line_color=colors[2], line_width=2, source=test_first_source, legend_label='Test')

    p1.xaxis.axis_label = 'Index'
    p1.yaxis.axis_label = col_name
    p1.xaxis.axis_label_text_font_size = '10pt'
    p1.yaxis.axis_label_text_font_size = '10pt'

    hover1 = HoverTool(tooltips=[('Index', '@plot_index'),
                                ('Date', '@index{%F}'),
                                (col_name, f'@{col_name}{{0.0000}}')],
                      formatters={'@index': 'datetime'})
    p1.add_tools(hover1)
    p1.add_tools(PanTool(), BoxZoomTool(), ResetTool(), SaveTool())

    p1.legend.click_policy = "hide"
    p1.legend.location = "top_left"
    p1.legend.label_text_font_size = '8pt'

    # Create second figure (Split Last)
    p2 = figure(width=700, height=500, title="Split Last" )
    p2.title.text_font_size = '12pt'

    p2.line('plot_index', col_name, line_color=colors[0], line_width=2, source=train_last_source, legend_label='Train')
    p2.line('plot_index', col_name, line_color=colors[1], line_width=2, source=val_last_source, legend_label='Validation')
    p2.line('plot_index', col_name, line_color=colors[2], line_width=2, source=test_last_source, legend_label='Test')

    p2.xaxis.axis_label = 'Index'
    p2.yaxis.axis_label = col_name
    p2.xaxis.axis_label_text_font_size = '10pt'
    p2.yaxis.axis_label_text_font_size = '10pt'

    hover2 = HoverTool(tooltips=[('Index', '@plot_index'),
                                ('Date', '@index{%F}'),
                                (col_name, f'@{col_name}{{0.0000}}')],
                      formatters={'@index': 'datetime'})
    p2.add_tools(hover2)
    p2.add_tools(PanTool(), BoxZoomTool(), ResetTool(), SaveTool())

    p2.legend.click_policy = "hide"
    p2.legend.location = "top_left"
    p2.legend.label_text_font_size = '8pt'

    # Create layout with title
    header = Div(text=f"<h2>{title}</h2>", width=1400)
    layout = column(header, row(p1, p2))

    # Save as PNG using Matplotlib
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

    # Plot Split First
    ax1.plot(train_first['plot_index'], train_first[col_name], label="Train", linewidth=2)
    ax1.plot(val_first['plot_index'], val_first[col_name], label="Validation", linewidth=2)
    ax1.plot(test_first['plot_index'], test_first[col_name], label="Test", linewidth=2)
    ax1.set_title("Split First", fontsize=12)
    ax1.set_xlabel("Index", fontsize=10)
    ax1.set_ylabel(col_name, fontsize=10)
    ax1.legend(fontsize=8)
    ax1.grid(True, alpha=0.3)

    # Plot Split Last
    ax2.plot(train_last['plot_index'], train_last[col_name], label="Train", linewidth=2)
    ax2.plot(val_last['plot_index'], val_last[col_name], label="Validation", linewidth=2)
    ax2.plot(test_last['plot_index'], test_last[col_name], label="Test", linewidth=2)
    ax2.set_title("Split Last", fontsize=12)
    ax2.set_xlabel("Index", fontsize=10)
    ax2.set_ylabel(col_name, fontsize=10)
    ax2.legend(fontsize=8)
    ax2.grid(True, alpha=0.3)

    fig.suptitle(title, fontsize=14)

    # Create directory if it doesn't exist
    os.makedirs(save_path, exist_ok=True)

    # Full file path
    file_path = os.path.join(save_path, file_name)

    plt.tight_layout()
    plt.savefig(file_path, dpi=300, bbox_inches="tight")
    plt.close()

    print(f"Plot saved as PNG: {file_name} to {save_path}")

    show(layout)

In [63]:
def plot_comparison_bokeh(split_first_data, split_last_data, save_path, file_name, column_name,
                         title="Comparison of Split First vs Split Last"):
    reset_output()
    output_notebook()

    # Unpack the data tuples
    train_first, val_first, test_first = split_first_data
    train_last, val_last, test_last = split_last_data

    # Process split first data
    train_first = train_first.copy().reset_index(drop=True)
    val_first = val_first.copy().reset_index(drop=True)
    test_first = test_first.copy().reset_index(drop=True)

    val_first["Index"] = val_first.index + len(train_first)
    test_first["Index"] = test_first.index + len(train_first) + len(val_first)
    train_first["Index"] = train_first.index

    # Process split last data
    train_last = train_last.copy().reset_index(drop=True)
    val_last = val_last.copy().reset_index(drop=True)
    test_last = test_last.copy().reset_index(drop=True)

    val_last["Index"] = val_last.index + len(train_last)
    test_last["Index"] = test_last.index + len(train_last) + len(val_last)
    train_last["Index"] = train_last.index

    # Create data sources for split first
    train_first_source = ColumnDataSource(train_first)
    val_first_source = ColumnDataSource(val_first)
    test_first_source = ColumnDataSource(test_first)

    # Create data sources for split last
    train_last_source = ColumnDataSource(train_last)
    val_last_source = ColumnDataSource(val_last)
    test_last_source = ColumnDataSource(test_last)

    # Create first figure (Split First)
    p1 = figure(width=700, height=400, title="Split First")
    p1.title.text_font_size = '12pt'

    p1.line('Index', column_name, line_color=Category10[3][0], line_width=2, source=train_first_source, legend_label='Train')
    p1.line('Index', column_name, line_color=Category10[3][1], line_width=2, source=val_first_source, legend_label='Validation')
    p1.line('Index', column_name, line_color=Category10[3][2], line_width=2, source=test_first_source, legend_label='Test')

    p1.xaxis.axis_label = 'Index'
    p1.yaxis.axis_label = column_name
    p1.xaxis.axis_label_text_font_size = '10pt'
    p1.yaxis.axis_label_text_font_size = '10pt'

    hover1 = HoverTool(tooltips=[('Index', '@Index'), (column_name, f'@{column_name}{{0.0000}}')])
    p1.add_tools(hover1)

    p1.legend.click_policy = "hide"
    p1.legend.location = "top_left"
    p1.legend.label_text_font_size = '8pt'

    # Create second figure (Split Last)
    p2 = figure(width=700, height=400, title="Split Last")
    p2.title.text_font_size = '12pt'

    p2.line('Index', column_name, line_color=Category10[3][0], line_width=2, source=train_last_source, legend_label='Train')
    p2.line('Index', column_name, line_color=Category10[3][1], line_width=2, source=val_last_source, legend_label='Validation')
    p2.line('Index', column_name, line_color=Category10[3][2], line_width=2, source=test_last_source, legend_label='Test')

    p2.xaxis.axis_label = 'Index'
    p2.yaxis.axis_label = column_name
    p2.xaxis.axis_label_text_font_size = '10pt'
    p2.yaxis.axis_label_text_font_size = '10pt'

    hover2 = HoverTool(tooltips=[('Index', '@Index'), (column_name, f'@{column_name}{{0.0000}}')])
    p2.add_tools(hover2)

    p2.legend.click_policy = "hide"
    p2.legend.location = "top_left"
    p2.legend.label_text_font_size = '8pt'

    # Create layout with title
    header = Div(text=f"<h2>{title}</h2>", width=1400)
    layout = column(header, row(p1, p2))

    # Save as PNG using Matplotlib
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

    # Plot Split First
    ax1.plot(train_first["Index"], train_first[column_name], label="Train", linewidth=2)
    ax1.plot(val_first["Index"], val_first[column_name], label="Validation", linewidth=2)
    ax1.plot(test_first["Index"], test_first[column_name], label="Test", linewidth=2)
    ax1.set_title("Split First", fontsize=12)
    ax1.set_xlabel("Index", fontsize=10)
    ax1.set_ylabel(column_name, fontsize=10)
    ax1.legend(fontsize=8)
    ax1.grid(True, alpha=0.3)

    # Plot Split Last
    ax2.plot(train_last["Index"], train_last[column_name], label="Train", linewidth=2)
    ax2.plot(val_last["Index"], val_last[column_name], label="Validation", linewidth=2)
    ax2.plot(test_last["Index"], test_last[column_name], label="Test", linewidth=2)
    ax2.set_title("Split Last", fontsize=12)
    ax2.set_xlabel("Index", fontsize=10)
    ax2.set_ylabel(column_name, fontsize=10)
    ax2.legend(fontsize=8)
    ax2.grid(True, alpha=0.3)

    fig.suptitle(title, fontsize=14)

    # Create directory if it doesn't exist
    os.makedirs(save_path, exist_ok=True)

    # Full file path
    file_path = os.path.join(save_path, file_name)

    plt.tight_layout()
    plt.savefig(file_path, dpi=300, bbox_inches="tight")
    plt.close()

    print(f"Plot saved as PNG: {file_name} to {save_path}")

    show(layout)

In [None]:
train_log_minmax_first

Unnamed: 0_level_0,Close,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
1927-12-30 00:00:00-05:00,0.237565,0.238530
1928-01-03 00:00:00-05:00,0.238530,0.238145
1928-01-04 00:00:00-05:00,0.238145,0.236497
1928-01-05 00:00:00-05:00,0.236497,0.237565
1928-01-06 00:00:00-05:00,0.237565,0.236009
...,...,...
2005-08-10 00:00:00-04:00,0.962853,0.964056
2005-08-11 00:00:00-04:00,0.964056,0.963028
2005-08-12 00:00:00-04:00,0.963028,0.963511
2005-08-15 00:00:00-04:00,0.963511,0.961486


In [None]:
def plot_custom_normalized_bokeh_continuous(train_scaled, val_scaled, test_scaled,save_path,file_name, column_name, title="Custom Normalized Data"):
    reset_output()
    output_notebook()

    # Ensure train, val, and test have continuous indices
    train_scaled = train_scaled.copy().reset_index(drop=True)
    val_scaled = val_scaled.copy().reset_index(drop=True)
    test_scaled = test_scaled.copy().reset_index(drop=True)

    # Adjust validation index to start after train
    val_scaled["Index"] = val_scaled.index + len(train_scaled)

    # Adjust test index to start after validation (train + validation)
    test_scaled["Index"] = test_scaled.index + len(train_scaled) + len(val_scaled)

    train_scaled["Index"] = train_scaled.index

    # Create data sources
    train_source = ColumnDataSource(train_scaled)
    val_source = ColumnDataSource(val_scaled)
    test_source = ColumnDataSource(test_scaled)

    # Create Bokeh figure
    p = figure(width=1000, height=500, title=title)
    p.title.text_font_size = '14pt'

    p.line('Index', column_name, line_color=Category10[3][0], line_width=2, source=train_source, legend_label='Train')
    p.line('Index', column_name, line_color=Category10[3][1], line_width=2, source=val_source, legend_label='Validation')
    p.line('Index', column_name, line_color=Category10[3][2], line_width=2, source=test_source, legend_label='Test')

    p.xaxis.axis_label = 'Index'
    p.yaxis.axis_label = column_name
    p.xaxis.axis_label_text_font_size = '12pt'
    p.yaxis.axis_label_text_font_size = '12pt'

    hover = HoverTool(tooltips=[('Index', '@Index'), (column_name, f'@{column_name}{{0.0000}}')])
    p.add_tools(hover)

    p.legend.click_policy = "hide"
    p.legend.location = "top_left"
    p.legend.label_text_font_size = '10pt'

    # Save as PNG using Matplotlib
    plt.figure(figsize=(10, 5))
    plt.plot(train_scaled["Index"], train_scaled[column_name], label="Train", linewidth=2)
    plt.plot(val_scaled["Index"], val_scaled[column_name], label="Validation", linewidth=2)
    plt.plot(test_scaled["Index"], test_scaled[column_name], label="Test", linewidth=2)

    plt.title(title, fontsize=14)
    plt.xlabel("Index", fontsize=12)
    plt.ylabel(column_name, fontsize=12)
    plt.legend(fontsize=10)
    plt.grid(True, alpha=0.3)
    plt.xticks(rotation=45)

    # Full file path
    file_path = os.path.join(save_path, file_name)

    plt.tight_layout()
    plt.savefig(file_path, dpi=300, bbox_inches="tight")
    plt.close()

    print(f"Plot saved as PNG: {file_name} to {save_path}")

    show(p)

In [None]:
def display_split_info(train, val, test):
    print("\nData Split Information:")
    print("-" * 50)
    print(f"Total samples: {len(train) + len(val) + len(test)}")
    print(f"Train samples: {len(train)} ({len(train)/(len(train) + len(val) + len(test)):.2%})")
    print(f"Val samples: {len(val)} ({len(val)/(len(train) + len(val) + len(test)):.2%})")
    print(f"Test samples: {len(test)} ({len(test)/(len(train) + len(val) + len(test)):.2%})")

    print("\nTrain Data Tail:")
    print("-" * 50)
    print(train.tail())

    print("\nValidation Data Tail:")
    print("-" * 50)
    print(val.tail())

    print("\nTest Data Tail:")
    print("-" * 50)
    print(test.tail())

    print("\nValue Ranges:")
    print("-" * 50)
    print("Train - Scaled:", f"min: {train['Scaled'].min():.3f}, max: {train['Scaled'].max():.3f}")
    print("Val - Scaled:", f"min: {val['Scaled'].min():.3f}, max: {val['Scaled'].max():.3f}")
    print("Test - Scaled:", f"min: {test['Scaled'].min():.3f}, max: {test['Scaled'].max():.3f}")