# **Imports**

In [1]:
import sys
import os
sys.path.insert(0, '/content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction')
from config import *

import yfinance as yf
import pandas as pd
import numpy as np
import joblib

from sklearn.preprocessing import MinMaxScaler, StandardScaler

from bokeh.plotting import figure, show, output_notebook, reset_output, output_file
from bokeh.io import export_png
from bokeh.models import ColumnDataSource, HoverTool, DatetimeTickFormatter, Div, PanTool, BoxZoomTool, ResetTool, SaveTool
from bokeh.palettes import Category10
from bokeh.layouts import row, column

import matplotlib.pyplot as plt

output_notebook()

### **Get historical data for S&P 500**

In [2]:
def fetch_stock_data(symbol, start, end, save_path):
    df = yf.Ticker(symbol).history(period='max', start=start, end=end)

    start_year = start[:4]
    end_year = end[:4]

    file_name = f"{symbol}_{start_year}_to_{end_year}_data.csv"
    file_path = f"{save_path}/{file_name}"

    df.to_csv(file_path)

    print(f"Data saved to {file_path}")
    print(f"Date range: {df.index[0]} to {df.index[-1]}")
    print(f"Shape: {df.shape}")

    return df

### **Fetch historical data**

In [3]:
SYMBOL_NAME = '^GSPC'
START_DATE = '2014-01-02'
END_DATE = '2024-12-31'

raw_data = fetch_stock_data(SYMBOL_NAME, START_DATE, END_DATE, RAW_DATA_PATH)
raw_data.index = pd.to_datetime(raw_data.index).strftime('%Y-%m-%d')

print(raw_data.tail().to_markdown())

Data saved to /content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction/data/raw/^GSPC_2014_to_2024_data.csv
Date range: 2014-01-02 00:00:00-05:00 to 2024-12-30 00:00:00-05:00
Shape: (2767, 7)
| Date       |    Open |    High |     Low |   Close |      Volume |   Dividends |   Stock Splits |
|:-----------|--------:|--------:|--------:|--------:|------------:|------------:|---------------:|
| 2024-12-23 | 5940.25 | 5978.25 | 5902.57 | 5974.07 | 3.59328e+09 |           0 |              0 |
| 2024-12-24 | 5984.63 | 6040.1  | 5981.44 | 6040.04 | 1.75772e+09 |           0 |              0 |
| 2024-12-26 | 6024.97 | 6049.75 | 6007.37 | 6037.59 | 2.90453e+09 |           0 |              0 |
| 2024-12-27 | 6006.17 | 6006.17 | 5932.95 | 5970.84 | 3.15961e+09 |           0 |              0 |
| 2024-12-30 | 5920.67 | 5940.79 | 5869.16 | 5906.94 | 3.43325e+09 |           0 |              0 |


## **Check data types**

In [4]:
raw_data.dtypes

Unnamed: 0,0
Open,float64
High,float64
Low,float64
Close,float64
Volume,int64
Dividends,float64
Stock Splits,float64


### **Function to plot time series data**

In [5]:
def plot_timeseries(df, title, y_col):
    if not isinstance(df.index, pd.DatetimeIndex):
        df.index = pd.to_datetime(df.index, utc=True)
    else:
        df.index = pd.to_datetime(df.index, utc=True)

    source = ColumnDataSource(data={
        'date': df.index,
        'value': df[y_col],
        'date_str': [d.strftime('%Y-%m-%d') for d in df.index]
    })

    p = figure(
        width=1024, height=600,
        x_axis_type='datetime',
        title=title
    )

    p.line('date', 'value', source=source, line_width=1.5)

    hover = HoverTool(
        tooltips=[
            ('Date', '@date_str'),
            ('Value', '@value{0,0.00}')
        ]
    )

    p.add_tools(hover)

    p.xaxis.axis_label = 'Date'
    p.yaxis.axis_label = y_col

    show(p)

    return p

# **Select feature and target columns**

In [6]:
df = raw_data[['Close']].copy()
df['Target'] = df['Close'].shift(-1)
df.dropna(inplace=True)

print(df.tail())

                  Close       Target
Date                                
2024-12-20  5930.850098  5974.069824
2024-12-23  5974.069824  6040.040039
2024-12-24  6040.040039  6037.589844
2024-12-26  6037.589844  5970.839844
2024-12-27  5970.839844  5906.939941


##**S&P500 data chart**

In [7]:
plot_timeseries(df,'S&P 500 Historical Close Price','Close')

  return convert(array.astype("datetime64[us]"))


## **Function to apply log scale on DataDrame**

In [8]:
def apply_log_scale(df):
    df_copy = df.copy()
    for col in df.columns:
        if col in df_copy.columns:
            df_copy[col] = np.log(df_copy[col].replace(0, np.nan))
    return df_copy


# **Apply log scale on dataframe and save it**

In [9]:
df_log = apply_log_scale(raw_data[['Close']].copy())
df_log['Target'] = df_log['Close'].shift(-1)
df_log.dropna(inplace=True)

df_log.to_csv(f"{RAW_DATA_PATH}/{SYMBOL_NAME}_log_data.csv")
print(df_log)

               Close    Target
Date                          
2014-01-02  7.513153  7.512820
2014-01-03  7.512820  7.510305
2014-01-06  7.510305  7.516368
2014-01-07  7.516368  7.516156
2014-01-08  7.516156  7.516504
...              ...       ...
2024-12-20  8.687923  8.695184
2024-12-23  8.695184  8.706166
2024-12-24  8.706166  8.705760
2024-12-26  8.705760  8.694643
2024-12-27  8.694643  8.683883

[2766 rows x 2 columns]


## **S&P500 log scale data chart**

In [10]:
plot_timeseries(df_log,'S&P 500 Historical Prices (Log Scale)','Close')

  return convert(array.astype("datetime64[us]"))


##**Function to split data into train, validation and test**

In [11]:
def split_data(df, train_size=0.7, val_size=0.15):
    n = len(df)
    train_end = int(n * train_size)
    val_end = int(n * (train_size + val_size))

    train = df[:train_end]
    val = df[train_end:val_end]
    test = df[val_end:]

    return train, val, test

## **Simple MinMaxScaler: split the data, fit scaler on train**

In [12]:
def minmax_split_first(df, save_path, file_name):
    scaler = MinMaxScaler()

    train_data, val_data, test_data = split_data(df)

    scaler.fit(train_data)

    train_scaled_array = scaler.transform(train_data)
    val_scaled_array = scaler.transform(val_data)
    test_scaled_array = scaler.transform(test_data)

    train_scaled = pd.DataFrame(
        train_scaled_array,
        index=train_data.index,
        columns= df.columns
    )
    val_scaled = pd.DataFrame(
        val_scaled_array,
        index=val_data.index,
        columns= df.columns
    )
    test_scaled = pd.DataFrame(
        test_scaled_array,
        index=test_data.index,
        columns= df.columns
    )

    os.makedirs(f"{save_path}/minmax_split_first", exist_ok=True)

    train_scaled.to_csv(f"{save_path}/minmax_split_first/{file_name}_train.csv")
    val_scaled.to_csv(f"{save_path}/minmax_split_first/{file_name}_val.csv")
    test_scaled.to_csv(f"{save_path}/minmax_split_first/{file_name}_test.csv")

    joblib.dump(scaler, f"{save_path}/minmax_split_first/{file_name}_scaler.joblib")

    print(f"Data successfully scaled and saved to {save_path}/minmax_split_first/")

    return train_scaled, val_scaled, test_scaled, scaler

# **Fit MinMaxScaler on the entire dataframe, then split**

In [13]:
def minmax_split_last(df, save_path, file_name):
    df = df.copy()

    scaler = MinMaxScaler()
    scaler.fit(df)

    df_scaled = scaler.transform(df)
    df_scaled = pd.DataFrame(df_scaled, index=df.index, columns=df.columns)

    train_data, val_data, test_data = split_data(df_scaled)

    os.makedirs(f"{save_path}/minmax_split_last", exist_ok=True)

    train_data.to_csv(f"{save_path}/minmax_split_last/{file_name}_train.csv")
    val_data.to_csv(f"{save_path}/minmax_split_last/{file_name}_val.csv")
    test_data.to_csv(f"{save_path}/minmax_split_last/{file_name}_test.csv")

    joblib.dump(scaler, f"{save_path}/minmax_split_last/{file_name}_scaler.joblib")

    print(f"Data successfully scaled and saved to {save_path}/minmax_split_last/")

    return train_data, val_data, test_data, scaler

# **Normalize datasets using simple MinMaxScaler**

In [14]:
train_minmax_first, val_minmax_first, test_minmax_first, minmax_split_first_scaler = minmax_split_first(df, PROCESSED_DATA_PATH,'normal_data')

Data successfully scaled and saved to /content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction/data/processed/minmax_split_first/


In [15]:
train_minmax_first, val_minmax_first, test_minmax_first

(                              Close    Target
 Date                                         
 2014-01-02 00:00:00+00:00  0.032232  0.032014
 2014-01-03 00:00:00+00:00  0.032014  0.030368
 2014-01-06 00:00:00+00:00  0.030368  0.034343
 2014-01-07 00:00:00+00:00  0.034343  0.034203
 2014-01-08 00:00:00+00:00  0.034203  0.034432
 ...                             ...       ...
 2021-09-02 00:00:00+00:00  1.000000  0.999456
 2021-09-03 00:00:00+00:00  0.999456  0.993946
 2021-09-07 00:00:00+00:00  0.993946  0.991814
 2021-09-08 00:00:00+00:00  0.991814  0.984376
 2021-09-09 00:00:00+00:00  0.984376  0.971961
 
 [1936 rows x 2 columns],
                               Close    Target
 Date                                         
 2021-09-10 00:00:00+00:00  0.971961  0.975593
 2021-09-13 00:00:00+00:00  0.975593  0.966405
 2021-09-14 00:00:00+00:00  0.966405  0.979875
 2021-09-15 00:00:00+00:00  0.979875  0.977389
 2021-09-16 00:00:00+00:00  0.977389  0.962806
 ...                            

In [16]:
train_minmax_last, val_minmax_last, test_minmax_last, minmax_split_last_scaler = minmax_split_last(df, PROCESSED_DATA_PATH, 'normal_data')

Data successfully scaled and saved to /content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction/data/processed/minmax_split_last/


In [17]:
train_minmax_last, val_minmax_last, test_minmax_last, minmax_split_last_scaler

(                              Close    Target
 Date                                         
 2014-01-02 00:00:00+00:00  0.020718  0.020578
 2014-01-03 00:00:00+00:00  0.020578  0.019520
 2014-01-06 00:00:00+00:00  0.019520  0.022075
 2014-01-07 00:00:00+00:00  0.022075  0.021985
 2014-01-08 00:00:00+00:00  0.021985  0.022132
 ...                             ...       ...
 2021-09-02 00:00:00+00:00  0.642782  0.642432
 2021-09-03 00:00:00+00:00  0.642432  0.638891
 2021-09-07 00:00:00+00:00  0.638891  0.637520
 2021-09-08 00:00:00+00:00  0.637520  0.632739
 2021-09-09 00:00:00+00:00  0.632739  0.624759
 
 [1936 rows x 2 columns],
                               Close    Target
 Date                                         
 2021-09-10 00:00:00+00:00  0.624759  0.627093
 2021-09-13 00:00:00+00:00  0.627093  0.621188
 2021-09-14 00:00:00+00:00  0.621188  0.629846
 2021-09-15 00:00:00+00:00  0.629846  0.628248
 2021-09-16 00:00:00+00:00  0.628248  0.618874
 ...                            

# **Compare the two normalization method (fit minmax scaler on train vs on the entire dataset)**

In [21]:
plot_comparison_datetime_bokeh(
    split_first_data=(train_minmax_first, val_minmax_first, test_minmax_first),
    split_last_data=(train_minmax_last, val_minmax_last, test_minmax_last),
    save_path=f"{FIGURES_PATH}/data",
    file_name="minmax_first_vs_last.png",
    title="Comparison of fitting MinMaxScaler on train vs on the entire dataset"
)

Plot saved as PNG: minmax_first_vs_last.png to /content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction/results/figures/data


# **Normalize log scaled data with simple MinMaxScaler**

In [22]:
train_log_minmax_first, val_log_minmax_first, test_log_minmax_first, minmax_split_first_log_scaler = minmax_split_first(df_log, PROCESSED_DATA_PATH,'log_data')

train_log_minmax_first, val_log_minmax_first, test_log_minmax_first,

Data successfully scaled and saved to /content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction/data/processed/minmax_split_first/


(                              Close    Target
 Date                                         
 2014-01-02 00:00:00+00:00  0.052677  0.052329
 2014-01-03 00:00:00+00:00  0.052329  0.049702
 2014-01-06 00:00:00+00:00  0.049702  0.056036
 2014-01-07 00:00:00+00:00  0.056036  0.055814
 2014-01-08 00:00:00+00:00  0.055814  0.056178
 ...                             ...       ...
 2021-09-02 00:00:00+00:00  1.000000  0.999650
 2021-09-03 00:00:00+00:00  0.999650  0.996097
 2021-09-07 00:00:00+00:00  0.996097  0.994719
 2021-09-08 00:00:00+00:00  0.994719  0.989896
 2021-09-09 00:00:00+00:00  0.989896  0.981798
 
 [1936 rows x 2 columns],
                               Close    Target
 Date                                         
 2021-09-10 00:00:00+00:00  0.981798  0.984173
 2021-09-13 00:00:00+00:00  0.984173  0.978153
 2021-09-14 00:00:00+00:00  0.978153  0.986968
 2021-09-15 00:00:00+00:00  0.986968  0.985346
 2021-09-16 00:00:00+00:00  0.985346  0.975785
 ...                            

In [23]:
train_log_minmax_last, val_log_minmax_last, test_log_minmax_last, minmax_split_last_log_scaler = minmax_split_last(df_log, PROCESSED_DATA_PATH,'log_data')

train_log_minmax_last, val_log_minmax_last, test_log_minmax_last,

Data successfully scaled and saved to /content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction/data/processed/minmax_split_last/


(                              Close    Target
 Date                                         
 2014-01-02 00:00:00+00:00  0.040286  0.040020
 2014-01-03 00:00:00+00:00  0.040020  0.038011
 2014-01-06 00:00:00+00:00  0.038011  0.042855
 2014-01-07 00:00:00+00:00  0.042855  0.042685
 2014-01-08 00:00:00+00:00  0.042685  0.042963
 ...                             ...       ...
 2021-09-02 00:00:00+00:00  0.764774  0.764506
 2021-09-03 00:00:00+00:00  0.764506  0.761789
 2021-09-07 00:00:00+00:00  0.761789  0.760735
 2021-09-08 00:00:00+00:00  0.760735  0.757047
 2021-09-09 00:00:00+00:00  0.757047  0.750854
 
 [1936 rows x 2 columns],
                               Close    Target
 Date                                         
 2021-09-10 00:00:00+00:00  0.750854  0.752670
 2021-09-13 00:00:00+00:00  0.752670  0.748066
 2021-09-14 00:00:00+00:00  0.748066  0.754807
 2021-09-15 00:00:00+00:00  0.754807  0.753567
 2021-09-16 00:00:00+00:00  0.753567  0.746255
 ...                            

In [24]:
plot_comparison_datetime_bokeh(
    split_first_data=(train_log_minmax_first, val_log_minmax_first, test_log_minmax_first),
    split_last_data=(train_log_minmax_last, val_log_minmax_last, test_log_minmax_last),
    save_path=f"{FIGURES_PATH}/data",
    file_name="log_minmax_first_vs_last.png",
    title="Comparison of fitting MinMaxScaler on train vs on the entire dataset"
)

Plot saved as PNG: log_minmax_first_vs_last.png to /content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction/results/figures/data


# **Custom data normalization method combined with MinMaxScaler**

In [25]:
def subtract_first_value(df, column_name, first_value):
    df = df.copy()
    df[column_name] = df[column_name] - first_value
    return df

def custom_normalize(data,last_value, index, n):
    A_prime = data
    numerator = (A_prime - (last_value / n) * index) * index
    denominator = np.sqrt(index**2 + ((last_value / n) * index)**2)
    return 0 if denominator == 0 else numerator / denominator

def normalize(df, column_name,last_value, last_index):
    df = df.copy().reset_index(drop=True)

    df['Custom_Normalized'] = [
        custom_normalize(row[column_name], last_value, i, last_index)
        for i, row in df.iterrows()
    ]
    return df

In [26]:
def custom_split_first(df, save_dir, file_prefix,cols_to_scale=['Custom_Normalized','Target']):
    save_path = f"{save_dir}/custom_split_first"
    os.makedirs(save_path, exist_ok=True)

    train, val, test = split_data(df)

    custom_scaler = {
        'first_value': train['Close'].iloc[0],
        'first_index': 0,
        'last_value': train['Close'].iloc[-1] - train['Close'].iloc[0],
        'last_index': len(train) - 1
    }

    print(custom_scaler)

    custom_scaler_df = pd.DataFrame(custom_scaler, index=[0])

    custom_scaler_df.to_csv(f"{save_path}/{file_prefix}_custom_scaler.csv")

    df = subtract_first_value(df, 'Close', custom_scaler['first_value'])

    df_norm = normalize(df, 'Close', custom_scaler['last_value'], custom_scaler['last_index'])

    df_norm['Target'] = df_norm['Custom_Normalized'].shift(-1)
    df_norm.dropna(inplace=True)

    train, val, test = split_data(df_norm[cols_to_scale])

    scaler = MinMaxScaler()

    scaler.fit(train)

    train_scaled_arr = scaler.transform(train[cols_to_scale])
    val_scaled_arr = scaler.transform(val[cols_to_scale])
    test_scaled_arr = scaler.transform(test[cols_to_scale])

    train_scaled = pd.DataFrame(
        train_scaled_arr,
        index=train.index,
        columns=cols_to_scale
    )

    val_scaled = pd.DataFrame(
        val_scaled_arr,
        index=val.index,
        columns=cols_to_scale
    )

    test_scaled = pd.DataFrame(
        test_scaled_arr,
        index=test.index,
        columns=cols_to_scale
    )

    train_scaled.to_csv(f"{save_path}/{file_prefix}_train.csv")
    val_scaled.to_csv(f"{save_path}/{file_prefix}_val.csv")
    test_scaled.to_csv(f"{save_path}/{file_prefix}_test.csv")

    joblib.dump(scaler, f"{save_path}/{file_prefix}_scaler.joblib")

    return train_scaled, val_scaled, test_scaled, scaler

In [27]:
def custom_split_last(df, save_dir, file_prefix,colst_to_scale=['Custom_Normalized','Target']):
    save_path = f"{save_dir}/custom_split_last"
    os.makedirs(save_path, exist_ok=True)

    custom_scaler = {
        'first_value': df['Close'].iloc[0],
        'first_index': 0,
        'last_value': df['Close'].iloc[-1] - df['Close'].iloc[0],
        'last_index': len(df) - 1
    }

    custom_scaler_df = pd.DataFrame(custom_scaler, index=[0])

    custom_scaler_df.to_csv(f"{save_path}/{file_prefix}_custom_scaler.csv")

    df = subtract_first_value(df, 'Close', custom_scaler['first_value'])

    df_norm = normalize(df, 'Close', custom_scaler['last_value'], custom_scaler['last_index'])

    df_norm['Target'] = df_norm['Custom_Normalized'].shift(-1)
    df_norm.dropna(inplace=True)

    scaler = MinMaxScaler()
    scaler.fit(df_norm[colst_to_scale])

    df_scaled_arr = scaler.transform(df_norm[colst_to_scale])

    df_scaled = pd.DataFrame(
        df_scaled_arr,
        index=df_norm.index,
        columns=colst_to_scale
    )

    train_scaled, val_scaled, test_scaled = split_data(df_scaled)

    joblib.dump(scaler, f"{save_path}/{file_prefix}_scaler.joblib")

    train_scaled.to_csv(f"{save_path}/{file_prefix}_train.csv")
    val_scaled.to_csv(f"{save_path}/{file_prefix}_val.csv")
    test_scaled.to_csv(f"{save_path}/{file_prefix}_test.csv")

    return train_scaled, val_scaled, test_scaled, scaler

In [28]:
train_custom_first, val_custom_first, test_custom_first, custom_first_scaler = custom_split_first(
    df,
    save_dir=PROCESSED_DATA_PATH,
    file_prefix='normal_data'
)

train_custom_first, val_custom_first, test_custom_first,

{'first_value': np.float64(1831.97998046875), 'first_index': 0, 'last_value': np.float64(2661.2998046875), 'last_index': 1935}


(      Custom_Normalized    Target
 0              0.972624  0.971519
 1              0.971519  0.968192
 2              0.968192  0.973612
 3              0.973612  0.972629
 4              0.972629  0.972220
 ...                 ...       ...
 1930           0.993606  1.000000
 1931           1.000000  0.998388
 1932           0.998388  0.989048
 1933           0.989048  0.984964
 1934           0.984964  0.972624
 
 [1935 rows x 2 columns],
       Custom_Normalized    Target
 1935           0.972624  0.952540
 1936           0.952540  0.957425
 1937           0.957425  0.942362
 1938           0.942362  0.962558
 1939           0.962558  0.957923
 ...                 ...       ...
 2345           0.459409  0.477645
 2346           0.477645  0.475983
 2347           0.475983  0.448332
 2348           0.448332  0.431516
 2349           0.431516  0.414309
 
 [415 rows x 2 columns],
       Custom_Normalized    Target
 2350           0.414309  0.455316
 2351           0.455316  0.455591


In [29]:
train_custom_last, val_custom_last, test_custom_last, custom_last_scaler = custom_split_last(
    df,
    save_dir=PROCESSED_DATA_PATH,
    file_prefix='normal_data_last'
)

train_custom_last, val_custom_last, test_custom_last

(      Custom_Normalized    Target
 0              0.932428  0.931414
 1              0.931414  0.928479
 2              0.928479  0.933106
 3              0.933106  0.932198
 4              0.932198  0.931786
 ...                 ...       ...
 1930           0.837674  0.843143
 1931           0.843143  0.841691
 1932           0.841691  0.833558
 1933           0.833558  0.829969
 1934           0.829969  0.819241
 
 [1935 rows x 2 columns],
       Custom_Normalized    Target
 1935           0.819241  0.801819
 1936           0.801819  0.805984
 1937           0.805984  0.792903
 1938           0.792903  0.810304
 1939           0.810304  0.806239
 ...                 ...       ...
 2345           0.351558  0.367265
 2346           0.367265  0.365769
 2347           0.365769  0.341805
 2348           0.341805  0.327208
 2349           0.327208  0.312274
 
 [415 rows x 2 columns],
       Custom_Normalized    Target
 2350           0.312274  0.347668
 2351           0.347668  0.347847


In [30]:
plot_comparison_bokeh(
    split_first_data=(train_custom_first, val_custom_first, test_custom_first),
    split_last_data=(train_custom_last, val_custom_last, test_custom_last),
    save_path=f"{FIGURES_PATH}/data",
    file_name="custom_normalized_first_vs_last.png",
    column_name="Custom_Normalized",
    title="Comparison of split First vs split Last Custom Normalization"
)

Plot saved as PNG: custom_normalized_first_vs_last.png to /content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction/results/figures/data


In [31]:
train_custom_log_first, val_custom_log_first, test_custom_log_first, custom_log_scaler = custom_split_first(
    df_log,
    save_dir=PROCESSED_DATA_PATH,
    file_prefix='log_data'
)

train_custom_log_first, val_custom_log_first, test_custom_log_first,

{'first_value': np.float64(7.513152617482637), 'first_index': 0, 'last_value': np.float64(0.8971855609944477), 'last_index': 1935}


(      Custom_Normalized    Target
 0              0.959795  0.958340
 1              0.958340  0.952902
 2              0.952902  0.963126
 3              0.963126  0.961892
 4              0.961892  0.961681
 ...                 ...       ...
 1930           0.976503  0.980839
 1931           0.980839  0.979381
 1932           0.979381  0.972325
 1933           0.972325  0.969069
 1934           0.969069  0.959795
 
 [1935 rows x 2 columns],
       Custom_Normalized    Target
 1935           0.959795  0.944794
 1936           0.944794  0.948099
 1937           0.948099  0.936731
 1938           0.936731  0.951290
 1939           0.951290  0.947610
 ...                 ...       ...
 2345           0.461174  0.475334
 2346           0.475334  0.473782
 2347           0.473782  0.451659
 2348           0.451659  0.437991
 2349           0.437991  0.423917
 
 [415 rows x 2 columns],
       Custom_Normalized    Target
 2350           0.423917  0.456492
 2351           0.456492  0.456471


In [32]:
train_custom_log_last, val_custom_log_last, test_custom_log_last, custom_last_log_scaler = custom_split_last(
    df_log,
    save_dir=PROCESSED_DATA_PATH,
    file_prefix='log_data_last'
)

train_custom_log_last, val_custom_log_last, test_custom_log_last,

(      Custom_Normalized    Target
 0              0.820582  0.819251
 1              0.819251  0.814101
 2              0.814101  0.823966
 3              0.823966  0.822847
 4              0.822847  0.822708
 ...                 ...       ...
 1930           0.959430  0.963651
 1931           0.963651  0.962316
 1932           0.962316  0.955615
 1933           0.955615  0.952557
 1934           0.952557  0.943729
 
 [1935 rows x 2 columns],
       Custom_Normalized    Target
 1935           0.943729  0.929412
 1936           0.929412  0.932644
 1937           0.932644  0.921808
 1938           0.921808  0.935830
 1939           0.935830  0.932365
 ...                 ...       ...
 2345           0.491781  0.505420
 2346           0.505420  0.503996
 2347           0.503996  0.482850
 2348           0.482850  0.469809
 2349           0.469809  0.456380
 
 [415 rows x 2 columns],
       Custom_Normalized    Target
 2350           0.456380  0.487674
 2351           0.487674  0.487718


In [33]:
plot_comparison_bokeh(
    split_first_data=(train_custom_log_first, val_custom_log_first, test_custom_log_first),
    split_last_data=(train_custom_log_last, val_custom_log_last, test_custom_log_last),
    save_path=f"{FIGURES_PATH}/data",
    file_name="comparison_custom_log_scaled_data.png",
    column_name="Custom_Normalized",
    title="Comparison of Split First vs Split Last Custom Log Normalization"
)

Plot saved as PNG: comparison_custom_log_scaled_data.png to /content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction/results/figures/data


In [19]:
def plot_comparison_datetime_bokeh(split_first_data, split_last_data, save_path, file_name, col_name="Close",
                                  title="Comparison of Split First vs Split Last"):
    reset_output()
    output_notebook()

    # Unpack the data tuples
    train_first, val_first, test_first = split_first_data
    train_last, val_last, test_last = split_last_data

    # Make continuous indices for split_first
    train_first = train_first.copy().reset_index()
    val_first = val_first.copy().reset_index()
    test_first = test_first.copy().reset_index()

    train_first['plot_index'] = range(len(train_first))
    val_first['plot_index'] = range(len(train_first), len(train_first) + len(val_first))
    test_first['plot_index'] = range(len(train_first) + len(val_first),
                                    len(train_first) + len(val_first) + len(test_first))

    # Make continuous indices for split_last
    train_last = train_last.copy().reset_index()
    val_last = val_last.copy().reset_index()
    test_last = test_last.copy().reset_index()

    train_last['plot_index'] = range(len(train_last))
    val_last['plot_index'] = range(len(train_last), len(train_last) + len(val_last))
    test_last['plot_index'] = range(len(train_last) + len(val_last),
                                   len(train_last) + len(val_last) + len(test_last))

    # Create sources for split first
    train_first_source = ColumnDataSource(train_first)
    val_first_source = ColumnDataSource(val_first)
    test_first_source = ColumnDataSource(test_first)

    # Create sources for split last
    train_last_source = ColumnDataSource(train_last)
    val_last_source = ColumnDataSource(val_last)
    test_last_source = ColumnDataSource(test_last)

    # Create first figure (Split First)
    p1 = figure(width=900, height=500, title="Split First")
    p1.title.text_font_size = '12pt'

    colors = Category10[3]
    p1.line('plot_index', col_name, line_color=colors[0], line_width=2, source=train_first_source, legend_label='Train')
    p1.line('plot_index', col_name, line_color=colors[1], line_width=2, source=val_first_source, legend_label='Validation')
    p1.line('plot_index', col_name, line_color=colors[2], line_width=2, source=test_first_source, legend_label='Test')

    p1.xaxis.axis_label = 'Index'
    p1.yaxis.axis_label = col_name
    p1.xaxis.axis_label_text_font_size = '10pt'
    p1.yaxis.axis_label_text_font_size = '10pt'

    hover1 = HoverTool(tooltips=[('Index', '@plot_index'),
                                ('Date', '@index{%F}'),
                                (col_name, f'@{col_name}{{0.0000}}')],
                      formatters={'@index': 'datetime'})
    p1.add_tools(hover1)
    p1.add_tools(PanTool(), BoxZoomTool(), ResetTool(), SaveTool())

    p1.legend.click_policy = "hide"
    p1.legend.location = "top_left"
    p1.legend.label_text_font_size = '8pt'

    # Create second figure (Split Last)
    p2 = figure(width=900, height=500, title="Split Last" )
    p2.title.text_font_size = '12pt'

    p2.line('plot_index', col_name, line_color=colors[0], line_width=2, source=train_last_source, legend_label='Train')
    p2.line('plot_index', col_name, line_color=colors[1], line_width=2, source=val_last_source, legend_label='Validation')
    p2.line('plot_index', col_name, line_color=colors[2], line_width=2, source=test_last_source, legend_label='Test')

    p2.xaxis.axis_label = 'Index'
    p2.yaxis.axis_label = col_name
    p2.xaxis.axis_label_text_font_size = '10pt'
    p2.yaxis.axis_label_text_font_size = '10pt'

    hover2 = HoverTool(tooltips=[('Index', '@plot_index'),
                                ('Date', '@index{%F}'),
                                (col_name, f'@{col_name}{{0.0000}}')],
                      formatters={'@index': 'datetime'})
    p2.add_tools(hover2)
    p2.add_tools(PanTool(), BoxZoomTool(), ResetTool(), SaveTool())

    p2.legend.click_policy = "hide"
    p2.legend.location = "top_left"
    p2.legend.label_text_font_size = '8pt'

    # Create layout with title
    header = Div(text=f"<h2>{title}</h2>", width=1800)
    layout = column(header, row(p1, p2))

    # Save as PNG using Matplotlib
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

    # Plot Split First
    ax1.plot(train_first['plot_index'], train_first[col_name], label="Train", linewidth=2)
    ax1.plot(val_first['plot_index'], val_first[col_name], label="Validation", linewidth=2)
    ax1.plot(test_first['plot_index'], test_first[col_name], label="Test", linewidth=2)
    ax1.set_title("Split First", fontsize=12)
    ax1.set_xlabel("Index", fontsize=10)
    ax1.set_ylabel(col_name, fontsize=10)
    ax1.legend(fontsize=8)
    ax1.grid(True, alpha=0.3)

    # Plot Split Last
    ax2.plot(train_last['plot_index'], train_last[col_name], label="Train", linewidth=2)
    ax2.plot(val_last['plot_index'], val_last[col_name], label="Validation", linewidth=2)
    ax2.plot(test_last['plot_index'], test_last[col_name], label="Test", linewidth=2)
    ax2.set_title("Split Last", fontsize=12)
    ax2.set_xlabel("Index", fontsize=10)
    ax2.set_ylabel(col_name, fontsize=10)
    ax2.legend(fontsize=8)
    ax2.grid(True, alpha=0.3)

    fig.suptitle(title, fontsize=14)

    # Create directory if it doesn't exist
    os.makedirs(save_path, exist_ok=True)

    # Full file path
    file_path = os.path.join(save_path, file_name)

    plt.tight_layout()
    plt.savefig(file_path, dpi=300, bbox_inches="tight")
    plt.close()

    print(f"Plot saved as PNG: {file_name} to {save_path}")

    show(layout)

In [20]:
def plot_comparison_bokeh(split_first_data, split_last_data, save_path, file_name, column_name,
                         title="Comparison of Split First vs Split Last"):
    reset_output()
    output_notebook()

    # Unpack the data tuples
    train_first, val_first, test_first = split_first_data
    train_last, val_last, test_last = split_last_data

    # Process split first data
    train_first = train_first.copy().reset_index(drop=True)
    val_first = val_first.copy().reset_index(drop=True)
    test_first = test_first.copy().reset_index(drop=True)

    val_first["Index"] = val_first.index + len(train_first)
    test_first["Index"] = test_first.index + len(train_first) + len(val_first)
    train_first["Index"] = train_first.index

    # Process split last data
    train_last = train_last.copy().reset_index(drop=True)
    val_last = val_last.copy().reset_index(drop=True)
    test_last = test_last.copy().reset_index(drop=True)

    val_last["Index"] = val_last.index + len(train_last)
    test_last["Index"] = test_last.index + len(train_last) + len(val_last)
    train_last["Index"] = train_last.index

    # Create data sources for split first
    train_first_source = ColumnDataSource(train_first)
    val_first_source = ColumnDataSource(val_first)
    test_first_source = ColumnDataSource(test_first)

    # Create data sources for split last
    train_last_source = ColumnDataSource(train_last)
    val_last_source = ColumnDataSource(val_last)
    test_last_source = ColumnDataSource(test_last)

    # Create first figure (Split First)
    p1 = figure(width=900, height=500, title="Split First")
    p1.title.text_font_size = '12pt'

    p1.line('Index', column_name, line_color=Category10[3][0], line_width=2, source=train_first_source, legend_label='Train')
    p1.line('Index', column_name, line_color=Category10[3][1], line_width=2, source=val_first_source, legend_label='Validation')
    p1.line('Index', column_name, line_color=Category10[3][2], line_width=2, source=test_first_source, legend_label='Test')

    p1.xaxis.axis_label = 'Index'
    p1.yaxis.axis_label = column_name
    p1.xaxis.axis_label_text_font_size = '10pt'
    p1.yaxis.axis_label_text_font_size = '10pt'

    hover1 = HoverTool(tooltips=[('Index', '@Index'), (column_name, f'@{column_name}{{0.0000}}')])
    p1.add_tools(hover1)

    p1.legend.click_policy = "hide"
    p1.legend.location = "top_left"
    p1.legend.label_text_font_size = '8pt'

    # Create second figure (Split Last)
    p2 = figure(width=900, height=500, title="Split Last")
    p2.title.text_font_size = '12pt'

    p2.line('Index', column_name, line_color=Category10[3][0], line_width=2, source=train_last_source, legend_label='Train')
    p2.line('Index', column_name, line_color=Category10[3][1], line_width=2, source=val_last_source, legend_label='Validation')
    p2.line('Index', column_name, line_color=Category10[3][2], line_width=2, source=test_last_source, legend_label='Test')

    p2.xaxis.axis_label = 'Index'
    p2.yaxis.axis_label = column_name
    p2.xaxis.axis_label_text_font_size = '10pt'
    p2.yaxis.axis_label_text_font_size = '10pt'

    hover2 = HoverTool(tooltips=[('Index', '@Index'), (column_name, f'@{column_name}{{0.0000}}')])
    p2.add_tools(hover2)

    p2.legend.click_policy = "hide"
    p2.legend.location = "top_left"
    p2.legend.label_text_font_size = '8pt'

    # Create layout with title
    header = Div(text=f"<h2>{title}</h2>", width=1800)
    layout = column(header, row(p1, p2))

    # Save as PNG using Matplotlib
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

    # Plot Split First
    ax1.plot(train_first["Index"], train_first[column_name], label="Train", linewidth=2)
    ax1.plot(val_first["Index"], val_first[column_name], label="Validation", linewidth=2)
    ax1.plot(test_first["Index"], test_first[column_name], label="Test", linewidth=2)
    ax1.set_title("Split First", fontsize=12)
    ax1.set_xlabel("Index", fontsize=10)
    ax1.set_ylabel(column_name, fontsize=10)
    ax1.legend(fontsize=8)
    ax1.grid(True, alpha=0.3)

    # Plot Split Last
    ax2.plot(train_last["Index"], train_last[column_name], label="Train", linewidth=2)
    ax2.plot(val_last["Index"], val_last[column_name], label="Validation", linewidth=2)
    ax2.plot(test_last["Index"], test_last[column_name], label="Test", linewidth=2)
    ax2.set_title("Split Last", fontsize=12)
    ax2.set_xlabel("Index", fontsize=10)
    ax2.set_ylabel(column_name, fontsize=10)
    ax2.legend(fontsize=8)
    ax2.grid(True, alpha=0.3)

    fig.suptitle(title, fontsize=14)

    # Create directory if it doesn't exist
    os.makedirs(save_path, exist_ok=True)

    # Full file path
    file_path = os.path.join(save_path, file_name)

    plt.tight_layout()
    plt.savefig(file_path, dpi=300, bbox_inches="tight")
    plt.close()

    print(f"Plot saved as PNG: {file_name} to {save_path}")

    show(layout)