In [1]:
!pip install yfinance
!pip install pandas_ta
!pip install scikit-learn
!pip install matplotlib
!pip install plotly
!pip install "notebook>=5.3" "ipywidgets>=7.5"
!pip install bokeh

Collecting yfinance
  Downloading yfinance-0.2.52-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting multitasking>=0.0.7 (from yfinance)
  Downloading multitasking-0.0.11-py3-none-any.whl.metadata (5.5 kB)
Collecting peewee>=3.16.2 (from yfinance)
  Downloading peewee-3.17.8.tar.gz (948 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m948.2/948.2 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l- \ | / - done
[?25h  Getting requirements to build wheel ... [?25l- done
[?25h  Preparing metadata (pyproject.toml) ... [?25l- done
Downloading yfinance-0.2.52-py2.py3-none-any.whl (108 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.5/108.5 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multitasking-0.0.11-py3-none-any.whl (8.5 kB)
Building wheels for collected packages: peewee
  Building wheel for peewee (pyproject.toml) ... [?25l- \ | / done
[?25h  C

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python

import numpy as np
import pandas as pd
import yfinance as yf
import pandas_ta as ta
import tensorflow as tf
import keras
import matplotlib.pyplot as plt

import os
# Walk the Kaggle input directory and print the full path of every file found.
kaggle_input_root = '/kaggle/input'
for dirname, _, filenames in os.walk(kaggle_input_root):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

/kaggle/input/pm-70183365-at-01-27-2025-19-23-16/__script__.py
/kaggle/input/pm-70183365-at-01-27-2025-19-23-16/__results__.html
/kaggle/input/pm-70183365-at-01-27-2025-19-23-16/input_requirements.txt
/kaggle/input/pm-70183365-at-01-27-2025-19-23-16/__script__.ipynb
/kaggle/input/pm-70183365-at-01-27-2025-19-23-16/__output__.json
/kaggle/input/pm-70183365-at-01-27-2025-19-23-16/custom.css


In [3]:
from bokeh.plotting import figure, output_notebook, show
from bokeh.models import ColumnDataSource
import pandas as pd

def plot_time_series_datetime(df, y_col, title='Time Series Plot', x_label='Date', y_label='Value'):
    """
    Draw one DataFrame column as a Bokeh line chart against the frame's index.

    The index is plotted on a datetime-typed x-axis.

    Args:
        df: pandas DataFrame containing the data; its index supplies the x values.
        y_col: Name of the column plotted on the y-axis.
        title: Plot title (default: "Time Series Plot").
        x_label: Label for the x-axis (default: "Date").
        y_label: Label for the y-axis (default: "Value").

    Returns:
        None. The figure is displayed in the notebook.
    """
    # Direct Bokeh output to the notebook before anything is drawn.
    output_notebook()

    figure_options = {
        'title': title,
        'x_axis_label': x_label,
        'y_axis_label': y_label,
        'x_axis_type': 'datetime',
        'width': 800,
        'height': 400,
    }
    p = figure(**figure_options)

    # The glyph reads its coordinates from the named columns of this data source.
    source = ColumnDataSource(data={'x': df.index, 'y': df[y_col]})
    p.line(x='x', y='y', source=source, line_width=2)

    # Set the orientation of the x-axis tick labels.
    p.xaxis.major_label_orientation = 0.8

    show(p)

def plot_time_series_numerical(df, y_col, title='Time Series Plot', x_label='Index', y_label='Value'):
    """
    Draw one DataFrame column as a Bokeh line chart against its row position.

    The x values are the positions 0 .. len(df) - 1; the frame's own index is ignored.

    Args:
        df: pandas DataFrame containing the data.
        y_col: Name of the column plotted on the y-axis.
        title: Plot title (default: "Time Series Plot").
        x_label: Label for the x-axis (default: "Index").
        y_label: Label for the y-axis (default: "Value").

    Returns:
        None. The figure is displayed in the notebook.
    """
    # Direct Bokeh output to the notebook before anything is drawn.
    output_notebook()

    figure_options = {
        'title': title,
        'x_axis_label': x_label,
        'y_axis_label': y_label,
        'x_axis_type': 'linear',
        'width': 800,
        'height': 400,
    }
    p = figure(**figure_options)

    # One x position per row, counted from zero.
    row_positions = list(range(len(df)))
    source = ColumnDataSource(data={'x': row_positions, 'y': df[y_col]})
    p.line(x='x', y='y', source=source, line_width=2)

    show(p)

In [4]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
def custom_normalize(df, column_name, method=None, feature_range=(0, 1)):
    """
    Apply a custom normalization to one column, optionally rescale it, and add a
    next-row target column.

    For the row at position ``i`` (0-based, after the index is reset) with value
    ``A`` and ``N = len(df) - 1``::

        step   = (A / N) * i
        result = (A - step) * i / sqrt(i**2 + step**2)

    ``result`` is 0 wherever the denominator is 0 (always the case at ``i = 0``).

    Args:
        df: The pandas DataFrame containing the data. It is not modified; the
            function works on a copy whose index is reset to 0 .. len(df) - 1.
        column_name: The name of the numeric column to normalize (required).
        method: Scaling applied after the custom normalization: 'minmax',
            'standard', or None to keep the custom-normalized values as they are.
        feature_range: The range for MinMaxScaler; only used when method='minmax'
            (default: (0, 1)).

    Returns:
        A new DataFrame with three added columns:
          - 'Custom_Normalized': result of the formula above.
          - 'Scaled': 'Custom_Normalized' after the optional scaler (identical to
            'Custom_Normalized' when method is None).
          - 'Target': the 'Scaled' value of the following row.
        Rows containing NaN in any column are dropped, which always removes the
        last row because it has no following value.

    Raises:
        ValueError: If method is not 'minmax', 'standard', or None.

    Note:
        Both N and the optional scaler are derived from every row of ``df``, so
        values computed here already depend on the full series, including rows
        that a later train/test split may assign to the test set.
    """
    # Validate up front so a bad argument fails before any computation is done.
    if method not in (None, 'minmax', 'standard'):
        raise ValueError("Invalid method. Choose 'minmax', 'standard', or None.")

    df = df.copy().reset_index(drop=True)

    values = df[column_name].to_numpy(dtype=float)
    positions = np.arange(len(df), dtype=float)
    # A single-row frame would give N = 0. Its only position is 0, where the result
    # is 0 regardless of the divisor, so fall back to 1 to avoid dividing by zero.
    n_steps = max(len(df) - 1, 1)

    # Vectorized form of the per-row formula, keeping the same order of operations.
    step = (values / n_steps) * positions
    numerator = (values - step) * positions
    denominator = np.sqrt(positions ** 2 + step ** 2)
    # np.where evaluates the division everywhere, so silence the 0/0 warnings for
    # the entries that are replaced by 0 anyway.
    with np.errstate(divide='ignore', invalid='ignore'):
        df['Custom_Normalized'] = np.where(denominator == 0, 0.0, numerator / denominator)

    if method is None:
        # No extra scaling requested: 'Scaled' mirrors the custom-normalized values
        # so the target below can always be built.
        df['Scaled'] = df['Custom_Normalized']
    else:
        if method == 'minmax':
            scaler = MinMaxScaler(feature_range=feature_range)
        else:
            scaler = StandardScaler()
        # The scaler expects a 2-D input and returns an (n, 1) array.
        df['Scaled'] = scaler.fit_transform(df[['Custom_Normalized']]).ravel()

    # The prediction target is the next row's scaled value.
    df['Target'] = df['Scaled'].shift(-1)

    return df.dropna()

In [5]:
from sklearn.model_selection import train_test_split
import numpy as np

def create_train_test_split(df, feature_column='Scaled', target_column='Target', test_size=0.2, random_state=None):
    """
    Build single-column X and y arrays from a DataFrame and split them without shuffling.

    The split is requested with ``shuffle=False``, so the original row order is kept.

    Args:
        df (pd.DataFrame): The DataFrame containing the data.
        feature_column (str): The name of the feature column (default: 'Scaled').
        target_column (str): The name of the target column (default: 'Target').
        test_size (float): The proportion of the data to include in the test split (default: 0.2).
        random_state (int, optional): Accepted but not used by this function, since
            the split is not shuffled (default: None).

    Returns:
        tuple: X_train, X_test, y_train, y_test as numpy arrays.
    """
    # Keep only the two columns of interest and drop rows where either is missing.
    pair = df[[feature_column, target_column]].dropna()

    # sklearn expects 2-D inputs, so each column is reshaped to (n, 1).
    features, targets = (
        pair[name].values.reshape(-1, 1) for name in (feature_column, target_column)
    )

    X_train, X_test, y_train, y_test = train_test_split(
        features, targets, test_size=test_size, shuffle=False
    )
    return X_train, X_test, y_train, y_test

In [6]:
import pandas as pd
import yfinance as yf

# Fetch the daily ^GSPC history (period='max', end='2024-12-31') and keep the close only.
df = yf.Ticker('^GSPC').history(period='max', interval='1d', end='2024-12-31')
df = df.loc[:, ['Close']].dropna()

# Express every close as its offset from the first close in the series.
first_value = df['Close'].iloc[0]
df['Close_SubtractedFromFirst'] = df['Close'].sub(first_value)

print("Last 5 rows of the DataFrame:")
print(df.tail())

# Custom-normalize the offset series ('Close_SubtractedFromFirst'), then min-max scale it.
df_normalized = custom_normalize(df, 'Close_SubtractedFromFirst', method='minmax')

# TODO: decide whether the target should be derived before or after normalizing
# the closing price.

print("\nLast 5 rows of the Normalized DataFrame:")
print(df_normalized.tail(5))

# Report the extremes of the scaled column.
min_value = df_normalized['Scaled'].min()
max_value = df_normalized['Scaled'].max()

print(f"Minimum value in 'Scaled': {min_value}")
print(f"Maximum value in 'Scaled': {max_value}")

# The plot title carries the first and last dates present in the index.
start_date, end_date = (df.index[pos].strftime('%Y-%m-%d') for pos in (0, -1))
title = f'S&P 500 Closing Prices from {start_date} to {end_date}'

plot_time_series_datetime(df, y_col='Close_SubtractedFromFirst', title=title)
plot_time_series_numerical(df=df_normalized, y_col='Scaled', title='Time Series with Numerical Index')

Last 5 rows of the DataFrame:
                                 Close  Close_SubtractedFromFirst
Date                                                             
2024-12-23 00:00:00-05:00  5974.069824                5956.409824
2024-12-24 00:00:00-05:00  6040.040039                6022.380039
2024-12-26 00:00:00-05:00  6037.589844                6019.929844
2024-12-27 00:00:00-05:00  5970.839844                5953.179844
2024-12-30 00:00:00-05:00  5906.939941                5889.279942

Last 5 rows of the Normalized DataFrame:
             Close  Close_SubtractedFromFirst  Custom_Normalized    Scaled  \
24360  5930.850098                5913.190098           1.179229  0.034767   
24361  5974.069824                5956.409824           0.949891  0.034191   
24362  6040.040039                6022.380039           0.719856  0.033613   
24363  6037.589844                6019.929844           0.479720  0.033010   
24364  5970.839844                5953.179844           0.237351  0.032401  

In [7]:
# Split the 'Scaled' feature and 'Target' label of df_normalized, holding out 20% for testing.
X_train, X_test, y_train, y_test = create_train_test_split(
    df_normalized, 'Scaled', 'Target', test_size=0.2, random_state=42
)

# Show the resulting array shapes as a quick sanity check.
print(
    "Shapes of splits:",
    f"X_train: {X_train.shape}, X_test: {X_test.shape}",
    f"y_train: {y_train.shape}, y_test: {y_test.shape}",
    sep="\n",
)

Shapes of splits:
X_train: (19492, 1), X_test: (4873, 1)
y_train: (19492, 1), y_test: (4873, 1)


In [8]:
def create_sliding_windows(X, y, window_size):
    """
    Create sliding windows for features (X) and the matching targets (y).

    Window ``k`` holds rows ``k .. k + window_size - 1`` of X and is paired with
    ``y[k + window_size]``, i.e. the label at the index immediately after the
    window's last row.

    NOTE(review): if y is already shifted one step ahead of X (for example a
    'Target' column built with ``shift(-1)``), the paired label lies two steps
    after the window's last row. Confirm that this gap is intended.

    Args:
        X: Array-like of features; the first axis is the row (time) axis.
        y: Array-like of targets with at least as many rows as X.
        window_size: Number of consecutive rows per window; must be >= 1.

    Returns:
        tuple: (X_windows, y_windows) as numpy arrays. X_windows has shape
        (n_windows, window_size, *X.shape[1:]) and y_windows has shape
        (n_windows, *y.shape[1:]), where n_windows = max(len(X) - window_size, 0).
        When no window fits, both arrays are empty but keep those trailing
        dimensions.

    Raises:
        ValueError: If window_size is smaller than 1, or y has fewer rows than X.
    """
    if window_size < 1:
        raise ValueError("window_size must be a positive integer.")

    X = np.asarray(X)
    y = np.asarray(y)
    n_rows = len(X)
    if len(y) < n_rows:
        raise ValueError("y must provide a target for every row of X.")

    n_windows = max(n_rows - window_size, 0)
    if n_windows == 0:
        # Keep the window/feature dimensions so callers can still rely on the rank.
        empty_X = np.empty((0, window_size) + X.shape[1:], dtype=X.dtype)
        empty_y = np.empty((0,) + y.shape[1:], dtype=y.dtype)
        return empty_X, empty_y

    X_windows = np.stack([X[start:start + window_size] for start in range(n_windows)])
    # Copy so the result does not alias the caller's target array.
    y_windows = y[window_size:n_rows].copy()
    return X_windows, y_windows

In [9]:
# Number of past timesteps that make up one input window.
backcandles = 14

# NOTE(review): the four names below are rebound to their windowed versions, so
# re-running this cell without first re-running the split cell would window the
# already-windowed arrays a second time.
X_train, y_train = create_sliding_windows(X_train, y_train, backcandles)
X_test, y_test = create_sliding_windows(X_test, y_test, backcandles)

# Shapes to verify: X is (samples, backcandles, num_features), y is (samples, 1).
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

# Show the first window and the target paired with it.
print("\nFirst sample of X_train (features):", X_train[0], sep="\n")
print("\nFirst target value of y_train:", y_train[0], sep="\n")

X_train shape: (19478, 14, 1)
y_train shape: (19478, 1)
X_test shape: (4859, 14, 1)
y_test shape: (4859, 1)

First sample of X_train (features):
[[0.03180418]
 [0.03205543]
 [0.03195492]
 [0.03152783]
 [0.03180418]
 [0.03140226]
 [0.03107572]
 [0.03102552]
 [0.03132695]
 [0.03160325]
 [0.03087492]
 [0.03090007]
 [0.03079966]
 [0.03110104]]

First target value of y_train:
[0.03175396]


In [10]:
from keras.models import Model
from keras.layers import Input, LSTM, Dropout, Dense

# Input tensor: one window, with its two trailing dimensions taken from X_train.
lstm_input = Input(shape=(X_train.shape[1], X_train.shape[2]), name='lstm_input')

# Single LSTM layer followed by dropout.
lstm_layer = LSTM(64, activation='silu', name='lstm_layer_1')
x = lstm_layer(lstm_input)
x = Dropout(rate=0.2)(x)

# One linear output unit for the predicted value.
output = Dense(1, activation='linear', name='output_layer')(x)

# Assemble the functional model and compile it with MSE loss, tracking MAPE.
model = Model(inputs=lstm_input, outputs=output, name='LSTM_Model')
model.compile(loss='mse', optimizer='adam', metrics=['mape'])

# Print the layer-by-layer summary.
model.summary()

In [11]:
from keras.callbacks import EarlyStopping

# Stop once the training loss has not improved for 5 epochs, then restore the best weights.
early_stopping = EarlyStopping(
    monitor='loss',
    patience=5,
    restore_best_weights=True,
)

# Train on the windowed training set; no validation data is passed to fit().
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=15,
    epochs=32,
    callbacks=[early_stopping],
)

Epoch 1/32


I0000 00:00:1738005925.073223     148 service.cc:145] XLA service 0x5d2c02ceded0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1738005925.073267     148 service.cc:153]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1738005925.073271     148 service.cc:153]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5


[1m  48/1299[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m4s[0m 3ms/step - loss: 0.0626 - mape: 106.3891

I0000 00:00:1738005927.179983     148 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m1299/1299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - loss: 0.0097 - mape: 85.6086
Epoch 2/32
[1m1299/1299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 3.8269e-04 - mape: 149.4133
Epoch 3/32
[1m1299/1299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 3.4583e-04 - mape: 390.3584
Epoch 4/32
[1m1299/1299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 2.9038e-04 - mape: 255.1062
Epoch 5/32
[1m1299/1299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 2.9814e-04 - mape: 31.2606
Epoch 6/32
[1m1299/1299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 2.6691e-04 - mape: 1124.7303
Epoch 7/32
[1m1299/1299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 2.7929e-04 - mape: 37.5348
Epoch 8/32
[1m1299/1299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 2.5865e-04 - mape: 24.4891
Epoch 9/32
[1m1299/1299[

In [12]:
from sklearn.metrics import mean_absolute_percentage_error, max_error, mean_squared_error
import numpy as np

# Predict on the windowed test set and flatten both series to 1-D for the metrics.
y_pred = model.predict(X_test).flatten()
y_test = y_test.flatten()

print(y_pred.shape, y_test.shape)

# No inverse transform is applied: every metric below is computed on the scaled values.

# Aggregate metrics; MAPE is converted to a percentage.
mse = mean_squared_error(y_test, y_pred)
mape = 100 * mean_absolute_percentage_error(y_test, y_pred)

# Substitute a tiny value for exact zeros so the per-sample division is defined.
y_test_safe = np.where(y_test == 0, 1e-6, y_test)

# Per-sample absolute deviation as a percentage of the (safe) true value.
percentage_deviation = 100 * np.abs((y_test - y_pred) / y_test_safe)

# Locate the worst sample and read off its deviation.
mpd_index = percentage_deviation.argmax()
mpd = percentage_deviation[mpd_index]

print(
    f"MAPE: {mape:.4f}%",
    f"MPD (Maximum Percentage Deviation): {mpd:.4f}%",
    f"MSE: {mse:.8f}",
    sep="\n",
)

# True and predicted values at the sample with the largest deviation.
print(f"\nPoint of Maximum Deviation:")
print(f"True Value: {y_test[mpd_index]}")
print(f"Predicted Value: {y_pred[mpd_index]}\n")

# First 5 true/predicted pairs for a quick eyeball check.
for i in range(5):
    print(f"Actual: {y_test[i]}, Predicted: {y_pred[i]}")

[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
(4859,) (4859,)
MAPE: 1.3742%
MPD (Maximum Percentage Deviation): 14.6448%
MSE: 0.00006308

Point of Maximum Deviation:
True Value: 0.4071427239251814
Predicted Value: 0.4667677879333496

Actual: 0.6441230692122312, Predicted: 0.6342800855636597
Actual: 0.643538009004198, Predicted: 0.6350719332695007
Actual: 0.6387424268833567, Predicted: 0.6372942924499512
Actual: 0.6366023110293467, Predicted: 0.6384999752044678
Actual: 0.6367619750084982, Predicted: 0.6375225782394409


In [13]:
from bokeh.plotting import figure, show, output_notebook
import numpy as np

# Drop any singleton dimensions so both series can be plotted as 1-D lines.
y_pred = np.squeeze(y_pred)
y_test = np.squeeze(y_test)

# Direct Bokeh output to the notebook.
output_notebook()

# 0-based sample positions, one per test value.
x = np.arange(len(y_test))

plot_options = {
    'title': "Test vs Predicted Values (Interactive)",
    'x_axis_label': 'Sample',
    'y_axis_label': 'Value',
    'width': 800,
    'height': 400,
}
p = figure(**plot_options)

# Draw the true series first, then the predictions on top of it.
for series, label, colour in ((y_test, "Test", "black"), (y_pred, "Predicted", "red")):
    p.line(x, series, legend_label=label, line_width=2, color=colour)

show(p)