In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

!pip install ta

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import os
import random
import torch
import torch.nn as nn
import torch.optim as optim
import ta
import math
import warnings
import tensorflow as tf

warnings.filterwarnings('ignore')

from tensorflow.keras.models import *
from tensorflow.keras.layers import *
from torch.utils.data import DataLoader
from copy import deepcopy as dc
from tqdm.notebook import tqdm
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, r2_score
from pathlib import Path

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Display the torch device selected in the setup cell.
device

# Criteria we can use to make recommendations

Risk Profile: Assess the risk associated with the stock/ETF, considering factors like volatility, beta, standard deviation, and historical performance. Investors have different risk tolerances, so offering a range of risk profiles can cater to varying preferences.

Performance Metrics: Analyze historical returns, Sharpe ratio, alpha, beta, and other performance metrics to gauge the past performance and potential future returns.

Industry/Sector: Segment stocks/ETFs based on the industries or sectors they belong to. Some investors may prefer exposure to specific sectors (tech, healthcare, energy, etc.) due to their understanding of, or belief in, those sectors' growth potential.

Diversification: Provide recommendations that aid in diversifying a portfolio across different industries, regions, or asset classes. Diversification helps mitigate risk.

Market Trends and Sentiment Analysis: Incorporate sentiment analysis by tracking news sentiment, social media chatter, and expert opinions to gauge market trends and sentiment toward particular stocks/ETFs.

Fundamental Analysis: Consider fundamental factors such as earnings, revenue growth, profit margins, debt levels, P/E ratios, and other financial metrics to assess the underlying strength of the company.

Technical Analysis: For traders interested in technical indicators, consider metrics like moving averages, RSI (Relative Strength Index), MACD (Moving Average Convergence Divergence), etc., to evaluate entry and exit points.

Liquidity and Volume: Include stocks/ETFs with sufficient liquidity and trading volume, as low liquidity can impact trade execution and pricing.

Costs and Fees: Consider expense ratios, trading fees, and other costs associated with investing in the recommended stocks/ETFs.

User Preferences and Constraints: Take into account individual user preferences, investment goals, time horizon, tax considerations, and any constraints or ethical considerations they might have.

Regulatory and Legal Factors: Ensure compliance with regulatory requirements and ethical standards related to investment recommendations.

Adaptability and Learning: Implement a system that continuously learns from user feedback and adapts recommendations based on the changing market conditions and user preferences.

Performance Against Benchmarks: Compare the recommended stocks/ETFs against relevant benchmarks (like S&P 500, sector-specific indices) to provide a benchmark-relative assessment.

# Import Data

In [None]:
# Read the symbol metadata table that ships with the Kaggle stock-market dataset.
symbols_metadata_df = pd.read_csv('/kaggle/input/stock-market-dataset/symbols_valid_meta.csv')

# Parse the HTML tables on the Nasdaq-100 Wikipedia page and keep the one at position 4.
# NOTE(review): the positional index depends on the page's current layout; presumably
# this is the index-constituents table — verify, since a page edit can shift it.
url = "https://en.wikipedia.org/wiki/Nasdaq-100"
nasdaq100_meta = pd.read_html(url)[4]

# Collect the per-ticker CSV paths for ETFs and for stocks.
etf_csv_files = glob.glob("/kaggle/input/stock-market-dataset/etfs/*.csv")

stock_csv_files = glob.glob("/kaggle/input/stock-market-dataset/stocks/*.csv")

# One DataFrame per ticker, keyed by the CSV file name without its extension.
etf_dfs = {}
stock_dfs = {}

for etf_csv_filename in etf_csv_files:
    df = pd.read_csv(etf_csv_filename, index_col=None, header=0)
    etf_dfs[Path(etf_csv_filename).stem] = df
    
for stock_csv_filename in stock_csv_files:
    df = pd.read_csv(stock_csv_filename, index_col=None, header=0)
    stock_dfs[Path(stock_csv_filename).stem] = df
    
# Example of how you would access a dataframe for a particular ticker. This is not the
# last expression of the cell, so nothing is displayed; it only raises KeyError if 'A'
# was not loaded.
stock_dfs['A']
# Number of stock DataFrames loaded (one per ticker).
print("Count of PDs for each ticker: " + str(len(stock_dfs))) 

In [None]:
# Read the 'A' ticker CSV directly, overwriting any existing stock_dfs['A'] entry.
stock_dfs['A'] = pd.read_csv("/kaggle/input/stock-market-dataset/stocks/A.csv", index_col=None, header=0)

In [None]:
# Preview the first rows of the 'A' ticker frame.
stock_dfs['A'].head()

# Define helper functions for analyzing data

In [None]:
def plot_histograms( df , variables , n_rows , n_cols ):
    """Draw a grid of 10-bin histograms, one subplot per column named in ``variables``.

    Args:
        df: DataFrame holding the columns to plot.
        variables: iterable of column names; the i-th name goes into subplot ``i + 1``.
        n_rows: number of subplot rows in the grid.
        n_cols: number of subplot columns in the grid.

    Each subplot is titled with the column's skew (rounded to an integer, as before)
    and its x ticks are placed on the histogram bin edges. Returns nothing; the
    figure is shown with ``plt.show()``.
    """
    fig = plt.figure( figsize = ( 16 , 12 ) )
    for i, var_name in enumerate( variables ):
        ax = fig.add_subplot( n_rows , n_cols , i+1 )
        # np.histogram cannot auto-detect a range when NaN is present, so compute the
        # bin edges on the non-null values (Series.hist ignores nulls as well).
        values = df[ var_name ].dropna()
        _ , edges = np.histogram( values , bins=10 )
        values.hist( bins=10 , ax=ax )
        ax.set_title( 'Skew: ' + str( round( float( df[ var_name ].skew() ) , ) ) )
        # Pin the tick positions to the bin edges before labelling them; labelling
        # without fixing the positions attaches the edge values to unrelated ticks.
        ax.set_xticks( edges )
        ax.set_xticklabels( [ f'{edge:.4g}' for edge in edges ] , visible=True )
        ax.set_yticklabels( [] , visible=True )
    fig.tight_layout()  # Improves appearance a bit.
    plt.show()

def plot_distribution( df , var , target , **kwargs ):
    """Plot shaded KDE curves of ``var``, one hue per value of ``target``.

    Optional keyword arguments ``row`` and ``col`` name the columns used to facet the
    grid. The x axis is limited to ``[0, df[var].max()]``.
    """
    grid = sns.FacetGrid(
        df ,
        hue=target ,
        aspect=4 ,
        row = kwargs.get( 'row' , None ) ,
        col = kwargs.get( 'col' , None ) ,
    )
    grid.map( sns.kdeplot , var , shade= True )
    upper_limit = df[ var ].max()
    grid.set( xlim=( 0 , upper_limit ) )
    grid.add_legend()

def plot_categories( df , cat , target , **kwargs ):
    """Draw bar plots of ``target`` against the categorical column ``cat``.

    Optional keyword arguments ``row`` and ``col`` name the columns used to facet the
    grid.
    """
    facet_row = kwargs.get( 'row' , None )
    facet_col = kwargs.get( 'col' , None )
    grid = sns.FacetGrid( df , row = facet_row , col = facet_col )
    grid.map( sns.barplot , cat , target )
    grid.add_legend()
    
def plot_correlation_map( df ):
    """Render an annotated heatmap of the correlations between the numeric columns of ``df``."""
    correlations = df.select_dtypes(['number']).corr()
    _fig , axes = plt.subplots( figsize =( 12 , 10 ) )
    palette = sns.diverging_palette( 220 , 10 , as_cmap = True )
    heatmap_options = dict(
        cmap = palette,
        square = True,
        cbar_kws = { 'shrink' : .9 },
        ax = axes,
        annot = True,
        annot_kws = { 'fontsize' : 12 },
    )
    sns.heatmap( correlations , **heatmap_options )
    
def describe_more( df ):
    """Summarise every column of ``df`` by its number of distinct values and its dtype.

    Returns:
        DataFrame with columns ``Variable`` (column label), ``Levels`` (count of
        distinct non-null values) and ``Datatype``, sorted by ``Levels`` ascending.
    """
    columns = list( df.columns )
    levels = pd.DataFrame( {
        'Variable' : columns ,
        # Series.nunique() ignores nulls, which matches the length of the default
        # value_counts() result used previously, without calling the deprecated
        # top-level pd.value_counts.
        'Levels' : [ df[ name ].nunique() for name in columns ] ,
        'Datatype' : [ df[ name ].dtypes for name in columns ] ,
    } )
    return levels.sort_values( by = 'Levels' )

def plot_variable_importance( X , y ):
    """Fit a decision tree on ``(X, y)`` and plot its feature importances.

    Args:
        X: feature DataFrame (its column labels are used by ``plot_model_var_imp``).
        y: target values accepted by ``DecisionTreeClassifier.fit``.
    """
    # Imported here because the import cell visible in this notebook does not bring
    # DecisionTreeClassifier into scope, so the bare name raised NameError.
    from sklearn.tree import DecisionTreeClassifier

    tree = DecisionTreeClassifier( random_state = 99 )
    tree.fit( X , y )
    plot_model_var_imp( tree , X , y )
    
def plot_model_var_imp( model , X , y ):
    """Plot the ten largest feature importances of a fitted model and print its score.

    Args:
        model: fitted estimator exposing ``feature_importances_`` and ``score``.
        X: feature DataFrame; its columns label the importances.
        y: targets passed to ``model.score`` together with ``X``.
    """
    imp = pd.DataFrame(
        model.feature_importances_ ,
        columns = [ 'Importance' ] ,
        index = X.columns
    )
    imp = imp.sort_values( [ 'Importance' ] , ascending = True )
    # After the ascending sort the most important features are the LAST rows, so take
    # the tail; slicing the head plotted the ten least important features whenever
    # there were more than ten. barh draws the last row at the top.
    imp.tail( 10 ).plot( kind = 'barh' )
    print (model.score( X , y ))


# Data Analysis

In [None]:
# List the column labels of the symbols metadata table and display them.
column_names = symbols_metadata_df.columns.tolist()
column_names

In [None]:
# Display the table scraped from the Nasdaq-100 Wikipedia page.
nasdaq100_meta

## Hyperparameters


In [None]:
# Training hyperparameters. LEARNING_RATE and the batch sizes are presumably consumed by
# the optimizer / DataLoader setup in a later cell — TODO confirm.
LEARNING_RATE = 1e-2
BATCH_SIZE = 4
TEST_BATCH_SIZE = 1
NUM_EPOCHS = 5  # number of epochs run by PyTorchPipeline.train_over_epochs
DEBUG = False  # when True, the dataset and LSTM classes print types/shapes on their first call

# Top-level folder name used when building checkpoint and prediction output paths.
dataset_name = "stock_closing_price"

# Per-epoch training averages; PyTorchPipeline.train_model appends one value per call.
train_mse_loss_avg_values = []
train_rmse_loss_avg_values = []
train_f1_avg_values = []

# Per-epoch validation averages; PyTorchPipeline.validate_model appends one value per call.
valid_mse_loss_avg_values = []
valid_rmse_loss_avg_values = []
valid_f1_avg_values = []

# Helper Function

In [None]:
def mkdir_prep_dir(dirpath):
    """Create ``dirpath`` (including missing parent directories) if needed.

    Args:
        dirpath: path of the directory to create.

    Returns:
        ``dirpath`` unchanged, so the call can be used inline when building paths.

    Raises:
        FileExistsError: if ``dirpath`` exists but is not a directory.
    """
    # exist_ok=True makes the call idempotent without a separate existence check,
    # which could race with another process creating the same directory.
    os.makedirs(dirpath, exist_ok=True)
    return dirpath

# PyTorch Functions & Classes

In [None]:
def save_close_price_mavg(stock_close_prices, mavg_n_days_close_prices, dst_folder="plots/data_prep/csv", filename="mavg_vs_close_prices.csv"):
    """Write closing prices next to their moving average as a two-column CSV.

    Args:
        stock_close_prices: values for the ``Close`` column.
        mavg_n_days_close_prices: values for the ``MAVG`` column.
        dst_folder: output directory, created if missing.
        filename: name of the CSV file written inside ``dst_folder``.

    Returns:
        The DataFrame that was written (columns ``Close`` and ``MAVG``).
    """
    mkdir_prep_dir(f"{dst_folder}")

    # Pair each closing price with the moving-average value supplied for it.
    comparison_df = pd.DataFrame({"Close": stock_close_prices, "MAVG": mavg_n_days_close_prices})
    comparison_df.to_csv(f"{dst_folder}/{filename}", index=False)
    return comparison_df

def save_checkpoint(state, filename="best_lstm.pth.tar"):
    """Announce the save on stdout, then serialise ``state`` to ``filename`` with ``torch.save``."""
    print("=> Saving Checkpoint")
    torch.save(obj=state, f=filename)

def rmse(y_true, y_pred):
    """Return the root-mean-square error between two tensors as a scalar tensor."""
    squared_error = (y_true - y_pred).pow(2)
    return squared_error.mean().sqrt()

# Based on perplexity.ai solution to torchmetrics.functional f1 lib incompatibility
# https://www.perplexity.ai/search/Stock-Recommender-System-BY7xxwMfT9Ohxqz5klk1BA?s=c
def f1_score(y_pred, y_true):
    """Compute a soft F1 score from two tensors using element-wise products.

    Note the argument order is ``(y_pred, y_true)``. Defining this function rebinds the
    name ``f1_score`` that the import cell also imports from ``sklearn.metrics``.

    Returns:
        Scalar float32 tensor; a small epsilon keeps every division finite.
    """
    epsilon = 1e-7

    def _total(product):
        # Sum all elements and express the count as float32.
        return product.sum().to(torch.float32)

    true_pos = _total(y_true * y_pred)
    false_pos = _total((1 - y_true) * y_pred)
    false_neg = _total(y_true * (1 - y_pred))

    precision = true_pos / (true_pos + false_pos + epsilon)
    recall = true_pos / (true_pos + false_neg + epsilon)
    return 2 * (precision * recall) / (precision + recall + epsilon)

class NasdaqTimeSeriesDataset(torch.utils.data.Dataset):
    """Dataset pairing closing-price feature rows with their labels.

    Item ``idx`` is ``(features[idx], labels[idx])``, each converted to a float tensor.
    When the module-level ``DEBUG`` flag is set, the first ``__getitem__`` call prints
    the index and the tensor shapes.
    """

    def __init__(self, stock_close_price_fets, stock_close_price_labels):
        self.stock_close_price_fets_ = stock_close_price_fets
        self.stock_close_price_labels_ = stock_close_price_labels
        # Number of __getitem__ calls served so far; gates the one-off debug output.
        self.count_ = 0

    def __len__(self):
        return len(self.stock_close_price_fets_)

    def __getitem__(self, idx):
        verbose = DEBUG and self.count_ == 0
        if verbose:
            print(f"NasdaqDataset __getitem__ idx = {idx}")

        # Feature row for this sample (the past closing prices).
        closing_price_features_tensor = torch.tensor(self.stock_close_price_fets_[idx]).float()
        if verbose:
            print(f"closing_price_features_tensor.shape = {closing_price_features_tensor.shape}")

        closing_price_labels_tensor = torch.tensor(self.stock_close_price_labels_[idx]).float()
        if verbose:
            print(f"closing_price_labels_tensor.shape = {closing_price_labels_tensor.shape}")

        self.count_ += 1
        return closing_price_features_tensor, closing_price_labels_tensor
    
# Ideally, I am thinking of parts of the preprocessing pipeline code, I can refactor toward nifi later
# Need a better telemetry dashboard to interact with NiFi: Unity, H2O Wave, PySide6 QT
class DataPrepPipeline:
    """Data-preparation helpers that turn a price DataFrame into LSTM features/labels."""

    def __init__(self):
        pass

    # Based on Greg Hogg's Stock Pred Video: https://youtu.be/q_HS4s1L8UI?si=CHWvbvfoPExQcALa
    def prepare_df_for_lstm(self, stock_df, n_steps):
        """Add ``n_steps`` lagged ``Close`` columns and index the frame by date.

        Example with a 10-day window: the first 10 closing prices are used to predict
        the 11th, days 2..11 are used to predict the 12th, and so on.

        Args:
            stock_df: DataFrame with at least ``Date`` and ``Close`` columns; it is
                deep-copied, so the caller's frame is left untouched.
            n_steps: number of lag columns ``Close(t-1)`` .. ``Close(t-n_steps)`` to add.

        Returns:
            A new DataFrame indexed by the parsed ``Date``, holding the original
            columns followed by the lag columns, with every row that contains a
            missing value dropped (this includes the first ``n_steps`` rows, whose
            lags do not exist).
        """
        stock_df = dc(stock_df)
        stock_df["Date"] = pd.to_datetime(stock_df["Date"])
        stock_df = stock_df.set_index("Date")

        # Build every lag column first and attach them with a single concat: inserting
        # them one at a time fragments the frame once the window reaches ~100 columns.
        close = stock_df["Close"]
        lag_columns = [close.shift(i).rename(f"Close(t-{i})") for i in range(1, n_steps + 1)]
        stock_df = pd.concat([stock_df] + lag_columns, axis=1)

        return stock_df.dropna()

    def split_features_labels(self, stock_features_labels_pd):
        """Split a 2-D array into features (all columns but the first) and labels (first column).

        The argument is indexed positionally with ``[:, 1:]`` and ``[:, 0]``, so it has
        to support NumPy-style 2-D slicing.

        Returns:
            ``(x_features, y_labels)``.
        """
        x_features = stock_features_labels_pd[:, 1:]
        y_labels = stock_features_labels_pd[:, 0]

        if DEBUG:
            print(f"x_features shape = {x_features.shape}; y_labels shape = {y_labels.shape}")
        return x_features, y_labels
    
class PyTorchPipeline:
    """Training, validation and inference loops for the closing-price models.

    Per-epoch metric averages are appended to the module-level
    ``train_*_avg_values`` / ``valid_*_avg_values`` lists instead of being returned.
    """

    def __init__(self):
        pass

    def train_model(self, train_loader, lstm_model, optimizer, loss_criterion):
        """Run one optimisation pass over ``train_loader``.

        Args:
            train_loader: iterable of ``(features, labels)`` tensor batches.
            lstm_model: the ``nn.Module`` being trained.
            optimizer: optimizer updating ``lstm_model``'s parameters.
            loss_criterion: loss called as ``loss_criterion(prediction, labels)``; its
                result is back-propagated.

        Appends the epoch's mean loss, RMSE and F1 to the module-level training lists.
        """
        # validate_model / deploy_model put the model in eval mode, so training mode
        # has to be re-enabled at the start of every pass (matters for dropout).
        lstm_model.train()
        train_loop = tqdm(train_loader)

        train_mse_loss_values = []
        train_rmse_loss_values = []
        train_f1_metric_values = []

        for batch_idx, (closing_price_2d_tensor, final_closing_price_tensor_gt) in enumerate(train_loop):
            # NOTE(review): if the model emits (batch, 1) while the labels are (batch,),
            # the loss and metrics below broadcast to (batch, batch) — confirm the
            # label shape produced by the dataset.
            optimizer.zero_grad()
            closing_price_2d_tensor = closing_price_2d_tensor.to(device)
            final_closing_price_tensor_gt = final_closing_price_tensor_gt.to(device)
            pred_final_closing_price = lstm_model(closing_price_2d_tensor)
            mse_loss = loss_criterion(pred_final_closing_price, final_closing_price_tensor_gt)
            rmse_loss = rmse(pred_final_closing_price, final_closing_price_tensor_gt)

            f1_metric = f1_score(pred_final_closing_price, final_closing_price_tensor_gt)

            mse_loss.backward()
            optimizer.step()

            train_mse_loss_values.append(mse_loss.item())
            train_rmse_loss_values.append(rmse_loss.item())
            train_f1_metric_values.append(f1_metric.item())

            train_loop.set_postfix(train_mse_loss=mse_loss.item(), train_rmse_loss=rmse_loss.item(), train_f1=f1_metric.item())

        train_mse_loss_avg_values.append( sum(train_mse_loss_values)/len(train_loader) )
        train_rmse_loss_avg_values.append( sum(train_rmse_loss_values)/len(train_loader) )
        train_f1_avg_values.append( sum(train_f1_metric_values)/len(train_loader) )

    def validate_model(self, val_loader, lstm_model, loss_criterion):
        """Evaluate ``lstm_model`` on ``val_loader`` without updating it.

        Appends the mean loss, RMSE and F1 over the loader to the module-level
        validation lists.
        """
        # Evaluation must not apply dropout, and no gradients are needed: skipping
        # autograd avoids building a graph for every validation batch.
        lstm_model.eval()
        val_loop = tqdm(val_loader)

        val_mse_loss_values = []
        val_rmse_loss_values = []
        val_f1_metric_values = []

        with torch.no_grad():
            for batch_idx, (closing_price_2d_tensor, final_closing_price_tensor_gt) in enumerate(val_loop):
                closing_price_2d_tensor = closing_price_2d_tensor.to(device)
                final_closing_price_tensor_gt = final_closing_price_tensor_gt.to(device)

                pred_final_closing_price = lstm_model(closing_price_2d_tensor)
                mse_loss = loss_criterion(pred_final_closing_price, final_closing_price_tensor_gt)
                rmse_loss = rmse(pred_final_closing_price, final_closing_price_tensor_gt)

                f1_metric = f1_score(pred_final_closing_price, final_closing_price_tensor_gt)

                val_mse_loss_values.append(mse_loss.item())
                val_rmse_loss_values.append(rmse_loss.item())
                val_f1_metric_values.append(f1_metric.item())

                val_loop.set_postfix(val_mse_loss=mse_loss.item(), val_rmse_loss=rmse_loss.item(), val_f1=f1_metric.item())

        valid_mse_loss_avg_values.append( sum(val_mse_loss_values)/len(val_loader) )
        valid_rmse_loss_avg_values.append( sum(val_rmse_loss_values)/len(val_loader) )
        valid_f1_avg_values.append( sum(val_f1_metric_values)/len(val_loader) )

    def train_over_epochs(self, train_loader, test_loader, lstm_model, optimizer, loss_criterion, dst_folder="lstm_model"):
        """Train for ``NUM_EPOCHS`` epochs, checkpointing and validating after each one.

        After every training pass the model and optimizer state dicts are saved to
        ``{dataset_name}/{dst_folder}/lstm_model_{step}.pth.tar`` where ``step`` starts
        at 100 and grows by 100 per epoch; the model is then validated on
        ``test_loader``.
        """
        step = 100
        for epoch in range(NUM_EPOCHS):
            self.train_model(train_loader, lstm_model, optimizer, loss_criterion)

            checkpoint = {
                "state_dict": lstm_model.state_dict(),
                "optimizer": optimizer.state_dict(),
            }

            mkdir_prep_dir(f"{dataset_name}/{dst_folder}")
            filename = f"{dataset_name}/{dst_folder}/lstm_model_{step}.pth.tar"
            save_checkpoint(checkpoint, filename=filename)

            self.validate_model(test_loader, lstm_model, loss_criterion)

            step += 100

    def deploy_model(self, test_loader, lstm_model):
        """Run inference over ``test_loader`` and collect predictions with ground truth.

        Each batch must contain exactly one sample, because both the prediction and
        the label are read with ``.item()``.

        Returns:
            ``(ground_truths, predictions)`` as two lists of Python floats.
        """
        # Inference only: disable dropout and gradient tracking.
        lstm_model.eval()
        test_loop = tqdm(test_loader)
        final_closing_price_predictions = []
        final_closing_price_gts = []

        with torch.no_grad():
            for batch_idx, (closing_price_2d_tensor, final_closing_price_tensor) in enumerate(test_loop):
                closing_price_2d_tensor = closing_price_2d_tensor.to(device)
                final_closing_price_pred = lstm_model(closing_price_2d_tensor)
                final_closing_price_predictions.append(final_closing_price_pred.item())
                final_closing_price_gts.append(final_closing_price_tensor.item())
        return final_closing_price_gts, final_closing_price_predictions

    def test_dataloader_sample(self, data_loader):
        """Print the feature and label shapes of the first batch of ``data_loader``."""
        for batch_idx, (closing_price_features_tensor, closing_price_labels_tensor) in enumerate(data_loader):
            closing_price_features_tensor = closing_price_features_tensor.to(device)
            closing_price_labels_tensor = closing_price_labels_tensor.to(device)
            print(f"close price features shape: {closing_price_features_tensor.shape}; close price labels shape: {closing_price_labels_tensor.shape}")
            break
    
class PyTorchUtility:
    """Plotting and CSV-export helpers for training curves and model predictions."""

    def __init__(self):
        pass

    def _plot_curve_pair(self, first_values, first_label, second_values, second_label,
                         plot_title, x_label, y_label, plot_filename):
        """Plot two series (blue first, gold second) on one axes, save the figure and show it."""
        f, ax = plt.subplots()
        ax.set_title(plot_title)
        ax.plot(first_values, color="blue", label=first_label)
        ax.plot(second_values, color="gold", label=second_label)
        ax.grid(True)
        ax.set_xlabel(x_label)
        ax.set_ylabel(y_label)
        ax.legend()
        f.savefig(plot_filename)
        plt.show()

    def plot_loss_curves(self, train_loss_values, test_loss_values, plot_title, plot_filename,
                         train_label="Train RMSE", valid_label="Valid RMSE", y_label="RMSE Loss"):
        """Plot per-epoch training and validation loss curves and save them to ``plot_filename``.

        Args:
            train_loss_values: per-epoch training values (drawn in blue).
            test_loss_values: per-epoch validation values (drawn in gold).
            plot_title: figure title.
            plot_filename: path the figure is saved to.
            train_label / valid_label: legend entries for the two curves.
            y_label: y-axis label. The defaults describe RMSE; override them when
                plotting another loss.
        """
        self._plot_curve_pair(train_loss_values, train_label, test_loss_values, valid_label,
                              plot_title, "Epochs", y_label, plot_filename)

    def plot_gt_pred_curves(self, final_closing_price_gts, final_closing_price_predictions, plot_title, plot_filename):
        """Plot predicted prices (blue) against ground-truth prices (gold) and save the figure."""
        self._plot_curve_pair(final_closing_price_predictions, "Pred Price",
                              final_closing_price_gts, "GT Price",
                              plot_title, "Time", "Price", plot_filename)

    def plot_f1_curves(self, train_f1_values, test_f1_values, plot_title, plot_filepath):
        """Plot per-epoch training and validation F1 curves and save them to ``plot_filepath``."""
        self._plot_curve_pair(train_f1_values, "Train F1", test_f1_values, "Valid F1",
                              plot_title, "Epochs", "F1 Score", plot_filepath)

    # TODO (JG): Need to check which dates the num_days predictions for the closing price
    # are supposed to cover.
    def save_lstm_predictions(self, final_closing_price_dates, final_closing_price_predictions, scale, num_days, dst_folder="lstm_model", filename="pytorch_final_closing_price_preds.csv"):
        """Write predicted closing prices with their dates to a CSV file.

        Args:
            final_closing_price_dates: values for the date column.
            final_closing_price_predictions: predicted closing prices (``Close`` column).
            scale: currently unused; kept so existing callers keep working.
            num_days: window length, used only to name the date column
                (``"{num_days}th Date"``).
            dst_folder: sub-folder of ``dataset_name`` that receives the file.
            filename: name of the CSV file.

        Returns:
            The DataFrame that was written.
        """
        mkdir_prep_dir(f"{dataset_name}/{dst_folder}")
        dst_filepath = f"{dataset_name}/{dst_folder}/{filename}"

        stock_close_price_res_df = pd.DataFrame({f"{num_days}th Date": final_closing_price_dates, "Close": final_closing_price_predictions})
        stock_close_price_res_df.to_csv(dst_filepath, index=False)
        return stock_close_price_res_df
    
# BEST Results so far with regards to MSE, RMSE, etc
class StockClosePriceLSTM(nn.Module):
    """Stacked LSTM followed by a linear head that predicts one value per sequence.

    Args:
        input_size: number of features per time step.
        hidden_size: LSTM hidden-state width.
        num_stacked_layers: number of stacked LSTM layers.
        dropout_ratio: dropout applied between stacked LSTM layers; it only takes
            effect when ``num_stacked_layers > 1``.
    """

    def __init__(self, input_size, hidden_size=10, num_stacked_layers=1, dropout_ratio=0.2):
        super(StockClosePriceLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_stacked_layers = num_stacked_layers
        # nn.LSTM applies dropout after every layer except the last, so with a single
        # layer a non-zero value does nothing and only triggers a UserWarning.
        lstm_dropout = dropout_ratio if num_stacked_layers > 1 else 0.0
        self.lstm = nn.LSTM(input_size, hidden_size, num_stacked_layers, batch_first=True,
                            dropout=lstm_dropout, bias=True)

        self.fc = nn.Linear(hidden_size, 1)

        # Number of forward passes so far; gates the one-off debug output.
        self.count_ = 0

    def forward(self, closing_price_2d_features):
        """Return a ``(batch, 1)`` prediction for a batch-first sequence tensor.

        The LSTM is built with ``batch_first=True`` and the output of the last time
        step is fed to the linear head.
        """
        verbose = DEBUG and self.count_ == 0
        if verbose:
            print(f"type(closing_price_2d_features) = {type(closing_price_2d_features)}")
        batch_size = closing_price_2d_features.size(0)
        if verbose:
            print(f"closing_price_2d_features batch_size = {batch_size}")

        # Zero initial states created on the input's own device/dtype, so the model
        # does not depend on the notebook-level `device` variable.
        state_shape = (self.num_stacked_layers, batch_size, self.hidden_size)
        h0 = torch.zeros(state_shape, device=closing_price_2d_features.device,
                         dtype=closing_price_2d_features.dtype)
        c0 = torch.zeros(state_shape, device=closing_price_2d_features.device,
                         dtype=closing_price_2d_features.dtype)

        lstm_features, lstm_hidden = self.lstm(closing_price_2d_features, (h0, c0))
        # decode hidden state of the last time step
        lstm_features = lstm_features[:, -1, :]
        if verbose:
            print(f"type(lstm_features) = {type(lstm_features)}")
        stock_price_pred = self.fc(lstm_features)

        self.count_ += 1
        return stock_price_pred
    
# NOTE: Not so great results when trying multiple LSTM layers with varying hidden features
class StockPriceLSTMV2(nn.Module):
    """Chain of single-layer LSTMs with growing hidden sizes and a linear head.

    Args:
        input_size: number of features per time step.
        hidden_sizes: hidden width of each LSTM in the chain; layer ``i`` maps
            ``hidden_sizes[i-1]`` to ``hidden_sizes[i]``.
        num_stacked_layers: how many LSTMs to chain (at most ``len(hidden_sizes)``).
        dropout_ratio: dropout applied to the output of every LSTM except the last.
        num_classes: number of outputs of the linear head.

    Raises:
        ValueError: if ``num_stacked_layers`` is below 1 or exceeds ``len(hidden_sizes)``.
    """

    def __init__(self, input_size, hidden_sizes=(10, 20, 30, 40, 50), num_stacked_layers=5, dropout_ratio=0.3, num_classes=1):
        super(StockPriceLSTMV2, self).__init__()
        # Copy into a list: avoids sharing a mutable default between instances.
        self.hidden_sizes = list(hidden_sizes)
        self.num_stacked_layers = num_stacked_layers
        if not 1 <= num_stacked_layers <= len(self.hidden_sizes):
            raise ValueError(
                f"num_stacked_layers must be between 1 and len(hidden_sizes)="
                f"{len(self.hidden_sizes)}, got {num_stacked_layers}"
            )

        last_layer = num_stacked_layers - 1
        self.lstm_list = nn.ModuleList()
        for i in range(num_stacked_layers):
            layer_input_size = input_size if i == 0 else self.hidden_sizes[i-1]
            if 0 < i < last_layer:
                print(f"Not on last LSTM layer {i}, adding dropout = {dropout_ratio}")
            elif i > 0:
                print(f"On last LSTM layer {i}, ignoring dropout = {dropout_ratio}")
            # Each LSTM here has a single layer, for which nn.LSTM's own `dropout`
            # argument is a no-op (plus a UserWarning); dropout is applied explicitly
            # between layers in forward() instead.
            self.lstm_list.append( nn.LSTM(layer_input_size, self.hidden_sizes[i],
                                           batch_first=True, bias=True) )

        self.inter_layer_dropout = nn.Dropout(dropout_ratio)

        # Size the head from the last LSTM actually built, which differs from
        # hidden_sizes[-1] when fewer layers than hidden sizes are requested.
        self.fc = nn.Linear(self.hidden_sizes[last_layer], num_classes)
        # Number of forward passes so far; gates the one-off debug output.
        self.count_ = 0

    def forward(self, stock_price_features):
        """Return a ``(batch, num_classes)`` prediction for a batch-first sequence tensor."""
        verbose = DEBUG and self.count_ == 0
        if verbose:
            print(f"type(stock_price_features) = {type(stock_price_features)}")
        batch_size = stock_price_features.size(0)
        if verbose:
            print(f"stock_price_features batch_size = {batch_size}")

        last_layer = self.num_stacked_layers - 1
        for i in range(self.num_stacked_layers):
            # Omitting the initial state makes nn.LSTM start from zeros, which is what
            # the explicit zero tensors did before.
            stock_price_features, _ = self.lstm_list[i](stock_price_features)
            if i < last_layer:
                stock_price_features = self.inter_layer_dropout(stock_price_features)

        # Keep only the output of the last time step.
        stock_price_features = stock_price_features[:, -1, :]
        if verbose:
            print(f"type(stock_price_features) = {type(stock_price_features)}")
        stock_price_pred = self.fc(stock_price_features)

        self.count_ += 1
        return stock_price_pred
    

## TensorFlow Keras Functions & Classes

In [None]:
class Time2Vector(Layer):
    """Time embedding layer producing one linear and one periodic (sine) feature.

    Args:
        sequence: length of each of the four weight/bias vectors; it is multiplied
            against the time axis of the input, so it presumably has to equal the
            input sequence length — TODO confirm against the caller.
        **kwargs: standard Keras layer arguments (``name``, ``dtype``, ...).
    """

    def __init__(self, sequence, **kwargs):
        # Forward the base-layer arguments; dropping them loses e.g. the layer name
        # when the layer is rebuilt from its config.
        super(Time2Vector, self).__init__(**kwargs)
        self.sequence = sequence

    def build(self, input_shape):
        """Create the linear and periodic weight/bias vectors of length ``sequence``."""
        self.weights_linear = self.add_weight(name='weight_linear',
                                    shape=(int(self.sequence),),
                                    initializer='uniform',
                                    trainable=True)

        self.bias_linear = self.add_weight(name='bias_linear',
                                    shape=(int(self.sequence),),
                                    initializer='uniform',
                                    trainable=True)

        self.weights_periodic = self.add_weight(name='weight_periodic',
                                    shape=(int(self.sequence),),
                                    initializer='uniform',
                                    trainable=True)

        self.bias_periodic = self.add_weight(name='bias_periodic',
                                    shape=(int(self.sequence),),
                                    initializer='uniform',
                                    trainable=True)

    def call(self, x):
        """Return the two time features stacked on a new last axis."""
        # Collapse the first four features of every time step into their mean.
        x = tf.math.reduce_mean(x[:,:,:4], axis=-1)
        time_linear = self.weights_linear * x + self.bias_linear
        time_linear = tf.expand_dims(time_linear, axis=-1)

        time_periodic = tf.math.sin(tf.multiply(x, self.weights_periodic) + self.bias_periodic)
        time_periodic = tf.expand_dims(time_periodic, axis=-1)
        return tf.concat([time_linear, time_periodic], axis=-1)

    def get_config(self):
        """Return the layer config including ``sequence`` so the layer can be re-created."""
        config = super().get_config().copy()
        config.update({'sequence': self.sequence})
        return config

class SingleAttention(Layer):
    """Single scaled dot-product attention head.

    Args:
        d_k: output width of the query and key projections.
        d_v: output width of the value projection.
    """

    def __init__(self, d_k, d_v):
        super(SingleAttention, self).__init__()
        self.d_k = d_k
        self.d_v = d_v

    def build(self, input_shape):
        """Create the query, key and value projection layers."""
        self.query = Dense(self.d_k,
                           input_shape=input_shape,
                           kernel_initializer='glorot_uniform',
                           bias_initializer='glorot_uniform')

        self.key = Dense(self.d_k,
                         input_shape=input_shape,
                         kernel_initializer='glorot_uniform',
                         bias_initializer='glorot_uniform')

        self.value = Dense(self.d_v,
                           input_shape=input_shape,
                           kernel_initializer='glorot_uniform',
                           bias_initializer='glorot_uniform')

    def call(self, inputs): # inputs = (in_seq, in_seq, in_seq)
        """Attend over ``inputs = (query_source, key_source, value_source)``."""
        q = self.query(inputs[0])
        k = self.key(inputs[1])

        attn_weights = tf.matmul(q, k, transpose_b=True)
        # Scale by sqrt(d_k) with a plain broadcast division; mapping a lambda over
        # the batch with tf.map_fn computed the same values far more slowly.
        attn_weights = attn_weights / np.sqrt(self.d_k)
        attn_weights = tf.nn.softmax(attn_weights, axis=-1)

        v = self.value(inputs[2])
        attn_out = tf.matmul(attn_weights, v)
        return attn_out

class MultiAttention(Layer):
    """Multi-head attention: ``n_heads`` SingleAttention heads concatenated and projected.

    Args:
        d_k: query/key projection width of every head.
        d_v: value projection width of every head.
        n_heads: number of attention heads.
    """

    def __init__(self, d_k, d_v, n_heads):
        super().__init__()
        self.d_k = d_k
        self.d_v = d_v
        self.n_heads = n_heads
        self.attn_heads = []

    def build(self, input_shape):
        """Create the heads and the output projection back to the input feature width."""
        for _ in range(self.n_heads):
            head = SingleAttention(self.d_k, self.d_v)
            self.attn_heads.append(head)

        # input_shape is the shape of the (q, k, v) tuple; the projection width is the
        # last dimension of the first element.
        feature_width = input_shape[0][-1]
        self.linear = Dense(feature_width,
                            input_shape=input_shape,
                            kernel_initializer='glorot_uniform',
                            bias_initializer='glorot_uniform')

    def call(self, inputs):
        """Run every head on ``inputs``, concatenate on the last axis and project."""
        head_outputs = []
        for head_index in range(self.n_heads):
            head_outputs.append(self.attn_heads[head_index](inputs))
        return self.linear(tf.concat(head_outputs, axis=-1))

class TransformerEncoder(Layer):
    """Encoder block: multi-head attention followed by a two-layer 1x1-conv feed-forward.

    Args:
        d_k: query/key projection width of each attention head.
        d_v: value projection width of each attention head.
        n_heads: number of attention heads.
        ff_dim: number of filters of the first feed-forward convolution.
        dropout: dropout rate used after attention and after the feed-forward part.
        **kwargs: standard Keras layer arguments (``name``, ``dtype``, ...).
    """

    def __init__(self, d_k, d_v, n_heads, ff_dim, dropout=0.1, **kwargs):
        # Configs written by the earlier get_config carried 'dropout_rate' and
        # 'attn_heads', which are not base-layer arguments: strip them so those
        # configs still load, and honour the stored dropout rate.
        legacy_dropout_rate = kwargs.pop('dropout_rate', None)
        kwargs.pop('attn_heads', None)
        super(TransformerEncoder, self).__init__(**kwargs)
        self.d_k = d_k
        self.d_v = d_v
        self.n_heads = n_heads
        self.ff_dim = ff_dim
        self.attn_heads = list()  # not used by this class; kept for compatibility
        self.dropout_rate = dropout if legacy_dropout_rate is None else legacy_dropout_rate

    def build(self, input_shape):
        """Create the attention, feed-forward, dropout and normalisation sub-layers."""
        self.attn_multi = MultiAttention(self.d_k, self.d_v, self.n_heads)
        self.attn_dropout = Dropout(self.dropout_rate)
        self.attn_normalize = LayerNormalization(input_shape=input_shape, epsilon=1e-6)

        self.ff_conv1D_1 = Conv1D(filters=self.ff_dim, kernel_size=1, activation='relu')
        # Project back to the feature width of the first input so the residual adds up.
        self.ff_conv1D_2 = Conv1D(filters=input_shape[0][-1], kernel_size=1)
        self.ff_dropout = Dropout(self.dropout_rate)
        self.ff_normalize = LayerNormalization(input_shape=input_shape, epsilon=1e-6)

    def call(self, inputs):
        """Apply the block to ``inputs = (q_source, k_source, v_source)``."""
        attn_layer = self.attn_multi(inputs)
        attn_layer = self.attn_dropout(attn_layer)
        attn_layer = self.attn_normalize(inputs[0] + attn_layer)

        ff_layer = self.ff_conv1D_1(attn_layer)
        ff_layer = self.ff_conv1D_2(ff_layer)
        ff_layer = self.ff_dropout(ff_layer)
        # NOTE(review): this residual adds the block INPUT rather than the attention
        # output (attn_layer); left unchanged so existing results stay comparable —
        # confirm whether that is intended.
        ff_layer = self.ff_normalize(inputs[0] + ff_layer)
        return ff_layer

    def get_config(self):
        """Return a config whose keys match the constructor arguments."""
        config = super().get_config().copy()
        config.update({'d_k': self.d_k,
                       'd_v': self.d_v,
                       'n_heads': self.n_heads,
                       'ff_dim': self.ff_dim,
                       # Stored under the constructor's parameter name so that the
                       # rate survives a from_config round trip.
                       'dropout': self.dropout_rate})
        return config

def create_model(n_features=6):
    """Build and compile the time-embedding Transformer regressor.

    The window length and the attention sizes are read at call time from the
    module-level names ``sequence``, ``d_k``, ``d_v``, ``n_heads`` and
    ``ff_dim``, so the hyperparameter cell must have been executed first.

    Args:
        n_features: number of input features per time step. Defaults to 6,
            the value that was previously hard-coded.

    Returns:
        A compiled Keras ``Model`` taking ``(sequence, n_features)`` inputs and
        producing one linear output, trained with MSE loss and the Adam
        optimizer and reporting MAE and MAPE.
    """
    time_embedding = Time2Vector(sequence)
    attn_layer1 = TransformerEncoder(d_k, d_v, n_heads, ff_dim)
    attn_layer2 = TransformerEncoder(d_k, d_v, n_heads, ff_dim)

    in_seq = Input(shape=(sequence, n_features))
    # The time embedding is appended to the raw features along the last axis.
    x = time_embedding(in_seq)
    x = Concatenate(axis=-1)([in_seq, x])
    # Self-attention: the same tensor is passed three times.
    x = attn_layer1((x, x, x))
    x = attn_layer2((x, x, x))

    # NOTE(review): with data_format='channels_first' Keras pools over the
    # last axis, which here looks like the feature axis rather than time --
    # confirm this is intended.
    x = GlobalAveragePooling1D(data_format='channels_first')(x)
    x = Dropout(0.1)(x)
    x = Dense(64, activation='relu')(x)
    x = Dropout(0.1)(x)
    out = Dense(1, activation='linear')(x)

    model = Model(inputs=in_seq, outputs=out)
    model.compile(loss='mse', optimizer='adam', metrics=['mae', 'mape'])
    return model

## Train & Test PyTorch LSTM Model 

Train and evaluate the PyTorch LSTM model on stock closing-price prediction, one ticker at a time.

In [None]:
# Rolling-window lengths (in rows) for the two moving averages below.
one_hundred_num_days = 100
two_hundred_num_days = 200
# Rolling means of the Close column for the frame stored under key 'A'.
# NOTE(review): the variables are named "apple" but the key is 'A', not 'AAPL' -- confirm.
apple_mavg_100_days = stock_dfs['A'].Close.rolling(one_hundred_num_days).mean()
apple_mavg_200_days = stock_dfs['A'].Close.rolling(two_hundred_num_days).mean()

# Split fractions. valid_ratio is cumulative: the LSTM training cell uses it
# as the end position of the validation slice, so validation covers the
# range train_ratio..valid_ratio and the remainder is the test slice.
train_ratio = 0.60
valid_ratio = 0.20 + train_ratio # 0.80
test_ratio = 0.20

# Window length passed as n_steps to prepare_df_for_lstm and used as the
# rolling window of the moving-average export.
lookback = 100

# Shared scaler; the LSTM training cell re-fits it for every ticker.
scaler = MinMaxScaler(feature_range=(0,1))

In [None]:
# Ticker symbols (keys of stock_dfs) that the export and training loops iterate over.
tickers = ['AAPL', 'ACH', 'ACGL', 'ACER', 'ABT']

In [None]:
# Prepare the output folder for the per-ticker CSVs that compare the closing
# price with its 100-day moving average (written by the export loop below).
# mkdir_prep_dir is defined elsewhere in the notebook; presumably it creates
# the directory if needed -- confirm.
# mkdir_prep_dir("/kaggle/working/stock_closing_price/data_explore/plots")
mkdir_prep_dir("/kaggle/working/stock_closing_price/data_explore/mavg100_close_price")

In [None]:
# Export, for every selected ticker, the closing price next to its rolling
# mean over `lookback` rows into the data-explore folder.
mavg100_dst_folder = "/kaggle/working/stock_closing_price/data_explore/mavg100_close_price"
for ticker_stock in tickers:
    stock_close_prices = stock_dfs[ticker_stock].Close
    mavg_100_days = stock_dfs[ticker_stock].Close.rolling(lookback).mean()
    mavg100_filename = f"{ticker_stock}_mavg100_vs_close_prices.csv"
    save_close_price_mavg(
        stock_close_prices,
        mavg_100_days,
        dst_folder=mavg100_dst_folder,
        filename=mavg100_filename,
    )

In [None]:
# Train one LSTM per ticker on windows of `lookback` closing prices, then
# export the test-set predictions converted back to price units.
nifi_pipeline = DataPrepPipeline()
pytorch_pipeline = PyTorchPipeline()
pytorch_utility = PyTorchUtility()

for ticker_stock in tickers:
    print(f"Training PyTorch LSTM on {ticker_stock}")
    ticker_pd = stock_dfs[ticker_stock]
    ticker_pd = ticker_pd[["Date", "Close"]]
    # Build the windowed frame (label plus `lookback` lagged values).
    ticker_stock_fets_labels_df = nifi_pipeline.prepare_df_for_lstm(ticker_pd, n_steps=lookback)
    # NOTE(review): the scaler is fitted on the full series, including the
    # validation and test rows -- confirm this leakage is acceptable.
    ticker_pd_scale = scaler.fit_transform(ticker_stock_fets_labels_df)
    x_stock_fets_list, y_stock_labels_list = nifi_pipeline.split_features_labels(ticker_pd_scale)

    # Chronological split: 0-60% train, 60-80% validation, 80-100% test.
    train_split_index = int(len(x_stock_fets_list) * train_ratio)
    valid_split_index = int(len(x_stock_fets_list)*valid_ratio)
    test_split_index = len(x_stock_fets_list)

    X_ticker_stock_train = x_stock_fets_list[0: train_split_index]
    X_ticker_stock_valid = x_stock_fets_list[train_split_index: valid_split_index]
    X_ticker_stock_test = x_stock_fets_list[valid_split_index: test_split_index]

    y_ticker_stock_train = y_stock_labels_list[0: train_split_index]
    y_ticker_stock_valid = y_stock_labels_list[train_split_index: valid_split_index]
    y_ticker_stock_test = y_stock_labels_list[valid_split_index: test_split_index]

    # NOTE(review): these positions are computed on the windowed arrays but
    # applied to the index of the un-windowed frame; if prepare_df_for_lstm
    # drops leading rows the dates are offset from the samples -- confirm.
    Date_ticker_stock_train = ticker_pd.index[0: train_split_index]
    Date_ticker_stock_valid = ticker_pd.index[train_split_index: valid_split_index]
    Date_ticker_stock_test = ticker_pd.index[valid_split_index: test_split_index]

    # Add the trailing feature dimension expected by the LSTM: (samples, lookback, 1).
    X_ticker_stock_train = X_ticker_stock_train.reshape( (-1, lookback, 1) )
    X_ticker_stock_valid = X_ticker_stock_valid.reshape( (-1, lookback, 1) )
    X_ticker_stock_test = X_ticker_stock_test.reshape( (-1, lookback, 1) )

    y_ticker_stock_train = y_ticker_stock_train.reshape( (-1, 1) )
    y_ticker_stock_valid = y_ticker_stock_valid.reshape( (-1, 1) )
    y_ticker_stock_test = y_ticker_stock_test.reshape( (-1, 1) )

    stock_train_dataset = NasdaqTimeSeriesDataset(X_ticker_stock_train, y_ticker_stock_train)
    stock_valid_dataset = NasdaqTimeSeriesDataset(X_ticker_stock_valid, y_ticker_stock_valid)
    stock_test_dataset = NasdaqTimeSeriesDataset(X_ticker_stock_test, y_ticker_stock_test)

    # Only the training loader is shuffled; validation and test keep time order.
    stock_train_loader = DataLoader(stock_train_dataset, batch_size = BATCH_SIZE, shuffle=True)
    stock_valid_loader = DataLoader(stock_valid_dataset, batch_size = TEST_BATCH_SIZE, shuffle=False)
    stock_test_loader = DataLoader(stock_test_dataset, batch_size = TEST_BATCH_SIZE, shuffle=False)

    # A fresh model and optimizer per ticker.
    lstm_model = StockClosePriceLSTM(input_size=1, hidden_size=10, num_stacked_layers=1, dropout_ratio=0.3)
    lstm_model.to(device)
    mse_criterion = nn.MSELoss()
    lstm_optimizer = optim.Adam(lstm_model.parameters(), lr=LEARNING_RATE)

    pytorch_pipeline.train_over_epochs(
        stock_train_loader, stock_valid_loader,
        lstm_model,
        lstm_optimizer,
        mse_criterion,
        dst_folder="lstm_stock_close"
    )

    # Get predictions
    final_closing_price_gts, final_closing_price_predictions = pytorch_pipeline.deploy_model(stock_test_loader, lstm_model)

    # With feature_range=(0, 1) MinMaxScaler maps x -> (x - data_min_) * scale_,
    # so the inverse is x_scaled / scale_ + data_min_. Multiplying by 1/scale_
    # alone leaves every price shifted down by the column minimum.
    # Column 0 of the scaled frame is used as the label column, as before.
    unscale = 1/scaler.scale_
    close_price_offset = np.array([scaler.data_min_[0]])
    final_closing_price_unscaled_preds = final_closing_price_predictions*np.array([unscale[0]]) + close_price_offset
    final_closing_price_unscaled_gts = final_closing_price_gts*np.array([unscale[0]]) + close_price_offset

    pytorch_utility.save_lstm_predictions(Date_ticker_stock_test, final_closing_price_unscaled_preds, np.array([unscale[0]]),
                                       num_days=100, dst_folder="predictions/", 
                                       filename=f"{ticker_stock}_pytorch_final_closing_price_preds.csv")

# Train & Test TensorFlow Keras Transformer Model

Train and evaluate the TensorFlow Keras Transformer model on stock closing-price prediction, one ticker at a time.

In [None]:
# Hyperparameters read by create_model() and the Transformer training loop.
# Window length: rows per input sample (prepare_data) and the model's input length.
sequence = 128
# Mini-batch size passed to model.fit.
batch_size = 128
# Attention sizes forwarded to TransformerEncoder / MultiAttention.
d_k = 64
d_v = 64
n_heads = 4
# Number of filters of the first feed-forward convolution in each encoder block.
ff_dim = 64

In [None]:
## Transformers Utils

def prepare_data(data, sequence, target_col=3):
    """Slice a 2-D array into sliding windows and next-step targets.

    For every row position ``i`` from ``sequence`` up to the last row, the
    sample is ``data[i - sequence:i]`` and the target is
    ``data[i, target_col]``.

    Args:
        data: 2-D array-like of shape ``(rows, features)``.
        sequence: number of consecutive rows in each input window.
        target_col: column position of the value to predict. Defaults to 3,
            the position that was previously hard-coded.

    Returns:
        ``(X, y)`` as NumPy arrays. With more than ``sequence`` rows, ``X``
        has shape ``(rows - sequence, sequence, features)`` and ``y`` has
        shape ``(rows - sequence,)``; otherwise both are empty.
    """
    data = np.asarray(data)
    windows = [data[end - sequence:end] for end in range(sequence, len(data))]
    # All targets are the target column from row `sequence` onwards, so one
    # slice replaces re-slicing the column inside the loop.
    targets = data[sequence:, target_col]
    return np.array(windows), np.array(targets)

def save_tfms_predictions(stock_test_df, final_closing_price_predictions, dst_folder="lstm_model", filename="pytorch_final_closing_price_preds.csv"):
    """Attach predictions to ``stock_test_df`` as ``Close`` and write it to CSV.

    The file is written without the index to
    ``{dataset_name}/{dst_folder}/{filename}``; ``dataset_name`` and
    ``mkdir_prep_dir`` come from elsewhere in the notebook. The frame passed
    in is modified in place and also returned.
    """
    output_dir = f"{dataset_name}/{dst_folder}"
    mkdir_prep_dir(output_dir)

    # One predicted closing price per remaining test row.
    stock_test_df['Close'] = final_closing_price_predictions

    stock_test_df.to_csv(f"{dataset_name}/{dst_folder}/{filename}", index=False)
    print("\nExported to csv successfully...")
    return stock_test_df


In [None]:
# Train one Transformer per ticker on 100-row rolling means of the price
# columns, then export the test-set closing-price predictions in price units.
for ticker_stock in tickers:
    # Work on a copy: rename() returns a new frame, so stock_dfs keeps its
    # 'Date' column and raw prices. The cell can then be re-run, and cells
    # that read stock_dfs[...]["Date"] keep working.
    ticker_pd = (
        stock_dfs[ticker_stock]
        .rename(columns={'Adj Close': 'Adj_Close'})
        .set_index('Date')
        .sort_index()
    )

    # Smooth every price/volume column with a 100-row rolling mean and drop
    # the warm-up rows that have no full window yet.
    cols = ['Open', 'High', 'Low', 'Close', 'Adj_Close', 'Volume']
    ticker_pd[cols] = ticker_pd[cols].rolling(100).mean()
    ticker_pd = ticker_pd.dropna(axis=0)

    # NOTE(review): the scaler is fitted on the whole series, including the
    # validation and test rows -- confirm this leakage is acceptable.
    scaler = MinMaxScaler().set_output(transform="pandas")
    ticker_pd = scaler.fit_transform(ticker_pd)

    # Chronological 80 / 10 / 10 split.
    train_end = int(0.8*len(ticker_pd))
    val_end = int(0.9*len(ticker_pd))
    train_df = ticker_pd.iloc[: train_end]
    val_df = ticker_pd.iloc[train_end: val_end]
    test_df = ticker_pd.iloc[val_end: ]
    print(train_df.shape, val_df.shape, test_df.shape)

    # Preparing model input and labels
    X_train, y_train = prepare_data(train_df.values, sequence)
    X_val, y_val = prepare_data(val_df.values, sequence)
    X_test, y_test = prepare_data(test_df.values , sequence)

    print('Training set shape: ', X_train.shape, y_train.shape)
    print('Validation set shape: ', X_val.shape, y_val.shape)
    print('Testing set shape: ' ,X_test.shape, y_test.shape)

    # Keep only the checkpoint with the best validation loss and reload it
    # before predicting.
    t_model = create_model()
    callback = tf.keras.callbacks.ModelCheckpoint('Transformer_timeEmbd.hdf5', monitor = 'val_loss', save_best_only = True, verbose = 1)
    history = t_model.fit(X_train, y_train, batch_size = batch_size, epochs = 20, callbacks = [callback], validation_data = (X_val, y_val))
    t_model = tf.keras.models.load_model('Transformer_timeEmbd.hdf5', 
                                         custom_objects={'Time2Vector': Time2Vector, 
                                                         'SingleAttention': SingleAttention, 
                                                         'MultiAttention': MultiAttention,
                                                         'TransformerEncoder': TransformerEncoder})

    test_predictions = t_model.predict(X_test)
    # Undo the min-max scaling of the Close column: x = x_scaled * (max - min) + min.
    # min must come from data_min_ and max from data_max_; swapping them
    # mirrors the predictions around the price range.
    close_idx = list(scaler.feature_names_in_).index("Close")
    min_close, max_close = scaler.data_min_[close_idx], scaler.data_max_[close_idx]
    test_predictions = test_predictions * (max_close - min_close) + min_close

    # The first `sequence` test rows only serve as history for the first
    # prediction, so the exported dates start after them.
    test_df = test_df.reset_index()
    save_tfms_predictions(test_df[['Date']].iloc[sequence:].copy(), test_predictions, 
                          dst_folder="predictions/", 
                          filename=f"{ticker_stock}_transformers_final_closing_price_preds.csv")

# Stock Technical Indicator Helper Functions

In [None]:
def MACDecision(stock_price_pred_df, model_strategy_type_used):
    """Add MACD-difference and MACD buy-signal columns to the frame in place.

    ``MACD_diff_<model>`` holds ``ta.trend.macd_diff`` of the ``Close``
    column. ``Decision MACD_<model>`` is True on rows where that value is
    positive while the previous row's value was negative.
    """
    diff_col = f"MACD_diff_{model_strategy_type_used}"
    decision_col = f"Decision MACD_{model_strategy_type_used}"

    stock_price_pred_df[diff_col] = ta.trend.macd_diff(stock_price_pred_df.Close)

    macd_hist = stock_price_pred_df[diff_col]
    turned_positive = (macd_hist > 0) & (macd_hist.shift(1) < 0)
    stock_price_pred_df[decision_col] = turned_positive.to_numpy()
    
def GoldenCrossDecision(stock_price_pred_df, model_strategy_type_used):
    """Add SMA20/SMA50, trend-signal and golden-cross buy columns in place.

    ``Signal_<model>`` is True while the 20-row SMA of ``Close`` is above the
    50-row SMA. ``Decision GC_<model>`` is True only on the row where the
    signal switches from False to True, i.e. the short average crosses above
    the long one.

    The decision was previously ``Signal.diff()``. On a boolean column that
    is an element-wise XOR, so it also flagged the opposite (sell) crossover
    and left NaN in the first row. The first row on which both averages
    become available is not a crossover either, so it is excluded.
    """
    close = stock_price_pred_df.Close
    sma_short = ta.trend.sma_indicator(close, window = 20)
    sma_long = ta.trend.sma_indicator(close, window = 50)
    stock_price_pred_df[f"SMA20_{model_strategy_type_used}"] = sma_short
    stock_price_pred_df[f"SMA50_{model_strategy_type_used}"] = sma_long

    # Comparisons involving NaN (warm-up rows) evaluate to False.
    signal = sma_short > sma_long
    stock_price_pred_df[f"Signal_{model_strategy_type_used}"] = signal.to_numpy()

    previous_signal = signal.shift(1, fill_value=False)
    # Require both averages to exist on the previous row, so the end of the
    # warm-up period is not reported as a crossover.
    previous_defined = sma_short.shift(1).notna() & sma_long.shift(1).notna()
    golden_cross = signal & ~previous_signal & previous_defined
    stock_price_pred_df[f"Decision GC_{model_strategy_type_used}"] = golden_cross.to_numpy()
    
def RSI_SMA_Decision(stock_price_pred_df, model_strategy_type_used):
    """Add RSI, SMA200 and the combined RSI/SMA buy-signal columns in place.

    ``RSI_<model>`` is the 10-row RSI of ``Close`` and ``SMA200_<model>`` its
    200-row simple moving average. ``Decision RSI/SMA_<model>`` is True on
    rows where ``Close`` is above the SMA200 and the RSI is below 30.

    The ``Close`` values here are model predictions (the PyTorch model was
    trained on 100-day windows), so both indicators are computed on predicted
    rather than observed prices.
    """
    rsi_col = f"RSI_{model_strategy_type_used}"
    sma_col = f"SMA200_{model_strategy_type_used}"
    decision_col = f"Decision RSI/SMA_{model_strategy_type_used}"

    stock_price_pred_df[rsi_col] = ta.momentum.rsi(stock_price_pred_df.Close, window=10)
    stock_price_pred_df[sma_col] = ta.trend.sma_indicator(stock_price_pred_df.Close, window = 200)

    above_long_trend = stock_price_pred_df.Close > stock_price_pred_df[sma_col]
    oversold = stock_price_pred_df[rsi_col] < 30
    stock_price_pred_df[decision_col] = (above_long_trend & oversold).to_numpy()
    
def FinancialStatusDecision(stock_price_pred_df, model_strategy_type_used, symbols_metadata_df, company_ticker):
    """Add a ``Decision Financial Status`` buy column to the frame in place.

    The ticker's ``Financial Status`` is read from ``symbols_metadata_df``
    (first row whose ``Symbol`` equals ``company_ticker``). The codes are
    presumably Nasdaq financial-status codes -- confirm against the metadata
    source.

    * Missing status or ``"N"``: every row is True.
    * ``"D"``, ``"E"`` or ``"H"``: a row is True when its golden-cross
      decision is True and its RSI/SMA decision is not.
    * Any other status: every row is False.

    The per-row decisions were previously tested with ``series is True``.
    That compares the Series object itself with ``True`` and is always False,
    so the conditional rule could never fire; they are now compared
    element-wise.

    Args:
        stock_price_pred_df: frame already holding ``Decision GC_<model>``
            and ``Decision RSI/SMA_<model>`` columns.
        model_strategy_type_used: suffix identifying the model's columns.
        symbols_metadata_df: frame with ``Symbol`` and ``Financial Status``.
        company_ticker: symbol to look up.

    Raises:
        IndexError: if ``company_ticker`` is not present in the metadata.
    """
    ticker_rows = symbols_metadata_df[symbols_metadata_df.Symbol == company_ticker]
    financial_status_val = ticker_rows.iloc[0]["Financial Status"]

    n_rows = len(stock_price_pred_df)
    if pd.isna(financial_status_val) or financial_status_val == "N":
        decision = np.ones(n_rows, dtype=bool)
    elif financial_status_val in ("D", "E", "H"):
        # .eq(True) also copes with object columns that contain NaN.
        short_term_decision = stock_price_pred_df[f"Decision GC_{model_strategy_type_used}"].eq(True)
        long_term_decision = stock_price_pred_df[f"Decision RSI/SMA_{model_strategy_type_used}"].eq(True)
        decision = (short_term_decision & ~long_term_decision).to_numpy()
    else:
        decision = np.zeros(n_rows, dtype=bool)

    stock_price_pred_df["Decision Financial Status"] = decision


## Stock Predicted Close Price Indicator Recommender System

Ingest the predicted closing prices saved above, then derive buy recommendations for each stock from technical indicators (MACD, golden cross, RSI/SMA) and the company's financial status.

In [None]:
# Read every prediction CSV, add the indicator/decision columns for the model
# that produced it, and collect one combined frame per ticker.
stock_price_predict_pds = {}
directory_path = "/kaggle/working/stock_closing_price/predictions"
directory_path_recommend_output = "/kaggle/working/stock_closing_price"
# sorted(): os.listdir returns entries in arbitrary order, and the first file
# seen for a ticker supplies every column that is not model-suffixed (e.g.
# 'Close' and 'Decision Financial Status'). A fixed order keeps the output
# reproducible between runs.
for file in sorted(os.listdir(directory_path)):
    if not file.endswith('.csv'):
        continue

    # Expected name: <ticker>_<model>_..._preds.csv
    filename_without_extension = os.path.splitext(file)[0]  # Get filename without extension
    filename_splitted = filename_without_extension.split("_")
    print(filename_splitted)
    if len(filename_splitted) < 2:
        print(f"Skipping {file}: expected '<ticker>_<model>_...' in the file name")
        continue
    company_ticker = filename_splitted[0]
    model_strategy_type_used = filename_splitted[1]

    stock_price_pred_pd = pd.read_csv(os.path.join(directory_path, file))
    MACDecision(stock_price_pred_pd, model_strategy_type_used)
    GoldenCrossDecision(stock_price_pred_pd, model_strategy_type_used)
    RSI_SMA_Decision(stock_price_pred_pd, model_strategy_type_used)
    FinancialStatusDecision(stock_price_pred_pd, model_strategy_type_used, symbols_metadata_df, company_ticker)

    if company_ticker in stock_price_predict_pds:
        # Add only the columns the ticker's combined frame does not have yet.
        # NOTE(review): columns are aligned on the row index, not on 'Date';
        # confirm that the files of different models cover the same rows.
        columns_to_add = stock_price_pred_pd.columns.difference(stock_price_predict_pds[company_ticker].columns)
        for col in columns_to_add:
            stock_price_predict_pds[company_ticker][col] = stock_price_pred_pd[col]
    else:
        stock_price_predict_pds[company_ticker] = stock_price_pred_pd

# One recommendation file per ticker.
for key, value in stock_price_predict_pds.items():
    value.to_csv(f"{directory_path_recommend_output}/{key}_recommend.csv", index=False)