Ethan Bartiromo Final Project Notebook
4-15-2025

Here we import all the needed libraries:

In [7]:
from statsmodels.tsa.seasonal import seasonal_decompose
import tensorflow as tf
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import mean_squared_error

Here we load in the datasets and combined them:

In [8]:
df_aapl = pd.read_csv("AAPL.csv")
df_googl = pd.read_csv("GOOGL.csv")
df_meta = pd.read_csv("META.csv")
df_nvda = pd.read_csv("NVDA.csv")
df_tsla = pd.read_csv("TSLA.csv")
df_aapl["Date"] = pd.to_datetime(df_aapl["Date"], format="%Y-%m-%d")
df_googl["Date"] = pd.to_datetime(df_googl["Date"], format="%Y-%m-%d")
df_meta["Date"] = pd.to_datetime(df_meta["Date"], format="%Y-%m-%d")
df_nvda["Date"] = pd.to_datetime(df_nvda["Date"], format="%Y-%m-%d")
df_tsla["Date"] = pd.to_datetime(df_tsla["Date"], format="%Y-%m-%d")
df_aapl["Volume"] = df_aapl["Volume"].astype(float)
df_googl["Volume"] = df_googl["Volume"].astype(float)
df_meta["Volume"] = df_meta["Volume"].astype(float)
df_nvda["Volume"] = df_nvda["Volume"].astype(float)
df_tsla["Volume"] = df_tsla["Volume"].astype(float)
df = pd.merge(df_aapl, df_googl, on="Date", suffixes=("", "_googl"))
df = pd.merge(df, df_meta, on="Date", suffixes=("", "_meta"))
df = pd.merge(df, df_nvda, on="Date", suffixes=("", "_nvda"))
df = pd.merge(df, df_tsla, on="Date", suffixes=("_aapl", "_tsla"))

!!!DEPRECATED!!!  This is where I make all of the helper functions I need in order to create and use any model easily:

In [None]:
### DO NOT RUN BLOCK DEPRECATED ###
# 
#
#
#
#
###################################
splits = [df["Date"][(len(df["Date"])*i)//20] for i in range(10,16)]
names = df.columns
y_names = ["Date"]
for val in names:
    if val[0] == "O" or val[0] == "C":
        y_names.append(val)
no_date = names[1:]
no_date_y = y_names[1:]
X = df.copy()
y = df.copy()[y_names]


pd.options.mode.chained_assignment = None  # default='warn'

def create_model(num_layers, num_neurons, drop_rate, activations, optim):
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Input(shape=(25,)))
    model.add(tf.keras.layers.BatchNormalization())
    for i in range(num_layers):
        model.add(tf.keras.layers.Lambda(lambda x: tf.expand_dims(x, axis=0)))
        model.add(tf.keras.layers.SimpleRNN(num_neurons[i], activation=activations[i]))
    if drop_rate > 0.0:
        model.add(tf.keras.layers.Dropout(rate=drop_rate))
    model.add(tf.keras.layers.Dense(100))
    model.compile(optimizer=optim, loss="mse")
    return model

def use_model(epochs=100, split=splits[0], num_layers=1, num_neurons=[10], drop_rate=0.0, activations=["relu"], optim="Adam"):
    model = create_model(num_layers, num_neurons, drop_rate, activations, optim)
    X_train = X[X["Date"] < split]
    X_train
    y_train = pd.DataFrame(columns=y.columns)
    y_train["X_date"] = pd.Series(dtype=X["Date"].dtype)
    for val in X_train["Date"]:
        i = 0
        num = 0
        while num != 10:
            if not y[y["Date"] == val + pd.DateOffset(days=i)].empty:
                y_train_new = y[y["Date"] == val + pd.DateOffset(days=i)]
                y_train_new["X_date"] = val
                y_train = pd.concat([y_train, y_train_new])
                num += 1
            i += 1
    y_train = y_train.groupby("X_date").agg(lambda x: list(x))
    X_test = X[X["Date"] >= split]
    y_test = pd.DataFrame(columns=y.columns)
    y_test["X_date"] = pd.Series(dtype=X["Date"].dtype)
    for val in X_test["Date"][:-10]:
        i = 0
        num = 0
        while num != 10:
            if not y[y["Date"] == val + pd.DateOffset(days=i)].empty:
                y_test_new = y[y["Date"] == val + pd.DateOffset(days=i)]
                y_test_new["X_date"] = val
                y_test = pd.concat([y_test, y_test_new])
                num += 1
            i += 1
    y_test = y_test.groupby("X_date").agg(lambda x: list(x))
    history = fit_mod(X_train, y_train, model, epochs)
    loss = evaluate_mod(X_test, y_test, model)
    return (model, history, loss)

def fit_mod(X_train, y_train, model, epochs):
    y_train_used = np.array(y_train[:][no_date_y].values.tolist())
    y_train_used = y_train_used.reshape((len(y_train_used), 100))
    return model.fit(X_train[no_date], y_train_used, epochs=epochs)

def predict_mod(X_test, model):
    return model.predict(X_test[no_date][:-10], batch_size=1)

def evaluate_mod(X_test, y_test, model):
    y_test_used = np.array(y_test[:][no_date_y].values.tolist())
    y_test_used = y_test_used.reshape((len(y_test_used), 100))
    return mean_squared_error(predict_mod(X_test, model), y_test_used) 


def grid_search(epochs_lst = [100], splits_lst=splits, num_layers_lst=[1], num_neurons_lst = [[10]], drop_rate_lst = [0.0], activations_lst = [["relu"]], optim_lst = ["Adam"]):
    models = []
    histories = []
    losses = []
    min_loss = float("inf")
    min_loss_index = -1
    min_loss_params = [None, None, None, None, None, None, None]
    i = 0
    for epochs in epochs_lst:
        for split in splits_lst:
            for num_layers in num_layers_lst:
                for num_neurons in num_neurons_lst:
                    if len(num_neurons) == num_layers:    
                        for activations in activations_lst:
                            if len(activations) == num_layers:
                                for drop_rate in drop_rate_lst:
                                    for optim in optim_lst:
                                        model, history, loss = use_model(epochs, split, num_layers, num_neurons, drop_rate, activations, optim)
                                        models.append(model)
                                        histories.append(history)
                                        losses.append(loss)
                                        if loss < min_loss:
                                            min_loss = loss
                                            min_loss_index = i
                                            min_loss_params = [epochs, split, num_layers, num_neurons, drop_rate, activations, optim]
                                        i += 1
    return (models[min_loss_index], min_loss_params)              

### DEPRECATED ### 


This optimizes the above data, and uses 100 previous day input data points instead of only 1:

In [105]:
splits = [df["Date"][(len(df["Date"])*i)//20] for i in range(10,16)]
names = df.columns
y_names = ["Date"]
for val in names:
    if val[0] == "O" or val[0] == "C":
        y_names.append(val)
no_date = names[1:]
no_date_y = y_names[1:]
X = df.copy()
y = df.copy()[y_names]


pd.options.mode.chained_assignment = None  # default='warn'

def create_model(num_neurons, drop_rate, activations, optim):
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Input(shape=(2500,)))
    model.add(tf.keras.layers.BatchNormalization())
    for i in range(min(len(num_neurons), len(activations))):
        model.add(tf.keras.layers.Lambda(lambda x: tf.expand_dims(x, axis=0)))
        model.add(tf.keras.layers.SimpleRNN(num_neurons[i],  activation=activations[i]))
    if drop_rate > 0.0:
        model.add(tf.keras.layers.Dropout(rate=drop_rate))
    model.add(tf.keras.layers.Dense(100))
    model.compile(optimizer=optim, loss="mse")
    return model

def use_model(epochs=100, split=splits[0], num_neurons=[10], drop_rate=0.0, activations=["relu"], optim="Adam"):
    model = create_model(num_neurons, drop_rate, activations, optim)
    X_train = pd.DataFrame(columns=X.columns)
    X_test = pd.DataFrame(columns=X.columns)
    X_test["X_date"] = pd.Series(dtype=X["Date"].dtype)
    X_train = X[:100]
    X_train["X_date"] = X["Date"][99]
    prev_val = X["Date"][99]
    for val in X["Date"][100:-10]:
        if val < split:
            X_train_new = X_train[X_train["X_date"] == prev_val][1:]
            X_train_new = pd.concat([X_train_new, X[X["Date"] == val]])
            X_train_new["X_date"] = val
            X_train = pd.concat([X_train, X_train_new])
            prev_val = val
        elif val == split:
            X_test = X_train[X_train["X_date"] == prev_val][1:]
            X_test = pd.concat([X_test, X[X["Date"] == val]])
            X_test["X_date"] = val
            prev_val = val
        else:
            X_test_new = X_test[X_test["X_date"] == prev_val][1:]
            X_test_new = pd.concat([X_test_new, X[X["Date"] == val]])
            X_test_new["X_date"] = val
            X_test = pd.concat([X_test, X_test_new])
            prev_val = val
    X_train = X_train.groupby("X_date").agg(lambda x: list(x))
    X_test = X_test.groupby("X_date").agg(lambda x: list(x))
    y_train = pd.DataFrame(columns=y.columns)
    y_train["X_date"] = pd.Series(dtype=y["Date"].dtype)
    y_test = pd.DataFrame(columns=y.columns)
    y_test["X_date"] = pd.Series(dtype=y["Date"].dtype)
    y_train = y[100:110]
    y_train["X_date"] = y["Date"][99]
    prev_val = y["Date"][99]
    for val in y["Date"][100:-10]:
        if val < split:
            y_train_new = y_train[y_train["X_date"] == prev_val][1:]
            y_train_new = pd.concat([y_train_new, y[y["Date"] == val]])
            y_train_new["X_date"] = val
            y_train = pd.concat([y_train, y_train_new])
            prev_val = val
        elif val == split:
            y_test = y_train[y_train["X_date"] == prev_val][1:]
            y_test = pd.concat([y_test, y[y["Date"] == val]])
            y_test["X_date"] = val
            prev_val = val
        else:
            y_test_new = y_test[y_test["X_date"] == prev_val][1:]
            y_test_new = pd.concat([y_test_new, y[y["Date"] == val]])
            y_test_new["X_date"] = val
            y_test = pd.concat([y_test, y_test_new])
            prev_val = val
    y_train = y_train.groupby("X_date").agg(lambda x: list(x))
    y_test = y_test.groupby("X_date").agg(lambda x: list(x))
    history = fit_mod(X_train, y_train, model, epochs)
    loss = evaluate_mod(X_test, y_test, model)
    return (model, history, loss)

def fit_mod(X_train, y_train, model, epochs):
    y_train_used = np.array(y_train[:][no_date_y].values.tolist())
    y_train_used = y_train_used.reshape((len(y_train_used), 100))
    X_train_used = np.array(X_train[:][no_date].values.tolist())
    X_train_used = X_train_used.reshape((len(X_train_used), 2500))
    return model.fit(X_train_used, y_train_used, epochs=epochs)

def predict_mod(X_test, model):
    X_test_used = np.array(X_test[:][no_date].values.tolist())
    X_test_used = X_test_used.reshape((len(X_test_used), 2500))
    return model.predict(X_test_used, batch_size=1)

def evaluate_mod(X_test, y_test, model):
    y_test_used = np.array(y_test[:][no_date_y].values.tolist())
    y_test_used = y_test_used.reshape((len(y_test_used), 100))
    return mean_squared_error(predict_mod(X_test, model), y_test_used) 


def grid_search(epochs_lst = [100], splits_lst=splits, num_neurons_lst = [[10]], drop_rate_lst = [0.0], activations_lst = [["relu"]], optim_lst = ["Adam"]):
    models = []
    histories = []
    losses = []
    min_loss = float("inf")
    min_loss_index = -1
    min_loss_params = [None, None, None, None, None, None]
    i = 0
    for epochs in epochs_lst:
        for split in splits_lst:
            for num_neurons in num_neurons_lst:  
                for activations in activations_lst:
                    if len(activations) == len(num_neurons):
                        for drop_rate in drop_rate_lst:
                            for optim in optim_lst:
                                model, history, loss = use_model(epochs, split, num_neurons, drop_rate, activations, optim)
                                models.append(model)
                                histories.append(history)
                                losses.append(loss)
                                if loss < min_loss:
                                    min_loss = loss
                                    min_loss_index = i
                                    min_loss_params = [epochs, split, num_neurons, drop_rate, activations, optim]
                                i += 1
    return (models, histories, losses, min_loss_index, min_loss_params)                   

Here is where I complete the grid search:

In [None]:
### DO NOT RUN!!! TAKES OVER THREE HOURS!
# values = grid_search(splits_lst=splits, num_neurons_lst=[[10], [25], [50], [100], [10, 10], [25, 25], [50, 50], [25, 25, 25]], activations_lst=[["relu"], ["relu", "relu"], ['relu', 'relu', 'relu']])


Epoch 1/100
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - loss: 2846.3721
Epoch 2/100
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 2119.8428
Epoch 3/100
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 1211.5654
Epoch 4/100
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 808.3501
Epoch 5/100
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 692.4478
Epoch 6/100
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 637.2757
Epoch 7/100
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 627.6744
Epoch 8/100
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 609.7730
Epoch 9/100
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 585.4927
Epoch 10/100
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5m

The best with each split for neuron numbers for each layer: [10], [25], [50], [100], [10, 10], [25, 25], [50, 50], [25, 25, 25]:
50/50 split: Overall best for everything: [10,10], Overall second best for everything: [25, 25, 25]
55/45 split: Overall third best for everything: [25, 25, 25]
60/40 split: [25]
65/35 split: [25, 25, 25]
70/30 split: [100]
75/25 split: Overall fourth best for everything: [10]

I do not plan on running that many options at once anymore, it took over 3 hours to run the code, I will be more optimized with it later.

However, here I am just putting the information I gained from it.

I will focus on three or more layers splits, as that tends to be the mode of top values.  I will focus on different numbers of neurons.

I originally forgot to implement early stopping, which is probably why this took so long, however I also will take the train test splitting
process out of the model creation, and after completing a few more small tests just choose the best train test split from our options.

0
16844.17725503635
[31093.94559680044, 31614.160160901243, 28477.337136599148, 27329.138615963748, 9219.050315632008, 14024.393427830808, 23241.993963667457, 12514.918745334582, 31614.725192554306, 29625.62296529599, 32024.193449590843, 24495.00074167491, 29801.11660030419, 30833.735186240254, 21960.436286778244, 14688.753224218455, 29689.801120951666, 23935.967962121962, 32129.301415730763, 30941.76495020004, 24385.619647521547, 37224.21363704647, 348382.0249504921, 80352.38872524719, 35241.63331987094, 36868.201094159056, 20857.28898020838, 33723.89607734079, 25700.097904839065, 27365.61572583416, 19802.419992914998, 17923.56760867715, 33245.510890228885, 33711.309537689034, 28626.0234241736, 26245.07491271642, 30051.00848047981, 27510.28309582618, 26401.322761041738, 35027.94327338077, 16844.17725503635, 33260.86685938259, 41154.48272937149, 37041.20508963101, 34998.375741014366, 20784.05573177042, 38448.76534254013, 37214.026082076976]


NameError: name 'X' is not defined