<a href="https://colab.research.google.com/github/Aditya-Patel/Stat598-FinalProject/blob/main/MPLA_CNN_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Dataset
import yfinance as yf

# Visualization
import matplotlib as mpl
import matplotlib.pylab as plt
import matplotlib.dates as mdates
import seaborn as sns
mpl.rcParams['figure.dpi'] = 125
mpl.rcParams['figure.figsize'] = (10, 5)

# Date Manipulation
from datetime import datetime

# TensorFlow / Keras Libraries
import tensorflow as tf
from tensorflow import keras

# Statistical testing, plotting and decompositions
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


# Seed TensorFlow's global random generator so runs are repeatable
RANDOM_SEED = 42
tf.random.set_seed(RANDOM_SEED)

# Set device usage to GPU if available, otherwise fall back to the CPU.
# The original hard-coded '/device:gpu:1' (the *second* GPU), which does not
# exist on single-GPU or CPU-only runtimes.
DEVICE_NAME = '/GPU:0' if tf.config.list_physical_devices('GPU') else '/CPU:0'

# tf.device() returns a context manager; it only pins ops that are created
# inside a `with device:` block. Creating it here does not move anything yet.
device = tf.device(DEVICE_NAME)

In [None]:
# Neural Network Constants
# Shared hyperparameters read by the model-definition and training cells below.
TRAINING_EPOCHS = 500  # number of epochs passed to every model.fit call
BATCH_SIZE = 32  # mini-batch size used for tf.data batching and model.fit
NEURON_CT = 256  # base layer width; layers use this value or a fraction of it
POOL_SZ = 4  # pool_size of every MaxPool1D layer
STRIDES = 1  # strides of every MaxPool1D layer
LEARN_RATE = 0.1  # only referenced (as `factor`) by the commented-out ReduceLROnPlateau callback

In [None]:
# Run configuration: date window, target ETF and data sources.
# start_date / end_date are passed as `start` / `end` to every yfinance download.
start_date = '2020-06-01'
end_date = '2023-12-01'
# Ticker of the ETF whose next-day close is the prediction target
etf_ticker = 'MLPA'
# NOTE(review): never read in the visible cells — confirm whether it is still needed
moving_average_list = []
# CSV of the ETF's holdings (read into df_holdings)
etf_tickers_url = "https://raw.githubusercontent.com/Aditya-Patel/Stat598-FinalProject/main/mlpa_full-holdings.csv"
# CSV of the crude oil spot price history (read into df_crude_price)
crude_oil_stock_url = "https://raw.githubusercontent.com/Aditya-Patel/Stat598-FinalProject/main/crude%20oil%20spot%20price.csv"

<h1>Create joint dataset between spot price and ETF Data</h1>

In [None]:
# --- ETF holdings ------------------------------------------------------------
# Read the holdings list, drop the two non-equity rows and compute each
# holding's share of the total market value.
df_holdings = pd.read_csv(etf_tickers_url)
is_equity_row = ~df_holdings['Name'].isin(['OTHER PAYABLE & RECEIVABLES', 'CASH'])
# .copy() makes the filtered frame independent, so the column assignments
# below do not trigger pandas' SettingWithCopyWarning.
df_holdings = df_holdings[is_equity_row].copy()
# Market values arrive as strings with thousands separators, e.g. "1,234.5"
df_holdings['Market Value ($)'] = df_holdings['Market Value ($)'].str.replace(',', '').astype(float)
total_market_value = df_holdings['Market Value ($)'].sum()
df_holdings['Percentage Holdings By Value'] = df_holdings['Market Value ($)'] / total_market_value

# --- Crude oil spot price ----------------------------------------------------
# Only the first two CSV columns are used (date and spot price).
df_crude_price = pd.read_csv(crude_oil_stock_url, usecols=[0, 1])
df_crude_price['Date'] = pd.to_datetime(df_crude_price['Date'], format='%b %d, %Y')
df_crude_price = (
    df_crude_price
    .set_index('Date')
    .rename(columns={'WTI Barrell Spot Price': 'Spot Price'})
)
# Forward-fill gaps; Series.ffill() replaces the deprecated fillna(method='ffill')
df_crude_price['Spot Price'] = df_crude_price['Spot Price'].ffill()

# --- ETF price history -------------------------------------------------------
all_stocks_data = yf.download(etf_ticker, start=start_date, end=end_date)
# Newer yfinance releases can return MultiIndex columns even for a single
# ticker; keep only the first level so the names below become e.g. 'MLPA_Close'.
# NOTE(review): assumes level 0 holds the price field (yfinance's default layout).
if isinstance(all_stocks_data.columns, pd.MultiIndex):
    all_stocks_data.columns = all_stocks_data.columns.get_level_values(0)

# Join spot price and etf data
all_stocks_data['Ticker'] = etf_ticker
# Prefix every price column with the ETF ticker, leaving 'Ticker'/'Date' untouched
all_stocks_data.columns = [f'{etf_ticker}_{col}' if col not in ['Ticker', 'Date'] else col for col in all_stocks_data.columns]
# Left join keeps exactly the ETF's dates; days without a spot price become NaN
all_stocks_data = all_stocks_data.join(df_crude_price, how='left')

<h1>Load all tickers within the ETF</h1>

In [None]:
# Load the price history of every ticker held by the ETF and append it,
# column-wise, to all_stocks_data.
for ticker in df_holdings['Ticker']:
    stock_data = yf.download(ticker, start=start_date, end=end_date)
    # Newer yfinance releases can return MultiIndex columns even for a single
    # ticker; keep only the first level so names become e.g. '<ticker>_Close'.
    # NOTE(review): assumes level 0 holds the price field (yfinance's default layout).
    if isinstance(stock_data.columns, pd.MultiIndex):
        stock_data.columns = stock_data.columns.get_level_values(0)

    # Constant column holding this ticker's weight in the ETF (first match wins)
    percentage_holding = df_holdings.loc[df_holdings['Ticker'] == ticker, 'Percentage Holdings By Value'].iloc[0]
    all_stocks_data[f'{ticker}_Percent_Holding'] = percentage_holding

    # Prefix the price columns with the ticker so they stay unique after the join
    stock_data.columns = [f'{ticker}_{col}' if col != 'Ticker' else col for col in stock_data.columns]
    # Outer join: dates present in either frame are kept
    all_stocks_data = all_stocks_data.join(stock_data, how='outer')

# NOTE(review): every remaining NaN (including missing prices) becomes 0 here —
# confirm that a zero price is the intended placeholder for missing data.
all_stocks_data.fillna(0, inplace=True)
sum_values = pd.Series(0, index=all_stocks_data.index)

In [None]:
# Build the supervised target and collect every closing-price column.
# Re-running this cell shifts the target again and drops one more row.
target_col = f'{etf_ticker}_Next_Close'

# Next day's ETF close, aligned on the current day's row
all_stocks_data[target_col] = all_stocks_data[f'{etf_ticker}_Close'].shift(-1)

# The final row has no following close, so remove it (by index label)
last_label = all_stocks_data.index[-1]
all_stocks_data = all_stocks_data.drop(last_label)

# Keep the close columns, the crude spot price and the target
close_columns = [
    col
    for col in all_stocks_data.columns
    if '_Close' in col or col in ('Spot Price', target_col)
]
close_values = all_stocks_data[close_columns]


In [None]:
# Correlation between all input features (the prediction target is excluded)
df = close_values.drop(columns=[f'{etf_ticker}_Next_Close'])
feature_corr = df.corr()
sns.heatmap(feature_corr, annot=False)
plt.show()

Based on the plot of the correlation matrix, we see that most of the stocks are positively correlated with each other, with the exception of SMLP, NGL and USDP, which are negatively correlated. This is expected, as a well-constructed ETF holds a variety of tickers to protect against large market swings in either direction.

<h1>Model Development</h1>
<h2>Convolutional neural networks (Conv1D layers followed by dense layers) are developed in TensorFlow to predict the next closing price from the previous closing prices.</h2>

In [None]:
# Features / target: the model predicts the next day's close, so the
# 'Next_Close' column is the target and every other column is an input.
next_close_col = f'{etf_ticker}_Next_Close'
X = close_values.drop(columns=[next_close_col])
y = close_values[next_close_col]

In [None]:
# Randomly hold out 40% of the rows for testing (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=RANDOM_SEED,
)


def _as_batched_dataset(features, targets):
    """Wrap (features, targets) as a batched, prefetching tf.data.Dataset."""
    dataset = tf.data.Dataset.from_tensor_slices((features, targets))
    return dataset.batch(BATCH_SIZE).prefetch(buffer_size=tf.data.AUTOTUNE)


# Convert to tensors and prefetch
train_df = _as_batched_dataset(X_train, y_train)
test_df = _as_batched_dataset(X_test, y_test)

In [None]:
# CNN 1 - 3 Convolution Layers, 3 Dense Layers
# Layer sizes use integer division (//): `NEURON_CT/4` is a float in Python 3,
# and filter/unit counts must be integers.
# NOTE(review): there is no Flatten / global-pooling layer between the
# convolutional stack and the Dense layers, so Dense acts on the last axis only
# and the model output stays 3-D (batch, steps, 1). Left unchanged because the
# prediction cell indexes the output in that shape.
# NOTE(review): input_shape hard-codes 22 features — confirm it matches X.shape[1].
model_1 = keras.models.Sequential([
    # Convolution Layer
    keras.layers.Conv1D(input_shape=(22,1), filters=NEURON_CT // 4, kernel_size=(3,), activation='relu'),
    keras.layers.MaxPool1D(pool_size=POOL_SZ, strides=STRIDES, padding='valid'),
    keras.layers.Conv1D(filters=NEURON_CT // 2, kernel_size=(3,), activation='relu'),
    keras.layers.MaxPool1D(pool_size=POOL_SZ, strides=STRIDES, padding='valid'),
    keras.layers.Conv1D(filters=NEURON_CT, kernel_size=(3,), activation='relu'),
    keras.layers.MaxPool1D(pool_size=POOL_SZ, strides=STRIDES, padding='valid'),
    # DNN Layer
    keras.layers.Dense(NEURON_CT, activation='relu'),
    keras.layers.Dense(NEURON_CT // 2, activation='relu'),
    keras.layers.Dense(1)
])

# The RMSE metric is passed as an object: the string alias
# 'root_mean_squared_error' is not resolvable in every Keras version.
model_1.compile(optimizer='adam', loss='mean_squared_error', metrics=[keras.metrics.RootMeanSquaredError()])
model_1.summary()

In [None]:
# CNN 2 - 2 Convolution Layers, 4 Dense Layers
# Layer sizes use integer division (//): `NEURON_CT/4` is a float in Python 3,
# and filter/unit counts must be integers.
# NOTE(review): there is no Flatten / global-pooling layer between the
# convolutional stack and the Dense layers, so Dense acts on the last axis only
# and the model output stays 3-D (batch, steps, 1). Left unchanged because the
# prediction cell indexes the output in that shape.
# NOTE(review): input_shape hard-codes 22 features — confirm it matches X.shape[1].
model_2 = keras.models.Sequential([
    # Convolution Layer
    keras.layers.Conv1D(input_shape=(22,1), filters=NEURON_CT // 4, kernel_size=(3,), activation='relu'),
    keras.layers.MaxPool1D(pool_size=POOL_SZ, strides=STRIDES, padding='valid'),
    keras.layers.Conv1D(filters=NEURON_CT, kernel_size=(3,), activation='relu'),
    keras.layers.MaxPool1D(pool_size=POOL_SZ, strides=STRIDES, padding='valid'),
    # DNN Layer
    keras.layers.Dense(NEURON_CT, activation='relu'),
    keras.layers.Dense(NEURON_CT // 2, activation='relu'),
    keras.layers.Dense(NEURON_CT // 4, activation='relu'),
    keras.layers.Dense(1)
])

# The RMSE metric is passed as an object: the string alias
# 'root_mean_squared_error' is not resolvable in every Keras version.
model_2.compile(optimizer='adam', loss='mean_squared_error', metrics=[keras.metrics.RootMeanSquaredError()])
model_2.summary()

In [None]:
# CNN 3 - 4 Convolution Layers, 2 Dense Layers
# Layer sizes use integer division (//): `NEURON_CT/8` is a float in Python 3,
# and filter/unit counts must be integers.
# NOTE(review): there is no Flatten / global-pooling layer between the
# convolutional stack and the Dense layers, so Dense acts on the last axis only
# and the model output stays 3-D (batch, steps, 1). Left unchanged because the
# prediction cell indexes the output in that shape.
# NOTE(review): input_shape hard-codes 22 features — confirm it matches X.shape[1].
model_3 = keras.models.Sequential([
    # Convolution Layer
    keras.layers.Conv1D(input_shape=(22,1), filters=NEURON_CT // 8, kernel_size=(3,), activation='relu'),
    keras.layers.MaxPool1D(pool_size=POOL_SZ, strides=STRIDES, padding='valid'),
    keras.layers.Conv1D(filters=NEURON_CT // 4, kernel_size=(3,), activation='relu'),
    keras.layers.MaxPool1D(pool_size=POOL_SZ, strides=STRIDES, padding='valid'),
    keras.layers.Conv1D(filters=NEURON_CT // 2, kernel_size=(3,), activation='relu'),
    keras.layers.MaxPool1D(pool_size=POOL_SZ, strides=STRIDES, padding='valid'),
    keras.layers.Conv1D(filters=NEURON_CT, kernel_size=(3,), activation='relu'),
    # Pooling after the fourth convolution is disabled:
    # keras.layers.MaxPool1D(pool_size=POOL_SZ, strides=STRIDES, padding='valid'),
    # DNN Layer
    keras.layers.Dense(NEURON_CT, activation='relu'),
    keras.layers.Dense(1)
])

# The RMSE metric is passed as an object: the string alias
# 'root_mean_squared_error' is not resolvable in every Keras version.
model_3.compile(optimizer='adam', loss='mean_squared_error', metrics=[keras.metrics.RootMeanSquaredError()])
model_3.summary()

In [None]:
# CNN 4 - 1 Convolution Layer, 3 Dense Layers
# Layer sizes use integer division (//): `NEURON_CT/2` is a float in Python 3,
# and unit counts must be integers.
# NOTE(review): there is no Flatten / global-pooling layer between the
# convolution and the Dense layers, so Dense acts on the last axis only and the
# model output stays 3-D (batch, steps, 1). Left unchanged because the
# prediction cell indexes the output in that shape.
# NOTE(review): input_shape hard-codes 22 features — confirm it matches X.shape[1].
model_4 = keras.models.Sequential([
    # Convolution Layer
    keras.layers.Conv1D(input_shape=(22,1), filters=NEURON_CT, kernel_size=(3,), activation='relu'),
    # DNN Layer
    keras.layers.Dense(NEURON_CT, activation='relu'),
    keras.layers.Dense(NEURON_CT // 2, activation='relu'),
    keras.layers.Dense(1)
])

# Unlike models 1-3, this model tracks mean squared logarithmic error.
model_4.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_squared_logarithmic_error'])
model_4.summary()

In [None]:
# lr_reducer = keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=LEARN_RATE, patience=5)    

In [None]:
# Fit every model on the training data, holding out 30% of it for validation
# (validation_split=0.3). All four models share the same training settings,
# so they are trained in one loop, in the original order. The History object
# returned by each fit is kept so the loss curves can be inspected afterwards.
fit_histories = {}
for model_name, model in (
    ('model_1', model_1),
    ('model_2', model_2),
    ('model_3', model_3),
    ('model_4', model_4),
):
    fit_histories[model_name] = model.fit(
        X_train,
        y_train,
        epochs=TRAINING_EPOCHS,
        batch_size=BATCH_SIZE,
        # callbacks = [lr_reducer],  # disabled: lr_reducer is commented out in the notebook
        validation_split=0.3,
    )

In [None]:
# Predict next close with all models
def _first_output_per_sample(model, features):
    """Return one scalar prediction per sample as a list.

    The models emit one value per remaining time step, i.e. an output of shape
    (samples, steps, 1). As in the original nested `val[0]` indexing, only the
    value at index 0 of every trailing axis is kept for each sample. Indexing
    by rank also keeps this working if a model emits (samples, 1) instead.
    """
    raw = np.asarray(model.predict(features))
    first_of_trailing_axes = (slice(None),) + (0,) * (raw.ndim - 1)
    return list(raw[first_of_trailing_axes])


y1_pred = _first_output_per_sample(model_1, X_test)
mse1 = mean_squared_error(y_true=y_test, y_pred=y1_pred)

y2_pred = _first_output_per_sample(model_2, X_test)
mse2 = mean_squared_error(y_true=y_test, y_pred=y2_pred)

y3_pred = _first_output_per_sample(model_3, X_test)
mse3 = mean_squared_error(y_true=y_test, y_pred=y3_pred)

y4_pred = _first_output_per_sample(model_4, X_test)
mse4 = mean_squared_error(y_true=y_test, y_pred=y4_pred)

In [None]:
# Actual vs. predicted next close for every model, ordered by index
results_df = (
    y_test
    .rename('y_actual')
    .to_frame()
    .assign(
        y1_pred=y1_pred,
        y2_pred=y2_pred,
        y3_pred=y3_pred,
        y4_pred=y4_pred,
    )
    .sort_index()
)

In [None]:
# Report the hyperparameters followed by each model's test-set MSE
print(f'Hyperparameters: [Training Epochs: {TRAINING_EPOCHS} || Batch Size: {BATCH_SIZE} || Neurons: {NEURON_CT}]')
for model_number, model_mse in enumerate((mse1, mse2, mse3, mse4), start=1):
    print(f'Price forecast: model_{model_number}: MSE: {model_mse:.4f}')

In [None]:
# Overlay every model's prediction on the actual next-day close
x = list(range(len(y_test)))

# (results_df column, matplotlib format string, legend label)
series_specs = (
    ('y_actual', '.-k', 'Actual'),
    ('y1_pred', '.b', 'Model 1'),
    ('y2_pred', 'xg', 'Model 2'),
    ('y3_pred', '.r', 'Model 3'),
    ('y4_pred', '+y', 'Model 4'),
)
for column, fmt, label in series_specs:
    plt.plot(results_df[column], fmt, label=label)

plt.title('Next Day Close Prediction')
plt.legend()
plt.show()

In [None]:
# Signed error (actual - predicted) for every model
actual_close = results_df['y_actual']
y1_diff = actual_close - results_df['y1_pred']
y2_diff = actual_close - results_df['y2_pred']
y3_diff = actual_close - results_df['y3_pred']
y4_diff = actual_close - results_df['y4_pred']

# Error of model 1 per row of results_df
ax = plt.gca()
ax.bar(results_df.index, y1_diff, color='blue', label='Model 1')
ax.set_title('Model 1: Error')
ax.legend()
plt.show()

In [None]:
# Error of model 2 per row of results_df
ax = plt.gca()
ax.bar(results_df.index, y2_diff, color='green', label='Model 2')
ax.set_title('Model 2: Error')
ax.legend()
plt.show()

In [None]:
# Error of model 3 per row of results_df
ax = plt.gca()
ax.bar(results_df.index, y3_diff, color='red', label='Model 3')
ax.set_title('Model 3: Error')
ax.legend()
plt.show()

In [None]:
# Error of model 4 per row of results_df
ax = plt.gca()
ax.bar(results_df.index, y4_diff, color='goldenrod', label='Model 4')
ax.set_title('Model 4: Error')
ax.legend()
plt.show()