In [1]:
# Import standard library modules
import sys

# Set the relative path to the project root directory
relative_path_to_root = "../../../"

# Add the project root to the system path for importing in-house modules
sys.path.append(relative_path_to_root)

# Import in-house modules from the 'utilities' package
from utilities import load_data

In [3]:
# Data manipulation and analysis
import pandas as pd

# File and directory manipulation
from pathlib import Path

# Data preprocessing and model selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# Model evaluation metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Machine learning models (regression)
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor

# Ensemble methods
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor

# Pipeline
from sklearn.pipeline import Pipeline

In [4]:
file_name = "sp500_adj_close_raw"
file_path = f"../../../data/raw_data/{file_name}"

raw_data = load_data(file_path)

[1m[36m╔═══════════════════════════════════════════════════════════════╗[0m
[1m[36m║[0m[1m[96mFile `sp500_adj_close_raw.csv.bz2` loaded from `sp500_adj_close_raw.zip`[0m[1m[36m║[0m
[1m[36m╚═══════════════════════════════════════════════════════════════╝[0m


In [33]:
raw_data

Unnamed: 0,Date,Ticker,Adjusted Close,Return,Volatility,RSI,SMA_50,SMA_100,SMA_200,Upper Band,Lower Band,Support,Resistance,Action
0,2008-01-02,A,23.256384,-0.009918,0.015705,48.827618,23.314175,23.299887,23.564934,24.727250,22.540232,21.392035,24.351929,short
1,2008-01-02,AAPL,5.876342,0.000462,0.018937,59.067432,5.518483,4.939064,4.197630,6.135834,5.403559,4.637376,6.026839,buy
2,2008-01-02,ABT,18.130205,-0.006092,0.010484,34.677586,18.138458,17.628250,17.709028,19.233109,18.221804,16.775562,19.134010,short
3,2008-01-02,ACGL,7.608889,0.020444,0.016022,45.154190,7.785511,7.878933,7.874161,8.114465,7.378535,7.463333,8.307778,buy
4,2008-01-02,ACN,26.437078,-0.017194,0.024039,54.812183,26.577982,27.784420,28.471031,28.227205,24.273773,24.765505,29.215664,sell
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1979674,2024-10-23,XYL,131.740000,0.002817,0.009287,45.930984,133.569260,134.558320,130.656600,137.970140,131.130870,126.710000,137.530000,buy
1979675,2024-10-23,YUM,134.010000,0.004874,0.010931,33.050730,134.760960,133.576890,133.945360,140.494570,130.855440,129.710000,139.920000,buy
1979676,2024-10-23,ZBH,104.680000,0.004028,0.010726,51.860280,107.599990,108.161766,115.560480,108.536896,101.405220,101.770000,115.912370,buy
1979677,2024-10-23,ZBRA,368.080000,-0.010538,0.009870,55.442924,354.979200,335.774200,309.989750,378.896670,362.719330,320.770000,377.680000,short


### Exploratory Data Analysis (EDA):
___

In [5]:
# Remove rows with today's date and seperate into a new dataframe
todays_date = "2024-10-23"
filter_data_by_date = raw_data["Date"] == todays_date

# Create a new dataframe with today's data
todays_data = raw_data[filter_data_by_date].reset_index(drop=True)

print("Shape:", todays_data.shape)
display(todays_data.tail())

Shape: (501, 14)


Unnamed: 0,Date,Ticker,Adjusted Close,Return,Volatility,RSI,SMA_50,SMA_100,SMA_200,Upper Band,Lower Band,Support,Resistance,Action
496,2024-10-23,XYL,131.74,0.002817,0.009287,45.930984,133.56926,134.55832,130.6566,137.97014,131.13087,126.71,137.53,buy
497,2024-10-23,YUM,134.01,0.004874,0.010931,33.05073,134.76096,133.57689,133.94536,140.49457,130.85544,129.71,139.92,buy
498,2024-10-23,ZBH,104.68,0.004028,0.010726,51.86028,107.59999,108.161766,115.56048,108.536896,101.40522,101.77,115.91237,buy
499,2024-10-23,ZBRA,368.08,-0.010538,0.00987,55.442924,354.9792,335.7742,309.98975,378.89667,362.71933,320.77,377.68,short
500,2024-10-23,ZTS,188.99,-0.002744,0.010509,45.437954,189.215,183.13512,179.39548,196.47697,186.50803,180.9,196.48,sell


In [6]:
# Create a new dataframe with historical data (excluding today's data)
historical_data = raw_data[~filter_data_by_date]

print("Shape:", historical_data.shape)
display(historical_data.tail())

Shape: (1979178, 14)


Unnamed: 0,Date,Ticker,Adjusted Close,Return,Volatility,RSI,SMA_50,SMA_100,SMA_200,Upper Band,Lower Band,Support,Resistance,Action
1979173,2024-10-22,XYL,131.37,0.002817,0.009529,41.234707,133.56168,134.64737,130.55756,137.83467,131.45334,126.71,137.53,buy
1979174,2024-10-22,YUM,133.36,0.004874,0.01231,22.212175,134.8189,133.60426,133.90901,140.5323,130.7257,129.71,139.92,buy
1979175,2024-10-22,ZBH,104.26,0.004028,0.010689,44.982456,107.687744,108.26137,115.64309,108.65896,101.441536,101.77,115.91237,buy
1979176,2024-10-22,ZBRA,372.0,-0.010538,0.009591,54.217484,354.26,335.2168,309.4548,379.0747,362.23032,320.77,377.68,short
1979177,2024-10-22,ZTS,189.51,-0.002744,0.010508,43.550476,189.2028,182.9368,179.42395,196.46294,186.77606,180.9,196.48,sell


___
___

In [10]:
# Drop columns that are not needed
columns_to_drop = ["Date", "Ticker", "Action"]

data = historical_data.copy().drop(columns=columns_to_drop)

print("Shape:", data.shape)
data.head()

Shape: (1979178, 11)


Unnamed: 0,Adjusted Close,Return,Volatility,RSI,SMA_50,SMA_100,SMA_200,Upper Band,Lower Band,Support,Resistance
0,23.256384,-0.009918,0.015705,48.827618,23.314175,23.299887,23.564934,24.72725,22.540232,21.392035,24.351929
1,5.876342,0.000462,0.018937,59.067432,5.518483,4.939064,4.19763,6.135834,5.403559,4.637376,6.026839
2,18.130205,-0.006092,0.010484,34.677586,18.138458,17.62825,17.709028,19.233109,18.221804,16.775562,19.13401
3,7.608889,0.020444,0.016022,45.15419,7.785511,7.878933,7.874161,8.114465,7.378535,7.463333,8.307778
4,26.437078,-0.017194,0.024039,54.812183,26.577982,27.78442,28.471031,28.227205,24.273773,24.765505,29.215664


In [25]:
# Split the data into features (X) and target (y)
X = data.drop(columns="Return")

y = data["Return"].values.reshape(-1, 1)

print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

Shape of X: (1979178, 10)
Shape of y: (1979178, 1)


In [26]:
# Split the data into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2, # 80% training and 20% testing
    random_state=42
)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (1583342, 10)
Shape of X_test: (395836, 10)
Shape of y_train: (1583342, 1)
Shape of y_test: (395836, 1)


___
___

In [27]:
# Create a function to calculate VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calc_vif(X):
    vif = pd.DataFrame()
    vif["Feature"] = X.columns

    vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

    return vif

vif_df = calc_vif(X).sort_values("VIF").reset_index(drop=True)

In [28]:
vif_df

Unnamed: 0,Feature,VIF
0,Volatility,2.778646
1,RSI,2.830205
2,SMA_200,393.757437
3,Lower Band,717.828251
4,Adjusted Close,770.805699
5,Support,984.58673
6,Upper Band,1143.79425
7,SMA_100,1852.515674
8,Resistance,1898.216509
9,SMA_50,5593.733141


In [29]:
# Check P-Values

import statsmodels.api as sm

ols_model = sm.OLS(
    y_train,
    X_train
).fit()

ols_model

<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x177426c7e20>

In [31]:
def highlight_p_values(row):
    return ["background-color: black" if value <= 0.05 else "" for value in row]

p_values = ols_model.pvalues.sort_values()

p_values_df = p_values.to_frame(name="p_value")

p_values_df.style.apply(highlight_p_values)

Unnamed: 0,p_value
Volatility,0.0
Adjusted Close,0.0
Lower Band,0.0
RSI,0.0
SMA_100,1.9e-05
SMA_200,0.000912
Upper Band,0.017002
Support,0.034043
SMA_50,0.574185
Resistance,0.82935


In [None]:
# Note: Volatility and RSI are statistically significant in both VIF and P-Value

___
___
