In [1]:
# Import standard library modules
import sys

# Set the relative path to the project root directory
relative_path_to_root = "../../../"

# Add the project root to the system path for importing in-house modules
sys.path.append(relative_path_to_root)

# Import in-house modules from the 'utilities' package
from utilities import load_data
from utilities import temporal_train_test_split

In [2]:
# Data manipulation and analysis
import pandas as pd

# File and directory manipulation
from pathlib import Path

# Data preprocessing and model selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# Model evaluation metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Machine learning models (regression)
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor

# Ensemble methods
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [4]:
def split_dataset_by_date(raw_data: pd.DataFrame, todays_date: str):
    """Separate the rows for one date from all other rows.

    Args:
        raw_data: Frame containing a "Date" column.
        todays_date: Value compared for exact equality against "Date".

    Returns:
        A ``(historical_data, todays_data)`` tuple. ``todays_data`` holds the
        rows whose "Date" equals ``todays_date``; ``historical_data`` holds
        every other row. Both frames get a fresh 0-based index.
    """
    # Boolean mask: True where the row belongs to the requested date
    is_today = raw_data["Date"] == todays_date

    # Rows matching the date, re-indexed from zero
    todays_data = raw_data.loc[is_today].reset_index(drop=True)

    # Complement of the mask: everything that is not today's data
    historical_data = raw_data.loc[~is_today].reset_index(drop=True)

    return historical_data, todays_data

In [5]:
def filter_data_by_date_range(data: pd.DataFrame, end_date: str):
    """Drop every row dated from the earliest date up to (not including) ``end_date``.

    Despite the name, the rows *inside* the range ``[min(Date), end_date)`` are
    the ones removed; rows that fall outside that range are kept.

    Args:
        data: Frame containing a "Date" column.
        end_date: Exclusive upper bound of the range to remove.

    Returns:
        A new frame without the rows in the removed range, re-indexed from zero.
    """
    dates = data["Date"]

    # The removed window always starts at the earliest date present
    earliest = dates.min()

    # True for rows that fall inside the window being removed
    in_removed_window = (dates >= earliest) & (dates < end_date)

    # Keep only the rows outside that window
    return data.loc[~in_removed_window].reset_index(drop=True)

In [6]:
# Base name of the raw dataset; no extension is given here (the loader output
# below reports `sp500_adj_close_raw.csv.bz2` being read from a `.zip` archive)
file_name = "sp500_adj_close_raw"
file_path = f"../../../data/raw_data/{file_name}"

# load_data is imported from the in-house `utilities` package
raw_data = load_data(file_path)

[1m[36m╔═══════════════════════════════════════════════════════════════╗[0m
[1m[36m║[0m[1m[96mFile `sp500_adj_close_raw.csv.bz2` loaded from `sp500_adj_close_raw.zip`[0m[1m[36m║[0m
[1m[36m╚═══════════════════════════════════════════════════════════════╝[0m


### Exploratory Data Analysis (EDA):
___

In [10]:
# Date treated as "today": rows whose "Date" equals this value go to todays_data,
# all other rows go to historical_data
todays_date = "2024-10-23"

historical_data, todays_data = split_dataset_by_date(raw_data, todays_date)

In [13]:
# Quick look at the rows for todays_date: dimensions plus the first and last two rows
print("Shape:", todays_data.shape)
display(todays_data.head(2))
display(todays_data.tail(2))

Shape: (501, 14)


Unnamed: 0,Date,Ticker,Adjusted Close,Return,Volatility,RSI,SMA_50,SMA_100,SMA_200,Upper Band,Lower Band,Support,Resistance,Action
0,2024-10-23,A,133.02,-0.003297,0.017509,22.136305,140.62486,136.66023,137.94637,151.75916,133.48938,133.02,148.244,short
1,2024-10-23,AAPL,230.75,-0.021665,0.013737,57.282116,226.4906,221.05618,200.30858,237.57426,221.87874,216.32,236.48,short


Unnamed: 0,Date,Ticker,Adjusted Close,Return,Volatility,RSI,SMA_50,SMA_100,SMA_200,Upper Band,Lower Band,Support,Resistance,Action
499,2024-10-23,ZBRA,368.08,-0.010538,0.00987,55.442924,354.9792,335.7742,309.98975,378.89667,362.71933,320.77,377.68,short
500,2024-10-23,ZTS,188.99,-0.002744,0.010509,45.437954,189.215,183.13512,179.39548,196.47697,186.50803,180.9,196.48,sell


In [12]:
# Quick look at everything except todays_date: dimensions plus the first and last two rows
print("Shape:", historical_data.shape)
display(historical_data.head(2))
display(historical_data.tail(2))

Shape: (1979178, 14)


Unnamed: 0,Date,Ticker,Adjusted Close,Return,Volatility,RSI,SMA_50,SMA_100,SMA_200,Upper Band,Lower Band,Support,Resistance,Action
0,2008-01-02,A,23.256384,-0.009918,0.015705,48.827618,23.314175,23.299887,23.564934,24.72725,22.540232,21.392035,24.351929,short
1,2008-01-02,AAPL,5.876342,0.000462,0.018937,59.067432,5.518483,4.939064,4.19763,6.135834,5.403559,4.637376,6.026839,buy


Unnamed: 0,Date,Ticker,Adjusted Close,Return,Volatility,RSI,SMA_50,SMA_100,SMA_200,Upper Band,Lower Band,Support,Resistance,Action
1979176,2024-10-22,ZBRA,372.0,-0.010538,0.009591,54.217484,354.26,335.2168,309.4548,379.0747,362.23032,320.77,377.68,short
1979177,2024-10-22,ZTS,189.51,-0.002744,0.010508,43.550476,189.2028,182.9368,179.42395,196.46294,186.77606,180.9,196.48,sell


In [14]:
# Cutoff date: rows dated before this value are removed, rows on/after it are kept
remove_up_to = "2024-01-01"
# NOTE(review): this filters raw_data, not the historical_data returned by
# split_dataset_by_date, so the rows for todays_date are included again
# (the tail shown below contains 2024-10-23) — confirm this is intended.
historical_data = filter_data_by_date_range(raw_data, remove_up_to)

print("Shape:", historical_data.shape)
display(historical_data.head())
display(historical_data.tail())

Shape: (102272, 14)


Unnamed: 0,Date,Ticker,Adjusted Close,Return,Volatility,RSI,SMA_50,SMA_100,SMA_200,Upper Band,Lower Band,Support,Resistance,Action
0,2024-01-02,A,138.04858,-0.054703,0.012683,78.40992,120.803246,117.69346,121.688,144.18768,123.93916,100.4886,138.8783,short
1,2024-01-02,AAPL,184.93822,-0.007488,0.011905,31.663397,186.06238,181.40567,178.60231,199.09311,187.49692,166.04037,197.36108,short
2,2024-01-02,ABBV,154.1838,0.004004,0.009123,76.44889,140.28699,141.73233,140.11014,154.52327,139.2701,132.7474,154.1838,hold
3,2024-01-02,ABNB,134.48,-0.007882,0.022394,34.148132,129.653,131.168,127.63273,149.52948,131.10352,114.09,147.5,short
4,2024-01-02,ABT,107.700714,-0.003004,0.007304,69.32909,99.63036,98.55591,100.87648,109.39906,101.37569,91.033325,108.23996,short


Unnamed: 0,Date,Ticker,Adjusted Close,Return,Volatility,RSI,SMA_50,SMA_100,SMA_200,Upper Band,Lower Band,Support,Resistance,Action
102267,2024-10-23,XYL,131.74,0.002817,0.009287,45.930984,133.56926,134.55832,130.6566,137.97014,131.13087,126.71,137.53,buy
102268,2024-10-23,YUM,134.01,0.004874,0.010931,33.05073,134.76096,133.57689,133.94536,140.49457,130.85544,129.71,139.92,buy
102269,2024-10-23,ZBH,104.68,0.004028,0.010726,51.86028,107.59999,108.161766,115.56048,108.536896,101.40522,101.77,115.91237,buy
102270,2024-10-23,ZBRA,368.08,-0.010538,0.00987,55.442924,354.9792,335.7742,309.98975,378.89667,362.71933,320.77,377.68,short
102271,2024-10-23,ZTS,188.99,-0.002744,0.010509,45.437954,189.215,183.13512,179.39548,196.47697,186.50803,180.9,196.48,sell


___
___

In [15]:
# Build the modelling frame in one chain: work on a copy, index it by
# ("Date", "Ticker"), then discard the "Action" column
data = (
    historical_data
    .copy()
    .set_index(["Date", "Ticker"])
    .drop(columns=["Action"])
)

print("Shape:", data.shape)
data.head()

Shape: (102272, 11)


Unnamed: 0_level_0,Unnamed: 1_level_0,Adjusted Close,Return,Volatility,RSI,SMA_50,SMA_100,SMA_200,Upper Band,Lower Band,Support,Resistance
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2024-01-02,A,138.04858,-0.054703,0.012683,78.40992,120.803246,117.69346,121.688,144.18768,123.93916,100.4886,138.8783
2024-01-02,AAPL,184.93822,-0.007488,0.011905,31.663397,186.06238,181.40567,178.60231,199.09311,187.49692,166.04037,197.36108
2024-01-02,ABBV,154.1838,0.004004,0.009123,76.44889,140.28699,141.73233,140.11014,154.52327,139.2701,132.7474,154.1838
2024-01-02,ABNB,134.48,-0.007882,0.022394,34.148132,129.653,131.168,127.63273,149.52948,131.10352,114.09,147.5
2024-01-02,ABT,107.700714,-0.003004,0.007304,69.32909,99.63036,98.55591,100.87648,109.39906,101.37569,91.033325,108.23996


In [16]:
# Split the data into features (X) and target (y)
# X: every column of `data` except "Return"
X = data.drop(columns="Return")

# y: the "Return" column, used as the regression target
y = data["Return"]

print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

Shape of X: (102272, 10)
Shape of y: (102272,)


___
___

In [None]:
# Create a function to calculate VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor

# VIF cut-off used both for highlighting and for dropping features
# (per the original note, chosen from domain knowledge of the data)
THRESHOLD = 1880

def highlight_vif(row):
    """Return one CSS string per value, shading values strictly below THRESHOLD.

    Intended as a callback for ``Styler.apply``. The argument is named ``row``,
    but it is simply iterated, so any iterable of numbers works.

    Args:
        row: Iterable of VIF values.

    Returns:
        A list with "background-color: black" for each value below THRESHOLD
        and an empty string for every other value.
    """
    styles = []
    for value in row:
        if value < THRESHOLD:
            styles.append("background-color: black")
        else:
            styles.append("")
    return styles

def calc_vif(X):
    """Compute the variance inflation factor of every column in ``X``.

    Args:
        X: Feature frame; its values are passed unchanged to statsmodels'
            ``variance_inflation_factor``.

    Returns:
        A one-column DataFrame named "VIF", indexed by the column names of ``X``.

    NOTE(review): no intercept/constant column is added before the VIFs are
    computed — confirm that is intended for this feature matrix.
    """
    design = X.values

    # One VIF per column position, in column order
    scores = []
    for col_idx in range(X.shape[1]):
        scores.append(variance_inflation_factor(design, col_idx))

    # Label each score with its feature name
    return pd.DataFrame({"VIF": scores}, index=X.columns)

# VIF of every feature in X, sorted ascending; values below THRESHOLD are shaded
vif_df = calc_vif(X).sort_values("VIF")
vif_df.style.apply(highlight_vif)

In [None]:
# Flag features whose VIF exceeds THRESHOLD or could not be computed (NaN)
drop_vif_condition = (vif_df["VIF"] > THRESHOLD) | (vif_df["VIF"].isna())

# Collect the names of the features that meet the drop condition
drop_vif_features = vif_df.loc[drop_vif_condition, :].index.tolist()

drop_vif_features

In [None]:
# Feature matrix with the columns flagged by the VIF check removed
X_vif = X.drop(columns=drop_vif_features)

print("Shape of X_vif:", X_vif.shape)

___
___

In [None]:
# Split the data into training and testing sets
# X, X_vif and y are split together so the same rows land in train/test for all three.
# NOTE(review): train_test_split shuffles rows by default, so training and test
# rows are mixed across dates; `temporal_train_test_split` is imported in the
# first cell but never used — confirm whether a time-ordered split was intended.
X_train, X_test, X_train_vif, X_test_vif, y_train, y_test = train_test_split(
    X,
    X_vif,
    y,
    test_size=0.2, # 80% training and 20% testing
    random_state=42
)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of X_train_vif:", X_train_vif.shape)
print("Shape of X_test_vif:", X_test_vif.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

___
___

In [None]:
# Check P-Values
import statsmodels.api as sm

# statsmodels' OLS does not add an intercept on its own. Without an explicit
# constant column the regression is forced through the origin and the reported
# p-values describe that no-intercept model. The fitted results therefore
# contain one extra "const" entry alongside the feature coefficients.
X_train_vif_const = sm.add_constant(X_train_vif)

ols_model = sm.OLS(
    y_train,
    X_train_vif_const
).fit()

ols_model

In [None]:
def highlight_p_values(row):
    """Return one CSS string per value, shading p-values at or below 0.05.

    Intended as a callback for ``Styler.apply``; the argument is simply
    iterated, so any iterable of numbers works.

    Args:
        row: Iterable of p-values.

    Returns:
        A list with "background-color: black" where the value is <= 0.05 and
        an empty string elsewhere.
    """
    def _style_for(value):
        # Shade only entries at or under the 5% level
        if value <= 0.05:
            return "background-color: black"
        return ""

    return [_style_for(value) for value in row]

# p-values of the fitted OLS coefficients, smallest first
p_values = ols_model.pvalues.sort_values()

# Wrap the Series in a one-column frame so it can be styled
p_values_df = p_values.to_frame(name="p_value")

# Shade the entries with p <= 0.05
p_values_df.style.apply(highlight_p_values)

In [15]:
# Note: 'Volatility' and 'RSI' are flagged as significant by both the VIF check and the p-values.
# Both the VIF check and the p-values also indicate that 'Resistance' and 'SMA_50' are not statistically significant.

___
___


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest trained on the full feature set (X_train, not the VIF-reduced
# X_train_vif); 100 trees with a fixed random_state so the fit is repeatable
all_feature_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)

all_feature_model.fit(X_train, y_train)

In [None]:
# Linear regression fitted on the full feature set (X_train)
lr_model = LinearRegression()

lr_model.fit(X_train, y_train)

In [30]:
import numpy as np

def adj_r2_score(model, X, y):
    """Adjusted R² of ``model`` on ``(X, y)``.

    Computes ``1 - (1 - R²) * (n - 1) / (n - p - 1)`` where ``R²`` is
    ``model.score(X, y)``, ``n`` is ``len(y)`` and ``p`` is ``X.shape[1]``.

    Args:
        model: Object exposing ``score(X, y)``.
        X: Two-dimensional feature container with a ``shape`` attribute.
        y: Target values; only its length is used directly here.

    Returns:
        The adjusted R² value.
    """
    r_squared = model.score(X, y)
    n_features = X.shape[1]
    n_samples = len(y)

    # Penalty term grows with the number of features relative to the sample size
    penalty = (1 - r_squared) * (n_samples - 1) / (n_samples - n_features - 1)
    return 1 - penalty

# Evaluate the linear regression on the held-out test rows
predict_y_all = lr_model.predict(X_test)

# R² from the estimator's own score method, then its adjusted variant,
# the mean squared error, and the root of that error
r2_all = lr_model.score(X_test, y_test)
adj_r2_all = adj_r2_score(lr_model, X_test, y_test)
mse_all = mean_squared_error(y_test, predict_y_all)
rmse = np.sqrt(mse_all)


In [None]:
# Report the test-set metrics computed for lr_model
print("R2:", r2_all)
print("Adjusted R2:", adj_r2_all)
print("MSE:", mse_all)
print("RMSE:", rmse)