In [8]:
# Import standard library modules
import sys

# Set the relative path to the project root directory
relative_path_to_root = "../../../"

# Add the project root to the system path for importing in-house modules
sys.path.append(relative_path_to_root)

# Import in-house modules from the 'utilities' package
from utilities import load_data
from utilities import temporal_train_test_split
from utilities import print_title, print_label

In [9]:
# Data manipulation and analysis
import pandas as pd

# Date and time manipulation
from datetime import date

# File and directory manipulation
from pathlib import Path

# Data preprocessing and model selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# Model evaluation metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Machine learning models (regression)
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor

# Ensemble methods
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [10]:
def split_dataset_by_date(raw_data: pd.DataFrame, todays_date: str):
    """Partition ``raw_data`` into historical rows and rows dated ``todays_date``.

    Args:
        raw_data: Frame holding a ``Date`` column that is compared for
            equality against ``todays_date``.
        todays_date: Date value identifying the rows to separate out.

    Returns:
        A ``(historical_data, todays_data)`` tuple. Both frames keep the
        original row order and get a fresh ``0..n-1`` index.
    """
    # Boolean marker: True for rows whose date equals the requested one
    is_today = raw_data["Date"].eq(todays_date)

    # Everything that is not "today" is history; both halves are re-indexed
    historical_data = raw_data.loc[~is_today].reset_index(drop=True)
    todays_data = raw_data.loc[is_today].reset_index(drop=True)

    return historical_data, todays_data

In [11]:
def filter_data_by_date_range(data: pd.DataFrame, end_date: str):
    """Discard the rows dated from the earliest ``Date`` up to ``end_date``.

    The removed window is half-open: it starts at the minimum of the ``Date``
    column (inclusive) and stops at ``end_date`` (exclusive), so rows dated
    exactly ``end_date`` or later are the ones that remain.

    Args:
        data: Frame holding a ``Date`` column.
        end_date: First date that is kept.

    Returns:
        The remaining rows, in their original order, with a fresh index.
    """
    dates = data["Date"]
    earliest = dates.min()

    # True for rows that fall inside [earliest, end_date) and must be removed
    in_removed_window = dates.ge(earliest) & dates.lt(end_date)

    return data.loc[~in_removed_window].reset_index(drop=True)

In [12]:
# Raw dataset location, expressed relative to the project root defined above
file_name = "test_w_na"
file_path = f"{relative_path_to_root}data/raw_data/{file_name}"

# Load the raw dataset with the in-house loader
raw_data = load_data(file_path)

[1m[36m╔═══════════════════════════════════════════════════════════════╗[0m
[1m[36m║[0m[1m[96m     File `test_w_na.csv.bz2` loaded from `test_w_na.zip`      [0m[1m[36m║[0m
[1m[36m╚═══════════════════════════════════════════════════════════════╝[0m


### Data Pre-Processing
___

#### Split today's data (for prediction) from historical data (for training)

In [13]:
# Prefer the real calendar date, but the raw file is a snapshot: once it is
# older than today no row carries today's date and the prediction set would
# silently come back empty. In that case fall back to the latest date present.
# NOTE(review): the fallback assumes `Date` holds ISO "YYYY-MM-DD" strings, so
# that max() is the most recent day - confirm against the loader.
todays_date = date.today().strftime("%Y-%m-%d")
if not (raw_data["Date"] == todays_date).any():
    todays_date = raw_data["Date"].max()

# Rows for `todays_date` are the ones to predict; everything else is history
historical_data, todays_data = split_dataset_by_date(raw_data, todays_date)

#### Ensure the Only Missing Values in Today's Data Are the Ones We Are Predicting

In [14]:
# Count missing values per column in the rows reserved for prediction
todays_data.isna().sum()

Date                    0
Ticker                  0
Adjusted Close          0
Next Day Close        501
Previous Day Close      0
Return                  0
Volatility              0
RSI                     0
SMA_50                  0
SMA_100                 0
SMA_200                 0
Upper Band              0
Lower Band              0
Support                 0
Resistance              0
Action                501
dtype: int64

#### Handle Missing Values (NA's)


In [15]:
# Count missing values per column in the historical rows before any cleaning
historical_data.isna().sum()

Date                       0
Ticker                     0
Adjusted Close             0
Next Day Close             0
Previous Day Close         0
Return                140634
Volatility            142254
RSI                   140913
SMA_50                     0
SMA_100                    0
SMA_200                    0
Upper Band                 0
Lower Band                 0
Support                    0
Resistance                 0
Action                141539
dtype: int64

In [17]:
# Keep only rows with a non-zero `Adjusted Close`
# A zero price marks days on which the stock did not trade because it did not exist yet
historical_data = (
    historical_data
    .loc[lambda frame: frame["Adjusted Close"].ne(0)]
    .reset_index(drop=True)
)

# Re-check the remaining missing values per column
historical_data.isna().sum()

Date                     0
Ticker                   0
Adjusted Close           0
Next Day Close           0
Previous Day Close       0
Return                  81
Volatility            1701
RSI                    360
SMA_50                   0
SMA_100                  0
SMA_200                  0
Upper Band               0
Lower Band               0
Support                  0
Resistance               0
Action                1067
dtype: int64

In [18]:
# Keep only rows that have a `Volatility` value
# Where it is NaN there was not enough data to calculate volatility,
# so those rows cannot be used for training
historical_data = (
    historical_data
    .loc[lambda frame: frame["Volatility"].notna()]
    .reset_index(drop=True)
)

# Re-check the remaining missing values per column
historical_data.isna().sum()

Date                     0
Ticker                   0
Adjusted Close           0
Next Day Close           0
Previous Day Close       0
Return                   0
Volatility               0
RSI                    358
SMA_50                   0
SMA_100                  0
SMA_200                  0
Upper Band               0
Lower Band               0
Support                  0
Resistance               0
Action                1050
dtype: int64

In [19]:
# Backfill the `RSI` column, one ticker at a time
# The frame is ordered by date and then ticker, so consecutive rows belong to
# different tickers. A plain column-wide bfill() would therefore copy the RSI of
# a neighbouring ticker into the gap; grouping by `Ticker` makes each gap take
# the next available RSI of the same stock instead.
# A ticker whose last rows are missing has nothing to backfill from, so any such
# gap stays NaN and shows up in the count below.
historical_data["RSI"] = historical_data.groupby("Ticker")["RSI"].bfill()

historical_data.isnull().sum()

Date                     0
Ticker                   0
Adjusted Close           0
Next Day Close           0
Previous Day Close       0
Return                   0
Volatility               0
RSI                      0
SMA_50                   0
SMA_100                  0
SMA_200                  0
Upper Band               0
Lower Band               0
Support                  0
Resistance               0
Action                1050
dtype: int64

In [20]:
# Backfill the `Action` column (Trading Signals), one ticker at a time
# The frame is ordered by date and then ticker, so consecutive rows belong to
# different tickers. A plain column-wide bfill() would therefore copy another
# ticker's signal into the gap; grouping by `Ticker` makes each gap take the
# next available signal of the same stock instead.
# A ticker whose last rows are missing has nothing to backfill from, so any such
# gap stays NaN and shows up in the count below.
historical_data["Action"] = historical_data.groupby("Ticker")["Action"].bfill()

historical_data.isnull().sum()

Date                  0
Ticker                0
Adjusted Close        0
Next Day Close        0
Previous Day Close    0
Return                0
Volatility            0
RSI                   0
SMA_50                0
SMA_100               0
SMA_200               0
Upper Band            0
Lower Band            0
Support               0
Resistance            0
Action                0
dtype: int64

In [21]:
# Summarise, per ticker, whether any missing value is left after cleaning
print_title("Tickers that do not have any missing values", closed_corners=False)

num_tickers = len(historical_data["Ticker"].unique())
print_label("Number of unique tickers:", num_tickers)

# Row-level flag: True where at least one column of the row is missing
missing_val_filter = historical_data.isna().any(axis=1)

# One group of rows per ticker
grouped = historical_data.groupby("Ticker")


def _group_has_missing(ticker_rows):
    """Return True when any cell in a ticker's rows is missing."""
    return ticker_rows.isna().any().any()


# Tickers whose rows are all complete
tickers_no_missing_values = grouped.filter(
    lambda ticker_rows: not _group_has_missing(ticker_rows)
)["Ticker"].unique()
print_label("Number of tickers with no missing values:", len(tickers_no_missing_values))

# Tickers with at least one incomplete row
tickers_missing_values = grouped.filter(_group_has_missing)["Ticker"].unique()
print_label("Number of tickers with missing values:", len(tickers_missing_values), closed_corners=True)

[1m[90m╔═══════════════════════════════════════════════════════════════╗[0m
[1m[90m║[0m[1m[97m          Tickers that do not have any missing values          [0m[1m[90m║[0m
[1m[90m╠═══════════════════════════════════════════════════════════════╣[0m
[1m[97m[1m[90m║ [0m[1m[97mNumber of unique tickers:      |          501.00000          [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[97mNumber of tickers with no m... |          501.00000          [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[97mNumber of tickers with miss... |           0.00000           [0m[1m[90m ║[0m[0m
[1m[90m╚═══════════════════════════════════════════════════════════════╝[0m


In [23]:
import numpy as np

# Create a numpy array of (ticker, number of rows with missing values) pairs
tickers_missing_val_count = np.array(
    list(historical_data.loc[missing_val_filter, "Ticker"].value_counts().items())
)

tickers_missing_val_count

array([], dtype=float64)

In [24]:
# Display the tickers whose rows contain no missing values after cleaning
tickers_no_missing_values

array(['A', 'AAPL', 'ABT', 'ACGL', 'ACN', 'ADBE', 'ADI', 'ADM', 'ADP',
       'ADSK', 'AEE', 'AEP', 'AES', 'AFL', 'AIG', 'AIZ', 'AJG', 'AKAM',
       'ALB', 'ALGN', 'ALL', 'AMAT', 'AMD', 'AME', 'AMGN', 'AMP', 'AMT',
       'AMZN', 'ANSS', 'AON', 'AOS', 'APA', 'APD', 'APH', 'ARE', 'ATO',
       'AVB', 'AVY', 'AXON', 'AXP', 'AZO', 'BA', 'BAC', 'BALL', 'BAX',
       'BBY', 'BDX', 'BEN', 'BG', 'BIIB', 'BK', 'BKNG', 'BKR', 'BLDR',
       'BLK', 'BMY', 'BR', 'BRO', 'BSX', 'BWA', 'BX', 'BXP', 'C', 'CAG',
       'CAH', 'CAT', 'CB', 'CBRE', 'CCI', 'CCL', 'CDNS', 'CE', 'CF',
       'CHD', 'CHRW', 'CI', 'CINF', 'CL', 'CLX', 'CMCSA', 'CME', 'CMG',
       'CMI', 'CMS', 'CNC', 'CNP', 'COF', 'COO', 'COP', 'COR', 'COST',
       'CPB', 'CPRT', 'CPT', 'CRL', 'CRM', 'CSCO', 'CSGP', 'CSX', 'CTAS',
       'CTRA', 'CTSH', 'CVS', 'CVX', 'D', 'DAL', 'DD', 'DE', 'DECK',
       'DFS', 'DGX', 'DHI', 'DHR', 'DIS', 'DLR', 'DLTR', 'DOC', 'DOV',
       'DPZ', 'DRI', 'DTE', 'DUK', 'DVA', 'DVN', 'DXCM', 'EA', 'EBAY',


### Exploratory Data Analysis (EDA):
___

#### Today's Data:

In [25]:
# Size of the prediction set, followed by its first and last two rows
print("Shape:", todays_data.shape)
display(todays_data.head(2), todays_data.tail(2))

Shape: (501, 16)


Unnamed: 0,Date,Ticker,Adjusted Close,Next Day Close,Previous Day Close,Return,Volatility,RSI,SMA_50,SMA_100,SMA_200,Upper Band,Lower Band,Support,Resistance,Action
0,2024-10-25,A,130.16,,130.69,-0.004055,0.012137,18.866226,140.33243,136.65535,137.94844,152.15059,129.75082,130.16,148.244,
1,2024-10-25,AAPL,231.4,,230.57,0.0036,0.013669,66.374374,226.8014,221.79668,200.76884,237.80908,222.31091,216.32,236.48,


Unnamed: 0,Date,Ticker,Adjusted Close,Next Day Close,Previous Day Close,Return,Volatility,RSI,SMA_50,SMA_100,SMA_200,Upper Band,Lower Band,Support,Resistance,Action
499,2024-10-25,ZBRA,359.97,,362.05,-0.005745,0.009788,43.769238,355.8908,336.8905,311.04135,380.01144,359.82355,320.77,377.68,
500,2024-10-25,ZTS,179.91,,181.5,-0.00876,0.012576,36.496883,189.094,183.3149,179.2437,197.88783,182.27017,179.91,196.48,


#### Historical Data:

In [26]:
# Size of the training set, followed by its first and last two rows
print("Shape:", historical_data.shape)
display(historical_data.head(2), historical_data.tail(2))

Shape: (1978479, 16)


Unnamed: 0,Date,Ticker,Adjusted Close,Next Day Close,Previous Day Close,Return,Volatility,RSI,SMA_50,SMA_100,SMA_200,Upper Band,Lower Band,Support,Resistance,Action
0,2008-01-02,A,23.256388,23.025743,23.53828,-0.011976,0.015704,48.82759,23.314175,23.299887,23.564934,24.727251,22.540236,21.392029,24.351946,short
1,2008-01-02,AAPL,5.876342,5.879055,5.974057,-0.016357,0.018937,59.06735,5.518483,4.939064,4.19763,6.135833,5.403559,4.637376,6.02684,buy


Unnamed: 0,Date,Ticker,Adjusted Close,Next Day Close,Previous Day Close,Return,Volatility,RSI,SMA_50,SMA_100,SMA_200,Upper Band,Lower Band,Support,Resistance,Action
1978477,2024-10-24,ZBRA,362.05,359.97,368.09,-0.016409,0.010532,44.39913,355.5556,336.3185,310.51785,379.3288,361.3692,320.77,377.68,short
1978478,2024-10-24,ZTS,181.5,179.91,188.99,-0.039632,0.013328,35.00878,189.1774,183.23051,179.33058,197.27248,184.37552,180.9,196.48,short


#### Optional: For initial phase of training, filter large dataset.

In [28]:
# Rows dated before this cut-off are discarded; the cut-off date itself is kept
remove_up_to = "2024-01-01"
historical_data = filter_data_by_date_range(historical_data, end_date=remove_up_to)

# Size of the reduced set, followed by its first and last five rows
print("Shape:", historical_data.shape)
display(historical_data.head(), historical_data.tail())

Shape: (102689, 16)


Unnamed: 0,Date,Ticker,Adjusted Close,Next Day Close,Previous Day Close,Return,Volatility,RSI,SMA_50,SMA_100,SMA_200,Upper Band,Lower Band,Support,Resistance,Action
0,2024-01-02,A,138.04857,130.49696,138.32718,-0.002014,0.012683,78.40966,120.803246,117.69346,121.688,144.18768,123.93916,100.488594,138.8783,short
1,2024-01-02,AAPL,184.93822,183.55347,191.80217,-0.035787,0.011905,31.663397,186.06238,181.40567,178.60231,199.09311,187.49692,166.04036,197.36108,short
2,2024-01-02,ABBV,154.1838,154.80124,149.50482,0.031297,0.009123,76.448715,140.28699,141.73233,140.11014,154.52325,139.2701,132.74742,154.1838,hold
3,2024-01-02,ABNB,134.48,133.42,136.14,-0.012193,0.022394,34.148132,129.653,131.168,127.63273,149.52948,131.10352,114.09,147.5,short
4,2024-01-02,ABT,107.70071,107.37717,107.9164,-0.001999,0.007304,69.328995,99.63036,98.55591,100.87648,109.399055,101.375694,91.033325,108.23995,short


Unnamed: 0,Date,Ticker,Adjusted Close,Next Day Close,Previous Day Close,Return,Volatility,RSI,SMA_50,SMA_100,SMA_200,Upper Band,Lower Band,Support,Resistance,Action
102684,2024-10-24,XYL,130.65,130.41,131.72,-0.008123,0.009364,41.637012,133.55902,134.4822,130.75377,138.21664,130.53435,126.71,137.53,short
102685,2024-10-24,YUM,133.16,133.04,134.02,-0.006417,0.010962,32.3681,134.69278,133.5222,133.97957,140.23546,130.62454,129.71,139.92,short
102686,2024-10-24,ZBH,104.0,102.33,104.7,-0.006686,0.010517,47.462685,107.51261,108.07578,115.47507,108.18526,101.41174,101.77,115.91237,short
102687,2024-10-24,ZBRA,362.05,359.97,368.09,-0.016409,0.010532,44.39913,355.5556,336.3185,310.51785,379.3288,361.3692,320.77,377.68,short
102688,2024-10-24,ZTS,181.5,179.91,188.99,-0.039632,0.013328,35.00878,189.1774,183.23051,179.33058,197.27248,184.37552,180.9,196.48,short


#### Inspect Tickers Individually

In [30]:
# Inspect a single ticker in the unprocessed data

# Boolean marker for the rows that belong to the chosen ticker
select_ticker = raw_data["Ticker"].eq("AAPL")

raw_data.loc[select_ticker]

Unnamed: 0,Date,Ticker,Adjusted Close,Next Day Close,Previous Day Close,Return,Volatility,RSI,SMA_50,SMA_100,SMA_200,Upper Band,Lower Band,Support,Resistance,Action
1,2008-01-02,AAPL,5.876342,5.879055,5.974057,-0.016357,0.018937,59.067350,5.518483,4.939064,4.197630,6.135833,5.403559,4.637376,6.02684,buy
502,2008-01-03,AAPL,5.879055,5.430276,5.876342,0.000462,0.018287,56.259520,5.530891,4.960155,4.213231,6.122779,5.462215,4.637376,6.02684,sell
1003,2008-01-04,AAPL,5.430276,5.357593,5.879055,-0.076335,0.025363,37.311500,5.527205,4.975917,4.226226,6.142305,5.426252,4.637376,6.02684,short
1504,2008-01-07,AAPL,5.357593,5.164871,5.430276,-0.013385,0.024475,36.547832,5.522205,4.992085,4.238845,6.171195,5.360235,4.637376,6.02684,short
2005,2008-01-08,AAPL,5.164871,5.410674,5.357593,-0.035972,0.024816,36.241930,5.515250,5.007573,4.250567,6.214129,5.247783,4.637376,6.02684,buy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2118730,2024-10-21,AAPL,236.480000,235.860000,235.000000,0.006298,0.012880,66.655846,225.934400,220.221050,199.802290,236.515660,221.650340,216.320000,236.48000,sell
2119231,2024-10-22,AAPL,235.860000,230.760000,236.480000,-0.002622,0.012744,64.702080,226.301000,220.668960,200.079120,237.478500,221.536500,216.320000,236.48000,short
2119732,2024-10-23,AAPL,230.760000,230.570000,235.860000,-0.021623,0.013733,57.298534,226.490800,221.056290,200.308620,237.575040,221.878950,216.320000,236.48000,short
2120233,2024-10-24,AAPL,230.570000,231.400000,230.760000,-0.000823,0.013688,55.555565,226.667800,221.423930,200.539280,237.665250,222.093750,216.320000,236.48000,buy


### Perform work: Multiple Versions of Dataset
___

In [None]:
# Work on an independent, freshly indexed copy of the historical data
main_data = historical_data.copy().reset_index(drop=True)

# Create multiple versions of the dataset

# Version 1: dates and tickers removed from the columns (kept as the index for reference)
data_v1 = main_data.set_index(["Date", "Ticker"])

# Version 2: dates parsed and additionally separated into year, month, and day columns
data_v2 = (
    main_data
    .assign(Date=lambda frame: pd.to_datetime(frame["Date"]))
    .assign(
        Year=lambda frame: frame["Date"].dt.year,
        Month=lambda frame: frame["Date"].dt.month,
        Day=lambda frame: frame["Date"].dt.day,
    )
    .set_index(["Date", "Ticker"])
)

data_v2

In [None]:
# Build the modelling frame: drop the "Action" column and move
# "Date" and "Ticker" out of the columns into the index
data = (
    historical_data
    .drop(columns=["Action"])
    .set_index(["Date", "Ticker"])
)

print("Shape:", data.shape)
data.head()

In [None]:
# Target (y): the `Return` column; features (X): every other column of `data`
# NOTE(review): X still contains `Next Day Close`, `Adjusted Close` and
# `Previous Day Close`. `Return` looks derivable from the latter two and
# `Next Day Close` is missing for the rows to predict - confirm this feature
# set is intended.
y = data["Return"]
X = data.drop(columns=["Return"])

print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

___
___

In [None]:
# Import the VIF routine used by `calc_vif` below
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Set the threshold for VIF (Based on the domain knowledge of the data)
# Features whose VIF exceeds this value are dropped further down.
# NOTE(review): commonly quoted VIF cut-offs are around 5-10; 2600 is far more
# permissive - confirm it is deliberate for these highly correlated price columns.
THRESHOLD = 2600

def highlight_vif(row):
    """Return one CSS style per value, marking VIFs that pass the threshold.

    Intended for ``Styler.apply``, which by default passes one column at a
    time (despite the parameter name).

    The comparison is inclusive so it mirrors the drop rule used later
    (``VIF > THRESHOLD``): a value exactly equal to ``THRESHOLD`` is kept by
    that rule and is therefore highlighted here as well. NaN values compare
    False and stay unstyled.

    Args:
        row: Iterable of VIF values.

    Returns:
        List of CSS strings, ``"background-color: black"`` for values at or
        below ``THRESHOLD`` and ``""`` otherwise.
    """
    return ["background-color: black" if value <= THRESHOLD else "" for value in row]

def calc_vif(X, add_intercept=False):
    """Compute the variance inflation factor (VIF) of every column of ``X``.

    Args:
        X: DataFrame of numeric features.
        add_intercept: When True, a constant column is prepended to the design
            matrix before the VIFs are computed; the constant itself is not
            reported. statsmodels derives each VIF from a regression on exactly
            the columns it receives, so without a constant the values are the
            uncentered ones. Defaults to False, which keeps the original
            behaviour.

    Returns:
        DataFrame indexed by the column names of ``X`` with a single ``"VIF"``
        column, in the same order as the columns of ``X``.
    """
    design = X
    first_feature = 0
    if add_intercept:
        # Work on a copy so the caller's frame is never modified
        design = X.copy()
        design.insert(0, "__intercept__", 1.0)
        first_feature = 1

    # Convert to an array once instead of once per feature
    design_matrix = design.values

    # Calculate VIF values
    vif_values = [
        variance_inflation_factor(design_matrix, column_position)
        for column_position in range(first_feature, design_matrix.shape[1])
    ]

    # Create a dataframe to display the VIF values
    vif = pd.DataFrame(
        data={"VIF": vif_values},
        index=X.columns
    )

    return vif

# VIF per feature, ordered from lowest to highest
vif_df = calc_vif(X).sort_values(by="VIF")

# Render the table with values below the threshold highlighted
vif_df.style.apply(highlight_vif)

In [None]:
# A feature is dropped when its VIF exceeds the threshold or could not be computed
drop_vif_condition = vif_df["VIF"].gt(THRESHOLD) | vif_df["VIF"].isna()

# Names of the features that meet the drop condition
drop_vif_features = vif_df[drop_vif_condition].index.tolist()

drop_vif_features

In [None]:
# Feature matrix without the columns rejected by the VIF screen
X_vif = X.drop(drop_vif_features, axis="columns")

print("Shape of X_vif:", X_vif.shape)

___
___

In [None]:
# Split the data into training and testing sets, chronologically.
# The rows are daily observations, so a shuffled split would place later dates
# in the training set and earlier dates in the test set. With shuffle=False the
# first 80% of the rows (the oldest) train the model and the last 20% (the most
# recent) evaluate it. The boundary may fall inside a single trading day.
# This relies on the rows already being ordered by date, which is checked first.
assert X.index.get_level_values("Date").is_monotonic_increasing, (
    "rows must be ordered by Date for a chronological split"
)

X_train, X_test, X_train_vif, X_test_vif, y_train, y_test = train_test_split(
    X,
    X_vif,
    y,
    test_size=0.2, # 80% training and 20% testing
    shuffle=False
)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of X_train_vif:", X_train_vif.shape)
print("Shape of X_test_vif:", X_test_vif.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

___
___

In [None]:
# Check P-Values
import statsmodels.api as sm

# sm.OLS fits exactly the columns it is given and does not add an intercept on
# its own. Without one the regression is forced through the origin, which
# distorts the coefficients and their p-values, so a constant column is added
# explicitly. It appears as `const` in the fitted parameters and p-values.
ols_model = sm.OLS(
    y_train,
    sm.add_constant(X_train_vif)
).fit()

# Show the regression table rather than the bare results object
ols_model.summary()

In [None]:
def highlight_p_values(row):
    """Return one CSS style per value, marking p-values of at most 0.05.

    Args:
        row: Iterable of p-values.

    Returns:
        List of CSS strings, ``"background-color: black"`` where the value is
        less than or equal to 0.05 and ``""`` otherwise.
    """
    styles = []
    for value in row:
        if value <= 0.05:
            styles.append("background-color: black")
        else:
            styles.append("")
    return styles

# P-values of the fitted OLS model, smallest first
p_values = ols_model.pvalues.sort_values()

# Single-column table so the values can be styled
p_values_df = p_values.rename("p_value").to_frame()

# Render the table with p-values of at most 0.05 highlighted
p_values_df.style.apply(highlight_p_values)

In [26]:
# Note: Volatility and RSI are statistically significant in both VIF and P-Value
# Both VIF and P-Value also indicated 'Resistance' and 'SMA_50' are not statistically significant

___
___


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest trained on every feature (no VIF screening), seeded for reproducibility
all_feature_model = RandomForestRegressor(n_estimators=100, random_state=42)

all_feature_model.fit(X=X_train, y=y_train)

In [None]:
# Linear regression trained on every feature (no VIF screening)
lr_model = LinearRegression()

lr_model.fit(X=X_train, y=y_train)

In [30]:
import numpy as np

def adj_r2_score(model, X, y):
    """Return the adjusted R-squared of ``model`` on ``(X, y)``.

    The plain score comes from ``model.score(X, y)`` and is penalised for the
    number of columns of ``X``: ``1 - (1 - r2) * (n - 1) / (n - p - 1)``, with
    ``n = len(y)`` and ``p = X.shape[1]``.

    Args:
        model: Object exposing ``score(X, y)``.
        X: Two-dimensional feature data exposing ``shape``.
        y: Sized target data.

    Returns:
        The adjusted score.
    """
    n_obs = len(y)
    n_features = X.shape[1]
    unexplained = 1 - model.score(X, y)

    # Scale the unexplained share by the ratio of degrees of freedom
    penalised = unexplained * (n_obs - 1) / (n_obs - n_features - 1)
    return 1 - penalised

# Predictions of the linear model on the held-out rows
predict_y_all = lr_model.predict(X_test)

# Error metrics
mse_all = mean_squared_error(y_test, predict_y_all)
rmse = np.sqrt(mse_all)

# Goodness-of-fit metrics
r2_all = lr_model.score(X_test, y_test)
adj_r2_all = adj_r2_score(lr_model, X_test, y_test)


In [None]:
# Report the evaluation metrics of the linear model, one per line
for metric_label, metric_value in (
    ("R2:", r2_all),
    ("Adjusted R2:", adj_r2_all),
    ("MSE:", mse_all),
    ("RMSE:", rmse),
):
    print(metric_label, metric_value)